| { | |
| "best_metric": 1.1103906631469727, | |
| "best_model_checkpoint": "/home/wani/Desktop/roberta-pretrain/ckpt/roberta/pretrain/medium/256/checkpoint-12330", | |
| "epoch": 10.386703853378108, | |
| "eval_steps": 90, | |
| "global_step": 12330, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.008423928510444533, | |
| "grad_norm": 5.073121070861816, | |
| "learning_rate": 4.166666666666667e-06, | |
| "loss": 7.2395, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.016847857020889066, | |
| "grad_norm": 4.587955474853516, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 7.0836, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0252717855313336, | |
| "grad_norm": 3.8589327335357666, | |
| "learning_rate": 1.25e-05, | |
| "loss": 6.8156, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03369571404177813, | |
| "grad_norm": 3.4427683353424072, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 6.5549, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04211964255222266, | |
| "grad_norm": 3.109060525894165, | |
| "learning_rate": 2.0833333333333333e-05, | |
| "loss": 6.3522, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0505435710626672, | |
| "grad_norm": 2.86232590675354, | |
| "learning_rate": 2.5e-05, | |
| "loss": 6.1983, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.05896749957311173, | |
| "grad_norm": 2.6880924701690674, | |
| "learning_rate": 2.9166666666666666e-05, | |
| "loss": 6.0796, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.06739142808355626, | |
| "grad_norm": 2.490527629852295, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 5.9754, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0758153565940008, | |
| "grad_norm": 2.3156356811523438, | |
| "learning_rate": 3.75e-05, | |
| "loss": 5.8736, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0758153565940008, | |
| "eval_accuracy": 0.22415329938580753, | |
| "eval_loss": 5.8054423332214355, | |
| "eval_runtime": 910.9652, | |
| "eval_samples_per_second": 548.183, | |
| "eval_steps_per_second": 5.076, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08423928510444532, | |
| "grad_norm": 2.1557302474975586, | |
| "learning_rate": 4.1666666666666665e-05, | |
| "loss": 5.7691, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09266321361488987, | |
| "grad_norm": 1.9360383749008179, | |
| "learning_rate": 4.5833333333333334e-05, | |
| "loss": 5.6653, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1010871421253344, | |
| "grad_norm": 1.731399655342102, | |
| "learning_rate": 5e-05, | |
| "loss": 5.5598, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.10951107063577893, | |
| "grad_norm": 1.508693814277649, | |
| "learning_rate": 5.416666666666667e-05, | |
| "loss": 5.4574, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.11793499914622346, | |
| "grad_norm": 1.2835007905960083, | |
| "learning_rate": 5.833333333333333e-05, | |
| "loss": 5.3585, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.126358927656668, | |
| "grad_norm": 1.0747231245040894, | |
| "learning_rate": 6.25e-05, | |
| "loss": 5.2667, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.13478285616711252, | |
| "grad_norm": 0.852271318435669, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 5.1779, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.14320678467755707, | |
| "grad_norm": 0.7001814842224121, | |
| "learning_rate": 7.083333333333334e-05, | |
| "loss": 5.0965, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1516307131880016, | |
| "grad_norm": 0.5657457709312439, | |
| "learning_rate": 7.5e-05, | |
| "loss": 5.0237, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.1516307131880016, | |
| "eval_accuracy": 0.23888299376264316, | |
| "eval_loss": 4.981535911560059, | |
| "eval_runtime": 882.341, | |
| "eval_samples_per_second": 565.967, | |
| "eval_steps_per_second": 5.241, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16005464169844613, | |
| "grad_norm": 0.4981703758239746, | |
| "learning_rate": 7.916666666666666e-05, | |
| "loss": 4.9662, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.16847857020889065, | |
| "grad_norm": 0.40254291892051697, | |
| "learning_rate": 8.333333333333333e-05, | |
| "loss": 4.9195, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1769024987193352, | |
| "grad_norm": 0.32726043462753296, | |
| "learning_rate": 8.75e-05, | |
| "loss": 4.8766, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.18532642722977974, | |
| "grad_norm": 0.2471727877855301, | |
| "learning_rate": 9.166666666666667e-05, | |
| "loss": 4.8458, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.19375035574022426, | |
| "grad_norm": 0.2568261921405792, | |
| "learning_rate": 9.583333333333334e-05, | |
| "loss": 4.8169, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2021742842506688, | |
| "grad_norm": 0.19310955703258514, | |
| "learning_rate": 0.0001, | |
| "loss": 4.7926, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.21059821276111332, | |
| "grad_norm": 0.20584674179553986, | |
| "learning_rate": 0.00010416666666666667, | |
| "loss": 4.7714, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.21902214127155786, | |
| "grad_norm": 0.26360729336738586, | |
| "learning_rate": 0.00010833333333333334, | |
| "loss": 4.7511, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.22744606978200238, | |
| "grad_norm": 0.1681978851556778, | |
| "learning_rate": 0.00011250000000000001, | |
| "loss": 4.7309, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.22744606978200238, | |
| "eval_accuracy": 0.28488370423336357, | |
| "eval_loss": 4.706047534942627, | |
| "eval_runtime": 889.3977, | |
| "eval_samples_per_second": 561.477, | |
| "eval_steps_per_second": 5.199, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.23586999829244693, | |
| "grad_norm": 0.17959143221378326, | |
| "learning_rate": 0.00011666666666666667, | |
| "loss": 4.7148, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.24429392680289147, | |
| "grad_norm": 0.27109047770500183, | |
| "learning_rate": 0.00012083333333333333, | |
| "loss": 4.6989, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.252717855313336, | |
| "grad_norm": 0.2674080431461334, | |
| "learning_rate": 0.000125, | |
| "loss": 4.6826, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2611417838237805, | |
| "grad_norm": 0.24386395514011383, | |
| "learning_rate": 0.00012916666666666667, | |
| "loss": 4.6707, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.26956571233422505, | |
| "grad_norm": 0.5274083614349365, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 4.6553, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.2779896408446696, | |
| "grad_norm": 0.4005141258239746, | |
| "learning_rate": 0.0001375, | |
| "loss": 4.6446, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.28641356935511414, | |
| "grad_norm": 0.3732853829860687, | |
| "learning_rate": 0.00014166666666666668, | |
| "loss": 4.6315, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.29483749786555863, | |
| "grad_norm": 0.2742752730846405, | |
| "learning_rate": 0.00014583333333333335, | |
| "loss": 4.6221, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.3032614263760032, | |
| "grad_norm": 0.20482462644577026, | |
| "learning_rate": 0.00015, | |
| "loss": 4.6138, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3032614263760032, | |
| "eval_accuracy": 0.28836420126551926, | |
| "eval_loss": 4.5933918952941895, | |
| "eval_runtime": 880.4452, | |
| "eval_samples_per_second": 567.186, | |
| "eval_steps_per_second": 5.252, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3116853548864477, | |
| "grad_norm": 0.26613757014274597, | |
| "learning_rate": 0.00015416666666666668, | |
| "loss": 4.5983, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.32010928339689226, | |
| "grad_norm": 0.20205098390579224, | |
| "learning_rate": 0.00015833333333333332, | |
| "loss": 4.5922, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.3285332119073368, | |
| "grad_norm": 0.5084218978881836, | |
| "learning_rate": 0.00016250000000000002, | |
| "loss": 4.5826, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.3369571404177813, | |
| "grad_norm": 0.2835780084133148, | |
| "learning_rate": 0.00016666666666666666, | |
| "loss": 4.5771, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.34538106892822584, | |
| "grad_norm": 0.23976200819015503, | |
| "learning_rate": 0.00017083333333333333, | |
| "loss": 4.5726, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3538049974386704, | |
| "grad_norm": 0.2275087982416153, | |
| "learning_rate": 0.000175, | |
| "loss": 4.5666, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.36222892594911493, | |
| "grad_norm": 0.27758899331092834, | |
| "learning_rate": 0.00017916666666666667, | |
| "loss": 4.5654, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.3706528544595595, | |
| "grad_norm": 0.18581350147724152, | |
| "learning_rate": 0.00018333333333333334, | |
| "loss": 4.5593, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.37907678297000397, | |
| "grad_norm": 0.1667676419019699, | |
| "learning_rate": 0.0001875, | |
| "loss": 4.5538, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.37907678297000397, | |
| "eval_accuracy": 0.28966679521500804, | |
| "eval_loss": 4.547606468200684, | |
| "eval_runtime": 890.3979, | |
| "eval_samples_per_second": 560.846, | |
| "eval_steps_per_second": 5.193, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.3875007114804485, | |
| "grad_norm": 0.32489290833473206, | |
| "learning_rate": 0.00019166666666666667, | |
| "loss": 4.5532, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.39592463999089306, | |
| "grad_norm": 0.7000045776367188, | |
| "learning_rate": 0.00019583333333333334, | |
| "loss": 4.5484, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.4043485685013376, | |
| "grad_norm": 0.43668240308761597, | |
| "learning_rate": 0.0002, | |
| "loss": 4.5489, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4127724970117821, | |
| "grad_norm": 0.36716368794441223, | |
| "learning_rate": 0.00020416666666666668, | |
| "loss": 4.5459, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.42119642552222664, | |
| "grad_norm": 0.30332931876182556, | |
| "learning_rate": 0.00020833333333333335, | |
| "loss": 4.5418, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4296203540326712, | |
| "grad_norm": 0.5920347571372986, | |
| "learning_rate": 0.0002125, | |
| "loss": 4.5406, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.4380442825431157, | |
| "grad_norm": 0.45020386576652527, | |
| "learning_rate": 0.00021666666666666668, | |
| "loss": 4.5372, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.44646821105356027, | |
| "grad_norm": 0.33357909321784973, | |
| "learning_rate": 0.00022083333333333333, | |
| "loss": 4.5367, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.45489213956400476, | |
| "grad_norm": 0.45888572931289673, | |
| "learning_rate": 0.00022500000000000002, | |
| "loss": 4.5344, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.45489213956400476, | |
| "eval_accuracy": 0.2902362393111046, | |
| "eval_loss": 4.531790256500244, | |
| "eval_runtime": 882.2427, | |
| "eval_samples_per_second": 566.03, | |
| "eval_steps_per_second": 5.241, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.4633160680744493, | |
| "grad_norm": 0.4458440840244293, | |
| "learning_rate": 0.00022916666666666666, | |
| "loss": 4.5328, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.47173999658489385, | |
| "grad_norm": 0.1917838305234909, | |
| "learning_rate": 0.00023333333333333333, | |
| "loss": 4.5296, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.4801639250953384, | |
| "grad_norm": 0.8310424089431763, | |
| "learning_rate": 0.0002375, | |
| "loss": 4.5275, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.48858785360578294, | |
| "grad_norm": 0.4216615855693817, | |
| "learning_rate": 0.00024166666666666667, | |
| "loss": 4.531, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.49701178211622743, | |
| "grad_norm": 0.2320231944322586, | |
| "learning_rate": 0.0002458333333333333, | |
| "loss": 4.5276, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.505435710626672, | |
| "grad_norm": 0.3115006983280182, | |
| "learning_rate": 0.00025, | |
| "loss": 4.5252, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.5138596391371165, | |
| "grad_norm": 0.13032270967960358, | |
| "learning_rate": 0.00025416666666666665, | |
| "loss": 4.5227, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.522283567647561, | |
| "grad_norm": 0.5333927273750305, | |
| "learning_rate": 0.00025833333333333334, | |
| "loss": 4.5214, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.5307074961580056, | |
| "grad_norm": 0.8976441025733948, | |
| "learning_rate": 0.00026250000000000004, | |
| "loss": 4.5218, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5307074961580056, | |
| "eval_accuracy": 0.290083406000685, | |
| "eval_loss": 4.522771835327148, | |
| "eval_runtime": 892.1941, | |
| "eval_samples_per_second": 559.717, | |
| "eval_steps_per_second": 5.183, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.5391314246684501, | |
| "grad_norm": 0.1657322496175766, | |
| "learning_rate": 0.0002666666666666667, | |
| "loss": 4.523, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.5475553531788947, | |
| "grad_norm": 0.1890048235654831, | |
| "learning_rate": 0.0002708333333333333, | |
| "loss": 4.5185, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.5559792816893392, | |
| "grad_norm": 0.8254080414772034, | |
| "learning_rate": 0.000275, | |
| "loss": 4.5196, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.5644032101997837, | |
| "grad_norm": 0.1703944355249405, | |
| "learning_rate": 0.00027916666666666666, | |
| "loss": 4.52, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.5728271387102283, | |
| "grad_norm": 0.33486783504486084, | |
| "learning_rate": 0.00028333333333333335, | |
| "loss": 4.5139, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.5812510672206728, | |
| "grad_norm": 0.4759036600589752, | |
| "learning_rate": 0.0002875, | |
| "loss": 4.5158, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.5896749957311173, | |
| "grad_norm": 0.26314422488212585, | |
| "learning_rate": 0.0002916666666666667, | |
| "loss": 4.5135, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.5980989242415619, | |
| "grad_norm": 0.39898937940597534, | |
| "learning_rate": 0.00029583333333333333, | |
| "loss": 4.5114, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.6065228527520063, | |
| "grad_norm": 0.5003794431686401, | |
| "learning_rate": 0.0003, | |
| "loss": 4.5148, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.6065228527520063, | |
| "eval_accuracy": 0.2903979539286128, | |
| "eval_loss": 4.508981704711914, | |
| "eval_runtime": 878.8487, | |
| "eval_samples_per_second": 568.216, | |
| "eval_steps_per_second": 5.261, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.614946781262451, | |
| "grad_norm": 0.2276950627565384, | |
| "learning_rate": 0.00030416666666666667, | |
| "loss": 4.5111, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.6233707097728954, | |
| "grad_norm": 0.21725377440452576, | |
| "learning_rate": 0.00030833333333333337, | |
| "loss": 4.5088, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.6317946382833399, | |
| "grad_norm": 0.8084585666656494, | |
| "learning_rate": 0.0003125, | |
| "loss": 4.5074, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.6402185667937845, | |
| "grad_norm": 0.46915069222450256, | |
| "learning_rate": 0.00031666666666666665, | |
| "loss": 4.5072, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.648642495304229, | |
| "grad_norm": 0.15649260580539703, | |
| "learning_rate": 0.00032083333333333334, | |
| "loss": 4.5039, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.6570664238146736, | |
| "grad_norm": 0.42916274070739746, | |
| "learning_rate": 0.00032500000000000004, | |
| "loss": 4.5056, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.6654903523251181, | |
| "grad_norm": 0.287572979927063, | |
| "learning_rate": 0.0003291666666666667, | |
| "loss": 4.5045, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.6739142808355626, | |
| "grad_norm": 0.6869699358940125, | |
| "learning_rate": 0.0003333333333333333, | |
| "loss": 4.5029, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.6823382093460072, | |
| "grad_norm": 0.2973476052284241, | |
| "learning_rate": 0.0003375, | |
| "loss": 4.5009, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.6823382093460072, | |
| "eval_accuracy": 0.29041409279207236, | |
| "eval_loss": 4.497637748718262, | |
| "eval_runtime": 872.3603, | |
| "eval_samples_per_second": 572.442, | |
| "eval_steps_per_second": 5.301, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.6907621378564517, | |
| "grad_norm": 0.5773557424545288, | |
| "learning_rate": 0.00034166666666666666, | |
| "loss": 4.5024, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.6991860663668963, | |
| "grad_norm": 0.31921157240867615, | |
| "learning_rate": 0.00034583333333333335, | |
| "loss": 4.5006, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.7076099948773408, | |
| "grad_norm": 0.4232361912727356, | |
| "learning_rate": 0.00035, | |
| "loss": 4.5001, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.7160339233877853, | |
| "grad_norm": 0.30865538120269775, | |
| "learning_rate": 0.0003541666666666667, | |
| "loss": 4.4998, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.7244578518982299, | |
| "grad_norm": 0.6191368699073792, | |
| "learning_rate": 0.00035833333333333333, | |
| "loss": 4.4967, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.7328817804086744, | |
| "grad_norm": 0.3202773630619049, | |
| "learning_rate": 0.0003625, | |
| "loss": 4.499, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.741305708919119, | |
| "grad_norm": 0.3090028464794159, | |
| "learning_rate": 0.00036666666666666667, | |
| "loss": 4.4967, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.7497296374295634, | |
| "grad_norm": 0.9248805046081543, | |
| "learning_rate": 0.00037083333333333337, | |
| "loss": 4.4962, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.7581535659400079, | |
| "grad_norm": 0.27745822072029114, | |
| "learning_rate": 0.000375, | |
| "loss": 4.4956, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7581535659400079, | |
| "eval_accuracy": 0.29047371761644103, | |
| "eval_loss": 4.492140293121338, | |
| "eval_runtime": 888.1144, | |
| "eval_samples_per_second": 562.288, | |
| "eval_steps_per_second": 5.207, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.7665774944504525, | |
| "grad_norm": 0.2972380518913269, | |
| "learning_rate": 0.00037916666666666665, | |
| "loss": 4.4936, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.775001422960897, | |
| "grad_norm": 1.4440104961395264, | |
| "learning_rate": 0.00038333333333333334, | |
| "loss": 4.4956, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.7834253514713415, | |
| "grad_norm": 0.2894129455089569, | |
| "learning_rate": 0.00038750000000000004, | |
| "loss": 4.4961, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.7918492799817861, | |
| "grad_norm": 0.22757315635681152, | |
| "learning_rate": 0.0003916666666666667, | |
| "loss": 4.495, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.8002732084922306, | |
| "grad_norm": 0.2084762305021286, | |
| "learning_rate": 0.0003958333333333333, | |
| "loss": 4.4921, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.8086971370026752, | |
| "grad_norm": 0.4823535084724426, | |
| "learning_rate": 0.0004, | |
| "loss": 4.4928, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.8171210655131197, | |
| "grad_norm": 0.22939594089984894, | |
| "learning_rate": 0.00040416666666666666, | |
| "loss": 4.4889, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.8255449940235642, | |
| "grad_norm": 0.4983462989330292, | |
| "learning_rate": 0.00040833333333333336, | |
| "loss": 4.4888, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.8339689225340088, | |
| "grad_norm": 0.7445792555809021, | |
| "learning_rate": 0.0004125, | |
| "loss": 4.4899, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.8339689225340088, | |
| "eval_accuracy": 0.2903607895100575, | |
| "eval_loss": 4.490144729614258, | |
| "eval_runtime": 872.9885, | |
| "eval_samples_per_second": 572.03, | |
| "eval_steps_per_second": 5.297, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.8423928510444533, | |
| "grad_norm": 0.3264559805393219, | |
| "learning_rate": 0.0004166666666666667, | |
| "loss": 4.4879, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.8508167795548979, | |
| "grad_norm": 0.5130082964897156, | |
| "learning_rate": 0.00042083333333333333, | |
| "loss": 4.4881, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.8592407080653424, | |
| "grad_norm": 0.2776341736316681, | |
| "learning_rate": 0.000425, | |
| "loss": 4.4872, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.8676646365757869, | |
| "grad_norm": 0.9157618880271912, | |
| "learning_rate": 0.00042916666666666667, | |
| "loss": 4.4868, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.8760885650862315, | |
| "grad_norm": 0.22099615633487701, | |
| "learning_rate": 0.00043333333333333337, | |
| "loss": 4.4877, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.8845124935966759, | |
| "grad_norm": 0.2313142567873001, | |
| "learning_rate": 0.0004375, | |
| "loss": 4.4845, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.8929364221071205, | |
| "grad_norm": 0.4353635907173157, | |
| "learning_rate": 0.00044166666666666665, | |
| "loss": 4.4888, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.901360350617565, | |
| "grad_norm": 0.2390984743833542, | |
| "learning_rate": 0.00044583333333333335, | |
| "loss": 4.4827, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.9097842791280095, | |
| "grad_norm": 0.31369632482528687, | |
| "learning_rate": 0.00045000000000000004, | |
| "loss": 4.4832, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.9097842791280095, | |
| "eval_accuracy": 0.2904605834264481, | |
| "eval_loss": 4.480494499206543, | |
| "eval_runtime": 880.1337, | |
| "eval_samples_per_second": 567.386, | |
| "eval_steps_per_second": 5.254, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.9182082076384541, | |
| "grad_norm": 0.6700971722602844, | |
| "learning_rate": 0.0004541666666666667, | |
| "loss": 4.483, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.9266321361488986, | |
| "grad_norm": 0.25950998067855835, | |
| "learning_rate": 0.0004583333333333333, | |
| "loss": 4.4832, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.9350560646593432, | |
| "grad_norm": 0.2840316593647003, | |
| "learning_rate": 0.0004625, | |
| "loss": 4.4819, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.9434799931697877, | |
| "grad_norm": 0.6859279274940491, | |
| "learning_rate": 0.00046666666666666666, | |
| "loss": 4.4819, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.9519039216802322, | |
| "grad_norm": 0.2865343391895294, | |
| "learning_rate": 0.00047083333333333336, | |
| "loss": 4.48, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.9603278501906768, | |
| "grad_norm": 1.179539442062378, | |
| "learning_rate": 0.000475, | |
| "loss": 4.4762, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.9687517787011213, | |
| "grad_norm": 0.4731704294681549, | |
| "learning_rate": 0.0004791666666666667, | |
| "loss": 4.4831, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.9771757072115659, | |
| "grad_norm": 0.298757404088974, | |
| "learning_rate": 0.00048333333333333334, | |
| "loss": 4.4742, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.9855996357220104, | |
| "grad_norm": 1.0954639911651611, | |
| "learning_rate": 0.0004875, | |
| "loss": 4.46, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.9855996357220104, | |
| "eval_accuracy": 0.29021425691327735, | |
| "eval_loss": 4.458162784576416, | |
| "eval_runtime": 887.8161, | |
| "eval_samples_per_second": 562.477, | |
| "eval_steps_per_second": 5.208, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.9940235642324549, | |
| "grad_norm": 0.441949725151062, | |
| "learning_rate": 0.0004916666666666666, | |
| "loss": 4.4549, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.0024474927428995, | |
| "grad_norm": 0.5917736887931824, | |
| "learning_rate": 0.0004958333333333334, | |
| "loss": 4.4425, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.010871421253344, | |
| "grad_norm": 0.3910304307937622, | |
| "learning_rate": 0.0005, | |
| "loss": 4.4376, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.0192953497637884, | |
| "grad_norm": 0.446277916431427, | |
| "learning_rate": 0.0005041666666666667, | |
| "loss": 4.4284, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.027719278274233, | |
| "grad_norm": 0.7843539118766785, | |
| "learning_rate": 0.0005083333333333333, | |
| "loss": 4.4216, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.0361432067846776, | |
| "grad_norm": 0.5028587579727173, | |
| "learning_rate": 0.0005124999999999999, | |
| "loss": 4.418, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.044567135295122, | |
| "grad_norm": 0.5062530636787415, | |
| "learning_rate": 0.0005166666666666667, | |
| "loss": 4.4099, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.0529910638055666, | |
| "grad_norm": 0.4109475016593933, | |
| "learning_rate": 0.0005208333333333334, | |
| "loss": 4.4005, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.0614149923160112, | |
| "grad_norm": 0.494357705116272, | |
| "learning_rate": 0.0005250000000000001, | |
| "loss": 4.3924, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.0614149923160112, | |
| "eval_accuracy": 0.29121270831959656, | |
| "eval_loss": 4.368500232696533, | |
| "eval_runtime": 885.6194, | |
| "eval_samples_per_second": 563.872, | |
| "eval_steps_per_second": 5.221, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.0698389208264556, | |
| "grad_norm": 0.4964124858379364, | |
| "learning_rate": 0.0005291666666666667, | |
| "loss": 4.3843, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.0782628493369002, | |
| "grad_norm": 0.6328290700912476, | |
| "learning_rate": 0.0005333333333333334, | |
| "loss": 4.3756, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.0866867778473448, | |
| "grad_norm": 0.8674759268760681, | |
| "learning_rate": 0.0005375, | |
| "loss": 4.3697, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.0951107063577892, | |
| "grad_norm": 0.4631132185459137, | |
| "learning_rate": 0.0005416666666666666, | |
| "loss": 4.3676, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.1035346348682338, | |
| "grad_norm": 0.5043870210647583, | |
| "learning_rate": 0.0005458333333333333, | |
| "loss": 4.3582, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.1119585633786784, | |
| "grad_norm": 0.5791853666305542, | |
| "learning_rate": 0.00055, | |
| "loss": 4.3529, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.120382491889123, | |
| "grad_norm": 0.6443321108818054, | |
| "learning_rate": 0.0005541666666666667, | |
| "loss": 4.3471, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.1288064203995674, | |
| "grad_norm": 0.6193282008171082, | |
| "learning_rate": 0.0005583333333333333, | |
| "loss": 4.338, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.137230348910012, | |
| "grad_norm": 0.6169930696487427, | |
| "learning_rate": 0.0005625000000000001, | |
| "loss": 4.3365, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.137230348910012, | |
| "eval_accuracy": 0.2912005471998471, | |
| "eval_loss": 4.2970428466796875, | |
| "eval_runtime": 875.1704, | |
| "eval_samples_per_second": 570.604, | |
| "eval_steps_per_second": 5.284, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.1456542774204566, | |
| "grad_norm": 0.8051270246505737, | |
| "learning_rate": 0.0005666666666666667, | |
| "loss": 4.3252, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.154078205930901, | |
| "grad_norm": 0.7985979914665222, | |
| "learning_rate": 0.0005708333333333333, | |
| "loss": 4.3185, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.1625021344413455, | |
| "grad_norm": 0.7459626793861389, | |
| "learning_rate": 0.000575, | |
| "loss": 4.3119, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.1709260629517901, | |
| "grad_norm": 0.572289228439331, | |
| "learning_rate": 0.0005791666666666667, | |
| "loss": 4.3066, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.1793499914622347, | |
| "grad_norm": 0.5565480589866638, | |
| "learning_rate": 0.0005833333333333334, | |
| "loss": 4.2973, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.1877739199726791, | |
| "grad_norm": 0.789574384689331, | |
| "learning_rate": 0.0005875, | |
| "loss": 4.2922, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 1.1961978484831237, | |
| "grad_norm": 1.0027601718902588, | |
| "learning_rate": 0.0005916666666666667, | |
| "loss": 4.2824, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 1.204621776993568, | |
| "grad_norm": 0.8137519359588623, | |
| "learning_rate": 0.0005958333333333333, | |
| "loss": 4.2808, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 1.2130457055040127, | |
| "grad_norm": 0.8705686330795288, | |
| "learning_rate": 0.0006, | |
| "loss": 4.2685, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.2130457055040127, | |
| "eval_accuracy": 0.2922224943254529, | |
| "eval_loss": 4.225285053253174, | |
| "eval_runtime": 885.6768, | |
| "eval_samples_per_second": 563.835, | |
| "eval_steps_per_second": 5.221, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 1.2214696340144573, | |
| "grad_norm": 1.0055943727493286, | |
| "learning_rate": 0.0006041666666666666, | |
| "loss": 4.2639, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 1.229893562524902, | |
| "grad_norm": 0.9747255444526672, | |
| "learning_rate": 0.0006083333333333333, | |
| "loss": 4.2622, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 1.2383174910353463, | |
| "grad_norm": 0.6799793243408203, | |
| "learning_rate": 0.0006125000000000001, | |
| "loss": 4.251, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 1.2467414195457909, | |
| "grad_norm": 0.8863984942436218, | |
| "learning_rate": 0.0006166666666666667, | |
| "loss": 4.2476, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 1.2551653480562355, | |
| "grad_norm": 0.891790509223938, | |
| "learning_rate": 0.0006208333333333334, | |
| "loss": 4.2434, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 1.2635892765666799, | |
| "grad_norm": 0.731626033782959, | |
| "learning_rate": 0.000625, | |
| "loss": 4.233, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.2720132050771245, | |
| "grad_norm": 0.7038396000862122, | |
| "learning_rate": 0.0006291666666666667, | |
| "loss": 4.2264, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 1.280437133587569, | |
| "grad_norm": 1.0247654914855957, | |
| "learning_rate": 0.0006333333333333333, | |
| "loss": 4.2198, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 1.2888610620980137, | |
| "grad_norm": 1.0854212045669556, | |
| "learning_rate": 0.0006374999999999999, | |
| "loss": 4.2126, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.2888610620980137, | |
| "eval_accuracy": 0.2953678601775117, | |
| "eval_loss": 4.152132034301758, | |
| "eval_runtime": 880.7951, | |
| "eval_samples_per_second": 566.96, | |
| "eval_steps_per_second": 5.25, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 1.297284990608458, | |
| "grad_norm": 0.8179611563682556, | |
| "learning_rate": 0.0006416666666666667, | |
| "loss": 4.2081, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 1.3057089191189026, | |
| "grad_norm": 1.4174506664276123, | |
| "learning_rate": 0.0006458333333333334, | |
| "loss": 4.2027, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.314132847629347, | |
| "grad_norm": 1.1611113548278809, | |
| "learning_rate": 0.0006500000000000001, | |
| "loss": 4.1992, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 1.3225567761397916, | |
| "grad_norm": 1.1475598812103271, | |
| "learning_rate": 0.0006541666666666667, | |
| "loss": 4.1875, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.3309807046502362, | |
| "grad_norm": 1.158115267753601, | |
| "learning_rate": 0.0006583333333333334, | |
| "loss": 4.1883, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.3394046331606808, | |
| "grad_norm": 1.325655221939087, | |
| "learning_rate": 0.0006625, | |
| "loss": 4.181, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.3478285616711254, | |
| "grad_norm": 1.077793836593628, | |
| "learning_rate": 0.0006666666666666666, | |
| "loss": 4.1727, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.3562524901815698, | |
| "grad_norm": 1.2139134407043457, | |
| "learning_rate": 0.0006708333333333333, | |
| "loss": 4.1691, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.3646764186920144, | |
| "grad_norm": 1.075778603553772, | |
| "learning_rate": 0.000675, | |
| "loss": 4.1563, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.3646764186920144, | |
| "eval_accuracy": 0.2982954422675167, | |
| "eval_loss": 4.0783562660217285, | |
| "eval_runtime": 880.4076, | |
| "eval_samples_per_second": 567.21, | |
| "eval_steps_per_second": 5.252, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.3731003472024588, | |
| "grad_norm": 1.8017152547836304, | |
| "learning_rate": 0.0006791666666666667, | |
| "loss": 4.1523, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.3815242757129034, | |
| "grad_norm": 1.2614473104476929, | |
| "learning_rate": 0.0006833333333333333, | |
| "loss": 4.1481, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.389948204223348, | |
| "grad_norm": 1.179167628288269, | |
| "learning_rate": 0.0006875, | |
| "loss": 4.1421, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.3983721327337926, | |
| "grad_norm": 1.463998794555664, | |
| "learning_rate": 0.0006916666666666667, | |
| "loss": 4.1331, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.406796061244237, | |
| "grad_norm": 1.086358666419983, | |
| "learning_rate": 0.0006958333333333334, | |
| "loss": 4.1276, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.4152199897546816, | |
| "grad_norm": 1.3272647857666016, | |
| "learning_rate": 0.0007, | |
| "loss": 4.1357, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.4236439182651262, | |
| "grad_norm": 1.4760971069335938, | |
| "learning_rate": 0.0007041666666666667, | |
| "loss": 4.1299, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.4320678467755705, | |
| "grad_norm": 1.7591749429702759, | |
| "learning_rate": 0.0007083333333333334, | |
| "loss": 4.129, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.4404917752860151, | |
| "grad_norm": 1.7945603132247925, | |
| "learning_rate": 0.0007125, | |
| "loss": 4.1221, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.4404917752860151, | |
| "eval_accuracy": 0.3010639405026742, | |
| "eval_loss": 4.012106895446777, | |
| "eval_runtime": 881.7425, | |
| "eval_samples_per_second": 566.351, | |
| "eval_steps_per_second": 5.244, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.4489157037964597, | |
| "grad_norm": 1.7016360759735107, | |
| "learning_rate": 0.0007166666666666667, | |
| "loss": 4.1043, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.4573396323069043, | |
| "grad_norm": 1.8240207433700562, | |
| "learning_rate": 0.0007208333333333333, | |
| "loss": 4.1034, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.4657635608173487, | |
| "grad_norm": 2.4510786533355713, | |
| "learning_rate": 0.000725, | |
| "loss": 4.0924, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.4741874893277933, | |
| "grad_norm": 1.7411324977874756, | |
| "learning_rate": 0.0007291666666666666, | |
| "loss": 4.1041, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.4826114178382377, | |
| "grad_norm": 1.1133612394332886, | |
| "learning_rate": 0.0007333333333333333, | |
| "loss": 4.1064, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.4910353463486823, | |
| "grad_norm": 1.3936740159988403, | |
| "learning_rate": 0.0007375000000000001, | |
| "loss": 4.0954, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.499459274859127, | |
| "grad_norm": 2.3855819702148438, | |
| "learning_rate": 0.0007416666666666667, | |
| "loss": 4.0836, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.5078832033695715, | |
| "grad_norm": 1.2734453678131104, | |
| "learning_rate": 0.0007458333333333334, | |
| "loss": 4.0834, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.516307131880016, | |
| "grad_norm": 1.432719349861145, | |
| "learning_rate": 0.00075, | |
| "loss": 4.0711, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.516307131880016, | |
| "eval_accuracy": 0.3055703004736556, | |
| "eval_loss": 3.976287841796875, | |
| "eval_runtime": 881.3595, | |
| "eval_samples_per_second": 566.597, | |
| "eval_steps_per_second": 5.246, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.5247310603904605, | |
| "grad_norm": 1.5839996337890625, | |
| "learning_rate": 0.0007541666666666667, | |
| "loss": 4.0712, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.5331549889009048, | |
| "grad_norm": 3.0461270809173584, | |
| "learning_rate": 0.0007583333333333333, | |
| "loss": 4.0617, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.5415789174113494, | |
| "grad_norm": 1.760568380355835, | |
| "learning_rate": 0.0007624999999999999, | |
| "loss": 4.0486, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.550002845921794, | |
| "grad_norm": 1.6682184934616089, | |
| "learning_rate": 0.0007666666666666667, | |
| "loss": 4.0034, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.5584267744322386, | |
| "grad_norm": 1.4350653886795044, | |
| "learning_rate": 0.0007708333333333334, | |
| "loss": 3.9644, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.5668507029426832, | |
| "grad_norm": 1.4870712757110596, | |
| "learning_rate": 0.0007750000000000001, | |
| "loss": 3.9314, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.5752746314531276, | |
| "grad_norm": 1.7954463958740234, | |
| "learning_rate": 0.0007791666666666667, | |
| "loss": 3.8939, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.5836985599635722, | |
| "grad_norm": 2.1485602855682373, | |
| "learning_rate": 0.0007833333333333334, | |
| "loss": 3.8576, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.5921224884740166, | |
| "grad_norm": 1.647570252418518, | |
| "learning_rate": 0.0007875, | |
| "loss": 3.8159, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.5921224884740166, | |
| "eval_accuracy": 0.3353472770952767, | |
| "eval_loss": 3.6341910362243652, | |
| "eval_runtime": 881.1424, | |
| "eval_samples_per_second": 566.737, | |
| "eval_steps_per_second": 5.248, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.6005464169844612, | |
| "grad_norm": 1.7171742916107178, | |
| "learning_rate": 0.0007916666666666666, | |
| "loss": 3.7812, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.6089703454949058, | |
| "grad_norm": 2.12190580368042, | |
| "learning_rate": 0.0007958333333333333, | |
| "loss": 3.7402, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.6173942740053504, | |
| "grad_norm": 1.7334414720535278, | |
| "learning_rate": 0.0008, | |
| "loss": 3.7025, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.625818202515795, | |
| "grad_norm": 1.8880668878555298, | |
| "learning_rate": 0.0008041666666666667, | |
| "loss": 3.6808, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.6342421310262394, | |
| "grad_norm": 2.3294591903686523, | |
| "learning_rate": 0.0008083333333333333, | |
| "loss": 3.6419, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.642666059536684, | |
| "grad_norm": 2.4122796058654785, | |
| "learning_rate": 0.0008125000000000001, | |
| "loss": 3.6114, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.6510899880471284, | |
| "grad_norm": 2.090388774871826, | |
| "learning_rate": 0.0008166666666666667, | |
| "loss": 3.5867, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.659513916557573, | |
| "grad_norm": 2.267676830291748, | |
| "learning_rate": 0.0008208333333333334, | |
| "loss": 3.5501, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.6679378450680176, | |
| "grad_norm": 2.253739833831787, | |
| "learning_rate": 0.000825, | |
| "loss": 3.5114, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.6679378450680176, | |
| "eval_accuracy": 0.38861593633258434, | |
| "eval_loss": 3.2597665786743164, | |
| "eval_runtime": 889.3264, | |
| "eval_samples_per_second": 561.522, | |
| "eval_steps_per_second": 5.199, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.6763617735784622, | |
| "grad_norm": 2.269505739212036, | |
| "learning_rate": 0.0008291666666666667, | |
| "loss": 3.4854, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.6847857020889065, | |
| "grad_norm": 1.7237802743911743, | |
| "learning_rate": 0.0008333333333333334, | |
| "loss": 3.4651, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.6932096305993511, | |
| "grad_norm": 2.1117663383483887, | |
| "learning_rate": 0.0008375, | |
| "loss": 3.4558, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 1.7016335591097955, | |
| "grad_norm": 2.1351046562194824, | |
| "learning_rate": 0.0008416666666666667, | |
| "loss": 3.4256, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.7100574876202401, | |
| "grad_norm": 2.326232671737671, | |
| "learning_rate": 0.0008458333333333333, | |
| "loss": 3.3998, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 1.7184814161306847, | |
| "grad_norm": 2.1802730560302734, | |
| "learning_rate": 0.00085, | |
| "loss": 3.3865, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.7269053446411293, | |
| "grad_norm": 2.042966604232788, | |
| "learning_rate": 0.0008541666666666666, | |
| "loss": 3.3539, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.735329273151574, | |
| "grad_norm": 2.052464008331299, | |
| "learning_rate": 0.0008583333333333333, | |
| "loss": 3.3308, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.7437532016620183, | |
| "grad_norm": 1.5790934562683105, | |
| "learning_rate": 0.0008625000000000001, | |
| "loss": 3.3122, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.7437532016620183, | |
| "eval_accuracy": 0.41178756961484836, | |
| "eval_loss": 3.0882680416107178, | |
| "eval_runtime": 878.4742, | |
| "eval_samples_per_second": 568.458, | |
| "eval_steps_per_second": 5.264, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 1.752177130172463, | |
| "grad_norm": 2.2859761714935303, | |
| "learning_rate": 0.0008666666666666667, | |
| "loss": 3.3034, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.7606010586829073, | |
| "grad_norm": 2.912191867828369, | |
| "learning_rate": 0.0008708333333333334, | |
| "loss": 3.289, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 1.7690249871933519, | |
| "grad_norm": 2.143118143081665, | |
| "learning_rate": 0.000875, | |
| "loss": 3.2547, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.7774489157037965, | |
| "grad_norm": 1.8577404022216797, | |
| "learning_rate": 0.0008791666666666667, | |
| "loss": 3.2383, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 1.785872844214241, | |
| "grad_norm": 1.9692562818527222, | |
| "learning_rate": 0.0008833333333333333, | |
| "loss": 3.2137, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.7942967727246857, | |
| "grad_norm": 1.938915729522705, | |
| "learning_rate": 0.0008874999999999999, | |
| "loss": 3.1909, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 1.80272070123513, | |
| "grad_norm": 1.395321011543274, | |
| "learning_rate": 0.0008916666666666667, | |
| "loss": 3.1346, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.8111446297455744, | |
| "grad_norm": 1.8771544694900513, | |
| "learning_rate": 0.0008958333333333334, | |
| "loss": 3.1035, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.819568558256019, | |
| "grad_norm": 1.5829336643218994, | |
| "learning_rate": 0.0009000000000000001, | |
| "loss": 3.0328, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.819568558256019, | |
| "eval_accuracy": 0.45304088376136725, | |
| "eval_loss": 2.8062996864318848, | |
| "eval_runtime": 886.0675, | |
| "eval_samples_per_second": 563.587, | |
| "eval_steps_per_second": 5.219, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.8279924867664636, | |
| "grad_norm": 1.5085866451263428, | |
| "learning_rate": 0.0009041666666666667, | |
| "loss": 3.0089, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 1.8364164152769082, | |
| "grad_norm": 1.4988549947738647, | |
| "learning_rate": 0.0009083333333333334, | |
| "loss": 2.9786, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.8448403437873528, | |
| "grad_norm": 1.5726799964904785, | |
| "learning_rate": 0.0009125, | |
| "loss": 2.936, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 1.8532642722977972, | |
| "grad_norm": 1.2175358533859253, | |
| "learning_rate": 0.0009166666666666666, | |
| "loss": 2.8996, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.8616882008082418, | |
| "grad_norm": 1.4195218086242676, | |
| "learning_rate": 0.0009208333333333333, | |
| "loss": 2.8664, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 1.8701121293186862, | |
| "grad_norm": 1.1213312149047852, | |
| "learning_rate": 0.000925, | |
| "loss": 2.8382, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.8785360578291308, | |
| "grad_norm": 1.169554591178894, | |
| "learning_rate": 0.0009291666666666667, | |
| "loss": 2.8026, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 1.8869599863395754, | |
| "grad_norm": 1.4759305715560913, | |
| "learning_rate": 0.0009333333333333333, | |
| "loss": 2.7654, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.89538391485002, | |
| "grad_norm": 1.3071763515472412, | |
| "learning_rate": 0.0009375, | |
| "loss": 2.7311, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.89538391485002, | |
| "eval_accuracy": 0.4917409385648686, | |
| "eval_loss": 2.5433878898620605, | |
| "eval_runtime": 879.3794, | |
| "eval_samples_per_second": 567.873, | |
| "eval_steps_per_second": 5.258, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.9038078433604646, | |
| "grad_norm": 0.9968194961547852, | |
| "learning_rate": 0.0009416666666666667, | |
| "loss": 2.7044, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.912231771870909, | |
| "grad_norm": 1.1783692836761475, | |
| "learning_rate": 0.0009458333333333334, | |
| "loss": 2.6819, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 1.9206557003813534, | |
| "grad_norm": 0.9856918454170227, | |
| "learning_rate": 0.00095, | |
| "loss": 2.6528, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.929079628891798, | |
| "grad_norm": 1.0605028867721558, | |
| "learning_rate": 0.0009541666666666667, | |
| "loss": 2.6226, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 1.9375035574022426, | |
| "grad_norm": 0.8553977608680725, | |
| "learning_rate": 0.0009583333333333334, | |
| "loss": 2.608, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.9459274859126872, | |
| "grad_norm": 0.9543612599372864, | |
| "learning_rate": 0.0009625, | |
| "loss": 2.5865, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 1.9543514144231318, | |
| "grad_norm": 1.1085282564163208, | |
| "learning_rate": 0.0009666666666666667, | |
| "loss": 2.5586, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.9627753429335761, | |
| "grad_norm": 0.8689624667167664, | |
| "learning_rate": 0.0009708333333333333, | |
| "loss": 2.541, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 1.9711992714440207, | |
| "grad_norm": 0.6790447235107422, | |
| "learning_rate": 0.000975, | |
| "loss": 2.5214, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.9711992714440207, | |
| "eval_accuracy": 0.5198810557311793, | |
| "eval_loss": 2.3582663536071777, | |
| "eval_runtime": 891.4654, | |
| "eval_samples_per_second": 560.174, | |
| "eval_steps_per_second": 5.187, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.9796231999544651, | |
| "grad_norm": 1.1572414636611938, | |
| "learning_rate": 0.0009791666666666666, | |
| "loss": 2.5126, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.9880471284649097, | |
| "grad_norm": 0.8218650221824646, | |
| "learning_rate": 0.0009833333333333332, | |
| "loss": 2.4903, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.9964710569753543, | |
| "grad_norm": 0.9195880889892578, | |
| "learning_rate": 0.0009875, | |
| "loss": 2.479, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 2.004894985485799, | |
| "grad_norm": 0.6436383724212646, | |
| "learning_rate": 0.0009916666666666667, | |
| "loss": 2.4509, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 2.0133189139962435, | |
| "grad_norm": 0.9757860898971558, | |
| "learning_rate": 0.0009958333333333334, | |
| "loss": 2.453, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 2.021742842506688, | |
| "grad_norm": 0.8884423971176147, | |
| "learning_rate": 0.001, | |
| "loss": 2.428, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.0301667710171323, | |
| "grad_norm": 1.097330093383789, | |
| "learning_rate": 0.000999009900990099, | |
| "loss": 2.4139, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 2.038590699527577, | |
| "grad_norm": 1.095337152481079, | |
| "learning_rate": 0.0009980198019801981, | |
| "loss": 2.4024, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 2.0470146280380215, | |
| "grad_norm": 1.0757551193237305, | |
| "learning_rate": 0.000997029702970297, | |
| "loss": 2.3853, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.0470146280380215, | |
| "eval_accuracy": 0.538133837771306, | |
| "eval_loss": 2.2352097034454346, | |
| "eval_runtime": 883.4374, | |
| "eval_samples_per_second": 565.265, | |
| "eval_steps_per_second": 5.234, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 2.055438556548466, | |
| "grad_norm": 0.9356153011322021, | |
| "learning_rate": 0.000996039603960396, | |
| "loss": 2.3669, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 2.0638624850589107, | |
| "grad_norm": 0.8463107347488403, | |
| "learning_rate": 0.000995049504950495, | |
| "loss": 2.3604, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 2.0722864135693553, | |
| "grad_norm": 0.8833483457565308, | |
| "learning_rate": 0.0009940594059405941, | |
| "loss": 2.3574, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 2.0807103420797994, | |
| "grad_norm": 0.7081923484802246, | |
| "learning_rate": 0.0009930693069306932, | |
| "loss": 2.3338, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 2.089134270590244, | |
| "grad_norm": 0.5993143916130066, | |
| "learning_rate": 0.000992079207920792, | |
| "loss": 2.3219, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 2.0975581991006886, | |
| "grad_norm": 0.8431512117385864, | |
| "learning_rate": 0.000991089108910891, | |
| "loss": 2.3108, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 2.1059821276111332, | |
| "grad_norm": 0.9983824491500854, | |
| "learning_rate": 0.0009900990099009901, | |
| "loss": 2.305, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.114406056121578, | |
| "grad_norm": 0.6354156732559204, | |
| "learning_rate": 0.0009891089108910892, | |
| "loss": 2.2965, | |
| "step": 2510 | |
| }, | |
| { | |
| "epoch": 2.1228299846320224, | |
| "grad_norm": 0.8491016626358032, | |
| "learning_rate": 0.0009881188118811882, | |
| "loss": 2.2763, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.1228299846320224, | |
| "eval_accuracy": 0.5540495533549666, | |
| "eval_loss": 2.135758399963379, | |
| "eval_runtime": 895.5557, | |
| "eval_samples_per_second": 557.616, | |
| "eval_steps_per_second": 5.163, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 2.131253913142467, | |
| "grad_norm": 0.6909253001213074, | |
| "learning_rate": 0.000987128712871287, | |
| "loss": 2.2696, | |
| "step": 2530 | |
| }, | |
| { | |
| "epoch": 2.139677841652911, | |
| "grad_norm": 0.5072851181030273, | |
| "learning_rate": 0.000986138613861386, | |
| "loss": 2.2555, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 2.148101770163356, | |
| "grad_norm": 0.7575969696044922, | |
| "learning_rate": 0.0009851485148514852, | |
| "loss": 2.2552, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 2.1565256986738004, | |
| "grad_norm": 0.7418563365936279, | |
| "learning_rate": 0.0009841584158415842, | |
| "loss": 2.2439, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 2.164949627184245, | |
| "grad_norm": 0.5893211960792542, | |
| "learning_rate": 0.0009831683168316833, | |
| "loss": 2.2282, | |
| "step": 2570 | |
| }, | |
| { | |
| "epoch": 2.1733735556946896, | |
| "grad_norm": 0.892035186290741, | |
| "learning_rate": 0.000982178217821782, | |
| "loss": 2.2201, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 2.181797484205134, | |
| "grad_norm": 0.688275933265686, | |
| "learning_rate": 0.0009811881188118811, | |
| "loss": 2.2174, | |
| "step": 2590 | |
| }, | |
| { | |
| "epoch": 2.1902214127155784, | |
| "grad_norm": 0.5092687010765076, | |
| "learning_rate": 0.0009801980198019802, | |
| "loss": 2.2032, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.198645341226023, | |
| "grad_norm": 0.6715185642242432, | |
| "learning_rate": 0.0009792079207920793, | |
| "loss": 2.189, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.198645341226023, | |
| "eval_accuracy": 0.5674450081410035, | |
| "eval_loss": 2.053079605102539, | |
| "eval_runtime": 876.7453, | |
| "eval_samples_per_second": 569.579, | |
| "eval_steps_per_second": 5.274, | |
| "step": 2610 | |
| }, | |
| { | |
| "epoch": 2.2070692697364676, | |
| "grad_norm": 0.5717750191688538, | |
| "learning_rate": 0.0009782178217821783, | |
| "loss": 2.1894, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 2.215493198246912, | |
| "grad_norm": 0.7002500295639038, | |
| "learning_rate": 0.0009772277227722771, | |
| "loss": 2.1851, | |
| "step": 2630 | |
| }, | |
| { | |
| "epoch": 2.2239171267573568, | |
| "grad_norm": 0.6041799783706665, | |
| "learning_rate": 0.0009762376237623762, | |
| "loss": 2.1899, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 2.2323410552678014, | |
| "grad_norm": 0.40263745188713074, | |
| "learning_rate": 0.0009752475247524752, | |
| "loss": 2.1633, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 2.240764983778246, | |
| "grad_norm": 0.47779303789138794, | |
| "learning_rate": 0.0009742574257425743, | |
| "loss": 2.1478, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 2.24918891228869, | |
| "grad_norm": 0.8906975984573364, | |
| "learning_rate": 0.0009732673267326732, | |
| "loss": 2.1508, | |
| "step": 2670 | |
| }, | |
| { | |
| "epoch": 2.2576128407991347, | |
| "grad_norm": 0.4588846266269684, | |
| "learning_rate": 0.0009722772277227723, | |
| "loss": 2.1422, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 2.2660367693095793, | |
| "grad_norm": 0.6038916707038879, | |
| "learning_rate": 0.0009712871287128712, | |
| "loss": 2.1229, | |
| "step": 2690 | |
| }, | |
| { | |
| "epoch": 2.274460697820024, | |
| "grad_norm": 0.792378842830658, | |
| "learning_rate": 0.0009702970297029703, | |
| "loss": 2.1262, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.274460697820024, | |
| "eval_accuracy": 0.5767164906847645, | |
| "eval_loss": 1.9968212842941284, | |
| "eval_runtime": 890.0794, | |
| "eval_samples_per_second": 561.047, | |
| "eval_steps_per_second": 5.195, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.2828846263304685, | |
| "grad_norm": 0.5215600728988647, | |
| "learning_rate": 0.0009693069306930693, | |
| "loss": 2.1315, | |
| "step": 2710 | |
| }, | |
| { | |
| "epoch": 2.291308554840913, | |
| "grad_norm": 0.42443060874938965, | |
| "learning_rate": 0.0009683168316831683, | |
| "loss": 2.1075, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 2.2997324833513577, | |
| "grad_norm": 0.7379765510559082, | |
| "learning_rate": 0.0009673267326732673, | |
| "loss": 2.0997, | |
| "step": 2730 | |
| }, | |
| { | |
| "epoch": 2.308156411861802, | |
| "grad_norm": 0.532883882522583, | |
| "learning_rate": 0.0009663366336633663, | |
| "loss": 2.1009, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 2.3165803403722465, | |
| "grad_norm": 0.4312550127506256, | |
| "learning_rate": 0.0009653465346534653, | |
| "loss": 2.0836, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 2.325004268882691, | |
| "grad_norm": 0.42506101727485657, | |
| "learning_rate": 0.0009643564356435644, | |
| "loss": 2.0751, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 2.3334281973931357, | |
| "grad_norm": 0.9728929400444031, | |
| "learning_rate": 0.0009633663366336633, | |
| "loss": 2.0755, | |
| "step": 2770 | |
| }, | |
| { | |
| "epoch": 2.3418521259035803, | |
| "grad_norm": 0.4502295255661011, | |
| "learning_rate": 0.0009623762376237624, | |
| "loss": 2.0757, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 2.350276054414025, | |
| "grad_norm": 0.6825786232948303, | |
| "learning_rate": 0.0009613861386138613, | |
| "loss": 2.0593, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.350276054414025, | |
| "eval_accuracy": 0.5877788692302428, | |
| "eval_loss": 1.932070255279541, | |
| "eval_runtime": 877.2049, | |
| "eval_samples_per_second": 569.281, | |
| "eval_steps_per_second": 5.271, | |
| "step": 2790 | |
| }, | |
| { | |
| "epoch": 2.3586999829244695, | |
| "grad_norm": 0.5142760276794434, | |
| "learning_rate": 0.0009603960396039604, | |
| "loss": 2.0529, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.3671239114349136, | |
| "grad_norm": 0.613132119178772, | |
| "learning_rate": 0.0009594059405940594, | |
| "loss": 2.0423, | |
| "step": 2810 | |
| }, | |
| { | |
| "epoch": 2.3755478399453582, | |
| "grad_norm": 0.7282253503799438, | |
| "learning_rate": 0.0009584158415841584, | |
| "loss": 2.0522, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 2.383971768455803, | |
| "grad_norm": 0.37959426641464233, | |
| "learning_rate": 0.0009574257425742574, | |
| "loss": 2.0367, | |
| "step": 2830 | |
| }, | |
| { | |
| "epoch": 2.3923956969662474, | |
| "grad_norm": 0.35326164960861206, | |
| "learning_rate": 0.0009564356435643564, | |
| "loss": 2.0233, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 2.400819625476692, | |
| "grad_norm": 0.8196151256561279, | |
| "learning_rate": 0.0009554455445544554, | |
| "loss": 2.0264, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 2.409243553987136, | |
| "grad_norm": 0.7122208476066589, | |
| "learning_rate": 0.0009544554455445545, | |
| "loss": 2.0308, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 2.417667482497581, | |
| "grad_norm": 0.35665011405944824, | |
| "learning_rate": 0.0009534653465346534, | |
| "loss": 2.0133, | |
| "step": 2870 | |
| }, | |
| { | |
| "epoch": 2.4260914110080254, | |
| "grad_norm": 0.3755228519439697, | |
| "learning_rate": 0.0009524752475247525, | |
| "loss": 1.9992, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.4260914110080254, | |
| "eval_accuracy": 0.596780331496744, | |
| "eval_loss": 1.8819479942321777, | |
| "eval_runtime": 890.4504, | |
| "eval_samples_per_second": 560.813, | |
| "eval_steps_per_second": 5.193, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 2.43451533951847, | |
| "grad_norm": 0.7018378376960754, | |
| "learning_rate": 0.0009514851485148514, | |
| "loss": 2.0013, | |
| "step": 2890 | |
| }, | |
| { | |
| "epoch": 2.4429392680289146, | |
| "grad_norm": 0.4874301850795746, | |
| "learning_rate": 0.0009504950495049505, | |
| "loss": 1.9971, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 2.451363196539359, | |
| "grad_norm": 0.45909377932548523, | |
| "learning_rate": 0.0009495049504950495, | |
| "loss": 1.9881, | |
| "step": 2910 | |
| }, | |
| { | |
| "epoch": 2.459787125049804, | |
| "grad_norm": 0.4965904951095581, | |
| "learning_rate": 0.0009485148514851485, | |
| "loss": 1.989, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 2.468211053560248, | |
| "grad_norm": 0.4780527949333191, | |
| "learning_rate": 0.0009475247524752475, | |
| "loss": 1.9795, | |
| "step": 2930 | |
| }, | |
| { | |
| "epoch": 2.4766349820706925, | |
| "grad_norm": 0.5145118236541748, | |
| "learning_rate": 0.0009465346534653465, | |
| "loss": 1.973, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 2.485058910581137, | |
| "grad_norm": 0.5469622015953064, | |
| "learning_rate": 0.0009455445544554455, | |
| "loss": 1.9692, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 2.4934828390915817, | |
| "grad_norm": 0.5788788199424744, | |
| "learning_rate": 0.0009445544554455446, | |
| "loss": 1.9627, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 2.5019067676020263, | |
| "grad_norm": 0.5380696654319763, | |
| "learning_rate": 0.0009435643564356435, | |
| "loss": 1.9624, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.5019067676020263, | |
| "eval_accuracy": 0.6028271764812113, | |
| "eval_loss": 1.8441975116729736, | |
| "eval_runtime": 877.1334, | |
| "eval_samples_per_second": 569.327, | |
| "eval_steps_per_second": 5.272, | |
| "step": 2970 | |
| }, | |
| { | |
| "epoch": 2.510330696112471, | |
| "grad_norm": 0.4939862787723541, | |
| "learning_rate": 0.0009425742574257426, | |
| "loss": 1.9576, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 2.5187546246229155, | |
| "grad_norm": 0.4804815649986267, | |
| "learning_rate": 0.0009415841584158415, | |
| "loss": 1.948, | |
| "step": 2990 | |
| }, | |
| { | |
| "epoch": 2.5271785531333597, | |
| "grad_norm": 0.529515266418457, | |
| "learning_rate": 0.0009405940594059406, | |
| "loss": 1.9414, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.5356024816438043, | |
| "grad_norm": 0.5104151964187622, | |
| "learning_rate": 0.0009396039603960396, | |
| "loss": 1.9472, | |
| "step": 3010 | |
| }, | |
| { | |
| "epoch": 2.544026410154249, | |
| "grad_norm": 0.36934202909469604, | |
| "learning_rate": 0.0009386138613861386, | |
| "loss": 1.9358, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 2.5524503386646935, | |
| "grad_norm": 0.5956403017044067, | |
| "learning_rate": 0.0009376237623762376, | |
| "loss": 1.9272, | |
| "step": 3030 | |
| }, | |
| { | |
| "epoch": 2.560874267175138, | |
| "grad_norm": 0.5035738348960876, | |
| "learning_rate": 0.0009366336633663367, | |
| "loss": 1.934, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 2.5692981956855827, | |
| "grad_norm": 0.44133296608924866, | |
| "learning_rate": 0.0009356435643564357, | |
| "loss": 1.9192, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 2.5777221241960273, | |
| "grad_norm": 0.617588996887207, | |
| "learning_rate": 0.0009346534653465348, | |
| "loss": 1.9189, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.5777221241960273, | |
| "eval_accuracy": 0.6097417836200192, | |
| "eval_loss": 1.806692123413086, | |
| "eval_runtime": 890.173, | |
| "eval_samples_per_second": 560.988, | |
| "eval_steps_per_second": 5.194, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 2.5861460527064715, | |
| "grad_norm": 0.4702962338924408, | |
| "learning_rate": 0.0009336633663366337, | |
| "loss": 1.9145, | |
| "step": 3070 | |
| }, | |
| { | |
| "epoch": 2.594569981216916, | |
| "grad_norm": 0.37163108587265015, | |
| "learning_rate": 0.0009326732673267328, | |
| "loss": 1.907, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 2.6029939097273607, | |
| "grad_norm": 0.8039525151252747, | |
| "learning_rate": 0.0009316831683168317, | |
| "loss": 1.9071, | |
| "step": 3090 | |
| }, | |
| { | |
| "epoch": 2.6114178382378053, | |
| "grad_norm": 0.3594844341278076, | |
| "learning_rate": 0.0009306930693069308, | |
| "loss": 1.9109, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 2.61984176674825, | |
| "grad_norm": 0.44677871465682983, | |
| "learning_rate": 0.0009297029702970298, | |
| "loss": 1.8948, | |
| "step": 3110 | |
| }, | |
| { | |
| "epoch": 2.628265695258694, | |
| "grad_norm": 0.4496874511241913, | |
| "learning_rate": 0.0009287128712871288, | |
| "loss": 1.893, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 2.636689623769139, | |
| "grad_norm": 0.44437769055366516, | |
| "learning_rate": 0.0009277227722772278, | |
| "loss": 1.8891, | |
| "step": 3130 | |
| }, | |
| { | |
| "epoch": 2.6451135522795832, | |
| "grad_norm": 0.47511276602745056, | |
| "learning_rate": 0.0009267326732673268, | |
| "loss": 1.8828, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 2.653537480790028, | |
| "grad_norm": 0.5357436537742615, | |
| "learning_rate": 0.0009257425742574258, | |
| "loss": 1.8802, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.653537480790028, | |
| "eval_accuracy": 0.6167399590165771, | |
| "eval_loss": 1.7698620557785034, | |
| "eval_runtime": 887.5592, | |
| "eval_samples_per_second": 562.64, | |
| "eval_steps_per_second": 5.21, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.6619614093004724, | |
| "grad_norm": 0.5014392137527466, | |
| "learning_rate": 0.0009247524752475249, | |
| "loss": 1.8819, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 2.670385337810917, | |
| "grad_norm": 0.41872531175613403, | |
| "learning_rate": 0.0009237623762376238, | |
| "loss": 1.8736, | |
| "step": 3170 | |
| }, | |
| { | |
| "epoch": 2.6788092663213616, | |
| "grad_norm": 0.4343492388725281, | |
| "learning_rate": 0.0009227722772277229, | |
| "loss": 1.8659, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 2.687233194831806, | |
| "grad_norm": 0.45470404624938965, | |
| "learning_rate": 0.0009217821782178218, | |
| "loss": 1.8689, | |
| "step": 3190 | |
| }, | |
| { | |
| "epoch": 2.695657123342251, | |
| "grad_norm": 0.4626518487930298, | |
| "learning_rate": 0.0009207920792079209, | |
| "loss": 1.8606, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.704081051852695, | |
| "grad_norm": 0.4213305711746216, | |
| "learning_rate": 0.0009198019801980199, | |
| "loss": 1.8587, | |
| "step": 3210 | |
| }, | |
| { | |
| "epoch": 2.7125049803631396, | |
| "grad_norm": 0.5036765336990356, | |
| "learning_rate": 0.0009188118811881188, | |
| "loss": 1.8514, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 2.720928908873584, | |
| "grad_norm": 0.4738876223564148, | |
| "learning_rate": 0.0009178217821782179, | |
| "loss": 1.8506, | |
| "step": 3230 | |
| }, | |
| { | |
| "epoch": 2.729352837384029, | |
| "grad_norm": 0.3712784945964813, | |
| "learning_rate": 0.0009168316831683168, | |
| "loss": 1.8461, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.729352837384029, | |
| "eval_accuracy": 0.6231111347423419, | |
| "eval_loss": 1.7313838005065918, | |
| "eval_runtime": 889.784, | |
| "eval_samples_per_second": 561.233, | |
| "eval_steps_per_second": 5.197, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 2.7377767658944734, | |
| "grad_norm": 0.45651596784591675, | |
| "learning_rate": 0.0009158415841584159, | |
| "loss": 1.8405, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.7462006944049175, | |
| "grad_norm": 0.5253742933273315, | |
| "learning_rate": 0.000914851485148515, | |
| "loss": 1.839, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 2.754624622915362, | |
| "grad_norm": 0.4810900390148163, | |
| "learning_rate": 0.0009138613861386139, | |
| "loss": 1.8352, | |
| "step": 3270 | |
| }, | |
| { | |
| "epoch": 2.7630485514258067, | |
| "grad_norm": 0.42353251576423645, | |
| "learning_rate": 0.0009128712871287129, | |
| "loss": 1.8308, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 2.7714724799362513, | |
| "grad_norm": 0.34494903683662415, | |
| "learning_rate": 0.0009118811881188119, | |
| "loss": 1.8271, | |
| "step": 3290 | |
| }, | |
| { | |
| "epoch": 2.779896408446696, | |
| "grad_norm": 0.44857293367385864, | |
| "learning_rate": 0.0009108910891089109, | |
| "loss": 1.8272, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.7883203369571405, | |
| "grad_norm": 0.32810303568840027, | |
| "learning_rate": 0.00090990099009901, | |
| "loss": 1.8201, | |
| "step": 3310 | |
| }, | |
| { | |
| "epoch": 2.796744265467585, | |
| "grad_norm": 0.5814313292503357, | |
| "learning_rate": 0.0009089108910891089, | |
| "loss": 1.8181, | |
| "step": 3320 | |
| }, | |
| { | |
| "epoch": 2.8051681939780293, | |
| "grad_norm": 0.6469531655311584, | |
| "learning_rate": 0.000907920792079208, | |
| "loss": 1.8228, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.8051681939780293, | |
| "eval_accuracy": 0.627194729904968, | |
| "eval_loss": 1.7094751596450806, | |
| "eval_runtime": 879.8799, | |
| "eval_samples_per_second": 567.55, | |
| "eval_steps_per_second": 5.255, | |
| "step": 3330 | |
| }, | |
| { | |
| "epoch": 2.813592122488474, | |
| "grad_norm": 0.37370234727859497, | |
| "learning_rate": 0.0009069306930693069, | |
| "loss": 1.8143, | |
| "step": 3340 | |
| }, | |
| { | |
| "epoch": 2.8220160509989185, | |
| "grad_norm": 0.2818905711174011, | |
| "learning_rate": 0.000905940594059406, | |
| "loss": 1.8058, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.830439979509363, | |
| "grad_norm": 0.40032240748405457, | |
| "learning_rate": 0.000904950495049505, | |
| "loss": 1.8037, | |
| "step": 3360 | |
| }, | |
| { | |
| "epoch": 2.8388639080198077, | |
| "grad_norm": 0.4075703024864197, | |
| "learning_rate": 0.000903960396039604, | |
| "loss": 1.8042, | |
| "step": 3370 | |
| }, | |
| { | |
| "epoch": 2.8472878365302523, | |
| "grad_norm": 0.4188884496688843, | |
| "learning_rate": 0.000902970297029703, | |
| "loss": 1.7954, | |
| "step": 3380 | |
| }, | |
| { | |
| "epoch": 2.855711765040697, | |
| "grad_norm": 0.40151095390319824, | |
| "learning_rate": 0.000901980198019802, | |
| "loss": 1.8, | |
| "step": 3390 | |
| }, | |
| { | |
| "epoch": 2.864135693551141, | |
| "grad_norm": 0.38640516996383667, | |
| "learning_rate": 0.000900990099009901, | |
| "loss": 1.7897, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.8725596220615857, | |
| "grad_norm": 0.46775710582733154, | |
| "learning_rate": 0.0009000000000000001, | |
| "loss": 1.7889, | |
| "step": 3410 | |
| }, | |
| { | |
| "epoch": 2.8809835505720303, | |
| "grad_norm": 0.5004317760467529, | |
| "learning_rate": 0.000899009900990099, | |
| "loss": 1.7838, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.8809835505720303, | |
| "eval_accuracy": 0.6330453392339891, | |
| "eval_loss": 1.6756778955459595, | |
| "eval_runtime": 890.43, | |
| "eval_samples_per_second": 560.826, | |
| "eval_steps_per_second": 5.193, | |
| "step": 3420 | |
| }, | |
| { | |
| "epoch": 2.889407479082475, | |
| "grad_norm": 0.44054290652275085, | |
| "learning_rate": 0.0008980198019801981, | |
| "loss": 1.7839, | |
| "step": 3430 | |
| }, | |
| { | |
| "epoch": 2.8978314075929195, | |
| "grad_norm": 0.38003844022750854, | |
| "learning_rate": 0.000897029702970297, | |
| "loss": 1.7793, | |
| "step": 3440 | |
| }, | |
| { | |
| "epoch": 2.9062553361033636, | |
| "grad_norm": 0.3714471757411957, | |
| "learning_rate": 0.0008960396039603961, | |
| "loss": 1.7765, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.9146792646138087, | |
| "grad_norm": 0.4955293834209442, | |
| "learning_rate": 0.0008950495049504951, | |
| "loss": 1.7729, | |
| "step": 3460 | |
| }, | |
| { | |
| "epoch": 2.923103193124253, | |
| "grad_norm": 0.367481529712677, | |
| "learning_rate": 0.0008940594059405941, | |
| "loss": 1.7666, | |
| "step": 3470 | |
| }, | |
| { | |
| "epoch": 2.9315271216346974, | |
| "grad_norm": 0.48372742533683777, | |
| "learning_rate": 0.0008930693069306931, | |
| "loss": 1.7638, | |
| "step": 3480 | |
| }, | |
| { | |
| "epoch": 2.939951050145142, | |
| "grad_norm": 0.5356625318527222, | |
| "learning_rate": 0.0008920792079207921, | |
| "loss": 1.7625, | |
| "step": 3490 | |
| }, | |
| { | |
| "epoch": 2.9483749786555866, | |
| "grad_norm": 0.396090030670166, | |
| "learning_rate": 0.0008910891089108911, | |
| "loss": 1.7597, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.956798907166031, | |
| "grad_norm": 0.3071458041667938, | |
| "learning_rate": 0.0008900990099009902, | |
| "loss": 1.7513, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.956798907166031, | |
| "eval_accuracy": 0.640630813225039, | |
| "eval_loss": 1.6351577043533325, | |
| "eval_runtime": 887.1061, | |
| "eval_samples_per_second": 562.927, | |
| "eval_steps_per_second": 5.212, | |
| "step": 3510 | |
| }, | |
| { | |
| "epoch": 2.9652228356764754, | |
| "grad_norm": 0.7265316247940063, | |
| "learning_rate": 0.0008891089108910891, | |
| "loss": 1.7482, | |
| "step": 3520 | |
| }, | |
| { | |
| "epoch": 2.97364676418692, | |
| "grad_norm": 0.34152501821517944, | |
| "learning_rate": 0.0008881188118811882, | |
| "loss": 1.7454, | |
| "step": 3530 | |
| }, | |
| { | |
| "epoch": 2.9820706926973646, | |
| "grad_norm": 0.5570985078811646, | |
| "learning_rate": 0.0008871287128712871, | |
| "loss": 1.736, | |
| "step": 3540 | |
| }, | |
| { | |
| "epoch": 2.990494621207809, | |
| "grad_norm": 0.29268133640289307, | |
| "learning_rate": 0.0008861386138613862, | |
| "loss": 1.7323, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.998918549718254, | |
| "grad_norm": 0.4475082755088806, | |
| "learning_rate": 0.0008851485148514852, | |
| "loss": 1.7207, | |
| "step": 3560 | |
| }, | |
| { | |
| "epoch": 3.0073424782286984, | |
| "grad_norm": 0.39963921904563904, | |
| "learning_rate": 0.0008841584158415842, | |
| "loss": 1.7199, | |
| "step": 3570 | |
| }, | |
| { | |
| "epoch": 3.015766406739143, | |
| "grad_norm": 0.3290662169456482, | |
| "learning_rate": 0.0008831683168316832, | |
| "loss": 1.7103, | |
| "step": 3580 | |
| }, | |
| { | |
| "epoch": 3.024190335249587, | |
| "grad_norm": 0.4892579913139343, | |
| "learning_rate": 0.0008821782178217822, | |
| "loss": 1.7024, | |
| "step": 3590 | |
| }, | |
| { | |
| "epoch": 3.0326142637600317, | |
| "grad_norm": 0.45102205872535706, | |
| "learning_rate": 0.0008811881188118812, | |
| "loss": 1.7012, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 3.0326142637600317, | |
| "eval_accuracy": 0.65292687328356, | |
| "eval_loss": 1.578561544418335, | |
| "eval_runtime": 889.1801, | |
| "eval_samples_per_second": 561.614, | |
| "eval_steps_per_second": 5.2, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 3.0410381922704763, | |
| "grad_norm": 0.38877975940704346, | |
| "learning_rate": 0.0008801980198019803, | |
| "loss": 1.6999, | |
| "step": 3610 | |
| }, | |
| { | |
| "epoch": 3.049462120780921, | |
| "grad_norm": 0.32052722573280334, | |
| "learning_rate": 0.0008792079207920792, | |
| "loss": 1.6898, | |
| "step": 3620 | |
| }, | |
| { | |
| "epoch": 3.0578860492913655, | |
| "grad_norm": 0.4076586365699768, | |
| "learning_rate": 0.0008782178217821783, | |
| "loss": 1.682, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 3.06630997780181, | |
| "grad_norm": 0.3886164724826813, | |
| "learning_rate": 0.0008772277227722772, | |
| "loss": 1.6788, | |
| "step": 3640 | |
| }, | |
| { | |
| "epoch": 3.0747339063122547, | |
| "grad_norm": 0.43478402495384216, | |
| "learning_rate": 0.0008762376237623763, | |
| "loss": 1.6757, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 3.083157834822699, | |
| "grad_norm": 0.3681798279285431, | |
| "learning_rate": 0.0008752475247524753, | |
| "loss": 1.6725, | |
| "step": 3660 | |
| }, | |
| { | |
| "epoch": 3.0915817633331435, | |
| "grad_norm": 0.44459056854248047, | |
| "learning_rate": 0.0008742574257425743, | |
| "loss": 1.6653, | |
| "step": 3670 | |
| }, | |
| { | |
| "epoch": 3.100005691843588, | |
| "grad_norm": 0.3404163420200348, | |
| "learning_rate": 0.0008732673267326733, | |
| "loss": 1.6597, | |
| "step": 3680 | |
| }, | |
| { | |
| "epoch": 3.1084296203540327, | |
| "grad_norm": 0.39622583985328674, | |
| "learning_rate": 0.0008722772277227722, | |
| "loss": 1.664, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 3.1084296203540327, | |
| "eval_accuracy": 0.6616252383451875, | |
| "eval_loss": 1.5378377437591553, | |
| "eval_runtime": 880.004, | |
| "eval_samples_per_second": 567.47, | |
| "eval_steps_per_second": 5.255, | |
| "step": 3690 | |
| }, | |
| { | |
| "epoch": 3.1168535488644773, | |
| "grad_norm": 0.36066505312919617, | |
| "learning_rate": 0.0008712871287128713, | |
| "loss": 1.6552, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 3.125277477374922, | |
| "grad_norm": 0.45852380990982056, | |
| "learning_rate": 0.0008702970297029704, | |
| "loss": 1.6581, | |
| "step": 3710 | |
| }, | |
| { | |
| "epoch": 3.1337014058853665, | |
| "grad_norm": 0.3647266924381256, | |
| "learning_rate": 0.0008693069306930693, | |
| "loss": 1.6493, | |
| "step": 3720 | |
| }, | |
| { | |
| "epoch": 3.1421253343958107, | |
| "grad_norm": 0.4774695038795471, | |
| "learning_rate": 0.0008683168316831684, | |
| "loss": 1.6457, | |
| "step": 3730 | |
| }, | |
| { | |
| "epoch": 3.1505492629062553, | |
| "grad_norm": 0.4143640398979187, | |
| "learning_rate": 0.0008673267326732673, | |
| "loss": 1.6436, | |
| "step": 3740 | |
| }, | |
| { | |
| "epoch": 3.1589731914167, | |
| "grad_norm": 0.4920789897441864, | |
| "learning_rate": 0.0008663366336633663, | |
| "loss": 1.6431, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 3.1673971199271445, | |
| "grad_norm": 0.40231600403785706, | |
| "learning_rate": 0.0008653465346534654, | |
| "loss": 1.6373, | |
| "step": 3760 | |
| }, | |
| { | |
| "epoch": 3.175821048437589, | |
| "grad_norm": 0.35115131735801697, | |
| "learning_rate": 0.0008643564356435643, | |
| "loss": 1.6343, | |
| "step": 3770 | |
| }, | |
| { | |
| "epoch": 3.1842449769480337, | |
| "grad_norm": 0.3814195990562439, | |
| "learning_rate": 0.0008633663366336634, | |
| "loss": 1.6345, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 3.1842449769480337, | |
| "eval_accuracy": 0.6669776046149977, | |
| "eval_loss": 1.5131778717041016, | |
| "eval_runtime": 887.9268, | |
| "eval_samples_per_second": 562.407, | |
| "eval_steps_per_second": 5.208, | |
| "step": 3780 | |
| }, | |
| { | |
| "epoch": 3.192668905458478, | |
| "grad_norm": 0.3229101896286011, | |
| "learning_rate": 0.0008623762376237623, | |
| "loss": 1.6281, | |
| "step": 3790 | |
| }, | |
| { | |
| "epoch": 3.2010928339689224, | |
| "grad_norm": 0.4361475110054016, | |
| "learning_rate": 0.0008613861386138614, | |
| "loss": 1.6253, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 3.209516762479367, | |
| "grad_norm": 0.3246362507343292, | |
| "learning_rate": 0.0008603960396039604, | |
| "loss": 1.6269, | |
| "step": 3810 | |
| }, | |
| { | |
| "epoch": 3.2179406909898116, | |
| "grad_norm": 0.5126762390136719, | |
| "learning_rate": 0.0008594059405940594, | |
| "loss": 1.62, | |
| "step": 3820 | |
| }, | |
| { | |
| "epoch": 3.226364619500256, | |
| "grad_norm": 0.3813638389110565, | |
| "learning_rate": 0.0008584158415841584, | |
| "loss": 1.6228, | |
| "step": 3830 | |
| }, | |
| { | |
| "epoch": 3.234788548010701, | |
| "grad_norm": 0.5111351013183594, | |
| "learning_rate": 0.0008574257425742574, | |
| "loss": 1.6162, | |
| "step": 3840 | |
| }, | |
| { | |
| "epoch": 3.243212476521145, | |
| "grad_norm": 0.3448195457458496, | |
| "learning_rate": 0.0008564356435643564, | |
| "loss": 1.6156, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 3.2516364050315896, | |
| "grad_norm": 0.50129634141922, | |
| "learning_rate": 0.0008554455445544555, | |
| "loss": 1.6153, | |
| "step": 3860 | |
| }, | |
| { | |
| "epoch": 3.260060333542034, | |
| "grad_norm": 0.3352351188659668, | |
| "learning_rate": 0.0008544554455445544, | |
| "loss": 1.6117, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 3.260060333542034, | |
| "eval_accuracy": 0.6717362607348063, | |
| "eval_loss": 1.4890562295913696, | |
| "eval_runtime": 886.1465, | |
| "eval_samples_per_second": 563.537, | |
| "eval_steps_per_second": 5.218, | |
| "step": 3870 | |
| }, | |
| { | |
| "epoch": 3.2684842620524788, | |
| "grad_norm": 0.38713541626930237, | |
| "learning_rate": 0.0008534653465346535, | |
| "loss": 1.6058, | |
| "step": 3880 | |
| }, | |
| { | |
| "epoch": 3.2769081905629234, | |
| "grad_norm": 0.46299123764038086, | |
| "learning_rate": 0.0008524752475247524, | |
| "loss": 1.6053, | |
| "step": 3890 | |
| }, | |
| { | |
| "epoch": 3.285332119073368, | |
| "grad_norm": 0.4045964181423187, | |
| "learning_rate": 0.0008514851485148515, | |
| "loss": 1.6064, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 3.2937560475838126, | |
| "grad_norm": 0.37616729736328125, | |
| "learning_rate": 0.0008504950495049505, | |
| "loss": 1.6005, | |
| "step": 3910 | |
| }, | |
| { | |
| "epoch": 3.3021799760942567, | |
| "grad_norm": 0.47833314538002014, | |
| "learning_rate": 0.0008495049504950495, | |
| "loss": 1.599, | |
| "step": 3920 | |
| }, | |
| { | |
| "epoch": 3.3106039046047013, | |
| "grad_norm": 0.436625212430954, | |
| "learning_rate": 0.0008485148514851485, | |
| "loss": 1.5954, | |
| "step": 3930 | |
| }, | |
| { | |
| "epoch": 3.319027833115146, | |
| "grad_norm": 0.3456842005252838, | |
| "learning_rate": 0.0008475247524752475, | |
| "loss": 1.5924, | |
| "step": 3940 | |
| }, | |
| { | |
| "epoch": 3.3274517616255905, | |
| "grad_norm": 0.5403941869735718, | |
| "learning_rate": 0.0008465346534653465, | |
| "loss": 1.5915, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 3.335875690136035, | |
| "grad_norm": 0.3622403144836426, | |
| "learning_rate": 0.0008455445544554456, | |
| "loss": 1.6013, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 3.335875690136035, | |
| "eval_accuracy": 0.6740560565861919, | |
| "eval_loss": 1.475487232208252, | |
| "eval_runtime": 895.3114, | |
| "eval_samples_per_second": 557.768, | |
| "eval_steps_per_second": 5.165, | |
| "step": 3960 | |
| }, | |
| { | |
| "epoch": 3.3442996186464797, | |
| "grad_norm": 0.2850242555141449, | |
| "learning_rate": 0.0008445544554455445, | |
| "loss": 1.5903, | |
| "step": 3970 | |
| }, | |
| { | |
| "epoch": 3.3527235471569243, | |
| "grad_norm": 0.39831429719924927, | |
| "learning_rate": 0.0008435643564356436, | |
| "loss": 1.5846, | |
| "step": 3980 | |
| }, | |
| { | |
| "epoch": 3.3611474756673685, | |
| "grad_norm": 0.4886794686317444, | |
| "learning_rate": 0.0008425742574257425, | |
| "loss": 1.5876, | |
| "step": 3990 | |
| }, | |
| { | |
| "epoch": 3.369571404177813, | |
| "grad_norm": 0.35439977049827576, | |
| "learning_rate": 0.0008415841584158416, | |
| "loss": 1.5839, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.3779953326882577, | |
| "grad_norm": 0.32369595766067505, | |
| "learning_rate": 0.0008405940594059406, | |
| "loss": 1.5797, | |
| "step": 4010 | |
| }, | |
| { | |
| "epoch": 3.3864192611987023, | |
| "grad_norm": 0.48595139384269714, | |
| "learning_rate": 0.0008396039603960396, | |
| "loss": 1.58, | |
| "step": 4020 | |
| }, | |
| { | |
| "epoch": 3.394843189709147, | |
| "grad_norm": 0.39331361651420593, | |
| "learning_rate": 0.0008386138613861386, | |
| "loss": 1.5786, | |
| "step": 4030 | |
| }, | |
| { | |
| "epoch": 3.4032671182195915, | |
| "grad_norm": 0.31911513209342957, | |
| "learning_rate": 0.0008376237623762376, | |
| "loss": 1.5745, | |
| "step": 4040 | |
| }, | |
| { | |
| "epoch": 3.411691046730036, | |
| "grad_norm": 0.319876104593277, | |
| "learning_rate": 0.0008366336633663366, | |
| "loss": 1.5749, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 3.411691046730036, | |
| "eval_accuracy": 0.6780886041474171, | |
| "eval_loss": 1.4578139781951904, | |
| "eval_runtime": 880.4333, | |
| "eval_samples_per_second": 567.193, | |
| "eval_steps_per_second": 5.252, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 3.4201149752404802, | |
| "grad_norm": 0.45969948172569275, | |
| "learning_rate": 0.0008356435643564357, | |
| "loss": 1.5759, | |
| "step": 4060 | |
| }, | |
| { | |
| "epoch": 3.428538903750925, | |
| "grad_norm": 0.34449151158332825, | |
| "learning_rate": 0.0008346534653465346, | |
| "loss": 1.5707, | |
| "step": 4070 | |
| }, | |
| { | |
| "epoch": 3.4369628322613694, | |
| "grad_norm": 0.3478371202945709, | |
| "learning_rate": 0.0008336633663366337, | |
| "loss": 1.5699, | |
| "step": 4080 | |
| }, | |
| { | |
| "epoch": 3.445386760771814, | |
| "grad_norm": 0.5127679109573364, | |
| "learning_rate": 0.0008326732673267326, | |
| "loss": 1.5668, | |
| "step": 4090 | |
| }, | |
| { | |
| "epoch": 3.4538106892822587, | |
| "grad_norm": 0.302216500043869, | |
| "learning_rate": 0.0008316831683168317, | |
| "loss": 1.5647, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 3.4622346177927033, | |
| "grad_norm": 0.3295814096927643, | |
| "learning_rate": 0.0008306930693069307, | |
| "loss": 1.5628, | |
| "step": 4110 | |
| }, | |
| { | |
| "epoch": 3.4706585463031474, | |
| "grad_norm": 0.4209032654762268, | |
| "learning_rate": 0.0008297029702970297, | |
| "loss": 1.5628, | |
| "step": 4120 | |
| }, | |
| { | |
| "epoch": 3.479082474813592, | |
| "grad_norm": 0.34786614775657654, | |
| "learning_rate": 0.0008287128712871287, | |
| "loss": 1.5613, | |
| "step": 4130 | |
| }, | |
| { | |
| "epoch": 3.4875064033240366, | |
| "grad_norm": 0.4870763421058655, | |
| "learning_rate": 0.0008277227722772277, | |
| "loss": 1.5584, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 3.4875064033240366, | |
| "eval_accuracy": 0.6804383346028876, | |
| "eval_loss": 1.4444972276687622, | |
| "eval_runtime": 891.9286, | |
| "eval_samples_per_second": 559.883, | |
| "eval_steps_per_second": 5.184, | |
| "step": 4140 | |
| }, | |
| { | |
| "epoch": 3.495930331834481, | |
| "grad_norm": 0.31641605496406555, | |
| "learning_rate": 0.0008267326732673267, | |
| "loss": 1.5581, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 3.504354260344926, | |
| "grad_norm": 0.31303870677948, | |
| "learning_rate": 0.0008257425742574258, | |
| "loss": 1.5548, | |
| "step": 4160 | |
| }, | |
| { | |
| "epoch": 3.5127781888553704, | |
| "grad_norm": 0.35413628816604614, | |
| "learning_rate": 0.0008247524752475247, | |
| "loss": 1.5506, | |
| "step": 4170 | |
| }, | |
| { | |
| "epoch": 3.5212021173658146, | |
| "grad_norm": 0.39600226283073425, | |
| "learning_rate": 0.0008237623762376238, | |
| "loss": 1.5517, | |
| "step": 4180 | |
| }, | |
| { | |
| "epoch": 3.529626045876259, | |
| "grad_norm": 0.3600960075855255, | |
| "learning_rate": 0.0008227722772277227, | |
| "loss": 1.5563, | |
| "step": 4190 | |
| }, | |
| { | |
| "epoch": 3.5380499743867038, | |
| "grad_norm": 0.2877024710178375, | |
| "learning_rate": 0.0008217821782178218, | |
| "loss": 1.5467, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 3.5464739028971484, | |
| "grad_norm": 0.42324578762054443, | |
| "learning_rate": 0.0008207920792079208, | |
| "loss": 1.546, | |
| "step": 4210 | |
| }, | |
| { | |
| "epoch": 3.554897831407593, | |
| "grad_norm": 0.38907232880592346, | |
| "learning_rate": 0.0008198019801980197, | |
| "loss": 1.5458, | |
| "step": 4220 | |
| }, | |
| { | |
| "epoch": 3.5633217599180376, | |
| "grad_norm": 0.34750425815582275, | |
| "learning_rate": 0.0008188118811881188, | |
| "loss": 1.5437, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 3.5633217599180376, | |
| "eval_accuracy": 0.6840987986477044, | |
| "eval_loss": 1.4261698722839355, | |
| "eval_runtime": 886.2695, | |
| "eval_samples_per_second": 563.458, | |
| "eval_steps_per_second": 5.217, | |
| "step": 4230 | |
| }, | |
| { | |
| "epoch": 3.571745688428482, | |
| "grad_norm": 0.3718611001968384, | |
| "learning_rate": 0.0008178217821782177, | |
| "loss": 1.546, | |
| "step": 4240 | |
| }, | |
| { | |
| "epoch": 3.5801696169389263, | |
| "grad_norm": 0.39119917154312134, | |
| "learning_rate": 0.0008168316831683168, | |
| "loss": 1.5411, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 3.588593545449371, | |
| "grad_norm": 0.45689284801483154, | |
| "learning_rate": 0.0008158415841584159, | |
| "loss": 1.5416, | |
| "step": 4260 | |
| }, | |
| { | |
| "epoch": 3.5970174739598155, | |
| "grad_norm": 0.4029008150100708, | |
| "learning_rate": 0.0008148514851485148, | |
| "loss": 1.5364, | |
| "step": 4270 | |
| }, | |
| { | |
| "epoch": 3.60544140247026, | |
| "grad_norm": 0.3843879997730255, | |
| "learning_rate": 0.0008138613861386138, | |
| "loss": 1.5368, | |
| "step": 4280 | |
| }, | |
| { | |
| "epoch": 3.6138653309807047, | |
| "grad_norm": 0.33945897221565247, | |
| "learning_rate": 0.0008128712871287128, | |
| "loss": 1.5369, | |
| "step": 4290 | |
| }, | |
| { | |
| "epoch": 3.6222892594911493, | |
| "grad_norm": 0.29753997921943665, | |
| "learning_rate": 0.000811881188118812, | |
| "loss": 1.5326, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 3.630713188001594, | |
| "grad_norm": 0.4412858784198761, | |
| "learning_rate": 0.000810891089108911, | |
| "loss": 1.5316, | |
| "step": 4310 | |
| }, | |
| { | |
| "epoch": 3.639137116512038, | |
| "grad_norm": 0.30377647280693054, | |
| "learning_rate": 0.00080990099009901, | |
| "loss": 1.5308, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 3.639137116512038, | |
| "eval_accuracy": 0.6865785598346558, | |
| "eval_loss": 1.4111888408660889, | |
| "eval_runtime": 880.9823, | |
| "eval_samples_per_second": 566.84, | |
| "eval_steps_per_second": 5.249, | |
| "step": 4320 | |
| }, | |
| { | |
| "epoch": 3.6475610450224827, | |
| "grad_norm": 0.3666999638080597, | |
| "learning_rate": 0.000808910891089109, | |
| "loss": 1.5279, | |
| "step": 4330 | |
| }, | |
| { | |
| "epoch": 3.6559849735329273, | |
| "grad_norm": 0.3254301846027374, | |
| "learning_rate": 0.0008079207920792079, | |
| "loss": 1.5277, | |
| "step": 4340 | |
| }, | |
| { | |
| "epoch": 3.664408902043372, | |
| "grad_norm": 0.4963987469673157, | |
| "learning_rate": 0.000806930693069307, | |
| "loss": 1.5286, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 3.6728328305538165, | |
| "grad_norm": 0.34190070629119873, | |
| "learning_rate": 0.000805940594059406, | |
| "loss": 1.5294, | |
| "step": 4360 | |
| }, | |
| { | |
| "epoch": 3.6812567590642606, | |
| "grad_norm": 0.35153254866600037, | |
| "learning_rate": 0.000804950495049505, | |
| "loss": 1.5217, | |
| "step": 4370 | |
| }, | |
| { | |
| "epoch": 3.6896806875747057, | |
| "grad_norm": 0.345929354429245, | |
| "learning_rate": 0.000803960396039604, | |
| "loss": 1.52, | |
| "step": 4380 | |
| }, | |
| { | |
| "epoch": 3.69810461608515, | |
| "grad_norm": 0.37540799379348755, | |
| "learning_rate": 0.000802970297029703, | |
| "loss": 1.5208, | |
| "step": 4390 | |
| }, | |
| { | |
| "epoch": 3.7065285445955944, | |
| "grad_norm": 0.33499011397361755, | |
| "learning_rate": 0.000801980198019802, | |
| "loss": 1.5196, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 3.714952473106039, | |
| "grad_norm": 0.3461949825286865, | |
| "learning_rate": 0.0008009900990099011, | |
| "loss": 1.5188, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 3.714952473106039, | |
| "eval_accuracy": 0.6888913088166951, | |
| "eval_loss": 1.40292227268219, | |
| "eval_runtime": 882.772, | |
| "eval_samples_per_second": 565.691, | |
| "eval_steps_per_second": 5.238, | |
| "step": 4410 | |
| }, | |
| { | |
| "epoch": 3.7233764016164836, | |
| "grad_norm": 0.36491358280181885, | |
| "learning_rate": 0.0008, | |
| "loss": 1.5171, | |
| "step": 4420 | |
| }, | |
| { | |
| "epoch": 3.7318003301269282, | |
| "grad_norm": 0.2799367606639862, | |
| "learning_rate": 0.0007990099009900991, | |
| "loss": 1.5142, | |
| "step": 4430 | |
| }, | |
| { | |
| "epoch": 3.7402242586373724, | |
| "grad_norm": 0.361971914768219, | |
| "learning_rate": 0.000798019801980198, | |
| "loss": 1.5145, | |
| "step": 4440 | |
| }, | |
| { | |
| "epoch": 3.7486481871478174, | |
| "grad_norm": 0.2618056535720825, | |
| "learning_rate": 0.0007970297029702971, | |
| "loss": 1.5113, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 3.7570721156582616, | |
| "grad_norm": 0.5228148698806763, | |
| "learning_rate": 0.0007960396039603961, | |
| "loss": 1.5111, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 3.765496044168706, | |
| "grad_norm": 0.37740132212638855, | |
| "learning_rate": 0.0007950495049504951, | |
| "loss": 1.5121, | |
| "step": 4470 | |
| }, | |
| { | |
| "epoch": 3.773919972679151, | |
| "grad_norm": 0.3701629340648651, | |
| "learning_rate": 0.0007940594059405941, | |
| "loss": 1.5083, | |
| "step": 4480 | |
| }, | |
| { | |
| "epoch": 3.7823439011895954, | |
| "grad_norm": 0.3345108926296234, | |
| "learning_rate": 0.0007930693069306931, | |
| "loss": 1.5077, | |
| "step": 4490 | |
| }, | |
| { | |
| "epoch": 3.79076782970004, | |
| "grad_norm": 0.3989773988723755, | |
| "learning_rate": 0.0007920792079207921, | |
| "loss": 1.5079, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.79076782970004, | |
| "eval_accuracy": 0.6907081981543249, | |
| "eval_loss": 1.3909889459609985, | |
| "eval_runtime": 889.7203, | |
| "eval_samples_per_second": 561.273, | |
| "eval_steps_per_second": 5.197, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 3.799191758210484, | |
| "grad_norm": 0.284728080034256, | |
| "learning_rate": 0.0007910891089108912, | |
| "loss": 1.5046, | |
| "step": 4510 | |
| }, | |
| { | |
| "epoch": 3.8076156867209288, | |
| "grad_norm": 0.5029779672622681, | |
| "learning_rate": 0.0007900990099009901, | |
| "loss": 1.5049, | |
| "step": 4520 | |
| }, | |
| { | |
| "epoch": 3.8160396152313734, | |
| "grad_norm": 0.32617345452308655, | |
| "learning_rate": 0.0007891089108910892, | |
| "loss": 1.5068, | |
| "step": 4530 | |
| }, | |
| { | |
| "epoch": 3.824463543741818, | |
| "grad_norm": 0.36316540837287903, | |
| "learning_rate": 0.0007881188118811881, | |
| "loss": 1.4999, | |
| "step": 4540 | |
| }, | |
| { | |
| "epoch": 3.8328874722522626, | |
| "grad_norm": 0.30240392684936523, | |
| "learning_rate": 0.0007871287128712872, | |
| "loss": 1.498, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 3.841311400762707, | |
| "grad_norm": 0.3905390202999115, | |
| "learning_rate": 0.0007861386138613862, | |
| "loss": 1.4978, | |
| "step": 4560 | |
| }, | |
| { | |
| "epoch": 3.8497353292731518, | |
| "grad_norm": 0.30473875999450684, | |
| "learning_rate": 0.0007851485148514852, | |
| "loss": 1.4965, | |
| "step": 4570 | |
| }, | |
| { | |
| "epoch": 3.858159257783596, | |
| "grad_norm": 0.3675777316093445, | |
| "learning_rate": 0.0007841584158415842, | |
| "loss": 1.4957, | |
| "step": 4580 | |
| }, | |
| { | |
| "epoch": 3.8665831862940405, | |
| "grad_norm": 0.394168883562088, | |
| "learning_rate": 0.0007831683168316832, | |
| "loss": 1.4936, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 3.8665831862940405, | |
| "eval_accuracy": 0.6926193728848408, | |
| "eval_loss": 1.3844850063323975, | |
| "eval_runtime": 887.3028, | |
| "eval_samples_per_second": 562.802, | |
| "eval_steps_per_second": 5.211, | |
| "step": 4590 | |
| }, | |
| { | |
| "epoch": 3.875007114804485, | |
| "grad_norm": 0.3404500186443329, | |
| "learning_rate": 0.0007821782178217822, | |
| "loss": 1.4956, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 3.8834310433149297, | |
| "grad_norm": 0.3074527978897095, | |
| "learning_rate": 0.0007811881188118813, | |
| "loss": 1.4928, | |
| "step": 4610 | |
| }, | |
| { | |
| "epoch": 3.8918549718253743, | |
| "grad_norm": 0.44941094517707825, | |
| "learning_rate": 0.0007801980198019802, | |
| "loss": 1.4911, | |
| "step": 4620 | |
| }, | |
| { | |
| "epoch": 3.900278900335819, | |
| "grad_norm": 0.3098917603492737, | |
| "learning_rate": 0.0007792079207920793, | |
| "loss": 1.4918, | |
| "step": 4630 | |
| }, | |
| { | |
| "epoch": 3.9087028288462635, | |
| "grad_norm": 0.37436243891716003, | |
| "learning_rate": 0.0007782178217821782, | |
| "loss": 1.4866, | |
| "step": 4640 | |
| }, | |
| { | |
| "epoch": 3.9171267573567077, | |
| "grad_norm": 0.3058597445487976, | |
| "learning_rate": 0.0007772277227722773, | |
| "loss": 1.4896, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 3.9255506858671523, | |
| "grad_norm": 0.34245744347572327, | |
| "learning_rate": 0.0007762376237623763, | |
| "loss": 1.4874, | |
| "step": 4660 | |
| }, | |
| { | |
| "epoch": 3.933974614377597, | |
| "grad_norm": 0.3401254117488861, | |
| "learning_rate": 0.0007752475247524753, | |
| "loss": 1.4866, | |
| "step": 4670 | |
| }, | |
| { | |
| "epoch": 3.9423985428880415, | |
| "grad_norm": 0.35778889060020447, | |
| "learning_rate": 0.0007742574257425743, | |
| "loss": 1.4818, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 3.9423985428880415, | |
| "eval_accuracy": 0.6951155140000936, | |
| "eval_loss": 1.3689333200454712, | |
| "eval_runtime": 879.8095, | |
| "eval_samples_per_second": 567.596, | |
| "eval_steps_per_second": 5.256, | |
| "step": 4680 | |
| }, | |
| { | |
| "epoch": 3.950822471398486, | |
| "grad_norm": 0.2895776927471161, | |
| "learning_rate": 0.0007732673267326733, | |
| "loss": 1.4822, | |
| "step": 4690 | |
| }, | |
| { | |
| "epoch": 3.9592463999089302, | |
| "grad_norm": 0.3483330309391022, | |
| "learning_rate": 0.0007722772277227723, | |
| "loss": 1.4802, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 3.9676703284193753, | |
| "grad_norm": 0.30115026235580444, | |
| "learning_rate": 0.0007712871287128714, | |
| "loss": 1.4838, | |
| "step": 4710 | |
| }, | |
| { | |
| "epoch": 3.9760942569298194, | |
| "grad_norm": 0.32046666741371155, | |
| "learning_rate": 0.0007702970297029703, | |
| "loss": 1.4799, | |
| "step": 4720 | |
| }, | |
| { | |
| "epoch": 3.984518185440264, | |
| "grad_norm": 0.3833225965499878, | |
| "learning_rate": 0.0007693069306930694, | |
| "loss": 1.4785, | |
| "step": 4730 | |
| }, | |
| { | |
| "epoch": 3.9929421139507086, | |
| "grad_norm": 0.30888909101486206, | |
| "learning_rate": 0.0007683168316831683, | |
| "loss": 1.475, | |
| "step": 4740 | |
| }, | |
| { | |
| "epoch": 4.001366042461153, | |
| "grad_norm": 0.32462459802627563, | |
| "learning_rate": 0.0007673267326732674, | |
| "loss": 1.4746, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 4.009789970971598, | |
| "grad_norm": 0.3200187683105469, | |
| "learning_rate": 0.0007663366336633664, | |
| "loss": 1.4768, | |
| "step": 4760 | |
| }, | |
| { | |
| "epoch": 4.018213899482042, | |
| "grad_norm": 0.3794704079627991, | |
| "learning_rate": 0.0007653465346534654, | |
| "loss": 1.4761, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 4.018213899482042, | |
| "eval_accuracy": 0.6969660848927619, | |
| "eval_loss": 1.3595411777496338, | |
| "eval_runtime": 887.2228, | |
| "eval_samples_per_second": 562.853, | |
| "eval_steps_per_second": 5.212, | |
| "step": 4770 | |
| }, | |
| { | |
| "epoch": 4.026637827992487, | |
| "grad_norm": 0.27933019399642944, | |
| "learning_rate": 0.0007643564356435644, | |
| "loss": 1.47, | |
| "step": 4780 | |
| }, | |
| { | |
| "epoch": 4.035061756502931, | |
| "grad_norm": 0.32542508840560913, | |
| "learning_rate": 0.0007633663366336634, | |
| "loss": 1.4726, | |
| "step": 4790 | |
| }, | |
| { | |
| "epoch": 4.043485685013376, | |
| "grad_norm": 0.3638169765472412, | |
| "learning_rate": 0.0007623762376237624, | |
| "loss": 1.4697, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 4.05190961352382, | |
| "grad_norm": 0.3762564957141876, | |
| "learning_rate": 0.0007613861386138615, | |
| "loss": 1.4663, | |
| "step": 4810 | |
| }, | |
| { | |
| "epoch": 4.0603335420342646, | |
| "grad_norm": 0.36758995056152344, | |
| "learning_rate": 0.0007603960396039604, | |
| "loss": 1.4729, | |
| "step": 4820 | |
| }, | |
| { | |
| "epoch": 4.06875747054471, | |
| "grad_norm": 0.34590932726860046, | |
| "learning_rate": 0.0007594059405940595, | |
| "loss": 1.4665, | |
| "step": 4830 | |
| }, | |
| { | |
| "epoch": 4.077181399055154, | |
| "grad_norm": 0.3242778182029724, | |
| "learning_rate": 0.0007584158415841584, | |
| "loss": 1.4639, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 4.085605327565599, | |
| "grad_norm": 0.3849882185459137, | |
| "learning_rate": 0.0007574257425742574, | |
| "loss": 1.4613, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 4.094029256076043, | |
| "grad_norm": 0.3495323061943054, | |
| "learning_rate": 0.0007564356435643565, | |
| "loss": 1.4598, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 4.094029256076043, | |
| "eval_accuracy": 0.6996214986490302, | |
| "eval_loss": 1.3455697298049927, | |
| "eval_runtime": 887.3091, | |
| "eval_samples_per_second": 562.798, | |
| "eval_steps_per_second": 5.211, | |
| "step": 4860 | |
| }, | |
| { | |
| "epoch": 4.102453184586488, | |
| "grad_norm": 0.3290145993232727, | |
| "learning_rate": 0.0007554455445544554, | |
| "loss": 1.4601, | |
| "step": 4870 | |
| }, | |
| { | |
| "epoch": 4.110877113096932, | |
| "grad_norm": 0.34369096159935, | |
| "learning_rate": 0.0007544554455445545, | |
| "loss": 1.4603, | |
| "step": 4880 | |
| }, | |
| { | |
| "epoch": 4.119301041607376, | |
| "grad_norm": 0.3350279629230499, | |
| "learning_rate": 0.0007534653465346534, | |
| "loss": 1.4609, | |
| "step": 4890 | |
| }, | |
| { | |
| "epoch": 4.127724970117821, | |
| "grad_norm": 0.2575846016407013, | |
| "learning_rate": 0.0007524752475247525, | |
| "loss": 1.4565, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 4.1361488986282655, | |
| "grad_norm": 0.3337861895561218, | |
| "learning_rate": 0.0007514851485148515, | |
| "loss": 1.4574, | |
| "step": 4910 | |
| }, | |
| { | |
| "epoch": 4.144572827138711, | |
| "grad_norm": 0.3752147853374481, | |
| "learning_rate": 0.0007504950495049505, | |
| "loss": 1.4594, | |
| "step": 4920 | |
| }, | |
| { | |
| "epoch": 4.152996755649155, | |
| "grad_norm": 0.29587122797966003, | |
| "learning_rate": 0.0007495049504950495, | |
| "loss": 1.4518, | |
| "step": 4930 | |
| }, | |
| { | |
| "epoch": 4.161420684159599, | |
| "grad_norm": 0.2764742374420166, | |
| "learning_rate": 0.0007485148514851485, | |
| "loss": 1.4514, | |
| "step": 4940 | |
| }, | |
| { | |
| "epoch": 4.169844612670044, | |
| "grad_norm": 0.4625591039657593, | |
| "learning_rate": 0.0007475247524752475, | |
| "loss": 1.4527, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 4.169844612670044, | |
| "eval_accuracy": 0.701515475804278, | |
| "eval_loss": 1.3361947536468506, | |
| "eval_runtime": 883.9818, | |
| "eval_samples_per_second": 564.917, | |
| "eval_steps_per_second": 5.231, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 4.178268541180488, | |
| "grad_norm": 0.29412004351615906, | |
| "learning_rate": 0.0007465346534653466, | |
| "loss": 1.4514, | |
| "step": 4960 | |
| }, | |
| { | |
| "epoch": 4.186692469690933, | |
| "grad_norm": 0.3580242693424225, | |
| "learning_rate": 0.0007455445544554455, | |
| "loss": 1.4486, | |
| "step": 4970 | |
| }, | |
| { | |
| "epoch": 4.195116398201377, | |
| "grad_norm": 0.46256908774375916, | |
| "learning_rate": 0.0007445544554455446, | |
| "loss": 1.4494, | |
| "step": 4980 | |
| }, | |
| { | |
| "epoch": 4.203540326711822, | |
| "grad_norm": 0.3117842674255371, | |
| "learning_rate": 0.0007435643564356435, | |
| "loss": 1.4486, | |
| "step": 4990 | |
| }, | |
| { | |
| "epoch": 4.2119642552222665, | |
| "grad_norm": 0.3382858335971832, | |
| "learning_rate": 0.0007425742574257426, | |
| "loss": 1.4452, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 4.220388183732711, | |
| "grad_norm": 0.3153148889541626, | |
| "learning_rate": 0.0007415841584158416, | |
| "loss": 1.4465, | |
| "step": 5010 | |
| }, | |
| { | |
| "epoch": 4.228812112243156, | |
| "grad_norm": 0.3635173439979553, | |
| "learning_rate": 0.0007405940594059406, | |
| "loss": 1.4443, | |
| "step": 5020 | |
| }, | |
| { | |
| "epoch": 4.2372360407536, | |
| "grad_norm": 0.4260285794734955, | |
| "learning_rate": 0.0007396039603960396, | |
| "loss": 1.4454, | |
| "step": 5030 | |
| }, | |
| { | |
| "epoch": 4.245659969264045, | |
| "grad_norm": 0.29188039898872375, | |
| "learning_rate": 0.0007386138613861386, | |
| "loss": 1.4442, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 4.245659969264045, | |
| "eval_accuracy": 0.7031089800515327, | |
| "eval_loss": 1.3285191059112549, | |
| "eval_runtime": 890.9721, | |
| "eval_samples_per_second": 560.484, | |
| "eval_steps_per_second": 5.19, | |
| "step": 5040 | |
| }, | |
| { | |
| "epoch": 4.254083897774489, | |
| "grad_norm": 0.5350555777549744, | |
| "learning_rate": 0.0007376237623762376, | |
| "loss": 1.4416, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 4.262507826284934, | |
| "grad_norm": 0.35281315445899963, | |
| "learning_rate": 0.0007366336633663367, | |
| "loss": 1.4432, | |
| "step": 5060 | |
| }, | |
| { | |
| "epoch": 4.270931754795378, | |
| "grad_norm": 0.37922871112823486, | |
| "learning_rate": 0.0007356435643564356, | |
| "loss": 1.4399, | |
| "step": 5070 | |
| }, | |
| { | |
| "epoch": 4.279355683305822, | |
| "grad_norm": 0.3072182238101959, | |
| "learning_rate": 0.0007346534653465347, | |
| "loss": 1.4383, | |
| "step": 5080 | |
| }, | |
| { | |
| "epoch": 4.287779611816267, | |
| "grad_norm": 0.30223241448402405, | |
| "learning_rate": 0.0007336633663366336, | |
| "loss": 1.4406, | |
| "step": 5090 | |
| }, | |
| { | |
| "epoch": 4.296203540326712, | |
| "grad_norm": 0.5292770862579346, | |
| "learning_rate": 0.0007326732673267327, | |
| "loss": 1.4376, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 4.304627468837157, | |
| "grad_norm": 0.35330840945243835, | |
| "learning_rate": 0.0007316831683168317, | |
| "loss": 1.4389, | |
| "step": 5110 | |
| }, | |
| { | |
| "epoch": 4.313051397347601, | |
| "grad_norm": 0.30719104409217834, | |
| "learning_rate": 0.0007306930693069307, | |
| "loss": 1.4384, | |
| "step": 5120 | |
| }, | |
| { | |
| "epoch": 4.321475325858046, | |
| "grad_norm": 0.34203872084617615, | |
| "learning_rate": 0.0007297029702970297, | |
| "loss": 1.4374, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 4.321475325858046, | |
| "eval_accuracy": 0.7048288335521147, | |
| "eval_loss": 1.3187906742095947, | |
| "eval_runtime": 887.0787, | |
| "eval_samples_per_second": 562.944, | |
| "eval_steps_per_second": 5.213, | |
| "step": 5130 | |
| }, | |
| { | |
| "epoch": 4.32989925436849, | |
| "grad_norm": 0.38140207529067993, | |
| "learning_rate": 0.0007287128712871287, | |
| "loss": 1.4353, | |
| "step": 5140 | |
| }, | |
| { | |
| "epoch": 4.338323182878934, | |
| "grad_norm": 0.303752064704895, | |
| "learning_rate": 0.0007277227722772277, | |
| "loss": 1.4336, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 4.346747111389379, | |
| "grad_norm": 0.290764719247818, | |
| "learning_rate": 0.0007267326732673268, | |
| "loss": 1.4304, | |
| "step": 5160 | |
| }, | |
| { | |
| "epoch": 4.355171039899823, | |
| "grad_norm": 0.4335167407989502, | |
| "learning_rate": 0.0007257425742574257, | |
| "loss": 1.4327, | |
| "step": 5170 | |
| }, | |
| { | |
| "epoch": 4.363594968410268, | |
| "grad_norm": 0.3198365271091461, | |
| "learning_rate": 0.0007247524752475248, | |
| "loss": 1.4319, | |
| "step": 5180 | |
| }, | |
| { | |
| "epoch": 4.3720188969207125, | |
| "grad_norm": 0.41567763686180115, | |
| "learning_rate": 0.0007237623762376237, | |
| "loss": 1.4318, | |
| "step": 5190 | |
| }, | |
| { | |
| "epoch": 4.380442825431157, | |
| "grad_norm": 0.3342703580856323, | |
| "learning_rate": 0.0007227722772277228, | |
| "loss": 1.4298, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 4.388866753941602, | |
| "grad_norm": 0.25702279806137085, | |
| "learning_rate": 0.0007217821782178218, | |
| "loss": 1.4265, | |
| "step": 5210 | |
| }, | |
| { | |
| "epoch": 4.397290682452046, | |
| "grad_norm": 0.26949411630630493, | |
| "learning_rate": 0.0007207920792079208, | |
| "loss": 1.4278, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 4.397290682452046, | |
| "eval_accuracy": 0.7063243134470976, | |
| "eval_loss": 1.3113943338394165, | |
| "eval_runtime": 889.8031, | |
| "eval_samples_per_second": 561.221, | |
| "eval_steps_per_second": 5.197, | |
| "step": 5220 | |
| }, | |
| { | |
| "epoch": 4.405714610962491, | |
| "grad_norm": 0.3861467242240906, | |
| "learning_rate": 0.0007198019801980198, | |
| "loss": 1.4318, | |
| "step": 5230 | |
| }, | |
| { | |
| "epoch": 4.414138539472935, | |
| "grad_norm": 0.34858283400535583, | |
| "learning_rate": 0.0007188118811881188, | |
| "loss": 1.4291, | |
| "step": 5240 | |
| }, | |
| { | |
| "epoch": 4.42256246798338, | |
| "grad_norm": 0.3346785604953766, | |
| "learning_rate": 0.0007178217821782178, | |
| "loss": 1.425, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 4.430986396493824, | |
| "grad_norm": 0.3916323184967041, | |
| "learning_rate": 0.0007168316831683169, | |
| "loss": 1.4241, | |
| "step": 5260 | |
| }, | |
| { | |
| "epoch": 4.439410325004269, | |
| "grad_norm": 0.2802947759628296, | |
| "learning_rate": 0.0007158415841584158, | |
| "loss": 1.4221, | |
| "step": 5270 | |
| }, | |
| { | |
| "epoch": 4.4478342535147135, | |
| "grad_norm": 0.4092938303947449, | |
| "learning_rate": 0.0007148514851485149, | |
| "loss": 1.4236, | |
| "step": 5280 | |
| }, | |
| { | |
| "epoch": 4.456258182025158, | |
| "grad_norm": 0.25096723437309265, | |
| "learning_rate": 0.0007138613861386138, | |
| "loss": 1.4235, | |
| "step": 5290 | |
| }, | |
| { | |
| "epoch": 4.464682110535603, | |
| "grad_norm": 0.3570871949195862, | |
| "learning_rate": 0.0007128712871287129, | |
| "loss": 1.4216, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 4.473106039046047, | |
| "grad_norm": 0.3168172240257263, | |
| "learning_rate": 0.0007118811881188119, | |
| "loss": 1.4236, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 4.473106039046047, | |
| "eval_accuracy": 0.7076842136916008, | |
| "eval_loss": 1.307774543762207, | |
| "eval_runtime": 889.4836, | |
| "eval_samples_per_second": 561.422, | |
| "eval_steps_per_second": 5.199, | |
| "step": 5310 | |
| }, | |
| { | |
| "epoch": 4.481529967556492, | |
| "grad_norm": 0.30059170722961426, | |
| "learning_rate": 0.0007108910891089109, | |
| "loss": 1.4193, | |
| "step": 5320 | |
| }, | |
| { | |
| "epoch": 4.489953896066936, | |
| "grad_norm": 0.331824392080307, | |
| "learning_rate": 0.0007099009900990099, | |
| "loss": 1.4185, | |
| "step": 5330 | |
| }, | |
| { | |
| "epoch": 4.49837782457738, | |
| "grad_norm": 0.3295821249485016, | |
| "learning_rate": 0.0007089108910891088, | |
| "loss": 1.4198, | |
| "step": 5340 | |
| }, | |
| { | |
| "epoch": 4.506801753087825, | |
| "grad_norm": 0.3506734371185303, | |
| "learning_rate": 0.0007079207920792079, | |
| "loss": 1.4167, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 4.515225681598269, | |
| "grad_norm": 0.3836129903793335, | |
| "learning_rate": 0.000706930693069307, | |
| "loss": 1.417, | |
| "step": 5360 | |
| }, | |
| { | |
| "epoch": 4.5236496101087145, | |
| "grad_norm": 0.3046220541000366, | |
| "learning_rate": 0.0007059405940594059, | |
| "loss": 1.4177, | |
| "step": 5370 | |
| }, | |
| { | |
| "epoch": 4.532073538619159, | |
| "grad_norm": 0.37655332684516907, | |
| "learning_rate": 0.000704950495049505, | |
| "loss": 1.4149, | |
| "step": 5380 | |
| }, | |
| { | |
| "epoch": 4.540497467129603, | |
| "grad_norm": 0.32939672470092773, | |
| "learning_rate": 0.0007039603960396039, | |
| "loss": 1.4165, | |
| "step": 5390 | |
| }, | |
| { | |
| "epoch": 4.548921395640048, | |
| "grad_norm": 0.2900882363319397, | |
| "learning_rate": 0.0007029702970297029, | |
| "loss": 1.4128, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 4.548921395640048, | |
| "eval_accuracy": 0.7087959913049944, | |
| "eval_loss": 1.3013147115707397, | |
| "eval_runtime": 892.9333, | |
| "eval_samples_per_second": 559.253, | |
| "eval_steps_per_second": 5.178, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 4.557345324150492, | |
| "grad_norm": 0.27651771903038025, | |
| "learning_rate": 0.000701980198019802, | |
| "loss": 1.4122, | |
| "step": 5410 | |
| }, | |
| { | |
| "epoch": 4.565769252660937, | |
| "grad_norm": 0.4160715639591217, | |
| "learning_rate": 0.0007009900990099009, | |
| "loss": 1.4122, | |
| "step": 5420 | |
| }, | |
| { | |
| "epoch": 4.574193181171381, | |
| "grad_norm": 0.2724072337150574, | |
| "learning_rate": 0.0007, | |
| "loss": 1.41, | |
| "step": 5430 | |
| }, | |
| { | |
| "epoch": 4.582617109681826, | |
| "grad_norm": 0.35586145520210266, | |
| "learning_rate": 0.0006990099009900989, | |
| "loss": 1.4118, | |
| "step": 5440 | |
| }, | |
| { | |
| "epoch": 4.59104103819227, | |
| "grad_norm": 0.3268265128135681, | |
| "learning_rate": 0.000698019801980198, | |
| "loss": 1.4117, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 4.599464966702715, | |
| "grad_norm": 0.3230002522468567, | |
| "learning_rate": 0.000697029702970297, | |
| "loss": 1.4102, | |
| "step": 5460 | |
| }, | |
| { | |
| "epoch": 4.60788889521316, | |
| "grad_norm": 0.25019174814224243, | |
| "learning_rate": 0.000696039603960396, | |
| "loss": 1.4102, | |
| "step": 5470 | |
| }, | |
| { | |
| "epoch": 4.616312823723604, | |
| "grad_norm": 0.38475289940834045, | |
| "learning_rate": 0.000695049504950495, | |
| "loss": 1.4075, | |
| "step": 5480 | |
| }, | |
| { | |
| "epoch": 4.624736752234049, | |
| "grad_norm": 0.39824309945106506, | |
| "learning_rate": 0.000694059405940594, | |
| "loss": 1.4077, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 4.624736752234049, | |
| "eval_accuracy": 0.7098417264518991, | |
| "eval_loss": 1.2926928997039795, | |
| "eval_runtime": 881.9048, | |
| "eval_samples_per_second": 566.247, | |
| "eval_steps_per_second": 5.243, | |
| "step": 5490 | |
| }, | |
| { | |
| "epoch": 4.633160680744493, | |
| "grad_norm": 0.3250022828578949, | |
| "learning_rate": 0.000693069306930693, | |
| "loss": 1.4068, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 4.641584609254938, | |
| "grad_norm": 0.32388612627983093, | |
| "learning_rate": 0.0006920792079207921, | |
| "loss": 1.4062, | |
| "step": 5510 | |
| }, | |
| { | |
| "epoch": 4.650008537765382, | |
| "grad_norm": 0.2806077003479004, | |
| "learning_rate": 0.000691089108910891, | |
| "loss": 1.4049, | |
| "step": 5520 | |
| }, | |
| { | |
| "epoch": 4.658432466275826, | |
| "grad_norm": 0.33755025267601013, | |
| "learning_rate": 0.0006900990099009901, | |
| "loss": 1.4045, | |
| "step": 5530 | |
| }, | |
| { | |
| "epoch": 4.666856394786271, | |
| "grad_norm": 0.4184636175632477, | |
| "learning_rate": 0.000689108910891089, | |
| "loss": 1.4042, | |
| "step": 5540 | |
| }, | |
| { | |
| "epoch": 4.6752803232967155, | |
| "grad_norm": 0.34234240651130676, | |
| "learning_rate": 0.0006881188118811881, | |
| "loss": 1.4055, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 4.6837042518071605, | |
| "grad_norm": 0.32120293378829956, | |
| "learning_rate": 0.0006871287128712872, | |
| "loss": 1.4014, | |
| "step": 5560 | |
| }, | |
| { | |
| "epoch": 4.692128180317605, | |
| "grad_norm": 0.3810026943683624, | |
| "learning_rate": 0.0006861386138613862, | |
| "loss": 1.4039, | |
| "step": 5570 | |
| }, | |
| { | |
| "epoch": 4.70055210882805, | |
| "grad_norm": 0.3171080946922302, | |
| "learning_rate": 0.0006851485148514852, | |
| "loss": 1.4025, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 4.70055210882805, | |
| "eval_accuracy": 0.7115425686273988, | |
| "eval_loss": 1.285227656364441, | |
| "eval_runtime": 891.3368, | |
| "eval_samples_per_second": 560.255, | |
| "eval_steps_per_second": 5.188, | |
| "step": 5580 | |
| }, | |
| { | |
| "epoch": 4.708976037338494, | |
| "grad_norm": 0.24618960916996002, | |
| "learning_rate": 0.0006841584158415842, | |
| "loss": 1.3983, | |
| "step": 5590 | |
| }, | |
| { | |
| "epoch": 4.717399965848939, | |
| "grad_norm": 0.494895339012146, | |
| "learning_rate": 0.0006831683168316832, | |
| "loss": 1.4, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 4.725823894359383, | |
| "grad_norm": 0.31908226013183594, | |
| "learning_rate": 0.0006821782178217823, | |
| "loss": 1.3983, | |
| "step": 5610 | |
| }, | |
| { | |
| "epoch": 4.734247822869827, | |
| "grad_norm": 0.26488983631134033, | |
| "learning_rate": 0.0006811881188118812, | |
| "loss": 1.3956, | |
| "step": 5620 | |
| }, | |
| { | |
| "epoch": 4.742671751380272, | |
| "grad_norm": 0.3156343102455139, | |
| "learning_rate": 0.0006801980198019803, | |
| "loss": 1.397, | |
| "step": 5630 | |
| }, | |
| { | |
| "epoch": 4.7510956798907165, | |
| "grad_norm": 0.38938194513320923, | |
| "learning_rate": 0.0006792079207920792, | |
| "loss": 1.3987, | |
| "step": 5640 | |
| }, | |
| { | |
| "epoch": 4.7595196084011615, | |
| "grad_norm": 0.27233967185020447, | |
| "learning_rate": 0.0006782178217821783, | |
| "loss": 1.3983, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 4.767943536911606, | |
| "grad_norm": 0.347419410943985, | |
| "learning_rate": 0.0006772277227722773, | |
| "loss": 1.3953, | |
| "step": 5660 | |
| }, | |
| { | |
| "epoch": 4.77636746542205, | |
| "grad_norm": 0.44131675362586975, | |
| "learning_rate": 0.0006762376237623763, | |
| "loss": 1.3956, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 4.77636746542205, | |
| "eval_accuracy": 0.7112416746447588, | |
| "eval_loss": 1.290834665298462, | |
| "eval_runtime": 886.5668, | |
| "eval_samples_per_second": 563.269, | |
| "eval_steps_per_second": 5.216, | |
| "step": 5670 | |
| }, | |
| { | |
| "epoch": 4.784791393932495, | |
| "grad_norm": 0.3185184895992279, | |
| "learning_rate": 0.0006752475247524753, | |
| "loss": 1.3976, | |
| "step": 5680 | |
| }, | |
| { | |
| "epoch": 4.793215322442939, | |
| "grad_norm": 0.2549585998058319, | |
| "learning_rate": 0.0006742574257425743, | |
| "loss": 1.3931, | |
| "step": 5690 | |
| }, | |
| { | |
| "epoch": 4.801639250953384, | |
| "grad_norm": 0.315294086933136, | |
| "learning_rate": 0.0006732673267326733, | |
| "loss": 1.393, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 4.810063179463828, | |
| "grad_norm": 0.3866962492465973, | |
| "learning_rate": 0.0006722772277227724, | |
| "loss": 1.3923, | |
| "step": 5710 | |
| }, | |
| { | |
| "epoch": 4.818487107974272, | |
| "grad_norm": 0.28364527225494385, | |
| "learning_rate": 0.0006712871287128713, | |
| "loss": 1.3924, | |
| "step": 5720 | |
| }, | |
| { | |
| "epoch": 4.826911036484717, | |
| "grad_norm": 0.3253314793109894, | |
| "learning_rate": 0.0006702970297029704, | |
| "loss": 1.3914, | |
| "step": 5730 | |
| }, | |
| { | |
| "epoch": 4.835334964995162, | |
| "grad_norm": 0.31215131282806396, | |
| "learning_rate": 0.0006693069306930693, | |
| "loss": 1.3903, | |
| "step": 5740 | |
| }, | |
| { | |
| "epoch": 4.843758893505607, | |
| "grad_norm": 0.34929993748664856, | |
| "learning_rate": 0.0006683168316831684, | |
| "loss": 1.3894, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 4.852182822016051, | |
| "grad_norm": 0.38991761207580566, | |
| "learning_rate": 0.0006673267326732674, | |
| "loss": 1.3924, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 4.852182822016051, | |
| "eval_accuracy": 0.7133021748514282, | |
| "eval_loss": 1.2766938209533691, | |
| "eval_runtime": 881.7452, | |
| "eval_samples_per_second": 566.35, | |
| "eval_steps_per_second": 5.244, | |
| "step": 5760 | |
| }, | |
| { | |
| "epoch": 4.860606750526496, | |
| "grad_norm": 0.2888573408126831, | |
| "learning_rate": 0.0006663366336633664, | |
| "loss": 1.3918, | |
| "step": 5770 | |
| }, | |
| { | |
| "epoch": 4.86903067903694, | |
| "grad_norm": 0.3224232494831085, | |
| "learning_rate": 0.0006653465346534654, | |
| "loss": 1.3895, | |
| "step": 5780 | |
| }, | |
| { | |
| "epoch": 4.877454607547385, | |
| "grad_norm": 0.3562750518321991, | |
| "learning_rate": 0.0006643564356435644, | |
| "loss": 1.387, | |
| "step": 5790 | |
| }, | |
| { | |
| "epoch": 4.885878536057829, | |
| "grad_norm": 0.3339401185512543, | |
| "learning_rate": 0.0006633663366336634, | |
| "loss": 1.3886, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 4.894302464568273, | |
| "grad_norm": 0.3022938072681427, | |
| "learning_rate": 0.0006623762376237625, | |
| "loss": 1.3858, | |
| "step": 5810 | |
| }, | |
| { | |
| "epoch": 4.902726393078718, | |
| "grad_norm": 0.276065856218338, | |
| "learning_rate": 0.0006613861386138614, | |
| "loss": 1.386, | |
| "step": 5820 | |
| }, | |
| { | |
| "epoch": 4.9111503215891625, | |
| "grad_norm": 0.3148975372314453, | |
| "learning_rate": 0.0006603960396039605, | |
| "loss": 1.385, | |
| "step": 5830 | |
| }, | |
| { | |
| "epoch": 4.919574250099608, | |
| "grad_norm": 0.3374193608760834, | |
| "learning_rate": 0.0006594059405940594, | |
| "loss": 1.3842, | |
| "step": 5840 | |
| }, | |
| { | |
| "epoch": 4.927998178610052, | |
| "grad_norm": 0.3293200135231018, | |
| "learning_rate": 0.0006584158415841585, | |
| "loss": 1.3835, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 4.927998178610052, | |
| "eval_accuracy": 0.7147221912687882, | |
| "eval_loss": 1.2681052684783936, | |
| "eval_runtime": 890.793, | |
| "eval_samples_per_second": 560.597, | |
| "eval_steps_per_second": 5.191, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 4.936422107120496, | |
| "grad_norm": 0.3032568693161011, | |
| "learning_rate": 0.0006574257425742575, | |
| "loss": 1.3828, | |
| "step": 5860 | |
| }, | |
| { | |
| "epoch": 4.944846035630941, | |
| "grad_norm": 0.24251434206962585, | |
| "learning_rate": 0.0006564356435643565, | |
| "loss": 1.3818, | |
| "step": 5870 | |
| }, | |
| { | |
| "epoch": 4.953269964141385, | |
| "grad_norm": 0.3096301257610321, | |
| "learning_rate": 0.0006554455445544555, | |
| "loss": 1.3814, | |
| "step": 5880 | |
| }, | |
| { | |
| "epoch": 4.96169389265183, | |
| "grad_norm": 0.34841156005859375, | |
| "learning_rate": 0.0006544554455445545, | |
| "loss": 1.3823, | |
| "step": 5890 | |
| }, | |
| { | |
| "epoch": 4.970117821162274, | |
| "grad_norm": 0.312688946723938, | |
| "learning_rate": 0.0006534653465346535, | |
| "loss": 1.3818, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 4.978541749672719, | |
| "grad_norm": 0.30799320340156555, | |
| "learning_rate": 0.0006524752475247526, | |
| "loss": 1.379, | |
| "step": 5910 | |
| }, | |
| { | |
| "epoch": 4.9869656781831635, | |
| "grad_norm": 0.3510371148586273, | |
| "learning_rate": 0.0006514851485148515, | |
| "loss": 1.3814, | |
| "step": 5920 | |
| }, | |
| { | |
| "epoch": 4.9953896066936085, | |
| "grad_norm": 0.2894381582736969, | |
| "learning_rate": 0.0006504950495049506, | |
| "loss": 1.3812, | |
| "step": 5930 | |
| }, | |
| { | |
| "epoch": 5.003813535204053, | |
| "grad_norm": 0.2685450315475464, | |
| "learning_rate": 0.0006495049504950495, | |
| "loss": 1.3788, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 5.003813535204053, | |
| "eval_accuracy": 0.7160080315056353, | |
| "eval_loss": 1.2630343437194824, | |
| "eval_runtime": 883.8805, | |
| "eval_samples_per_second": 564.981, | |
| "eval_steps_per_second": 5.231, | |
| "step": 5940 | |
| }, | |
| { | |
| "epoch": 5.012237463714497, | |
| "grad_norm": 0.38857927918434143, | |
| "learning_rate": 0.0006485148514851485, | |
| "loss": 1.3809, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 5.020661392224942, | |
| "grad_norm": 0.2822309136390686, | |
| "learning_rate": 0.0006475247524752476, | |
| "loss": 1.3769, | |
| "step": 5960 | |
| }, | |
| { | |
| "epoch": 5.029085320735386, | |
| "grad_norm": 0.2725491523742676, | |
| "learning_rate": 0.0006465346534653465, | |
| "loss": 1.3762, | |
| "step": 5970 | |
| }, | |
| { | |
| "epoch": 5.037509249245831, | |
| "grad_norm": 0.32517486810684204, | |
| "learning_rate": 0.0006455445544554456, | |
| "loss": 1.377, | |
| "step": 5980 | |
| }, | |
| { | |
| "epoch": 5.045933177756275, | |
| "grad_norm": 0.34373360872268677, | |
| "learning_rate": 0.0006445544554455445, | |
| "loss": 1.3774, | |
| "step": 5990 | |
| }, | |
| { | |
| "epoch": 5.054357106266719, | |
| "grad_norm": 0.3029853403568268, | |
| "learning_rate": 0.0006435643564356436, | |
| "loss": 1.3746, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 5.0627810347771645, | |
| "grad_norm": 0.5577653646469116, | |
| "learning_rate": 0.0006425742574257426, | |
| "loss": 1.378, | |
| "step": 6010 | |
| }, | |
| { | |
| "epoch": 5.071204963287609, | |
| "grad_norm": 0.27967342734336853, | |
| "learning_rate": 0.0006415841584158416, | |
| "loss": 1.3779, | |
| "step": 6020 | |
| }, | |
| { | |
| "epoch": 5.079628891798054, | |
| "grad_norm": 0.2680428624153137, | |
| "learning_rate": 0.0006405940594059406, | |
| "loss": 1.3733, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 5.079628891798054, | |
| "eval_accuracy": 0.7168763989390342, | |
| "eval_loss": 1.258245825767517, | |
| "eval_runtime": 902.3568, | |
| "eval_samples_per_second": 553.413, | |
| "eval_steps_per_second": 5.124, | |
| "step": 6030 | |
| }, | |
| { | |
| "epoch": 5.088052820308498, | |
| "grad_norm": 0.24522745609283447, | |
| "learning_rate": 0.0006396039603960396, | |
| "loss": 1.3692, | |
| "step": 6040 | |
| }, | |
| { | |
| "epoch": 5.096476748818943, | |
| "grad_norm": 0.3076081871986389, | |
| "learning_rate": 0.0006386138613861386, | |
| "loss": 1.3724, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 5.104900677329387, | |
| "grad_norm": 0.32096347212791443, | |
| "learning_rate": 0.0006376237623762377, | |
| "loss": 1.3737, | |
| "step": 6060 | |
| }, | |
| { | |
| "epoch": 5.113324605839831, | |
| "grad_norm": 0.35196197032928467, | |
| "learning_rate": 0.0006366336633663366, | |
| "loss": 1.3719, | |
| "step": 6070 | |
| }, | |
| { | |
| "epoch": 5.121748534350276, | |
| "grad_norm": 0.39065635204315186, | |
| "learning_rate": 0.0006356435643564357, | |
| "loss": 1.3719, | |
| "step": 6080 | |
| }, | |
| { | |
| "epoch": 5.13017246286072, | |
| "grad_norm": 0.3439326882362366, | |
| "learning_rate": 0.0006346534653465346, | |
| "loss": 1.3749, | |
| "step": 6090 | |
| }, | |
| { | |
| "epoch": 5.138596391371165, | |
| "grad_norm": 0.3175961673259735, | |
| "learning_rate": 0.0006336633663366337, | |
| "loss": 1.3679, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 5.14702031988161, | |
| "grad_norm": 0.37071719765663147, | |
| "learning_rate": 0.0006326732673267327, | |
| "loss": 1.3706, | |
| "step": 6110 | |
| }, | |
| { | |
| "epoch": 5.155444248392055, | |
| "grad_norm": 0.2499271035194397, | |
| "learning_rate": 0.0006316831683168317, | |
| "loss": 1.3685, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 5.155444248392055, | |
| "eval_accuracy": 0.717981203712741, | |
| "eval_loss": 1.2521748542785645, | |
| "eval_runtime": 885.5528, | |
| "eval_samples_per_second": 563.914, | |
| "eval_steps_per_second": 5.222, | |
| "step": 6120 | |
| }, | |
| { | |
| "epoch": 5.163868176902499, | |
| "grad_norm": 0.3951607346534729, | |
| "learning_rate": 0.0006306930693069307, | |
| "loss": 1.3671, | |
| "step": 6130 | |
| }, | |
| { | |
| "epoch": 5.172292105412943, | |
| "grad_norm": 0.4264112114906311, | |
| "learning_rate": 0.0006297029702970297, | |
| "loss": 1.3652, | |
| "step": 6140 | |
| }, | |
| { | |
| "epoch": 5.180716033923388, | |
| "grad_norm": 0.3097785711288452, | |
| "learning_rate": 0.0006287128712871287, | |
| "loss": 1.3695, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 5.189139962433832, | |
| "grad_norm": 0.28887125849723816, | |
| "learning_rate": 0.0006277227722772278, | |
| "loss": 1.3658, | |
| "step": 6160 | |
| }, | |
| { | |
| "epoch": 5.197563890944277, | |
| "grad_norm": 0.27163591980934143, | |
| "learning_rate": 0.0006267326732673267, | |
| "loss": 1.3655, | |
| "step": 6170 | |
| }, | |
| { | |
| "epoch": 5.205987819454721, | |
| "grad_norm": 0.30266183614730835, | |
| "learning_rate": 0.0006257425742574258, | |
| "loss": 1.3631, | |
| "step": 6180 | |
| }, | |
| { | |
| "epoch": 5.2144117479651655, | |
| "grad_norm": 0.3191784620285034, | |
| "learning_rate": 0.0006247524752475247, | |
| "loss": 1.3667, | |
| "step": 6190 | |
| }, | |
| { | |
| "epoch": 5.2228356764756105, | |
| "grad_norm": 0.30907300114631653, | |
| "learning_rate": 0.0006237623762376238, | |
| "loss": 1.3667, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 5.231259604986055, | |
| "grad_norm": 0.3120558559894562, | |
| "learning_rate": 0.0006227722772277228, | |
| "loss": 1.3638, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 5.231259604986055, | |
| "eval_accuracy": 0.7190249020483522, | |
| "eval_loss": 1.2470471858978271, | |
| "eval_runtime": 893.7706, | |
| "eval_samples_per_second": 558.73, | |
| "eval_steps_per_second": 5.174, | |
| "step": 6210 | |
| }, | |
| { | |
| "epoch": 5.2396835334965, | |
| "grad_norm": 0.35595396161079407, | |
| "learning_rate": 0.0006217821782178218, | |
| "loss": 1.3634, | |
| "step": 6220 | |
| }, | |
| { | |
| "epoch": 5.248107462006944, | |
| "grad_norm": 0.33759573101997375, | |
| "learning_rate": 0.0006207920792079208, | |
| "loss": 1.3661, | |
| "step": 6230 | |
| }, | |
| { | |
| "epoch": 5.256531390517389, | |
| "grad_norm": 0.26417672634124756, | |
| "learning_rate": 0.0006198019801980198, | |
| "loss": 1.3627, | |
| "step": 6240 | |
| }, | |
| { | |
| "epoch": 5.264955319027833, | |
| "grad_norm": 0.28236111998558044, | |
| "learning_rate": 0.0006188118811881188, | |
| "loss": 1.362, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 5.273379247538277, | |
| "grad_norm": 0.5903481245040894, | |
| "learning_rate": 0.0006178217821782179, | |
| "loss": 1.3619, | |
| "step": 6260 | |
| }, | |
| { | |
| "epoch": 5.281803176048722, | |
| "grad_norm": 0.298475056886673, | |
| "learning_rate": 0.0006168316831683168, | |
| "loss": 1.3671, | |
| "step": 6270 | |
| }, | |
| { | |
| "epoch": 5.2902271045591664, | |
| "grad_norm": 0.27397215366363525, | |
| "learning_rate": 0.0006158415841584159, | |
| "loss": 1.3611, | |
| "step": 6280 | |
| }, | |
| { | |
| "epoch": 5.2986510330696115, | |
| "grad_norm": 0.28740593791007996, | |
| "learning_rate": 0.0006148514851485148, | |
| "loss": 1.3579, | |
| "step": 6290 | |
| }, | |
| { | |
| "epoch": 5.307074961580056, | |
| "grad_norm": 0.274557888507843, | |
| "learning_rate": 0.0006138613861386139, | |
| "loss": 1.3587, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 5.307074961580056, | |
| "eval_accuracy": 0.719703789624826, | |
| "eval_loss": 1.2432972192764282, | |
| "eval_runtime": 881.2394, | |
| "eval_samples_per_second": 566.675, | |
| "eval_steps_per_second": 5.247, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 5.315498890090501, | |
| "grad_norm": 0.31431418657302856, | |
| "learning_rate": 0.0006128712871287129, | |
| "loss": 1.3565, | |
| "step": 6310 | |
| }, | |
| { | |
| "epoch": 5.323922818600945, | |
| "grad_norm": 0.358239084482193, | |
| "learning_rate": 0.0006118811881188119, | |
| "loss": 1.3614, | |
| "step": 6320 | |
| }, | |
| { | |
| "epoch": 5.332346747111389, | |
| "grad_norm": 0.3043140769004822, | |
| "learning_rate": 0.0006108910891089109, | |
| "loss": 1.3576, | |
| "step": 6330 | |
| }, | |
| { | |
| "epoch": 5.340770675621834, | |
| "grad_norm": 0.2583385109901428, | |
| "learning_rate": 0.0006099009900990099, | |
| "loss": 1.3578, | |
| "step": 6340 | |
| }, | |
| { | |
| "epoch": 5.349194604132278, | |
| "grad_norm": 0.3068407475948334, | |
| "learning_rate": 0.0006089108910891089, | |
| "loss": 1.3577, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 5.357618532642723, | |
| "grad_norm": 0.2893878221511841, | |
| "learning_rate": 0.000607920792079208, | |
| "loss": 1.3569, | |
| "step": 6360 | |
| }, | |
| { | |
| "epoch": 5.366042461153167, | |
| "grad_norm": 0.2883850634098053, | |
| "learning_rate": 0.0006069306930693069, | |
| "loss": 1.3555, | |
| "step": 6370 | |
| }, | |
| { | |
| "epoch": 5.3744663896636125, | |
| "grad_norm": 0.3248838484287262, | |
| "learning_rate": 0.000605940594059406, | |
| "loss": 1.3561, | |
| "step": 6380 | |
| }, | |
| { | |
| "epoch": 5.382890318174057, | |
| "grad_norm": 0.29167214035987854, | |
| "learning_rate": 0.0006049504950495049, | |
| "loss": 1.3582, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 5.382890318174057, | |
| "eval_accuracy": 0.7203339064191229, | |
| "eval_loss": 1.241172432899475, | |
| "eval_runtime": 891.2006, | |
| "eval_samples_per_second": 560.341, | |
| "eval_steps_per_second": 5.189, | |
| "step": 6390 | |
| }, | |
| { | |
| "epoch": 5.391314246684501, | |
| "grad_norm": 0.3090030550956726, | |
| "learning_rate": 0.000603960396039604, | |
| "loss": 1.3534, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 5.399738175194946, | |
| "grad_norm": 0.25337210297584534, | |
| "learning_rate": 0.000602970297029703, | |
| "loss": 1.3564, | |
| "step": 6410 | |
| }, | |
| { | |
| "epoch": 5.40816210370539, | |
| "grad_norm": 0.25656768679618835, | |
| "learning_rate": 0.000601980198019802, | |
| "loss": 1.3549, | |
| "step": 6420 | |
| }, | |
| { | |
| "epoch": 5.416586032215835, | |
| "grad_norm": 0.2951459288597107, | |
| "learning_rate": 0.000600990099009901, | |
| "loss": 1.3518, | |
| "step": 6430 | |
| }, | |
| { | |
| "epoch": 5.425009960726279, | |
| "grad_norm": 0.2697450816631317, | |
| "learning_rate": 0.0006, | |
| "loss": 1.3531, | |
| "step": 6440 | |
| }, | |
| { | |
| "epoch": 5.433433889236724, | |
| "grad_norm": 0.28866857290267944, | |
| "learning_rate": 0.000599009900990099, | |
| "loss": 1.3524, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 5.441857817747168, | |
| "grad_norm": 0.26775673031806946, | |
| "learning_rate": 0.000598019801980198, | |
| "loss": 1.3505, | |
| "step": 6460 | |
| }, | |
| { | |
| "epoch": 5.4502817462576125, | |
| "grad_norm": 0.3911271393299103, | |
| "learning_rate": 0.000597029702970297, | |
| "loss": 1.3516, | |
| "step": 6470 | |
| }, | |
| { | |
| "epoch": 5.458705674768058, | |
| "grad_norm": 0.3151527941226959, | |
| "learning_rate": 0.000596039603960396, | |
| "loss": 1.353, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 5.458705674768058, | |
| "eval_accuracy": 0.7213715986510872, | |
| "eval_loss": 1.2357591390609741, | |
| "eval_runtime": 888.8097, | |
| "eval_samples_per_second": 561.848, | |
| "eval_steps_per_second": 5.202, | |
| "step": 6480 | |
| }, | |
| { | |
| "epoch": 5.467129603278502, | |
| "grad_norm": 0.32286888360977173, | |
| "learning_rate": 0.000595049504950495, | |
| "loss": 1.3527, | |
| "step": 6490 | |
| }, | |
| { | |
| "epoch": 5.475553531788947, | |
| "grad_norm": 0.3933228850364685, | |
| "learning_rate": 0.000594059405940594, | |
| "loss": 1.3511, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 5.483977460299391, | |
| "grad_norm": 0.3246067762374878, | |
| "learning_rate": 0.0005930693069306931, | |
| "loss": 1.3524, | |
| "step": 6510 | |
| }, | |
| { | |
| "epoch": 5.492401388809835, | |
| "grad_norm": 0.2912397086620331, | |
| "learning_rate": 0.000592079207920792, | |
| "loss": 1.3495, | |
| "step": 6520 | |
| }, | |
| { | |
| "epoch": 5.50082531732028, | |
| "grad_norm": 0.3058258891105652, | |
| "learning_rate": 0.0005910891089108911, | |
| "loss": 1.3486, | |
| "step": 6530 | |
| }, | |
| { | |
| "epoch": 5.509249245830724, | |
| "grad_norm": 0.310024231672287, | |
| "learning_rate": 0.00059009900990099, | |
| "loss": 1.3507, | |
| "step": 6540 | |
| }, | |
| { | |
| "epoch": 5.517673174341169, | |
| "grad_norm": 0.289165198802948, | |
| "learning_rate": 0.0005891089108910891, | |
| "loss": 1.3475, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 5.5260971028516135, | |
| "grad_norm": 0.324613094329834, | |
| "learning_rate": 0.0005881188118811881, | |
| "loss": 1.3489, | |
| "step": 6560 | |
| }, | |
| { | |
| "epoch": 5.5345210313620585, | |
| "grad_norm": 0.3530217111110687, | |
| "learning_rate": 0.0005871287128712871, | |
| "loss": 1.3477, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 5.5345210313620585, | |
| "eval_accuracy": 0.722217175302605, | |
| "eval_loss": 1.2293946743011475, | |
| "eval_runtime": 881.4092, | |
| "eval_samples_per_second": 566.565, | |
| "eval_steps_per_second": 5.246, | |
| "step": 6570 | |
| }, | |
| { | |
| "epoch": 5.542944959872503, | |
| "grad_norm": 0.3527272045612335, | |
| "learning_rate": 0.0005861386138613861, | |
| "loss": 1.3447, | |
| "step": 6580 | |
| }, | |
| { | |
| "epoch": 5.551368888382948, | |
| "grad_norm": 0.26519855856895447, | |
| "learning_rate": 0.0005851485148514851, | |
| "loss": 1.346, | |
| "step": 6590 | |
| }, | |
| { | |
| "epoch": 5.559792816893392, | |
| "grad_norm": 0.29473376274108887, | |
| "learning_rate": 0.0005841584158415841, | |
| "loss": 1.3461, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 5.568216745403836, | |
| "grad_norm": 0.31212469935417175, | |
| "learning_rate": 0.0005831683168316832, | |
| "loss": 1.3454, | |
| "step": 6610 | |
| }, | |
| { | |
| "epoch": 5.576640673914281, | |
| "grad_norm": 0.2541083097457886, | |
| "learning_rate": 0.0005821782178217821, | |
| "loss": 1.3451, | |
| "step": 6620 | |
| }, | |
| { | |
| "epoch": 5.585064602424725, | |
| "grad_norm": 0.28075823187828064, | |
| "learning_rate": 0.0005811881188118812, | |
| "loss": 1.3417, | |
| "step": 6630 | |
| }, | |
| { | |
| "epoch": 5.59348853093517, | |
| "grad_norm": 0.286945641040802, | |
| "learning_rate": 0.0005801980198019801, | |
| "loss": 1.3439, | |
| "step": 6640 | |
| }, | |
| { | |
| "epoch": 5.6019124594456144, | |
| "grad_norm": 0.2825601100921631, | |
| "learning_rate": 0.0005792079207920792, | |
| "loss": 1.3447, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 5.610336387956059, | |
| "grad_norm": 0.3023243844509125, | |
| "learning_rate": 0.0005782178217821782, | |
| "loss": 1.3428, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 5.610336387956059, | |
| "eval_accuracy": 0.7226627197479346, | |
| "eval_loss": 1.2287484407424927, | |
| "eval_runtime": 893.8585, | |
| "eval_samples_per_second": 558.675, | |
| "eval_steps_per_second": 5.173, | |
| "step": 6660 | |
| }, | |
| { | |
| "epoch": 5.618760316466504, | |
| "grad_norm": 0.2548897862434387, | |
| "learning_rate": 0.0005772277227722772, | |
| "loss": 1.3441, | |
| "step": 6670 | |
| }, | |
| { | |
| "epoch": 5.627184244976948, | |
| "grad_norm": 0.28277119994163513, | |
| "learning_rate": 0.0005762376237623762, | |
| "loss": 1.3421, | |
| "step": 6680 | |
| }, | |
| { | |
| "epoch": 5.635608173487393, | |
| "grad_norm": 0.35963568091392517, | |
| "learning_rate": 0.0005752475247524752, | |
| "loss": 1.3421, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 5.644032101997837, | |
| "grad_norm": 0.2753046452999115, | |
| "learning_rate": 0.0005742574257425742, | |
| "loss": 1.3449, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 5.652456030508281, | |
| "grad_norm": 0.31272053718566895, | |
| "learning_rate": 0.0005732673267326733, | |
| "loss": 1.3418, | |
| "step": 6710 | |
| }, | |
| { | |
| "epoch": 5.660879959018726, | |
| "grad_norm": 0.24427007138729095, | |
| "learning_rate": 0.0005722772277227722, | |
| "loss": 1.3409, | |
| "step": 6720 | |
| }, | |
| { | |
| "epoch": 5.66930388752917, | |
| "grad_norm": 0.4038189649581909, | |
| "learning_rate": 0.0005712871287128713, | |
| "loss": 1.3387, | |
| "step": 6730 | |
| }, | |
| { | |
| "epoch": 5.677727816039615, | |
| "grad_norm": 0.30009007453918457, | |
| "learning_rate": 0.0005702970297029702, | |
| "loss": 1.3425, | |
| "step": 6740 | |
| }, | |
| { | |
| "epoch": 5.68615174455006, | |
| "grad_norm": 0.2813461720943451, | |
| "learning_rate": 0.0005693069306930693, | |
| "loss": 1.3396, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 5.68615174455006, | |
| "eval_accuracy": 0.7239226758241876, | |
| "eval_loss": 1.2240657806396484, | |
| "eval_runtime": 898.7215, | |
| "eval_samples_per_second": 555.652, | |
| "eval_steps_per_second": 5.145, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 5.694575673060505, | |
| "grad_norm": 0.4396764039993286, | |
| "learning_rate": 0.0005683168316831683, | |
| "loss": 1.3408, | |
| "step": 6760 | |
| }, | |
| { | |
| "epoch": 5.702999601570949, | |
| "grad_norm": 0.2992042899131775, | |
| "learning_rate": 0.0005673267326732673, | |
| "loss": 1.3408, | |
| "step": 6770 | |
| }, | |
| { | |
| "epoch": 5.711423530081394, | |
| "grad_norm": 0.2579440474510193, | |
| "learning_rate": 0.0005663366336633663, | |
| "loss": 1.3369, | |
| "step": 6780 | |
| }, | |
| { | |
| "epoch": 5.719847458591838, | |
| "grad_norm": 0.32076653838157654, | |
| "learning_rate": 0.0005653465346534653, | |
| "loss": 1.3365, | |
| "step": 6790 | |
| }, | |
| { | |
| "epoch": 5.728271387102282, | |
| "grad_norm": 0.3180268108844757, | |
| "learning_rate": 0.0005643564356435643, | |
| "loss": 1.339, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 5.736695315612727, | |
| "grad_norm": 0.27663713693618774, | |
| "learning_rate": 0.0005633663366336634, | |
| "loss": 1.3373, | |
| "step": 6810 | |
| }, | |
| { | |
| "epoch": 5.745119244123171, | |
| "grad_norm": 0.27103811502456665, | |
| "learning_rate": 0.0005623762376237624, | |
| "loss": 1.3332, | |
| "step": 6820 | |
| }, | |
| { | |
| "epoch": 5.753543172633616, | |
| "grad_norm": 0.34022676944732666, | |
| "learning_rate": 0.0005613861386138615, | |
| "loss": 1.3373, | |
| "step": 6830 | |
| }, | |
| { | |
| "epoch": 5.7619671011440605, | |
| "grad_norm": 0.36838725209236145, | |
| "learning_rate": 0.0005603960396039604, | |
| "loss": 1.3384, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 5.7619671011440605, | |
| "eval_accuracy": 0.7243312842270887, | |
| "eval_loss": 1.221815586090088, | |
| "eval_runtime": 891.7897, | |
| "eval_samples_per_second": 559.971, | |
| "eval_steps_per_second": 5.185, | |
| "step": 6840 | |
| }, | |
| { | |
| "epoch": 5.770391029654505, | |
| "grad_norm": 0.2968374490737915, | |
| "learning_rate": 0.0005594059405940595, | |
| "loss": 1.3353, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 5.77881495816495, | |
| "grad_norm": 0.36536258459091187, | |
| "learning_rate": 0.0005584158415841585, | |
| "loss": 1.3331, | |
| "step": 6860 | |
| }, | |
| { | |
| "epoch": 5.787238886675394, | |
| "grad_norm": 0.2985541522502899, | |
| "learning_rate": 0.0005574257425742575, | |
| "loss": 1.3313, | |
| "step": 6870 | |
| }, | |
| { | |
| "epoch": 5.795662815185839, | |
| "grad_norm": 0.33506348729133606, | |
| "learning_rate": 0.0005564356435643565, | |
| "loss": 1.3349, | |
| "step": 6880 | |
| }, | |
| { | |
| "epoch": 5.804086743696283, | |
| "grad_norm": 0.31232866644859314, | |
| "learning_rate": 0.0005554455445544555, | |
| "loss": 1.3335, | |
| "step": 6890 | |
| }, | |
| { | |
| "epoch": 5.812510672206728, | |
| "grad_norm": 0.27576977014541626, | |
| "learning_rate": 0.0005544554455445545, | |
| "loss": 1.3309, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 5.820934600717172, | |
| "grad_norm": 0.2526339590549469, | |
| "learning_rate": 0.0005534653465346536, | |
| "loss": 1.3318, | |
| "step": 6910 | |
| }, | |
| { | |
| "epoch": 5.829358529227616, | |
| "grad_norm": 0.25774866342544556, | |
| "learning_rate": 0.0005524752475247525, | |
| "loss": 1.3329, | |
| "step": 6920 | |
| }, | |
| { | |
| "epoch": 5.8377824577380615, | |
| "grad_norm": 0.34311917424201965, | |
| "learning_rate": 0.0005514851485148516, | |
| "loss": 1.3334, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 5.8377824577380615, | |
| "eval_accuracy": 0.7251374384748042, | |
| "eval_loss": 1.216299057006836, | |
| "eval_runtime": 889.6984, | |
| "eval_samples_per_second": 561.287, | |
| "eval_steps_per_second": 5.197, | |
| "step": 6930 | |
| }, | |
| { | |
| "epoch": 5.846206386248506, | |
| "grad_norm": 0.32087624073028564, | |
| "learning_rate": 0.0005504950495049505, | |
| "loss": 1.3338, | |
| "step": 6940 | |
| }, | |
| { | |
| "epoch": 5.854630314758951, | |
| "grad_norm": 0.25447556376457214, | |
| "learning_rate": 0.0005495049504950496, | |
| "loss": 1.3315, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 5.863054243269395, | |
| "grad_norm": 0.285826712846756, | |
| "learning_rate": 0.0005485148514851486, | |
| "loss": 1.3303, | |
| "step": 6960 | |
| }, | |
| { | |
| "epoch": 5.87147817177984, | |
| "grad_norm": 0.2816094756126404, | |
| "learning_rate": 0.0005475247524752476, | |
| "loss": 1.3308, | |
| "step": 6970 | |
| }, | |
| { | |
| "epoch": 5.879902100290284, | |
| "grad_norm": 0.30444055795669556, | |
| "learning_rate": 0.0005465346534653466, | |
| "loss": 1.3303, | |
| "step": 6980 | |
| }, | |
| { | |
| "epoch": 5.888326028800728, | |
| "grad_norm": 0.3512563705444336, | |
| "learning_rate": 0.0005455445544554456, | |
| "loss": 1.3305, | |
| "step": 6990 | |
| }, | |
| { | |
| "epoch": 5.896749957311173, | |
| "grad_norm": 0.2924775779247284, | |
| "learning_rate": 0.0005445544554455446, | |
| "loss": 1.3307, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 5.905173885821617, | |
| "grad_norm": 0.3497087359428406, | |
| "learning_rate": 0.0005435643564356437, | |
| "loss": 1.3295, | |
| "step": 7010 | |
| }, | |
| { | |
| "epoch": 5.913597814332062, | |
| "grad_norm": 0.2714064419269562, | |
| "learning_rate": 0.0005425742574257426, | |
| "loss": 1.329, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 5.913597814332062, | |
| "eval_accuracy": 0.7261800107692413, | |
| "eval_loss": 1.2115275859832764, | |
| "eval_runtime": 893.0627, | |
| "eval_samples_per_second": 559.172, | |
| "eval_steps_per_second": 5.178, | |
| "step": 7020 | |
| }, | |
| { | |
| "epoch": 5.922021742842507, | |
| "grad_norm": 0.277203232049942, | |
| "learning_rate": 0.0005415841584158417, | |
| "loss": 1.3269, | |
| "step": 7030 | |
| }, | |
| { | |
| "epoch": 5.930445671352951, | |
| "grad_norm": 0.3769485354423523, | |
| "learning_rate": 0.0005405940594059406, | |
| "loss": 1.3268, | |
| "step": 7040 | |
| }, | |
| { | |
| "epoch": 5.938869599863396, | |
| "grad_norm": 0.2526576817035675, | |
| "learning_rate": 0.0005396039603960396, | |
| "loss": 1.3262, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 5.94729352837384, | |
| "grad_norm": 0.2670144736766815, | |
| "learning_rate": 0.0005386138613861387, | |
| "loss": 1.327, | |
| "step": 7060 | |
| }, | |
| { | |
| "epoch": 5.955717456884285, | |
| "grad_norm": 0.26662877202033997, | |
| "learning_rate": 0.0005376237623762376, | |
| "loss": 1.3277, | |
| "step": 7070 | |
| }, | |
| { | |
| "epoch": 5.964141385394729, | |
| "grad_norm": 0.3263689875602722, | |
| "learning_rate": 0.0005366336633663367, | |
| "loss": 1.3271, | |
| "step": 7080 | |
| }, | |
| { | |
| "epoch": 5.972565313905174, | |
| "grad_norm": 0.26732614636421204, | |
| "learning_rate": 0.0005356435643564356, | |
| "loss": 1.3264, | |
| "step": 7090 | |
| }, | |
| { | |
| "epoch": 5.980989242415618, | |
| "grad_norm": 0.3332139551639557, | |
| "learning_rate": 0.0005346534653465347, | |
| "loss": 1.3266, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 5.989413170926063, | |
| "grad_norm": 0.3081839680671692, | |
| "learning_rate": 0.0005336633663366337, | |
| "loss": 1.325, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 5.989413170926063, | |
| "eval_accuracy": 0.7263082386708871, | |
| "eval_loss": 1.2105002403259277, | |
| "eval_runtime": 893.0055, | |
| "eval_samples_per_second": 559.208, | |
| "eval_steps_per_second": 5.178, | |
| "step": 7110 | |
| }, | |
| { | |
| "epoch": 5.997837099436508, | |
| "grad_norm": 0.2502419650554657, | |
| "learning_rate": 0.0005326732673267327, | |
| "loss": 1.3263, | |
| "step": 7120 | |
| }, | |
| { | |
| "epoch": 6.006261027946952, | |
| "grad_norm": 0.2437312752008438, | |
| "learning_rate": 0.0005316831683168317, | |
| "loss": 1.3225, | |
| "step": 7130 | |
| }, | |
| { | |
| "epoch": 6.014684956457397, | |
| "grad_norm": 0.3372795581817627, | |
| "learning_rate": 0.0005306930693069307, | |
| "loss": 1.3234, | |
| "step": 7140 | |
| }, | |
| { | |
| "epoch": 6.023108884967841, | |
| "grad_norm": 0.2895912826061249, | |
| "learning_rate": 0.0005297029702970297, | |
| "loss": 1.3252, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 6.031532813478286, | |
| "grad_norm": 0.28451213240623474, | |
| "learning_rate": 0.0005287128712871288, | |
| "loss": 1.3238, | |
| "step": 7160 | |
| }, | |
| { | |
| "epoch": 6.03995674198873, | |
| "grad_norm": 0.2496078759431839, | |
| "learning_rate": 0.0005277227722772277, | |
| "loss": 1.323, | |
| "step": 7170 | |
| }, | |
| { | |
| "epoch": 6.048380670499174, | |
| "grad_norm": 0.26850923895835876, | |
| "learning_rate": 0.0005267326732673268, | |
| "loss": 1.322, | |
| "step": 7180 | |
| }, | |
| { | |
| "epoch": 6.056804599009619, | |
| "grad_norm": 0.30225685238838196, | |
| "learning_rate": 0.0005257425742574257, | |
| "loss": 1.3212, | |
| "step": 7190 | |
| }, | |
| { | |
| "epoch": 6.0652285275200635, | |
| "grad_norm": 0.32349905371665955, | |
| "learning_rate": 0.0005247524752475248, | |
| "loss": 1.3219, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 6.0652285275200635, | |
| "eval_accuracy": 0.727180971273756, | |
| "eval_loss": 1.205489993095398, | |
| "eval_runtime": 890.8938, | |
| "eval_samples_per_second": 560.534, | |
| "eval_steps_per_second": 5.19, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 6.0736524560305085, | |
| "grad_norm": 0.29943209886550903, | |
| "learning_rate": 0.0005237623762376238, | |
| "loss": 1.3182, | |
| "step": 7210 | |
| }, | |
| { | |
| "epoch": 6.082076384540953, | |
| "grad_norm": 0.30952343344688416, | |
| "learning_rate": 0.0005227722772277228, | |
| "loss": 1.3194, | |
| "step": 7220 | |
| }, | |
| { | |
| "epoch": 6.090500313051398, | |
| "grad_norm": 0.3158267140388489, | |
| "learning_rate": 0.0005217821782178218, | |
| "loss": 1.319, | |
| "step": 7230 | |
| }, | |
| { | |
| "epoch": 6.098924241561842, | |
| "grad_norm": 0.27009105682373047, | |
| "learning_rate": 0.0005207920792079208, | |
| "loss": 1.3212, | |
| "step": 7240 | |
| }, | |
| { | |
| "epoch": 6.107348170072286, | |
| "grad_norm": 0.2660143971443176, | |
| "learning_rate": 0.0005198019801980198, | |
| "loss": 1.3181, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 6.115772098582731, | |
| "grad_norm": 0.32289671897888184, | |
| "learning_rate": 0.0005188118811881189, | |
| "loss": 1.3166, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 6.124196027093175, | |
| "grad_norm": 0.301577627658844, | |
| "learning_rate": 0.0005178217821782178, | |
| "loss": 1.3215, | |
| "step": 7270 | |
| }, | |
| { | |
| "epoch": 6.13261995560362, | |
| "grad_norm": 0.26539114117622375, | |
| "learning_rate": 0.0005168316831683169, | |
| "loss": 1.3173, | |
| "step": 7280 | |
| }, | |
| { | |
| "epoch": 6.141043884114064, | |
| "grad_norm": 0.30636703968048096, | |
| "learning_rate": 0.0005158415841584158, | |
| "loss": 1.319, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 6.141043884114064, | |
| "eval_accuracy": 0.7278776618882268, | |
| "eval_loss": 1.2021031379699707, | |
| "eval_runtime": 893.3533, | |
| "eval_samples_per_second": 558.99, | |
| "eval_steps_per_second": 5.176, | |
| "step": 7290 | |
| }, | |
| { | |
| "epoch": 6.1494678126245095, | |
| "grad_norm": 0.2906350791454315, | |
| "learning_rate": 0.0005148514851485149, | |
| "loss": 1.3177, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 6.157891741134954, | |
| "grad_norm": 0.33962422609329224, | |
| "learning_rate": 0.0005138613861386139, | |
| "loss": 1.3173, | |
| "step": 7310 | |
| }, | |
| { | |
| "epoch": 6.166315669645398, | |
| "grad_norm": 0.29772093892097473, | |
| "learning_rate": 0.0005128712871287129, | |
| "loss": 1.3194, | |
| "step": 7320 | |
| }, | |
| { | |
| "epoch": 6.174739598155843, | |
| "grad_norm": 0.27262043952941895, | |
| "learning_rate": 0.0005118811881188119, | |
| "loss": 1.3159, | |
| "step": 7330 | |
| }, | |
| { | |
| "epoch": 6.183163526666287, | |
| "grad_norm": 0.2678314745426178, | |
| "learning_rate": 0.0005108910891089109, | |
| "loss": 1.3167, | |
| "step": 7340 | |
| }, | |
| { | |
| "epoch": 6.191587455176732, | |
| "grad_norm": 0.3115740716457367, | |
| "learning_rate": 0.0005099009900990099, | |
| "loss": 1.3142, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 6.200011383687176, | |
| "grad_norm": 0.2983403205871582, | |
| "learning_rate": 0.000508910891089109, | |
| "loss": 1.3158, | |
| "step": 7360 | |
| }, | |
| { | |
| "epoch": 6.208435312197621, | |
| "grad_norm": 0.2797269821166992, | |
| "learning_rate": 0.0005079207920792079, | |
| "loss": 1.3163, | |
| "step": 7370 | |
| }, | |
| { | |
| "epoch": 6.216859240708065, | |
| "grad_norm": 0.29581907391548157, | |
| "learning_rate": 0.000506930693069307, | |
| "loss": 1.3156, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 6.216859240708065, | |
| "eval_accuracy": 0.7285335214596267, | |
| "eval_loss": 1.1984630823135376, | |
| "eval_runtime": 881.1088, | |
| "eval_samples_per_second": 566.759, | |
| "eval_steps_per_second": 5.248, | |
| "step": 7380 | |
| }, | |
| { | |
| "epoch": 6.2252831692185095, | |
| "grad_norm": 0.2843240797519684, | |
| "learning_rate": 0.0005059405940594059, | |
| "loss": 1.3162, | |
| "step": 7390 | |
| }, | |
| { | |
| "epoch": 6.233707097728955, | |
| "grad_norm": 0.2662515938282013, | |
| "learning_rate": 0.000504950495049505, | |
| "loss": 1.314, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 6.242131026239399, | |
| "grad_norm": 0.3370913565158844, | |
| "learning_rate": 0.000503960396039604, | |
| "loss": 1.3136, | |
| "step": 7410 | |
| }, | |
| { | |
| "epoch": 6.250554954749844, | |
| "grad_norm": 0.29014459252357483, | |
| "learning_rate": 0.000502970297029703, | |
| "loss": 1.3127, | |
| "step": 7420 | |
| }, | |
| { | |
| "epoch": 6.258978883260288, | |
| "grad_norm": 0.2779816687107086, | |
| "learning_rate": 0.000501980198019802, | |
| "loss": 1.3137, | |
| "step": 7430 | |
| }, | |
| { | |
| "epoch": 6.267402811770733, | |
| "grad_norm": 0.2942447066307068, | |
| "learning_rate": 0.000500990099009901, | |
| "loss": 1.3138, | |
| "step": 7440 | |
| }, | |
| { | |
| "epoch": 6.275826740281177, | |
| "grad_norm": 0.3536125719547272, | |
| "learning_rate": 0.0005, | |
| "loss": 1.3135, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 6.284250668791621, | |
| "grad_norm": 0.29686686396598816, | |
| "learning_rate": 0.0004990099009900991, | |
| "loss": 1.3129, | |
| "step": 7460 | |
| }, | |
| { | |
| "epoch": 6.292674597302066, | |
| "grad_norm": 0.30590084195137024, | |
| "learning_rate": 0.000498019801980198, | |
| "loss": 1.3114, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 6.292674597302066, | |
| "eval_accuracy": 0.7293452386458654, | |
| "eval_loss": 1.1951327323913574, | |
| "eval_runtime": 893.3348, | |
| "eval_samples_per_second": 559.002, | |
| "eval_steps_per_second": 5.176, | |
| "step": 7470 | |
| }, | |
| { | |
| "epoch": 6.3010985258125105, | |
| "grad_norm": 0.2687655985355377, | |
| "learning_rate": 0.0004970297029702971, | |
| "loss": 1.3125, | |
| "step": 7480 | |
| }, | |
| { | |
| "epoch": 6.3095224543229556, | |
| "grad_norm": 0.31057268381118774, | |
| "learning_rate": 0.000496039603960396, | |
| "loss": 1.3106, | |
| "step": 7490 | |
| }, | |
| { | |
| "epoch": 6.3179463828334, | |
| "grad_norm": 0.3097970187664032, | |
| "learning_rate": 0.0004950495049504951, | |
| "loss": 1.31, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 6.326370311343844, | |
| "grad_norm": 0.28469330072402954, | |
| "learning_rate": 0.0004940594059405941, | |
| "loss": 1.3098, | |
| "step": 7510 | |
| }, | |
| { | |
| "epoch": 6.334794239854289, | |
| "grad_norm": 0.2911768853664398, | |
| "learning_rate": 0.000493069306930693, | |
| "loss": 1.3103, | |
| "step": 7520 | |
| }, | |
| { | |
| "epoch": 6.343218168364733, | |
| "grad_norm": 0.2990330755710602, | |
| "learning_rate": 0.0004920792079207921, | |
| "loss": 1.3108, | |
| "step": 7530 | |
| }, | |
| { | |
| "epoch": 6.351642096875178, | |
| "grad_norm": 0.2908383905887604, | |
| "learning_rate": 0.000491089108910891, | |
| "loss": 1.3092, | |
| "step": 7540 | |
| }, | |
| { | |
| "epoch": 6.360066025385622, | |
| "grad_norm": 0.306233674287796, | |
| "learning_rate": 0.0004900990099009901, | |
| "loss": 1.3107, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 6.368489953896067, | |
| "grad_norm": 0.2749456465244293, | |
| "learning_rate": 0.0004891089108910892, | |
| "loss": 1.3073, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 6.368489953896067, | |
| "eval_accuracy": 0.7300212582744398, | |
| "eval_loss": 1.1918327808380127, | |
| "eval_runtime": 886.4778, | |
| "eval_samples_per_second": 563.326, | |
| "eval_steps_per_second": 5.216, | |
| "step": 7560 | |
| }, | |
| { | |
| "epoch": 6.3769138824065115, | |
| "grad_norm": 0.2799837291240692, | |
| "learning_rate": 0.0004881188118811881, | |
| "loss": 1.3084, | |
| "step": 7570 | |
| }, | |
| { | |
| "epoch": 6.385337810916956, | |
| "grad_norm": 0.3050614893436432, | |
| "learning_rate": 0.00048712871287128715, | |
| "loss": 1.3082, | |
| "step": 7580 | |
| }, | |
| { | |
| "epoch": 6.393761739427401, | |
| "grad_norm": 0.2900220453739166, | |
| "learning_rate": 0.00048613861386138615, | |
| "loss": 1.3087, | |
| "step": 7590 | |
| }, | |
| { | |
| "epoch": 6.402185667937845, | |
| "grad_norm": 0.2592508792877197, | |
| "learning_rate": 0.00048514851485148515, | |
| "loss": 1.3082, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 6.41060959644829, | |
| "grad_norm": 0.2503323256969452, | |
| "learning_rate": 0.00048415841584158414, | |
| "loss": 1.3066, | |
| "step": 7610 | |
| }, | |
| { | |
| "epoch": 6.419033524958734, | |
| "grad_norm": 0.30254074931144714, | |
| "learning_rate": 0.00048316831683168314, | |
| "loss": 1.3079, | |
| "step": 7620 | |
| }, | |
| { | |
| "epoch": 6.427457453469179, | |
| "grad_norm": 0.28869137167930603, | |
| "learning_rate": 0.0004821782178217822, | |
| "loss": 1.3061, | |
| "step": 7630 | |
| }, | |
| { | |
| "epoch": 6.435881381979623, | |
| "grad_norm": 0.3226109445095062, | |
| "learning_rate": 0.0004811881188118812, | |
| "loss": 1.3051, | |
| "step": 7640 | |
| }, | |
| { | |
| "epoch": 6.444305310490067, | |
| "grad_norm": 0.2900817096233368, | |
| "learning_rate": 0.0004801980198019802, | |
| "loss": 1.3062, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 6.444305310490067, | |
| "eval_accuracy": 0.7304169114350704, | |
| "eval_loss": 1.1914669275283813, | |
| "eval_runtime": 888.5325, | |
| "eval_samples_per_second": 562.023, | |
| "eval_steps_per_second": 5.204, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 6.452729239000512, | |
| "grad_norm": 0.3235354721546173, | |
| "learning_rate": 0.0004792079207920792, | |
| "loss": 1.3074, | |
| "step": 7660 | |
| }, | |
| { | |
| "epoch": 6.461153167510957, | |
| "grad_norm": 0.26384827494621277, | |
| "learning_rate": 0.0004782178217821782, | |
| "loss": 1.3052, | |
| "step": 7670 | |
| }, | |
| { | |
| "epoch": 6.469577096021402, | |
| "grad_norm": 0.27176037430763245, | |
| "learning_rate": 0.00047722772277227724, | |
| "loss": 1.3032, | |
| "step": 7680 | |
| }, | |
| { | |
| "epoch": 6.478001024531846, | |
| "grad_norm": 0.27846911549568176, | |
| "learning_rate": 0.00047623762376237624, | |
| "loss": 1.3038, | |
| "step": 7690 | |
| }, | |
| { | |
| "epoch": 6.48642495304229, | |
| "grad_norm": 0.32258498668670654, | |
| "learning_rate": 0.00047524752475247524, | |
| "loss": 1.3052, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 6.494848881552735, | |
| "grad_norm": 0.3000924587249756, | |
| "learning_rate": 0.00047425742574257423, | |
| "loss": 1.3046, | |
| "step": 7710 | |
| }, | |
| { | |
| "epoch": 6.503272810063179, | |
| "grad_norm": 0.22748370468616486, | |
| "learning_rate": 0.00047326732673267323, | |
| "loss": 1.3054, | |
| "step": 7720 | |
| }, | |
| { | |
| "epoch": 6.511696738573624, | |
| "grad_norm": 0.3552054464817047, | |
| "learning_rate": 0.0004722772277227723, | |
| "loss": 1.3026, | |
| "step": 7730 | |
| }, | |
| { | |
| "epoch": 6.520120667084068, | |
| "grad_norm": 0.2629605531692505, | |
| "learning_rate": 0.0004712871287128713, | |
| "loss": 1.3021, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 6.520120667084068, | |
| "eval_accuracy": 0.7311149976881265, | |
| "eval_loss": 1.1877076625823975, | |
| "eval_runtime": 883.1573, | |
| "eval_samples_per_second": 565.444, | |
| "eval_steps_per_second": 5.236, | |
| "step": 7740 | |
| }, | |
| { | |
| "epoch": 6.528544595594513, | |
| "grad_norm": 0.31692177057266235, | |
| "learning_rate": 0.0004702970297029703, | |
| "loss": 1.3048, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 6.5369685241049575, | |
| "grad_norm": 0.3689730167388916, | |
| "learning_rate": 0.0004693069306930693, | |
| "loss": 1.3016, | |
| "step": 7760 | |
| }, | |
| { | |
| "epoch": 6.545392452615403, | |
| "grad_norm": 0.2619648277759552, | |
| "learning_rate": 0.00046831683168316833, | |
| "loss": 1.3018, | |
| "step": 7770 | |
| }, | |
| { | |
| "epoch": 6.553816381125847, | |
| "grad_norm": 0.29713907837867737, | |
| "learning_rate": 0.0004673267326732674, | |
| "loss": 1.3007, | |
| "step": 7780 | |
| }, | |
| { | |
| "epoch": 6.562240309636291, | |
| "grad_norm": 0.3426944315433502, | |
| "learning_rate": 0.0004663366336633664, | |
| "loss": 1.302, | |
| "step": 7790 | |
| }, | |
| { | |
| "epoch": 6.570664238146736, | |
| "grad_norm": 0.30286312103271484, | |
| "learning_rate": 0.0004653465346534654, | |
| "loss": 1.3024, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 6.57908816665718, | |
| "grad_norm": 0.2533584237098694, | |
| "learning_rate": 0.0004643564356435644, | |
| "loss": 1.2991, | |
| "step": 7810 | |
| }, | |
| { | |
| "epoch": 6.587512095167625, | |
| "grad_norm": 0.23465867340564728, | |
| "learning_rate": 0.0004633663366336634, | |
| "loss": 1.3007, | |
| "step": 7820 | |
| }, | |
| { | |
| "epoch": 6.595936023678069, | |
| "grad_norm": 0.31729191541671753, | |
| "learning_rate": 0.00046237623762376243, | |
| "loss": 1.3, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 6.595936023678069, | |
| "eval_accuracy": 0.7318502985148011, | |
| "eval_loss": 1.1818432807922363, | |
| "eval_runtime": 891.13, | |
| "eval_samples_per_second": 560.385, | |
| "eval_steps_per_second": 5.189, | |
| "step": 7830 | |
| }, | |
| { | |
| "epoch": 6.6043599521885135, | |
| "grad_norm": 0.26264631748199463, | |
| "learning_rate": 0.00046138613861386143, | |
| "loss": 1.3003, | |
| "step": 7840 | |
| }, | |
| { | |
| "epoch": 6.6127838806989585, | |
| "grad_norm": 0.26062801480293274, | |
| "learning_rate": 0.0004603960396039604, | |
| "loss": 1.2977, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 6.621207809209403, | |
| "grad_norm": 0.2755686640739441, | |
| "learning_rate": 0.0004594059405940594, | |
| "loss": 1.2979, | |
| "step": 7860 | |
| }, | |
| { | |
| "epoch": 6.629631737719848, | |
| "grad_norm": 0.32309025526046753, | |
| "learning_rate": 0.0004584158415841584, | |
| "loss": 1.297, | |
| "step": 7870 | |
| }, | |
| { | |
| "epoch": 6.638055666230292, | |
| "grad_norm": 0.2709057927131653, | |
| "learning_rate": 0.0004574257425742575, | |
| "loss": 1.2999, | |
| "step": 7880 | |
| }, | |
| { | |
| "epoch": 6.646479594740737, | |
| "grad_norm": 0.2785532772541046, | |
| "learning_rate": 0.00045643564356435647, | |
| "loss": 1.2959, | |
| "step": 7890 | |
| }, | |
| { | |
| "epoch": 6.654903523251181, | |
| "grad_norm": 0.2822953164577484, | |
| "learning_rate": 0.00045544554455445547, | |
| "loss": 1.2984, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 6.663327451761625, | |
| "grad_norm": 0.2704668641090393, | |
| "learning_rate": 0.00045445544554455447, | |
| "loss": 1.2956, | |
| "step": 7910 | |
| }, | |
| { | |
| "epoch": 6.67175138027207, | |
| "grad_norm": 0.3228791058063507, | |
| "learning_rate": 0.00045346534653465347, | |
| "loss": 1.2984, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 6.67175138027207, | |
| "eval_accuracy": 0.7318941432804211, | |
| "eval_loss": 1.184158205986023, | |
| "eval_runtime": 883.7641, | |
| "eval_samples_per_second": 565.056, | |
| "eval_steps_per_second": 5.232, | |
| "step": 7920 | |
| }, | |
| { | |
| "epoch": 6.680175308782514, | |
| "grad_norm": 0.2641367018222809, | |
| "learning_rate": 0.0004524752475247525, | |
| "loss": 1.299, | |
| "step": 7930 | |
| }, | |
| { | |
| "epoch": 6.6885992372929595, | |
| "grad_norm": 0.28555190563201904, | |
| "learning_rate": 0.0004514851485148515, | |
| "loss": 1.2985, | |
| "step": 7940 | |
| }, | |
| { | |
| "epoch": 6.697023165803404, | |
| "grad_norm": 0.2615039050579071, | |
| "learning_rate": 0.0004504950495049505, | |
| "loss": 1.294, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 6.705447094313849, | |
| "grad_norm": 0.25349870324134827, | |
| "learning_rate": 0.0004495049504950495, | |
| "loss": 1.295, | |
| "step": 7960 | |
| }, | |
| { | |
| "epoch": 6.713871022824293, | |
| "grad_norm": 0.3342011272907257, | |
| "learning_rate": 0.0004485148514851485, | |
| "loss": 1.2963, | |
| "step": 7970 | |
| }, | |
| { | |
| "epoch": 6.722294951334737, | |
| "grad_norm": 0.2608206570148468, | |
| "learning_rate": 0.00044752475247524756, | |
| "loss": 1.2957, | |
| "step": 7980 | |
| }, | |
| { | |
| "epoch": 6.730718879845182, | |
| "grad_norm": 0.27476873993873596, | |
| "learning_rate": 0.00044653465346534656, | |
| "loss": 1.2939, | |
| "step": 7990 | |
| }, | |
| { | |
| "epoch": 6.739142808355626, | |
| "grad_norm": 0.3241907060146332, | |
| "learning_rate": 0.00044554455445544556, | |
| "loss": 1.2965, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 6.747566736866071, | |
| "grad_norm": 0.3494180142879486, | |
| "learning_rate": 0.00044455445544554456, | |
| "loss": 1.2962, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 6.747566736866071, | |
| "eval_accuracy": 0.7322386411238602, | |
| "eval_loss": 1.182516098022461, | |
| "eval_runtime": 889.7545, | |
| "eval_samples_per_second": 561.251, | |
| "eval_steps_per_second": 5.197, | |
| "step": 8010 | |
| }, | |
| { | |
| "epoch": 6.755990665376515, | |
| "grad_norm": 0.2616145610809326, | |
| "learning_rate": 0.00044356435643564356, | |
| "loss": 1.2958, | |
| "step": 8020 | |
| }, | |
| { | |
| "epoch": 6.7644145938869595, | |
| "grad_norm": 0.29238995909690857, | |
| "learning_rate": 0.0004425742574257426, | |
| "loss": 1.293, | |
| "step": 8030 | |
| }, | |
| { | |
| "epoch": 6.772838522397405, | |
| "grad_norm": 0.24060964584350586, | |
| "learning_rate": 0.0004415841584158416, | |
| "loss": 1.2948, | |
| "step": 8040 | |
| }, | |
| { | |
| "epoch": 6.781262450907849, | |
| "grad_norm": 0.29363489151000977, | |
| "learning_rate": 0.0004405940594059406, | |
| "loss": 1.2928, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 6.789686379418294, | |
| "grad_norm": 0.3320622444152832, | |
| "learning_rate": 0.0004396039603960396, | |
| "loss": 1.2925, | |
| "step": 8060 | |
| }, | |
| { | |
| "epoch": 6.798110307928738, | |
| "grad_norm": 0.23857133090496063, | |
| "learning_rate": 0.0004386138613861386, | |
| "loss": 1.2943, | |
| "step": 8070 | |
| }, | |
| { | |
| "epoch": 6.806534236439183, | |
| "grad_norm": 0.24713198840618134, | |
| "learning_rate": 0.00043762376237623765, | |
| "loss": 1.2938, | |
| "step": 8080 | |
| }, | |
| { | |
| "epoch": 6.814958164949627, | |
| "grad_norm": 0.26270854473114014, | |
| "learning_rate": 0.00043663366336633665, | |
| "loss": 1.2916, | |
| "step": 8090 | |
| }, | |
| { | |
| "epoch": 6.823382093460072, | |
| "grad_norm": 0.2450101524591446, | |
| "learning_rate": 0.00043564356435643565, | |
| "loss": 1.2931, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 6.823382093460072, | |
| "eval_accuracy": 0.7332625526391774, | |
| "eval_loss": 1.1757333278656006, | |
| "eval_runtime": 889.0249, | |
| "eval_samples_per_second": 561.712, | |
| "eval_steps_per_second": 5.201, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 6.831806021970516, | |
| "grad_norm": 0.27462685108184814, | |
| "learning_rate": 0.00043465346534653465, | |
| "loss": 1.2923, | |
| "step": 8110 | |
| }, | |
| { | |
| "epoch": 6.8402299504809605, | |
| "grad_norm": 0.2707907259464264, | |
| "learning_rate": 0.00043366336633663365, | |
| "loss": 1.2925, | |
| "step": 8120 | |
| }, | |
| { | |
| "epoch": 6.8486538789914055, | |
| "grad_norm": 0.24748317897319794, | |
| "learning_rate": 0.0004326732673267327, | |
| "loss": 1.2929, | |
| "step": 8130 | |
| }, | |
| { | |
| "epoch": 6.85707780750185, | |
| "grad_norm": 0.226767897605896, | |
| "learning_rate": 0.0004316831683168317, | |
| "loss": 1.2883, | |
| "step": 8140 | |
| }, | |
| { | |
| "epoch": 6.865501736012295, | |
| "grad_norm": 0.24889105558395386, | |
| "learning_rate": 0.0004306930693069307, | |
| "loss": 1.2893, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 6.873925664522739, | |
| "grad_norm": 0.26075902581214905, | |
| "learning_rate": 0.0004297029702970297, | |
| "loss": 1.2893, | |
| "step": 8160 | |
| }, | |
| { | |
| "epoch": 6.882349593033183, | |
| "grad_norm": 0.26210734248161316, | |
| "learning_rate": 0.0004287128712871287, | |
| "loss": 1.2868, | |
| "step": 8170 | |
| }, | |
| { | |
| "epoch": 6.890773521543628, | |
| "grad_norm": 0.2559298872947693, | |
| "learning_rate": 0.00042772277227722774, | |
| "loss": 1.2886, | |
| "step": 8180 | |
| }, | |
| { | |
| "epoch": 6.899197450054072, | |
| "grad_norm": 0.2503817081451416, | |
| "learning_rate": 0.00042673267326732674, | |
| "loss": 1.2883, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 6.899197450054072, | |
| "eval_accuracy": 0.7335132915044345, | |
| "eval_loss": 1.1744158267974854, | |
| "eval_runtime": 885.5636, | |
| "eval_samples_per_second": 563.908, | |
| "eval_steps_per_second": 5.222, | |
| "step": 8190 | |
| }, | |
| { | |
| "epoch": 6.907621378564517, | |
| "grad_norm": 0.24540117383003235, | |
| "learning_rate": 0.00042574257425742574, | |
| "loss": 1.2893, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 6.9160453070749615, | |
| "grad_norm": 0.3089258670806885, | |
| "learning_rate": 0.00042475247524752474, | |
| "loss": 1.2896, | |
| "step": 8210 | |
| }, | |
| { | |
| "epoch": 6.9244692355854065, | |
| "grad_norm": 0.26888999342918396, | |
| "learning_rate": 0.00042376237623762374, | |
| "loss": 1.2895, | |
| "step": 8220 | |
| }, | |
| { | |
| "epoch": 6.932893164095851, | |
| "grad_norm": 0.24743571877479553, | |
| "learning_rate": 0.0004227722772277228, | |
| "loss": 1.2884, | |
| "step": 8230 | |
| }, | |
| { | |
| "epoch": 6.941317092606295, | |
| "grad_norm": 0.24364733695983887, | |
| "learning_rate": 0.0004217821782178218, | |
| "loss": 1.2879, | |
| "step": 8240 | |
| }, | |
| { | |
| "epoch": 6.94974102111674, | |
| "grad_norm": 0.2963743507862091, | |
| "learning_rate": 0.0004207920792079208, | |
| "loss": 1.2878, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 6.958164949627184, | |
| "grad_norm": 0.2444639950990677, | |
| "learning_rate": 0.0004198019801980198, | |
| "loss": 1.2871, | |
| "step": 8260 | |
| }, | |
| { | |
| "epoch": 6.966588878137629, | |
| "grad_norm": 0.27140820026397705, | |
| "learning_rate": 0.0004188118811881188, | |
| "loss": 1.2878, | |
| "step": 8270 | |
| }, | |
| { | |
| "epoch": 6.975012806648073, | |
| "grad_norm": 0.2628765404224396, | |
| "learning_rate": 0.00041782178217821784, | |
| "loss": 1.2873, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 6.975012806648073, | |
| "eval_accuracy": 0.734204579286565, | |
| "eval_loss": 1.171156644821167, | |
| "eval_runtime": 888.1172, | |
| "eval_samples_per_second": 562.286, | |
| "eval_steps_per_second": 5.207, | |
| "step": 8280 | |
| }, | |
| { | |
| "epoch": 6.983436735158518, | |
| "grad_norm": 0.2539413869380951, | |
| "learning_rate": 0.00041683168316831683, | |
| "loss": 1.2874, | |
| "step": 8290 | |
| }, | |
| { | |
| "epoch": 6.991860663668962, | |
| "grad_norm": 0.29522642493247986, | |
| "learning_rate": 0.00041584158415841583, | |
| "loss": 1.2859, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 7.000284592179407, | |
| "grad_norm": 0.29553958773612976, | |
| "learning_rate": 0.00041485148514851483, | |
| "loss": 1.2878, | |
| "step": 8310 | |
| }, | |
| { | |
| "epoch": 7.008708520689852, | |
| "grad_norm": 0.3111182153224945, | |
| "learning_rate": 0.00041386138613861383, | |
| "loss": 1.2874, | |
| "step": 8320 | |
| }, | |
| { | |
| "epoch": 7.017132449200296, | |
| "grad_norm": 0.33146336674690247, | |
| "learning_rate": 0.0004128712871287129, | |
| "loss": 1.287, | |
| "step": 8330 | |
| }, | |
| { | |
| "epoch": 7.025556377710741, | |
| "grad_norm": 0.27456361055374146, | |
| "learning_rate": 0.0004118811881188119, | |
| "loss": 1.2858, | |
| "step": 8340 | |
| }, | |
| { | |
| "epoch": 7.033980306221185, | |
| "grad_norm": 0.29216212034225464, | |
| "learning_rate": 0.0004108910891089109, | |
| "loss": 1.2838, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 7.042404234731629, | |
| "grad_norm": 0.24966631829738617, | |
| "learning_rate": 0.0004099009900990099, | |
| "loss": 1.2857, | |
| "step": 8360 | |
| }, | |
| { | |
| "epoch": 7.050828163242074, | |
| "grad_norm": 0.2910294234752655, | |
| "learning_rate": 0.0004089108910891089, | |
| "loss": 1.2858, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 7.050828163242074, | |
| "eval_accuracy": 0.7346228547150983, | |
| "eval_loss": 1.169946551322937, | |
| "eval_runtime": 890.9908, | |
| "eval_samples_per_second": 560.473, | |
| "eval_steps_per_second": 5.19, | |
| "step": 8370 | |
| }, | |
| { | |
| "epoch": 7.059252091752518, | |
| "grad_norm": 0.26337358355522156, | |
| "learning_rate": 0.0004079207920792079, | |
| "loss": 1.2842, | |
| "step": 8380 | |
| }, | |
| { | |
| "epoch": 7.067676020262963, | |
| "grad_norm": 0.2426845133304596, | |
| "learning_rate": 0.0004069306930693069, | |
| "loss": 1.2836, | |
| "step": 8390 | |
| }, | |
| { | |
| "epoch": 7.0760999487734075, | |
| "grad_norm": 0.2740408778190613, | |
| "learning_rate": 0.000405940594059406, | |
| "loss": 1.2842, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 7.084523877283853, | |
| "grad_norm": 0.27966201305389404, | |
| "learning_rate": 0.000404950495049505, | |
| "loss": 1.2841, | |
| "step": 8410 | |
| }, | |
| { | |
| "epoch": 7.092947805794297, | |
| "grad_norm": 0.3083817660808563, | |
| "learning_rate": 0.00040396039603960397, | |
| "loss": 1.2823, | |
| "step": 8420 | |
| }, | |
| { | |
| "epoch": 7.101371734304741, | |
| "grad_norm": 0.30730104446411133, | |
| "learning_rate": 0.000402970297029703, | |
| "loss": 1.2845, | |
| "step": 8430 | |
| }, | |
| { | |
| "epoch": 7.109795662815186, | |
| "grad_norm": 0.2973144054412842, | |
| "learning_rate": 0.000401980198019802, | |
| "loss": 1.2814, | |
| "step": 8440 | |
| }, | |
| { | |
| "epoch": 7.11821959132563, | |
| "grad_norm": 0.2775426208972931, | |
| "learning_rate": 0.000400990099009901, | |
| "loss": 1.2823, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 7.126643519836075, | |
| "grad_norm": 0.2734345495700836, | |
| "learning_rate": 0.0004, | |
| "loss": 1.2819, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 7.126643519836075, | |
| "eval_accuracy": 0.735104089750221, | |
| "eval_loss": 1.1682698726654053, | |
| "eval_runtime": 886.7497, | |
| "eval_samples_per_second": 563.153, | |
| "eval_steps_per_second": 5.215, | |
| "step": 8460 | |
| }, | |
| { | |
| "epoch": 7.135067448346519, | |
| "grad_norm": 0.27912047505378723, | |
| "learning_rate": 0.000399009900990099, | |
| "loss": 1.2826, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 7.143491376856964, | |
| "grad_norm": 0.3084285855293274, | |
| "learning_rate": 0.00039801980198019807, | |
| "loss": 1.2811, | |
| "step": 8480 | |
| }, | |
| { | |
| "epoch": 7.1519153053674085, | |
| "grad_norm": 0.30194783210754395, | |
| "learning_rate": 0.00039702970297029707, | |
| "loss": 1.2828, | |
| "step": 8490 | |
| }, | |
| { | |
| "epoch": 7.160339233877853, | |
| "grad_norm": 0.25307685136795044, | |
| "learning_rate": 0.00039603960396039607, | |
| "loss": 1.2791, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 7.168763162388298, | |
| "grad_norm": 0.25018778443336487, | |
| "learning_rate": 0.00039504950495049506, | |
| "loss": 1.2796, | |
| "step": 8510 | |
| }, | |
| { | |
| "epoch": 7.177187090898742, | |
| "grad_norm": 0.2541010081768036, | |
| "learning_rate": 0.00039405940594059406, | |
| "loss": 1.2812, | |
| "step": 8520 | |
| }, | |
| { | |
| "epoch": 7.185611019409187, | |
| "grad_norm": 0.29745373129844666, | |
| "learning_rate": 0.0003930693069306931, | |
| "loss": 1.2828, | |
| "step": 8530 | |
| }, | |
| { | |
| "epoch": 7.194034947919631, | |
| "grad_norm": 0.2740705907344818, | |
| "learning_rate": 0.0003920792079207921, | |
| "loss": 1.2812, | |
| "step": 8540 | |
| }, | |
| { | |
| "epoch": 7.202458876430076, | |
| "grad_norm": 0.23998434841632843, | |
| "learning_rate": 0.0003910891089108911, | |
| "loss": 1.2781, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 7.202458876430076, | |
| "eval_accuracy": 0.7354429371546514, | |
| "eval_loss": 1.1649537086486816, | |
| "eval_runtime": 891.9041, | |
| "eval_samples_per_second": 559.899, | |
| "eval_steps_per_second": 5.184, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 7.21088280494052, | |
| "grad_norm": 0.2691722512245178, | |
| "learning_rate": 0.0003900990099009901, | |
| "loss": 1.2785, | |
| "step": 8560 | |
| }, | |
| { | |
| "epoch": 7.219306733450964, | |
| "grad_norm": 0.28188225626945496, | |
| "learning_rate": 0.0003891089108910891, | |
| "loss": 1.2807, | |
| "step": 8570 | |
| }, | |
| { | |
| "epoch": 7.2277306619614095, | |
| "grad_norm": 0.3311617970466614, | |
| "learning_rate": 0.00038811881188118816, | |
| "loss": 1.2809, | |
| "step": 8580 | |
| }, | |
| { | |
| "epoch": 7.236154590471854, | |
| "grad_norm": 0.2717738747596741, | |
| "learning_rate": 0.00038712871287128716, | |
| "loss": 1.278, | |
| "step": 8590 | |
| }, | |
| { | |
| "epoch": 7.244578518982299, | |
| "grad_norm": 0.27171820402145386, | |
| "learning_rate": 0.00038613861386138616, | |
| "loss": 1.2803, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 7.253002447492743, | |
| "grad_norm": 0.249137282371521, | |
| "learning_rate": 0.00038514851485148515, | |
| "loss": 1.277, | |
| "step": 8610 | |
| }, | |
| { | |
| "epoch": 7.261426376003188, | |
| "grad_norm": 0.26939263939857483, | |
| "learning_rate": 0.00038415841584158415, | |
| "loss": 1.2773, | |
| "step": 8620 | |
| }, | |
| { | |
| "epoch": 7.269850304513632, | |
| "grad_norm": 0.3177802860736847, | |
| "learning_rate": 0.0003831683168316832, | |
| "loss": 1.2763, | |
| "step": 8630 | |
| }, | |
| { | |
| "epoch": 7.278274233024076, | |
| "grad_norm": 0.2421504557132721, | |
| "learning_rate": 0.0003821782178217822, | |
| "loss": 1.2771, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 7.278274233024076, | |
| "eval_accuracy": 0.7357238880776348, | |
| "eval_loss": 1.1646403074264526, | |
| "eval_runtime": 878.5966, | |
| "eval_samples_per_second": 568.379, | |
| "eval_steps_per_second": 5.263, | |
| "step": 8640 | |
| }, | |
| { | |
| "epoch": 7.286698161534521, | |
| "grad_norm": 0.28808215260505676, | |
| "learning_rate": 0.0003811881188118812, | |
| "loss": 1.2744, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 7.295122090044965, | |
| "grad_norm": 0.26363667845726013, | |
| "learning_rate": 0.0003801980198019802, | |
| "loss": 1.2788, | |
| "step": 8660 | |
| }, | |
| { | |
| "epoch": 7.30354601855541, | |
| "grad_norm": 0.35491064190864563, | |
| "learning_rate": 0.0003792079207920792, | |
| "loss": 1.2792, | |
| "step": 8670 | |
| }, | |
| { | |
| "epoch": 7.311969947065855, | |
| "grad_norm": 0.3273920714855194, | |
| "learning_rate": 0.00037821782178217825, | |
| "loss": 1.278, | |
| "step": 8680 | |
| }, | |
| { | |
| "epoch": 7.320393875576299, | |
| "grad_norm": 0.28319239616394043, | |
| "learning_rate": 0.00037722772277227725, | |
| "loss": 1.2762, | |
| "step": 8690 | |
| }, | |
| { | |
| "epoch": 7.328817804086744, | |
| "grad_norm": 0.28414586186408997, | |
| "learning_rate": 0.00037623762376237625, | |
| "loss": 1.2769, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 7.337241732597188, | |
| "grad_norm": 0.25393033027648926, | |
| "learning_rate": 0.00037524752475247524, | |
| "loss": 1.2742, | |
| "step": 8710 | |
| }, | |
| { | |
| "epoch": 7.345665661107633, | |
| "grad_norm": 0.25634288787841797, | |
| "learning_rate": 0.00037425742574257424, | |
| "loss": 1.2753, | |
| "step": 8720 | |
| }, | |
| { | |
| "epoch": 7.354089589618077, | |
| "grad_norm": 0.2355813831090927, | |
| "learning_rate": 0.0003732673267326733, | |
| "loss": 1.2749, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 7.354089589618077, | |
| "eval_accuracy": 0.7361996522899728, | |
| "eval_loss": 1.160847544670105, | |
| "eval_runtime": 889.4544, | |
| "eval_samples_per_second": 561.441, | |
| "eval_steps_per_second": 5.199, | |
| "step": 8730 | |
| }, | |
| { | |
| "epoch": 7.362513518128522, | |
| "grad_norm": 0.24002189934253693, | |
| "learning_rate": 0.0003722772277227723, | |
| "loss": 1.2751, | |
| "step": 8740 | |
| }, | |
| { | |
| "epoch": 7.370937446638966, | |
| "grad_norm": 0.2806450128555298, | |
| "learning_rate": 0.0003712871287128713, | |
| "loss": 1.275, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 7.3793613751494105, | |
| "grad_norm": 0.24552834033966064, | |
| "learning_rate": 0.0003702970297029703, | |
| "loss": 1.2753, | |
| "step": 8760 | |
| }, | |
| { | |
| "epoch": 7.3877853036598555, | |
| "grad_norm": 0.24814461171627045, | |
| "learning_rate": 0.0003693069306930693, | |
| "loss": 1.276, | |
| "step": 8770 | |
| }, | |
| { | |
| "epoch": 7.3962092321703, | |
| "grad_norm": 0.26086533069610596, | |
| "learning_rate": 0.00036831683168316834, | |
| "loss": 1.2744, | |
| "step": 8780 | |
| }, | |
| { | |
| "epoch": 7.404633160680745, | |
| "grad_norm": 0.2854679822921753, | |
| "learning_rate": 0.00036732673267326734, | |
| "loss": 1.2739, | |
| "step": 8790 | |
| }, | |
| { | |
| "epoch": 7.413057089191189, | |
| "grad_norm": 0.24847003817558289, | |
| "learning_rate": 0.00036633663366336634, | |
| "loss": 1.2731, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 7.421481017701634, | |
| "grad_norm": 0.3230905532836914, | |
| "learning_rate": 0.00036534653465346533, | |
| "loss": 1.2732, | |
| "step": 8810 | |
| }, | |
| { | |
| "epoch": 7.429904946212078, | |
| "grad_norm": 0.30264076590538025, | |
| "learning_rate": 0.00036435643564356433, | |
| "loss": 1.273, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 7.429904946212078, | |
| "eval_accuracy": 0.7366944357714759, | |
| "eval_loss": 1.1585748195648193, | |
| "eval_runtime": 884.7129, | |
| "eval_samples_per_second": 564.45, | |
| "eval_steps_per_second": 5.227, | |
| "step": 8820 | |
| }, | |
| { | |
| "epoch": 7.438328874722522, | |
| "grad_norm": 0.25705888867378235, | |
| "learning_rate": 0.0003633663366336634, | |
| "loss": 1.2738, | |
| "step": 8830 | |
| }, | |
| { | |
| "epoch": 7.446752803232967, | |
| "grad_norm": 0.2455236166715622, | |
| "learning_rate": 0.0003623762376237624, | |
| "loss": 1.2727, | |
| "step": 8840 | |
| }, | |
| { | |
| "epoch": 7.4551767317434114, | |
| "grad_norm": 0.2877678871154785, | |
| "learning_rate": 0.0003613861386138614, | |
| "loss": 1.2733, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 7.4636006602538565, | |
| "grad_norm": 0.2644253969192505, | |
| "learning_rate": 0.0003603960396039604, | |
| "loss": 1.2711, | |
| "step": 8860 | |
| }, | |
| { | |
| "epoch": 7.472024588764301, | |
| "grad_norm": 0.25103089213371277, | |
| "learning_rate": 0.0003594059405940594, | |
| "loss": 1.2727, | |
| "step": 8870 | |
| }, | |
| { | |
| "epoch": 7.480448517274746, | |
| "grad_norm": 0.28732746839523315, | |
| "learning_rate": 0.00035841584158415843, | |
| "loss": 1.2729, | |
| "step": 8880 | |
| }, | |
| { | |
| "epoch": 7.48887244578519, | |
| "grad_norm": 0.3096875846385956, | |
| "learning_rate": 0.00035742574257425743, | |
| "loss": 1.2733, | |
| "step": 8890 | |
| }, | |
| { | |
| "epoch": 7.497296374295634, | |
| "grad_norm": 0.27695363759994507, | |
| "learning_rate": 0.0003564356435643564, | |
| "loss": 1.2719, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 7.505720302806079, | |
| "grad_norm": 0.26089048385620117, | |
| "learning_rate": 0.0003554455445544554, | |
| "loss": 1.2718, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 7.505720302806079, | |
| "eval_accuracy": 0.7372118632602084, | |
| "eval_loss": 1.1557950973510742, | |
| "eval_runtime": 890.5411, | |
| "eval_samples_per_second": 560.756, | |
| "eval_steps_per_second": 5.192, | |
| "step": 8910 | |
| }, | |
| { | |
| "epoch": 7.514144231316523, | |
| "grad_norm": 0.24578547477722168, | |
| "learning_rate": 0.0003544554455445544, | |
| "loss": 1.2723, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 7.522568159826968, | |
| "grad_norm": 0.2624136209487915, | |
| "learning_rate": 0.0003534653465346535, | |
| "loss": 1.2708, | |
| "step": 8930 | |
| }, | |
| { | |
| "epoch": 7.530992088337412, | |
| "grad_norm": 0.25748109817504883, | |
| "learning_rate": 0.0003524752475247525, | |
| "loss": 1.2708, | |
| "step": 8940 | |
| }, | |
| { | |
| "epoch": 7.5394160168478574, | |
| "grad_norm": 0.28079208731651306, | |
| "learning_rate": 0.00035148514851485147, | |
| "loss": 1.2727, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 7.547839945358302, | |
| "grad_norm": 0.2706407904624939, | |
| "learning_rate": 0.00035049504950495047, | |
| "loss": 1.2712, | |
| "step": 8960 | |
| }, | |
| { | |
| "epoch": 7.556263873868746, | |
| "grad_norm": 0.27032172679901123, | |
| "learning_rate": 0.00034950495049504947, | |
| "loss": 1.2673, | |
| "step": 8970 | |
| }, | |
| { | |
| "epoch": 7.564687802379191, | |
| "grad_norm": 0.24915465712547302, | |
| "learning_rate": 0.0003485148514851485, | |
| "loss": 1.2682, | |
| "step": 8980 | |
| }, | |
| { | |
| "epoch": 7.573111730889635, | |
| "grad_norm": 0.24191108345985413, | |
| "learning_rate": 0.0003475247524752475, | |
| "loss": 1.2719, | |
| "step": 8990 | |
| }, | |
| { | |
| "epoch": 7.58153565940008, | |
| "grad_norm": 0.2806965112686157, | |
| "learning_rate": 0.0003465346534653465, | |
| "loss": 1.2681, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 7.58153565940008, | |
| "eval_accuracy": 0.7375367942915361, | |
| "eval_loss": 1.1551363468170166, | |
| "eval_runtime": 876.3936, | |
| "eval_samples_per_second": 569.808, | |
| "eval_steps_per_second": 5.276, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 7.589959587910524, | |
| "grad_norm": 0.2909415364265442, | |
| "learning_rate": 0.0003455445544554455, | |
| "loss": 1.2687, | |
| "step": 9010 | |
| }, | |
| { | |
| "epoch": 7.598383516420968, | |
| "grad_norm": 0.30222398042678833, | |
| "learning_rate": 0.0003445544554455445, | |
| "loss": 1.2684, | |
| "step": 9020 | |
| }, | |
| { | |
| "epoch": 7.606807444931413, | |
| "grad_norm": 0.25246381759643555, | |
| "learning_rate": 0.0003435643564356436, | |
| "loss": 1.2689, | |
| "step": 9030 | |
| }, | |
| { | |
| "epoch": 7.6152313734418575, | |
| "grad_norm": 0.25202953815460205, | |
| "learning_rate": 0.0003425742574257426, | |
| "loss": 1.2689, | |
| "step": 9040 | |
| }, | |
| { | |
| "epoch": 7.623655301952303, | |
| "grad_norm": 0.2351432740688324, | |
| "learning_rate": 0.0003415841584158416, | |
| "loss": 1.2655, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 7.632079230462747, | |
| "grad_norm": 0.26545044779777527, | |
| "learning_rate": 0.0003405940594059406, | |
| "loss": 1.2659, | |
| "step": 9060 | |
| }, | |
| { | |
| "epoch": 7.640503158973192, | |
| "grad_norm": 0.248436838388443, | |
| "learning_rate": 0.0003396039603960396, | |
| "loss": 1.2677, | |
| "step": 9070 | |
| }, | |
| { | |
| "epoch": 7.648927087483636, | |
| "grad_norm": 0.3021203279495239, | |
| "learning_rate": 0.00033861386138613867, | |
| "loss": 1.2692, | |
| "step": 9080 | |
| }, | |
| { | |
| "epoch": 7.657351015994081, | |
| "grad_norm": 0.27577024698257446, | |
| "learning_rate": 0.00033762376237623766, | |
| "loss": 1.2672, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 7.657351015994081, | |
| "eval_accuracy": 0.7378275299930978, | |
| "eval_loss": 1.1522574424743652, | |
| "eval_runtime": 891.8663, | |
| "eval_samples_per_second": 559.923, | |
| "eval_steps_per_second": 5.185, | |
| "step": 9090 | |
| }, | |
| { | |
| "epoch": 7.665774944504525, | |
| "grad_norm": 0.2087612897157669, | |
| "learning_rate": 0.00033663366336633666, | |
| "loss": 1.2655, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 7.674198873014969, | |
| "grad_norm": 0.24880866706371307, | |
| "learning_rate": 0.00033564356435643566, | |
| "loss": 1.2677, | |
| "step": 9110 | |
| }, | |
| { | |
| "epoch": 7.682622801525414, | |
| "grad_norm": 0.26335397362709045, | |
| "learning_rate": 0.00033465346534653466, | |
| "loss": 1.2647, | |
| "step": 9120 | |
| }, | |
| { | |
| "epoch": 7.6910467300358585, | |
| "grad_norm": 0.25413015484809875, | |
| "learning_rate": 0.0003336633663366337, | |
| "loss": 1.265, | |
| "step": 9130 | |
| }, | |
| { | |
| "epoch": 7.6994706585463035, | |
| "grad_norm": 0.3119896650314331, | |
| "learning_rate": 0.0003326732673267327, | |
| "loss": 1.2674, | |
| "step": 9140 | |
| }, | |
| { | |
| "epoch": 7.707894587056748, | |
| "grad_norm": 0.2269907146692276, | |
| "learning_rate": 0.0003316831683168317, | |
| "loss": 1.2647, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 7.716318515567192, | |
| "grad_norm": 0.31745684146881104, | |
| "learning_rate": 0.0003306930693069307, | |
| "loss": 1.2668, | |
| "step": 9160 | |
| }, | |
| { | |
| "epoch": 7.724742444077637, | |
| "grad_norm": 0.28096485137939453, | |
| "learning_rate": 0.0003297029702970297, | |
| "loss": 1.2658, | |
| "step": 9170 | |
| }, | |
| { | |
| "epoch": 7.733166372588081, | |
| "grad_norm": 0.26646697521209717, | |
| "learning_rate": 0.00032871287128712876, | |
| "loss": 1.2664, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 7.733166372588081, | |
| "eval_accuracy": 0.7381772885380696, | |
| "eval_loss": 1.151962161064148, | |
| "eval_runtime": 889.9446, | |
| "eval_samples_per_second": 561.132, | |
| "eval_steps_per_second": 5.196, | |
| "step": 9180 | |
| }, | |
| { | |
| "epoch": 7.741590301098526, | |
| "grad_norm": 0.24463273584842682, | |
| "learning_rate": 0.00032772277227722775, | |
| "loss": 1.2663, | |
| "step": 9190 | |
| }, | |
| { | |
| "epoch": 7.75001422960897, | |
| "grad_norm": 0.23978425562381744, | |
| "learning_rate": 0.00032673267326732675, | |
| "loss": 1.2634, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 7.758438158119414, | |
| "grad_norm": 0.25662901997566223, | |
| "learning_rate": 0.00032574257425742575, | |
| "loss": 1.2651, | |
| "step": 9210 | |
| }, | |
| { | |
| "epoch": 7.766862086629859, | |
| "grad_norm": 0.2697198688983917, | |
| "learning_rate": 0.00032475247524752475, | |
| "loss": 1.2628, | |
| "step": 9220 | |
| }, | |
| { | |
| "epoch": 7.775286015140304, | |
| "grad_norm": 0.2753835618495941, | |
| "learning_rate": 0.0003237623762376238, | |
| "loss": 1.2632, | |
| "step": 9230 | |
| }, | |
| { | |
| "epoch": 7.783709943650749, | |
| "grad_norm": 0.23303931951522827, | |
| "learning_rate": 0.0003227722772277228, | |
| "loss": 1.2625, | |
| "step": 9240 | |
| }, | |
| { | |
| "epoch": 7.792133872161193, | |
| "grad_norm": 0.26077255606651306, | |
| "learning_rate": 0.0003217821782178218, | |
| "loss": 1.2648, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 7.800557800671638, | |
| "grad_norm": 0.25494781136512756, | |
| "learning_rate": 0.0003207920792079208, | |
| "loss": 1.2648, | |
| "step": 9260 | |
| }, | |
| { | |
| "epoch": 7.808981729182082, | |
| "grad_norm": 0.2447885125875473, | |
| "learning_rate": 0.0003198019801980198, | |
| "loss": 1.2645, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 7.808981729182082, | |
| "eval_accuracy": 0.7385748699480129, | |
| "eval_loss": 1.1492513418197632, | |
| "eval_runtime": 885.3604, | |
| "eval_samples_per_second": 564.037, | |
| "eval_steps_per_second": 5.223, | |
| "step": 9270 | |
| }, | |
| { | |
| "epoch": 7.817405657692527, | |
| "grad_norm": 0.23961922526359558, | |
| "learning_rate": 0.00031881188118811885, | |
| "loss": 1.2631, | |
| "step": 9280 | |
| }, | |
| { | |
| "epoch": 7.825829586202971, | |
| "grad_norm": 0.2850695252418518, | |
| "learning_rate": 0.00031782178217821784, | |
| "loss": 1.2636, | |
| "step": 9290 | |
| }, | |
| { | |
| "epoch": 7.834253514713415, | |
| "grad_norm": 0.257962167263031, | |
| "learning_rate": 0.00031683168316831684, | |
| "loss": 1.2647, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 7.84267744322386, | |
| "grad_norm": 0.28995752334594727, | |
| "learning_rate": 0.00031584158415841584, | |
| "loss": 1.2613, | |
| "step": 9310 | |
| }, | |
| { | |
| "epoch": 7.851101371734305, | |
| "grad_norm": 0.23544956743717194, | |
| "learning_rate": 0.00031485148514851484, | |
| "loss": 1.261, | |
| "step": 9320 | |
| }, | |
| { | |
| "epoch": 7.85952530024475, | |
| "grad_norm": 0.27855780720710754, | |
| "learning_rate": 0.0003138613861386139, | |
| "loss": 1.2615, | |
| "step": 9330 | |
| }, | |
| { | |
| "epoch": 7.867949228755194, | |
| "grad_norm": 0.2668914198875427, | |
| "learning_rate": 0.0003128712871287129, | |
| "loss": 1.2629, | |
| "step": 9340 | |
| }, | |
| { | |
| "epoch": 7.876373157265638, | |
| "grad_norm": 0.2561187446117401, | |
| "learning_rate": 0.0003118811881188119, | |
| "loss": 1.2614, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 7.884797085776083, | |
| "grad_norm": 0.23943807184696198, | |
| "learning_rate": 0.0003108910891089109, | |
| "loss": 1.2591, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 7.884797085776083, | |
| "eval_accuracy": 0.7389714933005799, | |
| "eval_loss": 1.1477636098861694, | |
| "eval_runtime": 884.2901, | |
| "eval_samples_per_second": 564.72, | |
| "eval_steps_per_second": 5.229, | |
| "step": 9360 | |
| }, | |
| { | |
| "epoch": 7.893221014286527, | |
| "grad_norm": 0.3144013583660126, | |
| "learning_rate": 0.0003099009900990099, | |
| "loss": 1.2606, | |
| "step": 9370 | |
| }, | |
| { | |
| "epoch": 7.901644942796972, | |
| "grad_norm": 0.30694615840911865, | |
| "learning_rate": 0.00030891089108910894, | |
| "loss": 1.2607, | |
| "step": 9380 | |
| }, | |
| { | |
| "epoch": 7.910068871307416, | |
| "grad_norm": 0.28703033924102783, | |
| "learning_rate": 0.00030792079207920793, | |
| "loss": 1.2625, | |
| "step": 9390 | |
| }, | |
| { | |
| "epoch": 7.918492799817861, | |
| "grad_norm": 0.24160224199295044, | |
| "learning_rate": 0.00030693069306930693, | |
| "loss": 1.2594, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 7.9269167283283055, | |
| "grad_norm": 0.26693734526634216, | |
| "learning_rate": 0.00030594059405940593, | |
| "loss": 1.2605, | |
| "step": 9410 | |
| }, | |
| { | |
| "epoch": 7.935340656838751, | |
| "grad_norm": 0.23551449179649353, | |
| "learning_rate": 0.00030495049504950493, | |
| "loss": 1.2589, | |
| "step": 9420 | |
| }, | |
| { | |
| "epoch": 7.943764585349195, | |
| "grad_norm": 0.23266945779323578, | |
| "learning_rate": 0.000303960396039604, | |
| "loss": 1.2575, | |
| "step": 9430 | |
| }, | |
| { | |
| "epoch": 7.952188513859639, | |
| "grad_norm": 0.19307726621627808, | |
| "learning_rate": 0.000302970297029703, | |
| "loss": 1.2594, | |
| "step": 9440 | |
| }, | |
| { | |
| "epoch": 7.960612442370084, | |
| "grad_norm": 0.2490869015455246, | |
| "learning_rate": 0.000301980198019802, | |
| "loss": 1.2594, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 7.960612442370084, | |
| "eval_accuracy": 0.7392987654643606, | |
| "eval_loss": 1.1463170051574707, | |
| "eval_runtime": 887.3291, | |
| "eval_samples_per_second": 562.786, | |
| "eval_steps_per_second": 5.211, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 7.969036370880528, | |
| "grad_norm": 0.24613766372203827, | |
| "learning_rate": 0.000300990099009901, | |
| "loss": 1.2586, | |
| "step": 9460 | |
| }, | |
| { | |
| "epoch": 7.977460299390973, | |
| "grad_norm": 0.28653955459594727, | |
| "learning_rate": 0.0003, | |
| "loss": 1.2596, | |
| "step": 9470 | |
| }, | |
| { | |
| "epoch": 7.985884227901417, | |
| "grad_norm": 0.2534151077270508, | |
| "learning_rate": 0.000299009900990099, | |
| "loss": 1.258, | |
| "step": 9480 | |
| }, | |
| { | |
| "epoch": 7.994308156411861, | |
| "grad_norm": 0.2278260588645935, | |
| "learning_rate": 0.000298019801980198, | |
| "loss": 1.2596, | |
| "step": 9490 | |
| }, | |
| { | |
| "epoch": 8.002732084922306, | |
| "grad_norm": 0.24955512583255768, | |
| "learning_rate": 0.000297029702970297, | |
| "loss": 1.2589, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 8.011156013432752, | |
| "grad_norm": 0.24727576971054077, | |
| "learning_rate": 0.000296039603960396, | |
| "loss": 1.259, | |
| "step": 9510 | |
| }, | |
| { | |
| "epoch": 8.019579941943196, | |
| "grad_norm": 0.23246212303638458, | |
| "learning_rate": 0.000295049504950495, | |
| "loss": 1.2569, | |
| "step": 9520 | |
| }, | |
| { | |
| "epoch": 8.02800387045364, | |
| "grad_norm": 0.31031736731529236, | |
| "learning_rate": 0.00029405940594059407, | |
| "loss": 1.2576, | |
| "step": 9530 | |
| }, | |
| { | |
| "epoch": 8.036427798964084, | |
| "grad_norm": 0.25005343556404114, | |
| "learning_rate": 0.00029306930693069307, | |
| "loss": 1.2586, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 8.036427798964084, | |
| "eval_accuracy": 0.7396166114825387, | |
| "eval_loss": 1.1443780660629272, | |
| "eval_runtime": 886.7087, | |
| "eval_samples_per_second": 563.179, | |
| "eval_steps_per_second": 5.215, | |
| "step": 9540 | |
| }, | |
| { | |
| "epoch": 8.044851727474528, | |
| "grad_norm": 0.26693809032440186, | |
| "learning_rate": 0.00029207920792079207, | |
| "loss": 1.2565, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 8.053275655984974, | |
| "grad_norm": 0.2694302797317505, | |
| "learning_rate": 0.00029108910891089107, | |
| "loss": 1.2578, | |
| "step": 9560 | |
| }, | |
| { | |
| "epoch": 8.061699584495418, | |
| "grad_norm": 0.28717589378356934, | |
| "learning_rate": 0.00029009900990099006, | |
| "loss": 1.257, | |
| "step": 9570 | |
| }, | |
| { | |
| "epoch": 8.070123513005862, | |
| "grad_norm": 0.2473517805337906, | |
| "learning_rate": 0.0002891089108910891, | |
| "loss": 1.2584, | |
| "step": 9580 | |
| }, | |
| { | |
| "epoch": 8.078547441516307, | |
| "grad_norm": 0.238663449883461, | |
| "learning_rate": 0.0002881188118811881, | |
| "loss": 1.2565, | |
| "step": 9590 | |
| }, | |
| { | |
| "epoch": 8.086971370026752, | |
| "grad_norm": 0.25168007612228394, | |
| "learning_rate": 0.0002871287128712871, | |
| "loss": 1.2601, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 8.095395298537197, | |
| "grad_norm": 0.2553163766860962, | |
| "learning_rate": 0.0002861386138613861, | |
| "loss": 1.2582, | |
| "step": 9610 | |
| }, | |
| { | |
| "epoch": 8.10381922704764, | |
| "grad_norm": 0.22442133724689484, | |
| "learning_rate": 0.0002851485148514851, | |
| "loss": 1.2564, | |
| "step": 9620 | |
| }, | |
| { | |
| "epoch": 8.112243155558085, | |
| "grad_norm": 0.2428729087114334, | |
| "learning_rate": 0.00028415841584158416, | |
| "loss": 1.2555, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 8.112243155558085, | |
| "eval_accuracy": 0.7398516451845706, | |
| "eval_loss": 1.1434710025787354, | |
| "eval_runtime": 884.9135, | |
| "eval_samples_per_second": 564.322, | |
| "eval_steps_per_second": 5.225, | |
| "step": 9630 | |
| }, | |
| { | |
| "epoch": 8.120667084068529, | |
| "grad_norm": 0.24635536968708038, | |
| "learning_rate": 0.00028316831683168316, | |
| "loss": 1.256, | |
| "step": 9640 | |
| }, | |
| { | |
| "epoch": 8.129091012578975, | |
| "grad_norm": 0.25894826650619507, | |
| "learning_rate": 0.00028217821782178216, | |
| "loss": 1.2559, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 8.13751494108942, | |
| "grad_norm": 0.28364095091819763, | |
| "learning_rate": 0.0002811881188118812, | |
| "loss": 1.2558, | |
| "step": 9660 | |
| }, | |
| { | |
| "epoch": 8.145938869599863, | |
| "grad_norm": 0.27813902497291565, | |
| "learning_rate": 0.0002801980198019802, | |
| "loss": 1.2551, | |
| "step": 9670 | |
| }, | |
| { | |
| "epoch": 8.154362798110308, | |
| "grad_norm": 0.25842994451522827, | |
| "learning_rate": 0.00027920792079207926, | |
| "loss": 1.2566, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 8.162786726620752, | |
| "grad_norm": 0.28136196732521057, | |
| "learning_rate": 0.00027821782178217826, | |
| "loss": 1.2558, | |
| "step": 9690 | |
| }, | |
| { | |
| "epoch": 8.171210655131198, | |
| "grad_norm": 0.24087685346603394, | |
| "learning_rate": 0.00027722772277227726, | |
| "loss": 1.2548, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 8.179634583641642, | |
| "grad_norm": 0.24687226116657257, | |
| "learning_rate": 0.00027623762376237626, | |
| "loss": 1.2585, | |
| "step": 9710 | |
| }, | |
| { | |
| "epoch": 8.188058512152086, | |
| "grad_norm": 0.22570998966693878, | |
| "learning_rate": 0.00027524752475247525, | |
| "loss": 1.2534, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 8.188058512152086, | |
| "eval_accuracy": 0.7402963892075639, | |
| "eval_loss": 1.1417516469955444, | |
| "eval_runtime": 887.2248, | |
| "eval_samples_per_second": 562.852, | |
| "eval_steps_per_second": 5.212, | |
| "step": 9720 | |
| }, | |
| { | |
| "epoch": 8.19648244066253, | |
| "grad_norm": 0.2180325835943222, | |
| "learning_rate": 0.0002742574257425743, | |
| "loss": 1.254, | |
| "step": 9730 | |
| }, | |
| { | |
| "epoch": 8.204906369172976, | |
| "grad_norm": 0.24650686979293823, | |
| "learning_rate": 0.0002732673267326733, | |
| "loss": 1.2549, | |
| "step": 9740 | |
| }, | |
| { | |
| "epoch": 8.21333029768342, | |
| "grad_norm": 0.23055210709571838, | |
| "learning_rate": 0.0002722772277227723, | |
| "loss": 1.2533, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 8.221754226193864, | |
| "grad_norm": 0.2486119419336319, | |
| "learning_rate": 0.0002712871287128713, | |
| "loss": 1.2535, | |
| "step": 9760 | |
| }, | |
| { | |
| "epoch": 8.230178154704308, | |
| "grad_norm": 0.2295829951763153, | |
| "learning_rate": 0.0002702970297029703, | |
| "loss": 1.2532, | |
| "step": 9770 | |
| }, | |
| { | |
| "epoch": 8.238602083214753, | |
| "grad_norm": 0.24997445940971375, | |
| "learning_rate": 0.00026930693069306935, | |
| "loss": 1.2531, | |
| "step": 9780 | |
| }, | |
| { | |
| "epoch": 8.247026011725199, | |
| "grad_norm": 0.26696640253067017, | |
| "learning_rate": 0.00026831683168316835, | |
| "loss": 1.2537, | |
| "step": 9790 | |
| }, | |
| { | |
| "epoch": 8.255449940235643, | |
| "grad_norm": 0.26139459013938904, | |
| "learning_rate": 0.00026732673267326735, | |
| "loss": 1.255, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 8.263873868746087, | |
| "grad_norm": 0.24359402060508728, | |
| "learning_rate": 0.00026633663366336635, | |
| "loss": 1.2531, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 8.263873868746087, | |
| "eval_accuracy": 0.7405673501883495, | |
| "eval_loss": 1.139613389968872, | |
| "eval_runtime": 879.601, | |
| "eval_samples_per_second": 567.73, | |
| "eval_steps_per_second": 5.257, | |
| "step": 9810 | |
| }, | |
| { | |
| "epoch": 8.272297797256531, | |
| "grad_norm": 0.2327917069196701, | |
| "learning_rate": 0.00026534653465346534, | |
| "loss": 1.2534, | |
| "step": 9820 | |
| }, | |
| { | |
| "epoch": 8.280721725766975, | |
| "grad_norm": 0.25629815459251404, | |
| "learning_rate": 0.0002643564356435644, | |
| "loss": 1.2531, | |
| "step": 9830 | |
| }, | |
| { | |
| "epoch": 8.289145654277421, | |
| "grad_norm": 0.22450138628482819, | |
| "learning_rate": 0.0002633663366336634, | |
| "loss": 1.2529, | |
| "step": 9840 | |
| }, | |
| { | |
| "epoch": 8.297569582787865, | |
| "grad_norm": 0.2623524069786072, | |
| "learning_rate": 0.0002623762376237624, | |
| "loss": 1.2504, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 8.30599351129831, | |
| "grad_norm": 0.2159668356180191, | |
| "learning_rate": 0.0002613861386138614, | |
| "loss": 1.2528, | |
| "step": 9860 | |
| }, | |
| { | |
| "epoch": 8.314417439808754, | |
| "grad_norm": 0.24267102777957916, | |
| "learning_rate": 0.0002603960396039604, | |
| "loss": 1.2514, | |
| "step": 9870 | |
| }, | |
| { | |
| "epoch": 8.322841368319198, | |
| "grad_norm": 0.2541745603084564, | |
| "learning_rate": 0.00025940594059405944, | |
| "loss": 1.2505, | |
| "step": 9880 | |
| }, | |
| { | |
| "epoch": 8.331265296829644, | |
| "grad_norm": 0.28231385350227356, | |
| "learning_rate": 0.00025841584158415844, | |
| "loss": 1.2511, | |
| "step": 9890 | |
| }, | |
| { | |
| "epoch": 8.339689225340088, | |
| "grad_norm": 0.2412833273410797, | |
| "learning_rate": 0.00025742574257425744, | |
| "loss": 1.2506, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 8.339689225340088, | |
| "eval_accuracy": 0.740612444763646, | |
| "eval_loss": 1.140478491783142, | |
| "eval_runtime": 884.9323, | |
| "eval_samples_per_second": 564.31, | |
| "eval_steps_per_second": 5.225, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 8.348113153850532, | |
| "grad_norm": 0.2641441524028778, | |
| "learning_rate": 0.00025643564356435644, | |
| "loss": 1.2519, | |
| "step": 9910 | |
| }, | |
| { | |
| "epoch": 8.356537082360976, | |
| "grad_norm": 0.2675786316394806, | |
| "learning_rate": 0.00025544554455445543, | |
| "loss": 1.2516, | |
| "step": 9920 | |
| }, | |
| { | |
| "epoch": 8.364961010871422, | |
| "grad_norm": 0.2118910253047943, | |
| "learning_rate": 0.0002544554455445545, | |
| "loss": 1.2511, | |
| "step": 9930 | |
| }, | |
| { | |
| "epoch": 8.373384939381866, | |
| "grad_norm": 0.27223941683769226, | |
| "learning_rate": 0.0002534653465346535, | |
| "loss": 1.2519, | |
| "step": 9940 | |
| }, | |
| { | |
| "epoch": 8.38180886789231, | |
| "grad_norm": 0.2487749308347702, | |
| "learning_rate": 0.0002524752475247525, | |
| "loss": 1.2506, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 8.390232796402755, | |
| "grad_norm": 0.2320510894060135, | |
| "learning_rate": 0.0002514851485148515, | |
| "loss": 1.2534, | |
| "step": 9960 | |
| }, | |
| { | |
| "epoch": 8.398656724913199, | |
| "grad_norm": 0.2474934607744217, | |
| "learning_rate": 0.0002504950495049505, | |
| "loss": 1.249, | |
| "step": 9970 | |
| }, | |
| { | |
| "epoch": 8.407080653423645, | |
| "grad_norm": 0.23778343200683594, | |
| "learning_rate": 0.00024950495049504953, | |
| "loss": 1.2503, | |
| "step": 9980 | |
| }, | |
| { | |
| "epoch": 8.415504581934089, | |
| "grad_norm": 0.2715946137905121, | |
| "learning_rate": 0.00024851485148514853, | |
| "loss": 1.2515, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 8.415504581934089, | |
| "eval_accuracy": 0.7412818791412316, | |
| "eval_loss": 1.137270450592041, | |
| "eval_runtime": 885.4223, | |
| "eval_samples_per_second": 563.998, | |
| "eval_steps_per_second": 5.222, | |
| "step": 9990 | |
| }, | |
| { | |
| "epoch": 8.423928510444533, | |
| "grad_norm": 0.26555290818214417, | |
| "learning_rate": 0.00024752475247524753, | |
| "loss": 1.2485, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 8.432352438954977, | |
| "grad_norm": 0.23698092997074127, | |
| "learning_rate": 0.0002465346534653465, | |
| "loss": 1.2498, | |
| "step": 10010 | |
| }, | |
| { | |
| "epoch": 8.440776367465421, | |
| "grad_norm": 0.23015616834163666, | |
| "learning_rate": 0.0002455445544554455, | |
| "loss": 1.2482, | |
| "step": 10020 | |
| }, | |
| { | |
| "epoch": 8.449200295975867, | |
| "grad_norm": 0.22911451756954193, | |
| "learning_rate": 0.0002445544554455446, | |
| "loss": 1.2503, | |
| "step": 10030 | |
| }, | |
| { | |
| "epoch": 8.457624224486311, | |
| "grad_norm": 0.24171452224254608, | |
| "learning_rate": 0.00024356435643564357, | |
| "loss": 1.2485, | |
| "step": 10040 | |
| }, | |
| { | |
| "epoch": 8.466048152996756, | |
| "grad_norm": 0.24717497825622559, | |
| "learning_rate": 0.00024257425742574257, | |
| "loss": 1.2503, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 8.4744720815072, | |
| "grad_norm": 0.23118732869625092, | |
| "learning_rate": 0.00024158415841584157, | |
| "loss": 1.2488, | |
| "step": 10060 | |
| }, | |
| { | |
| "epoch": 8.482896010017644, | |
| "grad_norm": 0.22151467204093933, | |
| "learning_rate": 0.0002405940594059406, | |
| "loss": 1.2484, | |
| "step": 10070 | |
| }, | |
| { | |
| "epoch": 8.49131993852809, | |
| "grad_norm": 0.2284466177225113, | |
| "learning_rate": 0.0002396039603960396, | |
| "loss": 1.2487, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 8.49131993852809, | |
| "eval_accuracy": 0.7414350855696202, | |
| "eval_loss": 1.134464144706726, | |
| "eval_runtime": 887.5421, | |
| "eval_samples_per_second": 562.65, | |
| "eval_steps_per_second": 5.21, | |
| "step": 10080 | |
| }, | |
| { | |
| "epoch": 8.499743867038534, | |
| "grad_norm": 0.2377534806728363, | |
| "learning_rate": 0.00023861386138613862, | |
| "loss": 1.2491, | |
| "step": 10090 | |
| }, | |
| { | |
| "epoch": 8.508167795548978, | |
| "grad_norm": 0.2649644613265991, | |
| "learning_rate": 0.00023762376237623762, | |
| "loss": 1.2467, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 8.516591724059422, | |
| "grad_norm": 0.22302138805389404, | |
| "learning_rate": 0.00023663366336633662, | |
| "loss": 1.2496, | |
| "step": 10110 | |
| }, | |
| { | |
| "epoch": 8.525015652569868, | |
| "grad_norm": 0.24170257151126862, | |
| "learning_rate": 0.00023564356435643564, | |
| "loss": 1.2471, | |
| "step": 10120 | |
| }, | |
| { | |
| "epoch": 8.533439581080312, | |
| "grad_norm": 0.2645774781703949, | |
| "learning_rate": 0.00023465346534653464, | |
| "loss": 1.2477, | |
| "step": 10130 | |
| }, | |
| { | |
| "epoch": 8.541863509590756, | |
| "grad_norm": 0.24155734479427338, | |
| "learning_rate": 0.0002336633663366337, | |
| "loss": 1.2466, | |
| "step": 10140 | |
| }, | |
| { | |
| "epoch": 8.5502874381012, | |
| "grad_norm": 0.23023132979869843, | |
| "learning_rate": 0.0002326732673267327, | |
| "loss": 1.2457, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 8.558711366611645, | |
| "grad_norm": 0.2243080586194992, | |
| "learning_rate": 0.0002316831683168317, | |
| "loss": 1.2476, | |
| "step": 10160 | |
| }, | |
| { | |
| "epoch": 8.56713529512209, | |
| "grad_norm": 0.278157114982605, | |
| "learning_rate": 0.00023069306930693071, | |
| "loss": 1.2462, | |
| "step": 10170 | |
| }, | |
| { | |
| "epoch": 8.56713529512209, | |
| "eval_accuracy": 0.7417397824056636, | |
| "eval_loss": 1.1336922645568848, | |
| "eval_runtime": 892.4907, | |
| "eval_samples_per_second": 559.531, | |
| "eval_steps_per_second": 5.181, | |
| "step": 10170 | |
| }, | |
| { | |
| "epoch": 8.575559223632535, | |
| "grad_norm": 0.24606026709079742, | |
| "learning_rate": 0.0002297029702970297, | |
| "loss": 1.2478, | |
| "step": 10180 | |
| }, | |
| { | |
| "epoch": 8.583983152142979, | |
| "grad_norm": 0.23494498431682587, | |
| "learning_rate": 0.00022871287128712874, | |
| "loss": 1.2463, | |
| "step": 10190 | |
| }, | |
| { | |
| "epoch": 8.592407080653423, | |
| "grad_norm": 0.21522320806980133, | |
| "learning_rate": 0.00022772277227722774, | |
| "loss": 1.2479, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 8.60083100916387, | |
| "grad_norm": 0.2655723989009857, | |
| "learning_rate": 0.00022673267326732673, | |
| "loss": 1.2468, | |
| "step": 10210 | |
| }, | |
| { | |
| "epoch": 8.609254937674313, | |
| "grad_norm": 0.2444898933172226, | |
| "learning_rate": 0.00022574257425742576, | |
| "loss": 1.246, | |
| "step": 10220 | |
| }, | |
| { | |
| "epoch": 8.617678866184757, | |
| "grad_norm": 0.2277156114578247, | |
| "learning_rate": 0.00022475247524752476, | |
| "loss": 1.2466, | |
| "step": 10230 | |
| }, | |
| { | |
| "epoch": 8.626102794695202, | |
| "grad_norm": 0.22111962735652924, | |
| "learning_rate": 0.00022376237623762378, | |
| "loss": 1.2451, | |
| "step": 10240 | |
| }, | |
| { | |
| "epoch": 8.634526723205646, | |
| "grad_norm": 0.23199447989463806, | |
| "learning_rate": 0.00022277227722772278, | |
| "loss": 1.2463, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 8.642950651716092, | |
| "grad_norm": 0.22960427403450012, | |
| "learning_rate": 0.00022178217821782178, | |
| "loss": 1.2465, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 8.642950651716092, | |
| "eval_accuracy": 0.7420823467349104, | |
| "eval_loss": 1.1322184801101685, | |
| "eval_runtime": 883.7567, | |
| "eval_samples_per_second": 565.061, | |
| "eval_steps_per_second": 5.232, | |
| "step": 10260 | |
| }, | |
| { | |
| "epoch": 8.651374580226536, | |
| "grad_norm": 0.290622353553772, | |
| "learning_rate": 0.0002207920792079208, | |
| "loss": 1.2444, | |
| "step": 10270 | |
| }, | |
| { | |
| "epoch": 8.65979850873698, | |
| "grad_norm": 0.2639337480068207, | |
| "learning_rate": 0.0002198019801980198, | |
| "loss": 1.247, | |
| "step": 10280 | |
| }, | |
| { | |
| "epoch": 8.668222437247424, | |
| "grad_norm": 0.22477252781391144, | |
| "learning_rate": 0.00021881188118811883, | |
| "loss": 1.2443, | |
| "step": 10290 | |
| }, | |
| { | |
| "epoch": 8.676646365757868, | |
| "grad_norm": 0.2989983558654785, | |
| "learning_rate": 0.00021782178217821783, | |
| "loss": 1.2461, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 8.685070294268314, | |
| "grad_norm": 0.22259776294231415, | |
| "learning_rate": 0.00021683168316831682, | |
| "loss": 1.2438, | |
| "step": 10310 | |
| }, | |
| { | |
| "epoch": 8.693494222778758, | |
| "grad_norm": 0.21380363404750824, | |
| "learning_rate": 0.00021584158415841585, | |
| "loss": 1.2414, | |
| "step": 10320 | |
| }, | |
| { | |
| "epoch": 8.701918151289203, | |
| "grad_norm": 0.23593538999557495, | |
| "learning_rate": 0.00021485148514851485, | |
| "loss": 1.2454, | |
| "step": 10330 | |
| }, | |
| { | |
| "epoch": 8.710342079799647, | |
| "grad_norm": 0.25987499952316284, | |
| "learning_rate": 0.00021386138613861387, | |
| "loss": 1.2444, | |
| "step": 10340 | |
| }, | |
| { | |
| "epoch": 8.71876600831009, | |
| "grad_norm": 0.21150009334087372, | |
| "learning_rate": 0.00021287128712871287, | |
| "loss": 1.2414, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 8.71876600831009, | |
| "eval_accuracy": 0.7421671573662553, | |
| "eval_loss": 1.1316900253295898, | |
| "eval_runtime": 893.0033, | |
| "eval_samples_per_second": 559.21, | |
| "eval_steps_per_second": 5.178, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 8.727189936820537, | |
| "grad_norm": 0.23628725111484528, | |
| "learning_rate": 0.00021188118811881187, | |
| "loss": 1.2432, | |
| "step": 10360 | |
| }, | |
| { | |
| "epoch": 8.735613865330981, | |
| "grad_norm": 0.24477533996105194, | |
| "learning_rate": 0.0002108910891089109, | |
| "loss": 1.2447, | |
| "step": 10370 | |
| }, | |
| { | |
| "epoch": 8.744037793841425, | |
| "grad_norm": 0.2156253159046173, | |
| "learning_rate": 0.0002099009900990099, | |
| "loss": 1.2452, | |
| "step": 10380 | |
| }, | |
| { | |
| "epoch": 8.75246172235187, | |
| "grad_norm": 0.27982792258262634, | |
| "learning_rate": 0.00020891089108910892, | |
| "loss": 1.2434, | |
| "step": 10390 | |
| }, | |
| { | |
| "epoch": 8.760885650862313, | |
| "grad_norm": 0.24025356769561768, | |
| "learning_rate": 0.00020792079207920792, | |
| "loss": 1.244, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 8.76930957937276, | |
| "grad_norm": 0.22768454253673553, | |
| "learning_rate": 0.00020693069306930691, | |
| "loss": 1.2427, | |
| "step": 10410 | |
| }, | |
| { | |
| "epoch": 8.777733507883204, | |
| "grad_norm": 0.2676762640476227, | |
| "learning_rate": 0.00020594059405940594, | |
| "loss": 1.244, | |
| "step": 10420 | |
| }, | |
| { | |
| "epoch": 8.786157436393648, | |
| "grad_norm": 0.23502378165721893, | |
| "learning_rate": 0.00020495049504950494, | |
| "loss": 1.244, | |
| "step": 10430 | |
| }, | |
| { | |
| "epoch": 8.794581364904092, | |
| "grad_norm": 0.23354895412921906, | |
| "learning_rate": 0.00020396039603960396, | |
| "loss": 1.2435, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 8.794581364904092, | |
| "eval_accuracy": 0.7425177306861277, | |
| "eval_loss": 1.1301963329315186, | |
| "eval_runtime": 885.137, | |
| "eval_samples_per_second": 564.179, | |
| "eval_steps_per_second": 5.224, | |
| "step": 10440 | |
| }, | |
| { | |
| "epoch": 8.803005293414538, | |
| "grad_norm": 0.22738757729530334, | |
| "learning_rate": 0.000202970297029703, | |
| "loss": 1.2426, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 8.811429221924982, | |
| "grad_norm": 0.20702116191387177, | |
| "learning_rate": 0.00020198019801980199, | |
| "loss": 1.243, | |
| "step": 10460 | |
| }, | |
| { | |
| "epoch": 8.819853150435426, | |
| "grad_norm": 0.20945468544960022, | |
| "learning_rate": 0.000200990099009901, | |
| "loss": 1.2411, | |
| "step": 10470 | |
| }, | |
| { | |
| "epoch": 8.82827707894587, | |
| "grad_norm": 0.21654458343982697, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2428, | |
| "step": 10480 | |
| }, | |
| { | |
| "epoch": 8.836701007456314, | |
| "grad_norm": 0.2217228263616562, | |
| "learning_rate": 0.00019900990099009903, | |
| "loss": 1.2405, | |
| "step": 10490 | |
| }, | |
| { | |
| "epoch": 8.84512493596676, | |
| "grad_norm": 0.27619633078575134, | |
| "learning_rate": 0.00019801980198019803, | |
| "loss": 1.2424, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 8.853548864477204, | |
| "grad_norm": 0.2569934129714966, | |
| "learning_rate": 0.00019702970297029703, | |
| "loss": 1.2418, | |
| "step": 10510 | |
| }, | |
| { | |
| "epoch": 8.861972792987649, | |
| "grad_norm": 0.2570299804210663, | |
| "learning_rate": 0.00019603960396039606, | |
| "loss": 1.2423, | |
| "step": 10520 | |
| }, | |
| { | |
| "epoch": 8.870396721498093, | |
| "grad_norm": 0.22972337901592255, | |
| "learning_rate": 0.00019504950495049505, | |
| "loss": 1.2399, | |
| "step": 10530 | |
| }, | |
| { | |
| "epoch": 8.870396721498093, | |
| "eval_accuracy": 0.7427001211705735, | |
| "eval_loss": 1.1304486989974976, | |
| "eval_runtime": 881.4454, | |
| "eval_samples_per_second": 566.542, | |
| "eval_steps_per_second": 5.246, | |
| "step": 10530 | |
| }, | |
| { | |
| "epoch": 8.878820650008539, | |
| "grad_norm": 0.2365693300962448, | |
| "learning_rate": 0.00019405940594059408, | |
| "loss": 1.2426, | |
| "step": 10540 | |
| }, | |
| { | |
| "epoch": 8.887244578518983, | |
| "grad_norm": 0.2252751588821411, | |
| "learning_rate": 0.00019306930693069308, | |
| "loss": 1.2406, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 8.895668507029427, | |
| "grad_norm": 0.2205033302307129, | |
| "learning_rate": 0.00019207920792079208, | |
| "loss": 1.2419, | |
| "step": 10560 | |
| }, | |
| { | |
| "epoch": 8.904092435539871, | |
| "grad_norm": 0.21468041837215424, | |
| "learning_rate": 0.0001910891089108911, | |
| "loss": 1.2406, | |
| "step": 10570 | |
| }, | |
| { | |
| "epoch": 8.912516364050315, | |
| "grad_norm": 0.23669223487377167, | |
| "learning_rate": 0.0001900990099009901, | |
| "loss": 1.2401, | |
| "step": 10580 | |
| }, | |
| { | |
| "epoch": 8.920940292560761, | |
| "grad_norm": 0.2412618100643158, | |
| "learning_rate": 0.00018910891089108913, | |
| "loss": 1.2402, | |
| "step": 10590 | |
| }, | |
| { | |
| "epoch": 8.929364221071205, | |
| "grad_norm": 0.21675223112106323, | |
| "learning_rate": 0.00018811881188118812, | |
| "loss": 1.2417, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 8.93778814958165, | |
| "grad_norm": 0.24683676660060883, | |
| "learning_rate": 0.00018712871287128712, | |
| "loss": 1.2417, | |
| "step": 10610 | |
| }, | |
| { | |
| "epoch": 8.946212078092094, | |
| "grad_norm": 0.21681492030620575, | |
| "learning_rate": 0.00018613861386138615, | |
| "loss": 1.2408, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 8.946212078092094, | |
| "eval_accuracy": 0.7428579001690714, | |
| "eval_loss": 1.1290760040283203, | |
| "eval_runtime": 889.1418, | |
| "eval_samples_per_second": 561.638, | |
| "eval_steps_per_second": 5.201, | |
| "step": 10620 | |
| }, | |
| { | |
| "epoch": 8.954636006602538, | |
| "grad_norm": 0.22117485105991364, | |
| "learning_rate": 0.00018514851485148514, | |
| "loss": 1.2399, | |
| "step": 10630 | |
| }, | |
| { | |
| "epoch": 8.963059935112984, | |
| "grad_norm": 0.2180255800485611, | |
| "learning_rate": 0.00018415841584158417, | |
| "loss": 1.2378, | |
| "step": 10640 | |
| }, | |
| { | |
| "epoch": 8.971483863623428, | |
| "grad_norm": 0.23244567215442657, | |
| "learning_rate": 0.00018316831683168317, | |
| "loss": 1.2402, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 8.979907792133872, | |
| "grad_norm": 0.23777294158935547, | |
| "learning_rate": 0.00018217821782178217, | |
| "loss": 1.2417, | |
| "step": 10660 | |
| }, | |
| { | |
| "epoch": 8.988331720644316, | |
| "grad_norm": 0.26418906450271606, | |
| "learning_rate": 0.0001811881188118812, | |
| "loss": 1.238, | |
| "step": 10670 | |
| }, | |
| { | |
| "epoch": 8.99675564915476, | |
| "grad_norm": 0.21142803132534027, | |
| "learning_rate": 0.0001801980198019802, | |
| "loss": 1.2384, | |
| "step": 10680 | |
| }, | |
| { | |
| "epoch": 9.005179577665206, | |
| "grad_norm": 0.21976542472839355, | |
| "learning_rate": 0.00017920792079207922, | |
| "loss": 1.2399, | |
| "step": 10690 | |
| }, | |
| { | |
| "epoch": 9.01360350617565, | |
| "grad_norm": 0.2216147631406784, | |
| "learning_rate": 0.0001782178217821782, | |
| "loss": 1.2391, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 9.022027434686095, | |
| "grad_norm": 0.1873018890619278, | |
| "learning_rate": 0.0001772277227722772, | |
| "loss": 1.2368, | |
| "step": 10710 | |
| }, | |
| { | |
| "epoch": 9.022027434686095, | |
| "eval_accuracy": 0.7431224622062498, | |
| "eval_loss": 1.1265127658843994, | |
| "eval_runtime": 891.5668, | |
| "eval_samples_per_second": 560.111, | |
| "eval_steps_per_second": 5.186, | |
| "step": 10710 | |
| }, | |
| { | |
| "epoch": 9.030451363196539, | |
| "grad_norm": 0.23913191258907318, | |
| "learning_rate": 0.00017623762376237624, | |
| "loss": 1.2404, | |
| "step": 10720 | |
| }, | |
| { | |
| "epoch": 9.038875291706983, | |
| "grad_norm": 0.21578449010849, | |
| "learning_rate": 0.00017524752475247524, | |
| "loss": 1.2388, | |
| "step": 10730 | |
| }, | |
| { | |
| "epoch": 9.047299220217429, | |
| "grad_norm": 0.2038455754518509, | |
| "learning_rate": 0.00017425742574257426, | |
| "loss": 1.2402, | |
| "step": 10740 | |
| }, | |
| { | |
| "epoch": 9.055723148727873, | |
| "grad_norm": 0.21903488039970398, | |
| "learning_rate": 0.00017326732673267326, | |
| "loss": 1.2383, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 9.064147077238317, | |
| "grad_norm": 0.21970726549625397, | |
| "learning_rate": 0.00017227722772277226, | |
| "loss": 1.2386, | |
| "step": 10760 | |
| }, | |
| { | |
| "epoch": 9.072571005748761, | |
| "grad_norm": 0.22701360285282135, | |
| "learning_rate": 0.0001712871287128713, | |
| "loss": 1.2391, | |
| "step": 10770 | |
| }, | |
| { | |
| "epoch": 9.080994934259207, | |
| "grad_norm": 0.21777622401714325, | |
| "learning_rate": 0.0001702970297029703, | |
| "loss": 1.2388, | |
| "step": 10780 | |
| }, | |
| { | |
| "epoch": 9.089418862769651, | |
| "grad_norm": 0.2336941659450531, | |
| "learning_rate": 0.00016930693069306933, | |
| "loss": 1.2383, | |
| "step": 10790 | |
| }, | |
| { | |
| "epoch": 9.097842791280096, | |
| "grad_norm": 0.20545706152915955, | |
| "learning_rate": 0.00016831683168316833, | |
| "loss": 1.2376, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 9.097842791280096, | |
| "eval_accuracy": 0.7435866345331611, | |
| "eval_loss": 1.1250243186950684, | |
| "eval_runtime": 885.3582, | |
| "eval_samples_per_second": 564.038, | |
| "eval_steps_per_second": 5.223, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 9.10626671979054, | |
| "grad_norm": 0.23678459227085114, | |
| "learning_rate": 0.00016732673267326733, | |
| "loss": 1.2394, | |
| "step": 10810 | |
| }, | |
| { | |
| "epoch": 9.114690648300984, | |
| "grad_norm": 0.24195948243141174, | |
| "learning_rate": 0.00016633663366336635, | |
| "loss": 1.238, | |
| "step": 10820 | |
| }, | |
| { | |
| "epoch": 9.12311457681143, | |
| "grad_norm": 0.20026259124279022, | |
| "learning_rate": 0.00016534653465346535, | |
| "loss": 1.2364, | |
| "step": 10830 | |
| }, | |
| { | |
| "epoch": 9.131538505321874, | |
| "grad_norm": 0.21753010153770447, | |
| "learning_rate": 0.00016435643564356438, | |
| "loss": 1.238, | |
| "step": 10840 | |
| }, | |
| { | |
| "epoch": 9.139962433832318, | |
| "grad_norm": 0.20273657143115997, | |
| "learning_rate": 0.00016336633663366338, | |
| "loss": 1.2374, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 9.148386362342762, | |
| "grad_norm": 0.21302086114883423, | |
| "learning_rate": 0.00016237623762376237, | |
| "loss": 1.2372, | |
| "step": 10860 | |
| }, | |
| { | |
| "epoch": 9.156810290853207, | |
| "grad_norm": 0.23342467844486237, | |
| "learning_rate": 0.0001613861386138614, | |
| "loss": 1.2378, | |
| "step": 10870 | |
| }, | |
| { | |
| "epoch": 9.165234219363652, | |
| "grad_norm": 0.24393875896930695, | |
| "learning_rate": 0.0001603960396039604, | |
| "loss": 1.2362, | |
| "step": 10880 | |
| }, | |
| { | |
| "epoch": 9.173658147874097, | |
| "grad_norm": 0.19604717195034027, | |
| "learning_rate": 0.00015940594059405942, | |
| "loss": 1.237, | |
| "step": 10890 | |
| }, | |
| { | |
| "epoch": 9.173658147874097, | |
| "eval_accuracy": 0.743667723412049, | |
| "eval_loss": 1.124830722808838, | |
| "eval_runtime": 887.4222, | |
| "eval_samples_per_second": 562.727, | |
| "eval_steps_per_second": 5.211, | |
| "step": 10890 | |
| }, | |
| { | |
| "epoch": 9.18208207638454, | |
| "grad_norm": 0.19619697332382202, | |
| "learning_rate": 0.00015841584158415842, | |
| "loss": 1.2356, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 9.190506004894985, | |
| "grad_norm": 0.20415499806404114, | |
| "learning_rate": 0.00015742574257425742, | |
| "loss": 1.2373, | |
| "step": 10910 | |
| }, | |
| { | |
| "epoch": 9.19892993340543, | |
| "grad_norm": 0.21602529287338257, | |
| "learning_rate": 0.00015643564356435644, | |
| "loss": 1.2369, | |
| "step": 10920 | |
| }, | |
| { | |
| "epoch": 9.207353861915875, | |
| "grad_norm": 0.2266259491443634, | |
| "learning_rate": 0.00015544554455445544, | |
| "loss": 1.236, | |
| "step": 10930 | |
| }, | |
| { | |
| "epoch": 9.21577779042632, | |
| "grad_norm": 0.2172340452671051, | |
| "learning_rate": 0.00015445544554455447, | |
| "loss": 1.236, | |
| "step": 10940 | |
| }, | |
| { | |
| "epoch": 9.224201718936763, | |
| "grad_norm": 0.21929994225502014, | |
| "learning_rate": 0.00015346534653465347, | |
| "loss": 1.2381, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 9.232625647447207, | |
| "grad_norm": 0.20617130398750305, | |
| "learning_rate": 0.00015247524752475246, | |
| "loss": 1.2346, | |
| "step": 10960 | |
| }, | |
| { | |
| "epoch": 9.241049575957653, | |
| "grad_norm": 0.2271021008491516, | |
| "learning_rate": 0.0001514851485148515, | |
| "loss": 1.2364, | |
| "step": 10970 | |
| }, | |
| { | |
| "epoch": 9.249473504468098, | |
| "grad_norm": 0.22377552092075348, | |
| "learning_rate": 0.0001504950495049505, | |
| "loss": 1.2342, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 9.249473504468098, | |
| "eval_accuracy": 0.7438243969178056, | |
| "eval_loss": 1.124144434928894, | |
| "eval_runtime": 880.0851, | |
| "eval_samples_per_second": 567.418, | |
| "eval_steps_per_second": 5.254, | |
| "step": 10980 | |
| }, | |
| { | |
| "epoch": 9.257897432978542, | |
| "grad_norm": 0.23195216059684753, | |
| "learning_rate": 0.0001495049504950495, | |
| "loss": 1.2347, | |
| "step": 10990 | |
| }, | |
| { | |
| "epoch": 9.266321361488986, | |
| "grad_norm": 0.19934554398059845, | |
| "learning_rate": 0.0001485148514851485, | |
| "loss": 1.2359, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 9.27474528999943, | |
| "grad_norm": 0.19541287422180176, | |
| "learning_rate": 0.0001475247524752475, | |
| "loss": 1.2342, | |
| "step": 11010 | |
| }, | |
| { | |
| "epoch": 9.283169218509876, | |
| "grad_norm": 0.2204955518245697, | |
| "learning_rate": 0.00014653465346534653, | |
| "loss": 1.2356, | |
| "step": 11020 | |
| }, | |
| { | |
| "epoch": 9.29159314702032, | |
| "grad_norm": 0.22855669260025024, | |
| "learning_rate": 0.00014554455445544553, | |
| "loss": 1.2367, | |
| "step": 11030 | |
| }, | |
| { | |
| "epoch": 9.300017075530764, | |
| "grad_norm": 0.20308193564414978, | |
| "learning_rate": 0.00014455445544554456, | |
| "loss": 1.235, | |
| "step": 11040 | |
| }, | |
| { | |
| "epoch": 9.308441004041208, | |
| "grad_norm": 0.18201188743114471, | |
| "learning_rate": 0.00014356435643564356, | |
| "loss": 1.235, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 9.316864932551653, | |
| "grad_norm": 0.199186772108078, | |
| "learning_rate": 0.00014257425742574255, | |
| "loss": 1.2348, | |
| "step": 11060 | |
| }, | |
| { | |
| "epoch": 9.325288861062099, | |
| "grad_norm": 0.23214493691921234, | |
| "learning_rate": 0.00014158415841584158, | |
| "loss": 1.2335, | |
| "step": 11070 | |
| }, | |
| { | |
| "epoch": 9.325288861062099, | |
| "eval_accuracy": 0.7438911749364814, | |
| "eval_loss": 1.123384714126587, | |
| "eval_runtime": 888.3176, | |
| "eval_samples_per_second": 562.159, | |
| "eval_steps_per_second": 5.205, | |
| "step": 11070 | |
| }, | |
| { | |
| "epoch": 9.333712789572543, | |
| "grad_norm": 0.2128278762102127, | |
| "learning_rate": 0.0001405940594059406, | |
| "loss": 1.2337, | |
| "step": 11080 | |
| }, | |
| { | |
| "epoch": 9.342136718082987, | |
| "grad_norm": 0.20257510244846344, | |
| "learning_rate": 0.00013960396039603963, | |
| "loss": 1.2357, | |
| "step": 11090 | |
| }, | |
| { | |
| "epoch": 9.350560646593431, | |
| "grad_norm": 0.22038786113262177, | |
| "learning_rate": 0.00013861386138613863, | |
| "loss": 1.2333, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 9.358984575103877, | |
| "grad_norm": 0.2351042628288269, | |
| "learning_rate": 0.00013762376237623763, | |
| "loss": 1.235, | |
| "step": 11110 | |
| }, | |
| { | |
| "epoch": 9.367408503614321, | |
| "grad_norm": 0.2042153775691986, | |
| "learning_rate": 0.00013663366336633665, | |
| "loss": 1.2339, | |
| "step": 11120 | |
| }, | |
| { | |
| "epoch": 9.375832432124765, | |
| "grad_norm": 0.20065917074680328, | |
| "learning_rate": 0.00013564356435643565, | |
| "loss": 1.234, | |
| "step": 11130 | |
| }, | |
| { | |
| "epoch": 9.38425636063521, | |
| "grad_norm": 0.22544540464878082, | |
| "learning_rate": 0.00013465346534653468, | |
| "loss": 1.2319, | |
| "step": 11140 | |
| }, | |
| { | |
| "epoch": 9.392680289145654, | |
| "grad_norm": 0.2352074533700943, | |
| "learning_rate": 0.00013366336633663367, | |
| "loss": 1.2347, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 9.4011042176561, | |
| "grad_norm": 0.2452593892812729, | |
| "learning_rate": 0.00013267326732673267, | |
| "loss": 1.2343, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 9.4011042176561, | |
| "eval_accuracy": 0.7445740208736444, | |
| "eval_loss": 1.1202077865600586, | |
| "eval_runtime": 879.3984, | |
| "eval_samples_per_second": 567.861, | |
| "eval_steps_per_second": 5.258, | |
| "step": 11160 | |
| }, | |
| { | |
| "epoch": 9.409528146166544, | |
| "grad_norm": 0.20848217606544495, | |
| "learning_rate": 0.0001316831683168317, | |
| "loss": 1.2315, | |
| "step": 11170 | |
| }, | |
| { | |
| "epoch": 9.417952074676988, | |
| "grad_norm": 0.20628029108047485, | |
| "learning_rate": 0.0001306930693069307, | |
| "loss": 1.2326, | |
| "step": 11180 | |
| }, | |
| { | |
| "epoch": 9.426376003187432, | |
| "grad_norm": 0.199026957154274, | |
| "learning_rate": 0.00012970297029702972, | |
| "loss": 1.2329, | |
| "step": 11190 | |
| }, | |
| { | |
| "epoch": 9.434799931697876, | |
| "grad_norm": 0.21373671293258667, | |
| "learning_rate": 0.00012871287128712872, | |
| "loss": 1.2326, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 9.443223860208322, | |
| "grad_norm": 0.2015460729598999, | |
| "learning_rate": 0.00012772277227722772, | |
| "loss": 1.2327, | |
| "step": 11210 | |
| }, | |
| { | |
| "epoch": 9.451647788718766, | |
| "grad_norm": 0.2228008210659027, | |
| "learning_rate": 0.00012673267326732674, | |
| "loss": 1.2334, | |
| "step": 11220 | |
| }, | |
| { | |
| "epoch": 9.46007171722921, | |
| "grad_norm": 0.21561528742313385, | |
| "learning_rate": 0.00012574257425742574, | |
| "loss": 1.233, | |
| "step": 11230 | |
| }, | |
| { | |
| "epoch": 9.468495645739655, | |
| "grad_norm": 0.2073032706975937, | |
| "learning_rate": 0.00012475247524752477, | |
| "loss": 1.2314, | |
| "step": 11240 | |
| }, | |
| { | |
| "epoch": 9.4769195742501, | |
| "grad_norm": 0.19552037119865417, | |
| "learning_rate": 0.00012376237623762376, | |
| "loss": 1.2333, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 9.4769195742501, | |
| "eval_accuracy": 0.744401638855597, | |
| "eval_loss": 1.1210565567016602, | |
| "eval_runtime": 888.2535, | |
| "eval_samples_per_second": 562.2, | |
| "eval_steps_per_second": 5.206, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 9.485343502760545, | |
| "grad_norm": 0.20909276604652405, | |
| "learning_rate": 0.00012277227722772276, | |
| "loss": 1.2332, | |
| "step": 11260 | |
| }, | |
| { | |
| "epoch": 9.493767431270989, | |
| "grad_norm": 0.210150346159935, | |
| "learning_rate": 0.00012178217821782179, | |
| "loss": 1.2308, | |
| "step": 11270 | |
| }, | |
| { | |
| "epoch": 9.502191359781433, | |
| "grad_norm": 0.1982164978981018, | |
| "learning_rate": 0.00012079207920792079, | |
| "loss": 1.2305, | |
| "step": 11280 | |
| }, | |
| { | |
| "epoch": 9.510615288291877, | |
| "grad_norm": 0.2049965262413025, | |
| "learning_rate": 0.0001198019801980198, | |
| "loss": 1.2334, | |
| "step": 11290 | |
| }, | |
| { | |
| "epoch": 9.519039216802323, | |
| "grad_norm": 0.18243108689785004, | |
| "learning_rate": 0.00011881188118811881, | |
| "loss": 1.2335, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 9.527463145312767, | |
| "grad_norm": 0.2009328156709671, | |
| "learning_rate": 0.00011782178217821782, | |
| "loss": 1.2313, | |
| "step": 11310 | |
| }, | |
| { | |
| "epoch": 9.535887073823211, | |
| "grad_norm": 0.19226033985614777, | |
| "learning_rate": 0.00011683168316831685, | |
| "loss": 1.2332, | |
| "step": 11320 | |
| }, | |
| { | |
| "epoch": 9.544311002333655, | |
| "grad_norm": 0.20206843316555023, | |
| "learning_rate": 0.00011584158415841584, | |
| "loss": 1.2333, | |
| "step": 11330 | |
| }, | |
| { | |
| "epoch": 9.5527349308441, | |
| "grad_norm": 0.20852382481098175, | |
| "learning_rate": 0.00011485148514851486, | |
| "loss": 1.2322, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 9.5527349308441, | |
| "eval_accuracy": 0.7448142064493213, | |
| "eval_loss": 1.1182734966278076, | |
| "eval_runtime": 889.106, | |
| "eval_samples_per_second": 561.661, | |
| "eval_steps_per_second": 5.201, | |
| "step": 11340 | |
| }, | |
| { | |
| "epoch": 9.561158859354546, | |
| "grad_norm": 0.19330884516239166, | |
| "learning_rate": 0.00011386138613861387, | |
| "loss": 1.2294, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 9.56958278786499, | |
| "grad_norm": 0.17878125607967377, | |
| "learning_rate": 0.00011287128712871288, | |
| "loss": 1.2301, | |
| "step": 11360 | |
| }, | |
| { | |
| "epoch": 9.578006716375434, | |
| "grad_norm": 0.20679515600204468, | |
| "learning_rate": 0.00011188118811881189, | |
| "loss": 1.2302, | |
| "step": 11370 | |
| }, | |
| { | |
| "epoch": 9.586430644885878, | |
| "grad_norm": 0.20949432253837585, | |
| "learning_rate": 0.00011089108910891089, | |
| "loss": 1.2308, | |
| "step": 11380 | |
| }, | |
| { | |
| "epoch": 9.594854573396322, | |
| "grad_norm": 0.21771377325057983, | |
| "learning_rate": 0.0001099009900990099, | |
| "loss": 1.2313, | |
| "step": 11390 | |
| }, | |
| { | |
| "epoch": 9.603278501906768, | |
| "grad_norm": 0.1953546106815338, | |
| "learning_rate": 0.00010891089108910891, | |
| "loss": 1.2305, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 9.611702430417212, | |
| "grad_norm": 0.20105966925621033, | |
| "learning_rate": 0.00010792079207920792, | |
| "loss": 1.2294, | |
| "step": 11410 | |
| }, | |
| { | |
| "epoch": 9.620126358927656, | |
| "grad_norm": 0.20625823736190796, | |
| "learning_rate": 0.00010693069306930694, | |
| "loss": 1.2287, | |
| "step": 11420 | |
| }, | |
| { | |
| "epoch": 9.6285502874381, | |
| "grad_norm": 0.2024402767419815, | |
| "learning_rate": 0.00010594059405940593, | |
| "loss": 1.2309, | |
| "step": 11430 | |
| }, | |
| { | |
| "epoch": 9.6285502874381, | |
| "eval_accuracy": 0.7450274546722492, | |
| "eval_loss": 1.1177880764007568, | |
| "eval_runtime": 889.3816, | |
| "eval_samples_per_second": 561.487, | |
| "eval_steps_per_second": 5.199, | |
| "step": 11430 | |
| }, | |
| { | |
| "epoch": 9.636974215948547, | |
| "grad_norm": 0.20498992502689362, | |
| "learning_rate": 0.00010495049504950495, | |
| "loss": 1.228, | |
| "step": 11440 | |
| }, | |
| { | |
| "epoch": 9.64539814445899, | |
| "grad_norm": 0.18760576844215393, | |
| "learning_rate": 0.00010396039603960396, | |
| "loss": 1.2287, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 9.653822072969435, | |
| "grad_norm": 0.2059292048215866, | |
| "learning_rate": 0.00010297029702970297, | |
| "loss": 1.2284, | |
| "step": 11460 | |
| }, | |
| { | |
| "epoch": 9.662246001479879, | |
| "grad_norm": 0.20898665487766266, | |
| "learning_rate": 0.00010198019801980198, | |
| "loss": 1.231, | |
| "step": 11470 | |
| }, | |
| { | |
| "epoch": 9.670669929990323, | |
| "grad_norm": 0.20303255319595337, | |
| "learning_rate": 0.00010099009900990099, | |
| "loss": 1.2302, | |
| "step": 11480 | |
| }, | |
| { | |
| "epoch": 9.679093858500769, | |
| "grad_norm": 0.20947200059890747, | |
| "learning_rate": 0.0001, | |
| "loss": 1.2314, | |
| "step": 11490 | |
| }, | |
| { | |
| "epoch": 9.687517787011213, | |
| "grad_norm": 0.20898771286010742, | |
| "learning_rate": 9.900990099009902e-05, | |
| "loss": 1.2294, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 9.695941715521657, | |
| "grad_norm": 0.18466849625110626, | |
| "learning_rate": 9.801980198019803e-05, | |
| "loss": 1.2309, | |
| "step": 11510 | |
| }, | |
| { | |
| "epoch": 9.704365644032102, | |
| "grad_norm": 0.1769760698080063, | |
| "learning_rate": 9.702970297029704e-05, | |
| "loss": 1.2282, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 9.704365644032102, | |
| "eval_accuracy": 0.7449189101862153, | |
| "eval_loss": 1.118354082107544, | |
| "eval_runtime": 879.3937, | |
| "eval_samples_per_second": 567.864, | |
| "eval_steps_per_second": 5.258, | |
| "step": 11520 | |
| }, | |
| { | |
| "epoch": 9.712789572542546, | |
| "grad_norm": 0.18270480632781982, | |
| "learning_rate": 9.603960396039604e-05, | |
| "loss": 1.2286, | |
| "step": 11530 | |
| }, | |
| { | |
| "epoch": 9.721213501052992, | |
| "grad_norm": 0.1812662035226822, | |
| "learning_rate": 9.504950495049505e-05, | |
| "loss": 1.2279, | |
| "step": 11540 | |
| }, | |
| { | |
| "epoch": 9.729637429563436, | |
| "grad_norm": 0.20632152259349823, | |
| "learning_rate": 9.405940594059406e-05, | |
| "loss": 1.2295, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 9.73806135807388, | |
| "grad_norm": 0.19512777030467987, | |
| "learning_rate": 9.306930693069307e-05, | |
| "loss": 1.2292, | |
| "step": 11560 | |
| }, | |
| { | |
| "epoch": 9.746485286584324, | |
| "grad_norm": 0.19665522873401642, | |
| "learning_rate": 9.207920792079209e-05, | |
| "loss": 1.2294, | |
| "step": 11570 | |
| }, | |
| { | |
| "epoch": 9.75490921509477, | |
| "grad_norm": 0.18540680408477783, | |
| "learning_rate": 9.108910891089108e-05, | |
| "loss": 1.2297, | |
| "step": 11580 | |
| }, | |
| { | |
| "epoch": 9.763333143605214, | |
| "grad_norm": 0.21472424268722534, | |
| "learning_rate": 9.00990099009901e-05, | |
| "loss": 1.2277, | |
| "step": 11590 | |
| }, | |
| { | |
| "epoch": 9.771757072115658, | |
| "grad_norm": 0.2189822793006897, | |
| "learning_rate": 8.91089108910891e-05, | |
| "loss": 1.2293, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 9.780181000626103, | |
| "grad_norm": 0.19983939826488495, | |
| "learning_rate": 8.811881188118812e-05, | |
| "loss": 1.2287, | |
| "step": 11610 | |
| }, | |
| { | |
| "epoch": 9.780181000626103, | |
| "eval_accuracy": 0.7452771934107217, | |
| "eval_loss": 1.1166530847549438, | |
| "eval_runtime": 886.9822, | |
| "eval_samples_per_second": 563.006, | |
| "eval_steps_per_second": 5.213, | |
| "step": 11610 | |
| }, | |
| { | |
| "epoch": 9.788604929136547, | |
| "grad_norm": 0.1868014931678772, | |
| "learning_rate": 8.712871287128713e-05, | |
| "loss": 1.2296, | |
| "step": 11620 | |
| }, | |
| { | |
| "epoch": 9.797028857646993, | |
| "grad_norm": 0.2048911601305008, | |
| "learning_rate": 8.613861386138613e-05, | |
| "loss": 1.2291, | |
| "step": 11630 | |
| }, | |
| { | |
| "epoch": 9.805452786157437, | |
| "grad_norm": 0.2088802009820938, | |
| "learning_rate": 8.514851485148515e-05, | |
| "loss": 1.2271, | |
| "step": 11640 | |
| }, | |
| { | |
| "epoch": 9.813876714667881, | |
| "grad_norm": 0.20058122277259827, | |
| "learning_rate": 8.415841584158417e-05, | |
| "loss": 1.2296, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 9.822300643178325, | |
| "grad_norm": 0.1964656561613083, | |
| "learning_rate": 8.316831683168318e-05, | |
| "loss": 1.2272, | |
| "step": 11660 | |
| }, | |
| { | |
| "epoch": 9.83072457168877, | |
| "grad_norm": 0.20214231312274933, | |
| "learning_rate": 8.217821782178219e-05, | |
| "loss": 1.2271, | |
| "step": 11670 | |
| }, | |
| { | |
| "epoch": 9.839148500199215, | |
| "grad_norm": 0.19427910447120667, | |
| "learning_rate": 8.118811881188119e-05, | |
| "loss": 1.2264, | |
| "step": 11680 | |
| }, | |
| { | |
| "epoch": 9.84757242870966, | |
| "grad_norm": 0.18842646479606628, | |
| "learning_rate": 8.01980198019802e-05, | |
| "loss": 1.2265, | |
| "step": 11690 | |
| }, | |
| { | |
| "epoch": 9.855996357220103, | |
| "grad_norm": 0.18588952720165253, | |
| "learning_rate": 7.920792079207921e-05, | |
| "loss": 1.2279, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 9.855996357220103, | |
| "eval_accuracy": 0.7454476541387279, | |
| "eval_loss": 1.1153885126113892, | |
| "eval_runtime": 879.2745, | |
| "eval_samples_per_second": 567.941, | |
| "eval_steps_per_second": 5.259, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 9.864420285730548, | |
| "grad_norm": 0.18300525844097137, | |
| "learning_rate": 7.821782178217822e-05, | |
| "loss": 1.2268, | |
| "step": 11710 | |
| }, | |
| { | |
| "epoch": 9.872844214240992, | |
| "grad_norm": 0.18436813354492188, | |
| "learning_rate": 7.722772277227723e-05, | |
| "loss": 1.2256, | |
| "step": 11720 | |
| }, | |
| { | |
| "epoch": 9.881268142751438, | |
| "grad_norm": 0.19767363369464874, | |
| "learning_rate": 7.623762376237623e-05, | |
| "loss": 1.2246, | |
| "step": 11730 | |
| }, | |
| { | |
| "epoch": 9.889692071261882, | |
| "grad_norm": 0.1749766319990158, | |
| "learning_rate": 7.524752475247524e-05, | |
| "loss": 1.2277, | |
| "step": 11740 | |
| }, | |
| { | |
| "epoch": 9.898115999772326, | |
| "grad_norm": 0.17161355912685394, | |
| "learning_rate": 7.425742574257426e-05, | |
| "loss": 1.2262, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 9.90653992828277, | |
| "grad_norm": 0.190937340259552, | |
| "learning_rate": 7.326732673267327e-05, | |
| "loss": 1.2276, | |
| "step": 11760 | |
| }, | |
| { | |
| "epoch": 9.914963856793216, | |
| "grad_norm": 0.18256962299346924, | |
| "learning_rate": 7.227722772277228e-05, | |
| "loss": 1.2274, | |
| "step": 11770 | |
| }, | |
| { | |
| "epoch": 9.92338778530366, | |
| "grad_norm": 0.1912631094455719, | |
| "learning_rate": 7.128712871287128e-05, | |
| "loss": 1.2243, | |
| "step": 11780 | |
| }, | |
| { | |
| "epoch": 9.931811713814104, | |
| "grad_norm": 0.19331537187099457, | |
| "learning_rate": 7.02970297029703e-05, | |
| "loss": 1.2261, | |
| "step": 11790 | |
| }, | |
| { | |
| "epoch": 9.931811713814104, | |
| "eval_accuracy": 0.7455543705350357, | |
| "eval_loss": 1.115136981010437, | |
| "eval_runtime": 887.3277, | |
| "eval_samples_per_second": 562.786, | |
| "eval_steps_per_second": 5.211, | |
| "step": 11790 | |
| }, | |
| { | |
| "epoch": 9.940235642324549, | |
| "grad_norm": 0.17607170343399048, | |
| "learning_rate": 6.930693069306931e-05, | |
| "loss": 1.228, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 9.948659570834993, | |
| "grad_norm": 0.17280788719654083, | |
| "learning_rate": 6.831683168316833e-05, | |
| "loss": 1.2269, | |
| "step": 11810 | |
| }, | |
| { | |
| "epoch": 9.957083499345439, | |
| "grad_norm": 0.19290916621685028, | |
| "learning_rate": 6.732673267326734e-05, | |
| "loss": 1.2279, | |
| "step": 11820 | |
| }, | |
| { | |
| "epoch": 9.965507427855883, | |
| "grad_norm": 0.19125664234161377, | |
| "learning_rate": 6.633663366336634e-05, | |
| "loss": 1.227, | |
| "step": 11830 | |
| }, | |
| { | |
| "epoch": 9.973931356366327, | |
| "grad_norm": 0.18251217901706696, | |
| "learning_rate": 6.534653465346535e-05, | |
| "loss": 1.2254, | |
| "step": 11840 | |
| }, | |
| { | |
| "epoch": 9.982355284876771, | |
| "grad_norm": 0.19647039473056793, | |
| "learning_rate": 6.435643564356436e-05, | |
| "loss": 1.2261, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 9.990779213387215, | |
| "grad_norm": 0.17714038491249084, | |
| "learning_rate": 6.336633663366337e-05, | |
| "loss": 1.2276, | |
| "step": 11860 | |
| }, | |
| { | |
| "epoch": 9.999203141897661, | |
| "grad_norm": 0.18365037441253662, | |
| "learning_rate": 6.237623762376238e-05, | |
| "loss": 1.2261, | |
| "step": 11870 | |
| }, | |
| { | |
| "epoch": 10.007627070408105, | |
| "grad_norm": 0.1910678595304489, | |
| "learning_rate": 6.138613861386138e-05, | |
| "loss": 1.2244, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 10.007627070408105, | |
| "eval_accuracy": 0.7456593741030724, | |
| "eval_loss": 1.1154232025146484, | |
| "eval_runtime": 887.0764, | |
| "eval_samples_per_second": 562.946, | |
| "eval_steps_per_second": 5.213, | |
| "step": 11880 | |
| }, | |
| { | |
| "epoch": 10.01605099891855, | |
| "grad_norm": 0.18324702978134155, | |
| "learning_rate": 6.039603960396039e-05, | |
| "loss": 1.2267, | |
| "step": 11890 | |
| }, | |
| { | |
| "epoch": 10.024474927428994, | |
| "grad_norm": 0.1686498522758484, | |
| "learning_rate": 5.9405940594059404e-05, | |
| "loss": 1.2242, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 10.03289885593944, | |
| "grad_norm": 0.17256265878677368, | |
| "learning_rate": 5.841584158415842e-05, | |
| "loss": 1.2239, | |
| "step": 11910 | |
| }, | |
| { | |
| "epoch": 10.041322784449884, | |
| "grad_norm": 0.19624483585357666, | |
| "learning_rate": 5.742574257425743e-05, | |
| "loss": 1.2258, | |
| "step": 11920 | |
| }, | |
| { | |
| "epoch": 10.049746712960328, | |
| "grad_norm": 0.17262500524520874, | |
| "learning_rate": 5.643564356435644e-05, | |
| "loss": 1.2258, | |
| "step": 11930 | |
| }, | |
| { | |
| "epoch": 10.058170641470772, | |
| "grad_norm": 0.1741054356098175, | |
| "learning_rate": 5.5445544554455445e-05, | |
| "loss": 1.2245, | |
| "step": 11940 | |
| }, | |
| { | |
| "epoch": 10.066594569981216, | |
| "grad_norm": 0.17313139140605927, | |
| "learning_rate": 5.4455445544554456e-05, | |
| "loss": 1.2256, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 10.075018498491662, | |
| "grad_norm": 0.18322905898094177, | |
| "learning_rate": 5.346534653465347e-05, | |
| "loss": 1.2243, | |
| "step": 11960 | |
| }, | |
| { | |
| "epoch": 10.083442427002106, | |
| "grad_norm": 0.18261946737766266, | |
| "learning_rate": 5.247524752475247e-05, | |
| "loss": 1.2252, | |
| "step": 11970 | |
| }, | |
| { | |
| "epoch": 10.083442427002106, | |
| "eval_accuracy": 0.7457714664313748, | |
| "eval_loss": 1.1143237352371216, | |
| "eval_runtime": 887.1041, | |
| "eval_samples_per_second": 562.928, | |
| "eval_steps_per_second": 5.212, | |
| "step": 11970 | |
| }, | |
| { | |
| "epoch": 10.09186635551255, | |
| "grad_norm": 0.1877572238445282, | |
| "learning_rate": 5.1485148514851485e-05, | |
| "loss": 1.2249, | |
| "step": 11980 | |
| }, | |
| { | |
| "epoch": 10.100290284022995, | |
| "grad_norm": 0.18356889486312866, | |
| "learning_rate": 5.0495049504950497e-05, | |
| "loss": 1.2255, | |
| "step": 11990 | |
| }, | |
| { | |
| "epoch": 10.108714212533439, | |
| "grad_norm": 0.1898818463087082, | |
| "learning_rate": 4.950495049504951e-05, | |
| "loss": 1.2241, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 10.117138141043885, | |
| "grad_norm": 0.17149324715137482, | |
| "learning_rate": 4.851485148514852e-05, | |
| "loss": 1.2257, | |
| "step": 12010 | |
| }, | |
| { | |
| "epoch": 10.125562069554329, | |
| "grad_norm": 0.16672831773757935, | |
| "learning_rate": 4.7524752475247525e-05, | |
| "loss": 1.2255, | |
| "step": 12020 | |
| }, | |
| { | |
| "epoch": 10.133985998064773, | |
| "grad_norm": 0.16820046305656433, | |
| "learning_rate": 4.653465346534654e-05, | |
| "loss": 1.225, | |
| "step": 12030 | |
| }, | |
| { | |
| "epoch": 10.142409926575217, | |
| "grad_norm": 0.17770229279994965, | |
| "learning_rate": 4.554455445544554e-05, | |
| "loss": 1.227, | |
| "step": 12040 | |
| }, | |
| { | |
| "epoch": 10.150833855085661, | |
| "grad_norm": 0.16082800924777985, | |
| "learning_rate": 4.455445544554455e-05, | |
| "loss": 1.2253, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 10.159257783596107, | |
| "grad_norm": 0.1669086515903473, | |
| "learning_rate": 4.3564356435643565e-05, | |
| "loss": 1.2241, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 10.159257783596107, | |
| "eval_accuracy": 0.7460534494522424, | |
| "eval_loss": 1.1121779680252075, | |
| "eval_runtime": 882.614, | |
| "eval_samples_per_second": 565.792, | |
| "eval_steps_per_second": 5.239, | |
| "step": 12060 | |
| }, | |
| { | |
| "epoch": 10.167681712106551, | |
| "grad_norm": 0.17394189536571503, | |
| "learning_rate": 4.257425742574258e-05, | |
| "loss": 1.2238, | |
| "step": 12070 | |
| }, | |
| { | |
| "epoch": 10.176105640616996, | |
| "grad_norm": 0.1611398160457611, | |
| "learning_rate": 4.158415841584159e-05, | |
| "loss": 1.2243, | |
| "step": 12080 | |
| }, | |
| { | |
| "epoch": 10.18452956912744, | |
| "grad_norm": 0.16469168663024902, | |
| "learning_rate": 4.0594059405940594e-05, | |
| "loss": 1.2232, | |
| "step": 12090 | |
| }, | |
| { | |
| "epoch": 10.192953497637886, | |
| "grad_norm": 0.1700202375650406, | |
| "learning_rate": 3.9603960396039605e-05, | |
| "loss": 1.2243, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 10.20137742614833, | |
| "grad_norm": 0.16961273550987244, | |
| "learning_rate": 3.861386138613862e-05, | |
| "loss": 1.2244, | |
| "step": 12110 | |
| }, | |
| { | |
| "epoch": 10.209801354658774, | |
| "grad_norm": 0.18176864087581635, | |
| "learning_rate": 3.762376237623762e-05, | |
| "loss": 1.2234, | |
| "step": 12120 | |
| }, | |
| { | |
| "epoch": 10.218225283169218, | |
| "grad_norm": 0.17132678627967834, | |
| "learning_rate": 3.6633663366336634e-05, | |
| "loss": 1.2231, | |
| "step": 12130 | |
| }, | |
| { | |
| "epoch": 10.226649211679662, | |
| "grad_norm": 0.1708788424730301, | |
| "learning_rate": 3.564356435643564e-05, | |
| "loss": 1.2228, | |
| "step": 12140 | |
| }, | |
| { | |
| "epoch": 10.235073140190108, | |
| "grad_norm": 0.16924616694450378, | |
| "learning_rate": 3.465346534653466e-05, | |
| "loss": 1.2241, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 10.235073140190108, | |
| "eval_accuracy": 0.7462807420235112, | |
| "eval_loss": 1.1115893125534058, | |
| "eval_runtime": 893.1249, | |
| "eval_samples_per_second": 559.133, | |
| "eval_steps_per_second": 5.177, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 10.243497068700552, | |
| "grad_norm": 0.1617705076932907, | |
| "learning_rate": 3.366336633663367e-05, | |
| "loss": 1.2239, | |
| "step": 12160 | |
| }, | |
| { | |
| "epoch": 10.251920997210997, | |
| "grad_norm": 0.17731362581253052, | |
| "learning_rate": 3.2673267326732674e-05, | |
| "loss": 1.2232, | |
| "step": 12170 | |
| }, | |
| { | |
| "epoch": 10.26034492572144, | |
| "grad_norm": 0.17324230074882507, | |
| "learning_rate": 3.1683168316831686e-05, | |
| "loss": 1.224, | |
| "step": 12180 | |
| }, | |
| { | |
| "epoch": 10.268768854231885, | |
| "grad_norm": 0.15266722440719604, | |
| "learning_rate": 3.069306930693069e-05, | |
| "loss": 1.224, | |
| "step": 12190 | |
| }, | |
| { | |
| "epoch": 10.27719278274233, | |
| "grad_norm": 0.1547342985868454, | |
| "learning_rate": 2.9702970297029702e-05, | |
| "loss": 1.2232, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 10.285616711252775, | |
| "grad_norm": 0.15873835980892181, | |
| "learning_rate": 2.8712871287128714e-05, | |
| "loss": 1.2221, | |
| "step": 12210 | |
| }, | |
| { | |
| "epoch": 10.29404063976322, | |
| "grad_norm": 0.15968631207942963, | |
| "learning_rate": 2.7722772277227722e-05, | |
| "loss": 1.223, | |
| "step": 12220 | |
| }, | |
| { | |
| "epoch": 10.302464568273663, | |
| "grad_norm": 0.15929782390594482, | |
| "learning_rate": 2.6732673267326734e-05, | |
| "loss": 1.2242, | |
| "step": 12230 | |
| }, | |
| { | |
| "epoch": 10.31088849678411, | |
| "grad_norm": 0.1512889713048935, | |
| "learning_rate": 2.5742574257425742e-05, | |
| "loss": 1.2223, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 10.31088849678411, | |
| "eval_accuracy": 0.7462616988558893, | |
| "eval_loss": 1.1114362478256226, | |
| "eval_runtime": 886.8923, | |
| "eval_samples_per_second": 563.063, | |
| "eval_steps_per_second": 5.214, | |
| "step": 12240 | |
| }, | |
| { | |
| "epoch": 10.319312425294553, | |
| "grad_norm": 0.15943297743797302, | |
| "learning_rate": 2.4752475247524754e-05, | |
| "loss": 1.2224, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 10.327736353804998, | |
| "grad_norm": 0.16134706139564514, | |
| "learning_rate": 2.3762376237623762e-05, | |
| "loss": 1.2218, | |
| "step": 12260 | |
| }, | |
| { | |
| "epoch": 10.336160282315442, | |
| "grad_norm": 0.15525278449058533, | |
| "learning_rate": 2.277227722772277e-05, | |
| "loss": 1.2237, | |
| "step": 12270 | |
| }, | |
| { | |
| "epoch": 10.344584210825886, | |
| "grad_norm": 0.1626599282026291, | |
| "learning_rate": 2.1782178217821783e-05, | |
| "loss": 1.2228, | |
| "step": 12280 | |
| }, | |
| { | |
| "epoch": 10.353008139336332, | |
| "grad_norm": 0.1533862203359604, | |
| "learning_rate": 2.0792079207920794e-05, | |
| "loss": 1.221, | |
| "step": 12290 | |
| }, | |
| { | |
| "epoch": 10.361432067846776, | |
| "grad_norm": 0.14988014101982117, | |
| "learning_rate": 1.9801980198019803e-05, | |
| "loss": 1.2238, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 10.36985599635722, | |
| "grad_norm": 0.15282054245471954, | |
| "learning_rate": 1.881188118811881e-05, | |
| "loss": 1.2202, | |
| "step": 12310 | |
| }, | |
| { | |
| "epoch": 10.378279924867664, | |
| "grad_norm": 0.1532844454050064, | |
| "learning_rate": 1.782178217821782e-05, | |
| "loss": 1.2222, | |
| "step": 12320 | |
| }, | |
| { | |
| "epoch": 10.386703853378108, | |
| "grad_norm": 0.15041793882846832, | |
| "learning_rate": 1.6831683168316834e-05, | |
| "loss": 1.2233, | |
| "step": 12330 | |
| }, | |
| { | |
| "epoch": 10.386703853378108, | |
| "eval_accuracy": 0.7464784909349403, | |
| "eval_loss": 1.1103906631469727, | |
| "eval_runtime": 893.2259, | |
| "eval_samples_per_second": 559.07, | |
| "eval_steps_per_second": 5.177, | |
| "step": 12330 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 12500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 11, | |
| "save_steps": 90, | |
| "total_flos": 3.205415169974477e+18, | |
| "train_batch_size": 108, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |