| { | |
| "best_metric": 0.2697894275188446, | |
| "best_model_checkpoint": "/mlspeech/data/yoadsnapir/models/heb_small_exp_1/checkpoint-3300", | |
| "epoch": 1.966626936829559, | |
| "eval_steps": 150, | |
| "global_step": 3300, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011918951132300357, | |
| "grad_norm": 0.46440812945365906, | |
| "learning_rate": 1.0000000000000002e-06, | |
| "loss": 0.0799, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.023837902264600714, | |
| "grad_norm": 0.3614272475242615, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0757, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.03575685339690107, | |
| "grad_norm": 0.3306252658367157, | |
| "learning_rate": 3e-06, | |
| "loss": 0.0707, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.04767580452920143, | |
| "grad_norm": 0.3750213384628296, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 0.0707, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.05959475566150179, | |
| "grad_norm": 0.3395947515964508, | |
| "learning_rate": 5e-06, | |
| "loss": 0.0677, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07151370679380215, | |
| "grad_norm": 0.3402997553348541, | |
| "learning_rate": 6e-06, | |
| "loss": 0.0651, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.08343265792610251, | |
| "grad_norm": 0.36161497235298157, | |
| "learning_rate": 7e-06, | |
| "loss": 0.0662, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.08939213349225268, | |
| "eval_loss": 0.4555220901966095, | |
| "eval_runtime": 32.3927, | |
| "eval_samples_per_second": 86.439, | |
| "eval_steps_per_second": 1.358, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.09535160905840286, | |
| "grad_norm": 0.35821646451950073, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 0.0659, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.10727056019070322, | |
| "grad_norm": 0.3810754716396332, | |
| "learning_rate": 9e-06, | |
| "loss": 0.0632, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.11918951132300358, | |
| "grad_norm": 0.35852399468421936, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0636, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.13110846245530394, | |
| "grad_norm": 0.33330053091049194, | |
| "learning_rate": 1.1000000000000001e-05, | |
| "loss": 0.0632, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1430274135876043, | |
| "grad_norm": 0.3282977044582367, | |
| "learning_rate": 1.2e-05, | |
| "loss": 0.0617, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.15494636471990464, | |
| "grad_norm": 0.32969963550567627, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 0.0609, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.16686531585220502, | |
| "grad_norm": 0.33057352900505066, | |
| "learning_rate": 1.4e-05, | |
| "loss": 0.0606, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.17878426698450536, | |
| "grad_norm": 0.3497098982334137, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.0598, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.17878426698450536, | |
| "eval_loss": 0.4085025489330292, | |
| "eval_runtime": 24.3982, | |
| "eval_samples_per_second": 114.763, | |
| "eval_steps_per_second": 1.803, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1907032181168057, | |
| "grad_norm": 0.34013646841049194, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.0586, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.2026221692491061, | |
| "grad_norm": 0.3030904233455658, | |
| "learning_rate": 1.7e-05, | |
| "loss": 0.0588, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.21454112038140644, | |
| "grad_norm": 0.3328603208065033, | |
| "learning_rate": 1.8e-05, | |
| "loss": 0.0562, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.22646007151370678, | |
| "grad_norm": 0.32149162888526917, | |
| "learning_rate": 1.9e-05, | |
| "loss": 0.0559, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.23837902264600716, | |
| "grad_norm": 0.2991398572921753, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0553, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.25029797377830754, | |
| "grad_norm": 0.30644193291664124, | |
| "learning_rate": 2.1000000000000002e-05, | |
| "loss": 0.0556, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2622169249106079, | |
| "grad_norm": 0.357546329498291, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 0.0546, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.26817640047675806, | |
| "eval_loss": 0.38544225692749023, | |
| "eval_runtime": 24.3319, | |
| "eval_samples_per_second": 115.075, | |
| "eval_steps_per_second": 1.808, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.27413587604290823, | |
| "grad_norm": 0.3578217923641205, | |
| "learning_rate": 2.3e-05, | |
| "loss": 0.054, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.2860548271752086, | |
| "grad_norm": 0.34381797909736633, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.0553, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.29797377830750893, | |
| "grad_norm": 0.30176016688346863, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0543, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.3098927294398093, | |
| "grad_norm": 0.33324921131134033, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 0.0544, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.3218116805721097, | |
| "grad_norm": 0.3484705090522766, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 0.0531, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.33373063170441003, | |
| "grad_norm": 0.3282860219478607, | |
| "learning_rate": 2.8e-05, | |
| "loss": 0.0518, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3456495828367104, | |
| "grad_norm": 0.3473268449306488, | |
| "learning_rate": 2.9e-05, | |
| "loss": 0.0515, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.3575685339690107, | |
| "grad_norm": 0.3255419433116913, | |
| "learning_rate": 3.0000000000000004e-05, | |
| "loss": 0.0526, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3575685339690107, | |
| "eval_loss": 0.36782801151275635, | |
| "eval_runtime": 24.3536, | |
| "eval_samples_per_second": 114.973, | |
| "eval_steps_per_second": 1.807, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3694874851013111, | |
| "grad_norm": 0.3105810880661011, | |
| "learning_rate": 3.1e-05, | |
| "loss": 0.0505, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.3814064362336114, | |
| "grad_norm": 0.3272826373577118, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 0.0514, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.3933253873659118, | |
| "grad_norm": 0.2896602749824524, | |
| "learning_rate": 3.3e-05, | |
| "loss": 0.0509, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.4052443384982122, | |
| "grad_norm": 0.2664831876754761, | |
| "learning_rate": 3.4e-05, | |
| "loss": 0.0508, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.4171632896305125, | |
| "grad_norm": 0.29491451382637024, | |
| "learning_rate": 3.5000000000000004e-05, | |
| "loss": 0.0503, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.42908224076281287, | |
| "grad_norm": 0.3337947726249695, | |
| "learning_rate": 3.6e-05, | |
| "loss": 0.0508, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4410011918951132, | |
| "grad_norm": 0.274883508682251, | |
| "learning_rate": 3.7000000000000005e-05, | |
| "loss": 0.0491, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.4469606674612634, | |
| "eval_loss": 0.3556332290172577, | |
| "eval_runtime": 24.3629, | |
| "eval_samples_per_second": 114.929, | |
| "eval_steps_per_second": 1.806, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.45292014302741357, | |
| "grad_norm": 0.3304573595523834, | |
| "learning_rate": 3.8e-05, | |
| "loss": 0.0496, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.464839094159714, | |
| "grad_norm": 0.43425601720809937, | |
| "learning_rate": 3.9e-05, | |
| "loss": 0.0514, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.4767580452920143, | |
| "grad_norm": 0.27023833990097046, | |
| "learning_rate": 4e-05, | |
| "loss": 0.0493, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.48867699642431467, | |
| "grad_norm": 0.30646392703056335, | |
| "learning_rate": 3.981105337742088e-05, | |
| "loss": 0.0479, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.5005959475566151, | |
| "grad_norm": 0.2886641025543213, | |
| "learning_rate": 3.9622106754841764e-05, | |
| "loss": 0.0477, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.5125148986889154, | |
| "grad_norm": 0.2744874358177185, | |
| "learning_rate": 3.943316013226264e-05, | |
| "loss": 0.0475, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.5244338498212158, | |
| "grad_norm": 0.3189797103404999, | |
| "learning_rate": 3.924421350968352e-05, | |
| "loss": 0.0487, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.5363528009535161, | |
| "grad_norm": 0.2827907204627991, | |
| "learning_rate": 3.9055266887104394e-05, | |
| "loss": 0.047, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5363528009535161, | |
| "eval_loss": 0.3409503102302551, | |
| "eval_runtime": 24.4692, | |
| "eval_samples_per_second": 114.43, | |
| "eval_steps_per_second": 1.798, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5482717520858165, | |
| "grad_norm": 0.29098019003868103, | |
| "learning_rate": 3.886632026452528e-05, | |
| "loss": 0.0477, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5601907032181168, | |
| "grad_norm": 0.34882616996765137, | |
| "learning_rate": 3.8677373641946155e-05, | |
| "loss": 0.0469, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.5721096543504172, | |
| "grad_norm": 0.27143335342407227, | |
| "learning_rate": 3.848842701936703e-05, | |
| "loss": 0.0463, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.5840286054827175, | |
| "grad_norm": 0.3700932562351227, | |
| "learning_rate": 3.829948039678791e-05, | |
| "loss": 0.0471, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.5959475566150179, | |
| "grad_norm": 0.2656194567680359, | |
| "learning_rate": 3.811053377420879e-05, | |
| "loss": 0.0435, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6078665077473182, | |
| "grad_norm": 0.3158736526966095, | |
| "learning_rate": 3.792158715162967e-05, | |
| "loss": 0.0456, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.6197854588796186, | |
| "grad_norm": 0.3060922920703888, | |
| "learning_rate": 3.7732640529050546e-05, | |
| "loss": 0.0464, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.6257449344457687, | |
| "eval_loss": 0.32704514265060425, | |
| "eval_runtime": 24.2942, | |
| "eval_samples_per_second": 115.254, | |
| "eval_steps_per_second": 1.811, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6317044100119189, | |
| "grad_norm": 0.284631609916687, | |
| "learning_rate": 3.754369390647142e-05, | |
| "loss": 0.0436, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.6436233611442194, | |
| "grad_norm": 0.2965853810310364, | |
| "learning_rate": 3.7354747283892307e-05, | |
| "loss": 0.0451, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.6555423122765197, | |
| "grad_norm": 0.2642371952533722, | |
| "learning_rate": 3.7165800661313183e-05, | |
| "loss": 0.0447, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.6674612634088201, | |
| "grad_norm": 0.2913525402545929, | |
| "learning_rate": 3.697685403873406e-05, | |
| "loss": 0.0438, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.6793802145411204, | |
| "grad_norm": 0.9295551776885986, | |
| "learning_rate": 3.678790741615494e-05, | |
| "loss": 0.0454, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.6912991656734208, | |
| "grad_norm": 0.2802795469760895, | |
| "learning_rate": 3.659896079357582e-05, | |
| "loss": 0.0437, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.7032181168057211, | |
| "grad_norm": 0.2768346965312958, | |
| "learning_rate": 3.64100141709967e-05, | |
| "loss": 0.0436, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.7151370679380215, | |
| "grad_norm": 0.32342103123664856, | |
| "learning_rate": 3.6221067548417575e-05, | |
| "loss": 0.0434, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7151370679380215, | |
| "eval_loss": 0.3151066303253174, | |
| "eval_runtime": 24.3395, | |
| "eval_samples_per_second": 115.039, | |
| "eval_steps_per_second": 1.808, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7270560190703218, | |
| "grad_norm": 0.28825289011001587, | |
| "learning_rate": 3.603212092583845e-05, | |
| "loss": 0.0444, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.7389749702026222, | |
| "grad_norm": 0.2973019778728485, | |
| "learning_rate": 3.5843174303259335e-05, | |
| "loss": 0.0432, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.7508939213349225, | |
| "grad_norm": 0.28604206442832947, | |
| "learning_rate": 3.565422768068021e-05, | |
| "loss": 0.0427, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.7628128724672228, | |
| "grad_norm": 0.3076627552509308, | |
| "learning_rate": 3.546528105810109e-05, | |
| "loss": 0.0437, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.7747318235995232, | |
| "grad_norm": 0.24243097007274628, | |
| "learning_rate": 3.5276334435521966e-05, | |
| "loss": 0.0419, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.7866507747318237, | |
| "grad_norm": 0.2624588906764984, | |
| "learning_rate": 3.508738781294285e-05, | |
| "loss": 0.041, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.798569725864124, | |
| "grad_norm": 0.2966582477092743, | |
| "learning_rate": 3.4898441190363726e-05, | |
| "loss": 0.0419, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.8045292014302742, | |
| "eval_loss": 0.30741235613822937, | |
| "eval_runtime": 24.3327, | |
| "eval_samples_per_second": 115.071, | |
| "eval_steps_per_second": 1.808, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.8104886769964244, | |
| "grad_norm": 0.2586236894130707, | |
| "learning_rate": 3.47094945677846e-05, | |
| "loss": 0.0417, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.8224076281287247, | |
| "grad_norm": 0.26168954372406006, | |
| "learning_rate": 3.452054794520548e-05, | |
| "loss": 0.0416, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.834326579261025, | |
| "grad_norm": 0.2288641631603241, | |
| "learning_rate": 3.4331601322626364e-05, | |
| "loss": 0.0407, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.8462455303933254, | |
| "grad_norm": 0.239312082529068, | |
| "learning_rate": 3.414265470004724e-05, | |
| "loss": 0.0424, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.8581644815256257, | |
| "grad_norm": 0.2796385884284973, | |
| "learning_rate": 3.395370807746812e-05, | |
| "loss": 0.0423, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.8700834326579261, | |
| "grad_norm": 0.26898354291915894, | |
| "learning_rate": 3.3764761454888994e-05, | |
| "loss": 0.0404, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.8820023837902264, | |
| "grad_norm": 0.2980300188064575, | |
| "learning_rate": 3.357581483230988e-05, | |
| "loss": 0.0413, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.8939213349225268, | |
| "grad_norm": 0.2609230577945709, | |
| "learning_rate": 3.3386868209730755e-05, | |
| "loss": 0.0417, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.8939213349225268, | |
| "eval_loss": 0.3016362488269806, | |
| "eval_runtime": 24.39, | |
| "eval_samples_per_second": 114.801, | |
| "eval_steps_per_second": 1.804, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.9058402860548271, | |
| "grad_norm": 0.22545361518859863, | |
| "learning_rate": 3.319792158715163e-05, | |
| "loss": 0.0402, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.9177592371871275, | |
| "grad_norm": 0.23952241241931915, | |
| "learning_rate": 3.300897496457251e-05, | |
| "loss": 0.0404, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.929678188319428, | |
| "grad_norm": 0.211136594414711, | |
| "learning_rate": 3.282002834199339e-05, | |
| "loss": 0.0394, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.9415971394517283, | |
| "grad_norm": 0.2786746323108673, | |
| "learning_rate": 3.263108171941427e-05, | |
| "loss": 0.0405, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.9535160905840286, | |
| "grad_norm": 0.27551010251045227, | |
| "learning_rate": 3.2442135096835146e-05, | |
| "loss": 0.0393, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.965435041716329, | |
| "grad_norm": 0.23502199351787567, | |
| "learning_rate": 3.225318847425602e-05, | |
| "loss": 0.0397, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.9773539928486293, | |
| "grad_norm": 0.22266072034835815, | |
| "learning_rate": 3.2064241851676906e-05, | |
| "loss": 0.0382, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.9833134684147795, | |
| "eval_loss": 0.2927956283092499, | |
| "eval_runtime": 24.5022, | |
| "eval_samples_per_second": 114.276, | |
| "eval_steps_per_second": 1.796, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.9892729439809297, | |
| "grad_norm": 0.2455359548330307, | |
| "learning_rate": 3.187529522909778e-05, | |
| "loss": 0.0377, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.0011918951132301, | |
| "grad_norm": 0.19770461320877075, | |
| "learning_rate": 3.168634860651866e-05, | |
| "loss": 0.0394, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.0131108462455305, | |
| "grad_norm": 0.23039540648460388, | |
| "learning_rate": 3.149740198393954e-05, | |
| "loss": 0.0306, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.0250297973778308, | |
| "grad_norm": 0.22617976367473602, | |
| "learning_rate": 3.130845536136042e-05, | |
| "loss": 0.0302, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.0369487485101312, | |
| "grad_norm": 0.2306171953678131, | |
| "learning_rate": 3.11195087387813e-05, | |
| "loss": 0.0297, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.0488676996424315, | |
| "grad_norm": 0.2231408953666687, | |
| "learning_rate": 3.0930562116202174e-05, | |
| "loss": 0.0291, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.0607866507747319, | |
| "grad_norm": 0.2368210107088089, | |
| "learning_rate": 3.074161549362305e-05, | |
| "loss": 0.0296, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.0727056019070322, | |
| "grad_norm": 0.2105601727962494, | |
| "learning_rate": 3.0552668871043935e-05, | |
| "loss": 0.0302, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.0727056019070322, | |
| "eval_loss": 0.2897118330001831, | |
| "eval_runtime": 24.5143, | |
| "eval_samples_per_second": 114.219, | |
| "eval_steps_per_second": 1.795, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.0846245530393326, | |
| "grad_norm": 0.21115803718566895, | |
| "learning_rate": 3.036372224846481e-05, | |
| "loss": 0.03, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.096543504171633, | |
| "grad_norm": 0.21346606314182281, | |
| "learning_rate": 3.017477562588569e-05, | |
| "loss": 0.0303, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.1084624553039333, | |
| "grad_norm": 0.2169450968503952, | |
| "learning_rate": 2.998582900330657e-05, | |
| "loss": 0.0301, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.1203814064362336, | |
| "grad_norm": 0.22653773427009583, | |
| "learning_rate": 2.979688238072745e-05, | |
| "loss": 0.03, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.132300357568534, | |
| "grad_norm": 0.23219868540763855, | |
| "learning_rate": 2.9607935758148326e-05, | |
| "loss": 0.0299, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.1442193087008343, | |
| "grad_norm": 0.19858673214912415, | |
| "learning_rate": 2.9418989135569203e-05, | |
| "loss": 0.0304, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.1561382598331347, | |
| "grad_norm": 0.20563630759716034, | |
| "learning_rate": 2.9230042512990083e-05, | |
| "loss": 0.0298, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.162097735399285, | |
| "eval_loss": 0.2881970703601837, | |
| "eval_runtime": 24.482, | |
| "eval_samples_per_second": 114.37, | |
| "eval_steps_per_second": 1.797, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.168057210965435, | |
| "grad_norm": 0.23171384632587433, | |
| "learning_rate": 2.9041095890410963e-05, | |
| "loss": 0.0305, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.1799761620977354, | |
| "grad_norm": 0.22927935421466827, | |
| "learning_rate": 2.885214926783184e-05, | |
| "loss": 0.0293, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.1918951132300357, | |
| "grad_norm": 0.22874480485916138, | |
| "learning_rate": 2.8663202645252717e-05, | |
| "loss": 0.0306, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.203814064362336, | |
| "grad_norm": 0.2067805975675583, | |
| "learning_rate": 2.8474256022673597e-05, | |
| "loss": 0.0301, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 1.2157330154946364, | |
| "grad_norm": 0.2245558649301529, | |
| "learning_rate": 2.8285309400094477e-05, | |
| "loss": 0.032, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 1.2276519666269368, | |
| "grad_norm": 0.2403639554977417, | |
| "learning_rate": 2.8096362777515354e-05, | |
| "loss": 0.0296, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 1.2395709177592371, | |
| "grad_norm": 0.2070910781621933, | |
| "learning_rate": 2.790741615493623e-05, | |
| "loss": 0.03, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 1.2514898688915377, | |
| "grad_norm": 0.23112566769123077, | |
| "learning_rate": 2.771846953235711e-05, | |
| "loss": 0.0295, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.2514898688915377, | |
| "eval_loss": 0.2860635817050934, | |
| "eval_runtime": 24.4598, | |
| "eval_samples_per_second": 114.473, | |
| "eval_steps_per_second": 1.799, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.2634088200238378, | |
| "grad_norm": 0.2321086972951889, | |
| "learning_rate": 2.7529522909777992e-05, | |
| "loss": 0.0307, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 1.2753277711561384, | |
| "grad_norm": 0.23003174364566803, | |
| "learning_rate": 2.734057628719887e-05, | |
| "loss": 0.0306, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 1.2872467222884385, | |
| "grad_norm": 0.2210853099822998, | |
| "learning_rate": 2.7151629664619745e-05, | |
| "loss": 0.0289, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 1.299165673420739, | |
| "grad_norm": 0.2513985335826874, | |
| "learning_rate": 2.6962683042040626e-05, | |
| "loss": 0.0298, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 1.3110846245530392, | |
| "grad_norm": 0.2379380464553833, | |
| "learning_rate": 2.6773736419461506e-05, | |
| "loss": 0.0301, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.3230035756853398, | |
| "grad_norm": 0.21417196094989777, | |
| "learning_rate": 2.6584789796882383e-05, | |
| "loss": 0.0301, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 1.3349225268176401, | |
| "grad_norm": 0.22283975780010223, | |
| "learning_rate": 2.639584317430326e-05, | |
| "loss": 0.0309, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 1.3408820023837902, | |
| "eval_loss": 0.2835468649864197, | |
| "eval_runtime": 24.4862, | |
| "eval_samples_per_second": 114.35, | |
| "eval_steps_per_second": 1.797, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.3468414779499405, | |
| "grad_norm": 0.21475766599178314, | |
| "learning_rate": 2.620689655172414e-05, | |
| "loss": 0.0296, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 1.3587604290822408, | |
| "grad_norm": 0.2578884959220886, | |
| "learning_rate": 2.601794992914502e-05, | |
| "loss": 0.0285, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 1.3706793802145412, | |
| "grad_norm": 0.208306223154068, | |
| "learning_rate": 2.5829003306565897e-05, | |
| "loss": 0.0296, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.3825983313468415, | |
| "grad_norm": 0.20098654925823212, | |
| "learning_rate": 2.5640056683986774e-05, | |
| "loss": 0.0299, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 1.3945172824791419, | |
| "grad_norm": 0.19324277341365814, | |
| "learning_rate": 2.5451110061407654e-05, | |
| "loss": 0.0299, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 1.4064362336114422, | |
| "grad_norm": 0.22991597652435303, | |
| "learning_rate": 2.5262163438828534e-05, | |
| "loss": 0.0283, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 1.4183551847437426, | |
| "grad_norm": 0.20805244147777557, | |
| "learning_rate": 2.507321681624941e-05, | |
| "loss": 0.0294, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 1.430274135876043, | |
| "grad_norm": 0.2217872589826584, | |
| "learning_rate": 2.488427019367029e-05, | |
| "loss": 0.0298, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.430274135876043, | |
| "eval_loss": 0.28145191073417664, | |
| "eval_runtime": 24.4822, | |
| "eval_samples_per_second": 114.369, | |
| "eval_steps_per_second": 1.797, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.4421930870083433, | |
| "grad_norm": 0.2886907756328583, | |
| "learning_rate": 2.469532357109117e-05, | |
| "loss": 0.0286, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 1.4541120381406436, | |
| "grad_norm": 0.22498397529125214, | |
| "learning_rate": 2.450637694851205e-05, | |
| "loss": 0.0303, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 1.466030989272944, | |
| "grad_norm": 0.2244011014699936, | |
| "learning_rate": 2.4317430325932926e-05, | |
| "loss": 0.0291, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 1.4779499404052443, | |
| "grad_norm": 0.254245400428772, | |
| "learning_rate": 2.4128483703353806e-05, | |
| "loss": 0.0291, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 1.4898688915375446, | |
| "grad_norm": 0.23943567276000977, | |
| "learning_rate": 2.3939537080774683e-05, | |
| "loss": 0.0279, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.5017878426698452, | |
| "grad_norm": 0.19281241297721863, | |
| "learning_rate": 2.3750590458195563e-05, | |
| "loss": 0.0284, | |
| "step": 2520 | |
| }, | |
| { | |
| "epoch": 1.5137067938021453, | |
| "grad_norm": 0.1942477971315384, | |
| "learning_rate": 2.356164383561644e-05, | |
| "loss": 0.0286, | |
| "step": 2540 | |
| }, | |
| { | |
| "epoch": 1.5196662693682956, | |
| "eval_loss": 0.2797168493270874, | |
| "eval_runtime": 24.5526, | |
| "eval_samples_per_second": 114.041, | |
| "eval_steps_per_second": 1.792, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.525625744934446, | |
| "grad_norm": 0.21547091007232666, | |
| "learning_rate": 2.337269721303732e-05, | |
| "loss": 0.0299, | |
| "step": 2560 | |
| }, | |
| { | |
| "epoch": 1.537544696066746, | |
| "grad_norm": 0.2152203619480133, | |
| "learning_rate": 2.3183750590458197e-05, | |
| "loss": 0.0285, | |
| "step": 2580 | |
| }, | |
| { | |
| "epoch": 1.5494636471990466, | |
| "grad_norm": 0.19925089180469513, | |
| "learning_rate": 2.2994803967879077e-05, | |
| "loss": 0.0282, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.5613825983313467, | |
| "grad_norm": 0.21512266993522644, | |
| "learning_rate": 2.2805857345299954e-05, | |
| "loss": 0.0291, | |
| "step": 2620 | |
| }, | |
| { | |
| "epoch": 1.5733015494636473, | |
| "grad_norm": 0.2275344282388687, | |
| "learning_rate": 2.2616910722720834e-05, | |
| "loss": 0.0288, | |
| "step": 2640 | |
| }, | |
| { | |
| "epoch": 1.5852205005959474, | |
| "grad_norm": 0.22899560630321503, | |
| "learning_rate": 2.242796410014171e-05, | |
| "loss": 0.0279, | |
| "step": 2660 | |
| }, | |
| { | |
| "epoch": 1.597139451728248, | |
| "grad_norm": 0.21439722180366516, | |
| "learning_rate": 2.223901747756259e-05, | |
| "loss": 0.0273, | |
| "step": 2680 | |
| }, | |
| { | |
| "epoch": 1.6090584028605481, | |
| "grad_norm": 0.21393275260925293, | |
| "learning_rate": 2.205007085498347e-05, | |
| "loss": 0.0284, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.6090584028605481, | |
| "eval_loss": 0.27715668082237244, | |
| "eval_runtime": 24.544, | |
| "eval_samples_per_second": 114.081, | |
| "eval_steps_per_second": 1.793, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.6209773539928487, | |
| "grad_norm": 0.22286593914031982, | |
| "learning_rate": 2.186112423240435e-05, | |
| "loss": 0.028, | |
| "step": 2720 | |
| }, | |
| { | |
| "epoch": 1.6328963051251488, | |
| "grad_norm": 0.21552002429962158, | |
| "learning_rate": 2.1672177609825225e-05, | |
| "loss": 0.0282, | |
| "step": 2740 | |
| }, | |
| { | |
| "epoch": 1.6448152562574494, | |
| "grad_norm": 0.20231053233146667, | |
| "learning_rate": 2.1483230987246106e-05, | |
| "loss": 0.0278, | |
| "step": 2760 | |
| }, | |
| { | |
| "epoch": 1.6567342073897497, | |
| "grad_norm": 0.20446668565273285, | |
| "learning_rate": 2.1294284364666983e-05, | |
| "loss": 0.0283, | |
| "step": 2780 | |
| }, | |
| { | |
| "epoch": 1.66865315852205, | |
| "grad_norm": 0.21102942526340485, | |
| "learning_rate": 2.1105337742087863e-05, | |
| "loss": 0.029, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.6805721096543504, | |
| "grad_norm": 0.190469890832901, | |
| "learning_rate": 2.091639111950874e-05, | |
| "loss": 0.0286, | |
| "step": 2820 | |
| }, | |
| { | |
| "epoch": 1.6924910607866508, | |
| "grad_norm": 0.2115412801504135, | |
| "learning_rate": 2.072744449692962e-05, | |
| "loss": 0.0278, | |
| "step": 2840 | |
| }, | |
| { | |
| "epoch": 1.698450536352801, | |
| "eval_loss": 0.2754322588443756, | |
| "eval_runtime": 24.6377, | |
| "eval_samples_per_second": 113.647, | |
| "eval_steps_per_second": 1.786, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.7044100119189511, | |
| "grad_norm": 0.18578040599822998, | |
| "learning_rate": 2.0538497874350497e-05, | |
| "loss": 0.0284, | |
| "step": 2860 | |
| }, | |
| { | |
| "epoch": 1.7163289630512515, | |
| "grad_norm": 0.23163530230522156, | |
| "learning_rate": 2.0349551251771377e-05, | |
| "loss": 0.0277, | |
| "step": 2880 | |
| }, | |
| { | |
| "epoch": 1.7282479141835518, | |
| "grad_norm": 0.2035950869321823, | |
| "learning_rate": 2.0160604629192254e-05, | |
| "loss": 0.0275, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.7401668653158522, | |
| "grad_norm": 0.24992471933364868, | |
| "learning_rate": 1.9971658006613134e-05, | |
| "loss": 0.028, | |
| "step": 2920 | |
| }, | |
| { | |
| "epoch": 1.7520858164481525, | |
| "grad_norm": 0.20510628819465637, | |
| "learning_rate": 1.978271138403401e-05, | |
| "loss": 0.027, | |
| "step": 2940 | |
| }, | |
| { | |
| "epoch": 1.7640047675804529, | |
| "grad_norm": 0.216608926653862, | |
| "learning_rate": 1.959376476145489e-05, | |
| "loss": 0.0283, | |
| "step": 2960 | |
| }, | |
| { | |
| "epoch": 1.7759237187127532, | |
| "grad_norm": 0.20540915429592133, | |
| "learning_rate": 1.9404818138875768e-05, | |
| "loss": 0.0274, | |
| "step": 2980 | |
| }, | |
| { | |
| "epoch": 1.7878426698450536, | |
| "grad_norm": 0.20191486179828644, | |
| "learning_rate": 1.921587151629665e-05, | |
| "loss": 0.0275, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.7878426698450536, | |
| "eval_loss": 0.27235740423202515, | |
| "eval_runtime": 24.612, | |
| "eval_samples_per_second": 113.766, | |
| "eval_steps_per_second": 1.788, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.7997616209773541, | |
| "grad_norm": 0.24276046454906464, | |
| "learning_rate": 1.9026924893717525e-05, | |
| "loss": 0.028, | |
| "step": 3020 | |
| }, | |
| { | |
| "epoch": 1.8116805721096543, | |
| "grad_norm": 0.21864300966262817, | |
| "learning_rate": 1.8837978271138406e-05, | |
| "loss": 0.0279, | |
| "step": 3040 | |
| }, | |
| { | |
| "epoch": 1.8235995232419548, | |
| "grad_norm": 0.2271438091993332, | |
| "learning_rate": 1.8649031648559282e-05, | |
| "loss": 0.0276, | |
| "step": 3060 | |
| }, | |
| { | |
| "epoch": 1.835518474374255, | |
| "grad_norm": 0.20855475962162018, | |
| "learning_rate": 1.8460085025980163e-05, | |
| "loss": 0.0283, | |
| "step": 3080 | |
| }, | |
| { | |
| "epoch": 1.8474374255065555, | |
| "grad_norm": 0.2064722329378128, | |
| "learning_rate": 1.827113840340104e-05, | |
| "loss": 0.0271, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.8593563766388557, | |
| "grad_norm": 0.2111743986606598, | |
| "learning_rate": 1.808219178082192e-05, | |
| "loss": 0.0273, | |
| "step": 3120 | |
| }, | |
| { | |
| "epoch": 1.8712753277711562, | |
| "grad_norm": 0.18417872488498688, | |
| "learning_rate": 1.7893245158242797e-05, | |
| "loss": 0.0275, | |
| "step": 3140 | |
| }, | |
| { | |
| "epoch": 1.8772348033373063, | |
| "eval_loss": 0.2706995904445648, | |
| "eval_runtime": 24.5215, | |
| "eval_samples_per_second": 114.186, | |
| "eval_steps_per_second": 1.794, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 1.8831942789034564, | |
| "grad_norm": 0.19550646841526031, | |
| "learning_rate": 1.7704298535663677e-05, | |
| "loss": 0.0275, | |
| "step": 3160 | |
| }, | |
| { | |
| "epoch": 1.895113230035757, | |
| "grad_norm": 0.19912759959697723, | |
| "learning_rate": 1.7515351913084554e-05, | |
| "loss": 0.0267, | |
| "step": 3180 | |
| }, | |
| { | |
| "epoch": 1.907032181168057, | |
| "grad_norm": 0.20110997557640076, | |
| "learning_rate": 1.7326405290505434e-05, | |
| "loss": 0.0281, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.9189511323003576, | |
| "grad_norm": 0.20968247950077057, | |
| "learning_rate": 1.713745866792631e-05, | |
| "loss": 0.0271, | |
| "step": 3220 | |
| }, | |
| { | |
| "epoch": 1.930870083432658, | |
| "grad_norm": 0.20576246082782745, | |
| "learning_rate": 1.694851204534719e-05, | |
| "loss": 0.0277, | |
| "step": 3240 | |
| }, | |
| { | |
| "epoch": 1.9427890345649583, | |
| "grad_norm": 0.21499717235565186, | |
| "learning_rate": 1.6759565422768068e-05, | |
| "loss": 0.0264, | |
| "step": 3260 | |
| }, | |
| { | |
| "epoch": 1.9547079856972587, | |
| "grad_norm": 0.23822776973247528, | |
| "learning_rate": 1.6570618800188948e-05, | |
| "loss": 0.027, | |
| "step": 3280 | |
| }, | |
| { | |
| "epoch": 1.966626936829559, | |
| "grad_norm": 0.21049529314041138, | |
| "learning_rate": 1.6381672177609825e-05, | |
| "loss": 0.0259, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.966626936829559, | |
| "eval_loss": 0.2697894275188446, | |
| "eval_runtime": 24.6235, | |
| "eval_samples_per_second": 113.713, | |
| "eval_steps_per_second": 1.787, | |
| "step": 3300 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 5034, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2189847399207797e+20, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |