{ "best_metric": 0.2697894275188446, "best_model_checkpoint": "/mlspeech/data/yoadsnapir/models/heb_small_exp_1/checkpoint-3300", "epoch": 1.966626936829559, "eval_steps": 150, "global_step": 3300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011918951132300357, "grad_norm": 0.46440812945365906, "learning_rate": 1.0000000000000002e-06, "loss": 0.0799, "step": 20 }, { "epoch": 0.023837902264600714, "grad_norm": 0.3614272475242615, "learning_rate": 2.0000000000000003e-06, "loss": 0.0757, "step": 40 }, { "epoch": 0.03575685339690107, "grad_norm": 0.3306252658367157, "learning_rate": 3e-06, "loss": 0.0707, "step": 60 }, { "epoch": 0.04767580452920143, "grad_norm": 0.3750213384628296, "learning_rate": 4.000000000000001e-06, "loss": 0.0707, "step": 80 }, { "epoch": 0.05959475566150179, "grad_norm": 0.3395947515964508, "learning_rate": 5e-06, "loss": 0.0677, "step": 100 }, { "epoch": 0.07151370679380215, "grad_norm": 0.3402997553348541, "learning_rate": 6e-06, "loss": 0.0651, "step": 120 }, { "epoch": 0.08343265792610251, "grad_norm": 0.36161497235298157, "learning_rate": 7e-06, "loss": 0.0662, "step": 140 }, { "epoch": 0.08939213349225268, "eval_loss": 0.4555220901966095, "eval_runtime": 32.3927, "eval_samples_per_second": 86.439, "eval_steps_per_second": 1.358, "step": 150 }, { "epoch": 0.09535160905840286, "grad_norm": 0.35821646451950073, "learning_rate": 8.000000000000001e-06, "loss": 0.0659, "step": 160 }, { "epoch": 0.10727056019070322, "grad_norm": 0.3810754716396332, "learning_rate": 9e-06, "loss": 0.0632, "step": 180 }, { "epoch": 0.11918951132300358, "grad_norm": 0.35852399468421936, "learning_rate": 1e-05, "loss": 0.0636, "step": 200 }, { "epoch": 0.13110846245530394, "grad_norm": 0.33330053091049194, "learning_rate": 1.1000000000000001e-05, "loss": 0.0632, "step": 220 }, { "epoch": 0.1430274135876043, "grad_norm": 0.3282977044582367, "learning_rate": 1.2e-05, "loss": 0.0617, "step": 240 }, { "epoch": 0.15494636471990464, "grad_norm": 0.32969963550567627, "learning_rate": 1.3000000000000001e-05, "loss": 0.0609, "step": 260 }, { "epoch": 0.16686531585220502, "grad_norm": 0.33057352900505066, "learning_rate": 1.4e-05, "loss": 0.0606, "step": 280 }, { "epoch": 0.17878426698450536, "grad_norm": 0.3497098982334137, "learning_rate": 1.5000000000000002e-05, "loss": 0.0598, "step": 300 }, { "epoch": 0.17878426698450536, "eval_loss": 0.4085025489330292, "eval_runtime": 24.3982, "eval_samples_per_second": 114.763, "eval_steps_per_second": 1.803, "step": 300 }, { "epoch": 0.1907032181168057, "grad_norm": 0.34013646841049194, "learning_rate": 1.6000000000000003e-05, "loss": 0.0586, "step": 320 }, { "epoch": 0.2026221692491061, "grad_norm": 0.3030904233455658, "learning_rate": 1.7e-05, "loss": 0.0588, "step": 340 }, { "epoch": 0.21454112038140644, "grad_norm": 0.3328603208065033, "learning_rate": 1.8e-05, "loss": 0.0562, "step": 360 }, { "epoch": 0.22646007151370678, "grad_norm": 0.32149162888526917, "learning_rate": 1.9e-05, "loss": 0.0559, "step": 380 }, { "epoch": 0.23837902264600716, "grad_norm": 0.2991398572921753, "learning_rate": 2e-05, "loss": 0.0553, "step": 400 }, { "epoch": 0.25029797377830754, "grad_norm": 0.30644193291664124, "learning_rate": 2.1000000000000002e-05, "loss": 0.0556, "step": 420 }, { "epoch": 0.2622169249106079, "grad_norm": 0.357546329498291, "learning_rate": 2.2000000000000003e-05, "loss": 0.0546, "step": 440 }, { "epoch": 0.26817640047675806, "eval_loss": 0.38544225692749023, "eval_runtime": 24.3319, "eval_samples_per_second": 115.075, "eval_steps_per_second": 1.808, "step": 450 }, { "epoch": 0.27413587604290823, "grad_norm": 0.3578217923641205, "learning_rate": 2.3e-05, "loss": 0.054, "step": 460 }, { "epoch": 0.2860548271752086, "grad_norm": 0.34381797909736633, "learning_rate": 2.4e-05, "loss": 0.0553, "step": 480 }, { "epoch": 0.29797377830750893, "grad_norm": 0.30176016688346863, "learning_rate": 2.5e-05, "loss": 0.0543, "step": 500 }, { "epoch": 0.3098927294398093, "grad_norm": 0.33324921131134033, "learning_rate": 2.6000000000000002e-05, "loss": 0.0544, "step": 520 }, { "epoch": 0.3218116805721097, "grad_norm": 0.3484705090522766, "learning_rate": 2.7000000000000002e-05, "loss": 0.0531, "step": 540 }, { "epoch": 0.33373063170441003, "grad_norm": 0.3282860219478607, "learning_rate": 2.8e-05, "loss": 0.0518, "step": 560 }, { "epoch": 0.3456495828367104, "grad_norm": 0.3473268449306488, "learning_rate": 2.9e-05, "loss": 0.0515, "step": 580 }, { "epoch": 0.3575685339690107, "grad_norm": 0.3255419433116913, "learning_rate": 3.0000000000000004e-05, "loss": 0.0526, "step": 600 }, { "epoch": 0.3575685339690107, "eval_loss": 0.36782801151275635, "eval_runtime": 24.3536, "eval_samples_per_second": 114.973, "eval_steps_per_second": 1.807, "step": 600 }, { "epoch": 0.3694874851013111, "grad_norm": 0.3105810880661011, "learning_rate": 3.1e-05, "loss": 0.0505, "step": 620 }, { "epoch": 0.3814064362336114, "grad_norm": 0.3272826373577118, "learning_rate": 3.2000000000000005e-05, "loss": 0.0514, "step": 640 }, { "epoch": 0.3933253873659118, "grad_norm": 0.2896602749824524, "learning_rate": 3.3e-05, "loss": 0.0509, "step": 660 }, { "epoch": 0.4052443384982122, "grad_norm": 0.2664831876754761, "learning_rate": 3.4e-05, "loss": 0.0508, "step": 680 }, { "epoch": 0.4171632896305125, "grad_norm": 0.29491451382637024, "learning_rate": 3.5000000000000004e-05, "loss": 0.0503, "step": 700 }, { "epoch": 0.42908224076281287, "grad_norm": 0.3337947726249695, "learning_rate": 3.6e-05, "loss": 0.0508, "step": 720 }, { "epoch": 0.4410011918951132, "grad_norm": 0.274883508682251, "learning_rate": 3.7000000000000005e-05, "loss": 0.0491, "step": 740 }, { "epoch": 0.4469606674612634, "eval_loss": 0.3556332290172577, "eval_runtime": 24.3629, "eval_samples_per_second": 114.929, "eval_steps_per_second": 1.806, "step": 750 }, { "epoch": 0.45292014302741357, "grad_norm": 0.3304573595523834, "learning_rate": 3.8e-05, "loss": 0.0496, "step": 760 }, { "epoch": 0.464839094159714, "grad_norm": 0.43425601720809937, "learning_rate": 3.9e-05, "loss": 0.0514, "step": 780 }, { "epoch": 0.4767580452920143, "grad_norm": 0.27023833990097046, "learning_rate": 4e-05, "loss": 0.0493, "step": 800 }, { "epoch": 0.48867699642431467, "grad_norm": 0.30646392703056335, "learning_rate": 3.981105337742088e-05, "loss": 0.0479, "step": 820 }, { "epoch": 0.5005959475566151, "grad_norm": 0.2886641025543213, "learning_rate": 3.9622106754841764e-05, "loss": 0.0477, "step": 840 }, { "epoch": 0.5125148986889154, "grad_norm": 0.2744874358177185, "learning_rate": 3.943316013226264e-05, "loss": 0.0475, "step": 860 }, { "epoch": 0.5244338498212158, "grad_norm": 0.3189797103404999, "learning_rate": 3.924421350968352e-05, "loss": 0.0487, "step": 880 }, { "epoch": 0.5363528009535161, "grad_norm": 0.2827907204627991, "learning_rate": 3.9055266887104394e-05, "loss": 0.047, "step": 900 }, { "epoch": 0.5363528009535161, "eval_loss": 0.3409503102302551, "eval_runtime": 24.4692, "eval_samples_per_second": 114.43, "eval_steps_per_second": 1.798, "step": 900 }, { "epoch": 0.5482717520858165, "grad_norm": 0.29098019003868103, "learning_rate": 3.886632026452528e-05, "loss": 0.0477, "step": 920 }, { "epoch": 0.5601907032181168, "grad_norm": 0.34882616996765137, "learning_rate": 3.8677373641946155e-05, "loss": 0.0469, "step": 940 }, { "epoch": 0.5721096543504172, "grad_norm": 0.27143335342407227, "learning_rate": 3.848842701936703e-05, "loss": 0.0463, "step": 960 }, { "epoch": 0.5840286054827175, "grad_norm": 0.3700932562351227, "learning_rate": 3.829948039678791e-05, "loss": 0.0471, "step": 980 }, { "epoch": 0.5959475566150179, "grad_norm": 0.2656194567680359, "learning_rate": 3.811053377420879e-05, "loss": 0.0435, "step": 1000 }, { "epoch": 0.6078665077473182, "grad_norm": 0.3158736526966095, "learning_rate": 3.792158715162967e-05, "loss": 0.0456, "step": 1020 }, { "epoch": 0.6197854588796186, "grad_norm": 0.3060922920703888, "learning_rate": 3.7732640529050546e-05, "loss": 0.0464, "step": 1040 }, { "epoch": 0.6257449344457687, "eval_loss": 0.32704514265060425, "eval_runtime": 24.2942, "eval_samples_per_second": 115.254, "eval_steps_per_second": 1.811, "step": 1050 }, { "epoch": 0.6317044100119189, "grad_norm": 0.284631609916687, "learning_rate": 3.754369390647142e-05, "loss": 0.0436, "step": 1060 }, { "epoch": 0.6436233611442194, "grad_norm": 0.2965853810310364, "learning_rate": 3.7354747283892307e-05, "loss": 0.0451, "step": 1080 }, { "epoch": 0.6555423122765197, "grad_norm": 0.2642371952533722, "learning_rate": 3.7165800661313183e-05, "loss": 0.0447, "step": 1100 }, { "epoch": 0.6674612634088201, "grad_norm": 0.2913525402545929, "learning_rate": 3.697685403873406e-05, "loss": 0.0438, "step": 1120 }, { "epoch": 0.6793802145411204, "grad_norm": 0.9295551776885986, "learning_rate": 3.678790741615494e-05, "loss": 0.0454, "step": 1140 }, { "epoch": 0.6912991656734208, "grad_norm": 0.2802795469760895, "learning_rate": 3.659896079357582e-05, "loss": 0.0437, "step": 1160 }, { "epoch": 0.7032181168057211, "grad_norm": 0.2768346965312958, "learning_rate": 3.64100141709967e-05, "loss": 0.0436, "step": 1180 }, { "epoch": 0.7151370679380215, "grad_norm": 0.32342103123664856, "learning_rate": 3.6221067548417575e-05, "loss": 0.0434, "step": 1200 }, { "epoch": 0.7151370679380215, "eval_loss": 0.3151066303253174, "eval_runtime": 24.3395, "eval_samples_per_second": 115.039, "eval_steps_per_second": 1.808, "step": 1200 }, { "epoch": 0.7270560190703218, "grad_norm": 0.28825289011001587, "learning_rate": 3.603212092583845e-05, "loss": 0.0444, "step": 1220 }, { "epoch": 0.7389749702026222, "grad_norm": 0.2973019778728485, "learning_rate": 3.5843174303259335e-05, "loss": 0.0432, "step": 1240 }, { "epoch": 0.7508939213349225, "grad_norm": 0.28604206442832947, "learning_rate": 3.565422768068021e-05, "loss": 0.0427, "step": 1260 }, { "epoch": 0.7628128724672228, "grad_norm": 0.3076627552509308, "learning_rate": 3.546528105810109e-05, "loss": 0.0437, "step": 1280 }, { "epoch": 0.7747318235995232, "grad_norm": 0.24243097007274628, "learning_rate": 3.5276334435521966e-05, "loss": 0.0419, "step": 1300 }, { "epoch": 0.7866507747318237, "grad_norm": 0.2624588906764984, "learning_rate": 3.508738781294285e-05, "loss": 0.041, "step": 1320 }, { "epoch": 0.798569725864124, "grad_norm": 0.2966582477092743, "learning_rate": 3.4898441190363726e-05, "loss": 0.0419, "step": 1340 }, { "epoch": 0.8045292014302742, "eval_loss": 0.30741235613822937, "eval_runtime": 24.3327, "eval_samples_per_second": 115.071, "eval_steps_per_second": 1.808, "step": 1350 }, { "epoch": 0.8104886769964244, "grad_norm": 0.2586236894130707, "learning_rate": 3.47094945677846e-05, "loss": 0.0417, "step": 1360 }, { "epoch": 0.8224076281287247, "grad_norm": 0.26168954372406006, "learning_rate": 3.452054794520548e-05, "loss": 0.0416, "step": 1380 }, { "epoch": 0.834326579261025, "grad_norm": 0.2288641631603241, "learning_rate": 3.4331601322626364e-05, "loss": 0.0407, "step": 1400 }, { "epoch": 0.8462455303933254, "grad_norm": 0.239312082529068, "learning_rate": 3.414265470004724e-05, "loss": 0.0424, "step": 1420 }, { "epoch": 0.8581644815256257, "grad_norm": 0.2796385884284973, "learning_rate": 3.395370807746812e-05, "loss": 0.0423, "step": 1440 }, { "epoch": 0.8700834326579261, "grad_norm": 0.26898354291915894, "learning_rate": 3.3764761454888994e-05, "loss": 0.0404, "step": 1460 }, { "epoch": 0.8820023837902264, "grad_norm": 0.2980300188064575, "learning_rate": 3.357581483230988e-05, "loss": 0.0413, "step": 1480 }, { "epoch": 0.8939213349225268, "grad_norm": 0.2609230577945709, "learning_rate": 3.3386868209730755e-05, "loss": 0.0417, "step": 1500 }, { "epoch": 0.8939213349225268, "eval_loss": 0.3016362488269806, "eval_runtime": 24.39, "eval_samples_per_second": 114.801, "eval_steps_per_second": 1.804, "step": 1500 }, { "epoch": 0.9058402860548271, "grad_norm": 0.22545361518859863, "learning_rate": 3.319792158715163e-05, "loss": 0.0402, "step": 1520 }, { "epoch": 0.9177592371871275, "grad_norm": 0.23952241241931915, "learning_rate": 3.300897496457251e-05, "loss": 0.0404, "step": 1540 }, { "epoch": 0.929678188319428, "grad_norm": 0.211136594414711, "learning_rate": 3.282002834199339e-05, "loss": 0.0394, "step": 1560 }, { "epoch": 0.9415971394517283, "grad_norm": 0.2786746323108673, "learning_rate": 3.263108171941427e-05, "loss": 0.0405, "step": 1580 }, { "epoch": 0.9535160905840286, "grad_norm": 0.27551010251045227, "learning_rate": 3.2442135096835146e-05, "loss": 0.0393, "step": 1600 }, { "epoch": 0.965435041716329, "grad_norm": 0.23502199351787567, "learning_rate": 3.225318847425602e-05, "loss": 0.0397, "step": 1620 }, { "epoch": 0.9773539928486293, "grad_norm": 0.22266072034835815, "learning_rate": 3.2064241851676906e-05, "loss": 0.0382, "step": 1640 }, { "epoch": 0.9833134684147795, "eval_loss": 0.2927956283092499, "eval_runtime": 24.5022, "eval_samples_per_second": 114.276, "eval_steps_per_second": 1.796, "step": 1650 }, { "epoch": 0.9892729439809297, "grad_norm": 0.2455359548330307, "learning_rate": 3.187529522909778e-05, "loss": 0.0377, "step": 1660 }, { "epoch": 1.0011918951132301, "grad_norm": 0.19770461320877075, "learning_rate": 3.168634860651866e-05, "loss": 0.0394, "step": 1680 }, { "epoch": 1.0131108462455305, "grad_norm": 0.23039540648460388, "learning_rate": 3.149740198393954e-05, "loss": 0.0306, "step": 1700 }, { "epoch": 1.0250297973778308, "grad_norm": 0.22617976367473602, "learning_rate": 3.130845536136042e-05, "loss": 0.0302, "step": 1720 }, { "epoch": 1.0369487485101312, "grad_norm": 0.2306171953678131, "learning_rate": 3.11195087387813e-05, "loss": 0.0297, "step": 1740 }, { "epoch": 1.0488676996424315, "grad_norm": 0.2231408953666687, "learning_rate": 3.0930562116202174e-05, "loss": 0.0291, "step": 1760 }, { "epoch": 1.0607866507747319, "grad_norm": 0.2368210107088089, "learning_rate": 3.074161549362305e-05, "loss": 0.0296, "step": 1780 }, { "epoch": 1.0727056019070322, "grad_norm": 0.2105601727962494, "learning_rate": 3.0552668871043935e-05, "loss": 0.0302, "step": 1800 }, { "epoch": 1.0727056019070322, "eval_loss": 0.2897118330001831, "eval_runtime": 24.5143, "eval_samples_per_second": 114.219, "eval_steps_per_second": 1.795, "step": 1800 }, { "epoch": 1.0846245530393326, "grad_norm": 0.21115803718566895, "learning_rate": 3.036372224846481e-05, "loss": 0.03, "step": 1820 }, { "epoch": 1.096543504171633, "grad_norm": 0.21346606314182281, "learning_rate": 3.017477562588569e-05, "loss": 0.0303, "step": 1840 }, { "epoch": 1.1084624553039333, "grad_norm": 0.2169450968503952, "learning_rate": 2.998582900330657e-05, "loss": 0.0301, "step": 1860 }, { "epoch": 1.1203814064362336, "grad_norm": 0.22653773427009583, "learning_rate": 2.979688238072745e-05, "loss": 0.03, "step": 1880 }, { "epoch": 1.132300357568534, "grad_norm": 0.23219868540763855, "learning_rate": 2.9607935758148326e-05, "loss": 0.0299, "step": 1900 }, { "epoch": 1.1442193087008343, "grad_norm": 0.19858673214912415, "learning_rate": 2.9418989135569203e-05, "loss": 0.0304, "step": 1920 }, { "epoch": 1.1561382598331347, "grad_norm": 0.20563630759716034, "learning_rate": 2.9230042512990083e-05, "loss": 0.0298, "step": 1940 }, { "epoch": 1.162097735399285, "eval_loss": 0.2881970703601837, "eval_runtime": 24.482, "eval_samples_per_second": 114.37, "eval_steps_per_second": 1.797, "step": 1950 }, { "epoch": 1.168057210965435, "grad_norm": 0.23171384632587433, "learning_rate": 2.9041095890410963e-05, "loss": 0.0305, "step": 1960 }, { "epoch": 1.1799761620977354, "grad_norm": 0.22927935421466827, "learning_rate": 2.885214926783184e-05, "loss": 0.0293, "step": 1980 }, { "epoch": 1.1918951132300357, "grad_norm": 0.22874480485916138, "learning_rate": 2.8663202645252717e-05, "loss": 0.0306, "step": 2000 }, { "epoch": 1.203814064362336, "grad_norm": 0.2067805975675583, "learning_rate": 2.8474256022673597e-05, "loss": 0.0301, "step": 2020 }, { "epoch": 1.2157330154946364, "grad_norm": 0.2245558649301529, "learning_rate": 2.8285309400094477e-05, "loss": 0.032, "step": 2040 }, { "epoch": 1.2276519666269368, "grad_norm": 0.2403639554977417, "learning_rate": 2.8096362777515354e-05, "loss": 0.0296, "step": 2060 }, { "epoch": 1.2395709177592371, "grad_norm": 0.2070910781621933, "learning_rate": 2.790741615493623e-05, "loss": 0.03, "step": 2080 }, { "epoch": 1.2514898688915377, "grad_norm": 0.23112566769123077, "learning_rate": 2.771846953235711e-05, "loss": 0.0295, "step": 2100 }, { "epoch": 1.2514898688915377, "eval_loss": 0.2860635817050934, "eval_runtime": 24.4598, "eval_samples_per_second": 114.473, "eval_steps_per_second": 1.799, "step": 2100 }, { "epoch": 1.2634088200238378, "grad_norm": 0.2321086972951889, "learning_rate": 2.7529522909777992e-05, "loss": 0.0307, "step": 2120 }, { "epoch": 1.2753277711561384, "grad_norm": 0.23003174364566803, "learning_rate": 2.734057628719887e-05, "loss": 0.0306, "step": 2140 }, { "epoch": 1.2872467222884385, "grad_norm": 0.2210853099822998, "learning_rate": 2.7151629664619745e-05, "loss": 0.0289, "step": 2160 }, { "epoch": 1.299165673420739, "grad_norm": 0.2513985335826874, "learning_rate": 2.6962683042040626e-05, "loss": 0.0298, "step": 2180 }, { "epoch": 1.3110846245530392, "grad_norm": 0.2379380464553833, "learning_rate": 2.6773736419461506e-05, "loss": 0.0301, "step": 2200 }, { "epoch": 1.3230035756853398, "grad_norm": 0.21417196094989777, "learning_rate": 2.6584789796882383e-05, "loss": 0.0301, "step": 2220 }, { "epoch": 1.3349225268176401, "grad_norm": 0.22283975780010223, "learning_rate": 2.639584317430326e-05, "loss": 0.0309, "step": 2240 }, { "epoch": 1.3408820023837902, "eval_loss": 0.2835468649864197, "eval_runtime": 24.4862, "eval_samples_per_second": 114.35, "eval_steps_per_second": 1.797, "step": 2250 }, { "epoch": 1.3468414779499405, "grad_norm": 0.21475766599178314, "learning_rate": 2.620689655172414e-05, "loss": 0.0296, "step": 2260 }, { "epoch": 1.3587604290822408, "grad_norm": 0.2578884959220886, "learning_rate": 2.601794992914502e-05, "loss": 0.0285, "step": 2280 }, { "epoch": 1.3706793802145412, "grad_norm": 0.208306223154068, "learning_rate": 2.5829003306565897e-05, "loss": 0.0296, "step": 2300 }, { "epoch": 1.3825983313468415, "grad_norm": 0.20098654925823212, "learning_rate": 2.5640056683986774e-05, "loss": 0.0299, "step": 2320 }, { "epoch": 1.3945172824791419, "grad_norm": 0.19324277341365814, "learning_rate": 2.5451110061407654e-05, "loss": 0.0299, "step": 2340 }, { "epoch": 1.4064362336114422, "grad_norm": 0.22991597652435303, "learning_rate": 2.5262163438828534e-05, "loss": 0.0283, "step": 2360 }, { "epoch": 1.4183551847437426, "grad_norm": 0.20805244147777557, "learning_rate": 2.507321681624941e-05, "loss": 0.0294, "step": 2380 }, { "epoch": 1.430274135876043, "grad_norm": 0.2217872589826584, "learning_rate": 2.488427019367029e-05, "loss": 0.0298, "step": 2400 }, { "epoch": 1.430274135876043, "eval_loss": 0.28145191073417664, "eval_runtime": 24.4822, "eval_samples_per_second": 114.369, "eval_steps_per_second": 1.797, "step": 2400 }, { "epoch": 1.4421930870083433, "grad_norm": 0.2886907756328583, "learning_rate": 2.469532357109117e-05, "loss": 0.0286, "step": 2420 }, { "epoch": 1.4541120381406436, "grad_norm": 0.22498397529125214, "learning_rate": 2.450637694851205e-05, "loss": 0.0303, "step": 2440 }, { "epoch": 1.466030989272944, "grad_norm": 0.2244011014699936, "learning_rate": 2.4317430325932926e-05, "loss": 0.0291, "step": 2460 }, { "epoch": 1.4779499404052443, "grad_norm": 0.254245400428772, "learning_rate": 2.4128483703353806e-05, "loss": 0.0291, "step": 2480 }, { "epoch": 1.4898688915375446, "grad_norm": 0.23943567276000977, "learning_rate": 2.3939537080774683e-05, "loss": 0.0279, "step": 2500 }, { "epoch": 1.5017878426698452, "grad_norm": 0.19281241297721863, "learning_rate": 2.3750590458195563e-05, "loss": 0.0284, "step": 2520 }, { "epoch": 1.5137067938021453, "grad_norm": 0.1942477971315384, "learning_rate": 2.356164383561644e-05, "loss": 0.0286, "step": 2540 }, { "epoch": 1.5196662693682956, "eval_loss": 0.2797168493270874, "eval_runtime": 24.5526, "eval_samples_per_second": 114.041, "eval_steps_per_second": 1.792, "step": 2550 }, { "epoch": 1.525625744934446, "grad_norm": 0.21547091007232666, "learning_rate": 2.337269721303732e-05, "loss": 0.0299, "step": 2560 }, { "epoch": 1.537544696066746, "grad_norm": 0.2152203619480133, "learning_rate": 2.3183750590458197e-05, "loss": 0.0285, "step": 2580 }, { "epoch": 1.5494636471990466, "grad_norm": 0.19925089180469513, "learning_rate": 2.2994803967879077e-05, "loss": 0.0282, "step": 2600 }, { "epoch": 1.5613825983313467, "grad_norm": 0.21512266993522644, "learning_rate": 2.2805857345299954e-05, "loss": 0.0291, "step": 2620 }, { "epoch": 1.5733015494636473, "grad_norm": 0.2275344282388687, "learning_rate": 2.2616910722720834e-05, "loss": 0.0288, "step": 2640 }, { "epoch": 1.5852205005959474, "grad_norm": 0.22899560630321503, "learning_rate": 2.242796410014171e-05, "loss": 0.0279, "step": 2660 }, { "epoch": 1.597139451728248, "grad_norm": 0.21439722180366516, "learning_rate": 2.223901747756259e-05, "loss": 0.0273, "step": 2680 }, { "epoch": 1.6090584028605481, "grad_norm": 0.21393275260925293, "learning_rate": 2.205007085498347e-05, "loss": 0.0284, "step": 2700 }, { "epoch": 1.6090584028605481, "eval_loss": 0.27715668082237244, "eval_runtime": 24.544, "eval_samples_per_second": 114.081, "eval_steps_per_second": 1.793, "step": 2700 }, { "epoch": 1.6209773539928487, "grad_norm": 0.22286593914031982, "learning_rate": 2.186112423240435e-05, "loss": 0.028, "step": 2720 }, { "epoch": 1.6328963051251488, "grad_norm": 0.21552002429962158, "learning_rate": 2.1672177609825225e-05, "loss": 0.0282, "step": 2740 }, { "epoch": 1.6448152562574494, "grad_norm": 0.20231053233146667, "learning_rate": 2.1483230987246106e-05, "loss": 0.0278, "step": 2760 }, { "epoch": 1.6567342073897497, "grad_norm": 0.20446668565273285, "learning_rate": 2.1294284364666983e-05, "loss": 0.0283, "step": 2780 }, { "epoch": 1.66865315852205, "grad_norm": 0.21102942526340485, "learning_rate": 2.1105337742087863e-05, "loss": 0.029, "step": 2800 }, { "epoch": 1.6805721096543504, "grad_norm": 0.190469890832901, "learning_rate": 2.091639111950874e-05, "loss": 0.0286, "step": 2820 }, { "epoch": 1.6924910607866508, "grad_norm": 0.2115412801504135, "learning_rate": 2.072744449692962e-05, "loss": 0.0278, "step": 2840 }, { "epoch": 1.698450536352801, "eval_loss": 0.2754322588443756, "eval_runtime": 24.6377, "eval_samples_per_second": 113.647, "eval_steps_per_second": 1.786, "step": 2850 }, { "epoch": 1.7044100119189511, "grad_norm": 0.18578040599822998, "learning_rate": 2.0538497874350497e-05, "loss": 0.0284, "step": 2860 }, { "epoch": 1.7163289630512515, "grad_norm": 0.23163530230522156, "learning_rate": 2.0349551251771377e-05, "loss": 0.0277, "step": 2880 }, { "epoch": 1.7282479141835518, "grad_norm": 0.2035950869321823, "learning_rate": 2.0160604629192254e-05, "loss": 0.0275, "step": 2900 }, { "epoch": 1.7401668653158522, "grad_norm": 0.24992471933364868, "learning_rate": 1.9971658006613134e-05, "loss": 0.028, "step": 2920 }, { "epoch": 1.7520858164481525, "grad_norm": 0.20510628819465637, "learning_rate": 1.978271138403401e-05, "loss": 0.027, "step": 2940 }, { "epoch": 1.7640047675804529, "grad_norm": 0.216608926653862, "learning_rate": 1.959376476145489e-05, "loss": 0.0283, "step": 2960 }, { "epoch": 1.7759237187127532, "grad_norm": 0.20540915429592133, "learning_rate": 1.9404818138875768e-05, "loss": 0.0274, "step": 2980 }, { "epoch": 1.7878426698450536, "grad_norm": 0.20191486179828644, "learning_rate": 1.921587151629665e-05, "loss": 0.0275, "step": 3000 }, { "epoch": 1.7878426698450536, "eval_loss": 0.27235740423202515, "eval_runtime": 24.612, "eval_samples_per_second": 113.766, "eval_steps_per_second": 1.788, "step": 3000 }, { "epoch": 1.7997616209773541, "grad_norm": 0.24276046454906464, "learning_rate": 1.9026924893717525e-05, "loss": 0.028, "step": 3020 }, { "epoch": 1.8116805721096543, "grad_norm": 0.21864300966262817, "learning_rate": 1.8837978271138406e-05, "loss": 0.0279, "step": 3040 }, { "epoch": 1.8235995232419548, "grad_norm": 0.2271438091993332, "learning_rate": 1.8649031648559282e-05, "loss": 0.0276, "step": 3060 }, { "epoch": 1.835518474374255, "grad_norm": 0.20855475962162018, "learning_rate": 1.8460085025980163e-05, "loss": 0.0283, "step": 3080 }, { "epoch": 1.8474374255065555, "grad_norm": 0.2064722329378128, "learning_rate": 1.827113840340104e-05, "loss": 0.0271, "step": 3100 }, { "epoch": 1.8593563766388557, "grad_norm": 0.2111743986606598, "learning_rate": 1.808219178082192e-05, "loss": 0.0273, "step": 3120 }, { "epoch": 1.8712753277711562, "grad_norm": 0.18417872488498688, "learning_rate": 1.7893245158242797e-05, "loss": 0.0275, "step": 3140 }, { "epoch": 1.8772348033373063, "eval_loss": 0.2706995904445648, "eval_runtime": 24.5215, "eval_samples_per_second": 114.186, "eval_steps_per_second": 1.794, "step": 3150 }, { "epoch": 1.8831942789034564, "grad_norm": 0.19550646841526031, "learning_rate": 1.7704298535663677e-05, "loss": 0.0275, "step": 3160 }, { "epoch": 1.895113230035757, "grad_norm": 0.19912759959697723, "learning_rate": 1.7515351913084554e-05, "loss": 0.0267, "step": 3180 }, { "epoch": 1.907032181168057, "grad_norm": 0.20110997557640076, "learning_rate": 1.7326405290505434e-05, "loss": 0.0281, "step": 3200 }, { "epoch": 1.9189511323003576, "grad_norm": 0.20968247950077057, "learning_rate": 1.713745866792631e-05, "loss": 0.0271, "step": 3220 }, { "epoch": 1.930870083432658, "grad_norm": 0.20576246082782745, "learning_rate": 1.694851204534719e-05, "loss": 0.0277, "step": 3240 }, { "epoch": 1.9427890345649583, "grad_norm": 0.21499717235565186, "learning_rate": 1.6759565422768068e-05, "loss": 0.0264, "step": 3260 }, { "epoch": 1.9547079856972587, "grad_norm": 0.23822776973247528, "learning_rate": 1.6570618800188948e-05, "loss": 0.027, "step": 3280 }, { "epoch": 1.966626936829559, "grad_norm": 0.21049529314041138, "learning_rate": 1.6381672177609825e-05, "loss": 0.0259, "step": 3300 }, { "epoch": 1.966626936829559, "eval_loss": 0.2697894275188446, "eval_runtime": 24.6235, "eval_samples_per_second": 113.713, "eval_steps_per_second": 1.787, "step": 3300 } ], "logging_steps": 20, "max_steps": 5034, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2189847399207797e+20, "train_batch_size": 16, "trial_name": null, "trial_params": null }