{
"best_metric": 0.2697894275188446,
"best_model_checkpoint": "/mlspeech/data/yoadsnapir/models/heb_small_exp_1/checkpoint-3300",
"epoch": 1.966626936829559,
"eval_steps": 150,
"global_step": 3300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011918951132300357,
"grad_norm": 0.46440812945365906,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0799,
"step": 20
},
{
"epoch": 0.023837902264600714,
"grad_norm": 0.3614272475242615,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0757,
"step": 40
},
{
"epoch": 0.03575685339690107,
"grad_norm": 0.3306252658367157,
"learning_rate": 3e-06,
"loss": 0.0707,
"step": 60
},
{
"epoch": 0.04767580452920143,
"grad_norm": 0.3750213384628296,
"learning_rate": 4.000000000000001e-06,
"loss": 0.0707,
"step": 80
},
{
"epoch": 0.05959475566150179,
"grad_norm": 0.3395947515964508,
"learning_rate": 5e-06,
"loss": 0.0677,
"step": 100
},
{
"epoch": 0.07151370679380215,
"grad_norm": 0.3402997553348541,
"learning_rate": 6e-06,
"loss": 0.0651,
"step": 120
},
{
"epoch": 0.08343265792610251,
"grad_norm": 0.36161497235298157,
"learning_rate": 7e-06,
"loss": 0.0662,
"step": 140
},
{
"epoch": 0.08939213349225268,
"eval_loss": 0.4555220901966095,
"eval_runtime": 32.3927,
"eval_samples_per_second": 86.439,
"eval_steps_per_second": 1.358,
"step": 150
},
{
"epoch": 0.09535160905840286,
"grad_norm": 0.35821646451950073,
"learning_rate": 8.000000000000001e-06,
"loss": 0.0659,
"step": 160
},
{
"epoch": 0.10727056019070322,
"grad_norm": 0.3810754716396332,
"learning_rate": 9e-06,
"loss": 0.0632,
"step": 180
},
{
"epoch": 0.11918951132300358,
"grad_norm": 0.35852399468421936,
"learning_rate": 1e-05,
"loss": 0.0636,
"step": 200
},
{
"epoch": 0.13110846245530394,
"grad_norm": 0.33330053091049194,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.0632,
"step": 220
},
{
"epoch": 0.1430274135876043,
"grad_norm": 0.3282977044582367,
"learning_rate": 1.2e-05,
"loss": 0.0617,
"step": 240
},
{
"epoch": 0.15494636471990464,
"grad_norm": 0.32969963550567627,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.0609,
"step": 260
},
{
"epoch": 0.16686531585220502,
"grad_norm": 0.33057352900505066,
"learning_rate": 1.4e-05,
"loss": 0.0606,
"step": 280
},
{
"epoch": 0.17878426698450536,
"grad_norm": 0.3497098982334137,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.0598,
"step": 300
},
{
"epoch": 0.17878426698450536,
"eval_loss": 0.4085025489330292,
"eval_runtime": 24.3982,
"eval_samples_per_second": 114.763,
"eval_steps_per_second": 1.803,
"step": 300
},
{
"epoch": 0.1907032181168057,
"grad_norm": 0.34013646841049194,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.0586,
"step": 320
},
{
"epoch": 0.2026221692491061,
"grad_norm": 0.3030904233455658,
"learning_rate": 1.7e-05,
"loss": 0.0588,
"step": 340
},
{
"epoch": 0.21454112038140644,
"grad_norm": 0.3328603208065033,
"learning_rate": 1.8e-05,
"loss": 0.0562,
"step": 360
},
{
"epoch": 0.22646007151370678,
"grad_norm": 0.32149162888526917,
"learning_rate": 1.9e-05,
"loss": 0.0559,
"step": 380
},
{
"epoch": 0.23837902264600716,
"grad_norm": 0.2991398572921753,
"learning_rate": 2e-05,
"loss": 0.0553,
"step": 400
},
{
"epoch": 0.25029797377830754,
"grad_norm": 0.30644193291664124,
"learning_rate": 2.1000000000000002e-05,
"loss": 0.0556,
"step": 420
},
{
"epoch": 0.2622169249106079,
"grad_norm": 0.357546329498291,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.0546,
"step": 440
},
{
"epoch": 0.26817640047675806,
"eval_loss": 0.38544225692749023,
"eval_runtime": 24.3319,
"eval_samples_per_second": 115.075,
"eval_steps_per_second": 1.808,
"step": 450
},
{
"epoch": 0.27413587604290823,
"grad_norm": 0.3578217923641205,
"learning_rate": 2.3e-05,
"loss": 0.054,
"step": 460
},
{
"epoch": 0.2860548271752086,
"grad_norm": 0.34381797909736633,
"learning_rate": 2.4e-05,
"loss": 0.0553,
"step": 480
},
{
"epoch": 0.29797377830750893,
"grad_norm": 0.30176016688346863,
"learning_rate": 2.5e-05,
"loss": 0.0543,
"step": 500
},
{
"epoch": 0.3098927294398093,
"grad_norm": 0.33324921131134033,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.0544,
"step": 520
},
{
"epoch": 0.3218116805721097,
"grad_norm": 0.3484705090522766,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.0531,
"step": 540
},
{
"epoch": 0.33373063170441003,
"grad_norm": 0.3282860219478607,
"learning_rate": 2.8e-05,
"loss": 0.0518,
"step": 560
},
{
"epoch": 0.3456495828367104,
"grad_norm": 0.3473268449306488,
"learning_rate": 2.9e-05,
"loss": 0.0515,
"step": 580
},
{
"epoch": 0.3575685339690107,
"grad_norm": 0.3255419433116913,
"learning_rate": 3.0000000000000004e-05,
"loss": 0.0526,
"step": 600
},
{
"epoch": 0.3575685339690107,
"eval_loss": 0.36782801151275635,
"eval_runtime": 24.3536,
"eval_samples_per_second": 114.973,
"eval_steps_per_second": 1.807,
"step": 600
},
{
"epoch": 0.3694874851013111,
"grad_norm": 0.3105810880661011,
"learning_rate": 3.1e-05,
"loss": 0.0505,
"step": 620
},
{
"epoch": 0.3814064362336114,
"grad_norm": 0.3272826373577118,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.0514,
"step": 640
},
{
"epoch": 0.3933253873659118,
"grad_norm": 0.2896602749824524,
"learning_rate": 3.3e-05,
"loss": 0.0509,
"step": 660
},
{
"epoch": 0.4052443384982122,
"grad_norm": 0.2664831876754761,
"learning_rate": 3.4e-05,
"loss": 0.0508,
"step": 680
},
{
"epoch": 0.4171632896305125,
"grad_norm": 0.29491451382637024,
"learning_rate": 3.5000000000000004e-05,
"loss": 0.0503,
"step": 700
},
{
"epoch": 0.42908224076281287,
"grad_norm": 0.3337947726249695,
"learning_rate": 3.6e-05,
"loss": 0.0508,
"step": 720
},
{
"epoch": 0.4410011918951132,
"grad_norm": 0.274883508682251,
"learning_rate": 3.7000000000000005e-05,
"loss": 0.0491,
"step": 740
},
{
"epoch": 0.4469606674612634,
"eval_loss": 0.3556332290172577,
"eval_runtime": 24.3629,
"eval_samples_per_second": 114.929,
"eval_steps_per_second": 1.806,
"step": 750
},
{
"epoch": 0.45292014302741357,
"grad_norm": 0.3304573595523834,
"learning_rate": 3.8e-05,
"loss": 0.0496,
"step": 760
},
{
"epoch": 0.464839094159714,
"grad_norm": 0.43425601720809937,
"learning_rate": 3.9e-05,
"loss": 0.0514,
"step": 780
},
{
"epoch": 0.4767580452920143,
"grad_norm": 0.27023833990097046,
"learning_rate": 4e-05,
"loss": 0.0493,
"step": 800
},
{
"epoch": 0.48867699642431467,
"grad_norm": 0.30646392703056335,
"learning_rate": 3.981105337742088e-05,
"loss": 0.0479,
"step": 820
},
{
"epoch": 0.5005959475566151,
"grad_norm": 0.2886641025543213,
"learning_rate": 3.9622106754841764e-05,
"loss": 0.0477,
"step": 840
},
{
"epoch": 0.5125148986889154,
"grad_norm": 0.2744874358177185,
"learning_rate": 3.943316013226264e-05,
"loss": 0.0475,
"step": 860
},
{
"epoch": 0.5244338498212158,
"grad_norm": 0.3189797103404999,
"learning_rate": 3.924421350968352e-05,
"loss": 0.0487,
"step": 880
},
{
"epoch": 0.5363528009535161,
"grad_norm": 0.2827907204627991,
"learning_rate": 3.9055266887104394e-05,
"loss": 0.047,
"step": 900
},
{
"epoch": 0.5363528009535161,
"eval_loss": 0.3409503102302551,
"eval_runtime": 24.4692,
"eval_samples_per_second": 114.43,
"eval_steps_per_second": 1.798,
"step": 900
},
{
"epoch": 0.5482717520858165,
"grad_norm": 0.29098019003868103,
"learning_rate": 3.886632026452528e-05,
"loss": 0.0477,
"step": 920
},
{
"epoch": 0.5601907032181168,
"grad_norm": 0.34882616996765137,
"learning_rate": 3.8677373641946155e-05,
"loss": 0.0469,
"step": 940
},
{
"epoch": 0.5721096543504172,
"grad_norm": 0.27143335342407227,
"learning_rate": 3.848842701936703e-05,
"loss": 0.0463,
"step": 960
},
{
"epoch": 0.5840286054827175,
"grad_norm": 0.3700932562351227,
"learning_rate": 3.829948039678791e-05,
"loss": 0.0471,
"step": 980
},
{
"epoch": 0.5959475566150179,
"grad_norm": 0.2656194567680359,
"learning_rate": 3.811053377420879e-05,
"loss": 0.0435,
"step": 1000
},
{
"epoch": 0.6078665077473182,
"grad_norm": 0.3158736526966095,
"learning_rate": 3.792158715162967e-05,
"loss": 0.0456,
"step": 1020
},
{
"epoch": 0.6197854588796186,
"grad_norm": 0.3060922920703888,
"learning_rate": 3.7732640529050546e-05,
"loss": 0.0464,
"step": 1040
},
{
"epoch": 0.6257449344457687,
"eval_loss": 0.32704514265060425,
"eval_runtime": 24.2942,
"eval_samples_per_second": 115.254,
"eval_steps_per_second": 1.811,
"step": 1050
},
{
"epoch": 0.6317044100119189,
"grad_norm": 0.284631609916687,
"learning_rate": 3.754369390647142e-05,
"loss": 0.0436,
"step": 1060
},
{
"epoch": 0.6436233611442194,
"grad_norm": 0.2965853810310364,
"learning_rate": 3.7354747283892307e-05,
"loss": 0.0451,
"step": 1080
},
{
"epoch": 0.6555423122765197,
"grad_norm": 0.2642371952533722,
"learning_rate": 3.7165800661313183e-05,
"loss": 0.0447,
"step": 1100
},
{
"epoch": 0.6674612634088201,
"grad_norm": 0.2913525402545929,
"learning_rate": 3.697685403873406e-05,
"loss": 0.0438,
"step": 1120
},
{
"epoch": 0.6793802145411204,
"grad_norm": 0.9295551776885986,
"learning_rate": 3.678790741615494e-05,
"loss": 0.0454,
"step": 1140
},
{
"epoch": 0.6912991656734208,
"grad_norm": 0.2802795469760895,
"learning_rate": 3.659896079357582e-05,
"loss": 0.0437,
"step": 1160
},
{
"epoch": 0.7032181168057211,
"grad_norm": 0.2768346965312958,
"learning_rate": 3.64100141709967e-05,
"loss": 0.0436,
"step": 1180
},
{
"epoch": 0.7151370679380215,
"grad_norm": 0.32342103123664856,
"learning_rate": 3.6221067548417575e-05,
"loss": 0.0434,
"step": 1200
},
{
"epoch": 0.7151370679380215,
"eval_loss": 0.3151066303253174,
"eval_runtime": 24.3395,
"eval_samples_per_second": 115.039,
"eval_steps_per_second": 1.808,
"step": 1200
},
{
"epoch": 0.7270560190703218,
"grad_norm": 0.28825289011001587,
"learning_rate": 3.603212092583845e-05,
"loss": 0.0444,
"step": 1220
},
{
"epoch": 0.7389749702026222,
"grad_norm": 0.2973019778728485,
"learning_rate": 3.5843174303259335e-05,
"loss": 0.0432,
"step": 1240
},
{
"epoch": 0.7508939213349225,
"grad_norm": 0.28604206442832947,
"learning_rate": 3.565422768068021e-05,
"loss": 0.0427,
"step": 1260
},
{
"epoch": 0.7628128724672228,
"grad_norm": 0.3076627552509308,
"learning_rate": 3.546528105810109e-05,
"loss": 0.0437,
"step": 1280
},
{
"epoch": 0.7747318235995232,
"grad_norm": 0.24243097007274628,
"learning_rate": 3.5276334435521966e-05,
"loss": 0.0419,
"step": 1300
},
{
"epoch": 0.7866507747318237,
"grad_norm": 0.2624588906764984,
"learning_rate": 3.508738781294285e-05,
"loss": 0.041,
"step": 1320
},
{
"epoch": 0.798569725864124,
"grad_norm": 0.2966582477092743,
"learning_rate": 3.4898441190363726e-05,
"loss": 0.0419,
"step": 1340
},
{
"epoch": 0.8045292014302742,
"eval_loss": 0.30741235613822937,
"eval_runtime": 24.3327,
"eval_samples_per_second": 115.071,
"eval_steps_per_second": 1.808,
"step": 1350
},
{
"epoch": 0.8104886769964244,
"grad_norm": 0.2586236894130707,
"learning_rate": 3.47094945677846e-05,
"loss": 0.0417,
"step": 1360
},
{
"epoch": 0.8224076281287247,
"grad_norm": 0.26168954372406006,
"learning_rate": 3.452054794520548e-05,
"loss": 0.0416,
"step": 1380
},
{
"epoch": 0.834326579261025,
"grad_norm": 0.2288641631603241,
"learning_rate": 3.4331601322626364e-05,
"loss": 0.0407,
"step": 1400
},
{
"epoch": 0.8462455303933254,
"grad_norm": 0.239312082529068,
"learning_rate": 3.414265470004724e-05,
"loss": 0.0424,
"step": 1420
},
{
"epoch": 0.8581644815256257,
"grad_norm": 0.2796385884284973,
"learning_rate": 3.395370807746812e-05,
"loss": 0.0423,
"step": 1440
},
{
"epoch": 0.8700834326579261,
"grad_norm": 0.26898354291915894,
"learning_rate": 3.3764761454888994e-05,
"loss": 0.0404,
"step": 1460
},
{
"epoch": 0.8820023837902264,
"grad_norm": 0.2980300188064575,
"learning_rate": 3.357581483230988e-05,
"loss": 0.0413,
"step": 1480
},
{
"epoch": 0.8939213349225268,
"grad_norm": 0.2609230577945709,
"learning_rate": 3.3386868209730755e-05,
"loss": 0.0417,
"step": 1500
},
{
"epoch": 0.8939213349225268,
"eval_loss": 0.3016362488269806,
"eval_runtime": 24.39,
"eval_samples_per_second": 114.801,
"eval_steps_per_second": 1.804,
"step": 1500
},
{
"epoch": 0.9058402860548271,
"grad_norm": 0.22545361518859863,
"learning_rate": 3.319792158715163e-05,
"loss": 0.0402,
"step": 1520
},
{
"epoch": 0.9177592371871275,
"grad_norm": 0.23952241241931915,
"learning_rate": 3.300897496457251e-05,
"loss": 0.0404,
"step": 1540
},
{
"epoch": 0.929678188319428,
"grad_norm": 0.211136594414711,
"learning_rate": 3.282002834199339e-05,
"loss": 0.0394,
"step": 1560
},
{
"epoch": 0.9415971394517283,
"grad_norm": 0.2786746323108673,
"learning_rate": 3.263108171941427e-05,
"loss": 0.0405,
"step": 1580
},
{
"epoch": 0.9535160905840286,
"grad_norm": 0.27551010251045227,
"learning_rate": 3.2442135096835146e-05,
"loss": 0.0393,
"step": 1600
},
{
"epoch": 0.965435041716329,
"grad_norm": 0.23502199351787567,
"learning_rate": 3.225318847425602e-05,
"loss": 0.0397,
"step": 1620
},
{
"epoch": 0.9773539928486293,
"grad_norm": 0.22266072034835815,
"learning_rate": 3.2064241851676906e-05,
"loss": 0.0382,
"step": 1640
},
{
"epoch": 0.9833134684147795,
"eval_loss": 0.2927956283092499,
"eval_runtime": 24.5022,
"eval_samples_per_second": 114.276,
"eval_steps_per_second": 1.796,
"step": 1650
},
{
"epoch": 0.9892729439809297,
"grad_norm": 0.2455359548330307,
"learning_rate": 3.187529522909778e-05,
"loss": 0.0377,
"step": 1660
},
{
"epoch": 1.0011918951132301,
"grad_norm": 0.19770461320877075,
"learning_rate": 3.168634860651866e-05,
"loss": 0.0394,
"step": 1680
},
{
"epoch": 1.0131108462455305,
"grad_norm": 0.23039540648460388,
"learning_rate": 3.149740198393954e-05,
"loss": 0.0306,
"step": 1700
},
{
"epoch": 1.0250297973778308,
"grad_norm": 0.22617976367473602,
"learning_rate": 3.130845536136042e-05,
"loss": 0.0302,
"step": 1720
},
{
"epoch": 1.0369487485101312,
"grad_norm": 0.2306171953678131,
"learning_rate": 3.11195087387813e-05,
"loss": 0.0297,
"step": 1740
},
{
"epoch": 1.0488676996424315,
"grad_norm": 0.2231408953666687,
"learning_rate": 3.0930562116202174e-05,
"loss": 0.0291,
"step": 1760
},
{
"epoch": 1.0607866507747319,
"grad_norm": 0.2368210107088089,
"learning_rate": 3.074161549362305e-05,
"loss": 0.0296,
"step": 1780
},
{
"epoch": 1.0727056019070322,
"grad_norm": 0.2105601727962494,
"learning_rate": 3.0552668871043935e-05,
"loss": 0.0302,
"step": 1800
},
{
"epoch": 1.0727056019070322,
"eval_loss": 0.2897118330001831,
"eval_runtime": 24.5143,
"eval_samples_per_second": 114.219,
"eval_steps_per_second": 1.795,
"step": 1800
},
{
"epoch": 1.0846245530393326,
"grad_norm": 0.21115803718566895,
"learning_rate": 3.036372224846481e-05,
"loss": 0.03,
"step": 1820
},
{
"epoch": 1.096543504171633,
"grad_norm": 0.21346606314182281,
"learning_rate": 3.017477562588569e-05,
"loss": 0.0303,
"step": 1840
},
{
"epoch": 1.1084624553039333,
"grad_norm": 0.2169450968503952,
"learning_rate": 2.998582900330657e-05,
"loss": 0.0301,
"step": 1860
},
{
"epoch": 1.1203814064362336,
"grad_norm": 0.22653773427009583,
"learning_rate": 2.979688238072745e-05,
"loss": 0.03,
"step": 1880
},
{
"epoch": 1.132300357568534,
"grad_norm": 0.23219868540763855,
"learning_rate": 2.9607935758148326e-05,
"loss": 0.0299,
"step": 1900
},
{
"epoch": 1.1442193087008343,
"grad_norm": 0.19858673214912415,
"learning_rate": 2.9418989135569203e-05,
"loss": 0.0304,
"step": 1920
},
{
"epoch": 1.1561382598331347,
"grad_norm": 0.20563630759716034,
"learning_rate": 2.9230042512990083e-05,
"loss": 0.0298,
"step": 1940
},
{
"epoch": 1.162097735399285,
"eval_loss": 0.2881970703601837,
"eval_runtime": 24.482,
"eval_samples_per_second": 114.37,
"eval_steps_per_second": 1.797,
"step": 1950
},
{
"epoch": 1.168057210965435,
"grad_norm": 0.23171384632587433,
"learning_rate": 2.9041095890410963e-05,
"loss": 0.0305,
"step": 1960
},
{
"epoch": 1.1799761620977354,
"grad_norm": 0.22927935421466827,
"learning_rate": 2.885214926783184e-05,
"loss": 0.0293,
"step": 1980
},
{
"epoch": 1.1918951132300357,
"grad_norm": 0.22874480485916138,
"learning_rate": 2.8663202645252717e-05,
"loss": 0.0306,
"step": 2000
},
{
"epoch": 1.203814064362336,
"grad_norm": 0.2067805975675583,
"learning_rate": 2.8474256022673597e-05,
"loss": 0.0301,
"step": 2020
},
{
"epoch": 1.2157330154946364,
"grad_norm": 0.2245558649301529,
"learning_rate": 2.8285309400094477e-05,
"loss": 0.032,
"step": 2040
},
{
"epoch": 1.2276519666269368,
"grad_norm": 0.2403639554977417,
"learning_rate": 2.8096362777515354e-05,
"loss": 0.0296,
"step": 2060
},
{
"epoch": 1.2395709177592371,
"grad_norm": 0.2070910781621933,
"learning_rate": 2.790741615493623e-05,
"loss": 0.03,
"step": 2080
},
{
"epoch": 1.2514898688915377,
"grad_norm": 0.23112566769123077,
"learning_rate": 2.771846953235711e-05,
"loss": 0.0295,
"step": 2100
},
{
"epoch": 1.2514898688915377,
"eval_loss": 0.2860635817050934,
"eval_runtime": 24.4598,
"eval_samples_per_second": 114.473,
"eval_steps_per_second": 1.799,
"step": 2100
},
{
"epoch": 1.2634088200238378,
"grad_norm": 0.2321086972951889,
"learning_rate": 2.7529522909777992e-05,
"loss": 0.0307,
"step": 2120
},
{
"epoch": 1.2753277711561384,
"grad_norm": 0.23003174364566803,
"learning_rate": 2.734057628719887e-05,
"loss": 0.0306,
"step": 2140
},
{
"epoch": 1.2872467222884385,
"grad_norm": 0.2210853099822998,
"learning_rate": 2.7151629664619745e-05,
"loss": 0.0289,
"step": 2160
},
{
"epoch": 1.299165673420739,
"grad_norm": 0.2513985335826874,
"learning_rate": 2.6962683042040626e-05,
"loss": 0.0298,
"step": 2180
},
{
"epoch": 1.3110846245530392,
"grad_norm": 0.2379380464553833,
"learning_rate": 2.6773736419461506e-05,
"loss": 0.0301,
"step": 2200
},
{
"epoch": 1.3230035756853398,
"grad_norm": 0.21417196094989777,
"learning_rate": 2.6584789796882383e-05,
"loss": 0.0301,
"step": 2220
},
{
"epoch": 1.3349225268176401,
"grad_norm": 0.22283975780010223,
"learning_rate": 2.639584317430326e-05,
"loss": 0.0309,
"step": 2240
},
{
"epoch": 1.3408820023837902,
"eval_loss": 0.2835468649864197,
"eval_runtime": 24.4862,
"eval_samples_per_second": 114.35,
"eval_steps_per_second": 1.797,
"step": 2250
},
{
"epoch": 1.3468414779499405,
"grad_norm": 0.21475766599178314,
"learning_rate": 2.620689655172414e-05,
"loss": 0.0296,
"step": 2260
},
{
"epoch": 1.3587604290822408,
"grad_norm": 0.2578884959220886,
"learning_rate": 2.601794992914502e-05,
"loss": 0.0285,
"step": 2280
},
{
"epoch": 1.3706793802145412,
"grad_norm": 0.208306223154068,
"learning_rate": 2.5829003306565897e-05,
"loss": 0.0296,
"step": 2300
},
{
"epoch": 1.3825983313468415,
"grad_norm": 0.20098654925823212,
"learning_rate": 2.5640056683986774e-05,
"loss": 0.0299,
"step": 2320
},
{
"epoch": 1.3945172824791419,
"grad_norm": 0.19324277341365814,
"learning_rate": 2.5451110061407654e-05,
"loss": 0.0299,
"step": 2340
},
{
"epoch": 1.4064362336114422,
"grad_norm": 0.22991597652435303,
"learning_rate": 2.5262163438828534e-05,
"loss": 0.0283,
"step": 2360
},
{
"epoch": 1.4183551847437426,
"grad_norm": 0.20805244147777557,
"learning_rate": 2.507321681624941e-05,
"loss": 0.0294,
"step": 2380
},
{
"epoch": 1.430274135876043,
"grad_norm": 0.2217872589826584,
"learning_rate": 2.488427019367029e-05,
"loss": 0.0298,
"step": 2400
},
{
"epoch": 1.430274135876043,
"eval_loss": 0.28145191073417664,
"eval_runtime": 24.4822,
"eval_samples_per_second": 114.369,
"eval_steps_per_second": 1.797,
"step": 2400
},
{
"epoch": 1.4421930870083433,
"grad_norm": 0.2886907756328583,
"learning_rate": 2.469532357109117e-05,
"loss": 0.0286,
"step": 2420
},
{
"epoch": 1.4541120381406436,
"grad_norm": 0.22498397529125214,
"learning_rate": 2.450637694851205e-05,
"loss": 0.0303,
"step": 2440
},
{
"epoch": 1.466030989272944,
"grad_norm": 0.2244011014699936,
"learning_rate": 2.4317430325932926e-05,
"loss": 0.0291,
"step": 2460
},
{
"epoch": 1.4779499404052443,
"grad_norm": 0.254245400428772,
"learning_rate": 2.4128483703353806e-05,
"loss": 0.0291,
"step": 2480
},
{
"epoch": 1.4898688915375446,
"grad_norm": 0.23943567276000977,
"learning_rate": 2.3939537080774683e-05,
"loss": 0.0279,
"step": 2500
},
{
"epoch": 1.5017878426698452,
"grad_norm": 0.19281241297721863,
"learning_rate": 2.3750590458195563e-05,
"loss": 0.0284,
"step": 2520
},
{
"epoch": 1.5137067938021453,
"grad_norm": 0.1942477971315384,
"learning_rate": 2.356164383561644e-05,
"loss": 0.0286,
"step": 2540
},
{
"epoch": 1.5196662693682956,
"eval_loss": 0.2797168493270874,
"eval_runtime": 24.5526,
"eval_samples_per_second": 114.041,
"eval_steps_per_second": 1.792,
"step": 2550
},
{
"epoch": 1.525625744934446,
"grad_norm": 0.21547091007232666,
"learning_rate": 2.337269721303732e-05,
"loss": 0.0299,
"step": 2560
},
{
"epoch": 1.537544696066746,
"grad_norm": 0.2152203619480133,
"learning_rate": 2.3183750590458197e-05,
"loss": 0.0285,
"step": 2580
},
{
"epoch": 1.5494636471990466,
"grad_norm": 0.19925089180469513,
"learning_rate": 2.2994803967879077e-05,
"loss": 0.0282,
"step": 2600
},
{
"epoch": 1.5613825983313467,
"grad_norm": 0.21512266993522644,
"learning_rate": 2.2805857345299954e-05,
"loss": 0.0291,
"step": 2620
},
{
"epoch": 1.5733015494636473,
"grad_norm": 0.2275344282388687,
"learning_rate": 2.2616910722720834e-05,
"loss": 0.0288,
"step": 2640
},
{
"epoch": 1.5852205005959474,
"grad_norm": 0.22899560630321503,
"learning_rate": 2.242796410014171e-05,
"loss": 0.0279,
"step": 2660
},
{
"epoch": 1.597139451728248,
"grad_norm": 0.21439722180366516,
"learning_rate": 2.223901747756259e-05,
"loss": 0.0273,
"step": 2680
},
{
"epoch": 1.6090584028605481,
"grad_norm": 0.21393275260925293,
"learning_rate": 2.205007085498347e-05,
"loss": 0.0284,
"step": 2700
},
{
"epoch": 1.6090584028605481,
"eval_loss": 0.27715668082237244,
"eval_runtime": 24.544,
"eval_samples_per_second": 114.081,
"eval_steps_per_second": 1.793,
"step": 2700
},
{
"epoch": 1.6209773539928487,
"grad_norm": 0.22286593914031982,
"learning_rate": 2.186112423240435e-05,
"loss": 0.028,
"step": 2720
},
{
"epoch": 1.6328963051251488,
"grad_norm": 0.21552002429962158,
"learning_rate": 2.1672177609825225e-05,
"loss": 0.0282,
"step": 2740
},
{
"epoch": 1.6448152562574494,
"grad_norm": 0.20231053233146667,
"learning_rate": 2.1483230987246106e-05,
"loss": 0.0278,
"step": 2760
},
{
"epoch": 1.6567342073897497,
"grad_norm": 0.20446668565273285,
"learning_rate": 2.1294284364666983e-05,
"loss": 0.0283,
"step": 2780
},
{
"epoch": 1.66865315852205,
"grad_norm": 0.21102942526340485,
"learning_rate": 2.1105337742087863e-05,
"loss": 0.029,
"step": 2800
},
{
"epoch": 1.6805721096543504,
"grad_norm": 0.190469890832901,
"learning_rate": 2.091639111950874e-05,
"loss": 0.0286,
"step": 2820
},
{
"epoch": 1.6924910607866508,
"grad_norm": 0.2115412801504135,
"learning_rate": 2.072744449692962e-05,
"loss": 0.0278,
"step": 2840
},
{
"epoch": 1.698450536352801,
"eval_loss": 0.2754322588443756,
"eval_runtime": 24.6377,
"eval_samples_per_second": 113.647,
"eval_steps_per_second": 1.786,
"step": 2850
},
{
"epoch": 1.7044100119189511,
"grad_norm": 0.18578040599822998,
"learning_rate": 2.0538497874350497e-05,
"loss": 0.0284,
"step": 2860
},
{
"epoch": 1.7163289630512515,
"grad_norm": 0.23163530230522156,
"learning_rate": 2.0349551251771377e-05,
"loss": 0.0277,
"step": 2880
},
{
"epoch": 1.7282479141835518,
"grad_norm": 0.2035950869321823,
"learning_rate": 2.0160604629192254e-05,
"loss": 0.0275,
"step": 2900
},
{
"epoch": 1.7401668653158522,
"grad_norm": 0.24992471933364868,
"learning_rate": 1.9971658006613134e-05,
"loss": 0.028,
"step": 2920
},
{
"epoch": 1.7520858164481525,
"grad_norm": 0.20510628819465637,
"learning_rate": 1.978271138403401e-05,
"loss": 0.027,
"step": 2940
},
{
"epoch": 1.7640047675804529,
"grad_norm": 0.216608926653862,
"learning_rate": 1.959376476145489e-05,
"loss": 0.0283,
"step": 2960
},
{
"epoch": 1.7759237187127532,
"grad_norm": 0.20540915429592133,
"learning_rate": 1.9404818138875768e-05,
"loss": 0.0274,
"step": 2980
},
{
"epoch": 1.7878426698450536,
"grad_norm": 0.20191486179828644,
"learning_rate": 1.921587151629665e-05,
"loss": 0.0275,
"step": 3000
},
{
"epoch": 1.7878426698450536,
"eval_loss": 0.27235740423202515,
"eval_runtime": 24.612,
"eval_samples_per_second": 113.766,
"eval_steps_per_second": 1.788,
"step": 3000
},
{
"epoch": 1.7997616209773541,
"grad_norm": 0.24276046454906464,
"learning_rate": 1.9026924893717525e-05,
"loss": 0.028,
"step": 3020
},
{
"epoch": 1.8116805721096543,
"grad_norm": 0.21864300966262817,
"learning_rate": 1.8837978271138406e-05,
"loss": 0.0279,
"step": 3040
},
{
"epoch": 1.8235995232419548,
"grad_norm": 0.2271438091993332,
"learning_rate": 1.8649031648559282e-05,
"loss": 0.0276,
"step": 3060
},
{
"epoch": 1.835518474374255,
"grad_norm": 0.20855475962162018,
"learning_rate": 1.8460085025980163e-05,
"loss": 0.0283,
"step": 3080
},
{
"epoch": 1.8474374255065555,
"grad_norm": 0.2064722329378128,
"learning_rate": 1.827113840340104e-05,
"loss": 0.0271,
"step": 3100
},
{
"epoch": 1.8593563766388557,
"grad_norm": 0.2111743986606598,
"learning_rate": 1.808219178082192e-05,
"loss": 0.0273,
"step": 3120
},
{
"epoch": 1.8712753277711562,
"grad_norm": 0.18417872488498688,
"learning_rate": 1.7893245158242797e-05,
"loss": 0.0275,
"step": 3140
},
{
"epoch": 1.8772348033373063,
"eval_loss": 0.2706995904445648,
"eval_runtime": 24.5215,
"eval_samples_per_second": 114.186,
"eval_steps_per_second": 1.794,
"step": 3150
},
{
"epoch": 1.8831942789034564,
"grad_norm": 0.19550646841526031,
"learning_rate": 1.7704298535663677e-05,
"loss": 0.0275,
"step": 3160
},
{
"epoch": 1.895113230035757,
"grad_norm": 0.19912759959697723,
"learning_rate": 1.7515351913084554e-05,
"loss": 0.0267,
"step": 3180
},
{
"epoch": 1.907032181168057,
"grad_norm": 0.20110997557640076,
"learning_rate": 1.7326405290505434e-05,
"loss": 0.0281,
"step": 3200
},
{
"epoch": 1.9189511323003576,
"grad_norm": 0.20968247950077057,
"learning_rate": 1.713745866792631e-05,
"loss": 0.0271,
"step": 3220
},
{
"epoch": 1.930870083432658,
"grad_norm": 0.20576246082782745,
"learning_rate": 1.694851204534719e-05,
"loss": 0.0277,
"step": 3240
},
{
"epoch": 1.9427890345649583,
"grad_norm": 0.21499717235565186,
"learning_rate": 1.6759565422768068e-05,
"loss": 0.0264,
"step": 3260
},
{
"epoch": 1.9547079856972587,
"grad_norm": 0.23822776973247528,
"learning_rate": 1.6570618800188948e-05,
"loss": 0.027,
"step": 3280
},
{
"epoch": 1.966626936829559,
"grad_norm": 0.21049529314041138,
"learning_rate": 1.6381672177609825e-05,
"loss": 0.0259,
"step": 3300
},
{
"epoch": 1.966626936829559,
"eval_loss": 0.2697894275188446,
"eval_runtime": 24.6235,
"eval_samples_per_second": 113.713,
"eval_steps_per_second": 1.787,
"step": 3300
}
],
"logging_steps": 20,
"max_steps": 5034,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2189847399207797e+20,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}