{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.407709414381023, "eval_steps": 1000, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037064492216456635, "grad_norm": 14.615763664245605, "learning_rate": 4.800000000000001e-07, "loss": 1.4607, "step": 25 }, { "epoch": 0.07412898443291327, "grad_norm": 10.59756851196289, "learning_rate": 9.800000000000001e-07, "loss": 1.176, "step": 50 }, { "epoch": 0.1111934766493699, "grad_norm": 7.142136573791504, "learning_rate": 1.48e-06, "loss": 0.9267, "step": 75 }, { "epoch": 0.14825796886582654, "grad_norm": 9.191902160644531, "learning_rate": 1.98e-06, "loss": 0.7253, "step": 100 }, { "epoch": 0.18532246108228317, "grad_norm": 10.320201873779297, "learning_rate": 2.4800000000000004e-06, "loss": 0.7047, "step": 125 }, { "epoch": 0.2223869532987398, "grad_norm": 8.486912727355957, "learning_rate": 2.9800000000000003e-06, "loss": 0.6634, "step": 150 }, { "epoch": 0.25945144551519644, "grad_norm": 9.802300453186035, "learning_rate": 3.48e-06, "loss": 0.5786, "step": 175 }, { "epoch": 0.2965159377316531, "grad_norm": 9.568249702453613, "learning_rate": 3.980000000000001e-06, "loss": 0.5857, "step": 200 }, { "epoch": 0.3335804299481097, "grad_norm": 7.968526840209961, "learning_rate": 4.48e-06, "loss": 0.5385, "step": 225 }, { "epoch": 0.37064492216456635, "grad_norm": 7.507795810699463, "learning_rate": 4.980000000000001e-06, "loss": 0.5151, "step": 250 }, { "epoch": 0.407709414381023, "grad_norm": 6.258375644683838, "learning_rate": 5.480000000000001e-06, "loss": 0.4649, "step": 275 }, { "epoch": 0.4447739065974796, "grad_norm": 9.89697551727295, "learning_rate": 5.98e-06, "loss": 0.4209, "step": 300 }, { "epoch": 0.48183839881393625, "grad_norm": 7.8507490158081055, "learning_rate": 6.480000000000001e-06, "loss": 0.4459, "step": 325 }, { "epoch": 0.5189028910303929, "grad_norm": 5.835811138153076, "learning_rate": 6.98e-06, "loss": 0.4141, "step": 350 }, { "epoch": 0.5559673832468495, "grad_norm": 6.767547607421875, "learning_rate": 7.48e-06, "loss": 0.4108, "step": 375 }, { "epoch": 0.5930318754633062, "grad_norm": 5.9475884437561035, "learning_rate": 7.980000000000002e-06, "loss": 0.41, "step": 400 }, { "epoch": 0.6300963676797627, "grad_norm": 7.767906188964844, "learning_rate": 8.48e-06, "loss": 0.3781, "step": 425 }, { "epoch": 0.6671608598962194, "grad_norm": 6.990137100219727, "learning_rate": 8.98e-06, "loss": 0.39, "step": 450 }, { "epoch": 0.704225352112676, "grad_norm": 5.607441425323486, "learning_rate": 9.48e-06, "loss": 0.3783, "step": 475 }, { "epoch": 0.7412898443291327, "grad_norm": 6.288857936859131, "learning_rate": 9.980000000000001e-06, "loss": 0.3559, "step": 500 }, { "epoch": 0.7783543365455893, "grad_norm": 6.985698699951172, "learning_rate": 9.946666666666667e-06, "loss": 0.3595, "step": 525 }, { "epoch": 0.815418828762046, "grad_norm": 6.037854194641113, "learning_rate": 9.891111111111113e-06, "loss": 0.3163, "step": 550 }, { "epoch": 0.8524833209785025, "grad_norm": 5.8710784912109375, "learning_rate": 9.835555555555556e-06, "loss": 0.3502, "step": 575 }, { "epoch": 0.8895478131949592, "grad_norm": 6.342834949493408, "learning_rate": 9.780000000000001e-06, "loss": 0.317, "step": 600 }, { "epoch": 0.9266123054114158, "grad_norm": 5.589534759521484, "learning_rate": 9.724444444444445e-06, "loss": 0.3228, "step": 625 }, { "epoch": 0.9636767976278725, "grad_norm": 7.743918418884277, "learning_rate": 9.66888888888889e-06, "loss": 0.3144, "step": 650 }, { "epoch": 1.0, "grad_norm": 10.073568344116211, "learning_rate": 9.613333333333335e-06, "loss": 0.2939, "step": 675 }, { "epoch": 1.0370644922164567, "grad_norm": 4.640520095825195, "learning_rate": 9.557777777777777e-06, "loss": 0.1939, "step": 700 }, { "epoch": 1.0741289844329134, "grad_norm": 3.2049508094787598, "learning_rate": 9.502222222222223e-06, "loss": 0.1929, "step": 725 }, { "epoch": 1.1111934766493698, "grad_norm": 3.9065611362457275, "learning_rate": 9.446666666666667e-06, "loss": 0.1998, "step": 750 }, { "epoch": 1.1482579688658265, "grad_norm": 3.7471649646759033, "learning_rate": 9.391111111111111e-06, "loss": 0.2007, "step": 775 }, { "epoch": 1.1853224610822832, "grad_norm": 3.952751874923706, "learning_rate": 9.335555555555557e-06, "loss": 0.1863, "step": 800 }, { "epoch": 1.2223869532987397, "grad_norm": 5.39549446105957, "learning_rate": 9.280000000000001e-06, "loss": 0.1953, "step": 825 }, { "epoch": 1.2594514455151964, "grad_norm": 4.03216552734375, "learning_rate": 9.224444444444445e-06, "loss": 0.2065, "step": 850 }, { "epoch": 1.296515937731653, "grad_norm": 3.854651689529419, "learning_rate": 9.168888888888889e-06, "loss": 0.1703, "step": 875 }, { "epoch": 1.3335804299481098, "grad_norm": 4.835360050201416, "learning_rate": 9.113333333333335e-06, "loss": 0.1692, "step": 900 }, { "epoch": 1.3706449221645665, "grad_norm": 5.247130393981934, "learning_rate": 9.057777777777779e-06, "loss": 0.1982, "step": 925 }, { "epoch": 1.407709414381023, "grad_norm": 3.9537737369537354, "learning_rate": 9.002222222222223e-06, "loss": 0.1661, "step": 950 }, { "epoch": 1.4447739065974796, "grad_norm": 4.887810230255127, "learning_rate": 8.946666666666669e-06, "loss": 0.1836, "step": 975 }, { "epoch": 1.4818383988139363, "grad_norm": 3.6338751316070557, "learning_rate": 8.891111111111111e-06, "loss": 0.1822, "step": 1000 }, { "epoch": 1.4818383988139363, "eval_loss": 0.2655850648880005, "eval_runtime": 730.9503, "eval_samples_per_second": 3.947, "eval_steps_per_second": 0.494, "eval_wer": 0.14449384404924762, "step": 1000 }, { "epoch": 1.5189028910303928, "grad_norm": 4.078255653381348, "learning_rate": 8.835555555555557e-06, "loss": 0.1661, "step": 1025 }, { "epoch": 1.5559673832468495, "grad_norm": 3.9311952590942383, "learning_rate": 8.78e-06, "loss": 0.1725, "step": 1050 }, { "epoch": 1.5930318754633062, "grad_norm": 4.800196170806885, "learning_rate": 8.724444444444445e-06, "loss": 0.1704, "step": 1075 }, { "epoch": 1.6300963676797626, "grad_norm": 4.550530910491943, "learning_rate": 8.66888888888889e-06, "loss": 0.1793, "step": 1100 }, { "epoch": 1.6671608598962195, "grad_norm": 6.508624076843262, "learning_rate": 8.613333333333333e-06, "loss": 0.1619, "step": 1125 }, { "epoch": 1.704225352112676, "grad_norm": 4.16792106628418, "learning_rate": 8.557777777777778e-06, "loss": 0.1652, "step": 1150 }, { "epoch": 1.7412898443291327, "grad_norm": 4.420657157897949, "learning_rate": 8.502222222222223e-06, "loss": 0.16, "step": 1175 }, { "epoch": 1.7783543365455894, "grad_norm": 4.781569004058838, "learning_rate": 8.446666666666668e-06, "loss": 0.1695, "step": 1200 }, { "epoch": 1.8154188287620459, "grad_norm": 3.877307176589966, "learning_rate": 8.391111111111112e-06, "loss": 0.1529, "step": 1225 }, { "epoch": 1.8524833209785025, "grad_norm": 4.159163475036621, "learning_rate": 8.335555555555556e-06, "loss": 0.1619, "step": 1250 }, { "epoch": 1.8895478131949592, "grad_norm": 3.6631579399108887, "learning_rate": 8.28e-06, "loss": 0.1654, "step": 1275 }, { "epoch": 1.9266123054114157, "grad_norm": 4.1784210205078125, "learning_rate": 8.224444444444444e-06, "loss": 0.1494, "step": 1300 }, { "epoch": 1.9636767976278726, "grad_norm": 5.867852210998535, "learning_rate": 8.16888888888889e-06, "loss": 0.1443, "step": 1325 }, { "epoch": 2.0, "grad_norm": 5.817214012145996, "learning_rate": 8.113333333333334e-06, "loss": 0.139, "step": 1350 }, { "epoch": 2.0370644922164565, "grad_norm": 2.3572022914886475, "learning_rate": 8.057777777777778e-06, "loss": 0.0614, "step": 1375 }, { "epoch": 2.0741289844329134, "grad_norm": 2.2769412994384766, "learning_rate": 8.002222222222222e-06, "loss": 0.0606, "step": 1400 }, { "epoch": 2.11119347664937, "grad_norm": 2.474583864212036, "learning_rate": 7.946666666666666e-06, "loss": 0.0716, "step": 1425 }, { "epoch": 2.1482579688658268, "grad_norm": 2.5783841609954834, "learning_rate": 7.891111111111112e-06, "loss": 0.065, "step": 1450 }, { "epoch": 2.1853224610822832, "grad_norm": 1.6132420301437378, "learning_rate": 7.835555555555556e-06, "loss": 0.067, "step": 1475 }, { "epoch": 2.2223869532987397, "grad_norm": 3.8042001724243164, "learning_rate": 7.78e-06, "loss": 0.0724, "step": 1500 }, { "epoch": 2.2594514455151966, "grad_norm": 2.2419843673706055, "learning_rate": 7.724444444444446e-06, "loss": 0.0761, "step": 1525 }, { "epoch": 2.296515937731653, "grad_norm": 2.706354856491089, "learning_rate": 7.66888888888889e-06, "loss": 0.0659, "step": 1550 }, { "epoch": 2.3335804299481095, "grad_norm": 2.8394265174865723, "learning_rate": 7.613333333333334e-06, "loss": 0.0688, "step": 1575 }, { "epoch": 2.3706449221645665, "grad_norm": 2.383784770965576, "learning_rate": 7.557777777777779e-06, "loss": 0.0729, "step": 1600 }, { "epoch": 2.407709414381023, "grad_norm": 3.0959832668304443, "learning_rate": 7.502222222222223e-06, "loss": 0.0626, "step": 1625 }, { "epoch": 2.4447739065974794, "grad_norm": 2.927393913269043, "learning_rate": 7.446666666666668e-06, "loss": 0.0677, "step": 1650 }, { "epoch": 2.4818383988139363, "grad_norm": 2.644434928894043, "learning_rate": 7.3911111111111125e-06, "loss": 0.0644, "step": 1675 }, { "epoch": 2.5189028910303928, "grad_norm": 2.9071755409240723, "learning_rate": 7.335555555555556e-06, "loss": 0.061, "step": 1700 }, { "epoch": 2.5559673832468492, "grad_norm": 2.6862034797668457, "learning_rate": 7.280000000000001e-06, "loss": 0.0615, "step": 1725 }, { "epoch": 2.593031875463306, "grad_norm": 3.1184046268463135, "learning_rate": 7.224444444444445e-06, "loss": 0.0714, "step": 1750 }, { "epoch": 2.6300963676797626, "grad_norm": 1.7592053413391113, "learning_rate": 7.1688888888888895e-06, "loss": 0.0704, "step": 1775 }, { "epoch": 2.6671608598962195, "grad_norm": 2.9316508769989014, "learning_rate": 7.113333333333334e-06, "loss": 0.0689, "step": 1800 }, { "epoch": 2.704225352112676, "grad_norm": 2.1934666633605957, "learning_rate": 7.057777777777778e-06, "loss": 0.0721, "step": 1825 }, { "epoch": 2.741289844329133, "grad_norm": 3.4919371604919434, "learning_rate": 7.0022222222222225e-06, "loss": 0.0638, "step": 1850 }, { "epoch": 2.7783543365455894, "grad_norm": 2.723252058029175, "learning_rate": 6.946666666666667e-06, "loss": 0.0598, "step": 1875 }, { "epoch": 2.815418828762046, "grad_norm": 1.8668267726898193, "learning_rate": 6.891111111111111e-06, "loss": 0.0607, "step": 1900 }, { "epoch": 2.8524833209785028, "grad_norm": 2.0989866256713867, "learning_rate": 6.835555555555556e-06, "loss": 0.0821, "step": 1925 }, { "epoch": 2.8895478131949592, "grad_norm": 2.9375364780426025, "learning_rate": 6.780000000000001e-06, "loss": 0.0636, "step": 1950 }, { "epoch": 2.9266123054114157, "grad_norm": 2.1375315189361572, "learning_rate": 6.724444444444444e-06, "loss": 0.0723, "step": 1975 }, { "epoch": 2.9636767976278726, "grad_norm": 2.5874264240264893, "learning_rate": 6.668888888888889e-06, "loss": 0.0706, "step": 2000 }, { "epoch": 2.9636767976278726, "eval_loss": 0.2490690052509308, "eval_runtime": 730.2087, "eval_samples_per_second": 3.951, "eval_steps_per_second": 0.494, "eval_wer": 0.12696648426812585, "step": 2000 }, { "epoch": 3.0, "grad_norm": 6.509148597717285, "learning_rate": 6.613333333333334e-06, "loss": 0.0587, "step": 2025 }, { "epoch": 3.0370644922164565, "grad_norm": 1.9590086936950684, "learning_rate": 6.557777777777778e-06, "loss": 0.0241, "step": 2050 }, { "epoch": 3.0741289844329134, "grad_norm": 1.4612740278244019, "learning_rate": 6.502222222222223e-06, "loss": 0.0267, "step": 2075 }, { "epoch": 3.11119347664937, "grad_norm": 0.9522780179977417, "learning_rate": 6.446666666666668e-06, "loss": 0.023, "step": 2100 }, { "epoch": 3.1482579688658268, "grad_norm": 1.891400694847107, "learning_rate": 6.391111111111111e-06, "loss": 0.0281, "step": 2125 }, { "epoch": 3.1853224610822832, "grad_norm": 1.0783302783966064, "learning_rate": 6.335555555555556e-06, "loss": 0.0246, "step": 2150 }, { "epoch": 3.2223869532987397, "grad_norm": 1.3504562377929688, "learning_rate": 6.280000000000001e-06, "loss": 0.0244, "step": 2175 }, { "epoch": 3.2594514455151966, "grad_norm": 1.8768439292907715, "learning_rate": 6.224444444444445e-06, "loss": 0.0264, "step": 2200 }, { "epoch": 3.296515937731653, "grad_norm": 1.5083887577056885, "learning_rate": 6.16888888888889e-06, "loss": 0.0248, "step": 2225 }, { "epoch": 3.3335804299481095, "grad_norm": 3.5768120288848877, "learning_rate": 6.113333333333333e-06, "loss": 0.0316, "step": 2250 }, { "epoch": 3.3706449221645665, "grad_norm": 1.1493444442749023, "learning_rate": 6.057777777777778e-06, "loss": 0.0294, "step": 2275 }, { "epoch": 3.407709414381023, "grad_norm": 2.3746306896209717, "learning_rate": 6.002222222222223e-06, "loss": 0.0263, "step": 2300 }, { "epoch": 3.4447739065974794, "grad_norm": 2.144634485244751, "learning_rate": 5.946666666666668e-06, "loss": 0.0348, "step": 2325 }, { "epoch": 3.4818383988139363, "grad_norm": 1.5002686977386475, "learning_rate": 5.891111111111112e-06, "loss": 0.0228, "step": 2350 }, { "epoch": 3.5189028910303928, "grad_norm": 1.6059187650680542, "learning_rate": 5.8355555555555565e-06, "loss": 0.0239, "step": 2375 }, { "epoch": 3.5559673832468492, "grad_norm": 2.757420778274536, "learning_rate": 5.78e-06, "loss": 0.0277, "step": 2400 }, { "epoch": 3.593031875463306, "grad_norm": 1.3977222442626953, "learning_rate": 5.724444444444445e-06, "loss": 0.0224, "step": 2425 }, { "epoch": 3.6300963676797626, "grad_norm": 1.9618048667907715, "learning_rate": 5.6688888888888895e-06, "loss": 0.026, "step": 2450 }, { "epoch": 3.6671608598962195, "grad_norm": 0.898245632648468, "learning_rate": 5.613333333333334e-06, "loss": 0.0326, "step": 2475 }, { "epoch": 3.704225352112676, "grad_norm": 1.8148616552352905, "learning_rate": 5.557777777777778e-06, "loss": 0.0213, "step": 2500 }, { "epoch": 3.741289844329133, "grad_norm": 1.308030366897583, "learning_rate": 5.5022222222222224e-06, "loss": 0.0192, "step": 2525 }, { "epoch": 3.7783543365455894, "grad_norm": 1.6680744886398315, "learning_rate": 5.4466666666666665e-06, "loss": 0.027, "step": 2550 }, { "epoch": 3.815418828762046, "grad_norm": 3.235917568206787, "learning_rate": 5.391111111111111e-06, "loss": 0.0242, "step": 2575 }, { "epoch": 3.8524833209785028, "grad_norm": 2.096780300140381, "learning_rate": 5.335555555555556e-06, "loss": 0.0243, "step": 2600 }, { "epoch": 3.8895478131949592, "grad_norm": 1.8445031642913818, "learning_rate": 5.28e-06, "loss": 0.024, "step": 2625 }, { "epoch": 3.9266123054114157, "grad_norm": 1.357937216758728, "learning_rate": 5.224444444444445e-06, "loss": 0.0244, "step": 2650 }, { "epoch": 3.9636767976278726, "grad_norm": 1.0413466691970825, "learning_rate": 5.168888888888889e-06, "loss": 0.0221, "step": 2675 }, { "epoch": 4.0, "grad_norm": 3.0572996139526367, "learning_rate": 5.113333333333333e-06, "loss": 0.0206, "step": 2700 }, { "epoch": 4.037064492216457, "grad_norm": 0.9961848258972168, "learning_rate": 5.057777777777778e-06, "loss": 0.0136, "step": 2725 }, { "epoch": 4.074128984432913, "grad_norm": 1.0248702764511108, "learning_rate": 5.002222222222223e-06, "loss": 0.009, "step": 2750 }, { "epoch": 4.11119347664937, "grad_norm": 0.6142157912254333, "learning_rate": 4.946666666666667e-06, "loss": 0.0113, "step": 2775 }, { "epoch": 4.148257968865827, "grad_norm": 0.27292531728744507, "learning_rate": 4.891111111111111e-06, "loss": 0.009, "step": 2800 }, { "epoch": 4.185322461082283, "grad_norm": 2.2906312942504883, "learning_rate": 4.835555555555556e-06, "loss": 0.0073, "step": 2825 }, { "epoch": 4.22238695329874, "grad_norm": 1.0498850345611572, "learning_rate": 4.78e-06, "loss": 0.0093, "step": 2850 }, { "epoch": 4.259451445515197, "grad_norm": 1.1574844121932983, "learning_rate": 4.724444444444445e-06, "loss": 0.0159, "step": 2875 }, { "epoch": 4.2965159377316535, "grad_norm": 0.7209671139717102, "learning_rate": 4.66888888888889e-06, "loss": 0.0088, "step": 2900 }, { "epoch": 4.3335804299481095, "grad_norm": 1.168841004371643, "learning_rate": 4.613333333333334e-06, "loss": 0.0094, "step": 2925 }, { "epoch": 4.3706449221645665, "grad_norm": 0.6153778433799744, "learning_rate": 4.557777777777778e-06, "loss": 0.009, "step": 2950 }, { "epoch": 4.407709414381023, "grad_norm": 1.5705232620239258, "learning_rate": 4.502222222222223e-06, "loss": 0.0085, "step": 2975 }, { "epoch": 4.444773906597479, "grad_norm": 0.24448032677173615, "learning_rate": 4.446666666666667e-06, "loss": 0.0072, "step": 3000 }, { "epoch": 4.444773906597479, "eval_loss": 0.27286583185195923, "eval_runtime": 739.8615, "eval_samples_per_second": 3.899, "eval_steps_per_second": 0.488, "eval_wer": 0.11913474692202462, "step": 3000 }, { "epoch": 4.481838398813936, "grad_norm": 1.2278587818145752, "learning_rate": 4.391111111111112e-06, "loss": 0.0146, "step": 3025 }, { "epoch": 4.518902891030393, "grad_norm": 0.6478213667869568, "learning_rate": 4.3355555555555565e-06, "loss": 0.014, "step": 3050 }, { "epoch": 4.555967383246849, "grad_norm": 0.7865190505981445, "learning_rate": 4.2800000000000005e-06, "loss": 0.0079, "step": 3075 }, { "epoch": 4.593031875463306, "grad_norm": 2.3078877925872803, "learning_rate": 4.2244444444444446e-06, "loss": 0.009, "step": 3100 }, { "epoch": 4.630096367679763, "grad_norm": 0.9625842571258545, "learning_rate": 4.168888888888889e-06, "loss": 0.0096, "step": 3125 }, { "epoch": 4.667160859896219, "grad_norm": 0.7619579434394836, "learning_rate": 4.1133333333333335e-06, "loss": 0.0096, "step": 3150 }, { "epoch": 4.704225352112676, "grad_norm": 1.5049270391464233, "learning_rate": 4.057777777777778e-06, "loss": 0.0099, "step": 3175 }, { "epoch": 4.741289844329133, "grad_norm": 1.1056573390960693, "learning_rate": 4.002222222222222e-06, "loss": 0.0065, "step": 3200 }, { "epoch": 4.778354336545589, "grad_norm": 0.7983392477035522, "learning_rate": 3.946666666666667e-06, "loss": 0.0105, "step": 3225 }, { "epoch": 4.815418828762046, "grad_norm": 1.1153795719146729, "learning_rate": 3.891111111111111e-06, "loss": 0.0075, "step": 3250 }, { "epoch": 4.852483320978503, "grad_norm": 0.9730608463287354, "learning_rate": 3.835555555555555e-06, "loss": 0.0087, "step": 3275 }, { "epoch": 4.889547813194959, "grad_norm": 0.5694206953048706, "learning_rate": 3.7800000000000002e-06, "loss": 0.0071, "step": 3300 }, { "epoch": 4.926612305411416, "grad_norm": 0.2520028352737427, "learning_rate": 3.724444444444445e-06, "loss": 0.0081, "step": 3325 }, { "epoch": 4.963676797627873, "grad_norm": 0.436355322599411, "learning_rate": 3.668888888888889e-06, "loss": 0.0078, "step": 3350 }, { "epoch": 5.0, "grad_norm": 0.798361599445343, "learning_rate": 3.6133333333333336e-06, "loss": 0.0075, "step": 3375 }, { "epoch": 5.037064492216457, "grad_norm": 1.3702267408370972, "learning_rate": 3.5577777777777785e-06, "loss": 0.005, "step": 3400 }, { "epoch": 5.074128984432913, "grad_norm": 0.2790464162826538, "learning_rate": 3.5022222222222225e-06, "loss": 0.0032, "step": 3425 }, { "epoch": 5.11119347664937, "grad_norm": 0.15111476182937622, "learning_rate": 3.446666666666667e-06, "loss": 0.0046, "step": 3450 }, { "epoch": 5.148257968865827, "grad_norm": 0.09985285252332687, "learning_rate": 3.391111111111111e-06, "loss": 0.0035, "step": 3475 }, { "epoch": 5.185322461082283, "grad_norm": 0.5352105498313904, "learning_rate": 3.335555555555556e-06, "loss": 0.0031, "step": 3500 }, { "epoch": 5.22238695329874, "grad_norm": 0.9406213760375977, "learning_rate": 3.2800000000000004e-06, "loss": 0.0035, "step": 3525 }, { "epoch": 5.259451445515197, "grad_norm": 0.7073507905006409, "learning_rate": 3.2244444444444444e-06, "loss": 0.0035, "step": 3550 }, { "epoch": 5.2965159377316535, "grad_norm": 0.07916448265314102, "learning_rate": 3.1688888888888893e-06, "loss": 0.0035, "step": 3575 }, { "epoch": 5.3335804299481095, "grad_norm": 0.5285120606422424, "learning_rate": 3.1133333333333337e-06, "loss": 0.0027, "step": 3600 }, { "epoch": 5.3706449221645665, "grad_norm": 0.09832775592803955, "learning_rate": 3.0577777777777778e-06, "loss": 0.0036, "step": 3625 }, { "epoch": 5.407709414381023, "grad_norm": 0.21083103120326996, "learning_rate": 3.0022222222222227e-06, "loss": 0.0041, "step": 3650 }, { "epoch": 5.444773906597479, "grad_norm": 0.6747980713844299, "learning_rate": 2.946666666666667e-06, "loss": 0.003, "step": 3675 }, { "epoch": 5.481838398813936, "grad_norm": 0.5111549496650696, "learning_rate": 2.891111111111111e-06, "loss": 0.0028, "step": 3700 }, { "epoch": 5.518902891030393, "grad_norm": 0.6502516269683838, "learning_rate": 2.835555555555556e-06, "loss": 0.0045, "step": 3725 }, { "epoch": 5.555967383246849, "grad_norm": 0.4688964784145355, "learning_rate": 2.7800000000000005e-06, "loss": 0.0036, "step": 3750 }, { "epoch": 5.593031875463306, "grad_norm": 0.281994104385376, "learning_rate": 2.7244444444444445e-06, "loss": 0.0021, "step": 3775 }, { "epoch": 5.630096367679763, "grad_norm": 0.11583279073238373, "learning_rate": 2.6688888888888894e-06, "loss": 0.0041, "step": 3800 }, { "epoch": 5.667160859896219, "grad_norm": 0.22941534221172333, "learning_rate": 2.6133333333333334e-06, "loss": 0.0022, "step": 3825 }, { "epoch": 5.704225352112676, "grad_norm": 0.13950073719024658, "learning_rate": 2.557777777777778e-06, "loss": 0.003, "step": 3850 }, { "epoch": 5.741289844329133, "grad_norm": 0.6869206428527832, "learning_rate": 2.5022222222222224e-06, "loss": 0.0024, "step": 3875 }, { "epoch": 5.778354336545589, "grad_norm": 0.09893081337213516, "learning_rate": 2.446666666666667e-06, "loss": 0.0029, "step": 3900 }, { "epoch": 5.815418828762046, "grad_norm": 0.1264762133359909, "learning_rate": 2.3911111111111113e-06, "loss": 0.0033, "step": 3925 }, { "epoch": 5.852483320978503, "grad_norm": 0.15489889681339264, "learning_rate": 2.3355555555555557e-06, "loss": 0.003, "step": 3950 }, { "epoch": 5.889547813194959, "grad_norm": 0.5875250697135925, "learning_rate": 2.28e-06, "loss": 0.0022, "step": 3975 }, { "epoch": 5.926612305411416, "grad_norm": 0.06691984087228775, "learning_rate": 2.2244444444444447e-06, "loss": 0.005, "step": 4000 }, { "epoch": 5.926612305411416, "eval_loss": 0.28099098801612854, "eval_runtime": 734.9707, "eval_samples_per_second": 3.925, "eval_steps_per_second": 0.491, "eval_wer": 0.11566347469220246, "step": 4000 }, { "epoch": 5.963676797627873, "grad_norm": 0.2645249664783478, "learning_rate": 2.168888888888889e-06, "loss": 0.0026, "step": 4025 }, { "epoch": 6.0, "grad_norm": 0.3361597955226898, "learning_rate": 2.1133333333333336e-06, "loss": 0.0023, "step": 4050 }, { "epoch": 6.037064492216457, "grad_norm": 0.059147898107767105, "learning_rate": 2.057777777777778e-06, "loss": 0.0015, "step": 4075 }, { "epoch": 6.074128984432913, "grad_norm": 0.1158735603094101, "learning_rate": 2.0022222222222225e-06, "loss": 0.0016, "step": 4100 }, { "epoch": 6.11119347664937, "grad_norm": 1.3564985990524292, "learning_rate": 1.9466666666666665e-06, "loss": 0.0014, "step": 4125 }, { "epoch": 6.148257968865827, "grad_norm": 0.5956087112426758, "learning_rate": 1.8911111111111114e-06, "loss": 0.0018, "step": 4150 }, { "epoch": 6.185322461082283, "grad_norm": 0.09224885702133179, "learning_rate": 1.8355555555555557e-06, "loss": 0.0017, "step": 4175 }, { "epoch": 6.22238695329874, "grad_norm": 0.06868930906057358, "learning_rate": 1.7800000000000001e-06, "loss": 0.0017, "step": 4200 }, { "epoch": 6.259451445515197, "grad_norm": 0.06657718122005463, "learning_rate": 1.7244444444444448e-06, "loss": 0.0014, "step": 4225 }, { "epoch": 6.2965159377316535, "grad_norm": 0.05459928885102272, "learning_rate": 1.668888888888889e-06, "loss": 0.0017, "step": 4250 }, { "epoch": 6.3335804299481095, "grad_norm": 0.05795517563819885, "learning_rate": 1.6133333333333335e-06, "loss": 0.0027, "step": 4275 }, { "epoch": 6.3706449221645665, "grad_norm": 0.06204914301633835, "learning_rate": 1.5577777777777777e-06, "loss": 0.0012, "step": 4300 }, { "epoch": 6.407709414381023, "grad_norm": 0.0820712074637413, "learning_rate": 1.5022222222222224e-06, "loss": 0.0012, "step": 4325 }, { "epoch": 6.444773906597479, "grad_norm": 0.056523606181144714, "learning_rate": 1.4466666666666669e-06, "loss": 0.0013, "step": 4350 }, { "epoch": 6.481838398813936, "grad_norm": 0.07985592633485794, "learning_rate": 1.3911111111111111e-06, "loss": 0.0014, "step": 4375 }, { "epoch": 6.518902891030393, "grad_norm": 0.044111426919698715, "learning_rate": 1.3355555555555558e-06, "loss": 0.0012, "step": 4400 }, { "epoch": 6.555967383246849, "grad_norm": 0.05683915689587593, "learning_rate": 1.28e-06, "loss": 0.0014, "step": 4425 }, { "epoch": 6.593031875463306, "grad_norm": 0.08568093180656433, "learning_rate": 1.2244444444444445e-06, "loss": 0.0012, "step": 4450 }, { "epoch": 6.630096367679763, "grad_norm": 0.054062824696302414, "learning_rate": 1.168888888888889e-06, "loss": 0.0011, "step": 4475 }, { "epoch": 6.667160859896219, "grad_norm": 0.0509476363658905, "learning_rate": 1.1133333333333334e-06, "loss": 0.0013, "step": 4500 }, { "epoch": 6.704225352112676, "grad_norm": 0.04927874356508255, "learning_rate": 1.0577777777777779e-06, "loss": 0.0012, "step": 4525 }, { "epoch": 6.741289844329133, "grad_norm": 0.08598697185516357, "learning_rate": 1.0022222222222223e-06, "loss": 0.0011, "step": 4550 }, { "epoch": 6.778354336545589, "grad_norm": 0.3571934700012207, "learning_rate": 9.466666666666667e-07, "loss": 0.0016, "step": 4575 }, { "epoch": 6.815418828762046, "grad_norm": 0.05977300554513931, "learning_rate": 8.911111111111112e-07, "loss": 0.001, "step": 4600 }, { "epoch": 6.852483320978503, "grad_norm": 0.05966237559914589, "learning_rate": 8.355555555555556e-07, "loss": 0.001, "step": 4625 }, { "epoch": 6.889547813194959, "grad_norm": 0.05432112514972687, "learning_rate": 7.8e-07, "loss": 0.001, "step": 4650 }, { "epoch": 6.926612305411416, "grad_norm": 0.06741122156381607, "learning_rate": 7.244444444444446e-07, "loss": 0.0019, "step": 4675 }, { "epoch": 6.963676797627873, "grad_norm": 0.04723643884062767, "learning_rate": 6.68888888888889e-07, "loss": 0.0012, "step": 4700 }, { "epoch": 7.0, "grad_norm": 0.07329325377941132, "learning_rate": 6.133333333333333e-07, "loss": 0.001, "step": 4725 }, { "epoch": 7.037064492216457, "grad_norm": 0.06389188766479492, "learning_rate": 5.577777777777779e-07, "loss": 0.001, "step": 4750 }, { "epoch": 7.074128984432913, "grad_norm": 0.03797365352511406, "learning_rate": 5.022222222222222e-07, "loss": 0.001, "step": 4775 }, { "epoch": 7.11119347664937, "grad_norm": 0.04686768725514412, "learning_rate": 4.466666666666667e-07, "loss": 0.0009, "step": 4800 }, { "epoch": 7.148257968865827, "grad_norm": 0.06883518397808075, "learning_rate": 3.9111111111111115e-07, "loss": 0.001, "step": 4825 }, { "epoch": 7.185322461082283, "grad_norm": 0.02842629700899124, "learning_rate": 3.3555555555555556e-07, "loss": 0.0009, "step": 4850 }, { "epoch": 7.22238695329874, "grad_norm": 0.04749394953250885, "learning_rate": 2.8e-07, "loss": 0.001, "step": 4875 }, { "epoch": 7.259451445515197, "grad_norm": 0.04491546377539635, "learning_rate": 2.2444444444444445e-07, "loss": 0.001, "step": 4900 }, { "epoch": 7.2965159377316535, "grad_norm": 0.056013334542512894, "learning_rate": 1.6888888888888888e-07, "loss": 0.001, "step": 4925 }, { "epoch": 7.3335804299481095, "grad_norm": 0.057778194546699524, "learning_rate": 1.1333333333333336e-07, "loss": 0.0011, "step": 4950 }, { "epoch": 7.3706449221645665, "grad_norm": 0.051241885870695114, "learning_rate": 5.777777777777778e-08, "loss": 0.0011, "step": 4975 }, { "epoch": 7.407709414381023, "grad_norm": 0.06301814317703247, "learning_rate": 2.2222222222222225e-09, "loss": 0.0009, "step": 5000 }, { "epoch": 7.407709414381023, "eval_loss": 0.29011788964271545, "eval_runtime": 732.4342, "eval_samples_per_second": 3.939, "eval_steps_per_second": 0.493, "eval_wer": 0.1146545827633379, "step": 5000 }, { "epoch": 7.407709414381023, "step": 5000, "total_flos": 8.155551755501568e+19, "train_loss": 0.10907779041565954, "train_runtime": 12394.4337, "train_samples_per_second": 6.455, "train_steps_per_second": 0.403 } ], "logging_steps": 25, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.155551755501568e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }