{ "best_global_step": 4000, "best_metric": 2.1779719650083877, "best_model_checkpoint": "./SALAMA_C4/checkpoint-4000", "epoch": 2.7229407760381212, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013614703880190605, "grad_norm": 6.790204048156738, "learning_rate": 6.333333333333334e-07, "loss": 0.1302, "step": 20 }, { "epoch": 0.02722940776038121, "grad_norm": 6.674442768096924, "learning_rate": 1.3e-06, "loss": 0.1286, "step": 40 }, { "epoch": 0.04084411164057182, "grad_norm": 6.946224689483643, "learning_rate": 1.9666666666666668e-06, "loss": 0.1394, "step": 60 }, { "epoch": 0.05445881552076242, "grad_norm": 8.382658004760742, "learning_rate": 2.6333333333333332e-06, "loss": 0.1267, "step": 80 }, { "epoch": 0.06807351940095303, "grad_norm": 5.505548477172852, "learning_rate": 3.3000000000000006e-06, "loss": 0.1, "step": 100 }, { "epoch": 0.08168822328114364, "grad_norm": 3.323385715484619, "learning_rate": 3.966666666666667e-06, "loss": 0.0979, "step": 120 }, { "epoch": 0.09530292716133425, "grad_norm": 3.4979546070098877, "learning_rate": 4.633333333333334e-06, "loss": 0.0934, "step": 140 }, { "epoch": 0.10891763104152484, "grad_norm": 5.154605865478516, "learning_rate": 5.300000000000001e-06, "loss": 0.1039, "step": 160 }, { "epoch": 0.12253233492171545, "grad_norm": 5.488887786865234, "learning_rate": 5.966666666666667e-06, "loss": 0.1029, "step": 180 }, { "epoch": 0.13614703880190607, "grad_norm": 3.0190138816833496, "learning_rate": 6.633333333333334e-06, "loss": 0.1151, "step": 200 }, { "epoch": 0.14976174268209666, "grad_norm": 4.113169193267822, "learning_rate": 7.3e-06, "loss": 0.0936, "step": 220 }, { "epoch": 0.16337644656228728, "grad_norm": 6.0397515296936035, "learning_rate": 7.966666666666668e-06, "loss": 0.1214, "step": 240 }, { "epoch": 0.17699115044247787, "grad_norm": 6.033300399780273, "learning_rate": 8.633333333333334e-06, "loss": 0.1153, "step": 260 }, { "epoch": 0.1906058543226685, "grad_norm": 3.8764054775238037, "learning_rate": 9.3e-06, "loss": 0.1025, "step": 280 }, { "epoch": 0.2042205582028591, "grad_norm": 5.634373188018799, "learning_rate": 9.966666666666667e-06, "loss": 0.12, "step": 300 }, { "epoch": 0.21783526208304968, "grad_norm": 4.584943771362305, "learning_rate": 9.973030518097942e-06, "loss": 0.1099, "step": 320 }, { "epoch": 0.2314499659632403, "grad_norm": 5.086080551147461, "learning_rate": 9.944641589779987e-06, "loss": 0.1267, "step": 340 }, { "epoch": 0.2450646698434309, "grad_norm": 5.2667741775512695, "learning_rate": 9.91625266146203e-06, "loss": 0.1059, "step": 360 }, { "epoch": 0.2586793737236215, "grad_norm": 5.114360809326172, "learning_rate": 9.887863733144075e-06, "loss": 0.1294, "step": 380 }, { "epoch": 0.27229407760381213, "grad_norm": 6.17408561706543, "learning_rate": 9.859474804826119e-06, "loss": 0.12, "step": 400 }, { "epoch": 0.2859087814840027, "grad_norm": 4.034195899963379, "learning_rate": 9.831085876508163e-06, "loss": 0.1082, "step": 420 }, { "epoch": 0.2995234853641933, "grad_norm": 4.708987236022949, "learning_rate": 9.802696948190207e-06, "loss": 0.1041, "step": 440 }, { "epoch": 0.3131381892443839, "grad_norm": 3.7173264026641846, "learning_rate": 9.774308019872251e-06, "loss": 0.115, "step": 460 }, { "epoch": 0.32675289312457456, "grad_norm": 4.3770647048950195, "learning_rate": 9.745919091554295e-06, "loss": 0.1202, "step": 480 }, { "epoch": 0.34036759700476515, "grad_norm": 3.7374465465545654, "learning_rate": 9.717530163236339e-06, "loss": 0.1008, "step": 500 }, { "epoch": 0.35398230088495575, "grad_norm": 3.801574945449829, "learning_rate": 9.689141234918381e-06, "loss": 0.1171, "step": 520 }, { "epoch": 0.36759700476514634, "grad_norm": 3.962735176086426, "learning_rate": 9.660752306600427e-06, "loss": 0.0997, "step": 540 }, { "epoch": 0.381211708645337, "grad_norm": 4.948540687561035, "learning_rate": 9.632363378282471e-06, "loss": 0.1304, "step": 560 }, { "epoch": 0.3948264125255276, "grad_norm": 5.182698726654053, "learning_rate": 9.603974449964515e-06, "loss": 0.1059, "step": 580 }, { "epoch": 0.4084411164057182, "grad_norm": 5.27463436126709, "learning_rate": 9.57558552164656e-06, "loss": 0.1379, "step": 600 }, { "epoch": 0.42205582028590877, "grad_norm": 4.754587650299072, "learning_rate": 9.547196593328602e-06, "loss": 0.1135, "step": 620 }, { "epoch": 0.43567052416609936, "grad_norm": 4.032769680023193, "learning_rate": 9.518807665010646e-06, "loss": 0.0925, "step": 640 }, { "epoch": 0.44928522804629, "grad_norm": Infinity, "learning_rate": 9.49041873669269e-06, "loss": 0.1764, "step": 660 }, { "epoch": 0.4628999319264806, "grad_norm": 12.64933967590332, "learning_rate": 9.467707594038326e-06, "loss": 0.5231, "step": 680 }, { "epoch": 0.4765146358066712, "grad_norm": 16.499448776245117, "learning_rate": 9.43931866572037e-06, "loss": 0.1525, "step": 700 }, { "epoch": 0.4901293396868618, "grad_norm": 2.7940587997436523, "learning_rate": 9.410929737402414e-06, "loss": 0.1635, "step": 720 }, { "epoch": 0.5037440435670524, "grad_norm": 5.6086039543151855, "learning_rate": 9.382540809084458e-06, "loss": 0.1177, "step": 740 }, { "epoch": 0.517358747447243, "grad_norm": 5.121674537658691, "learning_rate": 9.354151880766502e-06, "loss": 0.1185, "step": 760 }, { "epoch": 0.5309734513274337, "grad_norm": 5.849049091339111, "learning_rate": 9.325762952448546e-06, "loss": 0.1358, "step": 780 }, { "epoch": 0.5445881552076243, "grad_norm": 4.180930137634277, "learning_rate": 9.29737402413059e-06, "loss": 0.1192, "step": 800 }, { "epoch": 0.5582028590878149, "grad_norm": 5.0372796058654785, "learning_rate": 9.268985095812634e-06, "loss": 0.1194, "step": 820 }, { "epoch": 0.5718175629680055, "grad_norm": 3.9500577449798584, "learning_rate": 9.240596167494677e-06, "loss": 0.1026, "step": 840 }, { "epoch": 0.585432266848196, "grad_norm": 5.2215800285339355, "learning_rate": 9.212207239176721e-06, "loss": 0.1227, "step": 860 }, { "epoch": 0.5990469707283866, "grad_norm": 7.128079414367676, "learning_rate": 9.183818310858765e-06, "loss": 0.1296, "step": 880 }, { "epoch": 0.6126616746085772, "grad_norm": 4.189157485961914, "learning_rate": 9.15542938254081e-06, "loss": 0.1092, "step": 900 }, { "epoch": 0.6262763784887678, "grad_norm": 4.203508377075195, "learning_rate": 9.127040454222855e-06, "loss": 0.1067, "step": 920 }, { "epoch": 0.6398910823689585, "grad_norm": 5.561427593231201, "learning_rate": 9.098651525904897e-06, "loss": 0.1156, "step": 940 }, { "epoch": 0.6535057862491491, "grad_norm": 2.9789481163024902, "learning_rate": 9.070262597586941e-06, "loss": 0.1144, "step": 960 }, { "epoch": 0.6671204901293397, "grad_norm": 6.220657825469971, "learning_rate": 9.041873669268985e-06, "loss": 0.121, "step": 980 }, { "epoch": 0.6807351940095303, "grad_norm": 5.125210285186768, "learning_rate": 9.01348474095103e-06, "loss": 0.1073, "step": 1000 }, { "epoch": 0.6943498978897209, "grad_norm": 4.665761947631836, "learning_rate": 8.985095812633073e-06, "loss": 0.1032, "step": 1020 }, { "epoch": 0.7079646017699115, "grad_norm": 3.9673075675964355, "learning_rate": 8.956706884315118e-06, "loss": 0.1133, "step": 1040 }, { "epoch": 0.7215793056501021, "grad_norm": 4.1961140632629395, "learning_rate": 8.928317955997162e-06, "loss": 0.1008, "step": 1060 }, { "epoch": 0.7351940095302927, "grad_norm": 6.15500545501709, "learning_rate": 8.899929027679206e-06, "loss": 0.1041, "step": 1080 }, { "epoch": 0.7488087134104833, "grad_norm": 4.683866024017334, "learning_rate": 8.87154009936125e-06, "loss": 0.1086, "step": 1100 }, { "epoch": 0.762423417290674, "grad_norm": 8.679821968078613, "learning_rate": 8.843151171043294e-06, "loss": 0.1049, "step": 1120 }, { "epoch": 0.7760381211708646, "grad_norm": 4.2732038497924805, "learning_rate": 8.814762242725338e-06, "loss": 0.1013, "step": 1140 }, { "epoch": 0.7896528250510552, "grad_norm": 9.020421028137207, "learning_rate": 8.786373314407382e-06, "loss": 0.101, "step": 1160 }, { "epoch": 0.8032675289312458, "grad_norm": 6.960070610046387, "learning_rate": 8.757984386089426e-06, "loss": 0.1067, "step": 1180 }, { "epoch": 0.8168822328114363, "grad_norm": 3.1116604804992676, "learning_rate": 8.72959545777147e-06, "loss": 0.099, "step": 1200 }, { "epoch": 0.8304969366916269, "grad_norm": 3.949913740158081, "learning_rate": 8.701206529453514e-06, "loss": 0.1068, "step": 1220 }, { "epoch": 0.8441116405718175, "grad_norm": 4.617140293121338, "learning_rate": 8.672817601135558e-06, "loss": 0.1237, "step": 1240 }, { "epoch": 0.8577263444520081, "grad_norm": 6.639930248260498, "learning_rate": 8.644428672817602e-06, "loss": 0.1295, "step": 1260 }, { "epoch": 0.8713410483321987, "grad_norm": 5.353048324584961, "learning_rate": 8.616039744499646e-06, "loss": 0.1088, "step": 1280 }, { "epoch": 0.8849557522123894, "grad_norm": 7.117270469665527, "learning_rate": 8.587650816181689e-06, "loss": 0.112, "step": 1300 }, { "epoch": 0.89857045609258, "grad_norm": 5.29574728012085, "learning_rate": 8.559261887863733e-06, "loss": 0.1188, "step": 1320 }, { "epoch": 0.9121851599727706, "grad_norm": 7.008860111236572, "learning_rate": 8.530872959545777e-06, "loss": 0.1076, "step": 1340 }, { "epoch": 0.9257998638529612, "grad_norm": 5.290408134460449, "learning_rate": 8.502484031227823e-06, "loss": 0.1339, "step": 1360 }, { "epoch": 0.9394145677331518, "grad_norm": 4.095961093902588, "learning_rate": 8.474095102909867e-06, "loss": 0.111, "step": 1380 }, { "epoch": 0.9530292716133424, "grad_norm": 4.4582085609436035, "learning_rate": 8.44570617459191e-06, "loss": 0.1079, "step": 1400 }, { "epoch": 0.966643975493533, "grad_norm": 3.397148370742798, "learning_rate": 8.417317246273953e-06, "loss": 0.0989, "step": 1420 }, { "epoch": 0.9802586793737236, "grad_norm": 4.60284948348999, "learning_rate": 8.388928317955997e-06, "loss": 0.1205, "step": 1440 }, { "epoch": 0.9938733832539143, "grad_norm": 4.263270378112793, "learning_rate": 8.360539389638041e-06, "loss": 0.0981, "step": 1460 }, { "epoch": 1.0074880871341048, "grad_norm": 2.734585762023926, "learning_rate": 8.332150461320085e-06, "loss": 0.0687, "step": 1480 }, { "epoch": 1.0211027910142954, "grad_norm": 4.107769966125488, "learning_rate": 8.30376153300213e-06, "loss": 0.0483, "step": 1500 }, { "epoch": 1.034717494894486, "grad_norm": 3.7158114910125732, "learning_rate": 8.275372604684174e-06, "loss": 0.0523, "step": 1520 }, { "epoch": 1.0483321987746765, "grad_norm": 3.725184917449951, "learning_rate": 8.246983676366218e-06, "loss": 0.0438, "step": 1540 }, { "epoch": 1.0619469026548674, "grad_norm": 2.794508218765259, "learning_rate": 8.218594748048262e-06, "loss": 0.0534, "step": 1560 }, { "epoch": 1.075561606535058, "grad_norm": 3.6485531330108643, "learning_rate": 8.190205819730306e-06, "loss": 0.0443, "step": 1580 }, { "epoch": 1.0891763104152485, "grad_norm": 2.9312281608581543, "learning_rate": 8.16181689141235e-06, "loss": 0.0426, "step": 1600 }, { "epoch": 1.1027910142954391, "grad_norm": 3.66829514503479, "learning_rate": 8.133427963094394e-06, "loss": 0.0558, "step": 1620 }, { "epoch": 1.1164057181756297, "grad_norm": 2.831806182861328, "learning_rate": 8.105039034776438e-06, "loss": 0.0504, "step": 1640 }, { "epoch": 1.1300204220558203, "grad_norm": 4.0393171310424805, "learning_rate": 8.076650106458482e-06, "loss": 0.0447, "step": 1660 }, { "epoch": 1.143635125936011, "grad_norm": 3.919926643371582, "learning_rate": 8.048261178140526e-06, "loss": 0.0432, "step": 1680 }, { "epoch": 1.1572498298162015, "grad_norm": 2.3747353553771973, "learning_rate": 8.01987224982257e-06, "loss": 0.0495, "step": 1700 }, { "epoch": 1.170864533696392, "grad_norm": 2.7838926315307617, "learning_rate": 7.991483321504614e-06, "loss": 0.0477, "step": 1720 }, { "epoch": 1.1844792375765827, "grad_norm": 4.423709869384766, "learning_rate": 7.963094393186658e-06, "loss": 0.0511, "step": 1740 }, { "epoch": 1.1980939414567733, "grad_norm": 3.6399760246276855, "learning_rate": 7.9347054648687e-06, "loss": 0.0512, "step": 1760 }, { "epoch": 1.2117086453369639, "grad_norm": 4.124867916107178, "learning_rate": 7.906316536550745e-06, "loss": 0.0503, "step": 1780 }, { "epoch": 1.2253233492171545, "grad_norm": 2.90708589553833, "learning_rate": 7.877927608232789e-06, "loss": 0.0428, "step": 1800 }, { "epoch": 1.238938053097345, "grad_norm": 3.0257723331451416, "learning_rate": 7.849538679914835e-06, "loss": 0.0505, "step": 1820 }, { "epoch": 1.2525527569775359, "grad_norm": 3.9913330078125, "learning_rate": 7.821149751596879e-06, "loss": 0.051, "step": 1840 }, { "epoch": 1.2661674608577265, "grad_norm": 3.413067102432251, "learning_rate": 7.792760823278921e-06, "loss": 0.0566, "step": 1860 }, { "epoch": 1.279782164737917, "grad_norm": 2.3691389560699463, "learning_rate": 7.764371894960965e-06, "loss": 0.0387, "step": 1880 }, { "epoch": 1.2933968686181077, "grad_norm": 2.782249689102173, "learning_rate": 7.73598296664301e-06, "loss": 0.0548, "step": 1900 }, { "epoch": 1.3070115724982982, "grad_norm": 2.8736960887908936, "learning_rate": 7.707594038325053e-06, "loss": 0.0512, "step": 1920 }, { "epoch": 1.3206262763784888, "grad_norm": 2.5277926921844482, "learning_rate": 7.679205110007098e-06, "loss": 0.0449, "step": 1940 }, { "epoch": 1.3342409802586794, "grad_norm": 3.7290806770324707, "learning_rate": 7.650816181689142e-06, "loss": 0.052, "step": 1960 }, { "epoch": 1.34785568413887, "grad_norm": 3.4079151153564453, "learning_rate": 7.6224272533711865e-06, "loss": 0.0459, "step": 1980 }, { "epoch": 1.3614703880190606, "grad_norm": 2.7560651302337646, "learning_rate": 7.59403832505323e-06, "loss": 0.0536, "step": 2000 }, { "epoch": 1.3614703880190606, "eval_loss": 0.05250042304396629, "eval_runtime": 4762.58, "eval_samples_per_second": 2.467, "eval_steps_per_second": 0.308, "eval_wer": 3.935134725289306, "step": 2000 }, { "epoch": 1.3750850918992512, "grad_norm": 2.759082794189453, "learning_rate": 7.565649396735274e-06, "loss": 0.0465, "step": 2020 }, { "epoch": 1.3886997957794418, "grad_norm": 3.005037307739258, "learning_rate": 7.537260468417318e-06, "loss": 0.044, "step": 2040 }, { "epoch": 1.4023144996596324, "grad_norm": 1.479257345199585, "learning_rate": 7.508871540099362e-06, "loss": 0.0439, "step": 2060 }, { "epoch": 1.415929203539823, "grad_norm": 3.5328667163848877, "learning_rate": 7.480482611781405e-06, "loss": 0.0489, "step": 2080 }, { "epoch": 1.4295439074200136, "grad_norm": 4.166772842407227, "learning_rate": 7.452093683463449e-06, "loss": 0.047, "step": 2100 }, { "epoch": 1.4431586113002042, "grad_norm": 6.129347801208496, "learning_rate": 7.423704755145494e-06, "loss": 0.0417, "step": 2120 }, { "epoch": 1.4567733151803948, "grad_norm": 3.5741307735443115, "learning_rate": 7.395315826827538e-06, "loss": 0.0531, "step": 2140 }, { "epoch": 1.4703880190605854, "grad_norm": 2.9267029762268066, "learning_rate": 7.366926898509582e-06, "loss": 0.0413, "step": 2160 }, { "epoch": 1.484002722940776, "grad_norm": 5.829305648803711, "learning_rate": 7.3385379701916255e-06, "loss": 0.056, "step": 2180 }, { "epoch": 1.4976174268209665, "grad_norm": 5.405384540557861, "learning_rate": 7.31014904187367e-06, "loss": 0.0466, "step": 2200 }, { "epoch": 1.5112321307011571, "grad_norm": 2.835297107696533, "learning_rate": 7.281760113555714e-06, "loss": 0.0488, "step": 2220 }, { "epoch": 1.5248468345813477, "grad_norm": 3.75079607963562, "learning_rate": 7.253371185237758e-06, "loss": 0.0422, "step": 2240 }, { "epoch": 1.5384615384615383, "grad_norm": 3.934020519256592, "learning_rate": 7.224982256919801e-06, "loss": 0.0554, "step": 2260 }, { "epoch": 1.552076242341729, "grad_norm": 2.9037790298461914, "learning_rate": 7.196593328601847e-06, "loss": 0.0393, "step": 2280 }, { "epoch": 1.5656909462219195, "grad_norm": 3.191974639892578, "learning_rate": 7.16820440028389e-06, "loss": 0.0477, "step": 2300 }, { "epoch": 1.5793056501021103, "grad_norm": 3.6773743629455566, "learning_rate": 7.139815471965934e-06, "loss": 0.0443, "step": 2320 }, { "epoch": 1.592920353982301, "grad_norm": 3.3054301738739014, "learning_rate": 7.111426543647978e-06, "loss": 0.0516, "step": 2340 }, { "epoch": 1.6065350578624915, "grad_norm": 3.2623848915100098, "learning_rate": 7.083037615330021e-06, "loss": 0.0449, "step": 2360 }, { "epoch": 1.620149761742682, "grad_norm": 3.678145408630371, "learning_rate": 7.0546486870120654e-06, "loss": 0.0421, "step": 2380 }, { "epoch": 1.6337644656228727, "grad_norm": 1.8990530967712402, "learning_rate": 7.0262597586941095e-06, "loss": 0.0451, "step": 2400 }, { "epoch": 1.6473791695030633, "grad_norm": 3.1517415046691895, "learning_rate": 6.997870830376154e-06, "loss": 0.0444, "step": 2420 }, { "epoch": 1.6609938733832539, "grad_norm": 3.7715275287628174, "learning_rate": 6.9694819020581985e-06, "loss": 0.0478, "step": 2440 }, { "epoch": 1.6746085772634445, "grad_norm": 3.123321533203125, "learning_rate": 6.9410929737402426e-06, "loss": 0.0492, "step": 2460 }, { "epoch": 1.6882232811436353, "grad_norm": 3.15405535697937, "learning_rate": 6.912704045422286e-06, "loss": 0.0478, "step": 2480 }, { "epoch": 1.7018379850238259, "grad_norm": 3.2529942989349365, "learning_rate": 6.88431511710433e-06, "loss": 0.0478, "step": 2500 }, { "epoch": 1.7154526889040165, "grad_norm": 5.611981391906738, "learning_rate": 6.855926188786374e-06, "loss": 0.0516, "step": 2520 }, { "epoch": 1.729067392784207, "grad_norm": 2.6857259273529053, "learning_rate": 6.827537260468417e-06, "loss": 0.0416, "step": 2540 }, { "epoch": 1.7426820966643977, "grad_norm": 3.303100824356079, "learning_rate": 6.799148332150461e-06, "loss": 0.0431, "step": 2560 }, { "epoch": 1.7562968005445883, "grad_norm": 3.7506537437438965, "learning_rate": 6.770759403832506e-06, "loss": 0.0555, "step": 2580 }, { "epoch": 1.7699115044247788, "grad_norm": 2.169769048690796, "learning_rate": 6.74237047551455e-06, "loss": 0.0449, "step": 2600 }, { "epoch": 1.7835262083049694, "grad_norm": 2.864407539367676, "learning_rate": 6.713981547196594e-06, "loss": 0.045, "step": 2620 }, { "epoch": 1.79714091218516, "grad_norm": 2.529905319213867, "learning_rate": 6.685592618878638e-06, "loss": 0.0598, "step": 2640 }, { "epoch": 1.8107556160653506, "grad_norm": 3.385667324066162, "learning_rate": 6.657203690560682e-06, "loss": 0.05, "step": 2660 }, { "epoch": 1.8243703199455412, "grad_norm": 4.172725677490234, "learning_rate": 6.628814762242726e-06, "loss": 0.0473, "step": 2680 }, { "epoch": 1.8379850238257318, "grad_norm": 2.7874386310577393, "learning_rate": 6.60042583392477e-06, "loss": 0.0541, "step": 2700 }, { "epoch": 1.8515997277059224, "grad_norm": 2.623189926147461, "learning_rate": 6.572036905606813e-06, "loss": 0.0486, "step": 2720 }, { "epoch": 1.865214431586113, "grad_norm": 3.5073838233947754, "learning_rate": 6.543647977288859e-06, "loss": 0.0496, "step": 2740 }, { "epoch": 1.8788291354663036, "grad_norm": 2.545915126800537, "learning_rate": 6.515259048970902e-06, "loss": 0.05, "step": 2760 }, { "epoch": 1.8924438393464942, "grad_norm": 3.92095947265625, "learning_rate": 6.486870120652946e-06, "loss": 0.0576, "step": 2780 }, { "epoch": 1.9060585432266848, "grad_norm": 3.5295567512512207, "learning_rate": 6.45848119233499e-06, "loss": 0.0497, "step": 2800 }, { "epoch": 1.9196732471068754, "grad_norm": 2.9580278396606445, "learning_rate": 6.430092264017034e-06, "loss": 0.0487, "step": 2820 }, { "epoch": 1.933287950987066, "grad_norm": 2.3317949771881104, "learning_rate": 6.4017033356990774e-06, "loss": 0.0457, "step": 2840 }, { "epoch": 1.9469026548672566, "grad_norm": 2.8551571369171143, "learning_rate": 6.3733144073811215e-06, "loss": 0.0507, "step": 2860 }, { "epoch": 1.9605173587474471, "grad_norm": 3.363938093185425, "learning_rate": 6.344925479063166e-06, "loss": 0.0403, "step": 2880 }, { "epoch": 1.9741320626276377, "grad_norm": 3.0591518878936768, "learning_rate": 6.3165365507452105e-06, "loss": 0.0532, "step": 2900 }, { "epoch": 1.9877467665078283, "grad_norm": 3.6997368335723877, "learning_rate": 6.288147622427255e-06, "loss": 0.0572, "step": 2920 }, { "epoch": 2.001361470388019, "grad_norm": 1.2867134809494019, "learning_rate": 6.259758694109298e-06, "loss": 0.0393, "step": 2940 }, { "epoch": 2.0149761742682095, "grad_norm": 2.620507001876831, "learning_rate": 6.231369765791342e-06, "loss": 0.0201, "step": 2960 }, { "epoch": 2.0285908781484, "grad_norm": 1.7586274147033691, "learning_rate": 6.202980837473386e-06, "loss": 0.0164, "step": 2980 }, { "epoch": 2.0422055820285907, "grad_norm": 2.2848572731018066, "learning_rate": 6.17459190915543e-06, "loss": 0.0194, "step": 3000 }, { "epoch": 2.0558202859087813, "grad_norm": 1.6622294187545776, "learning_rate": 6.146202980837473e-06, "loss": 0.0164, "step": 3020 }, { "epoch": 2.069434989788972, "grad_norm": 3.2560629844665527, "learning_rate": 6.117814052519518e-06, "loss": 0.0184, "step": 3040 }, { "epoch": 2.0830496936691625, "grad_norm": 1.6647343635559082, "learning_rate": 6.089425124201562e-06, "loss": 0.014, "step": 3060 }, { "epoch": 2.096664397549353, "grad_norm": 1.7806779146194458, "learning_rate": 6.061036195883606e-06, "loss": 0.0172, "step": 3080 }, { "epoch": 2.110279101429544, "grad_norm": 1.9715111255645752, "learning_rate": 6.03264726756565e-06, "loss": 0.017, "step": 3100 }, { "epoch": 2.1238938053097347, "grad_norm": 2.1102495193481445, "learning_rate": 6.004258339247694e-06, "loss": 0.0154, "step": 3120 }, { "epoch": 2.1375085091899253, "grad_norm": 2.978144407272339, "learning_rate": 5.975869410929738e-06, "loss": 0.0227, "step": 3140 }, { "epoch": 2.151123213070116, "grad_norm": 1.914223074913025, "learning_rate": 5.947480482611782e-06, "loss": 0.0144, "step": 3160 }, { "epoch": 2.1647379169503065, "grad_norm": 2.0948495864868164, "learning_rate": 5.919091554293826e-06, "loss": 0.0161, "step": 3180 }, { "epoch": 2.178352620830497, "grad_norm": 2.937959909439087, "learning_rate": 5.890702625975871e-06, "loss": 0.0177, "step": 3200 }, { "epoch": 2.1919673247106877, "grad_norm": 0.9957959651947021, "learning_rate": 5.862313697657914e-06, "loss": 0.0165, "step": 3220 }, { "epoch": 2.2055820285908783, "grad_norm": 1.2504860162734985, "learning_rate": 5.833924769339958e-06, "loss": 0.0169, "step": 3240 }, { "epoch": 2.219196732471069, "grad_norm": 2.532622814178467, "learning_rate": 5.805535841022002e-06, "loss": 0.0193, "step": 3260 }, { "epoch": 2.2328114363512594, "grad_norm": 2.0159101486206055, "learning_rate": 5.777146912704046e-06, "loss": 0.0189, "step": 3280 }, { "epoch": 2.24642614023145, "grad_norm": 2.19897198677063, "learning_rate": 5.7487579843860894e-06, "loss": 0.0195, "step": 3300 }, { "epoch": 2.2600408441116406, "grad_norm": 1.4231209754943848, "learning_rate": 5.7203690560681335e-06, "loss": 0.0185, "step": 3320 }, { "epoch": 2.2736555479918312, "grad_norm": 1.622592568397522, "learning_rate": 5.691980127750178e-06, "loss": 0.0176, "step": 3340 }, { "epoch": 2.287270251872022, "grad_norm": 1.630968451499939, "learning_rate": 5.6635911994322225e-06, "loss": 0.0179, "step": 3360 }, { "epoch": 2.3008849557522124, "grad_norm": 1.7456097602844238, "learning_rate": 5.635202271114267e-06, "loss": 0.0182, "step": 3380 }, { "epoch": 2.314499659632403, "grad_norm": 3.1690380573272705, "learning_rate": 5.60681334279631e-06, "loss": 0.0149, "step": 3400 }, { "epoch": 2.3281143635125936, "grad_norm": 2.187356472015381, "learning_rate": 5.578424414478354e-06, "loss": 0.0182, "step": 3420 }, { "epoch": 2.341729067392784, "grad_norm": 1.772560954093933, "learning_rate": 5.550035486160398e-06, "loss": 0.022, "step": 3440 }, { "epoch": 2.355343771272975, "grad_norm": 1.4121778011322021, "learning_rate": 5.521646557842442e-06, "loss": 0.0184, "step": 3460 }, { "epoch": 2.3689584751531654, "grad_norm": 2.99052357673645, "learning_rate": 5.493257629524485e-06, "loss": 0.0169, "step": 3480 }, { "epoch": 2.382573179033356, "grad_norm": 1.1008409261703491, "learning_rate": 5.464868701206529e-06, "loss": 0.0139, "step": 3500 }, { "epoch": 2.3961878829135466, "grad_norm": 1.6258739233016968, "learning_rate": 5.436479772888574e-06, "loss": 0.0132, "step": 3520 }, { "epoch": 2.409802586793737, "grad_norm": 2.573024034500122, "learning_rate": 5.408090844570618e-06, "loss": 0.0145, "step": 3540 }, { "epoch": 2.4234172906739277, "grad_norm": 1.7651166915893555, "learning_rate": 5.379701916252662e-06, "loss": 0.018, "step": 3560 }, { "epoch": 2.4370319945541183, "grad_norm": 1.000501036643982, "learning_rate": 5.351312987934706e-06, "loss": 0.0174, "step": 3580 }, { "epoch": 2.450646698434309, "grad_norm": 1.9365277290344238, "learning_rate": 5.32292405961675e-06, "loss": 0.0239, "step": 3600 }, { "epoch": 2.4642614023144995, "grad_norm": 1.6012334823608398, "learning_rate": 5.294535131298794e-06, "loss": 0.0158, "step": 3620 }, { "epoch": 2.47787610619469, "grad_norm": 1.9519362449645996, "learning_rate": 5.266146202980838e-06, "loss": 0.0172, "step": 3640 }, { "epoch": 2.4914908100748807, "grad_norm": 1.6032203435897827, "learning_rate": 5.237757274662883e-06, "loss": 0.0168, "step": 3660 }, { "epoch": 2.5051055139550717, "grad_norm": 1.5285481214523315, "learning_rate": 5.209368346344926e-06, "loss": 0.0197, "step": 3680 }, { "epoch": 2.518720217835262, "grad_norm": 0.9293742179870605, "learning_rate": 5.18097941802697e-06, "loss": 0.0169, "step": 3700 }, { "epoch": 2.532334921715453, "grad_norm": 1.169976830482483, "learning_rate": 5.152590489709014e-06, "loss": 0.0173, "step": 3720 }, { "epoch": 2.545949625595643, "grad_norm": 1.4823977947235107, "learning_rate": 5.124201561391058e-06, "loss": 0.0166, "step": 3740 }, { "epoch": 2.559564329475834, "grad_norm": 0.9930800199508667, "learning_rate": 5.0958126330731014e-06, "loss": 0.0179, "step": 3760 }, { "epoch": 2.5731790333560243, "grad_norm": 1.3004252910614014, "learning_rate": 5.0674237047551455e-06, "loss": 0.0189, "step": 3780 }, { "epoch": 2.5867937372362153, "grad_norm": 2.19750714302063, "learning_rate": 5.03903477643719e-06, "loss": 0.0181, "step": 3800 }, { "epoch": 2.6004084411164055, "grad_norm": 2.087999105453491, "learning_rate": 5.0106458481192345e-06, "loss": 0.0151, "step": 3820 }, { "epoch": 2.6140231449965965, "grad_norm": 1.2687196731567383, "learning_rate": 4.982256919801278e-06, "loss": 0.018, "step": 3840 }, { "epoch": 2.627637848876787, "grad_norm": 1.2243746519088745, "learning_rate": 4.953867991483322e-06, "loss": 0.0173, "step": 3860 }, { "epoch": 2.6412525527569777, "grad_norm": 0.9616296291351318, "learning_rate": 4.925479063165366e-06, "loss": 0.02, "step": 3880 }, { "epoch": 2.6548672566371683, "grad_norm": 0.7424539923667908, "learning_rate": 4.89709013484741e-06, "loss": 0.0143, "step": 3900 }, { "epoch": 2.668481960517359, "grad_norm": 2.386852979660034, "learning_rate": 4.868701206529454e-06, "loss": 0.0165, "step": 3920 }, { "epoch": 2.6820966643975495, "grad_norm": 1.4014451503753662, "learning_rate": 4.840312278211498e-06, "loss": 0.0231, "step": 3940 }, { "epoch": 2.69571136827774, "grad_norm": 1.2173271179199219, "learning_rate": 4.811923349893542e-06, "loss": 0.0181, "step": 3960 }, { "epoch": 2.7093260721579306, "grad_norm": 3.268631935119629, "learning_rate": 4.783534421575585e-06, "loss": 0.0179, "step": 3980 }, { "epoch": 2.7229407760381212, "grad_norm": 1.9503110647201538, "learning_rate": 4.7551454932576295e-06, "loss": 0.0154, "step": 4000 }, { "epoch": 2.7229407760381212, "eval_loss": 0.03271039202809334, "eval_runtime": 4672.049, "eval_samples_per_second": 2.515, "eval_steps_per_second": 0.314, "eval_wer": 2.1779719650083877, "step": 4000 } ], "logging_steps": 20, "max_steps": 7345, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.693258236215296e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }