{ "best_global_step": 2100, "best_metric": 1.0858707427978516, "best_model_checkpoint": "./outputs/checkpoint-2100", "epoch": 0.16188870151770657, "eval_steps": 100, "global_step": 2100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015417971573114913, "grad_norm": 1.2087944746017456, "learning_rate": 2.0000000000000003e-06, "loss": 1.8689, "step": 2 }, { "epoch": 0.00030835943146229826, "grad_norm": 1.2666666507720947, "learning_rate": 6e-06, "loss": 1.7785, "step": 4 }, { "epoch": 0.00046253914719344736, "grad_norm": 0.7307026982307434, "learning_rate": 1e-05, "loss": 1.6809, "step": 6 }, { "epoch": 0.0006167188629245965, "grad_norm": 1.2569252252578735, "learning_rate": 1.4000000000000001e-05, "loss": 1.9048, "step": 8 }, { "epoch": 0.0007708985786557456, "grad_norm": 0.9572980403900146, "learning_rate": 1.8e-05, "loss": 1.7574, "step": 10 }, { "epoch": 0.0009250782943868947, "grad_norm": 0.9918506145477295, "learning_rate": 2.2000000000000003e-05, "loss": 1.858, "step": 12 }, { "epoch": 0.0010792580101180438, "grad_norm": 0.9316955208778381, "learning_rate": 2.6000000000000002e-05, "loss": 1.8238, "step": 14 }, { "epoch": 0.001233437725849193, "grad_norm": 0.8265096545219421, "learning_rate": 3e-05, "loss": 1.6852, "step": 16 }, { "epoch": 0.001387617441580342, "grad_norm": 0.900516152381897, "learning_rate": 3.4000000000000007e-05, "loss": 1.8227, "step": 18 }, { "epoch": 0.0015417971573114912, "grad_norm": 0.9343056678771973, "learning_rate": 3.8e-05, "loss": 1.7732, "step": 20 }, { "epoch": 0.0016959768730426404, "grad_norm": 0.8314495086669922, "learning_rate": 4.2e-05, "loss": 1.732, "step": 22 }, { "epoch": 0.0018501565887737894, "grad_norm": 0.8370314240455627, "learning_rate": 4.600000000000001e-05, "loss": 1.6725, "step": 24 }, { "epoch": 0.0020043363045049384, "grad_norm": 0.6678845286369324, "learning_rate": 5e-05, "loss": 1.5638, "step": 26 }, { "epoch": 0.0021585160202360876, "grad_norm": 0.6469596028327942, "learning_rate": 5.4000000000000005e-05, "loss": 1.6414, "step": 28 }, { "epoch": 0.002312695735967237, "grad_norm": 1.1161589622497559, "learning_rate": 5.8e-05, "loss": 1.6015, "step": 30 }, { "epoch": 0.002466875451698386, "grad_norm": 0.6085391044616699, "learning_rate": 6.2e-05, "loss": 1.4577, "step": 32 }, { "epoch": 0.0026210551674295353, "grad_norm": 0.7159522175788879, "learning_rate": 6.6e-05, "loss": 1.4667, "step": 34 }, { "epoch": 0.002775234883160684, "grad_norm": 0.67247074842453, "learning_rate": 7e-05, "loss": 1.5619, "step": 36 }, { "epoch": 0.0029294145988918332, "grad_norm": 0.6272625923156738, "learning_rate": 7.4e-05, "loss": 1.322, "step": 38 }, { "epoch": 0.0030835943146229824, "grad_norm": 0.7291163206100464, "learning_rate": 7.800000000000001e-05, "loss": 1.3936, "step": 40 }, { "epoch": 0.0032377740303541317, "grad_norm": 0.4980190396308899, "learning_rate": 8.2e-05, "loss": 1.3322, "step": 42 }, { "epoch": 0.003391953746085281, "grad_norm": 1.032578945159912, "learning_rate": 8.6e-05, "loss": 1.3657, "step": 44 }, { "epoch": 0.0035461334618164296, "grad_norm": 0.5118615031242371, "learning_rate": 9e-05, "loss": 1.2866, "step": 46 }, { "epoch": 0.003700313177547579, "grad_norm": 0.5234407782554626, "learning_rate": 9.4e-05, "loss": 1.2806, "step": 48 }, { "epoch": 0.003854492893278728, "grad_norm": 0.49764135479927063, "learning_rate": 9.8e-05, "loss": 1.2004, "step": 50 }, { "epoch": 0.004008672609009877, "grad_norm": 0.34377485513687134, "learning_rate": 0.00010200000000000001, "loss": 1.1947, "step": 52 }, { "epoch": 0.0041628523247410265, "grad_norm": 0.41426530480384827, "learning_rate": 0.00010600000000000002, "loss": 1.2689, "step": 54 }, { "epoch": 0.004317032040472175, "grad_norm": 0.5027992129325867, "learning_rate": 0.00011000000000000002, "loss": 1.2249, "step": 56 }, { "epoch": 0.004471211756203325, "grad_norm": 0.44335752725601196, "learning_rate": 0.00011399999999999999, "loss": 1.2771, "step": 58 }, { "epoch": 0.004625391471934474, "grad_norm": 0.3176646828651428, "learning_rate": 0.000118, "loss": 1.1873, "step": 60 }, { "epoch": 0.0047795711876656224, "grad_norm": 0.24802716076374054, "learning_rate": 0.000122, "loss": 1.1989, "step": 62 }, { "epoch": 0.004933750903396772, "grad_norm": 0.23831751942634583, "learning_rate": 0.000126, "loss": 1.1093, "step": 64 }, { "epoch": 0.005087930619127921, "grad_norm": 0.24024009704589844, "learning_rate": 0.00013000000000000002, "loss": 1.2196, "step": 66 }, { "epoch": 0.0052421103348590705, "grad_norm": 0.2745237350463867, "learning_rate": 0.000134, "loss": 1.1802, "step": 68 }, { "epoch": 0.005396290050590219, "grad_norm": 0.27817806601524353, "learning_rate": 0.000138, "loss": 1.1939, "step": 70 }, { "epoch": 0.005550469766321368, "grad_norm": 0.19907328486442566, "learning_rate": 0.000142, "loss": 1.2061, "step": 72 }, { "epoch": 0.005704649482052518, "grad_norm": 0.18879663944244385, "learning_rate": 0.000146, "loss": 1.2149, "step": 74 }, { "epoch": 0.0058588291977836665, "grad_norm": 0.21456782519817352, "learning_rate": 0.00015000000000000001, "loss": 1.1726, "step": 76 }, { "epoch": 0.006013008913514816, "grad_norm": 0.23913143575191498, "learning_rate": 0.000154, "loss": 1.148, "step": 78 }, { "epoch": 0.006167188629245965, "grad_norm": 0.2148526906967163, "learning_rate": 0.00015800000000000002, "loss": 1.1925, "step": 80 }, { "epoch": 0.006321368344977114, "grad_norm": 0.2392999231815338, "learning_rate": 0.000162, "loss": 1.1488, "step": 82 }, { "epoch": 0.006475548060708263, "grad_norm": 0.16503232717514038, "learning_rate": 0.000166, "loss": 1.1555, "step": 84 }, { "epoch": 0.006629727776439412, "grad_norm": 0.1844739466905594, "learning_rate": 0.00017, "loss": 1.1934, "step": 86 }, { "epoch": 0.006783907492170562, "grad_norm": 0.23832857608795166, "learning_rate": 0.000174, "loss": 1.1129, "step": 88 }, { "epoch": 0.0069380872079017105, "grad_norm": 0.8846365809440613, "learning_rate": 0.00017800000000000002, "loss": 1.1028, "step": 90 }, { "epoch": 0.007092266923632859, "grad_norm": 0.187076598405838, "learning_rate": 0.000182, "loss": 1.1, "step": 92 }, { "epoch": 0.007246446639364009, "grad_norm": 0.1795521378517151, "learning_rate": 0.00018600000000000002, "loss": 1.1478, "step": 94 }, { "epoch": 0.007400626355095158, "grad_norm": 0.199871227145195, "learning_rate": 0.00019, "loss": 1.1223, "step": 96 }, { "epoch": 0.007554806070826307, "grad_norm": 0.17832662165164948, "learning_rate": 0.000194, "loss": 1.0909, "step": 98 }, { "epoch": 0.007708985786557456, "grad_norm": 0.17023932933807373, "learning_rate": 0.00019800000000000002, "loss": 1.1526, "step": 100 }, { "epoch": 0.007708985786557456, "eval_loss": 1.1401352882385254, "eval_runtime": 185.6269, "eval_samples_per_second": 91.274, "eval_steps_per_second": 1.428, "step": 100 }, { "epoch": 0.007863165502288605, "grad_norm": 0.17429223656654358, "learning_rate": 0.00019999484748557298, "loss": 1.1597, "step": 102 }, { "epoch": 0.008017345218019754, "grad_norm": 0.16158349812030792, "learning_rate": 0.0001999845424567189, "loss": 1.1297, "step": 104 }, { "epoch": 0.008171524933750904, "grad_norm": 0.15818771719932556, "learning_rate": 0.0001999742374278648, "loss": 1.083, "step": 106 }, { "epoch": 0.008325704649482053, "grad_norm": 0.1591726392507553, "learning_rate": 0.00019996393239901073, "loss": 1.086, "step": 108 }, { "epoch": 0.008479884365213202, "grad_norm": 0.174184650182724, "learning_rate": 0.00019995362737015664, "loss": 1.0769, "step": 110 }, { "epoch": 0.00863406408094435, "grad_norm": 0.15928815305233002, "learning_rate": 0.00019994332234130258, "loss": 1.1315, "step": 112 }, { "epoch": 0.0087882437966755, "grad_norm": 0.19639264047145844, "learning_rate": 0.0001999330173124485, "loss": 1.1339, "step": 114 }, { "epoch": 0.00894242351240665, "grad_norm": 0.1639835238456726, "learning_rate": 0.0001999227122835944, "loss": 1.0836, "step": 116 }, { "epoch": 0.009096603228137799, "grad_norm": 0.18691964447498322, "learning_rate": 0.00019991240725474033, "loss": 1.2109, "step": 118 }, { "epoch": 0.009250782943868947, "grad_norm": 0.188096821308136, "learning_rate": 0.00019990210222588624, "loss": 1.1778, "step": 120 }, { "epoch": 0.009404962659600096, "grad_norm": 0.1527150571346283, "learning_rate": 0.00019989179719703218, "loss": 1.0977, "step": 122 }, { "epoch": 0.009559142375331245, "grad_norm": 0.1705218255519867, "learning_rate": 0.0001998814921681781, "loss": 1.1333, "step": 124 }, { "epoch": 0.009713322091062395, "grad_norm": 0.1888928860425949, "learning_rate": 0.00019987118713932401, "loss": 1.1843, "step": 126 }, { "epoch": 0.009867501806793544, "grad_norm": 0.1778104603290558, "learning_rate": 0.00019986088211046993, "loss": 1.0766, "step": 128 }, { "epoch": 0.010021681522524693, "grad_norm": 0.15807992219924927, "learning_rate": 0.00019985057708161584, "loss": 1.0449, "step": 130 }, { "epoch": 0.010175861238255842, "grad_norm": 0.16706159710884094, "learning_rate": 0.00019984027205276176, "loss": 1.0644, "step": 132 }, { "epoch": 0.01033004095398699, "grad_norm": 0.16455501317977905, "learning_rate": 0.00019982996702390767, "loss": 1.1479, "step": 134 }, { "epoch": 0.010484220669718141, "grad_norm": 0.17258939146995544, "learning_rate": 0.0001998196619950536, "loss": 1.0614, "step": 136 }, { "epoch": 0.01063840038544929, "grad_norm": 0.15501369535923004, "learning_rate": 0.0001998093569661995, "loss": 1.1045, "step": 138 }, { "epoch": 0.010792580101180439, "grad_norm": 0.1534334272146225, "learning_rate": 0.00019979905193734542, "loss": 1.1035, "step": 140 }, { "epoch": 0.010946759816911587, "grad_norm": 0.14120443165302277, "learning_rate": 0.00019978874690849136, "loss": 1.0618, "step": 142 }, { "epoch": 0.011100939532642736, "grad_norm": 0.17808520793914795, "learning_rate": 0.00019977844187963728, "loss": 1.1687, "step": 144 }, { "epoch": 0.011255119248373887, "grad_norm": 0.16697613894939423, "learning_rate": 0.0001997681368507832, "loss": 1.0979, "step": 146 }, { "epoch": 0.011409298964105035, "grad_norm": 0.16491086781024933, "learning_rate": 0.0001997578318219291, "loss": 1.1219, "step": 148 }, { "epoch": 0.011563478679836184, "grad_norm": 0.15342313051223755, "learning_rate": 0.00019974752679307502, "loss": 1.1169, "step": 150 }, { "epoch": 0.011717658395567333, "grad_norm": 0.1539286971092224, "learning_rate": 0.00019973722176422093, "loss": 1.1288, "step": 152 }, { "epoch": 0.011871838111298482, "grad_norm": 0.15605852007865906, "learning_rate": 0.00019972691673536688, "loss": 1.0445, "step": 154 }, { "epoch": 0.012026017827029632, "grad_norm": 0.14324098825454712, "learning_rate": 0.0001997166117065128, "loss": 1.1309, "step": 156 }, { "epoch": 0.012180197542760781, "grad_norm": 0.21045701205730438, "learning_rate": 0.0001997063066776587, "loss": 1.0946, "step": 158 }, { "epoch": 0.01233437725849193, "grad_norm": 0.16019922494888306, "learning_rate": 0.00019969600164880462, "loss": 1.11, "step": 160 }, { "epoch": 0.012488556974223079, "grad_norm": 0.15740078687667847, "learning_rate": 0.00019968569661995054, "loss": 1.112, "step": 162 }, { "epoch": 0.012642736689954227, "grad_norm": 0.16974380612373352, "learning_rate": 0.00019967539159109648, "loss": 1.1279, "step": 164 }, { "epoch": 0.012796916405685378, "grad_norm": 0.16405288875102997, "learning_rate": 0.0001996650865622424, "loss": 1.0952, "step": 166 }, { "epoch": 0.012951096121416527, "grad_norm": 0.16120509803295135, "learning_rate": 0.0001996547815333883, "loss": 1.1203, "step": 168 }, { "epoch": 0.013105275837147675, "grad_norm": 0.17402276396751404, "learning_rate": 0.00019964447650453422, "loss": 1.0991, "step": 170 }, { "epoch": 0.013259455552878824, "grad_norm": 0.18349111080169678, "learning_rate": 0.00019963417147568014, "loss": 1.1394, "step": 172 }, { "epoch": 0.013413635268609973, "grad_norm": 0.14613087475299835, "learning_rate": 0.00019962386644682608, "loss": 1.1357, "step": 174 }, { "epoch": 0.013567814984341123, "grad_norm": 0.142988383769989, "learning_rate": 0.000199613561417972, "loss": 1.0169, "step": 176 }, { "epoch": 0.013721994700072272, "grad_norm": 0.14817160367965698, "learning_rate": 0.0001996032563891179, "loss": 1.1238, "step": 178 }, { "epoch": 0.013876174415803421, "grad_norm": 0.15391133725643158, "learning_rate": 0.00019959295136026382, "loss": 1.0712, "step": 180 }, { "epoch": 0.01403035413153457, "grad_norm": 0.1766846477985382, "learning_rate": 0.00019958264633140974, "loss": 1.1422, "step": 182 }, { "epoch": 0.014184533847265719, "grad_norm": 0.16789212822914124, "learning_rate": 0.00019957234130255565, "loss": 1.1266, "step": 184 }, { "epoch": 0.014338713562996869, "grad_norm": 0.1527165323495865, "learning_rate": 0.00019956203627370157, "loss": 1.0667, "step": 186 }, { "epoch": 0.014492893278728018, "grad_norm": 0.1772206574678421, "learning_rate": 0.00019955173124484748, "loss": 1.1182, "step": 188 }, { "epoch": 0.014647072994459167, "grad_norm": 0.15008313953876495, "learning_rate": 0.0001995414262159934, "loss": 1.0382, "step": 190 }, { "epoch": 0.014801252710190315, "grad_norm": 0.16365988552570343, "learning_rate": 0.00019953112118713931, "loss": 1.1262, "step": 192 }, { "epoch": 0.014955432425921464, "grad_norm": 0.14952193200588226, "learning_rate": 0.00019952081615828526, "loss": 1.1245, "step": 194 }, { "epoch": 0.015109612141652615, "grad_norm": 0.15425263345241547, "learning_rate": 0.00019951051112943117, "loss": 1.1452, "step": 196 }, { "epoch": 0.015263791857383763, "grad_norm": 0.1567617654800415, "learning_rate": 0.00019950020610057709, "loss": 1.0392, "step": 198 }, { "epoch": 0.015417971573114912, "grad_norm": 0.14292609691619873, "learning_rate": 0.000199489901071723, "loss": 1.0728, "step": 200 }, { "epoch": 0.015417971573114912, "eval_loss": 1.1127630472183228, "eval_runtime": 185.2528, "eval_samples_per_second": 91.459, "eval_steps_per_second": 1.43, "step": 200 }, { "epoch": 0.015572151288846061, "grad_norm": 0.15465517342090607, "learning_rate": 0.00019947959604286892, "loss": 1.0596, "step": 202 }, { "epoch": 0.01572633100457721, "grad_norm": 0.16749607026576996, "learning_rate": 0.00019946929101401486, "loss": 1.1005, "step": 204 }, { "epoch": 0.01588051072030836, "grad_norm": 0.15854287147521973, "learning_rate": 0.00019945898598516077, "loss": 1.0963, "step": 206 }, { "epoch": 0.016034690436039507, "grad_norm": 0.1457831859588623, "learning_rate": 0.0001994486809563067, "loss": 1.1149, "step": 208 }, { "epoch": 0.016188870151770656, "grad_norm": 0.15744629502296448, "learning_rate": 0.0001994383759274526, "loss": 1.0789, "step": 210 }, { "epoch": 0.01634304986750181, "grad_norm": 0.13411423563957214, "learning_rate": 0.00019942807089859852, "loss": 1.0641, "step": 212 }, { "epoch": 0.016497229583232957, "grad_norm": 0.1575399488210678, "learning_rate": 0.00019941776586974446, "loss": 1.0888, "step": 214 }, { "epoch": 0.016651409298964106, "grad_norm": 0.14619529247283936, "learning_rate": 0.00019940746084089037, "loss": 1.081, "step": 216 }, { "epoch": 0.016805589014695255, "grad_norm": 0.15578237175941467, "learning_rate": 0.0001993971558120363, "loss": 1.1434, "step": 218 }, { "epoch": 0.016959768730426403, "grad_norm": 0.1516629308462143, "learning_rate": 0.0001993868507831822, "loss": 1.0909, "step": 220 }, { "epoch": 0.017113948446157552, "grad_norm": 0.15613436698913574, "learning_rate": 0.00019937654575432812, "loss": 1.0999, "step": 222 }, { "epoch": 0.0172681281618887, "grad_norm": 0.14825573563575745, "learning_rate": 0.00019936624072547406, "loss": 1.0827, "step": 224 }, { "epoch": 0.01742230787761985, "grad_norm": 0.1624906212091446, "learning_rate": 0.00019935593569661998, "loss": 1.0856, "step": 226 }, { "epoch": 0.017576487593351, "grad_norm": 0.1380940079689026, "learning_rate": 0.0001993456306677659, "loss": 1.0514, "step": 228 }, { "epoch": 0.017730667309082147, "grad_norm": 0.13712120056152344, "learning_rate": 0.0001993353256389118, "loss": 1.0977, "step": 230 }, { "epoch": 0.0178848470248133, "grad_norm": 0.1448957622051239, "learning_rate": 0.00019932502061005772, "loss": 1.0729, "step": 232 }, { "epoch": 0.01803902674054445, "grad_norm": 0.13421876728534698, "learning_rate": 0.00019931471558120364, "loss": 1.0879, "step": 234 }, { "epoch": 0.018193206456275597, "grad_norm": 0.16884732246398926, "learning_rate": 0.00019930441055234955, "loss": 1.1159, "step": 236 }, { "epoch": 0.018347386172006746, "grad_norm": 0.14634890854358673, "learning_rate": 0.00019929410552349547, "loss": 1.0568, "step": 238 }, { "epoch": 0.018501565887737895, "grad_norm": 0.16796648502349854, "learning_rate": 0.00019928380049464138, "loss": 1.0944, "step": 240 }, { "epoch": 0.018655745603469043, "grad_norm": 0.13724717497825623, "learning_rate": 0.0001992734954657873, "loss": 1.0609, "step": 242 }, { "epoch": 0.018809925319200192, "grad_norm": 0.14133594930171967, "learning_rate": 0.0001992631904369332, "loss": 1.0879, "step": 244 }, { "epoch": 0.01896410503493134, "grad_norm": 0.1611246019601822, "learning_rate": 0.00019925288540807915, "loss": 1.0681, "step": 246 }, { "epoch": 0.01911828475066249, "grad_norm": 0.17420877516269684, "learning_rate": 0.00019924258037922507, "loss": 1.1336, "step": 248 }, { "epoch": 0.01927246446639364, "grad_norm": 0.13766029477119446, "learning_rate": 0.00019923227535037098, "loss": 1.075, "step": 250 }, { "epoch": 0.01942664418212479, "grad_norm": 0.1691662222146988, "learning_rate": 0.0001992219703215169, "loss": 1.1369, "step": 252 }, { "epoch": 0.01958082389785594, "grad_norm": 0.14959432184696198, "learning_rate": 0.0001992116652926628, "loss": 1.1129, "step": 254 }, { "epoch": 0.01973500361358709, "grad_norm": 0.14996406435966492, "learning_rate": 0.00019920136026380875, "loss": 1.0304, "step": 256 }, { "epoch": 0.019889183329318237, "grad_norm": 0.13211801648139954, "learning_rate": 0.00019919105523495467, "loss": 1.0652, "step": 258 }, { "epoch": 0.020043363045049386, "grad_norm": 0.16041967272758484, "learning_rate": 0.00019918075020610058, "loss": 1.077, "step": 260 }, { "epoch": 0.020197542760780535, "grad_norm": 0.1524546593427658, "learning_rate": 0.0001991704451772465, "loss": 1.1176, "step": 262 }, { "epoch": 0.020351722476511683, "grad_norm": 0.16032540798187256, "learning_rate": 0.00019916014014839241, "loss": 1.0736, "step": 264 }, { "epoch": 0.020505902192242832, "grad_norm": 0.17891019582748413, "learning_rate": 0.00019914983511953836, "loss": 1.1435, "step": 266 }, { "epoch": 0.02066008190797398, "grad_norm": 0.14484059810638428, "learning_rate": 0.00019913953009068427, "loss": 1.0356, "step": 268 }, { "epoch": 0.02081426162370513, "grad_norm": 0.14321155846118927, "learning_rate": 0.00019912922506183019, "loss": 1.0536, "step": 270 }, { "epoch": 0.020968441339436282, "grad_norm": 0.17357808351516724, "learning_rate": 0.0001991189200329761, "loss": 1.171, "step": 272 }, { "epoch": 0.02112262105516743, "grad_norm": 0.13990800082683563, "learning_rate": 0.00019910861500412202, "loss": 1.0946, "step": 274 }, { "epoch": 0.02127680077089858, "grad_norm": 0.16634231805801392, "learning_rate": 0.00019909830997526796, "loss": 1.1029, "step": 276 }, { "epoch": 0.02143098048662973, "grad_norm": 0.16322381794452667, "learning_rate": 0.00019908800494641387, "loss": 1.0688, "step": 278 }, { "epoch": 0.021585160202360877, "grad_norm": 0.1652844250202179, "learning_rate": 0.0001990776999175598, "loss": 1.1237, "step": 280 }, { "epoch": 0.021739339918092026, "grad_norm": 0.14457885921001434, "learning_rate": 0.0001990673948887057, "loss": 1.1995, "step": 282 }, { "epoch": 0.021893519633823175, "grad_norm": 0.15549878776073456, "learning_rate": 0.00019905708985985162, "loss": 1.0475, "step": 284 }, { "epoch": 0.022047699349554323, "grad_norm": 0.15715502202510834, "learning_rate": 0.00019904678483099756, "loss": 1.1211, "step": 286 }, { "epoch": 0.022201879065285472, "grad_norm": 0.14022529125213623, "learning_rate": 0.00019903647980214347, "loss": 1.1056, "step": 288 }, { "epoch": 0.02235605878101662, "grad_norm": 0.13293786346912384, "learning_rate": 0.0001990261747732894, "loss": 1.0877, "step": 290 }, { "epoch": 0.022510238496747773, "grad_norm": 0.14625073969364166, "learning_rate": 0.0001990158697444353, "loss": 1.0375, "step": 292 }, { "epoch": 0.022664418212478922, "grad_norm": 0.1417943835258484, "learning_rate": 0.0001990055647155812, "loss": 1.091, "step": 294 }, { "epoch": 0.02281859792821007, "grad_norm": 0.1519964039325714, "learning_rate": 0.00019899525968672713, "loss": 1.0396, "step": 296 }, { "epoch": 0.02297277764394122, "grad_norm": 0.1676655411720276, "learning_rate": 0.00019898495465787305, "loss": 1.1249, "step": 298 }, { "epoch": 0.02312695735967237, "grad_norm": 0.1487220674753189, "learning_rate": 0.00019897464962901896, "loss": 1.1768, "step": 300 }, { "epoch": 0.02312695735967237, "eval_loss": 1.1061022281646729, "eval_runtime": 185.239, "eval_samples_per_second": 91.466, "eval_steps_per_second": 1.431, "step": 300 }, { "epoch": 0.023281137075403517, "grad_norm": 0.1399739533662796, "learning_rate": 0.00019896434460016488, "loss": 1.0962, "step": 302 }, { "epoch": 0.023435316791134666, "grad_norm": 0.15282337367534637, "learning_rate": 0.0001989540395713108, "loss": 1.1688, "step": 304 }, { "epoch": 0.023589496506865815, "grad_norm": 0.15459619462490082, "learning_rate": 0.00019894373454245674, "loss": 1.0216, "step": 306 }, { "epoch": 0.023743676222596963, "grad_norm": 0.15799634158611298, "learning_rate": 0.00019893342951360265, "loss": 1.1429, "step": 308 }, { "epoch": 0.023897855938328112, "grad_norm": 0.1343819946050644, "learning_rate": 0.00019892312448474857, "loss": 1.0959, "step": 310 }, { "epoch": 0.024052035654059264, "grad_norm": 0.14791317284107208, "learning_rate": 0.00019891281945589448, "loss": 1.0636, "step": 312 }, { "epoch": 0.024206215369790413, "grad_norm": 0.1442137360572815, "learning_rate": 0.0001989025144270404, "loss": 1.055, "step": 314 }, { "epoch": 0.024360395085521562, "grad_norm": 0.14649145305156708, "learning_rate": 0.00019889220939818634, "loss": 1.0906, "step": 316 }, { "epoch": 0.02451457480125271, "grad_norm": 0.14234665036201477, "learning_rate": 0.00019888190436933225, "loss": 1.0853, "step": 318 }, { "epoch": 0.02466875451698386, "grad_norm": 0.1419668048620224, "learning_rate": 0.00019887159934047817, "loss": 1.0296, "step": 320 }, { "epoch": 0.02482293423271501, "grad_norm": 0.14730845391750336, "learning_rate": 0.00019886129431162408, "loss": 1.0421, "step": 322 }, { "epoch": 0.024977113948446157, "grad_norm": 0.1400081068277359, "learning_rate": 0.00019885098928277, "loss": 1.0291, "step": 324 }, { "epoch": 0.025131293664177306, "grad_norm": 0.15542668104171753, "learning_rate": 0.0001988406842539159, "loss": 1.0597, "step": 326 }, { "epoch": 0.025285473379908455, "grad_norm": 0.14521440863609314, "learning_rate": 0.00019883037922506185, "loss": 1.0491, "step": 328 }, { "epoch": 0.025439653095639603, "grad_norm": 0.16224826872348785, "learning_rate": 0.00019882007419620777, "loss": 1.1031, "step": 330 }, { "epoch": 0.025593832811370756, "grad_norm": 0.15028877556324005, "learning_rate": 0.00019880976916735368, "loss": 1.1154, "step": 332 }, { "epoch": 0.025748012527101904, "grad_norm": 0.12962941825389862, "learning_rate": 0.0001987994641384996, "loss": 1.0363, "step": 334 }, { "epoch": 0.025902192242833053, "grad_norm": 0.14908359944820404, "learning_rate": 0.0001987891591096455, "loss": 1.1513, "step": 336 }, { "epoch": 0.026056371958564202, "grad_norm": 0.15441828966140747, "learning_rate": 0.00019877885408079146, "loss": 1.1303, "step": 338 }, { "epoch": 0.02621055167429535, "grad_norm": 0.12669101357460022, "learning_rate": 0.00019876854905193737, "loss": 1.0875, "step": 340 }, { "epoch": 0.0263647313900265, "grad_norm": 0.13190661370754242, "learning_rate": 0.00019875824402308329, "loss": 1.0778, "step": 342 }, { "epoch": 0.02651891110575765, "grad_norm": 0.14043989777565002, "learning_rate": 0.0001987479389942292, "loss": 1.1011, "step": 344 }, { "epoch": 0.026673090821488797, "grad_norm": 0.13694870471954346, "learning_rate": 0.00019873763396537512, "loss": 1.0532, "step": 346 }, { "epoch": 0.026827270537219946, "grad_norm": 0.15089921653270721, "learning_rate": 0.00019872732893652103, "loss": 1.1292, "step": 348 }, { "epoch": 0.026981450252951095, "grad_norm": 0.14839838445186615, "learning_rate": 0.00019871702390766694, "loss": 1.0275, "step": 350 }, { "epoch": 0.027135629968682247, "grad_norm": 0.16198500990867615, "learning_rate": 0.00019870671887881286, "loss": 1.1453, "step": 352 }, { "epoch": 0.027289809684413396, "grad_norm": 0.14694632589817047, "learning_rate": 0.00019869641384995877, "loss": 1.129, "step": 354 }, { "epoch": 0.027443989400144544, "grad_norm": 0.16091379523277283, "learning_rate": 0.0001986861088211047, "loss": 1.1186, "step": 356 }, { "epoch": 0.027598169115875693, "grad_norm": 0.144720658659935, "learning_rate": 0.00019867580379225063, "loss": 1.0224, "step": 358 }, { "epoch": 0.027752348831606842, "grad_norm": 0.13851307332515717, "learning_rate": 0.00019866549876339655, "loss": 1.1421, "step": 360 }, { "epoch": 0.02790652854733799, "grad_norm": 0.13124969601631165, "learning_rate": 0.00019865519373454246, "loss": 1.0938, "step": 362 }, { "epoch": 0.02806070826306914, "grad_norm": 0.14723828434944153, "learning_rate": 0.00019864488870568838, "loss": 1.1335, "step": 364 }, { "epoch": 0.02821488797880029, "grad_norm": 0.17669795453548431, "learning_rate": 0.0001986345836768343, "loss": 1.0765, "step": 366 }, { "epoch": 0.028369067694531437, "grad_norm": 0.1457260102033615, "learning_rate": 0.00019862427864798023, "loss": 1.1073, "step": 368 }, { "epoch": 0.028523247410262586, "grad_norm": 0.13594554364681244, "learning_rate": 0.00019861397361912615, "loss": 1.0587, "step": 370 }, { "epoch": 0.028677427125993738, "grad_norm": 0.13798941671848297, "learning_rate": 0.00019860366859027206, "loss": 1.0833, "step": 372 }, { "epoch": 0.028831606841724887, "grad_norm": 0.15587519109249115, "learning_rate": 0.00019859336356141798, "loss": 1.0287, "step": 374 }, { "epoch": 0.028985786557456036, "grad_norm": 0.16585086286067963, "learning_rate": 0.0001985830585325639, "loss": 1.1786, "step": 376 }, { "epoch": 0.029139966273187184, "grad_norm": 0.1444484293460846, "learning_rate": 0.00019857275350370983, "loss": 1.1793, "step": 378 }, { "epoch": 0.029294145988918333, "grad_norm": 0.14413981139659882, "learning_rate": 0.00019856244847485575, "loss": 1.1141, "step": 380 }, { "epoch": 0.029448325704649482, "grad_norm": 0.142032191157341, "learning_rate": 0.00019855214344600166, "loss": 1.1033, "step": 382 }, { "epoch": 0.02960250542038063, "grad_norm": 0.1490195393562317, "learning_rate": 0.00019854183841714758, "loss": 1.1592, "step": 384 }, { "epoch": 0.02975668513611178, "grad_norm": 0.1408643275499344, "learning_rate": 0.0001985315333882935, "loss": 1.1505, "step": 386 }, { "epoch": 0.02991086485184293, "grad_norm": 0.12526237964630127, "learning_rate": 0.00019852122835943944, "loss": 1.1027, "step": 388 }, { "epoch": 0.030065044567574077, "grad_norm": 0.1339711844921112, "learning_rate": 0.00019851092333058535, "loss": 1.1238, "step": 390 }, { "epoch": 0.03021922428330523, "grad_norm": 0.13032345473766327, "learning_rate": 0.00019850061830173127, "loss": 1.1121, "step": 392 }, { "epoch": 0.030373403999036378, "grad_norm": 0.15815846621990204, "learning_rate": 0.00019849031327287718, "loss": 1.168, "step": 394 }, { "epoch": 0.030527583714767527, "grad_norm": 0.14245116710662842, "learning_rate": 0.0001984800082440231, "loss": 1.0436, "step": 396 }, { "epoch": 0.030681763430498676, "grad_norm": 0.15660050511360168, "learning_rate": 0.000198469703215169, "loss": 1.158, "step": 398 }, { "epoch": 0.030835943146229824, "grad_norm": 0.1654158979654312, "learning_rate": 0.00019845939818631493, "loss": 1.0802, "step": 400 }, { "epoch": 0.030835943146229824, "eval_loss": 1.1026971340179443, "eval_runtime": 185.7295, "eval_samples_per_second": 91.224, "eval_steps_per_second": 1.427, "step": 400 }, { "epoch": 0.030990122861960973, "grad_norm": 0.13845407962799072, "learning_rate": 0.00019844909315746084, "loss": 1.1055, "step": 402 }, { "epoch": 0.031144302577692122, "grad_norm": 0.14852891862392426, "learning_rate": 0.00019843878812860676, "loss": 1.0983, "step": 404 }, { "epoch": 0.031298482293423274, "grad_norm": 0.13408593833446503, "learning_rate": 0.00019842848309975267, "loss": 1.1063, "step": 406 }, { "epoch": 0.03145266200915442, "grad_norm": 0.14041072130203247, "learning_rate": 0.00019841817807089859, "loss": 1.0327, "step": 408 }, { "epoch": 0.03160684172488557, "grad_norm": 0.16119754314422607, "learning_rate": 0.00019840787304204453, "loss": 1.1, "step": 410 }, { "epoch": 0.03176102144061672, "grad_norm": 0.14471223950386047, "learning_rate": 0.00019839756801319044, "loss": 1.0783, "step": 412 }, { "epoch": 0.03191520115634787, "grad_norm": 0.15591050684452057, "learning_rate": 0.00019838726298433636, "loss": 1.1782, "step": 414 }, { "epoch": 0.032069380872079015, "grad_norm": 0.1766556203365326, "learning_rate": 0.00019837695795548227, "loss": 1.1063, "step": 416 }, { "epoch": 0.03222356058781017, "grad_norm": 0.16078630089759827, "learning_rate": 0.0001983666529266282, "loss": 1.0891, "step": 418 }, { "epoch": 0.03237774030354131, "grad_norm": 0.13378402590751648, "learning_rate": 0.00019835634789777413, "loss": 1.074, "step": 420 }, { "epoch": 0.032531920019272464, "grad_norm": 0.14526261389255524, "learning_rate": 0.00019834604286892004, "loss": 1.108, "step": 422 }, { "epoch": 0.03268609973500362, "grad_norm": 0.1321713775396347, "learning_rate": 0.00019833573784006596, "loss": 1.019, "step": 424 }, { "epoch": 0.03284027945073476, "grad_norm": 0.12685374915599823, "learning_rate": 0.00019832543281121187, "loss": 1.09, "step": 426 }, { "epoch": 0.032994459166465914, "grad_norm": 0.13825605809688568, "learning_rate": 0.0001983151277823578, "loss": 1.1356, "step": 428 }, { "epoch": 0.03314863888219706, "grad_norm": 0.13683827221393585, "learning_rate": 0.00019830482275350373, "loss": 1.1405, "step": 430 }, { "epoch": 0.03330281859792821, "grad_norm": 0.16707143187522888, "learning_rate": 0.00019829451772464965, "loss": 1.1305, "step": 432 }, { "epoch": 0.03345699831365936, "grad_norm": 0.11735045164823532, "learning_rate": 0.00019828421269579556, "loss": 1.0421, "step": 434 }, { "epoch": 0.03361117802939051, "grad_norm": 0.1337989866733551, "learning_rate": 0.00019827390766694148, "loss": 1.0572, "step": 436 }, { "epoch": 0.033765357745121655, "grad_norm": 0.17111611366271973, "learning_rate": 0.0001982636026380874, "loss": 1.1698, "step": 438 }, { "epoch": 0.03391953746085281, "grad_norm": 0.13785259425640106, "learning_rate": 0.00019825329760923333, "loss": 1.056, "step": 440 }, { "epoch": 0.03407371717658395, "grad_norm": 0.15061460435390472, "learning_rate": 0.00019824299258037925, "loss": 1.0963, "step": 442 }, { "epoch": 0.034227896892315104, "grad_norm": 0.1231001690030098, "learning_rate": 0.00019823268755152516, "loss": 1.1264, "step": 444 }, { "epoch": 0.03438207660804626, "grad_norm": 0.13752298057079315, "learning_rate": 0.00019822238252267108, "loss": 1.0672, "step": 446 }, { "epoch": 0.0345362563237774, "grad_norm": 0.13519813120365143, "learning_rate": 0.000198212077493817, "loss": 1.0882, "step": 448 }, { "epoch": 0.034690436039508554, "grad_norm": 0.140150785446167, "learning_rate": 0.0001982017724649629, "loss": 1.0572, "step": 450 }, { "epoch": 0.0348446157552397, "grad_norm": 0.13910406827926636, "learning_rate": 0.00019819146743610882, "loss": 1.0762, "step": 452 }, { "epoch": 0.03499879547097085, "grad_norm": 0.14587442576885223, "learning_rate": 0.00019818116240725474, "loss": 1.1232, "step": 454 }, { "epoch": 0.035152975186702, "grad_norm": 0.14476893842220306, "learning_rate": 0.00019817085737840065, "loss": 1.1004, "step": 456 }, { "epoch": 0.03530715490243315, "grad_norm": 0.13861101865768433, "learning_rate": 0.00019816055234954657, "loss": 1.0302, "step": 458 }, { "epoch": 0.035461334618164295, "grad_norm": 0.14342686533927917, "learning_rate": 0.0001981502473206925, "loss": 1.1092, "step": 460 }, { "epoch": 0.03561551433389545, "grad_norm": 0.11709775030612946, "learning_rate": 0.00019813994229183842, "loss": 1.0463, "step": 462 }, { "epoch": 0.0357696940496266, "grad_norm": 0.15154917538166046, "learning_rate": 0.00019812963726298434, "loss": 1.0897, "step": 464 }, { "epoch": 0.035923873765357744, "grad_norm": 0.16716259717941284, "learning_rate": 0.00019811933223413025, "loss": 1.1214, "step": 466 }, { "epoch": 0.0360780534810889, "grad_norm": 0.13513320684432983, "learning_rate": 0.00019810902720527617, "loss": 1.0623, "step": 468 }, { "epoch": 0.03623223319682004, "grad_norm": 0.15930432081222534, "learning_rate": 0.0001980987221764221, "loss": 1.1092, "step": 470 }, { "epoch": 0.036386412912551194, "grad_norm": 0.13990509510040283, "learning_rate": 0.00019808841714756803, "loss": 1.1048, "step": 472 }, { "epoch": 0.03654059262828234, "grad_norm": 0.18784300982952118, "learning_rate": 0.00019807811211871394, "loss": 1.1676, "step": 474 }, { "epoch": 0.03669477234401349, "grad_norm": 0.152045339345932, "learning_rate": 0.00019806780708985986, "loss": 1.1303, "step": 476 }, { "epoch": 0.03684895205974464, "grad_norm": 0.1409967988729477, "learning_rate": 0.00019805750206100577, "loss": 1.0972, "step": 478 }, { "epoch": 0.03700313177547579, "grad_norm": 0.13838854432106018, "learning_rate": 0.0001980471970321517, "loss": 1.101, "step": 480 }, { "epoch": 0.037157311491206935, "grad_norm": 0.1579430103302002, "learning_rate": 0.00019803689200329763, "loss": 1.1077, "step": 482 }, { "epoch": 0.03731149120693809, "grad_norm": 0.15061910450458527, "learning_rate": 0.00019802658697444354, "loss": 1.1239, "step": 484 }, { "epoch": 0.03746567092266924, "grad_norm": 0.16408291459083557, "learning_rate": 0.00019801628194558946, "loss": 1.0961, "step": 486 }, { "epoch": 0.037619850638400384, "grad_norm": 0.15612424910068512, "learning_rate": 0.00019800597691673537, "loss": 1.1299, "step": 488 }, { "epoch": 0.03777403035413154, "grad_norm": 0.14135530591011047, "learning_rate": 0.00019799567188788131, "loss": 1.0489, "step": 490 }, { "epoch": 0.03792821006986268, "grad_norm": 0.13743548095226288, "learning_rate": 0.00019798536685902723, "loss": 1.0837, "step": 492 }, { "epoch": 0.038082389785593834, "grad_norm": 0.157401442527771, "learning_rate": 0.00019797506183017314, "loss": 1.0573, "step": 494 }, { "epoch": 0.03823656950132498, "grad_norm": 0.14982052147388458, "learning_rate": 0.00019796475680131906, "loss": 1.0839, "step": 496 }, { "epoch": 0.03839074921705613, "grad_norm": 0.1347000151872635, "learning_rate": 0.00019795445177246497, "loss": 1.113, "step": 498 }, { "epoch": 0.03854492893278728, "grad_norm": 0.14478904008865356, "learning_rate": 0.0001979441467436109, "loss": 1.0514, "step": 500 }, { "epoch": 0.03854492893278728, "eval_loss": 1.1000746488571167, "eval_runtime": 185.5217, "eval_samples_per_second": 91.326, "eval_steps_per_second": 1.428, "step": 500 }, { "epoch": 0.03869910864851843, "grad_norm": 0.14274291694164276, "learning_rate": 0.00019793384171475683, "loss": 1.0847, "step": 502 }, { "epoch": 0.03885328836424958, "grad_norm": 0.14326965808868408, "learning_rate": 0.00019792353668590275, "loss": 1.0865, "step": 504 }, { "epoch": 0.03900746807998073, "grad_norm": 0.1575518548488617, "learning_rate": 0.00019791323165704866, "loss": 1.1258, "step": 506 }, { "epoch": 0.03916164779571188, "grad_norm": 0.14699862897396088, "learning_rate": 0.00019790292662819458, "loss": 1.1687, "step": 508 }, { "epoch": 0.039315827511443024, "grad_norm": 0.1394687294960022, "learning_rate": 0.0001978926215993405, "loss": 1.1214, "step": 510 }, { "epoch": 0.03947000722717418, "grad_norm": 0.14366985857486725, "learning_rate": 0.0001978823165704864, "loss": 1.0651, "step": 512 }, { "epoch": 0.03962418694290532, "grad_norm": 0.14171218872070312, "learning_rate": 0.00019787201154163232, "loss": 1.1398, "step": 514 }, { "epoch": 0.039778366658636474, "grad_norm": 0.13258612155914307, "learning_rate": 0.00019786170651277824, "loss": 1.1234, "step": 516 }, { "epoch": 0.03993254637436762, "grad_norm": 0.17693160474300385, "learning_rate": 0.00019785140148392415, "loss": 1.1121, "step": 518 }, { "epoch": 0.04008672609009877, "grad_norm": 0.143838569521904, "learning_rate": 0.00019784109645507006, "loss": 1.102, "step": 520 }, { "epoch": 0.04024090580582992, "grad_norm": 0.14078038930892944, "learning_rate": 0.000197830791426216, "loss": 1.1044, "step": 522 }, { "epoch": 0.04039508552156107, "grad_norm": 0.12367985397577286, "learning_rate": 0.00019782048639736192, "loss": 1.102, "step": 524 }, { "epoch": 0.04054926523729222, "grad_norm": 0.136929452419281, "learning_rate": 0.00019781018136850784, "loss": 1.0802, "step": 526 }, { "epoch": 0.04070344495302337, "grad_norm": 0.15831957757472992, "learning_rate": 0.00019779987633965375, "loss": 1.09, "step": 528 }, { "epoch": 0.04085762466875452, "grad_norm": 0.15482452511787415, "learning_rate": 0.00019778957131079967, "loss": 1.0828, "step": 530 }, { "epoch": 0.041011804384485664, "grad_norm": 0.13797122240066528, "learning_rate": 0.0001977792662819456, "loss": 1.1263, "step": 532 }, { "epoch": 0.04116598410021682, "grad_norm": 0.18304814398288727, "learning_rate": 0.00019776896125309152, "loss": 1.0991, "step": 534 }, { "epoch": 0.04132016381594796, "grad_norm": 0.1509987860918045, "learning_rate": 0.00019775865622423744, "loss": 1.0804, "step": 536 }, { "epoch": 0.041474343531679114, "grad_norm": 0.13406258821487427, "learning_rate": 0.00019774835119538335, "loss": 1.0348, "step": 538 }, { "epoch": 0.04162852324741026, "grad_norm": 0.1413736194372177, "learning_rate": 0.00019773804616652927, "loss": 1.066, "step": 540 }, { "epoch": 0.04178270296314141, "grad_norm": 0.1451394259929657, "learning_rate": 0.0001977277411376752, "loss": 1.0485, "step": 542 }, { "epoch": 0.041936882678872564, "grad_norm": 0.13275358080863953, "learning_rate": 0.00019771743610882113, "loss": 1.1164, "step": 544 }, { "epoch": 0.04209106239460371, "grad_norm": 0.15869611501693726, "learning_rate": 0.00019770713107996704, "loss": 1.1361, "step": 546 }, { "epoch": 0.04224524211033486, "grad_norm": 0.14091487228870392, "learning_rate": 0.00019769682605111295, "loss": 1.061, "step": 548 }, { "epoch": 0.04239942182606601, "grad_norm": 0.13538867235183716, "learning_rate": 0.00019768652102225887, "loss": 1.0607, "step": 550 }, { "epoch": 0.04255360154179716, "grad_norm": 0.15626317262649536, "learning_rate": 0.0001976762159934048, "loss": 1.0758, "step": 552 }, { "epoch": 0.042707781257528304, "grad_norm": 0.1293731927871704, "learning_rate": 0.00019766591096455073, "loss": 1.0434, "step": 554 }, { "epoch": 0.04286196097325946, "grad_norm": 0.13498535752296448, "learning_rate": 0.00019765560593569664, "loss": 1.0953, "step": 556 }, { "epoch": 0.0430161406889906, "grad_norm": 0.14134527742862701, "learning_rate": 0.00019764530090684256, "loss": 1.1559, "step": 558 }, { "epoch": 0.043170320404721754, "grad_norm": 0.13958705961704254, "learning_rate": 0.00019763499587798847, "loss": 1.2585, "step": 560 }, { "epoch": 0.0433245001204529, "grad_norm": 0.2181047797203064, "learning_rate": 0.0001976246908491344, "loss": 1.0164, "step": 562 }, { "epoch": 0.04347867983618405, "grad_norm": 0.1365436315536499, "learning_rate": 0.0001976143858202803, "loss": 1.124, "step": 564 }, { "epoch": 0.043632859551915204, "grad_norm": 0.12809793651103973, "learning_rate": 0.00019760408079142622, "loss": 1.0378, "step": 566 }, { "epoch": 0.04378703926764635, "grad_norm": 0.12341924756765366, "learning_rate": 0.00019759377576257213, "loss": 1.1091, "step": 568 }, { "epoch": 0.0439412189833775, "grad_norm": 0.14291982352733612, "learning_rate": 0.00019758347073371805, "loss": 1.1366, "step": 570 }, { "epoch": 0.04409539869910865, "grad_norm": 0.14486652612686157, "learning_rate": 0.000197573165704864, "loss": 1.0168, "step": 572 }, { "epoch": 0.0442495784148398, "grad_norm": 0.1724916249513626, "learning_rate": 0.0001975628606760099, "loss": 1.1037, "step": 574 }, { "epoch": 0.044403758130570944, "grad_norm": 0.13338427245616913, "learning_rate": 0.00019755255564715582, "loss": 1.0259, "step": 576 }, { "epoch": 0.0445579378463021, "grad_norm": 0.1372508853673935, "learning_rate": 0.00019754225061830173, "loss": 1.0784, "step": 578 }, { "epoch": 0.04471211756203324, "grad_norm": 0.11633725464344025, "learning_rate": 0.00019753194558944765, "loss": 1.0648, "step": 580 }, { "epoch": 0.044866297277764394, "grad_norm": 0.14386776089668274, "learning_rate": 0.00019752164056059356, "loss": 1.0777, "step": 582 }, { "epoch": 0.045020476993495546, "grad_norm": 0.14929193258285522, "learning_rate": 0.0001975113355317395, "loss": 1.1319, "step": 584 }, { "epoch": 0.04517465670922669, "grad_norm": 0.1324220448732376, "learning_rate": 0.00019750103050288542, "loss": 1.0614, "step": 586 }, { "epoch": 0.045328836424957844, "grad_norm": 0.1392926126718521, "learning_rate": 0.00019749072547403133, "loss": 1.142, "step": 588 }, { "epoch": 0.04548301614068899, "grad_norm": 0.2632090151309967, "learning_rate": 0.00019748042044517725, "loss": 1.0159, "step": 590 }, { "epoch": 0.04563719585642014, "grad_norm": 0.13699129223823547, "learning_rate": 0.00019747011541632316, "loss": 1.0778, "step": 592 }, { "epoch": 0.04579137557215129, "grad_norm": 0.13768675923347473, "learning_rate": 0.0001974598103874691, "loss": 1.0719, "step": 594 }, { "epoch": 0.04594555528788244, "grad_norm": 0.13458684086799622, "learning_rate": 0.00019744950535861502, "loss": 1.0145, "step": 596 }, { "epoch": 0.046099735003613584, "grad_norm": 0.1772696077823639, "learning_rate": 0.00019743920032976094, "loss": 1.0629, "step": 598 }, { "epoch": 0.04625391471934474, "grad_norm": 0.13998697698116302, "learning_rate": 0.00019742889530090685, "loss": 1.102, "step": 600 }, { "epoch": 0.04625391471934474, "eval_loss": 1.098169207572937, "eval_runtime": 185.5141, "eval_samples_per_second": 91.33, "eval_steps_per_second": 1.428, "step": 600 }, { "epoch": 0.04640809443507588, "grad_norm": 0.13928066194057465, "learning_rate": 0.00019741859027205277, "loss": 1.1527, "step": 602 }, { "epoch": 0.046562274150807034, "grad_norm": 0.13011601567268372, "learning_rate": 0.0001974082852431987, "loss": 1.1259, "step": 604 }, { "epoch": 0.046716453866538186, "grad_norm": 0.1306074559688568, "learning_rate": 0.00019739798021434462, "loss": 1.0951, "step": 606 }, { "epoch": 0.04687063358226933, "grad_norm": 0.14797037839889526, "learning_rate": 0.00019738767518549054, "loss": 1.0321, "step": 608 }, { "epoch": 0.047024813298000484, "grad_norm": 0.14849938452243805, "learning_rate": 0.00019737737015663645, "loss": 1.1096, "step": 610 }, { "epoch": 0.04717899301373163, "grad_norm": 0.12060682475566864, "learning_rate": 0.00019736706512778237, "loss": 1.0652, "step": 612 }, { "epoch": 0.04733317272946278, "grad_norm": 0.12754854559898376, "learning_rate": 0.00019735676009892828, "loss": 1.1097, "step": 614 }, { "epoch": 0.04748735244519393, "grad_norm": 0.12162326276302338, "learning_rate": 0.0001973464550700742, "loss": 1.1087, "step": 616 }, { "epoch": 0.04764153216092508, "grad_norm": 0.175630122423172, "learning_rate": 0.0001973361500412201, "loss": 1.0723, "step": 618 }, { "epoch": 0.047795711876656224, "grad_norm": 0.15365472435951233, "learning_rate": 0.00019732584501236603, "loss": 1.1009, "step": 620 }, { "epoch": 0.04794989159238738, "grad_norm": 0.13359837234020233, "learning_rate": 0.00019731553998351194, "loss": 1.0974, "step": 622 }, { "epoch": 0.04810407130811853, "grad_norm": 0.1482960432767868, "learning_rate": 0.00019730523495465788, "loss": 1.1214, "step": 624 }, { "epoch": 0.048258251023849674, "grad_norm": 0.1309668868780136, "learning_rate": 0.0001972949299258038, "loss": 1.0849, "step": 626 }, { "epoch": 0.048412430739580826, "grad_norm": 0.1544414609670639, "learning_rate": 0.00019728462489694971, "loss": 1.092, "step": 628 }, { "epoch": 0.04856661045531197, "grad_norm": 0.14907146990299225, "learning_rate": 0.00019727431986809563, "loss": 1.0671, "step": 630 }, { "epoch": 0.048720790171043124, "grad_norm": 0.16943813860416412, "learning_rate": 0.00019726401483924154, "loss": 1.1433, "step": 632 }, { "epoch": 0.04887496988677427, "grad_norm": 0.14070230722427368, "learning_rate": 0.00019725370981038749, "loss": 1.1613, "step": 634 }, { "epoch": 0.04902914960250542, "grad_norm": 0.15507204830646515, "learning_rate": 0.0001972434047815334, "loss": 1.1286, "step": 636 }, { "epoch": 0.04918332931823657, "grad_norm": 0.13587893545627594, "learning_rate": 0.00019723309975267932, "loss": 1.1094, "step": 638 }, { "epoch": 0.04933750903396772, "grad_norm": 0.12399852275848389, "learning_rate": 0.00019722279472382523, "loss": 1.058, "step": 640 }, { "epoch": 0.049491688749698864, "grad_norm": 0.12497518211603165, "learning_rate": 0.00019721248969497115, "loss": 1.0716, "step": 642 }, { "epoch": 0.04964586846543002, "grad_norm": 0.15282607078552246, "learning_rate": 0.0001972021846661171, "loss": 1.0912, "step": 644 }, { "epoch": 0.04980004818116117, "grad_norm": 0.14203013479709625, "learning_rate": 0.000197191879637263, "loss": 1.0846, "step": 646 }, { "epoch": 0.049954227896892314, "grad_norm": 0.12308704853057861, "learning_rate": 0.00019718157460840892, "loss": 1.1202, "step": 648 }, { "epoch": 0.050108407612623466, "grad_norm": 0.15226681530475616, "learning_rate": 0.00019717126957955483, "loss": 1.0626, "step": 650 }, { "epoch": 0.05026258732835461, "grad_norm": 0.12636694312095642, "learning_rate": 0.00019716096455070075, "loss": 1.1086, "step": 652 }, { "epoch": 0.050416767044085764, "grad_norm": 0.14969666302204132, "learning_rate": 0.0001971506595218467, "loss": 1.1602, "step": 654 }, { "epoch": 0.05057094675981691, "grad_norm": 0.130833700299263, "learning_rate": 0.0001971403544929926, "loss": 1.0657, "step": 656 }, { "epoch": 0.05072512647554806, "grad_norm": 0.1283751279115677, "learning_rate": 0.00019713004946413852, "loss": 1.0371, "step": 658 }, { "epoch": 0.05087930619127921, "grad_norm": 0.11827697604894638, "learning_rate": 0.00019711974443528443, "loss": 1.0308, "step": 660 }, { "epoch": 0.05103348590701036, "grad_norm": 0.12265590578317642, "learning_rate": 0.00019710943940643035, "loss": 1.1127, "step": 662 }, { "epoch": 0.05118766562274151, "grad_norm": 0.13979150354862213, "learning_rate": 0.0001970991343775763, "loss": 1.1011, "step": 664 }, { "epoch": 0.05134184533847266, "grad_norm": 0.1368461698293686, "learning_rate": 0.0001970888293487222, "loss": 1.0857, "step": 666 }, { "epoch": 0.05149602505420381, "grad_norm": 0.13669301569461823, "learning_rate": 0.00019707852431986812, "loss": 1.0971, "step": 668 }, { "epoch": 0.051650204769934954, "grad_norm": 0.12659449875354767, "learning_rate": 0.00019706821929101404, "loss": 1.0556, "step": 670 }, { "epoch": 0.051804384485666106, "grad_norm": 0.14103113114833832, "learning_rate": 0.00019705791426215995, "loss": 1.0913, "step": 672 }, { "epoch": 0.05195856420139725, "grad_norm": 0.16134017705917358, "learning_rate": 0.00019704760923330587, "loss": 1.0994, "step": 674 }, { "epoch": 0.052112743917128404, "grad_norm": 0.12725086510181427, "learning_rate": 0.00019703730420445178, "loss": 1.1008, "step": 676 }, { "epoch": 0.05226692363285955, "grad_norm": 0.12865908443927765, "learning_rate": 0.0001970269991755977, "loss": 1.0186, "step": 678 }, { "epoch": 0.0524211033485907, "grad_norm": 0.1661859154701233, "learning_rate": 0.0001970166941467436, "loss": 1.068, "step": 680 }, { "epoch": 0.05257528306432185, "grad_norm": 0.14370663464069366, "learning_rate": 0.00019700638911788953, "loss": 1.102, "step": 682 }, { "epoch": 0.052729462780053, "grad_norm": 0.13285204768180847, "learning_rate": 0.00019699608408903544, "loss": 1.1055, "step": 684 }, { "epoch": 0.05288364249578415, "grad_norm": 0.17762747406959534, "learning_rate": 0.00019698577906018138, "loss": 1.1601, "step": 686 }, { "epoch": 0.0530378222115153, "grad_norm": 0.12693317234516144, "learning_rate": 0.0001969754740313273, "loss": 1.0494, "step": 688 }, { "epoch": 0.05319200192724645, "grad_norm": 0.1302707940340042, "learning_rate": 0.0001969651690024732, "loss": 1.066, "step": 690 }, { "epoch": 0.053346181642977594, "grad_norm": 0.11844471096992493, "learning_rate": 0.00019695486397361913, "loss": 1.0085, "step": 692 }, { "epoch": 0.053500361358708746, "grad_norm": 0.12299422174692154, "learning_rate": 0.00019694455894476504, "loss": 1.0985, "step": 694 }, { "epoch": 0.05365454107443989, "grad_norm": 0.1222420409321785, "learning_rate": 0.00019693425391591098, "loss": 1.0648, "step": 696 }, { "epoch": 0.053808720790171044, "grad_norm": 0.13273879885673523, "learning_rate": 0.0001969239488870569, "loss": 1.1108, "step": 698 }, { "epoch": 0.05396290050590219, "grad_norm": 0.13202215731143951, "learning_rate": 0.00019691364385820281, "loss": 1.1013, "step": 700 }, { "epoch": 0.05396290050590219, "eval_loss": 1.0964874029159546, "eval_runtime": 185.3303, "eval_samples_per_second": 91.421, "eval_steps_per_second": 1.43, "step": 700 }, { "epoch": 0.05411708022163334, "grad_norm": 0.13038010895252228, "learning_rate": 0.00019690333882934873, "loss": 1.0642, "step": 702 }, { "epoch": 0.054271259937364494, "grad_norm": 0.18084144592285156, "learning_rate": 0.00019689303380049464, "loss": 1.0673, "step": 704 }, { "epoch": 0.05442543965309564, "grad_norm": 0.18958036601543427, "learning_rate": 0.00019688272877164059, "loss": 1.0925, "step": 706 }, { "epoch": 0.05457961936882679, "grad_norm": 0.13386841118335724, "learning_rate": 0.0001968724237427865, "loss": 1.0978, "step": 708 }, { "epoch": 0.05473379908455794, "grad_norm": 0.1408504843711853, "learning_rate": 0.00019686211871393242, "loss": 1.1158, "step": 710 }, { "epoch": 0.05488797880028909, "grad_norm": 0.12006545811891556, "learning_rate": 0.00019685181368507833, "loss": 1.0395, "step": 712 }, { "epoch": 0.055042158516020234, "grad_norm": 0.13973191380500793, "learning_rate": 0.00019684150865622425, "loss": 1.0685, "step": 714 }, { "epoch": 0.055196338231751386, "grad_norm": 0.14461107552051544, "learning_rate": 0.0001968312036273702, "loss": 1.0924, "step": 716 }, { "epoch": 0.05535051794748253, "grad_norm": 0.13358595967292786, "learning_rate": 0.0001968208985985161, "loss": 1.0479, "step": 718 }, { "epoch": 0.055504697663213684, "grad_norm": 0.13416843116283417, "learning_rate": 0.00019681059356966202, "loss": 1.0166, "step": 720 }, { "epoch": 0.05565887737894483, "grad_norm": 0.15217959880828857, "learning_rate": 0.00019680028854080793, "loss": 1.0918, "step": 722 }, { "epoch": 0.05581305709467598, "grad_norm": 0.13012762367725372, "learning_rate": 0.00019678998351195385, "loss": 1.0967, "step": 724 }, { "epoch": 0.055967236810407134, "grad_norm": 0.13023535907268524, "learning_rate": 0.00019677967848309976, "loss": 1.0247, "step": 726 }, { "epoch": 0.05612141652613828, "grad_norm": 0.13703665137290955, "learning_rate": 0.00019676937345424568, "loss": 1.0969, "step": 728 }, { "epoch": 0.05627559624186943, "grad_norm": 0.12767066061496735, "learning_rate": 0.0001967590684253916, "loss": 1.08, "step": 730 }, { "epoch": 0.05642977595760058, "grad_norm": 0.12238382548093796, "learning_rate": 0.0001967487633965375, "loss": 1.1233, "step": 732 }, { "epoch": 0.05658395567333173, "grad_norm": 0.1356974095106125, "learning_rate": 0.00019673845836768342, "loss": 1.0439, "step": 734 }, { "epoch": 0.056738135389062874, "grad_norm": 0.14199669659137726, "learning_rate": 0.00019672815333882936, "loss": 1.0753, "step": 736 }, { "epoch": 0.056892315104794026, "grad_norm": 0.12904112040996552, "learning_rate": 0.00019671784830997528, "loss": 1.0749, "step": 738 }, { "epoch": 0.05704649482052517, "grad_norm": 0.1235031932592392, "learning_rate": 0.0001967075432811212, "loss": 1.0275, "step": 740 }, { "epoch": 0.057200674536256324, "grad_norm": 0.170023113489151, "learning_rate": 0.0001966972382522671, "loss": 1.1295, "step": 742 }, { "epoch": 0.057354854251987476, "grad_norm": 0.15533532202243805, "learning_rate": 0.00019668693322341302, "loss": 1.0629, "step": 744 }, { "epoch": 0.05750903396771862, "grad_norm": 0.1602126806974411, "learning_rate": 0.00019667662819455897, "loss": 1.1538, "step": 746 }, { "epoch": 0.057663213683449774, "grad_norm": 0.16433580219745636, "learning_rate": 0.00019666632316570488, "loss": 1.1322, "step": 748 }, { "epoch": 0.05781739339918092, "grad_norm": 0.13925233483314514, "learning_rate": 0.0001966560181368508, "loss": 1.083, "step": 750 }, { "epoch": 0.05797157311491207, "grad_norm": 0.12234565615653992, "learning_rate": 0.0001966457131079967, "loss": 1.0113, "step": 752 }, { "epoch": 0.05812575283064322, "grad_norm": 0.1425125002861023, "learning_rate": 0.00019663540807914262, "loss": 1.0762, "step": 754 }, { "epoch": 0.05827993254637437, "grad_norm": 0.14309099316596985, "learning_rate": 0.00019662510305028854, "loss": 1.0633, "step": 756 }, { "epoch": 0.058434112262105514, "grad_norm": 0.1381814330816269, "learning_rate": 0.00019661479802143448, "loss": 1.142, "step": 758 }, { "epoch": 0.058588291977836666, "grad_norm": 0.15551595389842987, "learning_rate": 0.0001966044929925804, "loss": 1.026, "step": 760 }, { "epoch": 0.05874247169356781, "grad_norm": 0.14606410264968872, "learning_rate": 0.0001965941879637263, "loss": 1.1265, "step": 762 }, { "epoch": 0.058896651409298964, "grad_norm": 0.13017289340496063, "learning_rate": 0.00019658388293487223, "loss": 1.1051, "step": 764 }, { "epoch": 0.059050831125030116, "grad_norm": 0.1500990092754364, "learning_rate": 0.00019657357790601814, "loss": 1.0948, "step": 766 }, { "epoch": 0.05920501084076126, "grad_norm": 0.14307473599910736, "learning_rate": 0.00019656327287716408, "loss": 1.0667, "step": 768 }, { "epoch": 0.059359190556492414, "grad_norm": 0.13513712584972382, "learning_rate": 0.00019655296784831, "loss": 1.0488, "step": 770 }, { "epoch": 0.05951337027222356, "grad_norm": 0.13991938531398773, "learning_rate": 0.0001965426628194559, "loss": 1.0888, "step": 772 }, { "epoch": 0.05966754998795471, "grad_norm": 0.15015999972820282, "learning_rate": 0.00019653235779060183, "loss": 1.0774, "step": 774 }, { "epoch": 0.05982172970368586, "grad_norm": 0.16419099271297455, "learning_rate": 0.00019652205276174774, "loss": 1.0661, "step": 776 }, { "epoch": 0.05997590941941701, "grad_norm": 0.12072901427745819, "learning_rate": 0.00019651174773289366, "loss": 1.0645, "step": 778 }, { "epoch": 0.060130089135148154, "grad_norm": 0.13410696387290955, "learning_rate": 0.00019650144270403957, "loss": 1.0677, "step": 780 }, { "epoch": 0.060284268850879306, "grad_norm": 0.13373896479606628, "learning_rate": 0.0001964911376751855, "loss": 1.0055, "step": 782 }, { "epoch": 0.06043844856661046, "grad_norm": 0.13043928146362305, "learning_rate": 0.0001964808326463314, "loss": 1.0579, "step": 784 }, { "epoch": 0.060592628282341604, "grad_norm": 0.13334155082702637, "learning_rate": 0.00019647052761747732, "loss": 1.0781, "step": 786 }, { "epoch": 0.060746807998072756, "grad_norm": 0.14660002291202545, "learning_rate": 0.00019646022258862326, "loss": 1.1244, "step": 788 }, { "epoch": 0.0609009877138039, "grad_norm": 0.1240791380405426, "learning_rate": 0.00019644991755976917, "loss": 1.0353, "step": 790 }, { "epoch": 0.061055167429535054, "grad_norm": 0.12248943001031876, "learning_rate": 0.0001964396125309151, "loss": 1.1292, "step": 792 }, { "epoch": 0.0612093471452662, "grad_norm": 0.1340823471546173, "learning_rate": 0.000196429307502061, "loss": 1.0764, "step": 794 }, { "epoch": 0.06136352686099735, "grad_norm": 0.1297413557767868, "learning_rate": 0.00019641900247320692, "loss": 1.0998, "step": 796 }, { "epoch": 0.0615177065767285, "grad_norm": 0.13512568175792694, "learning_rate": 0.00019640869744435286, "loss": 1.0349, "step": 798 }, { "epoch": 0.06167188629245965, "grad_norm": 0.13964438438415527, "learning_rate": 0.00019639839241549878, "loss": 1.0543, "step": 800 }, { "epoch": 0.06167188629245965, "eval_loss": 1.0952669382095337, "eval_runtime": 185.8383, "eval_samples_per_second": 91.171, "eval_steps_per_second": 1.426, "step": 800 }, { "epoch": 0.061826066008190794, "grad_norm": 0.1318446695804596, "learning_rate": 0.0001963880873866447, "loss": 1.1469, "step": 802 }, { "epoch": 0.061980245723921946, "grad_norm": 0.13778544962406158, "learning_rate": 0.0001963777823577906, "loss": 1.0361, "step": 804 }, { "epoch": 0.0621344254396531, "grad_norm": 0.14804169535636902, "learning_rate": 0.00019636747732893652, "loss": 1.0537, "step": 806 }, { "epoch": 0.062288605155384244, "grad_norm": 0.1363479495048523, "learning_rate": 0.00019635717230008246, "loss": 1.0819, "step": 808 }, { "epoch": 0.062442784871115396, "grad_norm": 0.12277363240718842, "learning_rate": 0.00019634686727122838, "loss": 1.0629, "step": 810 }, { "epoch": 0.06259696458684655, "grad_norm": 0.13027344644069672, "learning_rate": 0.0001963365622423743, "loss": 1.0544, "step": 812 }, { "epoch": 0.0627511443025777, "grad_norm": 0.1274079531431198, "learning_rate": 0.0001963262572135202, "loss": 1.0685, "step": 814 }, { "epoch": 0.06290532401830884, "grad_norm": 0.1349189281463623, "learning_rate": 0.00019631595218466612, "loss": 1.0289, "step": 816 }, { "epoch": 0.06305950373403998, "grad_norm": 0.1265273541212082, "learning_rate": 0.00019630564715581206, "loss": 1.0765, "step": 818 }, { "epoch": 0.06321368344977114, "grad_norm": 0.1393941193819046, "learning_rate": 0.00019629534212695798, "loss": 1.0918, "step": 820 }, { "epoch": 0.06336786316550229, "grad_norm": 0.12475106865167618, "learning_rate": 0.0001962850370981039, "loss": 1.027, "step": 822 }, { "epoch": 0.06352204288123343, "grad_norm": 0.13844382762908936, "learning_rate": 0.0001962747320692498, "loss": 1.1482, "step": 824 }, { "epoch": 0.0636762225969646, "grad_norm": 0.1444624364376068, "learning_rate": 0.00019626442704039572, "loss": 1.0659, "step": 826 }, { "epoch": 0.06383040231269574, "grad_norm": 0.13939915597438812, "learning_rate": 0.00019625412201154164, "loss": 1.0392, "step": 828 }, { "epoch": 0.06398458202842688, "grad_norm": 0.12919913232326508, "learning_rate": 0.00019624381698268755, "loss": 1.0566, "step": 830 }, { "epoch": 0.06413876174415803, "grad_norm": 0.1297498196363449, "learning_rate": 0.00019623351195383347, "loss": 1.058, "step": 832 }, { "epoch": 0.06429294145988919, "grad_norm": 0.16311457753181458, "learning_rate": 0.00019622320692497938, "loss": 1.1175, "step": 834 }, { "epoch": 0.06444712117562033, "grad_norm": 0.14434239268302917, "learning_rate": 0.0001962129018961253, "loss": 1.0966, "step": 836 }, { "epoch": 0.06460130089135148, "grad_norm": 0.13500697910785675, "learning_rate": 0.00019620259686727121, "loss": 1.138, "step": 838 }, { "epoch": 0.06475548060708262, "grad_norm": 0.13175781071186066, "learning_rate": 0.00019619229183841716, "loss": 1.0744, "step": 840 }, { "epoch": 0.06490966032281378, "grad_norm": 0.142098531126976, "learning_rate": 0.00019618198680956307, "loss": 1.0686, "step": 842 }, { "epoch": 0.06506384003854493, "grad_norm": 0.16844119131565094, "learning_rate": 0.00019617168178070899, "loss": 1.0992, "step": 844 }, { "epoch": 0.06521801975427607, "grad_norm": 0.13562923669815063, "learning_rate": 0.0001961613767518549, "loss": 1.0749, "step": 846 }, { "epoch": 0.06537219947000723, "grad_norm": 0.14538466930389404, "learning_rate": 0.00019615107172300082, "loss": 1.123, "step": 848 }, { "epoch": 0.06552637918573838, "grad_norm": 0.13058879971504211, "learning_rate": 0.00019614076669414676, "loss": 1.0835, "step": 850 }, { "epoch": 0.06568055890146952, "grad_norm": 0.1567140519618988, "learning_rate": 0.00019613046166529267, "loss": 1.1157, "step": 852 }, { "epoch": 0.06583473861720067, "grad_norm": 0.12576104700565338, "learning_rate": 0.0001961201566364386, "loss": 1.0143, "step": 854 }, { "epoch": 0.06598891833293183, "grad_norm": 0.13823091983795166, "learning_rate": 0.0001961098516075845, "loss": 1.0797, "step": 856 }, { "epoch": 0.06614309804866297, "grad_norm": 0.12293639779090881, "learning_rate": 0.00019609954657873042, "loss": 1.0808, "step": 858 }, { "epoch": 0.06629727776439412, "grad_norm": 0.13951502740383148, "learning_rate": 0.00019608924154987636, "loss": 1.076, "step": 860 }, { "epoch": 0.06645145748012526, "grad_norm": 0.13900773227214813, "learning_rate": 0.00019607893652102227, "loss": 1.0846, "step": 862 }, { "epoch": 0.06660563719585642, "grad_norm": 0.14335249364376068, "learning_rate": 0.0001960686314921682, "loss": 1.0639, "step": 864 }, { "epoch": 0.06675981691158757, "grad_norm": 0.1712643951177597, "learning_rate": 0.0001960583264633141, "loss": 1.1411, "step": 866 }, { "epoch": 0.06691399662731871, "grad_norm": 0.12118082493543625, "learning_rate": 0.00019604802143446002, "loss": 1.0807, "step": 868 }, { "epoch": 0.06706817634304987, "grad_norm": 0.141808420419693, "learning_rate": 0.00019603771640560596, "loss": 1.0641, "step": 870 }, { "epoch": 0.06722235605878102, "grad_norm": 0.14798308908939362, "learning_rate": 0.00019602741137675188, "loss": 1.073, "step": 872 }, { "epoch": 0.06737653577451216, "grad_norm": 0.13768306374549866, "learning_rate": 0.0001960171063478978, "loss": 1.0735, "step": 874 }, { "epoch": 0.06753071549024331, "grad_norm": 0.12452355027198792, "learning_rate": 0.0001960068013190437, "loss": 1.0509, "step": 876 }, { "epoch": 0.06768489520597447, "grad_norm": 0.1402217000722885, "learning_rate": 0.00019599649629018962, "loss": 1.1157, "step": 878 }, { "epoch": 0.06783907492170561, "grad_norm": 0.12509870529174805, "learning_rate": 0.00019598619126133556, "loss": 1.0516, "step": 880 }, { "epoch": 0.06799325463743676, "grad_norm": 0.1574297547340393, "learning_rate": 0.00019597588623248148, "loss": 1.0823, "step": 882 }, { "epoch": 0.0681474343531679, "grad_norm": 0.14185413718223572, "learning_rate": 0.0001959655812036274, "loss": 1.0444, "step": 884 }, { "epoch": 0.06830161406889906, "grad_norm": 0.1380462348461151, "learning_rate": 0.0001959552761747733, "loss": 1.1066, "step": 886 }, { "epoch": 0.06845579378463021, "grad_norm": 0.12986746430397034, "learning_rate": 0.00019594497114591922, "loss": 1.1006, "step": 888 }, { "epoch": 0.06860997350036135, "grad_norm": 0.13894346356391907, "learning_rate": 0.00019593466611706514, "loss": 1.0569, "step": 890 }, { "epoch": 0.06876415321609251, "grad_norm": 0.12822435796260834, "learning_rate": 0.00019592436108821105, "loss": 1.0696, "step": 892 }, { "epoch": 0.06891833293182366, "grad_norm": 0.1369408816099167, "learning_rate": 0.00019591405605935697, "loss": 1.0691, "step": 894 }, { "epoch": 0.0690725126475548, "grad_norm": 0.13459660112857819, "learning_rate": 0.00019590375103050288, "loss": 1.0801, "step": 896 }, { "epoch": 0.06922669236328595, "grad_norm": 0.1299123764038086, "learning_rate": 0.0001958934460016488, "loss": 1.0885, "step": 898 }, { "epoch": 0.06938087207901711, "grad_norm": 0.12562230229377747, "learning_rate": 0.00019588314097279474, "loss": 1.183, "step": 900 }, { "epoch": 0.06938087207901711, "eval_loss": 1.0944268703460693, "eval_runtime": 185.3723, "eval_samples_per_second": 91.4, "eval_steps_per_second": 1.43, "step": 900 }, { "epoch": 0.06953505179474825, "grad_norm": 0.13996927440166473, "learning_rate": 0.00019587283594394065, "loss": 1.0356, "step": 902 }, { "epoch": 0.0696892315104794, "grad_norm": 0.128004252910614, "learning_rate": 0.00019586253091508657, "loss": 1.0343, "step": 904 }, { "epoch": 0.06984341122621056, "grad_norm": 0.15650418400764465, "learning_rate": 0.00019585222588623248, "loss": 1.1138, "step": 906 }, { "epoch": 0.0699975909419417, "grad_norm": 0.5840476751327515, "learning_rate": 0.0001958419208573784, "loss": 1.1785, "step": 908 }, { "epoch": 0.07015177065767285, "grad_norm": 0.15330374240875244, "learning_rate": 0.00019583161582852434, "loss": 1.0243, "step": 910 }, { "epoch": 0.070305950373404, "grad_norm": 0.1603543907403946, "learning_rate": 0.00019582131079967026, "loss": 1.1228, "step": 912 }, { "epoch": 0.07046013008913515, "grad_norm": 0.14209845662117004, "learning_rate": 0.00019581100577081617, "loss": 1.0939, "step": 914 }, { "epoch": 0.0706143098048663, "grad_norm": 0.16117019951343536, "learning_rate": 0.00019580070074196209, "loss": 1.1447, "step": 916 }, { "epoch": 0.07076848952059744, "grad_norm": 0.14068694412708282, "learning_rate": 0.000195790395713108, "loss": 1.0642, "step": 918 }, { "epoch": 0.07092266923632859, "grad_norm": 0.15248316526412964, "learning_rate": 0.00019578009068425394, "loss": 1.0162, "step": 920 }, { "epoch": 0.07107684895205975, "grad_norm": 0.22734233736991882, "learning_rate": 0.00019576978565539986, "loss": 1.1123, "step": 922 }, { "epoch": 0.0712310286677909, "grad_norm": 0.1393287032842636, "learning_rate": 0.00019575948062654577, "loss": 1.0862, "step": 924 }, { "epoch": 0.07138520838352204, "grad_norm": 0.12911191582679749, "learning_rate": 0.0001957491755976917, "loss": 1.0651, "step": 926 }, { "epoch": 0.0715393880992532, "grad_norm": 0.12298440933227539, "learning_rate": 0.0001957388705688376, "loss": 1.1227, "step": 928 }, { "epoch": 0.07169356781498434, "grad_norm": 0.14941005408763885, "learning_rate": 0.00019572856553998352, "loss": 1.0989, "step": 930 }, { "epoch": 0.07184774753071549, "grad_norm": 0.1411515325307846, "learning_rate": 0.00019571826051112946, "loss": 1.0816, "step": 932 }, { "epoch": 0.07200192724644663, "grad_norm": 0.11999720335006714, "learning_rate": 0.00019570795548227537, "loss": 1.0306, "step": 934 }, { "epoch": 0.0721561069621778, "grad_norm": 0.1500861495733261, "learning_rate": 0.0001956976504534213, "loss": 1.0678, "step": 936 }, { "epoch": 0.07231028667790894, "grad_norm": 0.12102475017309189, "learning_rate": 0.0001956873454245672, "loss": 1.0534, "step": 938 }, { "epoch": 0.07246446639364008, "grad_norm": 0.11554603278636932, "learning_rate": 0.00019567704039571312, "loss": 1.0535, "step": 940 }, { "epoch": 0.07261864610937123, "grad_norm": 0.12290264666080475, "learning_rate": 0.00019566673536685903, "loss": 1.0738, "step": 942 }, { "epoch": 0.07277282582510239, "grad_norm": 0.17740991711616516, "learning_rate": 0.00019565643033800495, "loss": 1.0811, "step": 944 }, { "epoch": 0.07292700554083353, "grad_norm": 0.14767777919769287, "learning_rate": 0.00019564612530915086, "loss": 1.105, "step": 946 }, { "epoch": 0.07308118525656468, "grad_norm": 0.13773177564144135, "learning_rate": 0.00019563582028029678, "loss": 1.0983, "step": 948 }, { "epoch": 0.07323536497229584, "grad_norm": 0.13891370594501495, "learning_rate": 0.0001956255152514427, "loss": 1.1349, "step": 950 }, { "epoch": 0.07338954468802698, "grad_norm": 0.14717017114162445, "learning_rate": 0.00019561521022258863, "loss": 1.134, "step": 952 }, { "epoch": 0.07354372440375813, "grad_norm": 0.15095743536949158, "learning_rate": 0.00019560490519373455, "loss": 1.063, "step": 954 }, { "epoch": 0.07369790411948927, "grad_norm": 0.12851206958293915, "learning_rate": 0.00019559460016488046, "loss": 1.1005, "step": 956 }, { "epoch": 0.07385208383522043, "grad_norm": 0.13364006578922272, "learning_rate": 0.00019558429513602638, "loss": 1.0429, "step": 958 }, { "epoch": 0.07400626355095158, "grad_norm": 0.1326039433479309, "learning_rate": 0.0001955739901071723, "loss": 1.1586, "step": 960 }, { "epoch": 0.07416044326668272, "grad_norm": 0.13149486482143402, "learning_rate": 0.00019556368507831824, "loss": 1.109, "step": 962 }, { "epoch": 0.07431462298241387, "grad_norm": 0.1189669519662857, "learning_rate": 0.00019555338004946415, "loss": 1.0462, "step": 964 }, { "epoch": 0.07446880269814503, "grad_norm": 0.14341482520103455, "learning_rate": 0.00019554307502061007, "loss": 1.0623, "step": 966 }, { "epoch": 0.07462298241387617, "grad_norm": 0.14133721590042114, "learning_rate": 0.00019553276999175598, "loss": 1.0945, "step": 968 }, { "epoch": 0.07477716212960732, "grad_norm": 0.1351941078901291, "learning_rate": 0.0001955224649629019, "loss": 1.0327, "step": 970 }, { "epoch": 0.07493134184533848, "grad_norm": 0.12836019694805145, "learning_rate": 0.00019551215993404784, "loss": 1.069, "step": 972 }, { "epoch": 0.07508552156106962, "grad_norm": 0.13199055194854736, "learning_rate": 0.00019550185490519375, "loss": 1.0323, "step": 974 }, { "epoch": 0.07523970127680077, "grad_norm": 0.14991353452205658, "learning_rate": 0.00019549154987633967, "loss": 1.0625, "step": 976 }, { "epoch": 0.07539388099253191, "grad_norm": 0.13832435011863708, "learning_rate": 0.00019548124484748558, "loss": 1.1031, "step": 978 }, { "epoch": 0.07554806070826307, "grad_norm": 0.12351599335670471, "learning_rate": 0.0001954709398186315, "loss": 1.0286, "step": 980 }, { "epoch": 0.07570224042399422, "grad_norm": 0.12360050529241562, "learning_rate": 0.00019546063478977744, "loss": 1.0652, "step": 982 }, { "epoch": 0.07585642013972536, "grad_norm": 0.13384872674942017, "learning_rate": 0.00019545032976092335, "loss": 1.1125, "step": 984 }, { "epoch": 0.07601059985545652, "grad_norm": 0.13200527429580688, "learning_rate": 0.00019544002473206927, "loss": 1.0727, "step": 986 }, { "epoch": 0.07616477957118767, "grad_norm": 0.143647700548172, "learning_rate": 0.00019542971970321518, "loss": 1.1207, "step": 988 }, { "epoch": 0.07631895928691881, "grad_norm": 0.13605177402496338, "learning_rate": 0.0001954194146743611, "loss": 1.0225, "step": 990 }, { "epoch": 0.07647313900264996, "grad_norm": 0.12646125257015228, "learning_rate": 0.00019540910964550701, "loss": 1.11, "step": 992 }, { "epoch": 0.07662731871838112, "grad_norm": 0.132467120885849, "learning_rate": 0.00019539880461665293, "loss": 1.1092, "step": 994 }, { "epoch": 0.07678149843411226, "grad_norm": 0.12461701035499573, "learning_rate": 0.00019538849958779884, "loss": 1.0854, "step": 996 }, { "epoch": 0.07693567814984341, "grad_norm": 0.13430501520633698, "learning_rate": 0.00019537819455894476, "loss": 1.2, "step": 998 }, { "epoch": 0.07708985786557455, "grad_norm": 0.12623916566371918, "learning_rate": 0.00019536788953009067, "loss": 1.0522, "step": 1000 }, { "epoch": 0.07708985786557455, "eval_loss": 1.0930616855621338, "eval_runtime": 185.4001, "eval_samples_per_second": 91.386, "eval_steps_per_second": 1.429, "step": 1000 }, { "epoch": 0.07724403758130571, "grad_norm": 0.11760087311267853, "learning_rate": 0.00019535758450123662, "loss": 1.1566, "step": 1002 }, { "epoch": 0.07739821729703686, "grad_norm": 0.145633727312088, "learning_rate": 0.00019534727947238253, "loss": 1.094, "step": 1004 }, { "epoch": 0.077552397012768, "grad_norm": 0.1311633288860321, "learning_rate": 0.00019533697444352845, "loss": 1.0792, "step": 1006 }, { "epoch": 0.07770657672849916, "grad_norm": 0.12563548982143402, "learning_rate": 0.00019532666941467436, "loss": 1.0601, "step": 1008 }, { "epoch": 0.07786075644423031, "grad_norm": 0.14429886639118195, "learning_rate": 0.00019531636438582028, "loss": 1.0926, "step": 1010 }, { "epoch": 0.07801493615996145, "grad_norm": 0.13131891191005707, "learning_rate": 0.0001953060593569662, "loss": 1.1012, "step": 1012 }, { "epoch": 0.0781691158756926, "grad_norm": 0.14185300469398499, "learning_rate": 0.00019529575432811213, "loss": 1.1113, "step": 1014 }, { "epoch": 0.07832329559142376, "grad_norm": 0.14298418164253235, "learning_rate": 0.00019528544929925805, "loss": 1.0909, "step": 1016 }, { "epoch": 0.0784774753071549, "grad_norm": 0.1339821219444275, "learning_rate": 0.00019527514427040396, "loss": 1.0994, "step": 1018 }, { "epoch": 0.07863165502288605, "grad_norm": 0.1252928525209427, "learning_rate": 0.00019526483924154988, "loss": 1.0316, "step": 1020 }, { "epoch": 0.0787858347386172, "grad_norm": 0.1277703046798706, "learning_rate": 0.0001952545342126958, "loss": 1.1067, "step": 1022 }, { "epoch": 0.07894001445434835, "grad_norm": 0.12644124031066895, "learning_rate": 0.00019524422918384173, "loss": 1.0176, "step": 1024 }, { "epoch": 0.0790941941700795, "grad_norm": 0.13443627953529358, "learning_rate": 0.00019523392415498765, "loss": 1.0754, "step": 1026 }, { "epoch": 0.07924837388581064, "grad_norm": 0.1895609050989151, "learning_rate": 0.00019522361912613356, "loss": 1.0551, "step": 1028 }, { "epoch": 0.0794025536015418, "grad_norm": 0.1372397392988205, "learning_rate": 0.00019521331409727948, "loss": 1.0442, "step": 1030 }, { "epoch": 0.07955673331727295, "grad_norm": 0.14173942804336548, "learning_rate": 0.0001952030090684254, "loss": 1.0692, "step": 1032 }, { "epoch": 0.0797109130330041, "grad_norm": 0.12321804463863373, "learning_rate": 0.00019519270403957134, "loss": 1.0276, "step": 1034 }, { "epoch": 0.07986509274873524, "grad_norm": 0.12327130138874054, "learning_rate": 0.00019518239901071725, "loss": 1.0376, "step": 1036 }, { "epoch": 0.0800192724644664, "grad_norm": 0.12301841378211975, "learning_rate": 0.00019517209398186317, "loss": 1.0887, "step": 1038 }, { "epoch": 0.08017345218019754, "grad_norm": 0.1429559886455536, "learning_rate": 0.00019516178895300908, "loss": 1.0321, "step": 1040 }, { "epoch": 0.08032763189592869, "grad_norm": 0.13955366611480713, "learning_rate": 0.000195151483924155, "loss": 1.1081, "step": 1042 }, { "epoch": 0.08048181161165983, "grad_norm": 0.13553303480148315, "learning_rate": 0.00019514117889530094, "loss": 1.0252, "step": 1044 }, { "epoch": 0.080635991327391, "grad_norm": 0.14100225269794464, "learning_rate": 0.00019513087386644685, "loss": 1.1071, "step": 1046 }, { "epoch": 0.08079017104312214, "grad_norm": 0.14522643387317657, "learning_rate": 0.00019512056883759277, "loss": 1.0653, "step": 1048 }, { "epoch": 0.08094435075885328, "grad_norm": 0.14540371298789978, "learning_rate": 0.00019511026380873868, "loss": 1.01, "step": 1050 }, { "epoch": 0.08109853047458444, "grad_norm": 0.1459018737077713, "learning_rate": 0.0001950999587798846, "loss": 1.1147, "step": 1052 }, { "epoch": 0.08125271019031559, "grad_norm": 0.12590867280960083, "learning_rate": 0.0001950896537510305, "loss": 1.0685, "step": 1054 }, { "epoch": 0.08140688990604673, "grad_norm": 0.11943504959344864, "learning_rate": 0.00019507934872217643, "loss": 1.0854, "step": 1056 }, { "epoch": 0.08156106962177788, "grad_norm": 0.12039398401975632, "learning_rate": 0.00019506904369332234, "loss": 1.1397, "step": 1058 }, { "epoch": 0.08171524933750904, "grad_norm": 0.1411554217338562, "learning_rate": 0.00019505873866446826, "loss": 1.1271, "step": 1060 }, { "epoch": 0.08186942905324018, "grad_norm": 0.1402871012687683, "learning_rate": 0.00019504843363561417, "loss": 1.0425, "step": 1062 }, { "epoch": 0.08202360876897133, "grad_norm": 0.13545840978622437, "learning_rate": 0.00019503812860676011, "loss": 1.0571, "step": 1064 }, { "epoch": 0.08217778848470249, "grad_norm": 0.12789209187030792, "learning_rate": 0.00019502782357790603, "loss": 1.0596, "step": 1066 }, { "epoch": 0.08233196820043363, "grad_norm": 0.13018928468227386, "learning_rate": 0.00019501751854905194, "loss": 1.1188, "step": 1068 }, { "epoch": 0.08248614791616478, "grad_norm": 0.12482234835624695, "learning_rate": 0.00019500721352019786, "loss": 1.0831, "step": 1070 }, { "epoch": 0.08264032763189592, "grad_norm": 0.11897309869527817, "learning_rate": 0.00019499690849134377, "loss": 1.0658, "step": 1072 }, { "epoch": 0.08279450734762708, "grad_norm": 0.12954497337341309, "learning_rate": 0.00019498660346248972, "loss": 1.0204, "step": 1074 }, { "epoch": 0.08294868706335823, "grad_norm": 0.14220042526721954, "learning_rate": 0.00019497629843363563, "loss": 1.1101, "step": 1076 }, { "epoch": 0.08310286677908937, "grad_norm": 0.1631559580564499, "learning_rate": 0.00019496599340478155, "loss": 1.1352, "step": 1078 }, { "epoch": 0.08325704649482052, "grad_norm": 0.13439539074897766, "learning_rate": 0.00019495568837592746, "loss": 1.0108, "step": 1080 }, { "epoch": 0.08341122621055168, "grad_norm": 0.12389718741178513, "learning_rate": 0.00019494538334707338, "loss": 1.0155, "step": 1082 }, { "epoch": 0.08356540592628282, "grad_norm": 0.1241556853055954, "learning_rate": 0.00019493507831821932, "loss": 1.1428, "step": 1084 }, { "epoch": 0.08371958564201397, "grad_norm": 0.13087880611419678, "learning_rate": 0.00019492477328936523, "loss": 1.0876, "step": 1086 }, { "epoch": 0.08387376535774513, "grad_norm": 0.12431449443101883, "learning_rate": 0.00019491446826051115, "loss": 1.0758, "step": 1088 }, { "epoch": 0.08402794507347627, "grad_norm": 0.13807635009288788, "learning_rate": 0.00019490416323165706, "loss": 1.0902, "step": 1090 }, { "epoch": 0.08418212478920742, "grad_norm": 0.12751048803329468, "learning_rate": 0.00019489385820280298, "loss": 1.0732, "step": 1092 }, { "epoch": 0.08433630450493856, "grad_norm": 0.15594707429409027, "learning_rate": 0.00019488355317394892, "loss": 1.1115, "step": 1094 }, { "epoch": 0.08449048422066972, "grad_norm": 0.11647301912307739, "learning_rate": 0.00019487324814509483, "loss": 1.1592, "step": 1096 }, { "epoch": 0.08464466393640087, "grad_norm": 0.13609850406646729, "learning_rate": 0.00019486294311624075, "loss": 1.1139, "step": 1098 }, { "epoch": 0.08479884365213201, "grad_norm": 0.1234198659658432, "learning_rate": 0.00019485263808738666, "loss": 1.0682, "step": 1100 }, { "epoch": 0.08479884365213201, "eval_loss": 1.0920624732971191, "eval_runtime": 185.5142, "eval_samples_per_second": 91.33, "eval_steps_per_second": 1.428, "step": 1100 }, { "epoch": 0.08495302336786316, "grad_norm": 0.1375039666891098, "learning_rate": 0.00019484233305853258, "loss": 1.0585, "step": 1102 }, { "epoch": 0.08510720308359432, "grad_norm": 0.14471521973609924, "learning_rate": 0.0001948320280296785, "loss": 1.1115, "step": 1104 }, { "epoch": 0.08526138279932546, "grad_norm": 0.12425632029771805, "learning_rate": 0.0001948217230008244, "loss": 1.0501, "step": 1106 }, { "epoch": 0.08541556251505661, "grad_norm": 0.1161596029996872, "learning_rate": 0.00019481141797197032, "loss": 1.0182, "step": 1108 }, { "epoch": 0.08556974223078777, "grad_norm": 0.11700072139501572, "learning_rate": 0.00019480111294311624, "loss": 1.0579, "step": 1110 }, { "epoch": 0.08572392194651891, "grad_norm": 0.14330415427684784, "learning_rate": 0.00019479080791426215, "loss": 1.1211, "step": 1112 }, { "epoch": 0.08587810166225006, "grad_norm": 0.14039026200771332, "learning_rate": 0.00019478050288540807, "loss": 1.0826, "step": 1114 }, { "epoch": 0.0860322813779812, "grad_norm": 0.14031362533569336, "learning_rate": 0.000194770197856554, "loss": 1.0871, "step": 1116 }, { "epoch": 0.08618646109371236, "grad_norm": 0.12351037561893463, "learning_rate": 0.00019475989282769993, "loss": 1.001, "step": 1118 }, { "epoch": 0.08634064080944351, "grad_norm": 0.11667052656412125, "learning_rate": 0.00019474958779884584, "loss": 1.0421, "step": 1120 }, { "epoch": 0.08649482052517465, "grad_norm": 0.1489124447107315, "learning_rate": 0.00019473928276999175, "loss": 1.1644, "step": 1122 }, { "epoch": 0.0866490002409058, "grad_norm": 0.1338202804327011, "learning_rate": 0.00019472897774113767, "loss": 1.1239, "step": 1124 }, { "epoch": 0.08680317995663696, "grad_norm": 0.13266493380069733, "learning_rate": 0.0001947186727122836, "loss": 1.0839, "step": 1126 }, { "epoch": 0.0869573596723681, "grad_norm": 0.13726286590099335, "learning_rate": 0.00019470836768342953, "loss": 1.1325, "step": 1128 }, { "epoch": 0.08711153938809925, "grad_norm": 0.14077100157737732, "learning_rate": 0.00019469806265457544, "loss": 1.0429, "step": 1130 }, { "epoch": 0.08726571910383041, "grad_norm": 0.1362866312265396, "learning_rate": 0.00019468775762572136, "loss": 1.0715, "step": 1132 }, { "epoch": 0.08741989881956155, "grad_norm": 0.12472223490476608, "learning_rate": 0.00019467745259686727, "loss": 1.0503, "step": 1134 }, { "epoch": 0.0875740785352927, "grad_norm": 0.1350635141134262, "learning_rate": 0.0001946671475680132, "loss": 1.0498, "step": 1136 }, { "epoch": 0.08772825825102384, "grad_norm": 0.1424301117658615, "learning_rate": 0.00019465684253915913, "loss": 1.1589, "step": 1138 }, { "epoch": 0.087882437966755, "grad_norm": 0.12365067005157471, "learning_rate": 0.00019464653751030504, "loss": 1.1065, "step": 1140 }, { "epoch": 0.08803661768248615, "grad_norm": 0.16497495770454407, "learning_rate": 0.00019463623248145096, "loss": 1.0189, "step": 1142 }, { "epoch": 0.0881907973982173, "grad_norm": 0.1381298303604126, "learning_rate": 0.00019462592745259687, "loss": 1.0426, "step": 1144 }, { "epoch": 0.08834497711394845, "grad_norm": 0.15007291734218597, "learning_rate": 0.00019461562242374282, "loss": 1.1108, "step": 1146 }, { "epoch": 0.0884991568296796, "grad_norm": 0.19384606182575226, "learning_rate": 0.00019460531739488873, "loss": 1.0664, "step": 1148 }, { "epoch": 0.08865333654541074, "grad_norm": 0.12032177299261093, "learning_rate": 0.00019459501236603465, "loss": 1.018, "step": 1150 }, { "epoch": 0.08880751626114189, "grad_norm": 0.1197669506072998, "learning_rate": 0.00019458470733718056, "loss": 1.071, "step": 1152 }, { "epoch": 0.08896169597687305, "grad_norm": 0.12108784914016724, "learning_rate": 0.00019457440230832647, "loss": 1.0499, "step": 1154 }, { "epoch": 0.0891158756926042, "grad_norm": 0.1270270049571991, "learning_rate": 0.0001945640972794724, "loss": 1.1172, "step": 1156 }, { "epoch": 0.08927005540833534, "grad_norm": 0.13599786162376404, "learning_rate": 0.0001945537922506183, "loss": 1.103, "step": 1158 }, { "epoch": 0.08942423512406648, "grad_norm": 0.12051045894622803, "learning_rate": 0.00019454348722176422, "loss": 1.0905, "step": 1160 }, { "epoch": 0.08957841483979764, "grad_norm": 0.12117696553468704, "learning_rate": 0.00019453318219291013, "loss": 1.0611, "step": 1162 }, { "epoch": 0.08973259455552879, "grad_norm": 0.13710887730121613, "learning_rate": 0.00019452287716405605, "loss": 1.0242, "step": 1164 }, { "epoch": 0.08988677427125993, "grad_norm": 0.1160813644528389, "learning_rate": 0.000194512572135202, "loss": 1.0863, "step": 1166 }, { "epoch": 0.09004095398699109, "grad_norm": 0.1754099279642105, "learning_rate": 0.0001945022671063479, "loss": 1.0938, "step": 1168 }, { "epoch": 0.09019513370272224, "grad_norm": 0.1331128627061844, "learning_rate": 0.00019449196207749382, "loss": 1.0692, "step": 1170 }, { "epoch": 0.09034931341845338, "grad_norm": 0.13422611355781555, "learning_rate": 0.00019448165704863974, "loss": 1.0699, "step": 1172 }, { "epoch": 0.09050349313418453, "grad_norm": 0.12999802827835083, "learning_rate": 0.00019447135201978565, "loss": 1.0957, "step": 1174 }, { "epoch": 0.09065767284991569, "grad_norm": 0.13413815200328827, "learning_rate": 0.0001944610469909316, "loss": 1.0869, "step": 1176 }, { "epoch": 0.09081185256564683, "grad_norm": 0.12901006639003754, "learning_rate": 0.0001944507419620775, "loss": 1.0442, "step": 1178 }, { "epoch": 0.09096603228137798, "grad_norm": 0.11824194341897964, "learning_rate": 0.00019444043693322342, "loss": 1.0935, "step": 1180 }, { "epoch": 0.09112021199710912, "grad_norm": 0.14895616471767426, "learning_rate": 0.00019443013190436934, "loss": 1.0624, "step": 1182 }, { "epoch": 0.09127439171284028, "grad_norm": 0.13515722751617432, "learning_rate": 0.00019441982687551525, "loss": 1.0797, "step": 1184 }, { "epoch": 0.09142857142857143, "grad_norm": 0.13411575555801392, "learning_rate": 0.00019440952184666117, "loss": 1.0637, "step": 1186 }, { "epoch": 0.09158275114430257, "grad_norm": 0.12519463896751404, "learning_rate": 0.0001943992168178071, "loss": 1.0608, "step": 1188 }, { "epoch": 0.09173693086003373, "grad_norm": 0.1267428696155548, "learning_rate": 0.00019438891178895302, "loss": 1.0182, "step": 1190 }, { "epoch": 0.09189111057576488, "grad_norm": 0.13116560876369476, "learning_rate": 0.00019437860676009894, "loss": 1.1139, "step": 1192 }, { "epoch": 0.09204529029149602, "grad_norm": 0.14659713208675385, "learning_rate": 0.00019436830173124485, "loss": 1.1275, "step": 1194 }, { "epoch": 0.09219947000722717, "grad_norm": 0.12913885712623596, "learning_rate": 0.00019435799670239077, "loss": 1.0858, "step": 1196 }, { "epoch": 0.09235364972295833, "grad_norm": 0.12855856120586395, "learning_rate": 0.0001943476916735367, "loss": 1.0811, "step": 1198 }, { "epoch": 0.09250782943868947, "grad_norm": 0.1391747146844864, "learning_rate": 0.00019433738664468263, "loss": 1.0146, "step": 1200 }, { "epoch": 0.09250782943868947, "eval_loss": 1.0912913084030151, "eval_runtime": 185.3661, "eval_samples_per_second": 91.403, "eval_steps_per_second": 1.43, "step": 1200 }, { "epoch": 0.09266200915442062, "grad_norm": 0.13186782598495483, "learning_rate": 0.00019432708161582854, "loss": 1.1017, "step": 1202 }, { "epoch": 0.09281618887015176, "grad_norm": 0.12913943827152252, "learning_rate": 0.00019431677658697446, "loss": 1.1027, "step": 1204 }, { "epoch": 0.09297036858588292, "grad_norm": 0.1349743753671646, "learning_rate": 0.00019430647155812037, "loss": 1.1023, "step": 1206 }, { "epoch": 0.09312454830161407, "grad_norm": 0.12534667551517487, "learning_rate": 0.00019429616652926629, "loss": 1.0659, "step": 1208 }, { "epoch": 0.09327872801734521, "grad_norm": 0.11720700562000275, "learning_rate": 0.0001942858615004122, "loss": 1.0532, "step": 1210 }, { "epoch": 0.09343290773307637, "grad_norm": 0.1364222913980484, "learning_rate": 0.00019427555647155812, "loss": 1.0575, "step": 1212 }, { "epoch": 0.09358708744880752, "grad_norm": 0.15532977879047394, "learning_rate": 0.00019426525144270403, "loss": 1.1145, "step": 1214 }, { "epoch": 0.09374126716453866, "grad_norm": 0.1377478837966919, "learning_rate": 0.00019425494641384995, "loss": 1.0505, "step": 1216 }, { "epoch": 0.09389544688026981, "grad_norm": 0.1273409128189087, "learning_rate": 0.0001942446413849959, "loss": 1.0873, "step": 1218 }, { "epoch": 0.09404962659600097, "grad_norm": 0.11990435421466827, "learning_rate": 0.0001942343363561418, "loss": 1.0829, "step": 1220 }, { "epoch": 0.09420380631173211, "grad_norm": 0.14191892743110657, "learning_rate": 0.00019422403132728772, "loss": 1.0992, "step": 1222 }, { "epoch": 0.09435798602746326, "grad_norm": 0.14520397782325745, "learning_rate": 0.00019421372629843363, "loss": 1.0712, "step": 1224 }, { "epoch": 0.09451216574319442, "grad_norm": 0.13780727982521057, "learning_rate": 0.00019420342126957955, "loss": 0.9943, "step": 1226 }, { "epoch": 0.09466634545892556, "grad_norm": 0.13550738990306854, "learning_rate": 0.0001941931162407255, "loss": 1.1264, "step": 1228 }, { "epoch": 0.09482052517465671, "grad_norm": 0.12125276774168015, "learning_rate": 0.0001941828112118714, "loss": 1.1207, "step": 1230 }, { "epoch": 0.09497470489038785, "grad_norm": 0.14529301226139069, "learning_rate": 0.00019417250618301732, "loss": 1.144, "step": 1232 }, { "epoch": 0.09512888460611901, "grad_norm": 0.15477551519870758, "learning_rate": 0.00019416220115416323, "loss": 1.0568, "step": 1234 }, { "epoch": 0.09528306432185016, "grad_norm": 0.1299963742494583, "learning_rate": 0.00019415189612530915, "loss": 1.0235, "step": 1236 }, { "epoch": 0.0954372440375813, "grad_norm": 0.1372281014919281, "learning_rate": 0.0001941415910964551, "loss": 1.0764, "step": 1238 }, { "epoch": 0.09559142375331245, "grad_norm": 0.1247306764125824, "learning_rate": 0.000194131286067601, "loss": 1.1345, "step": 1240 }, { "epoch": 0.09574560346904361, "grad_norm": 0.1330571472644806, "learning_rate": 0.00019412098103874692, "loss": 1.1596, "step": 1242 }, { "epoch": 0.09589978318477475, "grad_norm": 0.15787385404109955, "learning_rate": 0.00019411067600989284, "loss": 1.1067, "step": 1244 }, { "epoch": 0.0960539629005059, "grad_norm": 0.12646274268627167, "learning_rate": 0.00019410037098103875, "loss": 1.0769, "step": 1246 }, { "epoch": 0.09620814261623706, "grad_norm": 0.16424262523651123, "learning_rate": 0.0001940900659521847, "loss": 1.0459, "step": 1248 }, { "epoch": 0.0963623223319682, "grad_norm": 0.1401062309741974, "learning_rate": 0.0001940797609233306, "loss": 1.1308, "step": 1250 }, { "epoch": 0.09651650204769935, "grad_norm": 0.13971561193466187, "learning_rate": 0.00019406945589447652, "loss": 1.1457, "step": 1252 }, { "epoch": 0.0966706817634305, "grad_norm": 0.13544687628746033, "learning_rate": 0.00019405915086562244, "loss": 1.0532, "step": 1254 }, { "epoch": 0.09682486147916165, "grad_norm": 0.13527531921863556, "learning_rate": 0.00019404884583676835, "loss": 1.0376, "step": 1256 }, { "epoch": 0.0969790411948928, "grad_norm": 0.1731848120689392, "learning_rate": 0.0001940385408079143, "loss": 1.2252, "step": 1258 }, { "epoch": 0.09713322091062394, "grad_norm": 0.13142083585262299, "learning_rate": 0.0001940282357790602, "loss": 1.0254, "step": 1260 }, { "epoch": 0.09728740062635509, "grad_norm": 0.13390247523784637, "learning_rate": 0.00019401793075020612, "loss": 1.0448, "step": 1262 }, { "epoch": 0.09744158034208625, "grad_norm": 0.15188650786876678, "learning_rate": 0.00019400762572135204, "loss": 1.1019, "step": 1264 }, { "epoch": 0.0975957600578174, "grad_norm": 0.14055617153644562, "learning_rate": 0.00019399732069249795, "loss": 1.0835, "step": 1266 }, { "epoch": 0.09774993977354854, "grad_norm": 0.12209255248308182, "learning_rate": 0.00019398701566364387, "loss": 1.0675, "step": 1268 }, { "epoch": 0.0979041194892797, "grad_norm": 0.14639706909656525, "learning_rate": 0.00019397671063478978, "loss": 1.049, "step": 1270 }, { "epoch": 0.09805829920501084, "grad_norm": 0.13672591745853424, "learning_rate": 0.0001939664056059357, "loss": 1.1057, "step": 1272 }, { "epoch": 0.09821247892074199, "grad_norm": 0.1522635966539383, "learning_rate": 0.00019395610057708161, "loss": 1.14, "step": 1274 }, { "epoch": 0.09836665863647313, "grad_norm": 0.13887491822242737, "learning_rate": 0.00019394579554822753, "loss": 1.069, "step": 1276 }, { "epoch": 0.09852083835220429, "grad_norm": 0.13854965567588806, "learning_rate": 0.00019393549051937344, "loss": 1.0704, "step": 1278 }, { "epoch": 0.09867501806793544, "grad_norm": 0.12839765846729279, "learning_rate": 0.00019392518549051939, "loss": 1.0512, "step": 1280 }, { "epoch": 0.09882919778366658, "grad_norm": 0.1270405352115631, "learning_rate": 0.0001939148804616653, "loss": 1.0251, "step": 1282 }, { "epoch": 0.09898337749939773, "grad_norm": 0.1269143521785736, "learning_rate": 0.00019390457543281122, "loss": 1.0433, "step": 1284 }, { "epoch": 0.09913755721512889, "grad_norm": 0.14292192459106445, "learning_rate": 0.00019389427040395713, "loss": 1.1507, "step": 1286 }, { "epoch": 0.09929173693086003, "grad_norm": 0.12512263655662537, "learning_rate": 0.00019388396537510305, "loss": 1.0918, "step": 1288 }, { "epoch": 0.09944591664659118, "grad_norm": 0.11927679181098938, "learning_rate": 0.000193873660346249, "loss": 1.0924, "step": 1290 }, { "epoch": 0.09960009636232234, "grad_norm": 0.13639990985393524, "learning_rate": 0.0001938633553173949, "loss": 1.1024, "step": 1292 }, { "epoch": 0.09975427607805348, "grad_norm": 0.142363503575325, "learning_rate": 0.00019385305028854082, "loss": 1.021, "step": 1294 }, { "epoch": 0.09990845579378463, "grad_norm": 0.1389359086751938, "learning_rate": 0.00019384274525968673, "loss": 1.0269, "step": 1296 }, { "epoch": 0.10006263550951577, "grad_norm": 0.15595073997974396, "learning_rate": 0.00019383244023083265, "loss": 1.0913, "step": 1298 }, { "epoch": 0.10021681522524693, "grad_norm": 0.1324295848608017, "learning_rate": 0.0001938221352019786, "loss": 1.1001, "step": 1300 }, { "epoch": 0.10021681522524693, "eval_loss": 1.0909266471862793, "eval_runtime": 185.4116, "eval_samples_per_second": 91.38, "eval_steps_per_second": 1.429, "step": 1300 }, { "epoch": 0.10037099494097808, "grad_norm": 0.139576256275177, "learning_rate": 0.0001938118301731245, "loss": 1.1147, "step": 1302 }, { "epoch": 0.10052517465670922, "grad_norm": 0.12854811549186707, "learning_rate": 0.00019380152514427042, "loss": 1.0973, "step": 1304 }, { "epoch": 0.10067935437244037, "grad_norm": 0.1245393380522728, "learning_rate": 0.00019379122011541633, "loss": 1.0485, "step": 1306 }, { "epoch": 0.10083353408817153, "grad_norm": 0.13261497020721436, "learning_rate": 0.00019378091508656225, "loss": 1.156, "step": 1308 }, { "epoch": 0.10098771380390267, "grad_norm": 0.1255144327878952, "learning_rate": 0.0001937706100577082, "loss": 1.0852, "step": 1310 }, { "epoch": 0.10114189351963382, "grad_norm": 0.1412706971168518, "learning_rate": 0.0001937603050288541, "loss": 1.0766, "step": 1312 }, { "epoch": 0.10129607323536498, "grad_norm": 0.1281047761440277, "learning_rate": 0.00019375000000000002, "loss": 1.0824, "step": 1314 }, { "epoch": 0.10145025295109612, "grad_norm": 0.13307350873947144, "learning_rate": 0.00019373969497114594, "loss": 1.0887, "step": 1316 }, { "epoch": 0.10160443266682727, "grad_norm": 0.1287691742181778, "learning_rate": 0.00019372938994229185, "loss": 1.0705, "step": 1318 }, { "epoch": 0.10175861238255841, "grad_norm": 0.1303441971540451, "learning_rate": 0.00019371908491343777, "loss": 1.1684, "step": 1320 }, { "epoch": 0.10191279209828957, "grad_norm": 0.13304616510868073, "learning_rate": 0.00019370877988458368, "loss": 1.0944, "step": 1322 }, { "epoch": 0.10206697181402072, "grad_norm": 0.13905592262744904, "learning_rate": 0.0001936984748557296, "loss": 1.0915, "step": 1324 }, { "epoch": 0.10222115152975186, "grad_norm": 0.13225632905960083, "learning_rate": 0.0001936881698268755, "loss": 1.0418, "step": 1326 }, { "epoch": 0.10237533124548302, "grad_norm": 0.1267402619123459, "learning_rate": 0.00019367786479802142, "loss": 1.0446, "step": 1328 }, { "epoch": 0.10252951096121417, "grad_norm": 0.1439935863018036, "learning_rate": 0.00019366755976916737, "loss": 1.0582, "step": 1330 }, { "epoch": 0.10268369067694531, "grad_norm": 0.1267223060131073, "learning_rate": 0.00019365725474031328, "loss": 1.0176, "step": 1332 }, { "epoch": 0.10283787039267646, "grad_norm": 0.1298942118883133, "learning_rate": 0.0001936469497114592, "loss": 1.0552, "step": 1334 }, { "epoch": 0.10299205010840762, "grad_norm": 0.13010933995246887, "learning_rate": 0.0001936366446826051, "loss": 1.0848, "step": 1336 }, { "epoch": 0.10314622982413876, "grad_norm": 0.13728559017181396, "learning_rate": 0.00019362633965375103, "loss": 1.0779, "step": 1338 }, { "epoch": 0.10330040953986991, "grad_norm": 0.13863548636436462, "learning_rate": 0.00019361603462489697, "loss": 1.0326, "step": 1340 }, { "epoch": 0.10345458925560105, "grad_norm": 0.12995532155036926, "learning_rate": 0.00019360572959604288, "loss": 1.1427, "step": 1342 }, { "epoch": 0.10360876897133221, "grad_norm": 0.13650789856910706, "learning_rate": 0.0001935954245671888, "loss": 1.0528, "step": 1344 }, { "epoch": 0.10376294868706336, "grad_norm": 0.1336941123008728, "learning_rate": 0.0001935851195383347, "loss": 1.1155, "step": 1346 }, { "epoch": 0.1039171284027945, "grad_norm": 0.13927003741264343, "learning_rate": 0.00019357481450948063, "loss": 1.0551, "step": 1348 }, { "epoch": 0.10407130811852566, "grad_norm": 0.14504994451999664, "learning_rate": 0.00019356450948062657, "loss": 1.1014, "step": 1350 }, { "epoch": 0.10422548783425681, "grad_norm": 0.15796230733394623, "learning_rate": 0.00019355420445177248, "loss": 1.2115, "step": 1352 }, { "epoch": 0.10437966754998795, "grad_norm": 0.1317984163761139, "learning_rate": 0.0001935438994229184, "loss": 1.0933, "step": 1354 }, { "epoch": 0.1045338472657191, "grad_norm": 0.13189563155174255, "learning_rate": 0.00019353359439406431, "loss": 1.0664, "step": 1356 }, { "epoch": 0.10468802698145026, "grad_norm": 0.1323234885931015, "learning_rate": 0.00019352328936521023, "loss": 1.0824, "step": 1358 }, { "epoch": 0.1048422066971814, "grad_norm": 0.13659097254276276, "learning_rate": 0.00019351298433635614, "loss": 1.0334, "step": 1360 }, { "epoch": 0.10499638641291255, "grad_norm": 0.11882172524929047, "learning_rate": 0.0001935026793075021, "loss": 1.0401, "step": 1362 }, { "epoch": 0.1051505661286437, "grad_norm": 0.13025067746639252, "learning_rate": 0.000193492374278648, "loss": 1.0838, "step": 1364 }, { "epoch": 0.10530474584437485, "grad_norm": 0.1249939501285553, "learning_rate": 0.00019348206924979392, "loss": 1.0349, "step": 1366 }, { "epoch": 0.105458925560106, "grad_norm": 0.12588031589984894, "learning_rate": 0.00019347176422093983, "loss": 1.079, "step": 1368 }, { "epoch": 0.10561310527583714, "grad_norm": 0.12548890709877014, "learning_rate": 0.00019346145919208575, "loss": 1.0062, "step": 1370 }, { "epoch": 0.1057672849915683, "grad_norm": 0.13328798115253448, "learning_rate": 0.00019345115416323166, "loss": 1.1154, "step": 1372 }, { "epoch": 0.10592146470729945, "grad_norm": 0.1443903148174286, "learning_rate": 0.00019344084913437758, "loss": 1.097, "step": 1374 }, { "epoch": 0.1060756444230306, "grad_norm": 0.12835648655891418, "learning_rate": 0.0001934305441055235, "loss": 1.0723, "step": 1376 }, { "epoch": 0.10622982413876174, "grad_norm": 0.13068312406539917, "learning_rate": 0.0001934202390766694, "loss": 1.1128, "step": 1378 }, { "epoch": 0.1063840038544929, "grad_norm": 0.13628961145877838, "learning_rate": 0.00019340993404781532, "loss": 1.1146, "step": 1380 }, { "epoch": 0.10653818357022404, "grad_norm": 0.12263484299182892, "learning_rate": 0.00019339962901896126, "loss": 1.0947, "step": 1382 }, { "epoch": 0.10669236328595519, "grad_norm": 0.12684424221515656, "learning_rate": 0.00019338932399010718, "loss": 1.059, "step": 1384 }, { "epoch": 0.10684654300168633, "grad_norm": 0.1421595960855484, "learning_rate": 0.0001933790189612531, "loss": 1.0688, "step": 1386 }, { "epoch": 0.10700072271741749, "grad_norm": 0.12416025251150131, "learning_rate": 0.000193368713932399, "loss": 1.0905, "step": 1388 }, { "epoch": 0.10715490243314864, "grad_norm": 0.1284332126379013, "learning_rate": 0.00019335840890354492, "loss": 1.0612, "step": 1390 }, { "epoch": 0.10730908214887978, "grad_norm": 0.1282491385936737, "learning_rate": 0.00019334810387469086, "loss": 1.0851, "step": 1392 }, { "epoch": 0.10746326186461094, "grad_norm": 0.13221289217472076, "learning_rate": 0.00019333779884583678, "loss": 1.0446, "step": 1394 }, { "epoch": 0.10761744158034209, "grad_norm": 0.12401736527681351, "learning_rate": 0.0001933274938169827, "loss": 1.0826, "step": 1396 }, { "epoch": 0.10777162129607323, "grad_norm": 0.14316771924495697, "learning_rate": 0.0001933171887881286, "loss": 1.1136, "step": 1398 }, { "epoch": 0.10792580101180438, "grad_norm": 0.17223364114761353, "learning_rate": 0.00019330688375927452, "loss": 1.0752, "step": 1400 }, { "epoch": 0.10792580101180438, "eval_loss": 1.0899540185928345, "eval_runtime": 185.3818, "eval_samples_per_second": 91.395, "eval_steps_per_second": 1.429, "step": 1400 }, { "epoch": 0.10807998072753554, "grad_norm": 0.15027141571044922, "learning_rate": 0.00019329657873042047, "loss": 1.0371, "step": 1402 }, { "epoch": 0.10823416044326668, "grad_norm": 0.19876505434513092, "learning_rate": 0.00019328627370156638, "loss": 1.0312, "step": 1404 }, { "epoch": 0.10838834015899783, "grad_norm": 0.1422131210565567, "learning_rate": 0.0001932759686727123, "loss": 1.0597, "step": 1406 }, { "epoch": 0.10854251987472899, "grad_norm": 0.13597753643989563, "learning_rate": 0.0001932656636438582, "loss": 1.0939, "step": 1408 }, { "epoch": 0.10869669959046013, "grad_norm": 0.16808953881263733, "learning_rate": 0.00019325535861500413, "loss": 1.1221, "step": 1410 }, { "epoch": 0.10885087930619128, "grad_norm": 0.14884881675243378, "learning_rate": 0.00019324505358615007, "loss": 1.1114, "step": 1412 }, { "epoch": 0.10900505902192242, "grad_norm": 0.12680503726005554, "learning_rate": 0.00019323474855729598, "loss": 1.1032, "step": 1414 }, { "epoch": 0.10915923873765358, "grad_norm": 0.13997766375541687, "learning_rate": 0.0001932244435284419, "loss": 1.0799, "step": 1416 }, { "epoch": 0.10931341845338473, "grad_norm": 0.1343669593334198, "learning_rate": 0.0001932141384995878, "loss": 1.0778, "step": 1418 }, { "epoch": 0.10946759816911587, "grad_norm": 0.12029851973056793, "learning_rate": 0.00019320383347073373, "loss": 1.1021, "step": 1420 }, { "epoch": 0.10962177788484702, "grad_norm": 0.1322990357875824, "learning_rate": 0.00019319352844187967, "loss": 1.1061, "step": 1422 }, { "epoch": 0.10977595760057818, "grad_norm": 0.13710594177246094, "learning_rate": 0.00019318322341302558, "loss": 1.0786, "step": 1424 }, { "epoch": 0.10993013731630932, "grad_norm": 0.11956049501895905, "learning_rate": 0.0001931729183841715, "loss": 1.0711, "step": 1426 }, { "epoch": 0.11008431703204047, "grad_norm": 0.139973446726799, "learning_rate": 0.00019316261335531741, "loss": 1.1162, "step": 1428 }, { "epoch": 0.11023849674777163, "grad_norm": 0.1525941640138626, "learning_rate": 0.00019315230832646333, "loss": 1.0572, "step": 1430 }, { "epoch": 0.11039267646350277, "grad_norm": 0.1349973976612091, "learning_rate": 0.00019314200329760924, "loss": 1.1048, "step": 1432 }, { "epoch": 0.11054685617923392, "grad_norm": 0.1305711269378662, "learning_rate": 0.00019313169826875516, "loss": 1.0841, "step": 1434 }, { "epoch": 0.11070103589496506, "grad_norm": 0.16756822168827057, "learning_rate": 0.00019312139323990107, "loss": 1.0736, "step": 1436 }, { "epoch": 0.11085521561069622, "grad_norm": 0.13367486000061035, "learning_rate": 0.000193111088211047, "loss": 1.0774, "step": 1438 }, { "epoch": 0.11100939532642737, "grad_norm": 0.12484605610370636, "learning_rate": 0.0001931007831821929, "loss": 1.1196, "step": 1440 }, { "epoch": 0.11116357504215851, "grad_norm": 0.14064739644527435, "learning_rate": 0.00019309047815333885, "loss": 1.1101, "step": 1442 }, { "epoch": 0.11131775475788966, "grad_norm": 0.1366916447877884, "learning_rate": 0.00019308017312448476, "loss": 1.111, "step": 1444 }, { "epoch": 0.11147193447362082, "grad_norm": 0.11520934104919434, "learning_rate": 0.00019306986809563068, "loss": 1.065, "step": 1446 }, { "epoch": 0.11162611418935196, "grad_norm": 0.15567731857299805, "learning_rate": 0.0001930595630667766, "loss": 1.1036, "step": 1448 }, { "epoch": 0.11178029390508311, "grad_norm": 0.13628730177879333, "learning_rate": 0.0001930492580379225, "loss": 1.0717, "step": 1450 }, { "epoch": 0.11193447362081427, "grad_norm": 0.1359964907169342, "learning_rate": 0.00019303895300906842, "loss": 1.0986, "step": 1452 }, { "epoch": 0.11208865333654541, "grad_norm": 0.16372162103652954, "learning_rate": 0.00019302864798021436, "loss": 1.0306, "step": 1454 }, { "epoch": 0.11224283305227656, "grad_norm": 0.1724134087562561, "learning_rate": 0.00019301834295136028, "loss": 1.0753, "step": 1456 }, { "epoch": 0.1123970127680077, "grad_norm": 0.13646383583545685, "learning_rate": 0.0001930080379225062, "loss": 1.0975, "step": 1458 }, { "epoch": 0.11255119248373886, "grad_norm": 0.1522134691476822, "learning_rate": 0.0001929977328936521, "loss": 1.1031, "step": 1460 }, { "epoch": 0.11270537219947001, "grad_norm": 0.13656160235404968, "learning_rate": 0.00019298742786479802, "loss": 1.0602, "step": 1462 }, { "epoch": 0.11285955191520115, "grad_norm": 0.14140130579471588, "learning_rate": 0.00019297712283594396, "loss": 1.1289, "step": 1464 }, { "epoch": 0.1130137316309323, "grad_norm": 0.1383032351732254, "learning_rate": 0.00019296681780708988, "loss": 1.0797, "step": 1466 }, { "epoch": 0.11316791134666346, "grad_norm": 0.15723556280136108, "learning_rate": 0.0001929565127782358, "loss": 1.1156, "step": 1468 }, { "epoch": 0.1133220910623946, "grad_norm": 0.13462230563163757, "learning_rate": 0.0001929462077493817, "loss": 1.0953, "step": 1470 }, { "epoch": 0.11347627077812575, "grad_norm": 0.14101319015026093, "learning_rate": 0.00019293590272052762, "loss": 1.1152, "step": 1472 }, { "epoch": 0.11363045049385691, "grad_norm": 0.13705132901668549, "learning_rate": 0.00019292559769167357, "loss": 1.0886, "step": 1474 }, { "epoch": 0.11378463020958805, "grad_norm": 0.1206672340631485, "learning_rate": 0.00019291529266281948, "loss": 1.0995, "step": 1476 }, { "epoch": 0.1139388099253192, "grad_norm": 0.13666383922100067, "learning_rate": 0.0001929049876339654, "loss": 1.058, "step": 1478 }, { "epoch": 0.11409298964105034, "grad_norm": 0.1265423446893692, "learning_rate": 0.0001928946826051113, "loss": 1.0676, "step": 1480 }, { "epoch": 0.1142471693567815, "grad_norm": 0.1528097242116928, "learning_rate": 0.00019288437757625723, "loss": 1.0675, "step": 1482 }, { "epoch": 0.11440134907251265, "grad_norm": 0.16541676223278046, "learning_rate": 0.00019287407254740314, "loss": 1.1539, "step": 1484 }, { "epoch": 0.1145555287882438, "grad_norm": 0.20383091270923615, "learning_rate": 0.00019286376751854906, "loss": 1.0472, "step": 1486 }, { "epoch": 0.11470970850397495, "grad_norm": 0.13806484639644623, "learning_rate": 0.00019285346248969497, "loss": 1.0408, "step": 1488 }, { "epoch": 0.1148638882197061, "grad_norm": 0.1251746118068695, "learning_rate": 0.00019284315746084089, "loss": 1.1207, "step": 1490 }, { "epoch": 0.11501806793543724, "grad_norm": 0.13218504190444946, "learning_rate": 0.0001928328524319868, "loss": 1.1131, "step": 1492 }, { "epoch": 0.11517224765116839, "grad_norm": 0.21616914868354797, "learning_rate": 0.00019282254740313274, "loss": 1.1103, "step": 1494 }, { "epoch": 0.11532642736689955, "grad_norm": 0.1437305361032486, "learning_rate": 0.00019281224237427866, "loss": 1.1243, "step": 1496 }, { "epoch": 0.11548060708263069, "grad_norm": 0.13094168901443481, "learning_rate": 0.00019280193734542457, "loss": 1.1012, "step": 1498 }, { "epoch": 0.11563478679836184, "grad_norm": 0.12384334206581116, "learning_rate": 0.0001927916323165705, "loss": 1.05, "step": 1500 }, { "epoch": 0.11563478679836184, "eval_loss": 1.0905406475067139, "eval_runtime": 185.4473, "eval_samples_per_second": 91.363, "eval_steps_per_second": 1.429, "step": 1500 }, { "epoch": 0.11578896651409298, "grad_norm": 0.12807106971740723, "learning_rate": 0.0001927813272877164, "loss": 1.0754, "step": 1502 }, { "epoch": 0.11594314622982414, "grad_norm": 0.12517131865024567, "learning_rate": 0.00019277102225886234, "loss": 1.1017, "step": 1504 }, { "epoch": 0.11609732594555529, "grad_norm": 0.1704496592283249, "learning_rate": 0.00019276071723000826, "loss": 1.098, "step": 1506 }, { "epoch": 0.11625150566128643, "grad_norm": 0.12152231484651566, "learning_rate": 0.00019275041220115417, "loss": 1.0738, "step": 1508 }, { "epoch": 0.11640568537701759, "grad_norm": 0.12952156364917755, "learning_rate": 0.0001927401071723001, "loss": 1.0479, "step": 1510 }, { "epoch": 0.11655986509274874, "grad_norm": 0.1499640941619873, "learning_rate": 0.000192729802143446, "loss": 1.1046, "step": 1512 }, { "epoch": 0.11671404480847988, "grad_norm": 0.1331593543291092, "learning_rate": 0.00019271949711459195, "loss": 1.1219, "step": 1514 }, { "epoch": 0.11686822452421103, "grad_norm": 0.1368558406829834, "learning_rate": 0.00019270919208573786, "loss": 1.1357, "step": 1516 }, { "epoch": 0.11702240423994219, "grad_norm": 0.12278290838003159, "learning_rate": 0.00019269888705688378, "loss": 1.1079, "step": 1518 }, { "epoch": 0.11717658395567333, "grad_norm": 0.11737775802612305, "learning_rate": 0.0001926885820280297, "loss": 1.1224, "step": 1520 }, { "epoch": 0.11733076367140448, "grad_norm": 0.13017341494560242, "learning_rate": 0.0001926782769991756, "loss": 1.0648, "step": 1522 }, { "epoch": 0.11748494338713562, "grad_norm": 0.11939583718776703, "learning_rate": 0.00019266797197032155, "loss": 1.0899, "step": 1524 }, { "epoch": 0.11763912310286678, "grad_norm": 0.12446755915880203, "learning_rate": 0.00019265766694146746, "loss": 1.0626, "step": 1526 }, { "epoch": 0.11779330281859793, "grad_norm": 0.13369430601596832, "learning_rate": 0.00019264736191261338, "loss": 1.0526, "step": 1528 }, { "epoch": 0.11794748253432907, "grad_norm": 0.13470736145973206, "learning_rate": 0.0001926370568837593, "loss": 1.0946, "step": 1530 }, { "epoch": 0.11810166225006023, "grad_norm": 0.14193174242973328, "learning_rate": 0.0001926267518549052, "loss": 1.1089, "step": 1532 }, { "epoch": 0.11825584196579138, "grad_norm": 0.14893026649951935, "learning_rate": 0.00019261644682605112, "loss": 1.0606, "step": 1534 }, { "epoch": 0.11841002168152252, "grad_norm": 0.20594976842403412, "learning_rate": 0.00019260614179719704, "loss": 1.0375, "step": 1536 }, { "epoch": 0.11856420139725367, "grad_norm": 0.15287873148918152, "learning_rate": 0.00019259583676834295, "loss": 1.1414, "step": 1538 }, { "epoch": 0.11871838111298483, "grad_norm": 0.1275177299976349, "learning_rate": 0.00019258553173948887, "loss": 1.1084, "step": 1540 }, { "epoch": 0.11887256082871597, "grad_norm": 0.20036157965660095, "learning_rate": 0.00019257522671063478, "loss": 1.1261, "step": 1542 }, { "epoch": 0.11902674054444712, "grad_norm": 0.14492087066173553, "learning_rate": 0.0001925649216817807, "loss": 1.1137, "step": 1544 }, { "epoch": 0.11918092026017826, "grad_norm": 0.1259312629699707, "learning_rate": 0.00019255461665292664, "loss": 1.0409, "step": 1546 }, { "epoch": 0.11933509997590942, "grad_norm": 0.1296795755624771, "learning_rate": 0.00019254431162407255, "loss": 1.0332, "step": 1548 }, { "epoch": 0.11948927969164057, "grad_norm": 0.13372276723384857, "learning_rate": 0.00019253400659521847, "loss": 1.1087, "step": 1550 }, { "epoch": 0.11964345940737171, "grad_norm": 0.14354725182056427, "learning_rate": 0.00019252370156636438, "loss": 1.0398, "step": 1552 }, { "epoch": 0.11979763912310287, "grad_norm": 0.1378318965435028, "learning_rate": 0.0001925133965375103, "loss": 1.0542, "step": 1554 }, { "epoch": 0.11995181883883402, "grad_norm": 0.12171255797147751, "learning_rate": 0.00019250309150865624, "loss": 1.0935, "step": 1556 }, { "epoch": 0.12010599855456516, "grad_norm": 0.11905664205551147, "learning_rate": 0.00019249278647980215, "loss": 1.0097, "step": 1558 }, { "epoch": 0.12026017827029631, "grad_norm": 0.12854760885238647, "learning_rate": 0.00019248248145094807, "loss": 1.1517, "step": 1560 }, { "epoch": 0.12041435798602747, "grad_norm": 0.247908353805542, "learning_rate": 0.00019247217642209398, "loss": 1.0876, "step": 1562 }, { "epoch": 0.12056853770175861, "grad_norm": 0.1441553235054016, "learning_rate": 0.0001924618713932399, "loss": 1.1414, "step": 1564 }, { "epoch": 0.12072271741748976, "grad_norm": 0.13307887315750122, "learning_rate": 0.00019245156636438584, "loss": 1.1012, "step": 1566 }, { "epoch": 0.12087689713322092, "grad_norm": 0.14192406833171844, "learning_rate": 0.00019244126133553176, "loss": 1.1418, "step": 1568 }, { "epoch": 0.12103107684895206, "grad_norm": 0.11530864983797073, "learning_rate": 0.00019243095630667767, "loss": 1.0776, "step": 1570 }, { "epoch": 0.12118525656468321, "grad_norm": 0.13385196030139923, "learning_rate": 0.00019242065127782359, "loss": 1.1311, "step": 1572 }, { "epoch": 0.12133943628041435, "grad_norm": 0.1308089643716812, "learning_rate": 0.0001924103462489695, "loss": 1.0625, "step": 1574 }, { "epoch": 0.12149361599614551, "grad_norm": 0.11851842701435089, "learning_rate": 0.00019240004122011544, "loss": 1.0182, "step": 1576 }, { "epoch": 0.12164779571187666, "grad_norm": 0.2496737688779831, "learning_rate": 0.00019238973619126136, "loss": 1.0746, "step": 1578 }, { "epoch": 0.1218019754276078, "grad_norm": 0.12962055206298828, "learning_rate": 0.00019237943116240727, "loss": 1.0245, "step": 1580 }, { "epoch": 0.12195615514333895, "grad_norm": 0.13170978426933289, "learning_rate": 0.0001923691261335532, "loss": 0.9897, "step": 1582 }, { "epoch": 0.12211033485907011, "grad_norm": 0.13226309418678284, "learning_rate": 0.0001923588211046991, "loss": 1.1035, "step": 1584 }, { "epoch": 0.12226451457480125, "grad_norm": 0.11901077628135681, "learning_rate": 0.00019234851607584502, "loss": 1.0084, "step": 1586 }, { "epoch": 0.1224186942905324, "grad_norm": 0.15274369716644287, "learning_rate": 0.00019233821104699093, "loss": 1.1436, "step": 1588 }, { "epoch": 0.12257287400626356, "grad_norm": 0.11832466721534729, "learning_rate": 0.00019232790601813685, "loss": 1.0179, "step": 1590 }, { "epoch": 0.1227270537219947, "grad_norm": 0.13038666546344757, "learning_rate": 0.00019231760098928276, "loss": 1.0779, "step": 1592 }, { "epoch": 0.12288123343772585, "grad_norm": 0.12837626039981842, "learning_rate": 0.00019230729596042868, "loss": 1.1404, "step": 1594 }, { "epoch": 0.123035413153457, "grad_norm": 0.1400509923696518, "learning_rate": 0.00019229699093157462, "loss": 1.1132, "step": 1596 }, { "epoch": 0.12318959286918815, "grad_norm": 0.13757595419883728, "learning_rate": 0.00019228668590272053, "loss": 1.0816, "step": 1598 }, { "epoch": 0.1233437725849193, "grad_norm": 0.12403321266174316, "learning_rate": 0.00019227638087386645, "loss": 1.039, "step": 1600 }, { "epoch": 0.1233437725849193, "eval_loss": 1.0888522863388062, "eval_runtime": 185.2371, "eval_samples_per_second": 91.467, "eval_steps_per_second": 1.431, "step": 1600 }, { "epoch": 0.12349795230065044, "grad_norm": 0.12380605190992355, "learning_rate": 0.00019226607584501236, "loss": 1.0903, "step": 1602 }, { "epoch": 0.12365213201638159, "grad_norm": 0.13564443588256836, "learning_rate": 0.00019225577081615828, "loss": 1.0768, "step": 1604 }, { "epoch": 0.12380631173211275, "grad_norm": 0.1533685177564621, "learning_rate": 0.00019224546578730422, "loss": 1.0852, "step": 1606 }, { "epoch": 0.12396049144784389, "grad_norm": 0.1163390502333641, "learning_rate": 0.00019223516075845014, "loss": 1.0574, "step": 1608 }, { "epoch": 0.12411467116357504, "grad_norm": 0.13867324590682983, "learning_rate": 0.00019222485572959605, "loss": 1.0992, "step": 1610 }, { "epoch": 0.1242688508793062, "grad_norm": 0.12759087979793549, "learning_rate": 0.00019221455070074197, "loss": 1.0738, "step": 1612 }, { "epoch": 0.12442303059503734, "grad_norm": 0.1237189844250679, "learning_rate": 0.00019220424567188788, "loss": 1.0974, "step": 1614 }, { "epoch": 0.12457721031076849, "grad_norm": 0.13331052660942078, "learning_rate": 0.00019219394064303382, "loss": 1.0917, "step": 1616 }, { "epoch": 0.12473139002649963, "grad_norm": 0.1290212869644165, "learning_rate": 0.00019218363561417974, "loss": 1.0696, "step": 1618 }, { "epoch": 0.12488556974223079, "grad_norm": 0.13309410214424133, "learning_rate": 0.00019217333058532565, "loss": 1.043, "step": 1620 }, { "epoch": 0.12503974945796192, "grad_norm": 0.13453248143196106, "learning_rate": 0.00019216302555647157, "loss": 1.0435, "step": 1622 }, { "epoch": 0.1251939291736931, "grad_norm": 0.11639372259378433, "learning_rate": 0.00019215272052761748, "loss": 1.0579, "step": 1624 }, { "epoch": 0.12534810888942424, "grad_norm": 0.13231517374515533, "learning_rate": 0.0001921424154987634, "loss": 1.1268, "step": 1626 }, { "epoch": 0.1255022886051554, "grad_norm": 0.1349351406097412, "learning_rate": 0.00019213211046990934, "loss": 1.1599, "step": 1628 }, { "epoch": 0.12565646832088653, "grad_norm": 0.13710346817970276, "learning_rate": 0.00019212180544105525, "loss": 1.0866, "step": 1630 }, { "epoch": 0.12581064803661768, "grad_norm": 0.14535072445869446, "learning_rate": 0.00019211150041220117, "loss": 1.0445, "step": 1632 }, { "epoch": 0.12596482775234882, "grad_norm": 0.11799806356430054, "learning_rate": 0.00019210119538334708, "loss": 1.0525, "step": 1634 }, { "epoch": 0.12611900746807997, "grad_norm": 0.13399624824523926, "learning_rate": 0.000192090890354493, "loss": 1.0246, "step": 1636 }, { "epoch": 0.12627318718381114, "grad_norm": 0.14404788613319397, "learning_rate": 0.00019208058532563894, "loss": 1.0582, "step": 1638 }, { "epoch": 0.1264273668995423, "grad_norm": 0.14395713806152344, "learning_rate": 0.00019207028029678486, "loss": 1.0686, "step": 1640 }, { "epoch": 0.12658154661527343, "grad_norm": 0.13249294459819794, "learning_rate": 0.00019205997526793077, "loss": 1.1286, "step": 1642 }, { "epoch": 0.12673572633100458, "grad_norm": 0.12791812419891357, "learning_rate": 0.00019204967023907669, "loss": 1.062, "step": 1644 }, { "epoch": 0.12688990604673572, "grad_norm": 0.12210959941148758, "learning_rate": 0.0001920393652102226, "loss": 1.0419, "step": 1646 }, { "epoch": 0.12704408576246687, "grad_norm": 0.13438813388347626, "learning_rate": 0.00019202906018136852, "loss": 1.0589, "step": 1648 }, { "epoch": 0.127198265478198, "grad_norm": 0.12953762710094452, "learning_rate": 0.00019201875515251443, "loss": 1.0128, "step": 1650 }, { "epoch": 0.1273524451939292, "grad_norm": 0.1318603903055191, "learning_rate": 0.00019200845012366035, "loss": 1.073, "step": 1652 }, { "epoch": 0.12750662490966033, "grad_norm": 0.12956051528453827, "learning_rate": 0.00019199814509480626, "loss": 1.0489, "step": 1654 }, { "epoch": 0.12766080462539148, "grad_norm": 0.13501368463039398, "learning_rate": 0.00019198784006595218, "loss": 1.0198, "step": 1656 }, { "epoch": 0.12781498434112262, "grad_norm": 0.13902342319488525, "learning_rate": 0.00019197753503709812, "loss": 1.0512, "step": 1658 }, { "epoch": 0.12796916405685377, "grad_norm": 0.15590503811836243, "learning_rate": 0.00019196723000824403, "loss": 1.1782, "step": 1660 }, { "epoch": 0.1281233437725849, "grad_norm": 0.13954932987689972, "learning_rate": 0.00019195692497938995, "loss": 1.0421, "step": 1662 }, { "epoch": 0.12827752348831606, "grad_norm": 0.11550859361886978, "learning_rate": 0.00019194661995053586, "loss": 1.086, "step": 1664 }, { "epoch": 0.1284317032040472, "grad_norm": 0.12175869196653366, "learning_rate": 0.00019193631492168178, "loss": 1.0704, "step": 1666 }, { "epoch": 0.12858588291977838, "grad_norm": 0.13503512740135193, "learning_rate": 0.00019192600989282772, "loss": 1.1166, "step": 1668 }, { "epoch": 0.12874006263550952, "grad_norm": 0.12849009037017822, "learning_rate": 0.00019191570486397363, "loss": 1.0315, "step": 1670 }, { "epoch": 0.12889424235124067, "grad_norm": 0.12484319508075714, "learning_rate": 0.00019190539983511955, "loss": 1.0737, "step": 1672 }, { "epoch": 0.1290484220669718, "grad_norm": 0.1364014446735382, "learning_rate": 0.00019189509480626546, "loss": 1.0619, "step": 1674 }, { "epoch": 0.12920260178270296, "grad_norm": 0.12930172681808472, "learning_rate": 0.00019188478977741138, "loss": 1.046, "step": 1676 }, { "epoch": 0.1293567814984341, "grad_norm": 0.13860805332660675, "learning_rate": 0.00019187448474855732, "loss": 1.0832, "step": 1678 }, { "epoch": 0.12951096121416525, "grad_norm": 0.1379111111164093, "learning_rate": 0.00019186417971970324, "loss": 1.1406, "step": 1680 }, { "epoch": 0.12966514092989642, "grad_norm": 0.1349123865365982, "learning_rate": 0.00019185387469084915, "loss": 1.1055, "step": 1682 }, { "epoch": 0.12981932064562757, "grad_norm": 0.13304142653942108, "learning_rate": 0.00019184356966199507, "loss": 1.0392, "step": 1684 }, { "epoch": 0.1299735003613587, "grad_norm": 0.12159105390310287, "learning_rate": 0.00019183326463314098, "loss": 1.0548, "step": 1686 }, { "epoch": 0.13012768007708986, "grad_norm": 0.12661418318748474, "learning_rate": 0.00019182295960428692, "loss": 1.0588, "step": 1688 }, { "epoch": 0.130281859792821, "grad_norm": 0.13691510260105133, "learning_rate": 0.00019181265457543284, "loss": 1.0854, "step": 1690 }, { "epoch": 0.13043603950855215, "grad_norm": 0.1401318609714508, "learning_rate": 0.00019180234954657875, "loss": 1.0864, "step": 1692 }, { "epoch": 0.1305902192242833, "grad_norm": 0.1355384737253189, "learning_rate": 0.00019179204451772467, "loss": 1.058, "step": 1694 }, { "epoch": 0.13074439894001447, "grad_norm": 0.13987474143505096, "learning_rate": 0.00019178173948887058, "loss": 1.06, "step": 1696 }, { "epoch": 0.1308985786557456, "grad_norm": 0.14350661635398865, "learning_rate": 0.0001917714344600165, "loss": 1.0731, "step": 1698 }, { "epoch": 0.13105275837147676, "grad_norm": 0.12443742901086807, "learning_rate": 0.0001917611294311624, "loss": 1.0987, "step": 1700 }, { "epoch": 0.13105275837147676, "eval_loss": 1.0880467891693115, "eval_runtime": 185.5457, "eval_samples_per_second": 91.314, "eval_steps_per_second": 1.428, "step": 1700 }, { "epoch": 0.1312069380872079, "grad_norm": 0.10956554859876633, "learning_rate": 0.00019175082440230833, "loss": 1.0393, "step": 1702 }, { "epoch": 0.13136111780293905, "grad_norm": 0.11846137791872025, "learning_rate": 0.00019174051937345424, "loss": 1.0998, "step": 1704 }, { "epoch": 0.1315152975186702, "grad_norm": 0.11894328892230988, "learning_rate": 0.00019173021434460016, "loss": 1.1007, "step": 1706 }, { "epoch": 0.13166947723440134, "grad_norm": 0.11090514808893204, "learning_rate": 0.00019171990931574607, "loss": 1.0343, "step": 1708 }, { "epoch": 0.1318236569501325, "grad_norm": 0.1276719868183136, "learning_rate": 0.000191709604286892, "loss": 1.0392, "step": 1710 }, { "epoch": 0.13197783666586366, "grad_norm": 0.12342885881662369, "learning_rate": 0.00019169929925803793, "loss": 1.063, "step": 1712 }, { "epoch": 0.1321320163815948, "grad_norm": 0.1237882748246193, "learning_rate": 0.00019168899422918384, "loss": 1.0558, "step": 1714 }, { "epoch": 0.13228619609732595, "grad_norm": 0.12958785891532898, "learning_rate": 0.00019167868920032976, "loss": 1.0493, "step": 1716 }, { "epoch": 0.1324403758130571, "grad_norm": 0.1181110367178917, "learning_rate": 0.00019166838417147567, "loss": 1.0668, "step": 1718 }, { "epoch": 0.13259455552878824, "grad_norm": 0.12053950875997543, "learning_rate": 0.00019165807914262162, "loss": 1.0392, "step": 1720 }, { "epoch": 0.13274873524451938, "grad_norm": 0.11725175380706787, "learning_rate": 0.00019164777411376753, "loss": 1.0188, "step": 1722 }, { "epoch": 0.13290291496025053, "grad_norm": 0.12475614994764328, "learning_rate": 0.00019163746908491344, "loss": 1.0134, "step": 1724 }, { "epoch": 0.1330570946759817, "grad_norm": 0.1231207475066185, "learning_rate": 0.00019162716405605936, "loss": 1.0309, "step": 1726 }, { "epoch": 0.13321127439171285, "grad_norm": 0.1269765943288803, "learning_rate": 0.00019161685902720527, "loss": 1.0918, "step": 1728 }, { "epoch": 0.133365454107444, "grad_norm": 0.12103556841611862, "learning_rate": 0.00019160655399835122, "loss": 1.0453, "step": 1730 }, { "epoch": 0.13351963382317514, "grad_norm": 0.12427771091461182, "learning_rate": 0.00019159624896949713, "loss": 1.1544, "step": 1732 }, { "epoch": 0.13367381353890628, "grad_norm": 0.13416282832622528, "learning_rate": 0.00019158594394064305, "loss": 1.0941, "step": 1734 }, { "epoch": 0.13382799325463743, "grad_norm": 0.13207705318927765, "learning_rate": 0.00019157563891178896, "loss": 1.0998, "step": 1736 }, { "epoch": 0.13398217297036857, "grad_norm": 0.1436687856912613, "learning_rate": 0.00019156533388293488, "loss": 1.0723, "step": 1738 }, { "epoch": 0.13413635268609975, "grad_norm": 0.1206304207444191, "learning_rate": 0.00019155502885408082, "loss": 1.0279, "step": 1740 }, { "epoch": 0.1342905324018309, "grad_norm": 0.12685900926589966, "learning_rate": 0.00019154472382522673, "loss": 1.0683, "step": 1742 }, { "epoch": 0.13444471211756204, "grad_norm": 0.12833228707313538, "learning_rate": 0.00019153441879637265, "loss": 1.0904, "step": 1744 }, { "epoch": 0.13459889183329318, "grad_norm": 0.12999312579631805, "learning_rate": 0.00019152411376751856, "loss": 1.0492, "step": 1746 }, { "epoch": 0.13475307154902433, "grad_norm": 0.13486912846565247, "learning_rate": 0.00019151380873866448, "loss": 1.101, "step": 1748 }, { "epoch": 0.13490725126475547, "grad_norm": 0.12793023884296417, "learning_rate": 0.0001915035037098104, "loss": 1.1135, "step": 1750 }, { "epoch": 0.13506143098048662, "grad_norm": 0.12652675807476044, "learning_rate": 0.0001914931986809563, "loss": 1.0902, "step": 1752 }, { "epoch": 0.1352156106962178, "grad_norm": 0.12431836873292923, "learning_rate": 0.00019148289365210222, "loss": 1.0922, "step": 1754 }, { "epoch": 0.13536979041194894, "grad_norm": 0.13665209710597992, "learning_rate": 0.00019147258862324814, "loss": 1.0584, "step": 1756 }, { "epoch": 0.13552397012768008, "grad_norm": 0.1355196088552475, "learning_rate": 0.00019146228359439405, "loss": 1.1199, "step": 1758 }, { "epoch": 0.13567814984341123, "grad_norm": 0.14115893840789795, "learning_rate": 0.00019145197856554, "loss": 1.0697, "step": 1760 }, { "epoch": 0.13583232955914237, "grad_norm": 0.13009534776210785, "learning_rate": 0.0001914416735366859, "loss": 1.1111, "step": 1762 }, { "epoch": 0.13598650927487352, "grad_norm": 0.12280994653701782, "learning_rate": 0.00019143136850783182, "loss": 1.0341, "step": 1764 }, { "epoch": 0.13614068899060466, "grad_norm": 0.15171582996845245, "learning_rate": 0.00019142106347897774, "loss": 1.1275, "step": 1766 }, { "epoch": 0.1362948687063358, "grad_norm": 0.15258526802062988, "learning_rate": 0.00019141075845012365, "loss": 1.0513, "step": 1768 }, { "epoch": 0.13644904842206698, "grad_norm": 0.132346972823143, "learning_rate": 0.0001914004534212696, "loss": 1.0878, "step": 1770 }, { "epoch": 0.13660322813779813, "grad_norm": 0.13237041234970093, "learning_rate": 0.0001913901483924155, "loss": 1.0845, "step": 1772 }, { "epoch": 0.13675740785352927, "grad_norm": 0.13837209343910217, "learning_rate": 0.00019137984336356143, "loss": 1.1221, "step": 1774 }, { "epoch": 0.13691158756926042, "grad_norm": 0.17590375244617462, "learning_rate": 0.00019136953833470734, "loss": 1.1963, "step": 1776 }, { "epoch": 0.13706576728499156, "grad_norm": 0.12898488342761993, "learning_rate": 0.00019135923330585326, "loss": 1.1306, "step": 1778 }, { "epoch": 0.1372199470007227, "grad_norm": 0.12428785115480423, "learning_rate": 0.0001913489282769992, "loss": 1.068, "step": 1780 }, { "epoch": 0.13737412671645385, "grad_norm": 0.12678809463977814, "learning_rate": 0.0001913386232481451, "loss": 1.0709, "step": 1782 }, { "epoch": 0.13752830643218503, "grad_norm": 0.1344168782234192, "learning_rate": 0.00019132831821929103, "loss": 1.1073, "step": 1784 }, { "epoch": 0.13768248614791617, "grad_norm": 0.14730733633041382, "learning_rate": 0.00019131801319043694, "loss": 1.0073, "step": 1786 }, { "epoch": 0.13783666586364732, "grad_norm": 0.13661792874336243, "learning_rate": 0.00019130770816158286, "loss": 1.0637, "step": 1788 }, { "epoch": 0.13799084557937846, "grad_norm": 0.1342434138059616, "learning_rate": 0.0001912974031327288, "loss": 1.1069, "step": 1790 }, { "epoch": 0.1381450252951096, "grad_norm": 0.11941581219434738, "learning_rate": 0.00019128709810387471, "loss": 1.1023, "step": 1792 }, { "epoch": 0.13829920501084075, "grad_norm": 0.13641759753227234, "learning_rate": 0.00019127679307502063, "loss": 1.0564, "step": 1794 }, { "epoch": 0.1384533847265719, "grad_norm": 0.11148608475923538, "learning_rate": 0.00019126648804616654, "loss": 1.0255, "step": 1796 }, { "epoch": 0.13860756444230307, "grad_norm": 0.1387186199426651, "learning_rate": 0.00019125618301731246, "loss": 1.0663, "step": 1798 }, { "epoch": 0.13876174415803422, "grad_norm": 0.12380651384592056, "learning_rate": 0.00019124587798845837, "loss": 1.1222, "step": 1800 }, { "epoch": 0.13876174415803422, "eval_loss": 1.0875153541564941, "eval_runtime": 185.4605, "eval_samples_per_second": 91.356, "eval_steps_per_second": 1.429, "step": 1800 }, { "epoch": 0.13891592387376536, "grad_norm": 0.13224369287490845, "learning_rate": 0.00019123557295960432, "loss": 1.0821, "step": 1802 }, { "epoch": 0.1390701035894965, "grad_norm": 0.13096244633197784, "learning_rate": 0.00019122526793075023, "loss": 1.0097, "step": 1804 }, { "epoch": 0.13922428330522765, "grad_norm": 0.11652527749538422, "learning_rate": 0.00019121496290189615, "loss": 1.0517, "step": 1806 }, { "epoch": 0.1393784630209588, "grad_norm": 0.13449358940124512, "learning_rate": 0.00019120465787304206, "loss": 1.0915, "step": 1808 }, { "epoch": 0.13953264273668994, "grad_norm": 0.11550068855285645, "learning_rate": 0.00019119435284418798, "loss": 1.0568, "step": 1810 }, { "epoch": 0.13968682245242112, "grad_norm": 0.13804587721824646, "learning_rate": 0.0001911840478153339, "loss": 1.0933, "step": 1812 }, { "epoch": 0.13984100216815226, "grad_norm": 0.12062159180641174, "learning_rate": 0.0001911737427864798, "loss": 1.0517, "step": 1814 }, { "epoch": 0.1399951818838834, "grad_norm": 0.12154779583215714, "learning_rate": 0.00019116343775762572, "loss": 1.0955, "step": 1816 }, { "epoch": 0.14014936159961455, "grad_norm": 0.11615799367427826, "learning_rate": 0.00019115313272877164, "loss": 0.968, "step": 1818 }, { "epoch": 0.1403035413153457, "grad_norm": 0.1207037940621376, "learning_rate": 0.00019114282769991755, "loss": 1.0896, "step": 1820 }, { "epoch": 0.14045772103107684, "grad_norm": 0.12750887870788574, "learning_rate": 0.0001911325226710635, "loss": 1.065, "step": 1822 }, { "epoch": 0.140611900746808, "grad_norm": 0.16391952335834503, "learning_rate": 0.0001911222176422094, "loss": 1.0232, "step": 1824 }, { "epoch": 0.14076608046253913, "grad_norm": 0.14626921713352203, "learning_rate": 0.00019111191261335532, "loss": 1.0375, "step": 1826 }, { "epoch": 0.1409202601782703, "grad_norm": 0.12393996119499207, "learning_rate": 0.00019110160758450124, "loss": 1.0345, "step": 1828 }, { "epoch": 0.14107443989400145, "grad_norm": 0.13275925815105438, "learning_rate": 0.00019109130255564715, "loss": 1.071, "step": 1830 }, { "epoch": 0.1412286196097326, "grad_norm": 0.1255485862493515, "learning_rate": 0.0001910809975267931, "loss": 1.1026, "step": 1832 }, { "epoch": 0.14138279932546374, "grad_norm": 0.13399668037891388, "learning_rate": 0.000191070692497939, "loss": 1.11, "step": 1834 }, { "epoch": 0.1415369790411949, "grad_norm": 0.13084925711154938, "learning_rate": 0.00019106038746908492, "loss": 1.0528, "step": 1836 }, { "epoch": 0.14169115875692603, "grad_norm": 0.15695689618587494, "learning_rate": 0.00019105008244023084, "loss": 1.1336, "step": 1838 }, { "epoch": 0.14184533847265718, "grad_norm": 0.13630808889865875, "learning_rate": 0.00019103977741137675, "loss": 1.0767, "step": 1840 }, { "epoch": 0.14199951818838835, "grad_norm": 0.11874844878911972, "learning_rate": 0.0001910294723825227, "loss": 1.0511, "step": 1842 }, { "epoch": 0.1421536979041195, "grad_norm": 0.11898507922887802, "learning_rate": 0.0001910191673536686, "loss": 1.0866, "step": 1844 }, { "epoch": 0.14230787761985064, "grad_norm": 0.1393211930990219, "learning_rate": 0.00019100886232481453, "loss": 1.0553, "step": 1846 }, { "epoch": 0.1424620573355818, "grad_norm": 0.1382310539484024, "learning_rate": 0.00019099855729596044, "loss": 1.07, "step": 1848 }, { "epoch": 0.14261623705131293, "grad_norm": 0.1471824198961258, "learning_rate": 0.00019098825226710636, "loss": 1.0893, "step": 1850 }, { "epoch": 0.14277041676704408, "grad_norm": 0.12706084549427032, "learning_rate": 0.0001909779472382523, "loss": 1.0848, "step": 1852 }, { "epoch": 0.14292459648277522, "grad_norm": 0.1324569135904312, "learning_rate": 0.0001909676422093982, "loss": 1.024, "step": 1854 }, { "epoch": 0.1430787761985064, "grad_norm": 0.11245544254779816, "learning_rate": 0.00019095733718054413, "loss": 1.0802, "step": 1856 }, { "epoch": 0.14323295591423754, "grad_norm": 0.15419217944145203, "learning_rate": 0.00019094703215169004, "loss": 1.1101, "step": 1858 }, { "epoch": 0.1433871356299687, "grad_norm": 0.1071443036198616, "learning_rate": 0.00019093672712283596, "loss": 1.0576, "step": 1860 }, { "epoch": 0.14354131534569983, "grad_norm": 0.1341090053319931, "learning_rate": 0.00019092642209398187, "loss": 1.0606, "step": 1862 }, { "epoch": 0.14369549506143098, "grad_norm": 0.11848092079162598, "learning_rate": 0.0001909161170651278, "loss": 1.0714, "step": 1864 }, { "epoch": 0.14384967477716212, "grad_norm": 0.12697815895080566, "learning_rate": 0.0001909058120362737, "loss": 1.092, "step": 1866 }, { "epoch": 0.14400385449289327, "grad_norm": 0.11891257762908936, "learning_rate": 0.00019089550700741962, "loss": 0.9649, "step": 1868 }, { "epoch": 0.14415803420862444, "grad_norm": 0.12616439163684845, "learning_rate": 0.00019088520197856553, "loss": 1.0962, "step": 1870 }, { "epoch": 0.1443122139243556, "grad_norm": 0.12141067534685135, "learning_rate": 0.00019087489694971147, "loss": 1.0838, "step": 1872 }, { "epoch": 0.14446639364008673, "grad_norm": 0.13279564678668976, "learning_rate": 0.0001908645919208574, "loss": 1.0484, "step": 1874 }, { "epoch": 0.14462057335581788, "grad_norm": 0.15748505294322968, "learning_rate": 0.0001908542868920033, "loss": 1.1433, "step": 1876 }, { "epoch": 0.14477475307154902, "grad_norm": 0.11593475937843323, "learning_rate": 0.00019084398186314922, "loss": 1.1483, "step": 1878 }, { "epoch": 0.14492893278728017, "grad_norm": 0.14499489963054657, "learning_rate": 0.00019083367683429513, "loss": 1.0782, "step": 1880 }, { "epoch": 0.1450831125030113, "grad_norm": 0.13570410013198853, "learning_rate": 0.00019082337180544105, "loss": 1.0989, "step": 1882 }, { "epoch": 0.14523729221874246, "grad_norm": 0.12810774147510529, "learning_rate": 0.000190813066776587, "loss": 1.0374, "step": 1884 }, { "epoch": 0.14539147193447363, "grad_norm": 0.11781581491231918, "learning_rate": 0.0001908027617477329, "loss": 1.0796, "step": 1886 }, { "epoch": 0.14554565165020478, "grad_norm": 0.12243229150772095, "learning_rate": 0.00019079245671887882, "loss": 1.0477, "step": 1888 }, { "epoch": 0.14569983136593592, "grad_norm": 0.1385030299425125, "learning_rate": 0.00019078215169002474, "loss": 1.0349, "step": 1890 }, { "epoch": 0.14585401108166707, "grad_norm": 0.12011386454105377, "learning_rate": 0.00019077184666117065, "loss": 1.0718, "step": 1892 }, { "epoch": 0.1460081907973982, "grad_norm": 0.12646062672138214, "learning_rate": 0.0001907615416323166, "loss": 1.1228, "step": 1894 }, { "epoch": 0.14616237051312936, "grad_norm": 0.1284620612859726, "learning_rate": 0.0001907512366034625, "loss": 1.079, "step": 1896 }, { "epoch": 0.1463165502288605, "grad_norm": 0.15374581515789032, "learning_rate": 0.00019074093157460842, "loss": 1.1147, "step": 1898 }, { "epoch": 0.14647072994459168, "grad_norm": 0.1325882524251938, "learning_rate": 0.00019073062654575434, "loss": 1.0404, "step": 1900 }, { "epoch": 0.14647072994459168, "eval_loss": 1.0869932174682617, "eval_runtime": 185.4754, "eval_samples_per_second": 91.349, "eval_steps_per_second": 1.429, "step": 1900 }, { "epoch": 0.14662490966032282, "grad_norm": 0.14041611552238464, "learning_rate": 0.00019072032151690025, "loss": 1.095, "step": 1902 }, { "epoch": 0.14677908937605397, "grad_norm": 0.14162160456180573, "learning_rate": 0.0001907100164880462, "loss": 1.1714, "step": 1904 }, { "epoch": 0.1469332690917851, "grad_norm": 0.12077832221984863, "learning_rate": 0.0001906997114591921, "loss": 1.1109, "step": 1906 }, { "epoch": 0.14708744880751626, "grad_norm": 0.1738968789577484, "learning_rate": 0.00019068940643033802, "loss": 1.0838, "step": 1908 }, { "epoch": 0.1472416285232474, "grad_norm": 0.13948039710521698, "learning_rate": 0.00019067910140148394, "loss": 1.0494, "step": 1910 }, { "epoch": 0.14739580823897855, "grad_norm": 0.21179239451885223, "learning_rate": 0.00019066879637262985, "loss": 1.0962, "step": 1912 }, { "epoch": 0.14754998795470972, "grad_norm": 0.12927787005901337, "learning_rate": 0.00019065849134377577, "loss": 1.1113, "step": 1914 }, { "epoch": 0.14770416767044087, "grad_norm": 0.1296701431274414, "learning_rate": 0.00019064818631492168, "loss": 1.0603, "step": 1916 }, { "epoch": 0.147858347386172, "grad_norm": 0.1282590925693512, "learning_rate": 0.0001906378812860676, "loss": 1.0594, "step": 1918 }, { "epoch": 0.14801252710190316, "grad_norm": 0.13304758071899414, "learning_rate": 0.0001906275762572135, "loss": 1.0784, "step": 1920 }, { "epoch": 0.1481667068176343, "grad_norm": 0.15661965310573578, "learning_rate": 0.00019061727122835943, "loss": 1.008, "step": 1922 }, { "epoch": 0.14832088653336545, "grad_norm": 0.12986873090267181, "learning_rate": 0.00019060696619950537, "loss": 1.0788, "step": 1924 }, { "epoch": 0.1484750662490966, "grad_norm": 0.1128251776099205, "learning_rate": 0.00019059666117065128, "loss": 1.1449, "step": 1926 }, { "epoch": 0.14862924596482774, "grad_norm": 0.13722160458564758, "learning_rate": 0.0001905863561417972, "loss": 1.0914, "step": 1928 }, { "epoch": 0.1487834256805589, "grad_norm": 0.1507786512374878, "learning_rate": 0.00019057605111294311, "loss": 1.0694, "step": 1930 }, { "epoch": 0.14893760539629006, "grad_norm": 0.1368752121925354, "learning_rate": 0.00019056574608408903, "loss": 1.0417, "step": 1932 }, { "epoch": 0.1490917851120212, "grad_norm": 0.12566259503364563, "learning_rate": 0.00019055544105523497, "loss": 1.0853, "step": 1934 }, { "epoch": 0.14924596482775235, "grad_norm": 0.12362397462129593, "learning_rate": 0.0001905451360263809, "loss": 1.1136, "step": 1936 }, { "epoch": 0.1494001445434835, "grad_norm": 0.12472514808177948, "learning_rate": 0.0001905348309975268, "loss": 1.0628, "step": 1938 }, { "epoch": 0.14955432425921464, "grad_norm": 0.1355161964893341, "learning_rate": 0.00019052452596867272, "loss": 1.1211, "step": 1940 }, { "epoch": 0.14970850397494578, "grad_norm": 0.13438721001148224, "learning_rate": 0.00019051422093981863, "loss": 1.0758, "step": 1942 }, { "epoch": 0.14986268369067696, "grad_norm": 0.11768204718828201, "learning_rate": 0.00019050391591096457, "loss": 1.0533, "step": 1944 }, { "epoch": 0.1500168634064081, "grad_norm": 0.13892577588558197, "learning_rate": 0.0001904936108821105, "loss": 1.1076, "step": 1946 }, { "epoch": 0.15017104312213925, "grad_norm": 0.1532358080148697, "learning_rate": 0.0001904833058532564, "loss": 1.0706, "step": 1948 }, { "epoch": 0.1503252228378704, "grad_norm": 0.13364464044570923, "learning_rate": 0.00019047300082440232, "loss": 1.1322, "step": 1950 }, { "epoch": 0.15047940255360154, "grad_norm": 0.12663134932518005, "learning_rate": 0.00019046269579554823, "loss": 1.0749, "step": 1952 }, { "epoch": 0.15063358226933268, "grad_norm": 0.1297607123851776, "learning_rate": 0.00019045239076669417, "loss": 1.0594, "step": 1954 }, { "epoch": 0.15078776198506383, "grad_norm": 0.11931920051574707, "learning_rate": 0.0001904420857378401, "loss": 1.0522, "step": 1956 }, { "epoch": 0.150941941700795, "grad_norm": 0.1334810107946396, "learning_rate": 0.000190431780708986, "loss": 1.0674, "step": 1958 }, { "epoch": 0.15109612141652615, "grad_norm": 0.12633340060710907, "learning_rate": 0.00019042147568013192, "loss": 1.0139, "step": 1960 }, { "epoch": 0.1512503011322573, "grad_norm": 0.12485836446285248, "learning_rate": 0.00019041117065127783, "loss": 1.0288, "step": 1962 }, { "epoch": 0.15140448084798844, "grad_norm": 0.10940799117088318, "learning_rate": 0.00019040086562242375, "loss": 1.0475, "step": 1964 }, { "epoch": 0.15155866056371958, "grad_norm": 0.12229325622320175, "learning_rate": 0.00019039056059356966, "loss": 1.0628, "step": 1966 }, { "epoch": 0.15171284027945073, "grad_norm": 0.14333505928516388, "learning_rate": 0.00019038025556471558, "loss": 1.0423, "step": 1968 }, { "epoch": 0.15186701999518187, "grad_norm": 0.12773017585277557, "learning_rate": 0.0001903699505358615, "loss": 1.1283, "step": 1970 }, { "epoch": 0.15202119971091305, "grad_norm": 0.11913473904132843, "learning_rate": 0.0001903596455070074, "loss": 1.0646, "step": 1972 }, { "epoch": 0.1521753794266442, "grad_norm": 0.13321518898010254, "learning_rate": 0.00019034934047815332, "loss": 1.0476, "step": 1974 }, { "epoch": 0.15232955914237534, "grad_norm": 0.1362799108028412, "learning_rate": 0.00019033903544929927, "loss": 1.0937, "step": 1976 }, { "epoch": 0.15248373885810648, "grad_norm": 0.13804180920124054, "learning_rate": 0.00019032873042044518, "loss": 1.113, "step": 1978 }, { "epoch": 0.15263791857383763, "grad_norm": 0.1774570494890213, "learning_rate": 0.0001903184253915911, "loss": 1.0795, "step": 1980 }, { "epoch": 0.15279209828956877, "grad_norm": 0.13106994330883026, "learning_rate": 0.000190308120362737, "loss": 1.098, "step": 1982 }, { "epoch": 0.15294627800529992, "grad_norm": 0.14435411989688873, "learning_rate": 0.00019029781533388293, "loss": 1.0814, "step": 1984 }, { "epoch": 0.15310045772103106, "grad_norm": 0.13178013265132904, "learning_rate": 0.00019028751030502887, "loss": 1.1002, "step": 1986 }, { "epoch": 0.15325463743676224, "grad_norm": 0.1283218264579773, "learning_rate": 0.00019027720527617478, "loss": 1.0749, "step": 1988 }, { "epoch": 0.15340881715249338, "grad_norm": 0.12113723158836365, "learning_rate": 0.0001902669002473207, "loss": 1.0831, "step": 1990 }, { "epoch": 0.15356299686822453, "grad_norm": 0.12649892270565033, "learning_rate": 0.0001902565952184666, "loss": 1.0166, "step": 1992 }, { "epoch": 0.15371717658395567, "grad_norm": 0.12823793292045593, "learning_rate": 0.00019024629018961253, "loss": 1.0273, "step": 1994 }, { "epoch": 0.15387135629968682, "grad_norm": 0.1291527897119522, "learning_rate": 0.00019023598516075847, "loss": 1.1092, "step": 1996 }, { "epoch": 0.15402553601541796, "grad_norm": 0.12588894367218018, "learning_rate": 0.00019022568013190438, "loss": 1.0627, "step": 1998 }, { "epoch": 0.1541797157311491, "grad_norm": 0.12996312975883484, "learning_rate": 0.0001902153751030503, "loss": 1.1196, "step": 2000 }, { "epoch": 0.1541797157311491, "eval_loss": 1.0863893032073975, "eval_runtime": 185.3254, "eval_samples_per_second": 91.423, "eval_steps_per_second": 1.43, "step": 2000 }, { "epoch": 0.15433389544688028, "grad_norm": 0.14361834526062012, "learning_rate": 0.00019020507007419621, "loss": 1.1151, "step": 2002 }, { "epoch": 0.15448807516261143, "grad_norm": 0.12650837004184723, "learning_rate": 0.00019019476504534213, "loss": 1.1155, "step": 2004 }, { "epoch": 0.15464225487834257, "grad_norm": 0.13820499181747437, "learning_rate": 0.00019018446001648807, "loss": 1.1243, "step": 2006 }, { "epoch": 0.15479643459407372, "grad_norm": 0.13205693662166595, "learning_rate": 0.00019017415498763399, "loss": 1.0626, "step": 2008 }, { "epoch": 0.15495061430980486, "grad_norm": 0.13930106163024902, "learning_rate": 0.0001901638499587799, "loss": 1.1105, "step": 2010 }, { "epoch": 0.155104794025536, "grad_norm": 0.14711922407150269, "learning_rate": 0.00019015354492992582, "loss": 1.0556, "step": 2012 }, { "epoch": 0.15525897374126715, "grad_norm": 0.11909156292676926, "learning_rate": 0.00019014323990107173, "loss": 1.1025, "step": 2014 }, { "epoch": 0.15541315345699833, "grad_norm": 0.14099714159965515, "learning_rate": 0.00019013293487221767, "loss": 1.064, "step": 2016 }, { "epoch": 0.15556733317272947, "grad_norm": 0.11500216275453568, "learning_rate": 0.0001901226298433636, "loss": 1.1196, "step": 2018 }, { "epoch": 0.15572151288846062, "grad_norm": 0.12341683357954025, "learning_rate": 0.0001901123248145095, "loss": 1.0625, "step": 2020 }, { "epoch": 0.15587569260419176, "grad_norm": 0.1390669196844101, "learning_rate": 0.00019010201978565542, "loss": 1.0526, "step": 2022 }, { "epoch": 0.1560298723199229, "grad_norm": 0.13482992351055145, "learning_rate": 0.00019009171475680133, "loss": 1.1074, "step": 2024 }, { "epoch": 0.15618405203565405, "grad_norm": 0.12277045845985413, "learning_rate": 0.00019008140972794725, "loss": 1.0648, "step": 2026 }, { "epoch": 0.1563382317513852, "grad_norm": 0.13579949736595154, "learning_rate": 0.00019007110469909316, "loss": 1.1235, "step": 2028 }, { "epoch": 0.15649241146711637, "grad_norm": 0.14128637313842773, "learning_rate": 0.00019006079967023908, "loss": 1.0442, "step": 2030 }, { "epoch": 0.15664659118284752, "grad_norm": 0.13722474873065948, "learning_rate": 0.000190050494641385, "loss": 1.1215, "step": 2032 }, { "epoch": 0.15680077089857866, "grad_norm": 0.13500674068927765, "learning_rate": 0.0001900401896125309, "loss": 1.0776, "step": 2034 }, { "epoch": 0.1569549506143098, "grad_norm": 0.11917294561862946, "learning_rate": 0.00019002988458367685, "loss": 1.0698, "step": 2036 }, { "epoch": 0.15710913033004095, "grad_norm": 0.12245581299066544, "learning_rate": 0.00019001957955482276, "loss": 1.0166, "step": 2038 }, { "epoch": 0.1572633100457721, "grad_norm": 0.12556669116020203, "learning_rate": 0.00019000927452596868, "loss": 1.0846, "step": 2040 }, { "epoch": 0.15741748976150324, "grad_norm": 0.13316373527050018, "learning_rate": 0.0001899989694971146, "loss": 1.0566, "step": 2042 }, { "epoch": 0.1575716694772344, "grad_norm": 0.1296815425157547, "learning_rate": 0.0001899886644682605, "loss": 1.0824, "step": 2044 }, { "epoch": 0.15772584919296556, "grad_norm": 0.1288246214389801, "learning_rate": 0.00018997835943940645, "loss": 1.0974, "step": 2046 }, { "epoch": 0.1578800289086967, "grad_norm": 0.1185479462146759, "learning_rate": 0.00018996805441055237, "loss": 1.1443, "step": 2048 }, { "epoch": 0.15803420862442785, "grad_norm": 0.12504369020462036, "learning_rate": 0.00018995774938169828, "loss": 1.0899, "step": 2050 }, { "epoch": 0.158188388340159, "grad_norm": 0.1266452521085739, "learning_rate": 0.0001899474443528442, "loss": 1.0654, "step": 2052 }, { "epoch": 0.15834256805589014, "grad_norm": 0.13447126746177673, "learning_rate": 0.0001899371393239901, "loss": 1.0649, "step": 2054 }, { "epoch": 0.1584967477716213, "grad_norm": 0.1446131467819214, "learning_rate": 0.00018992683429513603, "loss": 1.1439, "step": 2056 }, { "epoch": 0.15865092748735243, "grad_norm": 0.12688389420509338, "learning_rate": 0.00018991652926628197, "loss": 1.0262, "step": 2058 }, { "epoch": 0.1588051072030836, "grad_norm": 0.12581713497638702, "learning_rate": 0.00018990622423742788, "loss": 1.0723, "step": 2060 }, { "epoch": 0.15895928691881475, "grad_norm": 0.15745951235294342, "learning_rate": 0.0001898959192085738, "loss": 1.1038, "step": 2062 }, { "epoch": 0.1591134666345459, "grad_norm": 0.14457587897777557, "learning_rate": 0.0001898856141797197, "loss": 1.1072, "step": 2064 }, { "epoch": 0.15926764635027704, "grad_norm": 0.11454683542251587, "learning_rate": 0.00018987530915086563, "loss": 1.0605, "step": 2066 }, { "epoch": 0.1594218260660082, "grad_norm": 0.1137547716498375, "learning_rate": 0.00018986500412201157, "loss": 1.0405, "step": 2068 }, { "epoch": 0.15957600578173933, "grad_norm": 0.1220378428697586, "learning_rate": 0.00018985469909315748, "loss": 1.086, "step": 2070 }, { "epoch": 0.15973018549747048, "grad_norm": 0.13579098880290985, "learning_rate": 0.0001898443940643034, "loss": 1.0334, "step": 2072 }, { "epoch": 0.15988436521320165, "grad_norm": 0.1529407948255539, "learning_rate": 0.00018983408903544931, "loss": 1.0614, "step": 2074 }, { "epoch": 0.1600385449289328, "grad_norm": 0.13769444823265076, "learning_rate": 0.00018982378400659523, "loss": 1.1212, "step": 2076 }, { "epoch": 0.16019272464466394, "grad_norm": 0.12095335125923157, "learning_rate": 0.00018981347897774114, "loss": 1.047, "step": 2078 }, { "epoch": 0.1603469043603951, "grad_norm": 0.12483233958482742, "learning_rate": 0.00018980317394888706, "loss": 1.0808, "step": 2080 }, { "epoch": 0.16050108407612623, "grad_norm": 0.12451382726430893, "learning_rate": 0.00018979286892003297, "loss": 1.1259, "step": 2082 }, { "epoch": 0.16065526379185738, "grad_norm": 0.12540730834007263, "learning_rate": 0.0001897825638911789, "loss": 1.0761, "step": 2084 }, { "epoch": 0.16080944350758852, "grad_norm": 0.12948516011238098, "learning_rate": 0.0001897722588623248, "loss": 1.0621, "step": 2086 }, { "epoch": 0.16096362322331967, "grad_norm": 0.1349886953830719, "learning_rate": 0.00018976195383347075, "loss": 1.0549, "step": 2088 }, { "epoch": 0.16111780293905084, "grad_norm": 0.1249813437461853, "learning_rate": 0.00018975164880461666, "loss": 1.0828, "step": 2090 }, { "epoch": 0.161271982654782, "grad_norm": 0.1299104243516922, "learning_rate": 0.00018974134377576258, "loss": 1.097, "step": 2092 }, { "epoch": 0.16142616237051313, "grad_norm": 0.13004744052886963, "learning_rate": 0.0001897310387469085, "loss": 1.0417, "step": 2094 }, { "epoch": 0.16158034208624428, "grad_norm": 0.11553830653429031, "learning_rate": 0.0001897207337180544, "loss": 1.0563, "step": 2096 }, { "epoch": 0.16173452180197542, "grad_norm": 0.12000396102666855, "learning_rate": 0.00018971042868920035, "loss": 1.077, "step": 2098 }, { "epoch": 0.16188870151770657, "grad_norm": 0.13707685470581055, "learning_rate": 0.00018970012366034626, "loss": 1.0994, "step": 2100 }, { "epoch": 0.16188870151770657, "eval_loss": 1.0858707427978516, "eval_runtime": 185.7188, "eval_samples_per_second": 91.229, "eval_steps_per_second": 1.427, "step": 2100 } ], "logging_steps": 2, "max_steps": 38916, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.132999221824717e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }