diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7561 @@ +{ + "best_global_step": 2100, + "best_metric": 1.0858707427978516, + "best_model_checkpoint": "./outputs/checkpoint-2100", + "epoch": 0.16188870151770657, + "eval_steps": 100, + "global_step": 2100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00015417971573114913, + "grad_norm": 1.2087944746017456, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.8689, + "step": 2 + }, + { + "epoch": 0.00030835943146229826, + "grad_norm": 1.2666666507720947, + "learning_rate": 6e-06, + "loss": 1.7785, + "step": 4 + }, + { + "epoch": 0.00046253914719344736, + "grad_norm": 0.7307026982307434, + "learning_rate": 1e-05, + "loss": 1.6809, + "step": 6 + }, + { + "epoch": 0.0006167188629245965, + "grad_norm": 1.2569252252578735, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.9048, + "step": 8 + }, + { + "epoch": 0.0007708985786557456, + "grad_norm": 0.9572980403900146, + "learning_rate": 1.8e-05, + "loss": 1.7574, + "step": 10 + }, + { + "epoch": 0.0009250782943868947, + "grad_norm": 0.9918506145477295, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.858, + "step": 12 + }, + { + "epoch": 0.0010792580101180438, + "grad_norm": 0.9316955208778381, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.8238, + "step": 14 + }, + { + "epoch": 0.001233437725849193, + "grad_norm": 0.8265096545219421, + "learning_rate": 3e-05, + "loss": 1.6852, + "step": 16 + }, + { + "epoch": 0.001387617441580342, + "grad_norm": 0.900516152381897, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.8227, + "step": 18 + }, + { + "epoch": 0.0015417971573114912, + "grad_norm": 0.9343056678771973, + "learning_rate": 3.8e-05, + "loss": 1.7732, + "step": 20 + }, + { + "epoch": 0.0016959768730426404, + "grad_norm": 0.8314495086669922, + "learning_rate": 4.2e-05, + "loss": 1.732, + "step": 22 + }, + { + "epoch": 0.0018501565887737894, + "grad_norm": 0.8370314240455627, + "learning_rate": 4.600000000000001e-05, + "loss": 1.6725, + "step": 24 + }, + { + "epoch": 0.0020043363045049384, + "grad_norm": 0.6678845286369324, + "learning_rate": 5e-05, + "loss": 1.5638, + "step": 26 + }, + { + "epoch": 0.0021585160202360876, + "grad_norm": 0.6469596028327942, + "learning_rate": 5.4000000000000005e-05, + "loss": 1.6414, + "step": 28 + }, + { + "epoch": 0.002312695735967237, + "grad_norm": 1.1161589622497559, + "learning_rate": 5.8e-05, + "loss": 1.6015, + "step": 30 + }, + { + "epoch": 0.002466875451698386, + "grad_norm": 0.6085391044616699, + "learning_rate": 6.2e-05, + "loss": 1.4577, + "step": 32 + }, + { + "epoch": 0.0026210551674295353, + "grad_norm": 0.7159522175788879, + "learning_rate": 6.6e-05, + "loss": 1.4667, + "step": 34 + }, + { + "epoch": 0.002775234883160684, + "grad_norm": 0.67247074842453, + "learning_rate": 7e-05, + "loss": 1.5619, + "step": 36 + }, + { + "epoch": 0.0029294145988918332, + "grad_norm": 0.6272625923156738, + "learning_rate": 7.4e-05, + "loss": 1.322, + "step": 38 + }, + { + "epoch": 0.0030835943146229824, + "grad_norm": 0.7291163206100464, + "learning_rate": 7.800000000000001e-05, + "loss": 1.3936, + "step": 40 + }, + { + "epoch": 0.0032377740303541317, + "grad_norm": 0.4980190396308899, + "learning_rate": 8.2e-05, + "loss": 1.3322, + "step": 42 + }, + { + "epoch": 0.003391953746085281, + "grad_norm": 1.032578945159912, + "learning_rate": 8.6e-05, + "loss": 1.3657, + "step": 44 + }, + { + "epoch": 0.0035461334618164296, + "grad_norm": 0.5118615031242371, + "learning_rate": 9e-05, + "loss": 1.2866, + "step": 46 + }, + { + "epoch": 0.003700313177547579, + "grad_norm": 0.5234407782554626, + "learning_rate": 9.4e-05, + "loss": 1.2806, + "step": 48 + }, + { + "epoch": 0.003854492893278728, + "grad_norm": 0.49764135479927063, + "learning_rate": 9.8e-05, + "loss": 1.2004, + "step": 50 + }, + { + "epoch": 0.004008672609009877, + "grad_norm": 0.34377485513687134, + "learning_rate": 0.00010200000000000001, + "loss": 1.1947, + "step": 52 + }, + { + "epoch": 0.0041628523247410265, + "grad_norm": 0.41426530480384827, + "learning_rate": 0.00010600000000000002, + "loss": 1.2689, + "step": 54 + }, + { + "epoch": 0.004317032040472175, + "grad_norm": 0.5027992129325867, + "learning_rate": 0.00011000000000000002, + "loss": 1.2249, + "step": 56 + }, + { + "epoch": 0.004471211756203325, + "grad_norm": 0.44335752725601196, + "learning_rate": 0.00011399999999999999, + "loss": 1.2771, + "step": 58 + }, + { + "epoch": 0.004625391471934474, + "grad_norm": 0.3176646828651428, + "learning_rate": 0.000118, + "loss": 1.1873, + "step": 60 + }, + { + "epoch": 0.0047795711876656224, + "grad_norm": 0.24802716076374054, + "learning_rate": 0.000122, + "loss": 1.1989, + "step": 62 + }, + { + "epoch": 0.004933750903396772, + "grad_norm": 0.23831751942634583, + "learning_rate": 0.000126, + "loss": 1.1093, + "step": 64 + }, + { + "epoch": 0.005087930619127921, + "grad_norm": 0.24024009704589844, + "learning_rate": 0.00013000000000000002, + "loss": 1.2196, + "step": 66 + }, + { + "epoch": 0.0052421103348590705, + "grad_norm": 0.2745237350463867, + "learning_rate": 0.000134, + "loss": 1.1802, + "step": 68 + }, + { + "epoch": 0.005396290050590219, + "grad_norm": 0.27817806601524353, + "learning_rate": 0.000138, + "loss": 1.1939, + "step": 70 + }, + { + "epoch": 0.005550469766321368, + "grad_norm": 0.19907328486442566, + "learning_rate": 0.000142, + "loss": 1.2061, + "step": 72 + }, + { + "epoch": 0.005704649482052518, + "grad_norm": 0.18879663944244385, + "learning_rate": 0.000146, + "loss": 1.2149, + "step": 74 + }, + { + "epoch": 0.0058588291977836665, + "grad_norm": 0.21456782519817352, + "learning_rate": 0.00015000000000000001, + "loss": 1.1726, + "step": 76 + }, + { + "epoch": 0.006013008913514816, + "grad_norm": 0.23913143575191498, + "learning_rate": 0.000154, + "loss": 1.148, + "step": 78 + }, + { + "epoch": 0.006167188629245965, + "grad_norm": 0.2148526906967163, + "learning_rate": 0.00015800000000000002, + "loss": 1.1925, + "step": 80 + }, + { + "epoch": 0.006321368344977114, + "grad_norm": 0.2392999231815338, + "learning_rate": 0.000162, + "loss": 1.1488, + "step": 82 + }, + { + "epoch": 0.006475548060708263, + "grad_norm": 0.16503232717514038, + "learning_rate": 0.000166, + "loss": 1.1555, + "step": 84 + }, + { + "epoch": 0.006629727776439412, + "grad_norm": 0.1844739466905594, + "learning_rate": 0.00017, + "loss": 1.1934, + "step": 86 + }, + { + "epoch": 0.006783907492170562, + "grad_norm": 0.23832857608795166, + "learning_rate": 0.000174, + "loss": 1.1129, + "step": 88 + }, + { + "epoch": 0.0069380872079017105, + "grad_norm": 0.8846365809440613, + "learning_rate": 0.00017800000000000002, + "loss": 1.1028, + "step": 90 + }, + { + "epoch": 0.007092266923632859, + "grad_norm": 0.187076598405838, + "learning_rate": 0.000182, + "loss": 1.1, + "step": 92 + }, + { + "epoch": 0.007246446639364009, + "grad_norm": 0.1795521378517151, + "learning_rate": 0.00018600000000000002, + "loss": 1.1478, + "step": 94 + }, + { + "epoch": 0.007400626355095158, + "grad_norm": 0.199871227145195, + "learning_rate": 0.00019, + "loss": 1.1223, + "step": 96 + }, + { + "epoch": 0.007554806070826307, + "grad_norm": 0.17832662165164948, + "learning_rate": 0.000194, + "loss": 1.0909, + "step": 98 + }, + { + "epoch": 0.007708985786557456, + "grad_norm": 0.17023932933807373, + "learning_rate": 0.00019800000000000002, + "loss": 1.1526, + "step": 100 + }, + { + "epoch": 0.007708985786557456, + "eval_loss": 1.1401352882385254, + "eval_runtime": 185.6269, + "eval_samples_per_second": 91.274, + "eval_steps_per_second": 1.428, + "step": 100 + }, + { + "epoch": 0.007863165502288605, + "grad_norm": 0.17429223656654358, + "learning_rate": 0.00019999484748557298, + "loss": 1.1597, + "step": 102 + }, + { + "epoch": 0.008017345218019754, + "grad_norm": 0.16158349812030792, + "learning_rate": 0.0001999845424567189, + "loss": 1.1297, + "step": 104 + }, + { + "epoch": 0.008171524933750904, + "grad_norm": 0.15818771719932556, + "learning_rate": 0.0001999742374278648, + "loss": 1.083, + "step": 106 + }, + { + "epoch": 0.008325704649482053, + "grad_norm": 0.1591726392507553, + "learning_rate": 0.00019996393239901073, + "loss": 1.086, + "step": 108 + }, + { + "epoch": 0.008479884365213202, + "grad_norm": 0.174184650182724, + "learning_rate": 0.00019995362737015664, + "loss": 1.0769, + "step": 110 + }, + { + "epoch": 0.00863406408094435, + "grad_norm": 0.15928815305233002, + "learning_rate": 0.00019994332234130258, + "loss": 1.1315, + "step": 112 + }, + { + "epoch": 0.0087882437966755, + "grad_norm": 0.19639264047145844, + "learning_rate": 0.0001999330173124485, + "loss": 1.1339, + "step": 114 + }, + { + "epoch": 0.00894242351240665, + "grad_norm": 0.1639835238456726, + "learning_rate": 0.0001999227122835944, + "loss": 1.0836, + "step": 116 + }, + { + "epoch": 0.009096603228137799, + "grad_norm": 0.18691964447498322, + "learning_rate": 0.00019991240725474033, + "loss": 1.2109, + "step": 118 + }, + { + "epoch": 0.009250782943868947, + "grad_norm": 0.188096821308136, + "learning_rate": 0.00019990210222588624, + "loss": 1.1778, + "step": 120 + }, + { + "epoch": 0.009404962659600096, + "grad_norm": 0.1527150571346283, + "learning_rate": 0.00019989179719703218, + "loss": 1.0977, + "step": 122 + }, + { + "epoch": 0.009559142375331245, + "grad_norm": 0.1705218255519867, + "learning_rate": 0.0001998814921681781, + "loss": 1.1333, + "step": 124 + }, + { + "epoch": 0.009713322091062395, + "grad_norm": 0.1888928860425949, + "learning_rate": 0.00019987118713932401, + "loss": 1.1843, + "step": 126 + }, + { + "epoch": 0.009867501806793544, + "grad_norm": 0.1778104603290558, + "learning_rate": 0.00019986088211046993, + "loss": 1.0766, + "step": 128 + }, + { + "epoch": 0.010021681522524693, + "grad_norm": 0.15807992219924927, + "learning_rate": 0.00019985057708161584, + "loss": 1.0449, + "step": 130 + }, + { + "epoch": 0.010175861238255842, + "grad_norm": 0.16706159710884094, + "learning_rate": 0.00019984027205276176, + "loss": 1.0644, + "step": 132 + }, + { + "epoch": 0.01033004095398699, + "grad_norm": 0.16455501317977905, + "learning_rate": 0.00019982996702390767, + "loss": 1.1479, + "step": 134 + }, + { + "epoch": 0.010484220669718141, + "grad_norm": 0.17258939146995544, + "learning_rate": 0.0001998196619950536, + "loss": 1.0614, + "step": 136 + }, + { + "epoch": 0.01063840038544929, + "grad_norm": 0.15501369535923004, + "learning_rate": 0.0001998093569661995, + "loss": 1.1045, + "step": 138 + }, + { + "epoch": 0.010792580101180439, + "grad_norm": 0.1534334272146225, + "learning_rate": 0.00019979905193734542, + "loss": 1.1035, + "step": 140 + }, + { + "epoch": 0.010946759816911587, + "grad_norm": 0.14120443165302277, + "learning_rate": 0.00019978874690849136, + "loss": 1.0618, + "step": 142 + }, + { + "epoch": 0.011100939532642736, + "grad_norm": 0.17808520793914795, + "learning_rate": 0.00019977844187963728, + "loss": 1.1687, + "step": 144 + }, + { + "epoch": 0.011255119248373887, + "grad_norm": 0.16697613894939423, + "learning_rate": 0.0001997681368507832, + "loss": 1.0979, + "step": 146 + }, + { + "epoch": 0.011409298964105035, + "grad_norm": 0.16491086781024933, + "learning_rate": 0.0001997578318219291, + "loss": 1.1219, + "step": 148 + }, + { + "epoch": 0.011563478679836184, + "grad_norm": 0.15342313051223755, + "learning_rate": 0.00019974752679307502, + "loss": 1.1169, + "step": 150 + }, + { + "epoch": 0.011717658395567333, + "grad_norm": 0.1539286971092224, + "learning_rate": 0.00019973722176422093, + "loss": 1.1288, + "step": 152 + }, + { + "epoch": 0.011871838111298482, + "grad_norm": 0.15605852007865906, + "learning_rate": 0.00019972691673536688, + "loss": 1.0445, + "step": 154 + }, + { + "epoch": 0.012026017827029632, + "grad_norm": 0.14324098825454712, + "learning_rate": 0.0001997166117065128, + "loss": 1.1309, + "step": 156 + }, + { + "epoch": 0.012180197542760781, + "grad_norm": 0.21045701205730438, + "learning_rate": 0.0001997063066776587, + "loss": 1.0946, + "step": 158 + }, + { + "epoch": 0.01233437725849193, + "grad_norm": 0.16019922494888306, + "learning_rate": 0.00019969600164880462, + "loss": 1.11, + "step": 160 + }, + { + "epoch": 0.012488556974223079, + "grad_norm": 0.15740078687667847, + "learning_rate": 0.00019968569661995054, + "loss": 1.112, + "step": 162 + }, + { + "epoch": 0.012642736689954227, + "grad_norm": 0.16974380612373352, + "learning_rate": 0.00019967539159109648, + "loss": 1.1279, + "step": 164 + }, + { + "epoch": 0.012796916405685378, + "grad_norm": 0.16405288875102997, + "learning_rate": 0.0001996650865622424, + "loss": 1.0952, + "step": 166 + }, + { + "epoch": 0.012951096121416527, + "grad_norm": 0.16120509803295135, + "learning_rate": 0.0001996547815333883, + "loss": 1.1203, + "step": 168 + }, + { + "epoch": 0.013105275837147675, + "grad_norm": 0.17402276396751404, + "learning_rate": 0.00019964447650453422, + "loss": 1.0991, + "step": 170 + }, + { + "epoch": 0.013259455552878824, + "grad_norm": 0.18349111080169678, + "learning_rate": 0.00019963417147568014, + "loss": 1.1394, + "step": 172 + }, + { + "epoch": 0.013413635268609973, + "grad_norm": 0.14613087475299835, + "learning_rate": 0.00019962386644682608, + "loss": 1.1357, + "step": 174 + }, + { + "epoch": 0.013567814984341123, + "grad_norm": 0.142988383769989, + "learning_rate": 0.000199613561417972, + "loss": 1.0169, + "step": 176 + }, + { + "epoch": 0.013721994700072272, + "grad_norm": 0.14817160367965698, + "learning_rate": 0.0001996032563891179, + "loss": 1.1238, + "step": 178 + }, + { + "epoch": 0.013876174415803421, + "grad_norm": 0.15391133725643158, + "learning_rate": 0.00019959295136026382, + "loss": 1.0712, + "step": 180 + }, + { + "epoch": 0.01403035413153457, + "grad_norm": 0.1766846477985382, + "learning_rate": 0.00019958264633140974, + "loss": 1.1422, + "step": 182 + }, + { + "epoch": 0.014184533847265719, + "grad_norm": 0.16789212822914124, + "learning_rate": 0.00019957234130255565, + "loss": 1.1266, + "step": 184 + }, + { + "epoch": 0.014338713562996869, + "grad_norm": 0.1527165323495865, + "learning_rate": 0.00019956203627370157, + "loss": 1.0667, + "step": 186 + }, + { + "epoch": 0.014492893278728018, + "grad_norm": 0.1772206574678421, + "learning_rate": 0.00019955173124484748, + "loss": 1.1182, + "step": 188 + }, + { + "epoch": 0.014647072994459167, + "grad_norm": 0.15008313953876495, + "learning_rate": 0.0001995414262159934, + "loss": 1.0382, + "step": 190 + }, + { + "epoch": 0.014801252710190315, + "grad_norm": 0.16365988552570343, + "learning_rate": 0.00019953112118713931, + "loss": 1.1262, + "step": 192 + }, + { + "epoch": 0.014955432425921464, + "grad_norm": 0.14952193200588226, + "learning_rate": 0.00019952081615828526, + "loss": 1.1245, + "step": 194 + }, + { + "epoch": 0.015109612141652615, + "grad_norm": 0.15425263345241547, + "learning_rate": 0.00019951051112943117, + "loss": 1.1452, + "step": 196 + }, + { + "epoch": 0.015263791857383763, + "grad_norm": 0.1567617654800415, + "learning_rate": 0.00019950020610057709, + "loss": 1.0392, + "step": 198 + }, + { + "epoch": 0.015417971573114912, + "grad_norm": 0.14292609691619873, + "learning_rate": 0.000199489901071723, + "loss": 1.0728, + "step": 200 + }, + { + "epoch": 0.015417971573114912, + "eval_loss": 1.1127630472183228, + "eval_runtime": 185.2528, + "eval_samples_per_second": 91.459, + "eval_steps_per_second": 1.43, + "step": 200 + }, + { + "epoch": 0.015572151288846061, + "grad_norm": 0.15465517342090607, + "learning_rate": 0.00019947959604286892, + "loss": 1.0596, + "step": 202 + }, + { + "epoch": 0.01572633100457721, + "grad_norm": 0.16749607026576996, + "learning_rate": 0.00019946929101401486, + "loss": 1.1005, + "step": 204 + }, + { + "epoch": 0.01588051072030836, + "grad_norm": 0.15854287147521973, + "learning_rate": 0.00019945898598516077, + "loss": 1.0963, + "step": 206 + }, + { + "epoch": 0.016034690436039507, + "grad_norm": 0.1457831859588623, + "learning_rate": 0.0001994486809563067, + "loss": 1.1149, + "step": 208 + }, + { + "epoch": 0.016188870151770656, + "grad_norm": 0.15744629502296448, + "learning_rate": 0.0001994383759274526, + "loss": 1.0789, + "step": 210 + }, + { + "epoch": 0.01634304986750181, + "grad_norm": 0.13411423563957214, + "learning_rate": 0.00019942807089859852, + "loss": 1.0641, + "step": 212 + }, + { + "epoch": 0.016497229583232957, + "grad_norm": 0.1575399488210678, + "learning_rate": 0.00019941776586974446, + "loss": 1.0888, + "step": 214 + }, + { + "epoch": 0.016651409298964106, + "grad_norm": 0.14619529247283936, + "learning_rate": 0.00019940746084089037, + "loss": 1.081, + "step": 216 + }, + { + "epoch": 0.016805589014695255, + "grad_norm": 0.15578237175941467, + "learning_rate": 0.0001993971558120363, + "loss": 1.1434, + "step": 218 + }, + { + "epoch": 0.016959768730426403, + "grad_norm": 0.1516629308462143, + "learning_rate": 0.0001993868507831822, + "loss": 1.0909, + "step": 220 + }, + { + "epoch": 0.017113948446157552, + "grad_norm": 0.15613436698913574, + "learning_rate": 0.00019937654575432812, + "loss": 1.0999, + "step": 222 + }, + { + "epoch": 0.0172681281618887, + "grad_norm": 0.14825573563575745, + "learning_rate": 0.00019936624072547406, + "loss": 1.0827, + "step": 224 + }, + { + "epoch": 0.01742230787761985, + "grad_norm": 0.1624906212091446, + "learning_rate": 0.00019935593569661998, + "loss": 1.0856, + "step": 226 + }, + { + "epoch": 0.017576487593351, + "grad_norm": 0.1380940079689026, + "learning_rate": 0.0001993456306677659, + "loss": 1.0514, + "step": 228 + }, + { + "epoch": 0.017730667309082147, + "grad_norm": 0.13712120056152344, + "learning_rate": 0.0001993353256389118, + "loss": 1.0977, + "step": 230 + }, + { + "epoch": 0.0178848470248133, + "grad_norm": 0.1448957622051239, + "learning_rate": 0.00019932502061005772, + "loss": 1.0729, + "step": 232 + }, + { + "epoch": 0.01803902674054445, + "grad_norm": 0.13421876728534698, + "learning_rate": 0.00019931471558120364, + "loss": 1.0879, + "step": 234 + }, + { + "epoch": 0.018193206456275597, + "grad_norm": 0.16884732246398926, + "learning_rate": 0.00019930441055234955, + "loss": 1.1159, + "step": 236 + }, + { + "epoch": 0.018347386172006746, + "grad_norm": 0.14634890854358673, + "learning_rate": 0.00019929410552349547, + "loss": 1.0568, + "step": 238 + }, + { + "epoch": 0.018501565887737895, + "grad_norm": 0.16796648502349854, + "learning_rate": 0.00019928380049464138, + "loss": 1.0944, + "step": 240 + }, + { + "epoch": 0.018655745603469043, + "grad_norm": 0.13724717497825623, + "learning_rate": 0.0001992734954657873, + "loss": 1.0609, + "step": 242 + }, + { + "epoch": 0.018809925319200192, + "grad_norm": 0.14133594930171967, + "learning_rate": 0.0001992631904369332, + "loss": 1.0879, + "step": 244 + }, + { + "epoch": 0.01896410503493134, + "grad_norm": 0.1611246019601822, + "learning_rate": 0.00019925288540807915, + "loss": 1.0681, + "step": 246 + }, + { + "epoch": 0.01911828475066249, + "grad_norm": 0.17420877516269684, + "learning_rate": 0.00019924258037922507, + "loss": 1.1336, + "step": 248 + }, + { + "epoch": 0.01927246446639364, + "grad_norm": 0.13766029477119446, + "learning_rate": 0.00019923227535037098, + "loss": 1.075, + "step": 250 + }, + { + "epoch": 0.01942664418212479, + "grad_norm": 0.1691662222146988, + "learning_rate": 0.0001992219703215169, + "loss": 1.1369, + "step": 252 + }, + { + "epoch": 0.01958082389785594, + "grad_norm": 0.14959432184696198, + "learning_rate": 0.0001992116652926628, + "loss": 1.1129, + "step": 254 + }, + { + "epoch": 0.01973500361358709, + "grad_norm": 0.14996406435966492, + "learning_rate": 0.00019920136026380875, + "loss": 1.0304, + "step": 256 + }, + { + "epoch": 0.019889183329318237, + "grad_norm": 0.13211801648139954, + "learning_rate": 0.00019919105523495467, + "loss": 1.0652, + "step": 258 + }, + { + "epoch": 0.020043363045049386, + "grad_norm": 0.16041967272758484, + "learning_rate": 0.00019918075020610058, + "loss": 1.077, + "step": 260 + }, + { + "epoch": 0.020197542760780535, + "grad_norm": 0.1524546593427658, + "learning_rate": 0.0001991704451772465, + "loss": 1.1176, + "step": 262 + }, + { + "epoch": 0.020351722476511683, + "grad_norm": 0.16032540798187256, + "learning_rate": 0.00019916014014839241, + "loss": 1.0736, + "step": 264 + }, + { + "epoch": 0.020505902192242832, + "grad_norm": 0.17891019582748413, + "learning_rate": 0.00019914983511953836, + "loss": 1.1435, + "step": 266 + }, + { + "epoch": 0.02066008190797398, + "grad_norm": 0.14484059810638428, + "learning_rate": 0.00019913953009068427, + "loss": 1.0356, + "step": 268 + }, + { + "epoch": 0.02081426162370513, + "grad_norm": 0.14321155846118927, + "learning_rate": 0.00019912922506183019, + "loss": 1.0536, + "step": 270 + }, + { + "epoch": 0.020968441339436282, + "grad_norm": 0.17357808351516724, + "learning_rate": 0.0001991189200329761, + "loss": 1.171, + "step": 272 + }, + { + "epoch": 0.02112262105516743, + "grad_norm": 0.13990800082683563, + "learning_rate": 0.00019910861500412202, + "loss": 1.0946, + "step": 274 + }, + { + "epoch": 0.02127680077089858, + "grad_norm": 0.16634231805801392, + "learning_rate": 0.00019909830997526796, + "loss": 1.1029, + "step": 276 + }, + { + "epoch": 0.02143098048662973, + "grad_norm": 0.16322381794452667, + "learning_rate": 0.00019908800494641387, + "loss": 1.0688, + "step": 278 + }, + { + "epoch": 0.021585160202360877, + "grad_norm": 0.1652844250202179, + "learning_rate": 0.0001990776999175598, + "loss": 1.1237, + "step": 280 + }, + { + "epoch": 0.021739339918092026, + "grad_norm": 0.14457885921001434, + "learning_rate": 0.0001990673948887057, + "loss": 1.1995, + "step": 282 + }, + { + "epoch": 0.021893519633823175, + "grad_norm": 0.15549878776073456, + "learning_rate": 0.00019905708985985162, + "loss": 1.0475, + "step": 284 + }, + { + "epoch": 0.022047699349554323, + "grad_norm": 0.15715502202510834, + "learning_rate": 0.00019904678483099756, + "loss": 1.1211, + "step": 286 + }, + { + "epoch": 0.022201879065285472, + "grad_norm": 0.14022529125213623, + "learning_rate": 0.00019903647980214347, + "loss": 1.1056, + "step": 288 + }, + { + "epoch": 0.02235605878101662, + "grad_norm": 0.13293786346912384, + "learning_rate": 0.0001990261747732894, + "loss": 1.0877, + "step": 290 + }, + { + "epoch": 0.022510238496747773, + "grad_norm": 0.14625073969364166, + "learning_rate": 0.0001990158697444353, + "loss": 1.0375, + "step": 292 + }, + { + "epoch": 0.022664418212478922, + "grad_norm": 0.1417943835258484, + "learning_rate": 0.0001990055647155812, + "loss": 1.091, + "step": 294 + }, + { + "epoch": 0.02281859792821007, + "grad_norm": 0.1519964039325714, + "learning_rate": 0.00019899525968672713, + "loss": 1.0396, + "step": 296 + }, + { + "epoch": 0.02297277764394122, + "grad_norm": 0.1676655411720276, + "learning_rate": 0.00019898495465787305, + "loss": 1.1249, + "step": 298 + }, + { + "epoch": 0.02312695735967237, + "grad_norm": 0.1487220674753189, + "learning_rate": 0.00019897464962901896, + "loss": 1.1768, + "step": 300 + }, + { + "epoch": 0.02312695735967237, + "eval_loss": 1.1061022281646729, + "eval_runtime": 185.239, + "eval_samples_per_second": 91.466, + "eval_steps_per_second": 1.431, + "step": 300 + }, + { + "epoch": 0.023281137075403517, + "grad_norm": 0.1399739533662796, + "learning_rate": 0.00019896434460016488, + "loss": 1.0962, + "step": 302 + }, + { + "epoch": 0.023435316791134666, + "grad_norm": 0.15282337367534637, + "learning_rate": 0.0001989540395713108, + "loss": 1.1688, + "step": 304 + }, + { + "epoch": 0.023589496506865815, + "grad_norm": 0.15459619462490082, + "learning_rate": 0.00019894373454245674, + "loss": 1.0216, + "step": 306 + }, + { + "epoch": 0.023743676222596963, + "grad_norm": 0.15799634158611298, + "learning_rate": 0.00019893342951360265, + "loss": 1.1429, + "step": 308 + }, + { + "epoch": 0.023897855938328112, + "grad_norm": 0.1343819946050644, + "learning_rate": 0.00019892312448474857, + "loss": 1.0959, + "step": 310 + }, + { + "epoch": 0.024052035654059264, + "grad_norm": 0.14791317284107208, + "learning_rate": 0.00019891281945589448, + "loss": 1.0636, + "step": 312 + }, + { + "epoch": 0.024206215369790413, + "grad_norm": 0.1442137360572815, + "learning_rate": 0.0001989025144270404, + "loss": 1.055, + "step": 314 + }, + { + "epoch": 0.024360395085521562, + "grad_norm": 0.14649145305156708, + "learning_rate": 0.00019889220939818634, + "loss": 1.0906, + "step": 316 + }, + { + "epoch": 0.02451457480125271, + "grad_norm": 0.14234665036201477, + "learning_rate": 0.00019888190436933225, + "loss": 1.0853, + "step": 318 + }, + { + "epoch": 0.02466875451698386, + "grad_norm": 0.1419668048620224, + "learning_rate": 0.00019887159934047817, + "loss": 1.0296, + "step": 320 + }, + { + "epoch": 0.02482293423271501, + "grad_norm": 0.14730845391750336, + "learning_rate": 0.00019886129431162408, + "loss": 1.0421, + "step": 322 + }, + { + "epoch": 0.024977113948446157, + "grad_norm": 0.1400081068277359, + "learning_rate": 0.00019885098928277, + "loss": 1.0291, + "step": 324 + }, + { + "epoch": 0.025131293664177306, + "grad_norm": 0.15542668104171753, + "learning_rate": 0.0001988406842539159, + "loss": 1.0597, + "step": 326 + }, + { + "epoch": 0.025285473379908455, + "grad_norm": 0.14521440863609314, + "learning_rate": 0.00019883037922506185, + "loss": 1.0491, + "step": 328 + }, + { + "epoch": 0.025439653095639603, + "grad_norm": 0.16224826872348785, + "learning_rate": 0.00019882007419620777, + "loss": 1.1031, + "step": 330 + }, + { + "epoch": 0.025593832811370756, + "grad_norm": 0.15028877556324005, + "learning_rate": 0.00019880976916735368, + "loss": 1.1154, + "step": 332 + }, + { + "epoch": 0.025748012527101904, + "grad_norm": 0.12962941825389862, + "learning_rate": 0.0001987994641384996, + "loss": 1.0363, + "step": 334 + }, + { + "epoch": 0.025902192242833053, + "grad_norm": 0.14908359944820404, + "learning_rate": 0.0001987891591096455, + "loss": 1.1513, + "step": 336 + }, + { + "epoch": 0.026056371958564202, + "grad_norm": 0.15441828966140747, + "learning_rate": 0.00019877885408079146, + "loss": 1.1303, + "step": 338 + }, + { + "epoch": 0.02621055167429535, + "grad_norm": 0.12669101357460022, + "learning_rate": 0.00019876854905193737, + "loss": 1.0875, + "step": 340 + }, + { + "epoch": 0.0263647313900265, + "grad_norm": 0.13190661370754242, + "learning_rate": 0.00019875824402308329, + "loss": 1.0778, + "step": 342 + }, + { + "epoch": 0.02651891110575765, + "grad_norm": 0.14043989777565002, + "learning_rate": 0.0001987479389942292, + "loss": 1.1011, + "step": 344 + }, + { + "epoch": 0.026673090821488797, + "grad_norm": 0.13694870471954346, + "learning_rate": 0.00019873763396537512, + "loss": 1.0532, + "step": 346 + }, + { + "epoch": 0.026827270537219946, + "grad_norm": 0.15089921653270721, + "learning_rate": 0.00019872732893652103, + "loss": 1.1292, + "step": 348 + }, + { + "epoch": 0.026981450252951095, + "grad_norm": 0.14839838445186615, + "learning_rate": 0.00019871702390766694, + "loss": 1.0275, + "step": 350 + }, + { + "epoch": 0.027135629968682247, + "grad_norm": 0.16198500990867615, + "learning_rate": 0.00019870671887881286, + "loss": 1.1453, + "step": 352 + }, + { + "epoch": 0.027289809684413396, + "grad_norm": 0.14694632589817047, + "learning_rate": 0.00019869641384995877, + "loss": 1.129, + "step": 354 + }, + { + "epoch": 0.027443989400144544, + "grad_norm": 0.16091379523277283, + "learning_rate": 0.0001986861088211047, + "loss": 1.1186, + "step": 356 + }, + { + "epoch": 0.027598169115875693, + "grad_norm": 0.144720658659935, + "learning_rate": 0.00019867580379225063, + "loss": 1.0224, + "step": 358 + }, + { + "epoch": 0.027752348831606842, + "grad_norm": 0.13851307332515717, + "learning_rate": 0.00019866549876339655, + "loss": 1.1421, + "step": 360 + }, + { + "epoch": 0.02790652854733799, + "grad_norm": 0.13124969601631165, + "learning_rate": 0.00019865519373454246, + "loss": 1.0938, + "step": 362 + }, + { + "epoch": 0.02806070826306914, + "grad_norm": 0.14723828434944153, + "learning_rate": 0.00019864488870568838, + "loss": 1.1335, + "step": 364 + }, + { + "epoch": 0.02821488797880029, + "grad_norm": 0.17669795453548431, + "learning_rate": 0.0001986345836768343, + "loss": 1.0765, + "step": 366 + }, + { + "epoch": 0.028369067694531437, + "grad_norm": 0.1457260102033615, + "learning_rate": 0.00019862427864798023, + "loss": 1.1073, + "step": 368 + }, + { + "epoch": 0.028523247410262586, + "grad_norm": 0.13594554364681244, + "learning_rate": 0.00019861397361912615, + "loss": 1.0587, + "step": 370 + }, + { + "epoch": 0.028677427125993738, + "grad_norm": 0.13798941671848297, + "learning_rate": 0.00019860366859027206, + "loss": 1.0833, + "step": 372 + }, + { + "epoch": 0.028831606841724887, + "grad_norm": 0.15587519109249115, + "learning_rate": 0.00019859336356141798, + "loss": 1.0287, + "step": 374 + }, + { + "epoch": 0.028985786557456036, + "grad_norm": 0.16585086286067963, + "learning_rate": 0.0001985830585325639, + "loss": 1.1786, + "step": 376 + }, + { + "epoch": 0.029139966273187184, + "grad_norm": 0.1444484293460846, + "learning_rate": 0.00019857275350370983, + "loss": 1.1793, + "step": 378 + }, + { + "epoch": 0.029294145988918333, + "grad_norm": 0.14413981139659882, + "learning_rate": 0.00019856244847485575, + "loss": 1.1141, + "step": 380 + }, + { + "epoch": 0.029448325704649482, + "grad_norm": 0.142032191157341, + "learning_rate": 0.00019855214344600166, + "loss": 1.1033, + "step": 382 + }, + { + "epoch": 0.02960250542038063, + "grad_norm": 0.1490195393562317, + "learning_rate": 0.00019854183841714758, + "loss": 1.1592, + "step": 384 + }, + { + "epoch": 0.02975668513611178, + "grad_norm": 0.1408643275499344, + "learning_rate": 0.0001985315333882935, + "loss": 1.1505, + "step": 386 + }, + { + "epoch": 0.02991086485184293, + "grad_norm": 0.12526237964630127, + "learning_rate": 0.00019852122835943944, + "loss": 1.1027, + "step": 388 + }, + { + "epoch": 0.030065044567574077, + "grad_norm": 0.1339711844921112, + "learning_rate": 0.00019851092333058535, + "loss": 1.1238, + "step": 390 + }, + { + "epoch": 0.03021922428330523, + "grad_norm": 0.13032345473766327, + "learning_rate": 0.00019850061830173127, + "loss": 1.1121, + "step": 392 + }, + { + "epoch": 0.030373403999036378, + "grad_norm": 0.15815846621990204, + "learning_rate": 0.00019849031327287718, + "loss": 1.168, + "step": 394 + }, + { + "epoch": 0.030527583714767527, + "grad_norm": 0.14245116710662842, + "learning_rate": 0.0001984800082440231, + "loss": 1.0436, + "step": 396 + }, + { + "epoch": 0.030681763430498676, + "grad_norm": 0.15660050511360168, + "learning_rate": 0.000198469703215169, + "loss": 1.158, + "step": 398 + }, + { + "epoch": 0.030835943146229824, + "grad_norm": 0.1654158979654312, + "learning_rate": 0.00019845939818631493, + "loss": 1.0802, + "step": 400 + }, + { + "epoch": 0.030835943146229824, + "eval_loss": 1.1026971340179443, + "eval_runtime": 185.7295, + "eval_samples_per_second": 91.224, + "eval_steps_per_second": 1.427, + "step": 400 + }, + { + "epoch": 0.030990122861960973, + "grad_norm": 0.13845407962799072, + "learning_rate": 0.00019844909315746084, + "loss": 1.1055, + "step": 402 + }, + { + "epoch": 0.031144302577692122, + "grad_norm": 0.14852891862392426, + "learning_rate": 0.00019843878812860676, + "loss": 1.0983, + "step": 404 + }, + { + "epoch": 0.031298482293423274, + "grad_norm": 0.13408593833446503, + "learning_rate": 0.00019842848309975267, + "loss": 1.1063, + "step": 406 + }, + { + "epoch": 0.03145266200915442, + "grad_norm": 0.14041072130203247, + "learning_rate": 0.00019841817807089859, + "loss": 1.0327, + "step": 408 + }, + { + "epoch": 0.03160684172488557, + "grad_norm": 0.16119754314422607, + "learning_rate": 0.00019840787304204453, + "loss": 1.1, + "step": 410 + }, + { + "epoch": 0.03176102144061672, + "grad_norm": 0.14471223950386047, + "learning_rate": 0.00019839756801319044, + "loss": 1.0783, + "step": 412 + }, + { + "epoch": 0.03191520115634787, + "grad_norm": 0.15591050684452057, + "learning_rate": 0.00019838726298433636, + "loss": 1.1782, + "step": 414 + }, + { + "epoch": 0.032069380872079015, + "grad_norm": 0.1766556203365326, + "learning_rate": 0.00019837695795548227, + "loss": 1.1063, + "step": 416 + }, + { + "epoch": 0.03222356058781017, + "grad_norm": 0.16078630089759827, + "learning_rate": 0.0001983666529266282, + "loss": 1.0891, + "step": 418 + }, + { + "epoch": 0.03237774030354131, + "grad_norm": 0.13378402590751648, + "learning_rate": 0.00019835634789777413, + "loss": 1.074, + "step": 420 + }, + { + "epoch": 0.032531920019272464, + "grad_norm": 0.14526261389255524, + "learning_rate": 0.00019834604286892004, + "loss": 1.108, + "step": 422 + }, + { + "epoch": 0.03268609973500362, + "grad_norm": 0.1321713775396347, + "learning_rate": 0.00019833573784006596, + "loss": 1.019, + "step": 424 + }, + { + "epoch": 0.03284027945073476, + "grad_norm": 0.12685374915599823, + "learning_rate": 0.00019832543281121187, + "loss": 1.09, + "step": 426 + }, + { + "epoch": 0.032994459166465914, + "grad_norm": 0.13825605809688568, + "learning_rate": 0.0001983151277823578, + "loss": 1.1356, + "step": 428 + }, + { + "epoch": 0.03314863888219706, + "grad_norm": 0.13683827221393585, + "learning_rate": 0.00019830482275350373, + "loss": 1.1405, + "step": 430 + }, + { + "epoch": 0.03330281859792821, + "grad_norm": 0.16707143187522888, + "learning_rate": 0.00019829451772464965, + "loss": 1.1305, + "step": 432 + }, + { + "epoch": 0.03345699831365936, + "grad_norm": 0.11735045164823532, + "learning_rate": 0.00019828421269579556, + "loss": 1.0421, + "step": 434 + }, + { + "epoch": 0.03361117802939051, + "grad_norm": 0.1337989866733551, + "learning_rate": 0.00019827390766694148, + "loss": 1.0572, + "step": 436 + }, + { + "epoch": 0.033765357745121655, + "grad_norm": 0.17111611366271973, + "learning_rate": 0.0001982636026380874, + "loss": 1.1698, + "step": 438 + }, + { + "epoch": 0.03391953746085281, + "grad_norm": 0.13785259425640106, + "learning_rate": 0.00019825329760923333, + "loss": 1.056, + "step": 440 + }, + { + "epoch": 0.03407371717658395, + "grad_norm": 0.15061460435390472, + "learning_rate": 0.00019824299258037925, + "loss": 1.0963, + "step": 442 + }, + { + "epoch": 0.034227896892315104, + "grad_norm": 0.1231001690030098, + "learning_rate": 0.00019823268755152516, + "loss": 1.1264, + "step": 444 + }, + { + "epoch": 0.03438207660804626, + "grad_norm": 0.13752298057079315, + "learning_rate": 0.00019822238252267108, + "loss": 1.0672, + "step": 446 + }, + { + "epoch": 0.0345362563237774, + "grad_norm": 0.13519813120365143, + "learning_rate": 0.000198212077493817, + "loss": 1.0882, + "step": 448 + }, + { + "epoch": 0.034690436039508554, + "grad_norm": 0.140150785446167, + "learning_rate": 0.0001982017724649629, + "loss": 1.0572, + "step": 450 + }, + { + "epoch": 0.0348446157552397, + "grad_norm": 0.13910406827926636, + "learning_rate": 0.00019819146743610882, + "loss": 1.0762, + "step": 452 + }, + { + "epoch": 0.03499879547097085, + "grad_norm": 0.14587442576885223, + "learning_rate": 0.00019818116240725474, + "loss": 1.1232, + "step": 454 + }, + { + "epoch": 0.035152975186702, + "grad_norm": 0.14476893842220306, + "learning_rate": 0.00019817085737840065, + "loss": 1.1004, + "step": 456 + }, + { + "epoch": 0.03530715490243315, + "grad_norm": 0.13861101865768433, + "learning_rate": 0.00019816055234954657, + "loss": 1.0302, + "step": 458 + }, + { + "epoch": 0.035461334618164295, + "grad_norm": 0.14342686533927917, + "learning_rate": 0.0001981502473206925, + "loss": 1.1092, + "step": 460 + }, + { + "epoch": 0.03561551433389545, + "grad_norm": 0.11709775030612946, + "learning_rate": 0.00019813994229183842, + "loss": 1.0463, + "step": 462 + }, + { + "epoch": 0.0357696940496266, + "grad_norm": 0.15154917538166046, + "learning_rate": 0.00019812963726298434, + "loss": 1.0897, + "step": 464 + }, + { + "epoch": 0.035923873765357744, + "grad_norm": 0.16716259717941284, + "learning_rate": 0.00019811933223413025, + "loss": 1.1214, + "step": 466 + }, + { + "epoch": 0.0360780534810889, + "grad_norm": 0.13513320684432983, + "learning_rate": 0.00019810902720527617, + "loss": 1.0623, + "step": 468 + }, + { + "epoch": 0.03623223319682004, + "grad_norm": 0.15930432081222534, + "learning_rate": 0.0001980987221764221, + "loss": 1.1092, + "step": 470 + }, + { + "epoch": 0.036386412912551194, + "grad_norm": 0.13990509510040283, + "learning_rate": 0.00019808841714756803, + "loss": 1.1048, + "step": 472 + }, + { + "epoch": 0.03654059262828234, + "grad_norm": 0.18784300982952118, + "learning_rate": 0.00019807811211871394, + "loss": 1.1676, + "step": 474 + }, + { + "epoch": 0.03669477234401349, + "grad_norm": 0.152045339345932, + "learning_rate": 0.00019806780708985986, + "loss": 1.1303, + "step": 476 + }, + { + "epoch": 0.03684895205974464, + "grad_norm": 0.1409967988729477, + "learning_rate": 0.00019805750206100577, + "loss": 1.0972, + "step": 478 + }, + { + "epoch": 0.03700313177547579, + "grad_norm": 0.13838854432106018, + "learning_rate": 0.0001980471970321517, + "loss": 1.101, + "step": 480 + }, + { + "epoch": 0.037157311491206935, + "grad_norm": 0.1579430103302002, + "learning_rate": 0.00019803689200329763, + "loss": 1.1077, + "step": 482 + }, + { + "epoch": 0.03731149120693809, + "grad_norm": 0.15061910450458527, + "learning_rate": 0.00019802658697444354, + "loss": 1.1239, + "step": 484 + }, + { + "epoch": 0.03746567092266924, + "grad_norm": 0.16408291459083557, + "learning_rate": 0.00019801628194558946, + "loss": 1.0961, + "step": 486 + }, + { + "epoch": 0.037619850638400384, + "grad_norm": 0.15612424910068512, + "learning_rate": 0.00019800597691673537, + "loss": 1.1299, + "step": 488 + }, + { + "epoch": 0.03777403035413154, + "grad_norm": 0.14135530591011047, + "learning_rate": 0.00019799567188788131, + "loss": 1.0489, + "step": 490 + }, + { + "epoch": 0.03792821006986268, + "grad_norm": 0.13743548095226288, + "learning_rate": 0.00019798536685902723, + "loss": 1.0837, + "step": 492 + }, + { + "epoch": 0.038082389785593834, + "grad_norm": 0.157401442527771, + "learning_rate": 0.00019797506183017314, + "loss": 1.0573, + "step": 494 + }, + { + "epoch": 0.03823656950132498, + "grad_norm": 0.14982052147388458, + "learning_rate": 0.00019796475680131906, + "loss": 1.0839, + "step": 496 + }, + { + "epoch": 0.03839074921705613, + "grad_norm": 0.1347000151872635, + "learning_rate": 0.00019795445177246497, + "loss": 1.113, + "step": 498 + }, + { + "epoch": 0.03854492893278728, + "grad_norm": 0.14478904008865356, + "learning_rate": 0.0001979441467436109, + "loss": 1.0514, + "step": 500 + }, + { + "epoch": 0.03854492893278728, + "eval_loss": 1.1000746488571167, + "eval_runtime": 185.5217, + "eval_samples_per_second": 91.326, + "eval_steps_per_second": 1.428, + "step": 500 + }, + { + "epoch": 0.03869910864851843, + "grad_norm": 0.14274291694164276, + "learning_rate": 0.00019793384171475683, + "loss": 1.0847, + "step": 502 + }, + { + "epoch": 0.03885328836424958, + "grad_norm": 0.14326965808868408, + "learning_rate": 0.00019792353668590275, + "loss": 1.0865, + "step": 504 + }, + { + "epoch": 0.03900746807998073, + "grad_norm": 0.1575518548488617, + "learning_rate": 0.00019791323165704866, + "loss": 1.1258, + "step": 506 + }, + { + "epoch": 0.03916164779571188, + "grad_norm": 0.14699862897396088, + "learning_rate": 0.00019790292662819458, + "loss": 1.1687, + "step": 508 + }, + { + "epoch": 0.039315827511443024, + "grad_norm": 0.1394687294960022, + "learning_rate": 0.0001978926215993405, + "loss": 1.1214, + "step": 510 + }, + { + "epoch": 0.03947000722717418, + "grad_norm": 0.14366985857486725, + "learning_rate": 0.0001978823165704864, + "loss": 1.0651, + "step": 512 + }, + { + "epoch": 0.03962418694290532, + "grad_norm": 0.14171218872070312, + "learning_rate": 0.00019787201154163232, + "loss": 1.1398, + "step": 514 + }, + { + "epoch": 0.039778366658636474, + "grad_norm": 0.13258612155914307, + "learning_rate": 0.00019786170651277824, + "loss": 1.1234, + "step": 516 + }, + { + "epoch": 0.03993254637436762, + "grad_norm": 0.17693160474300385, + "learning_rate": 0.00019785140148392415, + "loss": 1.1121, + "step": 518 + }, + { + "epoch": 0.04008672609009877, + "grad_norm": 0.143838569521904, + "learning_rate": 0.00019784109645507006, + "loss": 1.102, + "step": 520 + }, + { + "epoch": 0.04024090580582992, + "grad_norm": 0.14078038930892944, + "learning_rate": 0.000197830791426216, + "loss": 1.1044, + "step": 522 + }, + { + "epoch": 0.04039508552156107, + "grad_norm": 0.12367985397577286, + "learning_rate": 0.00019782048639736192, + "loss": 1.102, + "step": 524 + }, + { + "epoch": 0.04054926523729222, + "grad_norm": 0.136929452419281, + "learning_rate": 0.00019781018136850784, + "loss": 1.0802, + "step": 526 + }, + { + "epoch": 0.04070344495302337, + "grad_norm": 0.15831957757472992, + "learning_rate": 0.00019779987633965375, + "loss": 1.09, + "step": 528 + }, + { + "epoch": 0.04085762466875452, + "grad_norm": 0.15482452511787415, + "learning_rate": 0.00019778957131079967, + "loss": 1.0828, + "step": 530 + }, + { + "epoch": 0.041011804384485664, + "grad_norm": 0.13797122240066528, + "learning_rate": 0.0001977792662819456, + "loss": 1.1263, + "step": 532 + }, + { + "epoch": 0.04116598410021682, + "grad_norm": 0.18304814398288727, + "learning_rate": 0.00019776896125309152, + "loss": 1.0991, + "step": 534 + }, + { + "epoch": 0.04132016381594796, + "grad_norm": 0.1509987860918045, + "learning_rate": 0.00019775865622423744, + "loss": 1.0804, + "step": 536 + }, + { + "epoch": 0.041474343531679114, + "grad_norm": 0.13406258821487427, + "learning_rate": 0.00019774835119538335, + "loss": 1.0348, + "step": 538 + }, + { + "epoch": 0.04162852324741026, + "grad_norm": 0.1413736194372177, + "learning_rate": 0.00019773804616652927, + "loss": 1.066, + "step": 540 + }, + { + "epoch": 0.04178270296314141, + "grad_norm": 0.1451394259929657, + "learning_rate": 0.0001977277411376752, + "loss": 1.0485, + "step": 542 + }, + { + "epoch": 0.041936882678872564, + "grad_norm": 0.13275358080863953, + "learning_rate": 0.00019771743610882113, + "loss": 1.1164, + "step": 544 + }, + { + "epoch": 0.04209106239460371, + "grad_norm": 0.15869611501693726, + "learning_rate": 0.00019770713107996704, + "loss": 1.1361, + "step": 546 + }, + { + "epoch": 0.04224524211033486, + "grad_norm": 0.14091487228870392, + "learning_rate": 0.00019769682605111295, + "loss": 1.061, + "step": 548 + }, + { + "epoch": 0.04239942182606601, + "grad_norm": 0.13538867235183716, + "learning_rate": 0.00019768652102225887, + "loss": 1.0607, + "step": 550 + }, + { + "epoch": 0.04255360154179716, + "grad_norm": 0.15626317262649536, + "learning_rate": 0.0001976762159934048, + "loss": 1.0758, + "step": 552 + }, + { + "epoch": 0.042707781257528304, + "grad_norm": 0.1293731927871704, + "learning_rate": 0.00019766591096455073, + "loss": 1.0434, + "step": 554 + }, + { + "epoch": 0.04286196097325946, + "grad_norm": 0.13498535752296448, + "learning_rate": 0.00019765560593569664, + "loss": 1.0953, + "step": 556 + }, + { + "epoch": 0.0430161406889906, + "grad_norm": 0.14134527742862701, + "learning_rate": 0.00019764530090684256, + "loss": 1.1559, + "step": 558 + }, + { + "epoch": 0.043170320404721754, + "grad_norm": 0.13958705961704254, + "learning_rate": 0.00019763499587798847, + "loss": 1.2585, + "step": 560 + }, + { + "epoch": 0.0433245001204529, + "grad_norm": 0.2181047797203064, + "learning_rate": 0.0001976246908491344, + "loss": 1.0164, + "step": 562 + }, + { + "epoch": 0.04347867983618405, + "grad_norm": 0.1365436315536499, + "learning_rate": 0.0001976143858202803, + "loss": 1.124, + "step": 564 + }, + { + "epoch": 0.043632859551915204, + "grad_norm": 0.12809793651103973, + "learning_rate": 0.00019760408079142622, + "loss": 1.0378, + "step": 566 + }, + { + "epoch": 0.04378703926764635, + "grad_norm": 0.12341924756765366, + "learning_rate": 0.00019759377576257213, + "loss": 1.1091, + "step": 568 + }, + { + "epoch": 0.0439412189833775, + "grad_norm": 0.14291982352733612, + "learning_rate": 0.00019758347073371805, + "loss": 1.1366, + "step": 570 + }, + { + "epoch": 0.04409539869910865, + "grad_norm": 0.14486652612686157, + "learning_rate": 0.000197573165704864, + "loss": 1.0168, + "step": 572 + }, + { + "epoch": 0.0442495784148398, + "grad_norm": 0.1724916249513626, + "learning_rate": 0.0001975628606760099, + "loss": 1.1037, + "step": 574 + }, + { + "epoch": 0.044403758130570944, + "grad_norm": 0.13338427245616913, + "learning_rate": 0.00019755255564715582, + "loss": 1.0259, + "step": 576 + }, + { + "epoch": 0.0445579378463021, + "grad_norm": 0.1372508853673935, + "learning_rate": 0.00019754225061830173, + "loss": 1.0784, + "step": 578 + }, + { + "epoch": 0.04471211756203324, + "grad_norm": 0.11633725464344025, + "learning_rate": 0.00019753194558944765, + "loss": 1.0648, + "step": 580 + }, + { + "epoch": 0.044866297277764394, + "grad_norm": 0.14386776089668274, + "learning_rate": 0.00019752164056059356, + "loss": 1.0777, + "step": 582 + }, + { + "epoch": 0.045020476993495546, + "grad_norm": 0.14929193258285522, + "learning_rate": 0.0001975113355317395, + "loss": 1.1319, + "step": 584 + }, + { + "epoch": 0.04517465670922669, + "grad_norm": 0.1324220448732376, + "learning_rate": 0.00019750103050288542, + "loss": 1.0614, + "step": 586 + }, + { + "epoch": 0.045328836424957844, + "grad_norm": 0.1392926126718521, + "learning_rate": 0.00019749072547403133, + "loss": 1.142, + "step": 588 + }, + { + "epoch": 0.04548301614068899, + "grad_norm": 0.2632090151309967, + "learning_rate": 0.00019748042044517725, + "loss": 1.0159, + "step": 590 + }, + { + "epoch": 0.04563719585642014, + "grad_norm": 0.13699129223823547, + "learning_rate": 0.00019747011541632316, + "loss": 1.0778, + "step": 592 + }, + { + "epoch": 0.04579137557215129, + "grad_norm": 0.13768675923347473, + "learning_rate": 0.0001974598103874691, + "loss": 1.0719, + "step": 594 + }, + { + "epoch": 0.04594555528788244, + "grad_norm": 0.13458684086799622, + "learning_rate": 0.00019744950535861502, + "loss": 1.0145, + "step": 596 + }, + { + "epoch": 0.046099735003613584, + "grad_norm": 0.1772696077823639, + "learning_rate": 0.00019743920032976094, + "loss": 1.0629, + "step": 598 + }, + { + "epoch": 0.04625391471934474, + "grad_norm": 0.13998697698116302, + "learning_rate": 0.00019742889530090685, + "loss": 1.102, + "step": 600 + }, + { + "epoch": 0.04625391471934474, + "eval_loss": 1.098169207572937, + "eval_runtime": 185.5141, + "eval_samples_per_second": 91.33, + "eval_steps_per_second": 1.428, + "step": 600 + }, + { + "epoch": 0.04640809443507588, + "grad_norm": 0.13928066194057465, + "learning_rate": 0.00019741859027205277, + "loss": 1.1527, + "step": 602 + }, + { + "epoch": 0.046562274150807034, + "grad_norm": 0.13011601567268372, + "learning_rate": 0.0001974082852431987, + "loss": 1.1259, + "step": 604 + }, + { + "epoch": 0.046716453866538186, + "grad_norm": 0.1306074559688568, + "learning_rate": 0.00019739798021434462, + "loss": 1.0951, + "step": 606 + }, + { + "epoch": 0.04687063358226933, + "grad_norm": 0.14797037839889526, + "learning_rate": 0.00019738767518549054, + "loss": 1.0321, + "step": 608 + }, + { + "epoch": 0.047024813298000484, + "grad_norm": 0.14849938452243805, + "learning_rate": 0.00019737737015663645, + "loss": 1.1096, + "step": 610 + }, + { + "epoch": 0.04717899301373163, + "grad_norm": 0.12060682475566864, + "learning_rate": 0.00019736706512778237, + "loss": 1.0652, + "step": 612 + }, + { + "epoch": 0.04733317272946278, + "grad_norm": 0.12754854559898376, + "learning_rate": 0.00019735676009892828, + "loss": 1.1097, + "step": 614 + }, + { + "epoch": 0.04748735244519393, + "grad_norm": 0.12162326276302338, + "learning_rate": 0.0001973464550700742, + "loss": 1.1087, + "step": 616 + }, + { + "epoch": 0.04764153216092508, + "grad_norm": 0.175630122423172, + "learning_rate": 0.0001973361500412201, + "loss": 1.0723, + "step": 618 + }, + { + "epoch": 0.047795711876656224, + "grad_norm": 0.15365472435951233, + "learning_rate": 0.00019732584501236603, + "loss": 1.1009, + "step": 620 + }, + { + "epoch": 0.04794989159238738, + "grad_norm": 0.13359837234020233, + "learning_rate": 0.00019731553998351194, + "loss": 1.0974, + "step": 622 + }, + { + "epoch": 0.04810407130811853, + "grad_norm": 0.1482960432767868, + "learning_rate": 0.00019730523495465788, + "loss": 1.1214, + "step": 624 + }, + { + "epoch": 0.048258251023849674, + "grad_norm": 0.1309668868780136, + "learning_rate": 0.0001972949299258038, + "loss": 1.0849, + "step": 626 + }, + { + "epoch": 0.048412430739580826, + "grad_norm": 0.1544414609670639, + "learning_rate": 0.00019728462489694971, + "loss": 1.092, + "step": 628 + }, + { + "epoch": 0.04856661045531197, + "grad_norm": 0.14907146990299225, + "learning_rate": 0.00019727431986809563, + "loss": 1.0671, + "step": 630 + }, + { + "epoch": 0.048720790171043124, + "grad_norm": 0.16943813860416412, + "learning_rate": 0.00019726401483924154, + "loss": 1.1433, + "step": 632 + }, + { + "epoch": 0.04887496988677427, + "grad_norm": 0.14070230722427368, + "learning_rate": 0.00019725370981038749, + "loss": 1.1613, + "step": 634 + }, + { + "epoch": 0.04902914960250542, + "grad_norm": 0.15507204830646515, + "learning_rate": 0.0001972434047815334, + "loss": 1.1286, + "step": 636 + }, + { + "epoch": 0.04918332931823657, + "grad_norm": 0.13587893545627594, + "learning_rate": 0.00019723309975267932, + "loss": 1.1094, + "step": 638 + }, + { + "epoch": 0.04933750903396772, + "grad_norm": 0.12399852275848389, + "learning_rate": 0.00019722279472382523, + "loss": 1.058, + "step": 640 + }, + { + "epoch": 0.049491688749698864, + "grad_norm": 0.12497518211603165, + "learning_rate": 0.00019721248969497115, + "loss": 1.0716, + "step": 642 + }, + { + "epoch": 0.04964586846543002, + "grad_norm": 0.15282607078552246, + "learning_rate": 0.0001972021846661171, + "loss": 1.0912, + "step": 644 + }, + { + "epoch": 0.04980004818116117, + "grad_norm": 0.14203013479709625, + "learning_rate": 0.000197191879637263, + "loss": 1.0846, + "step": 646 + }, + { + "epoch": 0.049954227896892314, + "grad_norm": 0.12308704853057861, + "learning_rate": 0.00019718157460840892, + "loss": 1.1202, + "step": 648 + }, + { + "epoch": 0.050108407612623466, + "grad_norm": 0.15226681530475616, + "learning_rate": 0.00019717126957955483, + "loss": 1.0626, + "step": 650 + }, + { + "epoch": 0.05026258732835461, + "grad_norm": 0.12636694312095642, + "learning_rate": 0.00019716096455070075, + "loss": 1.1086, + "step": 652 + }, + { + "epoch": 0.050416767044085764, + "grad_norm": 0.14969666302204132, + "learning_rate": 0.0001971506595218467, + "loss": 1.1602, + "step": 654 + }, + { + "epoch": 0.05057094675981691, + "grad_norm": 0.130833700299263, + "learning_rate": 0.0001971403544929926, + "loss": 1.0657, + "step": 656 + }, + { + "epoch": 0.05072512647554806, + "grad_norm": 0.1283751279115677, + "learning_rate": 0.00019713004946413852, + "loss": 1.0371, + "step": 658 + }, + { + "epoch": 0.05087930619127921, + "grad_norm": 0.11827697604894638, + "learning_rate": 0.00019711974443528443, + "loss": 1.0308, + "step": 660 + }, + { + "epoch": 0.05103348590701036, + "grad_norm": 0.12265590578317642, + "learning_rate": 0.00019710943940643035, + "loss": 1.1127, + "step": 662 + }, + { + "epoch": 0.05118766562274151, + "grad_norm": 0.13979150354862213, + "learning_rate": 0.0001970991343775763, + "loss": 1.1011, + "step": 664 + }, + { + "epoch": 0.05134184533847266, + "grad_norm": 0.1368461698293686, + "learning_rate": 0.0001970888293487222, + "loss": 1.0857, + "step": 666 + }, + { + "epoch": 0.05149602505420381, + "grad_norm": 0.13669301569461823, + "learning_rate": 0.00019707852431986812, + "loss": 1.0971, + "step": 668 + }, + { + "epoch": 0.051650204769934954, + "grad_norm": 0.12659449875354767, + "learning_rate": 0.00019706821929101404, + "loss": 1.0556, + "step": 670 + }, + { + "epoch": 0.051804384485666106, + "grad_norm": 0.14103113114833832, + "learning_rate": 0.00019705791426215995, + "loss": 1.0913, + "step": 672 + }, + { + "epoch": 0.05195856420139725, + "grad_norm": 0.16134017705917358, + "learning_rate": 0.00019704760923330587, + "loss": 1.0994, + "step": 674 + }, + { + "epoch": 0.052112743917128404, + "grad_norm": 0.12725086510181427, + "learning_rate": 0.00019703730420445178, + "loss": 1.1008, + "step": 676 + }, + { + "epoch": 0.05226692363285955, + "grad_norm": 0.12865908443927765, + "learning_rate": 0.0001970269991755977, + "loss": 1.0186, + "step": 678 + }, + { + "epoch": 0.0524211033485907, + "grad_norm": 0.1661859154701233, + "learning_rate": 0.0001970166941467436, + "loss": 1.068, + "step": 680 + }, + { + "epoch": 0.05257528306432185, + "grad_norm": 0.14370663464069366, + "learning_rate": 0.00019700638911788953, + "loss": 1.102, + "step": 682 + }, + { + "epoch": 0.052729462780053, + "grad_norm": 0.13285204768180847, + "learning_rate": 0.00019699608408903544, + "loss": 1.1055, + "step": 684 + }, + { + "epoch": 0.05288364249578415, + "grad_norm": 0.17762747406959534, + "learning_rate": 0.00019698577906018138, + "loss": 1.1601, + "step": 686 + }, + { + "epoch": 0.0530378222115153, + "grad_norm": 0.12693317234516144, + "learning_rate": 0.0001969754740313273, + "loss": 1.0494, + "step": 688 + }, + { + "epoch": 0.05319200192724645, + "grad_norm": 0.1302707940340042, + "learning_rate": 0.0001969651690024732, + "loss": 1.066, + "step": 690 + }, + { + "epoch": 0.053346181642977594, + "grad_norm": 0.11844471096992493, + "learning_rate": 0.00019695486397361913, + "loss": 1.0085, + "step": 692 + }, + { + "epoch": 0.053500361358708746, + "grad_norm": 0.12299422174692154, + "learning_rate": 0.00019694455894476504, + "loss": 1.0985, + "step": 694 + }, + { + "epoch": 0.05365454107443989, + "grad_norm": 0.1222420409321785, + "learning_rate": 0.00019693425391591098, + "loss": 1.0648, + "step": 696 + }, + { + "epoch": 0.053808720790171044, + "grad_norm": 0.13273879885673523, + "learning_rate": 0.0001969239488870569, + "loss": 1.1108, + "step": 698 + }, + { + "epoch": 0.05396290050590219, + "grad_norm": 0.13202215731143951, + "learning_rate": 0.00019691364385820281, + "loss": 1.1013, + "step": 700 + }, + { + "epoch": 0.05396290050590219, + "eval_loss": 1.0964874029159546, + "eval_runtime": 185.3303, + "eval_samples_per_second": 91.421, + "eval_steps_per_second": 1.43, + "step": 700 + }, + { + "epoch": 0.05411708022163334, + "grad_norm": 0.13038010895252228, + "learning_rate": 0.00019690333882934873, + "loss": 1.0642, + "step": 702 + }, + { + "epoch": 0.054271259937364494, + "grad_norm": 0.18084144592285156, + "learning_rate": 0.00019689303380049464, + "loss": 1.0673, + "step": 704 + }, + { + "epoch": 0.05442543965309564, + "grad_norm": 0.18958036601543427, + "learning_rate": 0.00019688272877164059, + "loss": 1.0925, + "step": 706 + }, + { + "epoch": 0.05457961936882679, + "grad_norm": 0.13386841118335724, + "learning_rate": 0.0001968724237427865, + "loss": 1.0978, + "step": 708 + }, + { + "epoch": 0.05473379908455794, + "grad_norm": 0.1408504843711853, + "learning_rate": 0.00019686211871393242, + "loss": 1.1158, + "step": 710 + }, + { + "epoch": 0.05488797880028909, + "grad_norm": 0.12006545811891556, + "learning_rate": 0.00019685181368507833, + "loss": 1.0395, + "step": 712 + }, + { + "epoch": 0.055042158516020234, + "grad_norm": 0.13973191380500793, + "learning_rate": 0.00019684150865622425, + "loss": 1.0685, + "step": 714 + }, + { + "epoch": 0.055196338231751386, + "grad_norm": 0.14461107552051544, + "learning_rate": 0.0001968312036273702, + "loss": 1.0924, + "step": 716 + }, + { + "epoch": 0.05535051794748253, + "grad_norm": 0.13358595967292786, + "learning_rate": 0.0001968208985985161, + "loss": 1.0479, + "step": 718 + }, + { + "epoch": 0.055504697663213684, + "grad_norm": 0.13416843116283417, + "learning_rate": 0.00019681059356966202, + "loss": 1.0166, + "step": 720 + }, + { + "epoch": 0.05565887737894483, + "grad_norm": 0.15217959880828857, + "learning_rate": 0.00019680028854080793, + "loss": 1.0918, + "step": 722 + }, + { + "epoch": 0.05581305709467598, + "grad_norm": 0.13012762367725372, + "learning_rate": 0.00019678998351195385, + "loss": 1.0967, + "step": 724 + }, + { + "epoch": 0.055967236810407134, + "grad_norm": 0.13023535907268524, + "learning_rate": 0.00019677967848309976, + "loss": 1.0247, + "step": 726 + }, + { + "epoch": 0.05612141652613828, + "grad_norm": 0.13703665137290955, + "learning_rate": 0.00019676937345424568, + "loss": 1.0969, + "step": 728 + }, + { + "epoch": 0.05627559624186943, + "grad_norm": 0.12767066061496735, + "learning_rate": 0.0001967590684253916, + "loss": 1.08, + "step": 730 + }, + { + "epoch": 0.05642977595760058, + "grad_norm": 0.12238382548093796, + "learning_rate": 0.0001967487633965375, + "loss": 1.1233, + "step": 732 + }, + { + "epoch": 0.05658395567333173, + "grad_norm": 0.1356974095106125, + "learning_rate": 0.00019673845836768342, + "loss": 1.0439, + "step": 734 + }, + { + "epoch": 0.056738135389062874, + "grad_norm": 0.14199669659137726, + "learning_rate": 0.00019672815333882936, + "loss": 1.0753, + "step": 736 + }, + { + "epoch": 0.056892315104794026, + "grad_norm": 0.12904112040996552, + "learning_rate": 0.00019671784830997528, + "loss": 1.0749, + "step": 738 + }, + { + "epoch": 0.05704649482052517, + "grad_norm": 0.1235031932592392, + "learning_rate": 0.0001967075432811212, + "loss": 1.0275, + "step": 740 + }, + { + "epoch": 0.057200674536256324, + "grad_norm": 0.170023113489151, + "learning_rate": 0.0001966972382522671, + "loss": 1.1295, + "step": 742 + }, + { + "epoch": 0.057354854251987476, + "grad_norm": 0.15533532202243805, + "learning_rate": 0.00019668693322341302, + "loss": 1.0629, + "step": 744 + }, + { + "epoch": 0.05750903396771862, + "grad_norm": 0.1602126806974411, + "learning_rate": 0.00019667662819455897, + "loss": 1.1538, + "step": 746 + }, + { + "epoch": 0.057663213683449774, + "grad_norm": 0.16433580219745636, + "learning_rate": 0.00019666632316570488, + "loss": 1.1322, + "step": 748 + }, + { + "epoch": 0.05781739339918092, + "grad_norm": 0.13925233483314514, + "learning_rate": 0.0001966560181368508, + "loss": 1.083, + "step": 750 + }, + { + "epoch": 0.05797157311491207, + "grad_norm": 0.12234565615653992, + "learning_rate": 0.0001966457131079967, + "loss": 1.0113, + "step": 752 + }, + { + "epoch": 0.05812575283064322, + "grad_norm": 0.1425125002861023, + "learning_rate": 0.00019663540807914262, + "loss": 1.0762, + "step": 754 + }, + { + "epoch": 0.05827993254637437, + "grad_norm": 0.14309099316596985, + "learning_rate": 0.00019662510305028854, + "loss": 1.0633, + "step": 756 + }, + { + "epoch": 0.058434112262105514, + "grad_norm": 0.1381814330816269, + "learning_rate": 0.00019661479802143448, + "loss": 1.142, + "step": 758 + }, + { + "epoch": 0.058588291977836666, + "grad_norm": 0.15551595389842987, + "learning_rate": 0.0001966044929925804, + "loss": 1.026, + "step": 760 + }, + { + "epoch": 0.05874247169356781, + "grad_norm": 0.14606410264968872, + "learning_rate": 0.0001965941879637263, + "loss": 1.1265, + "step": 762 + }, + { + "epoch": 0.058896651409298964, + "grad_norm": 0.13017289340496063, + "learning_rate": 0.00019658388293487223, + "loss": 1.1051, + "step": 764 + }, + { + "epoch": 0.059050831125030116, + "grad_norm": 0.1500990092754364, + "learning_rate": 0.00019657357790601814, + "loss": 1.0948, + "step": 766 + }, + { + "epoch": 0.05920501084076126, + "grad_norm": 0.14307473599910736, + "learning_rate": 0.00019656327287716408, + "loss": 1.0667, + "step": 768 + }, + { + "epoch": 0.059359190556492414, + "grad_norm": 0.13513712584972382, + "learning_rate": 0.00019655296784831, + "loss": 1.0488, + "step": 770 + }, + { + "epoch": 0.05951337027222356, + "grad_norm": 0.13991938531398773, + "learning_rate": 0.0001965426628194559, + "loss": 1.0888, + "step": 772 + }, + { + "epoch": 0.05966754998795471, + "grad_norm": 0.15015999972820282, + "learning_rate": 0.00019653235779060183, + "loss": 1.0774, + "step": 774 + }, + { + "epoch": 0.05982172970368586, + "grad_norm": 0.16419099271297455, + "learning_rate": 0.00019652205276174774, + "loss": 1.0661, + "step": 776 + }, + { + "epoch": 0.05997590941941701, + "grad_norm": 0.12072901427745819, + "learning_rate": 0.00019651174773289366, + "loss": 1.0645, + "step": 778 + }, + { + "epoch": 0.060130089135148154, + "grad_norm": 0.13410696387290955, + "learning_rate": 0.00019650144270403957, + "loss": 1.0677, + "step": 780 + }, + { + "epoch": 0.060284268850879306, + "grad_norm": 0.13373896479606628, + "learning_rate": 0.0001964911376751855, + "loss": 1.0055, + "step": 782 + }, + { + "epoch": 0.06043844856661046, + "grad_norm": 0.13043928146362305, + "learning_rate": 0.0001964808326463314, + "loss": 1.0579, + "step": 784 + }, + { + "epoch": 0.060592628282341604, + "grad_norm": 0.13334155082702637, + "learning_rate": 0.00019647052761747732, + "loss": 1.0781, + "step": 786 + }, + { + "epoch": 0.060746807998072756, + "grad_norm": 0.14660002291202545, + "learning_rate": 0.00019646022258862326, + "loss": 1.1244, + "step": 788 + }, + { + "epoch": 0.0609009877138039, + "grad_norm": 0.1240791380405426, + "learning_rate": 0.00019644991755976917, + "loss": 1.0353, + "step": 790 + }, + { + "epoch": 0.061055167429535054, + "grad_norm": 0.12248943001031876, + "learning_rate": 0.0001964396125309151, + "loss": 1.1292, + "step": 792 + }, + { + "epoch": 0.0612093471452662, + "grad_norm": 0.1340823471546173, + "learning_rate": 0.000196429307502061, + "loss": 1.0764, + "step": 794 + }, + { + "epoch": 0.06136352686099735, + "grad_norm": 0.1297413557767868, + "learning_rate": 0.00019641900247320692, + "loss": 1.0998, + "step": 796 + }, + { + "epoch": 0.0615177065767285, + "grad_norm": 0.13512568175792694, + "learning_rate": 0.00019640869744435286, + "loss": 1.0349, + "step": 798 + }, + { + "epoch": 0.06167188629245965, + "grad_norm": 0.13964438438415527, + "learning_rate": 0.00019639839241549878, + "loss": 1.0543, + "step": 800 + }, + { + "epoch": 0.06167188629245965, + "eval_loss": 1.0952669382095337, + "eval_runtime": 185.8383, + "eval_samples_per_second": 91.171, + "eval_steps_per_second": 1.426, + "step": 800 + }, + { + "epoch": 0.061826066008190794, + "grad_norm": 0.1318446695804596, + "learning_rate": 0.0001963880873866447, + "loss": 1.1469, + "step": 802 + }, + { + "epoch": 0.061980245723921946, + "grad_norm": 0.13778544962406158, + "learning_rate": 0.0001963777823577906, + "loss": 1.0361, + "step": 804 + }, + { + "epoch": 0.0621344254396531, + "grad_norm": 0.14804169535636902, + "learning_rate": 0.00019636747732893652, + "loss": 1.0537, + "step": 806 + }, + { + "epoch": 0.062288605155384244, + "grad_norm": 0.1363479495048523, + "learning_rate": 0.00019635717230008246, + "loss": 1.0819, + "step": 808 + }, + { + "epoch": 0.062442784871115396, + "grad_norm": 0.12277363240718842, + "learning_rate": 0.00019634686727122838, + "loss": 1.0629, + "step": 810 + }, + { + "epoch": 0.06259696458684655, + "grad_norm": 0.13027344644069672, + "learning_rate": 0.0001963365622423743, + "loss": 1.0544, + "step": 812 + }, + { + "epoch": 0.0627511443025777, + "grad_norm": 0.1274079531431198, + "learning_rate": 0.0001963262572135202, + "loss": 1.0685, + "step": 814 + }, + { + "epoch": 0.06290532401830884, + "grad_norm": 0.1349189281463623, + "learning_rate": 0.00019631595218466612, + "loss": 1.0289, + "step": 816 + }, + { + "epoch": 0.06305950373403998, + "grad_norm": 0.1265273541212082, + "learning_rate": 0.00019630564715581206, + "loss": 1.0765, + "step": 818 + }, + { + "epoch": 0.06321368344977114, + "grad_norm": 0.1393941193819046, + "learning_rate": 0.00019629534212695798, + "loss": 1.0918, + "step": 820 + }, + { + "epoch": 0.06336786316550229, + "grad_norm": 0.12475106865167618, + "learning_rate": 0.0001962850370981039, + "loss": 1.027, + "step": 822 + }, + { + "epoch": 0.06352204288123343, + "grad_norm": 0.13844382762908936, + "learning_rate": 0.0001962747320692498, + "loss": 1.1482, + "step": 824 + }, + { + "epoch": 0.0636762225969646, + "grad_norm": 0.1444624364376068, + "learning_rate": 0.00019626442704039572, + "loss": 1.0659, + "step": 826 + }, + { + "epoch": 0.06383040231269574, + "grad_norm": 0.13939915597438812, + "learning_rate": 0.00019625412201154164, + "loss": 1.0392, + "step": 828 + }, + { + "epoch": 0.06398458202842688, + "grad_norm": 0.12919913232326508, + "learning_rate": 0.00019624381698268755, + "loss": 1.0566, + "step": 830 + }, + { + "epoch": 0.06413876174415803, + "grad_norm": 0.1297498196363449, + "learning_rate": 0.00019623351195383347, + "loss": 1.058, + "step": 832 + }, + { + "epoch": 0.06429294145988919, + "grad_norm": 0.16311457753181458, + "learning_rate": 0.00019622320692497938, + "loss": 1.1175, + "step": 834 + }, + { + "epoch": 0.06444712117562033, + "grad_norm": 0.14434239268302917, + "learning_rate": 0.0001962129018961253, + "loss": 1.0966, + "step": 836 + }, + { + "epoch": 0.06460130089135148, + "grad_norm": 0.13500697910785675, + "learning_rate": 0.00019620259686727121, + "loss": 1.138, + "step": 838 + }, + { + "epoch": 0.06475548060708262, + "grad_norm": 0.13175781071186066, + "learning_rate": 0.00019619229183841716, + "loss": 1.0744, + "step": 840 + }, + { + "epoch": 0.06490966032281378, + "grad_norm": 0.142098531126976, + "learning_rate": 0.00019618198680956307, + "loss": 1.0686, + "step": 842 + }, + { + "epoch": 0.06506384003854493, + "grad_norm": 0.16844119131565094, + "learning_rate": 0.00019617168178070899, + "loss": 1.0992, + "step": 844 + }, + { + "epoch": 0.06521801975427607, + "grad_norm": 0.13562923669815063, + "learning_rate": 0.0001961613767518549, + "loss": 1.0749, + "step": 846 + }, + { + "epoch": 0.06537219947000723, + "grad_norm": 0.14538466930389404, + "learning_rate": 0.00019615107172300082, + "loss": 1.123, + "step": 848 + }, + { + "epoch": 0.06552637918573838, + "grad_norm": 0.13058879971504211, + "learning_rate": 0.00019614076669414676, + "loss": 1.0835, + "step": 850 + }, + { + "epoch": 0.06568055890146952, + "grad_norm": 0.1567140519618988, + "learning_rate": 0.00019613046166529267, + "loss": 1.1157, + "step": 852 + }, + { + "epoch": 0.06583473861720067, + "grad_norm": 0.12576104700565338, + "learning_rate": 0.0001961201566364386, + "loss": 1.0143, + "step": 854 + }, + { + "epoch": 0.06598891833293183, + "grad_norm": 0.13823091983795166, + "learning_rate": 0.0001961098516075845, + "loss": 1.0797, + "step": 856 + }, + { + "epoch": 0.06614309804866297, + "grad_norm": 0.12293639779090881, + "learning_rate": 0.00019609954657873042, + "loss": 1.0808, + "step": 858 + }, + { + "epoch": 0.06629727776439412, + "grad_norm": 0.13951502740383148, + "learning_rate": 0.00019608924154987636, + "loss": 1.076, + "step": 860 + }, + { + "epoch": 0.06645145748012526, + "grad_norm": 0.13900773227214813, + "learning_rate": 0.00019607893652102227, + "loss": 1.0846, + "step": 862 + }, + { + "epoch": 0.06660563719585642, + "grad_norm": 0.14335249364376068, + "learning_rate": 0.0001960686314921682, + "loss": 1.0639, + "step": 864 + }, + { + "epoch": 0.06675981691158757, + "grad_norm": 0.1712643951177597, + "learning_rate": 0.0001960583264633141, + "loss": 1.1411, + "step": 866 + }, + { + "epoch": 0.06691399662731871, + "grad_norm": 0.12118082493543625, + "learning_rate": 0.00019604802143446002, + "loss": 1.0807, + "step": 868 + }, + { + "epoch": 0.06706817634304987, + "grad_norm": 0.141808420419693, + "learning_rate": 0.00019603771640560596, + "loss": 1.0641, + "step": 870 + }, + { + "epoch": 0.06722235605878102, + "grad_norm": 0.14798308908939362, + "learning_rate": 0.00019602741137675188, + "loss": 1.073, + "step": 872 + }, + { + "epoch": 0.06737653577451216, + "grad_norm": 0.13768306374549866, + "learning_rate": 0.0001960171063478978, + "loss": 1.0735, + "step": 874 + }, + { + "epoch": 0.06753071549024331, + "grad_norm": 0.12452355027198792, + "learning_rate": 0.0001960068013190437, + "loss": 1.0509, + "step": 876 + }, + { + "epoch": 0.06768489520597447, + "grad_norm": 0.1402217000722885, + "learning_rate": 0.00019599649629018962, + "loss": 1.1157, + "step": 878 + }, + { + "epoch": 0.06783907492170561, + "grad_norm": 0.12509870529174805, + "learning_rate": 0.00019598619126133556, + "loss": 1.0516, + "step": 880 + }, + { + "epoch": 0.06799325463743676, + "grad_norm": 0.1574297547340393, + "learning_rate": 0.00019597588623248148, + "loss": 1.0823, + "step": 882 + }, + { + "epoch": 0.0681474343531679, + "grad_norm": 0.14185413718223572, + "learning_rate": 0.0001959655812036274, + "loss": 1.0444, + "step": 884 + }, + { + "epoch": 0.06830161406889906, + "grad_norm": 0.1380462348461151, + "learning_rate": 0.0001959552761747733, + "loss": 1.1066, + "step": 886 + }, + { + "epoch": 0.06845579378463021, + "grad_norm": 0.12986746430397034, + "learning_rate": 0.00019594497114591922, + "loss": 1.1006, + "step": 888 + }, + { + "epoch": 0.06860997350036135, + "grad_norm": 0.13894346356391907, + "learning_rate": 0.00019593466611706514, + "loss": 1.0569, + "step": 890 + }, + { + "epoch": 0.06876415321609251, + "grad_norm": 0.12822435796260834, + "learning_rate": 0.00019592436108821105, + "loss": 1.0696, + "step": 892 + }, + { + "epoch": 0.06891833293182366, + "grad_norm": 0.1369408816099167, + "learning_rate": 0.00019591405605935697, + "loss": 1.0691, + "step": 894 + }, + { + "epoch": 0.0690725126475548, + "grad_norm": 0.13459660112857819, + "learning_rate": 0.00019590375103050288, + "loss": 1.0801, + "step": 896 + }, + { + "epoch": 0.06922669236328595, + "grad_norm": 0.1299123764038086, + "learning_rate": 0.0001958934460016488, + "loss": 1.0885, + "step": 898 + }, + { + "epoch": 0.06938087207901711, + "grad_norm": 0.12562230229377747, + "learning_rate": 0.00019588314097279474, + "loss": 1.183, + "step": 900 + }, + { + "epoch": 0.06938087207901711, + "eval_loss": 1.0944268703460693, + "eval_runtime": 185.3723, + "eval_samples_per_second": 91.4, + "eval_steps_per_second": 1.43, + "step": 900 + }, + { + "epoch": 0.06953505179474825, + "grad_norm": 0.13996927440166473, + "learning_rate": 0.00019587283594394065, + "loss": 1.0356, + "step": 902 + }, + { + "epoch": 0.0696892315104794, + "grad_norm": 0.128004252910614, + "learning_rate": 0.00019586253091508657, + "loss": 1.0343, + "step": 904 + }, + { + "epoch": 0.06984341122621056, + "grad_norm": 0.15650418400764465, + "learning_rate": 0.00019585222588623248, + "loss": 1.1138, + "step": 906 + }, + { + "epoch": 0.0699975909419417, + "grad_norm": 0.5840476751327515, + "learning_rate": 0.0001958419208573784, + "loss": 1.1785, + "step": 908 + }, + { + "epoch": 0.07015177065767285, + "grad_norm": 0.15330374240875244, + "learning_rate": 0.00019583161582852434, + "loss": 1.0243, + "step": 910 + }, + { + "epoch": 0.070305950373404, + "grad_norm": 0.1603543907403946, + "learning_rate": 0.00019582131079967026, + "loss": 1.1228, + "step": 912 + }, + { + "epoch": 0.07046013008913515, + "grad_norm": 0.14209845662117004, + "learning_rate": 0.00019581100577081617, + "loss": 1.0939, + "step": 914 + }, + { + "epoch": 0.0706143098048663, + "grad_norm": 0.16117019951343536, + "learning_rate": 0.00019580070074196209, + "loss": 1.1447, + "step": 916 + }, + { + "epoch": 0.07076848952059744, + "grad_norm": 0.14068694412708282, + "learning_rate": 0.000195790395713108, + "loss": 1.0642, + "step": 918 + }, + { + "epoch": 0.07092266923632859, + "grad_norm": 0.15248316526412964, + "learning_rate": 0.00019578009068425394, + "loss": 1.0162, + "step": 920 + }, + { + "epoch": 0.07107684895205975, + "grad_norm": 0.22734233736991882, + "learning_rate": 0.00019576978565539986, + "loss": 1.1123, + "step": 922 + }, + { + "epoch": 0.0712310286677909, + "grad_norm": 0.1393287032842636, + "learning_rate": 0.00019575948062654577, + "loss": 1.0862, + "step": 924 + }, + { + "epoch": 0.07138520838352204, + "grad_norm": 0.12911191582679749, + "learning_rate": 0.0001957491755976917, + "loss": 1.0651, + "step": 926 + }, + { + "epoch": 0.0715393880992532, + "grad_norm": 0.12298440933227539, + "learning_rate": 0.0001957388705688376, + "loss": 1.1227, + "step": 928 + }, + { + "epoch": 0.07169356781498434, + "grad_norm": 0.14941005408763885, + "learning_rate": 0.00019572856553998352, + "loss": 1.0989, + "step": 930 + }, + { + "epoch": 0.07184774753071549, + "grad_norm": 0.1411515325307846, + "learning_rate": 0.00019571826051112946, + "loss": 1.0816, + "step": 932 + }, + { + "epoch": 0.07200192724644663, + "grad_norm": 0.11999720335006714, + "learning_rate": 0.00019570795548227537, + "loss": 1.0306, + "step": 934 + }, + { + "epoch": 0.0721561069621778, + "grad_norm": 0.1500861495733261, + "learning_rate": 0.0001956976504534213, + "loss": 1.0678, + "step": 936 + }, + { + "epoch": 0.07231028667790894, + "grad_norm": 0.12102475017309189, + "learning_rate": 0.0001956873454245672, + "loss": 1.0534, + "step": 938 + }, + { + "epoch": 0.07246446639364008, + "grad_norm": 0.11554603278636932, + "learning_rate": 0.00019567704039571312, + "loss": 1.0535, + "step": 940 + }, + { + "epoch": 0.07261864610937123, + "grad_norm": 0.12290264666080475, + "learning_rate": 0.00019566673536685903, + "loss": 1.0738, + "step": 942 + }, + { + "epoch": 0.07277282582510239, + "grad_norm": 0.17740991711616516, + "learning_rate": 0.00019565643033800495, + "loss": 1.0811, + "step": 944 + }, + { + "epoch": 0.07292700554083353, + "grad_norm": 0.14767777919769287, + "learning_rate": 0.00019564612530915086, + "loss": 1.105, + "step": 946 + }, + { + "epoch": 0.07308118525656468, + "grad_norm": 0.13773177564144135, + "learning_rate": 0.00019563582028029678, + "loss": 1.0983, + "step": 948 + }, + { + "epoch": 0.07323536497229584, + "grad_norm": 0.13891370594501495, + "learning_rate": 0.0001956255152514427, + "loss": 1.1349, + "step": 950 + }, + { + "epoch": 0.07338954468802698, + "grad_norm": 0.14717017114162445, + "learning_rate": 0.00019561521022258863, + "loss": 1.134, + "step": 952 + }, + { + "epoch": 0.07354372440375813, + "grad_norm": 0.15095743536949158, + "learning_rate": 0.00019560490519373455, + "loss": 1.063, + "step": 954 + }, + { + "epoch": 0.07369790411948927, + "grad_norm": 0.12851206958293915, + "learning_rate": 0.00019559460016488046, + "loss": 1.1005, + "step": 956 + }, + { + "epoch": 0.07385208383522043, + "grad_norm": 0.13364006578922272, + "learning_rate": 0.00019558429513602638, + "loss": 1.0429, + "step": 958 + }, + { + "epoch": 0.07400626355095158, + "grad_norm": 0.1326039433479309, + "learning_rate": 0.0001955739901071723, + "loss": 1.1586, + "step": 960 + }, + { + "epoch": 0.07416044326668272, + "grad_norm": 0.13149486482143402, + "learning_rate": 0.00019556368507831824, + "loss": 1.109, + "step": 962 + }, + { + "epoch": 0.07431462298241387, + "grad_norm": 0.1189669519662857, + "learning_rate": 0.00019555338004946415, + "loss": 1.0462, + "step": 964 + }, + { + "epoch": 0.07446880269814503, + "grad_norm": 0.14341482520103455, + "learning_rate": 0.00019554307502061007, + "loss": 1.0623, + "step": 966 + }, + { + "epoch": 0.07462298241387617, + "grad_norm": 0.14133721590042114, + "learning_rate": 0.00019553276999175598, + "loss": 1.0945, + "step": 968 + }, + { + "epoch": 0.07477716212960732, + "grad_norm": 0.1351941078901291, + "learning_rate": 0.0001955224649629019, + "loss": 1.0327, + "step": 970 + }, + { + "epoch": 0.07493134184533848, + "grad_norm": 0.12836019694805145, + "learning_rate": 0.00019551215993404784, + "loss": 1.069, + "step": 972 + }, + { + "epoch": 0.07508552156106962, + "grad_norm": 0.13199055194854736, + "learning_rate": 0.00019550185490519375, + "loss": 1.0323, + "step": 974 + }, + { + "epoch": 0.07523970127680077, + "grad_norm": 0.14991353452205658, + "learning_rate": 0.00019549154987633967, + "loss": 1.0625, + "step": 976 + }, + { + "epoch": 0.07539388099253191, + "grad_norm": 0.13832435011863708, + "learning_rate": 0.00019548124484748558, + "loss": 1.1031, + "step": 978 + }, + { + "epoch": 0.07554806070826307, + "grad_norm": 0.12351599335670471, + "learning_rate": 0.0001954709398186315, + "loss": 1.0286, + "step": 980 + }, + { + "epoch": 0.07570224042399422, + "grad_norm": 0.12360050529241562, + "learning_rate": 0.00019546063478977744, + "loss": 1.0652, + "step": 982 + }, + { + "epoch": 0.07585642013972536, + "grad_norm": 0.13384872674942017, + "learning_rate": 0.00019545032976092335, + "loss": 1.1125, + "step": 984 + }, + { + "epoch": 0.07601059985545652, + "grad_norm": 0.13200527429580688, + "learning_rate": 0.00019544002473206927, + "loss": 1.0727, + "step": 986 + }, + { + "epoch": 0.07616477957118767, + "grad_norm": 0.143647700548172, + "learning_rate": 0.00019542971970321518, + "loss": 1.1207, + "step": 988 + }, + { + "epoch": 0.07631895928691881, + "grad_norm": 0.13605177402496338, + "learning_rate": 0.0001954194146743611, + "loss": 1.0225, + "step": 990 + }, + { + "epoch": 0.07647313900264996, + "grad_norm": 0.12646125257015228, + "learning_rate": 0.00019540910964550701, + "loss": 1.11, + "step": 992 + }, + { + "epoch": 0.07662731871838112, + "grad_norm": 0.132467120885849, + "learning_rate": 0.00019539880461665293, + "loss": 1.1092, + "step": 994 + }, + { + "epoch": 0.07678149843411226, + "grad_norm": 0.12461701035499573, + "learning_rate": 0.00019538849958779884, + "loss": 1.0854, + "step": 996 + }, + { + "epoch": 0.07693567814984341, + "grad_norm": 0.13430501520633698, + "learning_rate": 0.00019537819455894476, + "loss": 1.2, + "step": 998 + }, + { + "epoch": 0.07708985786557455, + "grad_norm": 0.12623916566371918, + "learning_rate": 0.00019536788953009067, + "loss": 1.0522, + "step": 1000 + }, + { + "epoch": 0.07708985786557455, + "eval_loss": 1.0930616855621338, + "eval_runtime": 185.4001, + "eval_samples_per_second": 91.386, + "eval_steps_per_second": 1.429, + "step": 1000 + }, + { + "epoch": 0.07724403758130571, + "grad_norm": 0.11760087311267853, + "learning_rate": 0.00019535758450123662, + "loss": 1.1566, + "step": 1002 + }, + { + "epoch": 0.07739821729703686, + "grad_norm": 0.145633727312088, + "learning_rate": 0.00019534727947238253, + "loss": 1.094, + "step": 1004 + }, + { + "epoch": 0.077552397012768, + "grad_norm": 0.1311633288860321, + "learning_rate": 0.00019533697444352845, + "loss": 1.0792, + "step": 1006 + }, + { + "epoch": 0.07770657672849916, + "grad_norm": 0.12563548982143402, + "learning_rate": 0.00019532666941467436, + "loss": 1.0601, + "step": 1008 + }, + { + "epoch": 0.07786075644423031, + "grad_norm": 0.14429886639118195, + "learning_rate": 0.00019531636438582028, + "loss": 1.0926, + "step": 1010 + }, + { + "epoch": 0.07801493615996145, + "grad_norm": 0.13131891191005707, + "learning_rate": 0.0001953060593569662, + "loss": 1.1012, + "step": 1012 + }, + { + "epoch": 0.0781691158756926, + "grad_norm": 0.14185300469398499, + "learning_rate": 0.00019529575432811213, + "loss": 1.1113, + "step": 1014 + }, + { + "epoch": 0.07832329559142376, + "grad_norm": 0.14298418164253235, + "learning_rate": 0.00019528544929925805, + "loss": 1.0909, + "step": 1016 + }, + { + "epoch": 0.0784774753071549, + "grad_norm": 0.1339821219444275, + "learning_rate": 0.00019527514427040396, + "loss": 1.0994, + "step": 1018 + }, + { + "epoch": 0.07863165502288605, + "grad_norm": 0.1252928525209427, + "learning_rate": 0.00019526483924154988, + "loss": 1.0316, + "step": 1020 + }, + { + "epoch": 0.0787858347386172, + "grad_norm": 0.1277703046798706, + "learning_rate": 0.0001952545342126958, + "loss": 1.1067, + "step": 1022 + }, + { + "epoch": 0.07894001445434835, + "grad_norm": 0.12644124031066895, + "learning_rate": 0.00019524422918384173, + "loss": 1.0176, + "step": 1024 + }, + { + "epoch": 0.0790941941700795, + "grad_norm": 0.13443627953529358, + "learning_rate": 0.00019523392415498765, + "loss": 1.0754, + "step": 1026 + }, + { + "epoch": 0.07924837388581064, + "grad_norm": 0.1895609050989151, + "learning_rate": 0.00019522361912613356, + "loss": 1.0551, + "step": 1028 + }, + { + "epoch": 0.0794025536015418, + "grad_norm": 0.1372397392988205, + "learning_rate": 0.00019521331409727948, + "loss": 1.0442, + "step": 1030 + }, + { + "epoch": 0.07955673331727295, + "grad_norm": 0.14173942804336548, + "learning_rate": 0.0001952030090684254, + "loss": 1.0692, + "step": 1032 + }, + { + "epoch": 0.0797109130330041, + "grad_norm": 0.12321804463863373, + "learning_rate": 0.00019519270403957134, + "loss": 1.0276, + "step": 1034 + }, + { + "epoch": 0.07986509274873524, + "grad_norm": 0.12327130138874054, + "learning_rate": 0.00019518239901071725, + "loss": 1.0376, + "step": 1036 + }, + { + "epoch": 0.0800192724644664, + "grad_norm": 0.12301841378211975, + "learning_rate": 0.00019517209398186317, + "loss": 1.0887, + "step": 1038 + }, + { + "epoch": 0.08017345218019754, + "grad_norm": 0.1429559886455536, + "learning_rate": 0.00019516178895300908, + "loss": 1.0321, + "step": 1040 + }, + { + "epoch": 0.08032763189592869, + "grad_norm": 0.13955366611480713, + "learning_rate": 0.000195151483924155, + "loss": 1.1081, + "step": 1042 + }, + { + "epoch": 0.08048181161165983, + "grad_norm": 0.13553303480148315, + "learning_rate": 0.00019514117889530094, + "loss": 1.0252, + "step": 1044 + }, + { + "epoch": 0.080635991327391, + "grad_norm": 0.14100225269794464, + "learning_rate": 0.00019513087386644685, + "loss": 1.1071, + "step": 1046 + }, + { + "epoch": 0.08079017104312214, + "grad_norm": 0.14522643387317657, + "learning_rate": 0.00019512056883759277, + "loss": 1.0653, + "step": 1048 + }, + { + "epoch": 0.08094435075885328, + "grad_norm": 0.14540371298789978, + "learning_rate": 0.00019511026380873868, + "loss": 1.01, + "step": 1050 + }, + { + "epoch": 0.08109853047458444, + "grad_norm": 0.1459018737077713, + "learning_rate": 0.0001950999587798846, + "loss": 1.1147, + "step": 1052 + }, + { + "epoch": 0.08125271019031559, + "grad_norm": 0.12590867280960083, + "learning_rate": 0.0001950896537510305, + "loss": 1.0685, + "step": 1054 + }, + { + "epoch": 0.08140688990604673, + "grad_norm": 0.11943504959344864, + "learning_rate": 0.00019507934872217643, + "loss": 1.0854, + "step": 1056 + }, + { + "epoch": 0.08156106962177788, + "grad_norm": 0.12039398401975632, + "learning_rate": 0.00019506904369332234, + "loss": 1.1397, + "step": 1058 + }, + { + "epoch": 0.08171524933750904, + "grad_norm": 0.1411554217338562, + "learning_rate": 0.00019505873866446826, + "loss": 1.1271, + "step": 1060 + }, + { + "epoch": 0.08186942905324018, + "grad_norm": 0.1402871012687683, + "learning_rate": 0.00019504843363561417, + "loss": 1.0425, + "step": 1062 + }, + { + "epoch": 0.08202360876897133, + "grad_norm": 0.13545840978622437, + "learning_rate": 0.00019503812860676011, + "loss": 1.0571, + "step": 1064 + }, + { + "epoch": 0.08217778848470249, + "grad_norm": 0.12789209187030792, + "learning_rate": 0.00019502782357790603, + "loss": 1.0596, + "step": 1066 + }, + { + "epoch": 0.08233196820043363, + "grad_norm": 0.13018928468227386, + "learning_rate": 0.00019501751854905194, + "loss": 1.1188, + "step": 1068 + }, + { + "epoch": 0.08248614791616478, + "grad_norm": 0.12482234835624695, + "learning_rate": 0.00019500721352019786, + "loss": 1.0831, + "step": 1070 + }, + { + "epoch": 0.08264032763189592, + "grad_norm": 0.11897309869527817, + "learning_rate": 0.00019499690849134377, + "loss": 1.0658, + "step": 1072 + }, + { + "epoch": 0.08279450734762708, + "grad_norm": 0.12954497337341309, + "learning_rate": 0.00019498660346248972, + "loss": 1.0204, + "step": 1074 + }, + { + "epoch": 0.08294868706335823, + "grad_norm": 0.14220042526721954, + "learning_rate": 0.00019497629843363563, + "loss": 1.1101, + "step": 1076 + }, + { + "epoch": 0.08310286677908937, + "grad_norm": 0.1631559580564499, + "learning_rate": 0.00019496599340478155, + "loss": 1.1352, + "step": 1078 + }, + { + "epoch": 0.08325704649482052, + "grad_norm": 0.13439539074897766, + "learning_rate": 0.00019495568837592746, + "loss": 1.0108, + "step": 1080 + }, + { + "epoch": 0.08341122621055168, + "grad_norm": 0.12389718741178513, + "learning_rate": 0.00019494538334707338, + "loss": 1.0155, + "step": 1082 + }, + { + "epoch": 0.08356540592628282, + "grad_norm": 0.1241556853055954, + "learning_rate": 0.00019493507831821932, + "loss": 1.1428, + "step": 1084 + }, + { + "epoch": 0.08371958564201397, + "grad_norm": 0.13087880611419678, + "learning_rate": 0.00019492477328936523, + "loss": 1.0876, + "step": 1086 + }, + { + "epoch": 0.08387376535774513, + "grad_norm": 0.12431449443101883, + "learning_rate": 0.00019491446826051115, + "loss": 1.0758, + "step": 1088 + }, + { + "epoch": 0.08402794507347627, + "grad_norm": 0.13807635009288788, + "learning_rate": 0.00019490416323165706, + "loss": 1.0902, + "step": 1090 + }, + { + "epoch": 0.08418212478920742, + "grad_norm": 0.12751048803329468, + "learning_rate": 0.00019489385820280298, + "loss": 1.0732, + "step": 1092 + }, + { + "epoch": 0.08433630450493856, + "grad_norm": 0.15594707429409027, + "learning_rate": 0.00019488355317394892, + "loss": 1.1115, + "step": 1094 + }, + { + "epoch": 0.08449048422066972, + "grad_norm": 0.11647301912307739, + "learning_rate": 0.00019487324814509483, + "loss": 1.1592, + "step": 1096 + }, + { + "epoch": 0.08464466393640087, + "grad_norm": 0.13609850406646729, + "learning_rate": 0.00019486294311624075, + "loss": 1.1139, + "step": 1098 + }, + { + "epoch": 0.08479884365213201, + "grad_norm": 0.1234198659658432, + "learning_rate": 0.00019485263808738666, + "loss": 1.0682, + "step": 1100 + }, + { + "epoch": 0.08479884365213201, + "eval_loss": 1.0920624732971191, + "eval_runtime": 185.5142, + "eval_samples_per_second": 91.33, + "eval_steps_per_second": 1.428, + "step": 1100 + }, + { + "epoch": 0.08495302336786316, + "grad_norm": 0.1375039666891098, + "learning_rate": 0.00019484233305853258, + "loss": 1.0585, + "step": 1102 + }, + { + "epoch": 0.08510720308359432, + "grad_norm": 0.14471521973609924, + "learning_rate": 0.0001948320280296785, + "loss": 1.1115, + "step": 1104 + }, + { + "epoch": 0.08526138279932546, + "grad_norm": 0.12425632029771805, + "learning_rate": 0.0001948217230008244, + "loss": 1.0501, + "step": 1106 + }, + { + "epoch": 0.08541556251505661, + "grad_norm": 0.1161596029996872, + "learning_rate": 0.00019481141797197032, + "loss": 1.0182, + "step": 1108 + }, + { + "epoch": 0.08556974223078777, + "grad_norm": 0.11700072139501572, + "learning_rate": 0.00019480111294311624, + "loss": 1.0579, + "step": 1110 + }, + { + "epoch": 0.08572392194651891, + "grad_norm": 0.14330415427684784, + "learning_rate": 0.00019479080791426215, + "loss": 1.1211, + "step": 1112 + }, + { + "epoch": 0.08587810166225006, + "grad_norm": 0.14039026200771332, + "learning_rate": 0.00019478050288540807, + "loss": 1.0826, + "step": 1114 + }, + { + "epoch": 0.0860322813779812, + "grad_norm": 0.14031362533569336, + "learning_rate": 0.000194770197856554, + "loss": 1.0871, + "step": 1116 + }, + { + "epoch": 0.08618646109371236, + "grad_norm": 0.12351037561893463, + "learning_rate": 0.00019475989282769993, + "loss": 1.001, + "step": 1118 + }, + { + "epoch": 0.08634064080944351, + "grad_norm": 0.11667052656412125, + "learning_rate": 0.00019474958779884584, + "loss": 1.0421, + "step": 1120 + }, + { + "epoch": 0.08649482052517465, + "grad_norm": 0.1489124447107315, + "learning_rate": 0.00019473928276999175, + "loss": 1.1644, + "step": 1122 + }, + { + "epoch": 0.0866490002409058, + "grad_norm": 0.1338202804327011, + "learning_rate": 0.00019472897774113767, + "loss": 1.1239, + "step": 1124 + }, + { + "epoch": 0.08680317995663696, + "grad_norm": 0.13266493380069733, + "learning_rate": 0.0001947186727122836, + "loss": 1.0839, + "step": 1126 + }, + { + "epoch": 0.0869573596723681, + "grad_norm": 0.13726286590099335, + "learning_rate": 0.00019470836768342953, + "loss": 1.1325, + "step": 1128 + }, + { + "epoch": 0.08711153938809925, + "grad_norm": 0.14077100157737732, + "learning_rate": 0.00019469806265457544, + "loss": 1.0429, + "step": 1130 + }, + { + "epoch": 0.08726571910383041, + "grad_norm": 0.1362866312265396, + "learning_rate": 0.00019468775762572136, + "loss": 1.0715, + "step": 1132 + }, + { + "epoch": 0.08741989881956155, + "grad_norm": 0.12472223490476608, + "learning_rate": 0.00019467745259686727, + "loss": 1.0503, + "step": 1134 + }, + { + "epoch": 0.0875740785352927, + "grad_norm": 0.1350635141134262, + "learning_rate": 0.0001946671475680132, + "loss": 1.0498, + "step": 1136 + }, + { + "epoch": 0.08772825825102384, + "grad_norm": 0.1424301117658615, + "learning_rate": 0.00019465684253915913, + "loss": 1.1589, + "step": 1138 + }, + { + "epoch": 0.087882437966755, + "grad_norm": 0.12365067005157471, + "learning_rate": 0.00019464653751030504, + "loss": 1.1065, + "step": 1140 + }, + { + "epoch": 0.08803661768248615, + "grad_norm": 0.16497495770454407, + "learning_rate": 0.00019463623248145096, + "loss": 1.0189, + "step": 1142 + }, + { + "epoch": 0.0881907973982173, + "grad_norm": 0.1381298303604126, + "learning_rate": 0.00019462592745259687, + "loss": 1.0426, + "step": 1144 + }, + { + "epoch": 0.08834497711394845, + "grad_norm": 0.15007291734218597, + "learning_rate": 0.00019461562242374282, + "loss": 1.1108, + "step": 1146 + }, + { + "epoch": 0.0884991568296796, + "grad_norm": 0.19384606182575226, + "learning_rate": 0.00019460531739488873, + "loss": 1.0664, + "step": 1148 + }, + { + "epoch": 0.08865333654541074, + "grad_norm": 0.12032177299261093, + "learning_rate": 0.00019459501236603465, + "loss": 1.018, + "step": 1150 + }, + { + "epoch": 0.08880751626114189, + "grad_norm": 0.1197669506072998, + "learning_rate": 0.00019458470733718056, + "loss": 1.071, + "step": 1152 + }, + { + "epoch": 0.08896169597687305, + "grad_norm": 0.12108784914016724, + "learning_rate": 0.00019457440230832647, + "loss": 1.0499, + "step": 1154 + }, + { + "epoch": 0.0891158756926042, + "grad_norm": 0.1270270049571991, + "learning_rate": 0.0001945640972794724, + "loss": 1.1172, + "step": 1156 + }, + { + "epoch": 0.08927005540833534, + "grad_norm": 0.13599786162376404, + "learning_rate": 0.0001945537922506183, + "loss": 1.103, + "step": 1158 + }, + { + "epoch": 0.08942423512406648, + "grad_norm": 0.12051045894622803, + "learning_rate": 0.00019454348722176422, + "loss": 1.0905, + "step": 1160 + }, + { + "epoch": 0.08957841483979764, + "grad_norm": 0.12117696553468704, + "learning_rate": 0.00019453318219291013, + "loss": 1.0611, + "step": 1162 + }, + { + "epoch": 0.08973259455552879, + "grad_norm": 0.13710887730121613, + "learning_rate": 0.00019452287716405605, + "loss": 1.0242, + "step": 1164 + }, + { + "epoch": 0.08988677427125993, + "grad_norm": 0.1160813644528389, + "learning_rate": 0.000194512572135202, + "loss": 1.0863, + "step": 1166 + }, + { + "epoch": 0.09004095398699109, + "grad_norm": 0.1754099279642105, + "learning_rate": 0.0001945022671063479, + "loss": 1.0938, + "step": 1168 + }, + { + "epoch": 0.09019513370272224, + "grad_norm": 0.1331128627061844, + "learning_rate": 0.00019449196207749382, + "loss": 1.0692, + "step": 1170 + }, + { + "epoch": 0.09034931341845338, + "grad_norm": 0.13422611355781555, + "learning_rate": 0.00019448165704863974, + "loss": 1.0699, + "step": 1172 + }, + { + "epoch": 0.09050349313418453, + "grad_norm": 0.12999802827835083, + "learning_rate": 0.00019447135201978565, + "loss": 1.0957, + "step": 1174 + }, + { + "epoch": 0.09065767284991569, + "grad_norm": 0.13413815200328827, + "learning_rate": 0.0001944610469909316, + "loss": 1.0869, + "step": 1176 + }, + { + "epoch": 0.09081185256564683, + "grad_norm": 0.12901006639003754, + "learning_rate": 0.0001944507419620775, + "loss": 1.0442, + "step": 1178 + }, + { + "epoch": 0.09096603228137798, + "grad_norm": 0.11824194341897964, + "learning_rate": 0.00019444043693322342, + "loss": 1.0935, + "step": 1180 + }, + { + "epoch": 0.09112021199710912, + "grad_norm": 0.14895616471767426, + "learning_rate": 0.00019443013190436934, + "loss": 1.0624, + "step": 1182 + }, + { + "epoch": 0.09127439171284028, + "grad_norm": 0.13515722751617432, + "learning_rate": 0.00019441982687551525, + "loss": 1.0797, + "step": 1184 + }, + { + "epoch": 0.09142857142857143, + "grad_norm": 0.13411575555801392, + "learning_rate": 0.00019440952184666117, + "loss": 1.0637, + "step": 1186 + }, + { + "epoch": 0.09158275114430257, + "grad_norm": 0.12519463896751404, + "learning_rate": 0.0001943992168178071, + "loss": 1.0608, + "step": 1188 + }, + { + "epoch": 0.09173693086003373, + "grad_norm": 0.1267428696155548, + "learning_rate": 0.00019438891178895302, + "loss": 1.0182, + "step": 1190 + }, + { + "epoch": 0.09189111057576488, + "grad_norm": 0.13116560876369476, + "learning_rate": 0.00019437860676009894, + "loss": 1.1139, + "step": 1192 + }, + { + "epoch": 0.09204529029149602, + "grad_norm": 0.14659713208675385, + "learning_rate": 0.00019436830173124485, + "loss": 1.1275, + "step": 1194 + }, + { + "epoch": 0.09219947000722717, + "grad_norm": 0.12913885712623596, + "learning_rate": 0.00019435799670239077, + "loss": 1.0858, + "step": 1196 + }, + { + "epoch": 0.09235364972295833, + "grad_norm": 0.12855856120586395, + "learning_rate": 0.0001943476916735367, + "loss": 1.0811, + "step": 1198 + }, + { + "epoch": 0.09250782943868947, + "grad_norm": 0.1391747146844864, + "learning_rate": 0.00019433738664468263, + "loss": 1.0146, + "step": 1200 + }, + { + "epoch": 0.09250782943868947, + "eval_loss": 1.0912913084030151, + "eval_runtime": 185.3661, + "eval_samples_per_second": 91.403, + "eval_steps_per_second": 1.43, + "step": 1200 + }, + { + "epoch": 0.09266200915442062, + "grad_norm": 0.13186782598495483, + "learning_rate": 0.00019432708161582854, + "loss": 1.1017, + "step": 1202 + }, + { + "epoch": 0.09281618887015176, + "grad_norm": 0.12913943827152252, + "learning_rate": 0.00019431677658697446, + "loss": 1.1027, + "step": 1204 + }, + { + "epoch": 0.09297036858588292, + "grad_norm": 0.1349743753671646, + "learning_rate": 0.00019430647155812037, + "loss": 1.1023, + "step": 1206 + }, + { + "epoch": 0.09312454830161407, + "grad_norm": 0.12534667551517487, + "learning_rate": 0.00019429616652926629, + "loss": 1.0659, + "step": 1208 + }, + { + "epoch": 0.09327872801734521, + "grad_norm": 0.11720700562000275, + "learning_rate": 0.0001942858615004122, + "loss": 1.0532, + "step": 1210 + }, + { + "epoch": 0.09343290773307637, + "grad_norm": 0.1364222913980484, + "learning_rate": 0.00019427555647155812, + "loss": 1.0575, + "step": 1212 + }, + { + "epoch": 0.09358708744880752, + "grad_norm": 0.15532977879047394, + "learning_rate": 0.00019426525144270403, + "loss": 1.1145, + "step": 1214 + }, + { + "epoch": 0.09374126716453866, + "grad_norm": 0.1377478837966919, + "learning_rate": 0.00019425494641384995, + "loss": 1.0505, + "step": 1216 + }, + { + "epoch": 0.09389544688026981, + "grad_norm": 0.1273409128189087, + "learning_rate": 0.0001942446413849959, + "loss": 1.0873, + "step": 1218 + }, + { + "epoch": 0.09404962659600097, + "grad_norm": 0.11990435421466827, + "learning_rate": 0.0001942343363561418, + "loss": 1.0829, + "step": 1220 + }, + { + "epoch": 0.09420380631173211, + "grad_norm": 0.14191892743110657, + "learning_rate": 0.00019422403132728772, + "loss": 1.0992, + "step": 1222 + }, + { + "epoch": 0.09435798602746326, + "grad_norm": 0.14520397782325745, + "learning_rate": 0.00019421372629843363, + "loss": 1.0712, + "step": 1224 + }, + { + "epoch": 0.09451216574319442, + "grad_norm": 0.13780727982521057, + "learning_rate": 0.00019420342126957955, + "loss": 0.9943, + "step": 1226 + }, + { + "epoch": 0.09466634545892556, + "grad_norm": 0.13550738990306854, + "learning_rate": 0.0001941931162407255, + "loss": 1.1264, + "step": 1228 + }, + { + "epoch": 0.09482052517465671, + "grad_norm": 0.12125276774168015, + "learning_rate": 0.0001941828112118714, + "loss": 1.1207, + "step": 1230 + }, + { + "epoch": 0.09497470489038785, + "grad_norm": 0.14529301226139069, + "learning_rate": 0.00019417250618301732, + "loss": 1.144, + "step": 1232 + }, + { + "epoch": 0.09512888460611901, + "grad_norm": 0.15477551519870758, + "learning_rate": 0.00019416220115416323, + "loss": 1.0568, + "step": 1234 + }, + { + "epoch": 0.09528306432185016, + "grad_norm": 0.1299963742494583, + "learning_rate": 0.00019415189612530915, + "loss": 1.0235, + "step": 1236 + }, + { + "epoch": 0.0954372440375813, + "grad_norm": 0.1372281014919281, + "learning_rate": 0.0001941415910964551, + "loss": 1.0764, + "step": 1238 + }, + { + "epoch": 0.09559142375331245, + "grad_norm": 0.1247306764125824, + "learning_rate": 0.000194131286067601, + "loss": 1.1345, + "step": 1240 + }, + { + "epoch": 0.09574560346904361, + "grad_norm": 0.1330571472644806, + "learning_rate": 0.00019412098103874692, + "loss": 1.1596, + "step": 1242 + }, + { + "epoch": 0.09589978318477475, + "grad_norm": 0.15787385404109955, + "learning_rate": 0.00019411067600989284, + "loss": 1.1067, + "step": 1244 + }, + { + "epoch": 0.0960539629005059, + "grad_norm": 0.12646274268627167, + "learning_rate": 0.00019410037098103875, + "loss": 1.0769, + "step": 1246 + }, + { + "epoch": 0.09620814261623706, + "grad_norm": 0.16424262523651123, + "learning_rate": 0.0001940900659521847, + "loss": 1.0459, + "step": 1248 + }, + { + "epoch": 0.0963623223319682, + "grad_norm": 0.1401062309741974, + "learning_rate": 0.0001940797609233306, + "loss": 1.1308, + "step": 1250 + }, + { + "epoch": 0.09651650204769935, + "grad_norm": 0.13971561193466187, + "learning_rate": 0.00019406945589447652, + "loss": 1.1457, + "step": 1252 + }, + { + "epoch": 0.0966706817634305, + "grad_norm": 0.13544687628746033, + "learning_rate": 0.00019405915086562244, + "loss": 1.0532, + "step": 1254 + }, + { + "epoch": 0.09682486147916165, + "grad_norm": 0.13527531921863556, + "learning_rate": 0.00019404884583676835, + "loss": 1.0376, + "step": 1256 + }, + { + "epoch": 0.0969790411948928, + "grad_norm": 0.1731848120689392, + "learning_rate": 0.0001940385408079143, + "loss": 1.2252, + "step": 1258 + }, + { + "epoch": 0.09713322091062394, + "grad_norm": 0.13142083585262299, + "learning_rate": 0.0001940282357790602, + "loss": 1.0254, + "step": 1260 + }, + { + "epoch": 0.09728740062635509, + "grad_norm": 0.13390247523784637, + "learning_rate": 0.00019401793075020612, + "loss": 1.0448, + "step": 1262 + }, + { + "epoch": 0.09744158034208625, + "grad_norm": 0.15188650786876678, + "learning_rate": 0.00019400762572135204, + "loss": 1.1019, + "step": 1264 + }, + { + "epoch": 0.0975957600578174, + "grad_norm": 0.14055617153644562, + "learning_rate": 0.00019399732069249795, + "loss": 1.0835, + "step": 1266 + }, + { + "epoch": 0.09774993977354854, + "grad_norm": 0.12209255248308182, + "learning_rate": 0.00019398701566364387, + "loss": 1.0675, + "step": 1268 + }, + { + "epoch": 0.0979041194892797, + "grad_norm": 0.14639706909656525, + "learning_rate": 0.00019397671063478978, + "loss": 1.049, + "step": 1270 + }, + { + "epoch": 0.09805829920501084, + "grad_norm": 0.13672591745853424, + "learning_rate": 0.0001939664056059357, + "loss": 1.1057, + "step": 1272 + }, + { + "epoch": 0.09821247892074199, + "grad_norm": 0.1522635966539383, + "learning_rate": 0.00019395610057708161, + "loss": 1.14, + "step": 1274 + }, + { + "epoch": 0.09836665863647313, + "grad_norm": 0.13887491822242737, + "learning_rate": 0.00019394579554822753, + "loss": 1.069, + "step": 1276 + }, + { + "epoch": 0.09852083835220429, + "grad_norm": 0.13854965567588806, + "learning_rate": 0.00019393549051937344, + "loss": 1.0704, + "step": 1278 + }, + { + "epoch": 0.09867501806793544, + "grad_norm": 0.12839765846729279, + "learning_rate": 0.00019392518549051939, + "loss": 1.0512, + "step": 1280 + }, + { + "epoch": 0.09882919778366658, + "grad_norm": 0.1270405352115631, + "learning_rate": 0.0001939148804616653, + "loss": 1.0251, + "step": 1282 + }, + { + "epoch": 0.09898337749939773, + "grad_norm": 0.1269143521785736, + "learning_rate": 0.00019390457543281122, + "loss": 1.0433, + "step": 1284 + }, + { + "epoch": 0.09913755721512889, + "grad_norm": 0.14292192459106445, + "learning_rate": 0.00019389427040395713, + "loss": 1.1507, + "step": 1286 + }, + { + "epoch": 0.09929173693086003, + "grad_norm": 0.12512263655662537, + "learning_rate": 0.00019388396537510305, + "loss": 1.0918, + "step": 1288 + }, + { + "epoch": 0.09944591664659118, + "grad_norm": 0.11927679181098938, + "learning_rate": 0.000193873660346249, + "loss": 1.0924, + "step": 1290 + }, + { + "epoch": 0.09960009636232234, + "grad_norm": 0.13639990985393524, + "learning_rate": 0.0001938633553173949, + "loss": 1.1024, + "step": 1292 + }, + { + "epoch": 0.09975427607805348, + "grad_norm": 0.142363503575325, + "learning_rate": 0.00019385305028854082, + "loss": 1.021, + "step": 1294 + }, + { + "epoch": 0.09990845579378463, + "grad_norm": 0.1389359086751938, + "learning_rate": 0.00019384274525968673, + "loss": 1.0269, + "step": 1296 + }, + { + "epoch": 0.10006263550951577, + "grad_norm": 0.15595073997974396, + "learning_rate": 0.00019383244023083265, + "loss": 1.0913, + "step": 1298 + }, + { + "epoch": 0.10021681522524693, + "grad_norm": 0.1324295848608017, + "learning_rate": 0.0001938221352019786, + "loss": 1.1001, + "step": 1300 + }, + { + "epoch": 0.10021681522524693, + "eval_loss": 1.0909266471862793, + "eval_runtime": 185.4116, + "eval_samples_per_second": 91.38, + "eval_steps_per_second": 1.429, + "step": 1300 + }, + { + "epoch": 0.10037099494097808, + "grad_norm": 0.139576256275177, + "learning_rate": 0.0001938118301731245, + "loss": 1.1147, + "step": 1302 + }, + { + "epoch": 0.10052517465670922, + "grad_norm": 0.12854811549186707, + "learning_rate": 0.00019380152514427042, + "loss": 1.0973, + "step": 1304 + }, + { + "epoch": 0.10067935437244037, + "grad_norm": 0.1245393380522728, + "learning_rate": 0.00019379122011541633, + "loss": 1.0485, + "step": 1306 + }, + { + "epoch": 0.10083353408817153, + "grad_norm": 0.13261497020721436, + "learning_rate": 0.00019378091508656225, + "loss": 1.156, + "step": 1308 + }, + { + "epoch": 0.10098771380390267, + "grad_norm": 0.1255144327878952, + "learning_rate": 0.0001937706100577082, + "loss": 1.0852, + "step": 1310 + }, + { + "epoch": 0.10114189351963382, + "grad_norm": 0.1412706971168518, + "learning_rate": 0.0001937603050288541, + "loss": 1.0766, + "step": 1312 + }, + { + "epoch": 0.10129607323536498, + "grad_norm": 0.1281047761440277, + "learning_rate": 0.00019375000000000002, + "loss": 1.0824, + "step": 1314 + }, + { + "epoch": 0.10145025295109612, + "grad_norm": 0.13307350873947144, + "learning_rate": 0.00019373969497114594, + "loss": 1.0887, + "step": 1316 + }, + { + "epoch": 0.10160443266682727, + "grad_norm": 0.1287691742181778, + "learning_rate": 0.00019372938994229185, + "loss": 1.0705, + "step": 1318 + }, + { + "epoch": 0.10175861238255841, + "grad_norm": 0.1303441971540451, + "learning_rate": 0.00019371908491343777, + "loss": 1.1684, + "step": 1320 + }, + { + "epoch": 0.10191279209828957, + "grad_norm": 0.13304616510868073, + "learning_rate": 0.00019370877988458368, + "loss": 1.0944, + "step": 1322 + }, + { + "epoch": 0.10206697181402072, + "grad_norm": 0.13905592262744904, + "learning_rate": 0.0001936984748557296, + "loss": 1.0915, + "step": 1324 + }, + { + "epoch": 0.10222115152975186, + "grad_norm": 0.13225632905960083, + "learning_rate": 0.0001936881698268755, + "loss": 1.0418, + "step": 1326 + }, + { + "epoch": 0.10237533124548302, + "grad_norm": 0.1267402619123459, + "learning_rate": 0.00019367786479802142, + "loss": 1.0446, + "step": 1328 + }, + { + "epoch": 0.10252951096121417, + "grad_norm": 0.1439935863018036, + "learning_rate": 0.00019366755976916737, + "loss": 1.0582, + "step": 1330 + }, + { + "epoch": 0.10268369067694531, + "grad_norm": 0.1267223060131073, + "learning_rate": 0.00019365725474031328, + "loss": 1.0176, + "step": 1332 + }, + { + "epoch": 0.10283787039267646, + "grad_norm": 0.1298942118883133, + "learning_rate": 0.0001936469497114592, + "loss": 1.0552, + "step": 1334 + }, + { + "epoch": 0.10299205010840762, + "grad_norm": 0.13010933995246887, + "learning_rate": 0.0001936366446826051, + "loss": 1.0848, + "step": 1336 + }, + { + "epoch": 0.10314622982413876, + "grad_norm": 0.13728559017181396, + "learning_rate": 0.00019362633965375103, + "loss": 1.0779, + "step": 1338 + }, + { + "epoch": 0.10330040953986991, + "grad_norm": 0.13863548636436462, + "learning_rate": 0.00019361603462489697, + "loss": 1.0326, + "step": 1340 + }, + { + "epoch": 0.10345458925560105, + "grad_norm": 0.12995532155036926, + "learning_rate": 0.00019360572959604288, + "loss": 1.1427, + "step": 1342 + }, + { + "epoch": 0.10360876897133221, + "grad_norm": 0.13650789856910706, + "learning_rate": 0.0001935954245671888, + "loss": 1.0528, + "step": 1344 + }, + { + "epoch": 0.10376294868706336, + "grad_norm": 0.1336941123008728, + "learning_rate": 0.0001935851195383347, + "loss": 1.1155, + "step": 1346 + }, + { + "epoch": 0.1039171284027945, + "grad_norm": 0.13927003741264343, + "learning_rate": 0.00019357481450948063, + "loss": 1.0551, + "step": 1348 + }, + { + "epoch": 0.10407130811852566, + "grad_norm": 0.14504994451999664, + "learning_rate": 0.00019356450948062657, + "loss": 1.1014, + "step": 1350 + }, + { + "epoch": 0.10422548783425681, + "grad_norm": 0.15796230733394623, + "learning_rate": 0.00019355420445177248, + "loss": 1.2115, + "step": 1352 + }, + { + "epoch": 0.10437966754998795, + "grad_norm": 0.1317984163761139, + "learning_rate": 0.0001935438994229184, + "loss": 1.0933, + "step": 1354 + }, + { + "epoch": 0.1045338472657191, + "grad_norm": 0.13189563155174255, + "learning_rate": 0.00019353359439406431, + "loss": 1.0664, + "step": 1356 + }, + { + "epoch": 0.10468802698145026, + "grad_norm": 0.1323234885931015, + "learning_rate": 0.00019352328936521023, + "loss": 1.0824, + "step": 1358 + }, + { + "epoch": 0.1048422066971814, + "grad_norm": 0.13659097254276276, + "learning_rate": 0.00019351298433635614, + "loss": 1.0334, + "step": 1360 + }, + { + "epoch": 0.10499638641291255, + "grad_norm": 0.11882172524929047, + "learning_rate": 0.0001935026793075021, + "loss": 1.0401, + "step": 1362 + }, + { + "epoch": 0.1051505661286437, + "grad_norm": 0.13025067746639252, + "learning_rate": 0.000193492374278648, + "loss": 1.0838, + "step": 1364 + }, + { + "epoch": 0.10530474584437485, + "grad_norm": 0.1249939501285553, + "learning_rate": 0.00019348206924979392, + "loss": 1.0349, + "step": 1366 + }, + { + "epoch": 0.105458925560106, + "grad_norm": 0.12588031589984894, + "learning_rate": 0.00019347176422093983, + "loss": 1.079, + "step": 1368 + }, + { + "epoch": 0.10561310527583714, + "grad_norm": 0.12548890709877014, + "learning_rate": 0.00019346145919208575, + "loss": 1.0062, + "step": 1370 + }, + { + "epoch": 0.1057672849915683, + "grad_norm": 0.13328798115253448, + "learning_rate": 0.00019345115416323166, + "loss": 1.1154, + "step": 1372 + }, + { + "epoch": 0.10592146470729945, + "grad_norm": 0.1443903148174286, + "learning_rate": 0.00019344084913437758, + "loss": 1.097, + "step": 1374 + }, + { + "epoch": 0.1060756444230306, + "grad_norm": 0.12835648655891418, + "learning_rate": 0.0001934305441055235, + "loss": 1.0723, + "step": 1376 + }, + { + "epoch": 0.10622982413876174, + "grad_norm": 0.13068312406539917, + "learning_rate": 0.0001934202390766694, + "loss": 1.1128, + "step": 1378 + }, + { + "epoch": 0.1063840038544929, + "grad_norm": 0.13628961145877838, + "learning_rate": 0.00019340993404781532, + "loss": 1.1146, + "step": 1380 + }, + { + "epoch": 0.10653818357022404, + "grad_norm": 0.12263484299182892, + "learning_rate": 0.00019339962901896126, + "loss": 1.0947, + "step": 1382 + }, + { + "epoch": 0.10669236328595519, + "grad_norm": 0.12684424221515656, + "learning_rate": 0.00019338932399010718, + "loss": 1.059, + "step": 1384 + }, + { + "epoch": 0.10684654300168633, + "grad_norm": 0.1421595960855484, + "learning_rate": 0.0001933790189612531, + "loss": 1.0688, + "step": 1386 + }, + { + "epoch": 0.10700072271741749, + "grad_norm": 0.12416025251150131, + "learning_rate": 0.000193368713932399, + "loss": 1.0905, + "step": 1388 + }, + { + "epoch": 0.10715490243314864, + "grad_norm": 0.1284332126379013, + "learning_rate": 0.00019335840890354492, + "loss": 1.0612, + "step": 1390 + }, + { + "epoch": 0.10730908214887978, + "grad_norm": 0.1282491385936737, + "learning_rate": 0.00019334810387469086, + "loss": 1.0851, + "step": 1392 + }, + { + "epoch": 0.10746326186461094, + "grad_norm": 0.13221289217472076, + "learning_rate": 0.00019333779884583678, + "loss": 1.0446, + "step": 1394 + }, + { + "epoch": 0.10761744158034209, + "grad_norm": 0.12401736527681351, + "learning_rate": 0.0001933274938169827, + "loss": 1.0826, + "step": 1396 + }, + { + "epoch": 0.10777162129607323, + "grad_norm": 0.14316771924495697, + "learning_rate": 0.0001933171887881286, + "loss": 1.1136, + "step": 1398 + }, + { + "epoch": 0.10792580101180438, + "grad_norm": 0.17223364114761353, + "learning_rate": 0.00019330688375927452, + "loss": 1.0752, + "step": 1400 + }, + { + "epoch": 0.10792580101180438, + "eval_loss": 1.0899540185928345, + "eval_runtime": 185.3818, + "eval_samples_per_second": 91.395, + "eval_steps_per_second": 1.429, + "step": 1400 + }, + { + "epoch": 0.10807998072753554, + "grad_norm": 0.15027141571044922, + "learning_rate": 0.00019329657873042047, + "loss": 1.0371, + "step": 1402 + }, + { + "epoch": 0.10823416044326668, + "grad_norm": 0.19876505434513092, + "learning_rate": 0.00019328627370156638, + "loss": 1.0312, + "step": 1404 + }, + { + "epoch": 0.10838834015899783, + "grad_norm": 0.1422131210565567, + "learning_rate": 0.0001932759686727123, + "loss": 1.0597, + "step": 1406 + }, + { + "epoch": 0.10854251987472899, + "grad_norm": 0.13597753643989563, + "learning_rate": 0.0001932656636438582, + "loss": 1.0939, + "step": 1408 + }, + { + "epoch": 0.10869669959046013, + "grad_norm": 0.16808953881263733, + "learning_rate": 0.00019325535861500413, + "loss": 1.1221, + "step": 1410 + }, + { + "epoch": 0.10885087930619128, + "grad_norm": 0.14884881675243378, + "learning_rate": 0.00019324505358615007, + "loss": 1.1114, + "step": 1412 + }, + { + "epoch": 0.10900505902192242, + "grad_norm": 0.12680503726005554, + "learning_rate": 0.00019323474855729598, + "loss": 1.1032, + "step": 1414 + }, + { + "epoch": 0.10915923873765358, + "grad_norm": 0.13997766375541687, + "learning_rate": 0.0001932244435284419, + "loss": 1.0799, + "step": 1416 + }, + { + "epoch": 0.10931341845338473, + "grad_norm": 0.1343669593334198, + "learning_rate": 0.0001932141384995878, + "loss": 1.0778, + "step": 1418 + }, + { + "epoch": 0.10946759816911587, + "grad_norm": 0.12029851973056793, + "learning_rate": 0.00019320383347073373, + "loss": 1.1021, + "step": 1420 + }, + { + "epoch": 0.10962177788484702, + "grad_norm": 0.1322990357875824, + "learning_rate": 0.00019319352844187967, + "loss": 1.1061, + "step": 1422 + }, + { + "epoch": 0.10977595760057818, + "grad_norm": 0.13710594177246094, + "learning_rate": 0.00019318322341302558, + "loss": 1.0786, + "step": 1424 + }, + { + "epoch": 0.10993013731630932, + "grad_norm": 0.11956049501895905, + "learning_rate": 0.0001931729183841715, + "loss": 1.0711, + "step": 1426 + }, + { + "epoch": 0.11008431703204047, + "grad_norm": 0.139973446726799, + "learning_rate": 0.00019316261335531741, + "loss": 1.1162, + "step": 1428 + }, + { + "epoch": 0.11023849674777163, + "grad_norm": 0.1525941640138626, + "learning_rate": 0.00019315230832646333, + "loss": 1.0572, + "step": 1430 + }, + { + "epoch": 0.11039267646350277, + "grad_norm": 0.1349973976612091, + "learning_rate": 0.00019314200329760924, + "loss": 1.1048, + "step": 1432 + }, + { + "epoch": 0.11054685617923392, + "grad_norm": 0.1305711269378662, + "learning_rate": 0.00019313169826875516, + "loss": 1.0841, + "step": 1434 + }, + { + "epoch": 0.11070103589496506, + "grad_norm": 0.16756822168827057, + "learning_rate": 0.00019312139323990107, + "loss": 1.0736, + "step": 1436 + }, + { + "epoch": 0.11085521561069622, + "grad_norm": 0.13367486000061035, + "learning_rate": 0.000193111088211047, + "loss": 1.0774, + "step": 1438 + }, + { + "epoch": 0.11100939532642737, + "grad_norm": 0.12484605610370636, + "learning_rate": 0.0001931007831821929, + "loss": 1.1196, + "step": 1440 + }, + { + "epoch": 0.11116357504215851, + "grad_norm": 0.14064739644527435, + "learning_rate": 0.00019309047815333885, + "loss": 1.1101, + "step": 1442 + }, + { + "epoch": 0.11131775475788966, + "grad_norm": 0.1366916447877884, + "learning_rate": 0.00019308017312448476, + "loss": 1.111, + "step": 1444 + }, + { + "epoch": 0.11147193447362082, + "grad_norm": 0.11520934104919434, + "learning_rate": 0.00019306986809563068, + "loss": 1.065, + "step": 1446 + }, + { + "epoch": 0.11162611418935196, + "grad_norm": 0.15567731857299805, + "learning_rate": 0.0001930595630667766, + "loss": 1.1036, + "step": 1448 + }, + { + "epoch": 0.11178029390508311, + "grad_norm": 0.13628730177879333, + "learning_rate": 0.0001930492580379225, + "loss": 1.0717, + "step": 1450 + }, + { + "epoch": 0.11193447362081427, + "grad_norm": 0.1359964907169342, + "learning_rate": 0.00019303895300906842, + "loss": 1.0986, + "step": 1452 + }, + { + "epoch": 0.11208865333654541, + "grad_norm": 0.16372162103652954, + "learning_rate": 0.00019302864798021436, + "loss": 1.0306, + "step": 1454 + }, + { + "epoch": 0.11224283305227656, + "grad_norm": 0.1724134087562561, + "learning_rate": 0.00019301834295136028, + "loss": 1.0753, + "step": 1456 + }, + { + "epoch": 0.1123970127680077, + "grad_norm": 0.13646383583545685, + "learning_rate": 0.0001930080379225062, + "loss": 1.0975, + "step": 1458 + }, + { + "epoch": 0.11255119248373886, + "grad_norm": 0.1522134691476822, + "learning_rate": 0.0001929977328936521, + "loss": 1.1031, + "step": 1460 + }, + { + "epoch": 0.11270537219947001, + "grad_norm": 0.13656160235404968, + "learning_rate": 0.00019298742786479802, + "loss": 1.0602, + "step": 1462 + }, + { + "epoch": 0.11285955191520115, + "grad_norm": 0.14140130579471588, + "learning_rate": 0.00019297712283594396, + "loss": 1.1289, + "step": 1464 + }, + { + "epoch": 0.1130137316309323, + "grad_norm": 0.1383032351732254, + "learning_rate": 0.00019296681780708988, + "loss": 1.0797, + "step": 1466 + }, + { + "epoch": 0.11316791134666346, + "grad_norm": 0.15723556280136108, + "learning_rate": 0.0001929565127782358, + "loss": 1.1156, + "step": 1468 + }, + { + "epoch": 0.1133220910623946, + "grad_norm": 0.13462230563163757, + "learning_rate": 0.0001929462077493817, + "loss": 1.0953, + "step": 1470 + }, + { + "epoch": 0.11347627077812575, + "grad_norm": 0.14101319015026093, + "learning_rate": 0.00019293590272052762, + "loss": 1.1152, + "step": 1472 + }, + { + "epoch": 0.11363045049385691, + "grad_norm": 0.13705132901668549, + "learning_rate": 0.00019292559769167357, + "loss": 1.0886, + "step": 1474 + }, + { + "epoch": 0.11378463020958805, + "grad_norm": 0.1206672340631485, + "learning_rate": 0.00019291529266281948, + "loss": 1.0995, + "step": 1476 + }, + { + "epoch": 0.1139388099253192, + "grad_norm": 0.13666383922100067, + "learning_rate": 0.0001929049876339654, + "loss": 1.058, + "step": 1478 + }, + { + "epoch": 0.11409298964105034, + "grad_norm": 0.1265423446893692, + "learning_rate": 0.0001928946826051113, + "loss": 1.0676, + "step": 1480 + }, + { + "epoch": 0.1142471693567815, + "grad_norm": 0.1528097242116928, + "learning_rate": 0.00019288437757625723, + "loss": 1.0675, + "step": 1482 + }, + { + "epoch": 0.11440134907251265, + "grad_norm": 0.16541676223278046, + "learning_rate": 0.00019287407254740314, + "loss": 1.1539, + "step": 1484 + }, + { + "epoch": 0.1145555287882438, + "grad_norm": 0.20383091270923615, + "learning_rate": 0.00019286376751854906, + "loss": 1.0472, + "step": 1486 + }, + { + "epoch": 0.11470970850397495, + "grad_norm": 0.13806484639644623, + "learning_rate": 0.00019285346248969497, + "loss": 1.0408, + "step": 1488 + }, + { + "epoch": 0.1148638882197061, + "grad_norm": 0.1251746118068695, + "learning_rate": 0.00019284315746084089, + "loss": 1.1207, + "step": 1490 + }, + { + "epoch": 0.11501806793543724, + "grad_norm": 0.13218504190444946, + "learning_rate": 0.0001928328524319868, + "loss": 1.1131, + "step": 1492 + }, + { + "epoch": 0.11517224765116839, + "grad_norm": 0.21616914868354797, + "learning_rate": 0.00019282254740313274, + "loss": 1.1103, + "step": 1494 + }, + { + "epoch": 0.11532642736689955, + "grad_norm": 0.1437305361032486, + "learning_rate": 0.00019281224237427866, + "loss": 1.1243, + "step": 1496 + }, + { + "epoch": 0.11548060708263069, + "grad_norm": 0.13094168901443481, + "learning_rate": 0.00019280193734542457, + "loss": 1.1012, + "step": 1498 + }, + { + "epoch": 0.11563478679836184, + "grad_norm": 0.12384334206581116, + "learning_rate": 0.0001927916323165705, + "loss": 1.05, + "step": 1500 + }, + { + "epoch": 0.11563478679836184, + "eval_loss": 1.0905406475067139, + "eval_runtime": 185.4473, + "eval_samples_per_second": 91.363, + "eval_steps_per_second": 1.429, + "step": 1500 + }, + { + "epoch": 0.11578896651409298, + "grad_norm": 0.12807106971740723, + "learning_rate": 0.0001927813272877164, + "loss": 1.0754, + "step": 1502 + }, + { + "epoch": 0.11594314622982414, + "grad_norm": 0.12517131865024567, + "learning_rate": 0.00019277102225886234, + "loss": 1.1017, + "step": 1504 + }, + { + "epoch": 0.11609732594555529, + "grad_norm": 0.1704496592283249, + "learning_rate": 0.00019276071723000826, + "loss": 1.098, + "step": 1506 + }, + { + "epoch": 0.11625150566128643, + "grad_norm": 0.12152231484651566, + "learning_rate": 0.00019275041220115417, + "loss": 1.0738, + "step": 1508 + }, + { + "epoch": 0.11640568537701759, + "grad_norm": 0.12952156364917755, + "learning_rate": 0.0001927401071723001, + "loss": 1.0479, + "step": 1510 + }, + { + "epoch": 0.11655986509274874, + "grad_norm": 0.1499640941619873, + "learning_rate": 0.000192729802143446, + "loss": 1.1046, + "step": 1512 + }, + { + "epoch": 0.11671404480847988, + "grad_norm": 0.1331593543291092, + "learning_rate": 0.00019271949711459195, + "loss": 1.1219, + "step": 1514 + }, + { + "epoch": 0.11686822452421103, + "grad_norm": 0.1368558406829834, + "learning_rate": 0.00019270919208573786, + "loss": 1.1357, + "step": 1516 + }, + { + "epoch": 0.11702240423994219, + "grad_norm": 0.12278290838003159, + "learning_rate": 0.00019269888705688378, + "loss": 1.1079, + "step": 1518 + }, + { + "epoch": 0.11717658395567333, + "grad_norm": 0.11737775802612305, + "learning_rate": 0.0001926885820280297, + "loss": 1.1224, + "step": 1520 + }, + { + "epoch": 0.11733076367140448, + "grad_norm": 0.13017341494560242, + "learning_rate": 0.0001926782769991756, + "loss": 1.0648, + "step": 1522 + }, + { + "epoch": 0.11748494338713562, + "grad_norm": 0.11939583718776703, + "learning_rate": 0.00019266797197032155, + "loss": 1.0899, + "step": 1524 + }, + { + "epoch": 0.11763912310286678, + "grad_norm": 0.12446755915880203, + "learning_rate": 0.00019265766694146746, + "loss": 1.0626, + "step": 1526 + }, + { + "epoch": 0.11779330281859793, + "grad_norm": 0.13369430601596832, + "learning_rate": 0.00019264736191261338, + "loss": 1.0526, + "step": 1528 + }, + { + "epoch": 0.11794748253432907, + "grad_norm": 0.13470736145973206, + "learning_rate": 0.0001926370568837593, + "loss": 1.0946, + "step": 1530 + }, + { + "epoch": 0.11810166225006023, + "grad_norm": 0.14193174242973328, + "learning_rate": 0.0001926267518549052, + "loss": 1.1089, + "step": 1532 + }, + { + "epoch": 0.11825584196579138, + "grad_norm": 0.14893026649951935, + "learning_rate": 0.00019261644682605112, + "loss": 1.0606, + "step": 1534 + }, + { + "epoch": 0.11841002168152252, + "grad_norm": 0.20594976842403412, + "learning_rate": 0.00019260614179719704, + "loss": 1.0375, + "step": 1536 + }, + { + "epoch": 0.11856420139725367, + "grad_norm": 0.15287873148918152, + "learning_rate": 0.00019259583676834295, + "loss": 1.1414, + "step": 1538 + }, + { + "epoch": 0.11871838111298483, + "grad_norm": 0.1275177299976349, + "learning_rate": 0.00019258553173948887, + "loss": 1.1084, + "step": 1540 + }, + { + "epoch": 0.11887256082871597, + "grad_norm": 0.20036157965660095, + "learning_rate": 0.00019257522671063478, + "loss": 1.1261, + "step": 1542 + }, + { + "epoch": 0.11902674054444712, + "grad_norm": 0.14492087066173553, + "learning_rate": 0.0001925649216817807, + "loss": 1.1137, + "step": 1544 + }, + { + "epoch": 0.11918092026017826, + "grad_norm": 0.1259312629699707, + "learning_rate": 0.00019255461665292664, + "loss": 1.0409, + "step": 1546 + }, + { + "epoch": 0.11933509997590942, + "grad_norm": 0.1296795755624771, + "learning_rate": 0.00019254431162407255, + "loss": 1.0332, + "step": 1548 + }, + { + "epoch": 0.11948927969164057, + "grad_norm": 0.13372276723384857, + "learning_rate": 0.00019253400659521847, + "loss": 1.1087, + "step": 1550 + }, + { + "epoch": 0.11964345940737171, + "grad_norm": 0.14354725182056427, + "learning_rate": 0.00019252370156636438, + "loss": 1.0398, + "step": 1552 + }, + { + "epoch": 0.11979763912310287, + "grad_norm": 0.1378318965435028, + "learning_rate": 0.0001925133965375103, + "loss": 1.0542, + "step": 1554 + }, + { + "epoch": 0.11995181883883402, + "grad_norm": 0.12171255797147751, + "learning_rate": 0.00019250309150865624, + "loss": 1.0935, + "step": 1556 + }, + { + "epoch": 0.12010599855456516, + "grad_norm": 0.11905664205551147, + "learning_rate": 0.00019249278647980215, + "loss": 1.0097, + "step": 1558 + }, + { + "epoch": 0.12026017827029631, + "grad_norm": 0.12854760885238647, + "learning_rate": 0.00019248248145094807, + "loss": 1.1517, + "step": 1560 + }, + { + "epoch": 0.12041435798602747, + "grad_norm": 0.247908353805542, + "learning_rate": 0.00019247217642209398, + "loss": 1.0876, + "step": 1562 + }, + { + "epoch": 0.12056853770175861, + "grad_norm": 0.1441553235054016, + "learning_rate": 0.0001924618713932399, + "loss": 1.1414, + "step": 1564 + }, + { + "epoch": 0.12072271741748976, + "grad_norm": 0.13307887315750122, + "learning_rate": 0.00019245156636438584, + "loss": 1.1012, + "step": 1566 + }, + { + "epoch": 0.12087689713322092, + "grad_norm": 0.14192406833171844, + "learning_rate": 0.00019244126133553176, + "loss": 1.1418, + "step": 1568 + }, + { + "epoch": 0.12103107684895206, + "grad_norm": 0.11530864983797073, + "learning_rate": 0.00019243095630667767, + "loss": 1.0776, + "step": 1570 + }, + { + "epoch": 0.12118525656468321, + "grad_norm": 0.13385196030139923, + "learning_rate": 0.00019242065127782359, + "loss": 1.1311, + "step": 1572 + }, + { + "epoch": 0.12133943628041435, + "grad_norm": 0.1308089643716812, + "learning_rate": 0.0001924103462489695, + "loss": 1.0625, + "step": 1574 + }, + { + "epoch": 0.12149361599614551, + "grad_norm": 0.11851842701435089, + "learning_rate": 0.00019240004122011544, + "loss": 1.0182, + "step": 1576 + }, + { + "epoch": 0.12164779571187666, + "grad_norm": 0.2496737688779831, + "learning_rate": 0.00019238973619126136, + "loss": 1.0746, + "step": 1578 + }, + { + "epoch": 0.1218019754276078, + "grad_norm": 0.12962055206298828, + "learning_rate": 0.00019237943116240727, + "loss": 1.0245, + "step": 1580 + }, + { + "epoch": 0.12195615514333895, + "grad_norm": 0.13170978426933289, + "learning_rate": 0.0001923691261335532, + "loss": 0.9897, + "step": 1582 + }, + { + "epoch": 0.12211033485907011, + "grad_norm": 0.13226309418678284, + "learning_rate": 0.0001923588211046991, + "loss": 1.1035, + "step": 1584 + }, + { + "epoch": 0.12226451457480125, + "grad_norm": 0.11901077628135681, + "learning_rate": 0.00019234851607584502, + "loss": 1.0084, + "step": 1586 + }, + { + "epoch": 0.1224186942905324, + "grad_norm": 0.15274369716644287, + "learning_rate": 0.00019233821104699093, + "loss": 1.1436, + "step": 1588 + }, + { + "epoch": 0.12257287400626356, + "grad_norm": 0.11832466721534729, + "learning_rate": 0.00019232790601813685, + "loss": 1.0179, + "step": 1590 + }, + { + "epoch": 0.1227270537219947, + "grad_norm": 0.13038666546344757, + "learning_rate": 0.00019231760098928276, + "loss": 1.0779, + "step": 1592 + }, + { + "epoch": 0.12288123343772585, + "grad_norm": 0.12837626039981842, + "learning_rate": 0.00019230729596042868, + "loss": 1.1404, + "step": 1594 + }, + { + "epoch": 0.123035413153457, + "grad_norm": 0.1400509923696518, + "learning_rate": 0.00019229699093157462, + "loss": 1.1132, + "step": 1596 + }, + { + "epoch": 0.12318959286918815, + "grad_norm": 0.13757595419883728, + "learning_rate": 0.00019228668590272053, + "loss": 1.0816, + "step": 1598 + }, + { + "epoch": 0.1233437725849193, + "grad_norm": 0.12403321266174316, + "learning_rate": 0.00019227638087386645, + "loss": 1.039, + "step": 1600 + }, + { + "epoch": 0.1233437725849193, + "eval_loss": 1.0888522863388062, + "eval_runtime": 185.2371, + "eval_samples_per_second": 91.467, + "eval_steps_per_second": 1.431, + "step": 1600 + }, + { + "epoch": 0.12349795230065044, + "grad_norm": 0.12380605190992355, + "learning_rate": 0.00019226607584501236, + "loss": 1.0903, + "step": 1602 + }, + { + "epoch": 0.12365213201638159, + "grad_norm": 0.13564443588256836, + "learning_rate": 0.00019225577081615828, + "loss": 1.0768, + "step": 1604 + }, + { + "epoch": 0.12380631173211275, + "grad_norm": 0.1533685177564621, + "learning_rate": 0.00019224546578730422, + "loss": 1.0852, + "step": 1606 + }, + { + "epoch": 0.12396049144784389, + "grad_norm": 0.1163390502333641, + "learning_rate": 0.00019223516075845014, + "loss": 1.0574, + "step": 1608 + }, + { + "epoch": 0.12411467116357504, + "grad_norm": 0.13867324590682983, + "learning_rate": 0.00019222485572959605, + "loss": 1.0992, + "step": 1610 + }, + { + "epoch": 0.1242688508793062, + "grad_norm": 0.12759087979793549, + "learning_rate": 0.00019221455070074197, + "loss": 1.0738, + "step": 1612 + }, + { + "epoch": 0.12442303059503734, + "grad_norm": 0.1237189844250679, + "learning_rate": 0.00019220424567188788, + "loss": 1.0974, + "step": 1614 + }, + { + "epoch": 0.12457721031076849, + "grad_norm": 0.13331052660942078, + "learning_rate": 0.00019219394064303382, + "loss": 1.0917, + "step": 1616 + }, + { + "epoch": 0.12473139002649963, + "grad_norm": 0.1290212869644165, + "learning_rate": 0.00019218363561417974, + "loss": 1.0696, + "step": 1618 + }, + { + "epoch": 0.12488556974223079, + "grad_norm": 0.13309410214424133, + "learning_rate": 0.00019217333058532565, + "loss": 1.043, + "step": 1620 + }, + { + "epoch": 0.12503974945796192, + "grad_norm": 0.13453248143196106, + "learning_rate": 0.00019216302555647157, + "loss": 1.0435, + "step": 1622 + }, + { + "epoch": 0.1251939291736931, + "grad_norm": 0.11639372259378433, + "learning_rate": 0.00019215272052761748, + "loss": 1.0579, + "step": 1624 + }, + { + "epoch": 0.12534810888942424, + "grad_norm": 0.13231517374515533, + "learning_rate": 0.0001921424154987634, + "loss": 1.1268, + "step": 1626 + }, + { + "epoch": 0.1255022886051554, + "grad_norm": 0.1349351406097412, + "learning_rate": 0.00019213211046990934, + "loss": 1.1599, + "step": 1628 + }, + { + "epoch": 0.12565646832088653, + "grad_norm": 0.13710346817970276, + "learning_rate": 0.00019212180544105525, + "loss": 1.0866, + "step": 1630 + }, + { + "epoch": 0.12581064803661768, + "grad_norm": 0.14535072445869446, + "learning_rate": 0.00019211150041220117, + "loss": 1.0445, + "step": 1632 + }, + { + "epoch": 0.12596482775234882, + "grad_norm": 0.11799806356430054, + "learning_rate": 0.00019210119538334708, + "loss": 1.0525, + "step": 1634 + }, + { + "epoch": 0.12611900746807997, + "grad_norm": 0.13399624824523926, + "learning_rate": 0.000192090890354493, + "loss": 1.0246, + "step": 1636 + }, + { + "epoch": 0.12627318718381114, + "grad_norm": 0.14404788613319397, + "learning_rate": 0.00019208058532563894, + "loss": 1.0582, + "step": 1638 + }, + { + "epoch": 0.1264273668995423, + "grad_norm": 0.14395713806152344, + "learning_rate": 0.00019207028029678486, + "loss": 1.0686, + "step": 1640 + }, + { + "epoch": 0.12658154661527343, + "grad_norm": 0.13249294459819794, + "learning_rate": 0.00019205997526793077, + "loss": 1.1286, + "step": 1642 + }, + { + "epoch": 0.12673572633100458, + "grad_norm": 0.12791812419891357, + "learning_rate": 0.00019204967023907669, + "loss": 1.062, + "step": 1644 + }, + { + "epoch": 0.12688990604673572, + "grad_norm": 0.12210959941148758, + "learning_rate": 0.0001920393652102226, + "loss": 1.0419, + "step": 1646 + }, + { + "epoch": 0.12704408576246687, + "grad_norm": 0.13438813388347626, + "learning_rate": 0.00019202906018136852, + "loss": 1.0589, + "step": 1648 + }, + { + "epoch": 0.127198265478198, + "grad_norm": 0.12953762710094452, + "learning_rate": 0.00019201875515251443, + "loss": 1.0128, + "step": 1650 + }, + { + "epoch": 0.1273524451939292, + "grad_norm": 0.1318603903055191, + "learning_rate": 0.00019200845012366035, + "loss": 1.073, + "step": 1652 + }, + { + "epoch": 0.12750662490966033, + "grad_norm": 0.12956051528453827, + "learning_rate": 0.00019199814509480626, + "loss": 1.0489, + "step": 1654 + }, + { + "epoch": 0.12766080462539148, + "grad_norm": 0.13501368463039398, + "learning_rate": 0.00019198784006595218, + "loss": 1.0198, + "step": 1656 + }, + { + "epoch": 0.12781498434112262, + "grad_norm": 0.13902342319488525, + "learning_rate": 0.00019197753503709812, + "loss": 1.0512, + "step": 1658 + }, + { + "epoch": 0.12796916405685377, + "grad_norm": 0.15590503811836243, + "learning_rate": 0.00019196723000824403, + "loss": 1.1782, + "step": 1660 + }, + { + "epoch": 0.1281233437725849, + "grad_norm": 0.13954932987689972, + "learning_rate": 0.00019195692497938995, + "loss": 1.0421, + "step": 1662 + }, + { + "epoch": 0.12827752348831606, + "grad_norm": 0.11550859361886978, + "learning_rate": 0.00019194661995053586, + "loss": 1.086, + "step": 1664 + }, + { + "epoch": 0.1284317032040472, + "grad_norm": 0.12175869196653366, + "learning_rate": 0.00019193631492168178, + "loss": 1.0704, + "step": 1666 + }, + { + "epoch": 0.12858588291977838, + "grad_norm": 0.13503512740135193, + "learning_rate": 0.00019192600989282772, + "loss": 1.1166, + "step": 1668 + }, + { + "epoch": 0.12874006263550952, + "grad_norm": 0.12849009037017822, + "learning_rate": 0.00019191570486397363, + "loss": 1.0315, + "step": 1670 + }, + { + "epoch": 0.12889424235124067, + "grad_norm": 0.12484319508075714, + "learning_rate": 0.00019190539983511955, + "loss": 1.0737, + "step": 1672 + }, + { + "epoch": 0.1290484220669718, + "grad_norm": 0.1364014446735382, + "learning_rate": 0.00019189509480626546, + "loss": 1.0619, + "step": 1674 + }, + { + "epoch": 0.12920260178270296, + "grad_norm": 0.12930172681808472, + "learning_rate": 0.00019188478977741138, + "loss": 1.046, + "step": 1676 + }, + { + "epoch": 0.1293567814984341, + "grad_norm": 0.13860805332660675, + "learning_rate": 0.00019187448474855732, + "loss": 1.0832, + "step": 1678 + }, + { + "epoch": 0.12951096121416525, + "grad_norm": 0.1379111111164093, + "learning_rate": 0.00019186417971970324, + "loss": 1.1406, + "step": 1680 + }, + { + "epoch": 0.12966514092989642, + "grad_norm": 0.1349123865365982, + "learning_rate": 0.00019185387469084915, + "loss": 1.1055, + "step": 1682 + }, + { + "epoch": 0.12981932064562757, + "grad_norm": 0.13304142653942108, + "learning_rate": 0.00019184356966199507, + "loss": 1.0392, + "step": 1684 + }, + { + "epoch": 0.1299735003613587, + "grad_norm": 0.12159105390310287, + "learning_rate": 0.00019183326463314098, + "loss": 1.0548, + "step": 1686 + }, + { + "epoch": 0.13012768007708986, + "grad_norm": 0.12661418318748474, + "learning_rate": 0.00019182295960428692, + "loss": 1.0588, + "step": 1688 + }, + { + "epoch": 0.130281859792821, + "grad_norm": 0.13691510260105133, + "learning_rate": 0.00019181265457543284, + "loss": 1.0854, + "step": 1690 + }, + { + "epoch": 0.13043603950855215, + "grad_norm": 0.1401318609714508, + "learning_rate": 0.00019180234954657875, + "loss": 1.0864, + "step": 1692 + }, + { + "epoch": 0.1305902192242833, + "grad_norm": 0.1355384737253189, + "learning_rate": 0.00019179204451772467, + "loss": 1.058, + "step": 1694 + }, + { + "epoch": 0.13074439894001447, + "grad_norm": 0.13987474143505096, + "learning_rate": 0.00019178173948887058, + "loss": 1.06, + "step": 1696 + }, + { + "epoch": 0.1308985786557456, + "grad_norm": 0.14350661635398865, + "learning_rate": 0.0001917714344600165, + "loss": 1.0731, + "step": 1698 + }, + { + "epoch": 0.13105275837147676, + "grad_norm": 0.12443742901086807, + "learning_rate": 0.0001917611294311624, + "loss": 1.0987, + "step": 1700 + }, + { + "epoch": 0.13105275837147676, + "eval_loss": 1.0880467891693115, + "eval_runtime": 185.5457, + "eval_samples_per_second": 91.314, + "eval_steps_per_second": 1.428, + "step": 1700 + }, + { + "epoch": 0.1312069380872079, + "grad_norm": 0.10956554859876633, + "learning_rate": 0.00019175082440230833, + "loss": 1.0393, + "step": 1702 + }, + { + "epoch": 0.13136111780293905, + "grad_norm": 0.11846137791872025, + "learning_rate": 0.00019174051937345424, + "loss": 1.0998, + "step": 1704 + }, + { + "epoch": 0.1315152975186702, + "grad_norm": 0.11894328892230988, + "learning_rate": 0.00019173021434460016, + "loss": 1.1007, + "step": 1706 + }, + { + "epoch": 0.13166947723440134, + "grad_norm": 0.11090514808893204, + "learning_rate": 0.00019171990931574607, + "loss": 1.0343, + "step": 1708 + }, + { + "epoch": 0.1318236569501325, + "grad_norm": 0.1276719868183136, + "learning_rate": 0.000191709604286892, + "loss": 1.0392, + "step": 1710 + }, + { + "epoch": 0.13197783666586366, + "grad_norm": 0.12342885881662369, + "learning_rate": 0.00019169929925803793, + "loss": 1.063, + "step": 1712 + }, + { + "epoch": 0.1321320163815948, + "grad_norm": 0.1237882748246193, + "learning_rate": 0.00019168899422918384, + "loss": 1.0558, + "step": 1714 + }, + { + "epoch": 0.13228619609732595, + "grad_norm": 0.12958785891532898, + "learning_rate": 0.00019167868920032976, + "loss": 1.0493, + "step": 1716 + }, + { + "epoch": 0.1324403758130571, + "grad_norm": 0.1181110367178917, + "learning_rate": 0.00019166838417147567, + "loss": 1.0668, + "step": 1718 + }, + { + "epoch": 0.13259455552878824, + "grad_norm": 0.12053950875997543, + "learning_rate": 0.00019165807914262162, + "loss": 1.0392, + "step": 1720 + }, + { + "epoch": 0.13274873524451938, + "grad_norm": 0.11725175380706787, + "learning_rate": 0.00019164777411376753, + "loss": 1.0188, + "step": 1722 + }, + { + "epoch": 0.13290291496025053, + "grad_norm": 0.12475614994764328, + "learning_rate": 0.00019163746908491344, + "loss": 1.0134, + "step": 1724 + }, + { + "epoch": 0.1330570946759817, + "grad_norm": 0.1231207475066185, + "learning_rate": 0.00019162716405605936, + "loss": 1.0309, + "step": 1726 + }, + { + "epoch": 0.13321127439171285, + "grad_norm": 0.1269765943288803, + "learning_rate": 0.00019161685902720527, + "loss": 1.0918, + "step": 1728 + }, + { + "epoch": 0.133365454107444, + "grad_norm": 0.12103556841611862, + "learning_rate": 0.00019160655399835122, + "loss": 1.0453, + "step": 1730 + }, + { + "epoch": 0.13351963382317514, + "grad_norm": 0.12427771091461182, + "learning_rate": 0.00019159624896949713, + "loss": 1.1544, + "step": 1732 + }, + { + "epoch": 0.13367381353890628, + "grad_norm": 0.13416282832622528, + "learning_rate": 0.00019158594394064305, + "loss": 1.0941, + "step": 1734 + }, + { + "epoch": 0.13382799325463743, + "grad_norm": 0.13207705318927765, + "learning_rate": 0.00019157563891178896, + "loss": 1.0998, + "step": 1736 + }, + { + "epoch": 0.13398217297036857, + "grad_norm": 0.1436687856912613, + "learning_rate": 0.00019156533388293488, + "loss": 1.0723, + "step": 1738 + }, + { + "epoch": 0.13413635268609975, + "grad_norm": 0.1206304207444191, + "learning_rate": 0.00019155502885408082, + "loss": 1.0279, + "step": 1740 + }, + { + "epoch": 0.1342905324018309, + "grad_norm": 0.12685900926589966, + "learning_rate": 0.00019154472382522673, + "loss": 1.0683, + "step": 1742 + }, + { + "epoch": 0.13444471211756204, + "grad_norm": 0.12833228707313538, + "learning_rate": 0.00019153441879637265, + "loss": 1.0904, + "step": 1744 + }, + { + "epoch": 0.13459889183329318, + "grad_norm": 0.12999312579631805, + "learning_rate": 0.00019152411376751856, + "loss": 1.0492, + "step": 1746 + }, + { + "epoch": 0.13475307154902433, + "grad_norm": 0.13486912846565247, + "learning_rate": 0.00019151380873866448, + "loss": 1.101, + "step": 1748 + }, + { + "epoch": 0.13490725126475547, + "grad_norm": 0.12793023884296417, + "learning_rate": 0.0001915035037098104, + "loss": 1.1135, + "step": 1750 + }, + { + "epoch": 0.13506143098048662, + "grad_norm": 0.12652675807476044, + "learning_rate": 0.0001914931986809563, + "loss": 1.0902, + "step": 1752 + }, + { + "epoch": 0.1352156106962178, + "grad_norm": 0.12431836873292923, + "learning_rate": 0.00019148289365210222, + "loss": 1.0922, + "step": 1754 + }, + { + "epoch": 0.13536979041194894, + "grad_norm": 0.13665209710597992, + "learning_rate": 0.00019147258862324814, + "loss": 1.0584, + "step": 1756 + }, + { + "epoch": 0.13552397012768008, + "grad_norm": 0.1355196088552475, + "learning_rate": 0.00019146228359439405, + "loss": 1.1199, + "step": 1758 + }, + { + "epoch": 0.13567814984341123, + "grad_norm": 0.14115893840789795, + "learning_rate": 0.00019145197856554, + "loss": 1.0697, + "step": 1760 + }, + { + "epoch": 0.13583232955914237, + "grad_norm": 0.13009534776210785, + "learning_rate": 0.0001914416735366859, + "loss": 1.1111, + "step": 1762 + }, + { + "epoch": 0.13598650927487352, + "grad_norm": 0.12280994653701782, + "learning_rate": 0.00019143136850783182, + "loss": 1.0341, + "step": 1764 + }, + { + "epoch": 0.13614068899060466, + "grad_norm": 0.15171582996845245, + "learning_rate": 0.00019142106347897774, + "loss": 1.1275, + "step": 1766 + }, + { + "epoch": 0.1362948687063358, + "grad_norm": 0.15258526802062988, + "learning_rate": 0.00019141075845012365, + "loss": 1.0513, + "step": 1768 + }, + { + "epoch": 0.13644904842206698, + "grad_norm": 0.132346972823143, + "learning_rate": 0.0001914004534212696, + "loss": 1.0878, + "step": 1770 + }, + { + "epoch": 0.13660322813779813, + "grad_norm": 0.13237041234970093, + "learning_rate": 0.0001913901483924155, + "loss": 1.0845, + "step": 1772 + }, + { + "epoch": 0.13675740785352927, + "grad_norm": 0.13837209343910217, + "learning_rate": 0.00019137984336356143, + "loss": 1.1221, + "step": 1774 + }, + { + "epoch": 0.13691158756926042, + "grad_norm": 0.17590375244617462, + "learning_rate": 0.00019136953833470734, + "loss": 1.1963, + "step": 1776 + }, + { + "epoch": 0.13706576728499156, + "grad_norm": 0.12898488342761993, + "learning_rate": 0.00019135923330585326, + "loss": 1.1306, + "step": 1778 + }, + { + "epoch": 0.1372199470007227, + "grad_norm": 0.12428785115480423, + "learning_rate": 0.0001913489282769992, + "loss": 1.068, + "step": 1780 + }, + { + "epoch": 0.13737412671645385, + "grad_norm": 0.12678809463977814, + "learning_rate": 0.0001913386232481451, + "loss": 1.0709, + "step": 1782 + }, + { + "epoch": 0.13752830643218503, + "grad_norm": 0.1344168782234192, + "learning_rate": 0.00019132831821929103, + "loss": 1.1073, + "step": 1784 + }, + { + "epoch": 0.13768248614791617, + "grad_norm": 0.14730733633041382, + "learning_rate": 0.00019131801319043694, + "loss": 1.0073, + "step": 1786 + }, + { + "epoch": 0.13783666586364732, + "grad_norm": 0.13661792874336243, + "learning_rate": 0.00019130770816158286, + "loss": 1.0637, + "step": 1788 + }, + { + "epoch": 0.13799084557937846, + "grad_norm": 0.1342434138059616, + "learning_rate": 0.0001912974031327288, + "loss": 1.1069, + "step": 1790 + }, + { + "epoch": 0.1381450252951096, + "grad_norm": 0.11941581219434738, + "learning_rate": 0.00019128709810387471, + "loss": 1.1023, + "step": 1792 + }, + { + "epoch": 0.13829920501084075, + "grad_norm": 0.13641759753227234, + "learning_rate": 0.00019127679307502063, + "loss": 1.0564, + "step": 1794 + }, + { + "epoch": 0.1384533847265719, + "grad_norm": 0.11148608475923538, + "learning_rate": 0.00019126648804616654, + "loss": 1.0255, + "step": 1796 + }, + { + "epoch": 0.13860756444230307, + "grad_norm": 0.1387186199426651, + "learning_rate": 0.00019125618301731246, + "loss": 1.0663, + "step": 1798 + }, + { + "epoch": 0.13876174415803422, + "grad_norm": 0.12380651384592056, + "learning_rate": 0.00019124587798845837, + "loss": 1.1222, + "step": 1800 + }, + { + "epoch": 0.13876174415803422, + "eval_loss": 1.0875153541564941, + "eval_runtime": 185.4605, + "eval_samples_per_second": 91.356, + "eval_steps_per_second": 1.429, + "step": 1800 + }, + { + "epoch": 0.13891592387376536, + "grad_norm": 0.13224369287490845, + "learning_rate": 0.00019123557295960432, + "loss": 1.0821, + "step": 1802 + }, + { + "epoch": 0.1390701035894965, + "grad_norm": 0.13096244633197784, + "learning_rate": 0.00019122526793075023, + "loss": 1.0097, + "step": 1804 + }, + { + "epoch": 0.13922428330522765, + "grad_norm": 0.11652527749538422, + "learning_rate": 0.00019121496290189615, + "loss": 1.0517, + "step": 1806 + }, + { + "epoch": 0.1393784630209588, + "grad_norm": 0.13449358940124512, + "learning_rate": 0.00019120465787304206, + "loss": 1.0915, + "step": 1808 + }, + { + "epoch": 0.13953264273668994, + "grad_norm": 0.11550068855285645, + "learning_rate": 0.00019119435284418798, + "loss": 1.0568, + "step": 1810 + }, + { + "epoch": 0.13968682245242112, + "grad_norm": 0.13804587721824646, + "learning_rate": 0.0001911840478153339, + "loss": 1.0933, + "step": 1812 + }, + { + "epoch": 0.13984100216815226, + "grad_norm": 0.12062159180641174, + "learning_rate": 0.0001911737427864798, + "loss": 1.0517, + "step": 1814 + }, + { + "epoch": 0.1399951818838834, + "grad_norm": 0.12154779583215714, + "learning_rate": 0.00019116343775762572, + "loss": 1.0955, + "step": 1816 + }, + { + "epoch": 0.14014936159961455, + "grad_norm": 0.11615799367427826, + "learning_rate": 0.00019115313272877164, + "loss": 0.968, + "step": 1818 + }, + { + "epoch": 0.1403035413153457, + "grad_norm": 0.1207037940621376, + "learning_rate": 0.00019114282769991755, + "loss": 1.0896, + "step": 1820 + }, + { + "epoch": 0.14045772103107684, + "grad_norm": 0.12750887870788574, + "learning_rate": 0.0001911325226710635, + "loss": 1.065, + "step": 1822 + }, + { + "epoch": 0.140611900746808, + "grad_norm": 0.16391952335834503, + "learning_rate": 0.0001911222176422094, + "loss": 1.0232, + "step": 1824 + }, + { + "epoch": 0.14076608046253913, + "grad_norm": 0.14626921713352203, + "learning_rate": 0.00019111191261335532, + "loss": 1.0375, + "step": 1826 + }, + { + "epoch": 0.1409202601782703, + "grad_norm": 0.12393996119499207, + "learning_rate": 0.00019110160758450124, + "loss": 1.0345, + "step": 1828 + }, + { + "epoch": 0.14107443989400145, + "grad_norm": 0.13275925815105438, + "learning_rate": 0.00019109130255564715, + "loss": 1.071, + "step": 1830 + }, + { + "epoch": 0.1412286196097326, + "grad_norm": 0.1255485862493515, + "learning_rate": 0.0001910809975267931, + "loss": 1.1026, + "step": 1832 + }, + { + "epoch": 0.14138279932546374, + "grad_norm": 0.13399668037891388, + "learning_rate": 0.000191070692497939, + "loss": 1.11, + "step": 1834 + }, + { + "epoch": 0.1415369790411949, + "grad_norm": 0.13084925711154938, + "learning_rate": 0.00019106038746908492, + "loss": 1.0528, + "step": 1836 + }, + { + "epoch": 0.14169115875692603, + "grad_norm": 0.15695689618587494, + "learning_rate": 0.00019105008244023084, + "loss": 1.1336, + "step": 1838 + }, + { + "epoch": 0.14184533847265718, + "grad_norm": 0.13630808889865875, + "learning_rate": 0.00019103977741137675, + "loss": 1.0767, + "step": 1840 + }, + { + "epoch": 0.14199951818838835, + "grad_norm": 0.11874844878911972, + "learning_rate": 0.0001910294723825227, + "loss": 1.0511, + "step": 1842 + }, + { + "epoch": 0.1421536979041195, + "grad_norm": 0.11898507922887802, + "learning_rate": 0.0001910191673536686, + "loss": 1.0866, + "step": 1844 + }, + { + "epoch": 0.14230787761985064, + "grad_norm": 0.1393211930990219, + "learning_rate": 0.00019100886232481453, + "loss": 1.0553, + "step": 1846 + }, + { + "epoch": 0.1424620573355818, + "grad_norm": 0.1382310539484024, + "learning_rate": 0.00019099855729596044, + "loss": 1.07, + "step": 1848 + }, + { + "epoch": 0.14261623705131293, + "grad_norm": 0.1471824198961258, + "learning_rate": 0.00019098825226710636, + "loss": 1.0893, + "step": 1850 + }, + { + "epoch": 0.14277041676704408, + "grad_norm": 0.12706084549427032, + "learning_rate": 0.0001909779472382523, + "loss": 1.0848, + "step": 1852 + }, + { + "epoch": 0.14292459648277522, + "grad_norm": 0.1324569135904312, + "learning_rate": 0.0001909676422093982, + "loss": 1.024, + "step": 1854 + }, + { + "epoch": 0.1430787761985064, + "grad_norm": 0.11245544254779816, + "learning_rate": 0.00019095733718054413, + "loss": 1.0802, + "step": 1856 + }, + { + "epoch": 0.14323295591423754, + "grad_norm": 0.15419217944145203, + "learning_rate": 0.00019094703215169004, + "loss": 1.1101, + "step": 1858 + }, + { + "epoch": 0.1433871356299687, + "grad_norm": 0.1071443036198616, + "learning_rate": 0.00019093672712283596, + "loss": 1.0576, + "step": 1860 + }, + { + "epoch": 0.14354131534569983, + "grad_norm": 0.1341090053319931, + "learning_rate": 0.00019092642209398187, + "loss": 1.0606, + "step": 1862 + }, + { + "epoch": 0.14369549506143098, + "grad_norm": 0.11848092079162598, + "learning_rate": 0.0001909161170651278, + "loss": 1.0714, + "step": 1864 + }, + { + "epoch": 0.14384967477716212, + "grad_norm": 0.12697815895080566, + "learning_rate": 0.0001909058120362737, + "loss": 1.092, + "step": 1866 + }, + { + "epoch": 0.14400385449289327, + "grad_norm": 0.11891257762908936, + "learning_rate": 0.00019089550700741962, + "loss": 0.9649, + "step": 1868 + }, + { + "epoch": 0.14415803420862444, + "grad_norm": 0.12616439163684845, + "learning_rate": 0.00019088520197856553, + "loss": 1.0962, + "step": 1870 + }, + { + "epoch": 0.1443122139243556, + "grad_norm": 0.12141067534685135, + "learning_rate": 0.00019087489694971147, + "loss": 1.0838, + "step": 1872 + }, + { + "epoch": 0.14446639364008673, + "grad_norm": 0.13279564678668976, + "learning_rate": 0.0001908645919208574, + "loss": 1.0484, + "step": 1874 + }, + { + "epoch": 0.14462057335581788, + "grad_norm": 0.15748505294322968, + "learning_rate": 0.0001908542868920033, + "loss": 1.1433, + "step": 1876 + }, + { + "epoch": 0.14477475307154902, + "grad_norm": 0.11593475937843323, + "learning_rate": 0.00019084398186314922, + "loss": 1.1483, + "step": 1878 + }, + { + "epoch": 0.14492893278728017, + "grad_norm": 0.14499489963054657, + "learning_rate": 0.00019083367683429513, + "loss": 1.0782, + "step": 1880 + }, + { + "epoch": 0.1450831125030113, + "grad_norm": 0.13570410013198853, + "learning_rate": 0.00019082337180544105, + "loss": 1.0989, + "step": 1882 + }, + { + "epoch": 0.14523729221874246, + "grad_norm": 0.12810774147510529, + "learning_rate": 0.000190813066776587, + "loss": 1.0374, + "step": 1884 + }, + { + "epoch": 0.14539147193447363, + "grad_norm": 0.11781581491231918, + "learning_rate": 0.0001908027617477329, + "loss": 1.0796, + "step": 1886 + }, + { + "epoch": 0.14554565165020478, + "grad_norm": 0.12243229150772095, + "learning_rate": 0.00019079245671887882, + "loss": 1.0477, + "step": 1888 + }, + { + "epoch": 0.14569983136593592, + "grad_norm": 0.1385030299425125, + "learning_rate": 0.00019078215169002474, + "loss": 1.0349, + "step": 1890 + }, + { + "epoch": 0.14585401108166707, + "grad_norm": 0.12011386454105377, + "learning_rate": 0.00019077184666117065, + "loss": 1.0718, + "step": 1892 + }, + { + "epoch": 0.1460081907973982, + "grad_norm": 0.12646062672138214, + "learning_rate": 0.0001907615416323166, + "loss": 1.1228, + "step": 1894 + }, + { + "epoch": 0.14616237051312936, + "grad_norm": 0.1284620612859726, + "learning_rate": 0.0001907512366034625, + "loss": 1.079, + "step": 1896 + }, + { + "epoch": 0.1463165502288605, + "grad_norm": 0.15374581515789032, + "learning_rate": 0.00019074093157460842, + "loss": 1.1147, + "step": 1898 + }, + { + "epoch": 0.14647072994459168, + "grad_norm": 0.1325882524251938, + "learning_rate": 0.00019073062654575434, + "loss": 1.0404, + "step": 1900 + }, + { + "epoch": 0.14647072994459168, + "eval_loss": 1.0869932174682617, + "eval_runtime": 185.4754, + "eval_samples_per_second": 91.349, + "eval_steps_per_second": 1.429, + "step": 1900 + }, + { + "epoch": 0.14662490966032282, + "grad_norm": 0.14041611552238464, + "learning_rate": 0.00019072032151690025, + "loss": 1.095, + "step": 1902 + }, + { + "epoch": 0.14677908937605397, + "grad_norm": 0.14162160456180573, + "learning_rate": 0.0001907100164880462, + "loss": 1.1714, + "step": 1904 + }, + { + "epoch": 0.1469332690917851, + "grad_norm": 0.12077832221984863, + "learning_rate": 0.0001906997114591921, + "loss": 1.1109, + "step": 1906 + }, + { + "epoch": 0.14708744880751626, + "grad_norm": 0.1738968789577484, + "learning_rate": 0.00019068940643033802, + "loss": 1.0838, + "step": 1908 + }, + { + "epoch": 0.1472416285232474, + "grad_norm": 0.13948039710521698, + "learning_rate": 0.00019067910140148394, + "loss": 1.0494, + "step": 1910 + }, + { + "epoch": 0.14739580823897855, + "grad_norm": 0.21179239451885223, + "learning_rate": 0.00019066879637262985, + "loss": 1.0962, + "step": 1912 + }, + { + "epoch": 0.14754998795470972, + "grad_norm": 0.12927787005901337, + "learning_rate": 0.00019065849134377577, + "loss": 1.1113, + "step": 1914 + }, + { + "epoch": 0.14770416767044087, + "grad_norm": 0.1296701431274414, + "learning_rate": 0.00019064818631492168, + "loss": 1.0603, + "step": 1916 + }, + { + "epoch": 0.147858347386172, + "grad_norm": 0.1282590925693512, + "learning_rate": 0.0001906378812860676, + "loss": 1.0594, + "step": 1918 + }, + { + "epoch": 0.14801252710190316, + "grad_norm": 0.13304758071899414, + "learning_rate": 0.0001906275762572135, + "loss": 1.0784, + "step": 1920 + }, + { + "epoch": 0.1481667068176343, + "grad_norm": 0.15661965310573578, + "learning_rate": 0.00019061727122835943, + "loss": 1.008, + "step": 1922 + }, + { + "epoch": 0.14832088653336545, + "grad_norm": 0.12986873090267181, + "learning_rate": 0.00019060696619950537, + "loss": 1.0788, + "step": 1924 + }, + { + "epoch": 0.1484750662490966, + "grad_norm": 0.1128251776099205, + "learning_rate": 0.00019059666117065128, + "loss": 1.1449, + "step": 1926 + }, + { + "epoch": 0.14862924596482774, + "grad_norm": 0.13722160458564758, + "learning_rate": 0.0001905863561417972, + "loss": 1.0914, + "step": 1928 + }, + { + "epoch": 0.1487834256805589, + "grad_norm": 0.1507786512374878, + "learning_rate": 0.00019057605111294311, + "loss": 1.0694, + "step": 1930 + }, + { + "epoch": 0.14893760539629006, + "grad_norm": 0.1368752121925354, + "learning_rate": 0.00019056574608408903, + "loss": 1.0417, + "step": 1932 + }, + { + "epoch": 0.1490917851120212, + "grad_norm": 0.12566259503364563, + "learning_rate": 0.00019055544105523497, + "loss": 1.0853, + "step": 1934 + }, + { + "epoch": 0.14924596482775235, + "grad_norm": 0.12362397462129593, + "learning_rate": 0.0001905451360263809, + "loss": 1.1136, + "step": 1936 + }, + { + "epoch": 0.1494001445434835, + "grad_norm": 0.12472514808177948, + "learning_rate": 0.0001905348309975268, + "loss": 1.0628, + "step": 1938 + }, + { + "epoch": 0.14955432425921464, + "grad_norm": 0.1355161964893341, + "learning_rate": 0.00019052452596867272, + "loss": 1.1211, + "step": 1940 + }, + { + "epoch": 0.14970850397494578, + "grad_norm": 0.13438721001148224, + "learning_rate": 0.00019051422093981863, + "loss": 1.0758, + "step": 1942 + }, + { + "epoch": 0.14986268369067696, + "grad_norm": 0.11768204718828201, + "learning_rate": 0.00019050391591096457, + "loss": 1.0533, + "step": 1944 + }, + { + "epoch": 0.1500168634064081, + "grad_norm": 0.13892577588558197, + "learning_rate": 0.0001904936108821105, + "loss": 1.1076, + "step": 1946 + }, + { + "epoch": 0.15017104312213925, + "grad_norm": 0.1532358080148697, + "learning_rate": 0.0001904833058532564, + "loss": 1.0706, + "step": 1948 + }, + { + "epoch": 0.1503252228378704, + "grad_norm": 0.13364464044570923, + "learning_rate": 0.00019047300082440232, + "loss": 1.1322, + "step": 1950 + }, + { + "epoch": 0.15047940255360154, + "grad_norm": 0.12663134932518005, + "learning_rate": 0.00019046269579554823, + "loss": 1.0749, + "step": 1952 + }, + { + "epoch": 0.15063358226933268, + "grad_norm": 0.1297607123851776, + "learning_rate": 0.00019045239076669417, + "loss": 1.0594, + "step": 1954 + }, + { + "epoch": 0.15078776198506383, + "grad_norm": 0.11931920051574707, + "learning_rate": 0.0001904420857378401, + "loss": 1.0522, + "step": 1956 + }, + { + "epoch": 0.150941941700795, + "grad_norm": 0.1334810107946396, + "learning_rate": 0.000190431780708986, + "loss": 1.0674, + "step": 1958 + }, + { + "epoch": 0.15109612141652615, + "grad_norm": 0.12633340060710907, + "learning_rate": 0.00019042147568013192, + "loss": 1.0139, + "step": 1960 + }, + { + "epoch": 0.1512503011322573, + "grad_norm": 0.12485836446285248, + "learning_rate": 0.00019041117065127783, + "loss": 1.0288, + "step": 1962 + }, + { + "epoch": 0.15140448084798844, + "grad_norm": 0.10940799117088318, + "learning_rate": 0.00019040086562242375, + "loss": 1.0475, + "step": 1964 + }, + { + "epoch": 0.15155866056371958, + "grad_norm": 0.12229325622320175, + "learning_rate": 0.00019039056059356966, + "loss": 1.0628, + "step": 1966 + }, + { + "epoch": 0.15171284027945073, + "grad_norm": 0.14333505928516388, + "learning_rate": 0.00019038025556471558, + "loss": 1.0423, + "step": 1968 + }, + { + "epoch": 0.15186701999518187, + "grad_norm": 0.12773017585277557, + "learning_rate": 0.0001903699505358615, + "loss": 1.1283, + "step": 1970 + }, + { + "epoch": 0.15202119971091305, + "grad_norm": 0.11913473904132843, + "learning_rate": 0.0001903596455070074, + "loss": 1.0646, + "step": 1972 + }, + { + "epoch": 0.1521753794266442, + "grad_norm": 0.13321518898010254, + "learning_rate": 0.00019034934047815332, + "loss": 1.0476, + "step": 1974 + }, + { + "epoch": 0.15232955914237534, + "grad_norm": 0.1362799108028412, + "learning_rate": 0.00019033903544929927, + "loss": 1.0937, + "step": 1976 + }, + { + "epoch": 0.15248373885810648, + "grad_norm": 0.13804180920124054, + "learning_rate": 0.00019032873042044518, + "loss": 1.113, + "step": 1978 + }, + { + "epoch": 0.15263791857383763, + "grad_norm": 0.1774570494890213, + "learning_rate": 0.0001903184253915911, + "loss": 1.0795, + "step": 1980 + }, + { + "epoch": 0.15279209828956877, + "grad_norm": 0.13106994330883026, + "learning_rate": 0.000190308120362737, + "loss": 1.098, + "step": 1982 + }, + { + "epoch": 0.15294627800529992, + "grad_norm": 0.14435411989688873, + "learning_rate": 0.00019029781533388293, + "loss": 1.0814, + "step": 1984 + }, + { + "epoch": 0.15310045772103106, + "grad_norm": 0.13178013265132904, + "learning_rate": 0.00019028751030502887, + "loss": 1.1002, + "step": 1986 + }, + { + "epoch": 0.15325463743676224, + "grad_norm": 0.1283218264579773, + "learning_rate": 0.00019027720527617478, + "loss": 1.0749, + "step": 1988 + }, + { + "epoch": 0.15340881715249338, + "grad_norm": 0.12113723158836365, + "learning_rate": 0.0001902669002473207, + "loss": 1.0831, + "step": 1990 + }, + { + "epoch": 0.15356299686822453, + "grad_norm": 0.12649892270565033, + "learning_rate": 0.0001902565952184666, + "loss": 1.0166, + "step": 1992 + }, + { + "epoch": 0.15371717658395567, + "grad_norm": 0.12823793292045593, + "learning_rate": 0.00019024629018961253, + "loss": 1.0273, + "step": 1994 + }, + { + "epoch": 0.15387135629968682, + "grad_norm": 0.1291527897119522, + "learning_rate": 0.00019023598516075847, + "loss": 1.1092, + "step": 1996 + }, + { + "epoch": 0.15402553601541796, + "grad_norm": 0.12588894367218018, + "learning_rate": 0.00019022568013190438, + "loss": 1.0627, + "step": 1998 + }, + { + "epoch": 0.1541797157311491, + "grad_norm": 0.12996312975883484, + "learning_rate": 0.0001902153751030503, + "loss": 1.1196, + "step": 2000 + }, + { + "epoch": 0.1541797157311491, + "eval_loss": 1.0863893032073975, + "eval_runtime": 185.3254, + "eval_samples_per_second": 91.423, + "eval_steps_per_second": 1.43, + "step": 2000 + }, + { + "epoch": 0.15433389544688028, + "grad_norm": 0.14361834526062012, + "learning_rate": 0.00019020507007419621, + "loss": 1.1151, + "step": 2002 + }, + { + "epoch": 0.15448807516261143, + "grad_norm": 0.12650837004184723, + "learning_rate": 0.00019019476504534213, + "loss": 1.1155, + "step": 2004 + }, + { + "epoch": 0.15464225487834257, + "grad_norm": 0.13820499181747437, + "learning_rate": 0.00019018446001648807, + "loss": 1.1243, + "step": 2006 + }, + { + "epoch": 0.15479643459407372, + "grad_norm": 0.13205693662166595, + "learning_rate": 0.00019017415498763399, + "loss": 1.0626, + "step": 2008 + }, + { + "epoch": 0.15495061430980486, + "grad_norm": 0.13930106163024902, + "learning_rate": 0.0001901638499587799, + "loss": 1.1105, + "step": 2010 + }, + { + "epoch": 0.155104794025536, + "grad_norm": 0.14711922407150269, + "learning_rate": 0.00019015354492992582, + "loss": 1.0556, + "step": 2012 + }, + { + "epoch": 0.15525897374126715, + "grad_norm": 0.11909156292676926, + "learning_rate": 0.00019014323990107173, + "loss": 1.1025, + "step": 2014 + }, + { + "epoch": 0.15541315345699833, + "grad_norm": 0.14099714159965515, + "learning_rate": 0.00019013293487221767, + "loss": 1.064, + "step": 2016 + }, + { + "epoch": 0.15556733317272947, + "grad_norm": 0.11500216275453568, + "learning_rate": 0.0001901226298433636, + "loss": 1.1196, + "step": 2018 + }, + { + "epoch": 0.15572151288846062, + "grad_norm": 0.12341683357954025, + "learning_rate": 0.0001901123248145095, + "loss": 1.0625, + "step": 2020 + }, + { + "epoch": 0.15587569260419176, + "grad_norm": 0.1390669196844101, + "learning_rate": 0.00019010201978565542, + "loss": 1.0526, + "step": 2022 + }, + { + "epoch": 0.1560298723199229, + "grad_norm": 0.13482992351055145, + "learning_rate": 0.00019009171475680133, + "loss": 1.1074, + "step": 2024 + }, + { + "epoch": 0.15618405203565405, + "grad_norm": 0.12277045845985413, + "learning_rate": 0.00019008140972794725, + "loss": 1.0648, + "step": 2026 + }, + { + "epoch": 0.1563382317513852, + "grad_norm": 0.13579949736595154, + "learning_rate": 0.00019007110469909316, + "loss": 1.1235, + "step": 2028 + }, + { + "epoch": 0.15649241146711637, + "grad_norm": 0.14128637313842773, + "learning_rate": 0.00019006079967023908, + "loss": 1.0442, + "step": 2030 + }, + { + "epoch": 0.15664659118284752, + "grad_norm": 0.13722474873065948, + "learning_rate": 0.000190050494641385, + "loss": 1.1215, + "step": 2032 + }, + { + "epoch": 0.15680077089857866, + "grad_norm": 0.13500674068927765, + "learning_rate": 0.0001900401896125309, + "loss": 1.0776, + "step": 2034 + }, + { + "epoch": 0.1569549506143098, + "grad_norm": 0.11917294561862946, + "learning_rate": 0.00019002988458367685, + "loss": 1.0698, + "step": 2036 + }, + { + "epoch": 0.15710913033004095, + "grad_norm": 0.12245581299066544, + "learning_rate": 0.00019001957955482276, + "loss": 1.0166, + "step": 2038 + }, + { + "epoch": 0.1572633100457721, + "grad_norm": 0.12556669116020203, + "learning_rate": 0.00019000927452596868, + "loss": 1.0846, + "step": 2040 + }, + { + "epoch": 0.15741748976150324, + "grad_norm": 0.13316373527050018, + "learning_rate": 0.0001899989694971146, + "loss": 1.0566, + "step": 2042 + }, + { + "epoch": 0.1575716694772344, + "grad_norm": 0.1296815425157547, + "learning_rate": 0.0001899886644682605, + "loss": 1.0824, + "step": 2044 + }, + { + "epoch": 0.15772584919296556, + "grad_norm": 0.1288246214389801, + "learning_rate": 0.00018997835943940645, + "loss": 1.0974, + "step": 2046 + }, + { + "epoch": 0.1578800289086967, + "grad_norm": 0.1185479462146759, + "learning_rate": 0.00018996805441055237, + "loss": 1.1443, + "step": 2048 + }, + { + "epoch": 0.15803420862442785, + "grad_norm": 0.12504369020462036, + "learning_rate": 0.00018995774938169828, + "loss": 1.0899, + "step": 2050 + }, + { + "epoch": 0.158188388340159, + "grad_norm": 0.1266452521085739, + "learning_rate": 0.0001899474443528442, + "loss": 1.0654, + "step": 2052 + }, + { + "epoch": 0.15834256805589014, + "grad_norm": 0.13447126746177673, + "learning_rate": 0.0001899371393239901, + "loss": 1.0649, + "step": 2054 + }, + { + "epoch": 0.1584967477716213, + "grad_norm": 0.1446131467819214, + "learning_rate": 0.00018992683429513603, + "loss": 1.1439, + "step": 2056 + }, + { + "epoch": 0.15865092748735243, + "grad_norm": 0.12688389420509338, + "learning_rate": 0.00018991652926628197, + "loss": 1.0262, + "step": 2058 + }, + { + "epoch": 0.1588051072030836, + "grad_norm": 0.12581713497638702, + "learning_rate": 0.00018990622423742788, + "loss": 1.0723, + "step": 2060 + }, + { + "epoch": 0.15895928691881475, + "grad_norm": 0.15745951235294342, + "learning_rate": 0.0001898959192085738, + "loss": 1.1038, + "step": 2062 + }, + { + "epoch": 0.1591134666345459, + "grad_norm": 0.14457587897777557, + "learning_rate": 0.0001898856141797197, + "loss": 1.1072, + "step": 2064 + }, + { + "epoch": 0.15926764635027704, + "grad_norm": 0.11454683542251587, + "learning_rate": 0.00018987530915086563, + "loss": 1.0605, + "step": 2066 + }, + { + "epoch": 0.1594218260660082, + "grad_norm": 0.1137547716498375, + "learning_rate": 0.00018986500412201157, + "loss": 1.0405, + "step": 2068 + }, + { + "epoch": 0.15957600578173933, + "grad_norm": 0.1220378428697586, + "learning_rate": 0.00018985469909315748, + "loss": 1.086, + "step": 2070 + }, + { + "epoch": 0.15973018549747048, + "grad_norm": 0.13579098880290985, + "learning_rate": 0.0001898443940643034, + "loss": 1.0334, + "step": 2072 + }, + { + "epoch": 0.15988436521320165, + "grad_norm": 0.1529407948255539, + "learning_rate": 0.00018983408903544931, + "loss": 1.0614, + "step": 2074 + }, + { + "epoch": 0.1600385449289328, + "grad_norm": 0.13769444823265076, + "learning_rate": 0.00018982378400659523, + "loss": 1.1212, + "step": 2076 + }, + { + "epoch": 0.16019272464466394, + "grad_norm": 0.12095335125923157, + "learning_rate": 0.00018981347897774114, + "loss": 1.047, + "step": 2078 + }, + { + "epoch": 0.1603469043603951, + "grad_norm": 0.12483233958482742, + "learning_rate": 0.00018980317394888706, + "loss": 1.0808, + "step": 2080 + }, + { + "epoch": 0.16050108407612623, + "grad_norm": 0.12451382726430893, + "learning_rate": 0.00018979286892003297, + "loss": 1.1259, + "step": 2082 + }, + { + "epoch": 0.16065526379185738, + "grad_norm": 0.12540730834007263, + "learning_rate": 0.0001897825638911789, + "loss": 1.0761, + "step": 2084 + }, + { + "epoch": 0.16080944350758852, + "grad_norm": 0.12948516011238098, + "learning_rate": 0.0001897722588623248, + "loss": 1.0621, + "step": 2086 + }, + { + "epoch": 0.16096362322331967, + "grad_norm": 0.1349886953830719, + "learning_rate": 0.00018976195383347075, + "loss": 1.0549, + "step": 2088 + }, + { + "epoch": 0.16111780293905084, + "grad_norm": 0.1249813437461853, + "learning_rate": 0.00018975164880461666, + "loss": 1.0828, + "step": 2090 + }, + { + "epoch": 0.161271982654782, + "grad_norm": 0.1299104243516922, + "learning_rate": 0.00018974134377576258, + "loss": 1.097, + "step": 2092 + }, + { + "epoch": 0.16142616237051313, + "grad_norm": 0.13004744052886963, + "learning_rate": 0.0001897310387469085, + "loss": 1.0417, + "step": 2094 + }, + { + "epoch": 0.16158034208624428, + "grad_norm": 0.11553830653429031, + "learning_rate": 0.0001897207337180544, + "loss": 1.0563, + "step": 2096 + }, + { + "epoch": 0.16173452180197542, + "grad_norm": 0.12000396102666855, + "learning_rate": 0.00018971042868920035, + "loss": 1.077, + "step": 2098 + }, + { + "epoch": 0.16188870151770657, + "grad_norm": 0.13707685470581055, + "learning_rate": 0.00018970012366034626, + "loss": 1.0994, + "step": 2100 + }, + { + "epoch": 0.16188870151770657, + "eval_loss": 1.0858707427978516, + "eval_runtime": 185.7188, + "eval_samples_per_second": 91.229, + "eval_steps_per_second": 1.427, + "step": 2100 + } + ], + "logging_steps": 2, + "max_steps": 38916, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.132999221824717e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}