| { | |
| "best_global_step": 2100, | |
| "best_metric": 1.0858707427978516, | |
| "best_model_checkpoint": "./outputs/checkpoint-2100", | |
| "epoch": 0.16188870151770657, | |
| "eval_steps": 100, | |
| "global_step": 2100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00015417971573114913, | |
| "grad_norm": 1.2087944746017456, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 1.8689, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.00030835943146229826, | |
| "grad_norm": 1.2666666507720947, | |
| "learning_rate": 6e-06, | |
| "loss": 1.7785, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.00046253914719344736, | |
| "grad_norm": 0.7307026982307434, | |
| "learning_rate": 1e-05, | |
| "loss": 1.6809, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0006167188629245965, | |
| "grad_norm": 1.2569252252578735, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 1.9048, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.0007708985786557456, | |
| "grad_norm": 0.9572980403900146, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.7574, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.0009250782943868947, | |
| "grad_norm": 0.9918506145477295, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 1.858, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0010792580101180438, | |
| "grad_norm": 0.9316955208778381, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 1.8238, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.001233437725849193, | |
| "grad_norm": 0.8265096545219421, | |
| "learning_rate": 3e-05, | |
| "loss": 1.6852, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.001387617441580342, | |
| "grad_norm": 0.900516152381897, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 1.8227, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0015417971573114912, | |
| "grad_norm": 0.9343056678771973, | |
| "learning_rate": 3.8e-05, | |
| "loss": 1.7732, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0016959768730426404, | |
| "grad_norm": 0.8314495086669922, | |
| "learning_rate": 4.2e-05, | |
| "loss": 1.732, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.0018501565887737894, | |
| "grad_norm": 0.8370314240455627, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 1.6725, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.0020043363045049384, | |
| "grad_norm": 0.6678845286369324, | |
| "learning_rate": 5e-05, | |
| "loss": 1.5638, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0021585160202360876, | |
| "grad_norm": 0.6469596028327942, | |
| "learning_rate": 5.4000000000000005e-05, | |
| "loss": 1.6414, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.002312695735967237, | |
| "grad_norm": 1.1161589622497559, | |
| "learning_rate": 5.8e-05, | |
| "loss": 1.6015, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.002466875451698386, | |
| "grad_norm": 0.6085391044616699, | |
| "learning_rate": 6.2e-05, | |
| "loss": 1.4577, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.0026210551674295353, | |
| "grad_norm": 0.7159522175788879, | |
| "learning_rate": 6.6e-05, | |
| "loss": 1.4667, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.002775234883160684, | |
| "grad_norm": 0.67247074842453, | |
| "learning_rate": 7e-05, | |
| "loss": 1.5619, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.0029294145988918332, | |
| "grad_norm": 0.6272625923156738, | |
| "learning_rate": 7.4e-05, | |
| "loss": 1.322, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.0030835943146229824, | |
| "grad_norm": 0.7291163206100464, | |
| "learning_rate": 7.800000000000001e-05, | |
| "loss": 1.3936, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0032377740303541317, | |
| "grad_norm": 0.4980190396308899, | |
| "learning_rate": 8.2e-05, | |
| "loss": 1.3322, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.003391953746085281, | |
| "grad_norm": 1.032578945159912, | |
| "learning_rate": 8.6e-05, | |
| "loss": 1.3657, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.0035461334618164296, | |
| "grad_norm": 0.5118615031242371, | |
| "learning_rate": 9e-05, | |
| "loss": 1.2866, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.003700313177547579, | |
| "grad_norm": 0.5234407782554626, | |
| "learning_rate": 9.4e-05, | |
| "loss": 1.2806, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.003854492893278728, | |
| "grad_norm": 0.49764135479927063, | |
| "learning_rate": 9.8e-05, | |
| "loss": 1.2004, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.004008672609009877, | |
| "grad_norm": 0.34377485513687134, | |
| "learning_rate": 0.00010200000000000001, | |
| "loss": 1.1947, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.0041628523247410265, | |
| "grad_norm": 0.41426530480384827, | |
| "learning_rate": 0.00010600000000000002, | |
| "loss": 1.2689, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.004317032040472175, | |
| "grad_norm": 0.5027992129325867, | |
| "learning_rate": 0.00011000000000000002, | |
| "loss": 1.2249, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.004471211756203325, | |
| "grad_norm": 0.44335752725601196, | |
| "learning_rate": 0.00011399999999999999, | |
| "loss": 1.2771, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.004625391471934474, | |
| "grad_norm": 0.3176646828651428, | |
| "learning_rate": 0.000118, | |
| "loss": 1.1873, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0047795711876656224, | |
| "grad_norm": 0.24802716076374054, | |
| "learning_rate": 0.000122, | |
| "loss": 1.1989, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.004933750903396772, | |
| "grad_norm": 0.23831751942634583, | |
| "learning_rate": 0.000126, | |
| "loss": 1.1093, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.005087930619127921, | |
| "grad_norm": 0.24024009704589844, | |
| "learning_rate": 0.00013000000000000002, | |
| "loss": 1.2196, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.0052421103348590705, | |
| "grad_norm": 0.2745237350463867, | |
| "learning_rate": 0.000134, | |
| "loss": 1.1802, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.005396290050590219, | |
| "grad_norm": 0.27817806601524353, | |
| "learning_rate": 0.000138, | |
| "loss": 1.1939, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.005550469766321368, | |
| "grad_norm": 0.19907328486442566, | |
| "learning_rate": 0.000142, | |
| "loss": 1.2061, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.005704649482052518, | |
| "grad_norm": 0.18879663944244385, | |
| "learning_rate": 0.000146, | |
| "loss": 1.2149, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.0058588291977836665, | |
| "grad_norm": 0.21456782519817352, | |
| "learning_rate": 0.00015000000000000001, | |
| "loss": 1.1726, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.006013008913514816, | |
| "grad_norm": 0.23913143575191498, | |
| "learning_rate": 0.000154, | |
| "loss": 1.148, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.006167188629245965, | |
| "grad_norm": 0.2148526906967163, | |
| "learning_rate": 0.00015800000000000002, | |
| "loss": 1.1925, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.006321368344977114, | |
| "grad_norm": 0.2392999231815338, | |
| "learning_rate": 0.000162, | |
| "loss": 1.1488, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.006475548060708263, | |
| "grad_norm": 0.16503232717514038, | |
| "learning_rate": 0.000166, | |
| "loss": 1.1555, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.006629727776439412, | |
| "grad_norm": 0.1844739466905594, | |
| "learning_rate": 0.00017, | |
| "loss": 1.1934, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.006783907492170562, | |
| "grad_norm": 0.23832857608795166, | |
| "learning_rate": 0.000174, | |
| "loss": 1.1129, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.0069380872079017105, | |
| "grad_norm": 0.8846365809440613, | |
| "learning_rate": 0.00017800000000000002, | |
| "loss": 1.1028, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.007092266923632859, | |
| "grad_norm": 0.187076598405838, | |
| "learning_rate": 0.000182, | |
| "loss": 1.1, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.007246446639364009, | |
| "grad_norm": 0.1795521378517151, | |
| "learning_rate": 0.00018600000000000002, | |
| "loss": 1.1478, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.007400626355095158, | |
| "grad_norm": 0.199871227145195, | |
| "learning_rate": 0.00019, | |
| "loss": 1.1223, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.007554806070826307, | |
| "grad_norm": 0.17832662165164948, | |
| "learning_rate": 0.000194, | |
| "loss": 1.0909, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.007708985786557456, | |
| "grad_norm": 0.17023932933807373, | |
| "learning_rate": 0.00019800000000000002, | |
| "loss": 1.1526, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.007708985786557456, | |
| "eval_loss": 1.1401352882385254, | |
| "eval_runtime": 185.6269, | |
| "eval_samples_per_second": 91.274, | |
| "eval_steps_per_second": 1.428, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.007863165502288605, | |
| "grad_norm": 0.17429223656654358, | |
| "learning_rate": 0.00019999484748557298, | |
| "loss": 1.1597, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.008017345218019754, | |
| "grad_norm": 0.16158349812030792, | |
| "learning_rate": 0.0001999845424567189, | |
| "loss": 1.1297, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.008171524933750904, | |
| "grad_norm": 0.15818771719932556, | |
| "learning_rate": 0.0001999742374278648, | |
| "loss": 1.083, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.008325704649482053, | |
| "grad_norm": 0.1591726392507553, | |
| "learning_rate": 0.00019996393239901073, | |
| "loss": 1.086, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.008479884365213202, | |
| "grad_norm": 0.174184650182724, | |
| "learning_rate": 0.00019995362737015664, | |
| "loss": 1.0769, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.00863406408094435, | |
| "grad_norm": 0.15928815305233002, | |
| "learning_rate": 0.00019994332234130258, | |
| "loss": 1.1315, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.0087882437966755, | |
| "grad_norm": 0.19639264047145844, | |
| "learning_rate": 0.0001999330173124485, | |
| "loss": 1.1339, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.00894242351240665, | |
| "grad_norm": 0.1639835238456726, | |
| "learning_rate": 0.0001999227122835944, | |
| "loss": 1.0836, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.009096603228137799, | |
| "grad_norm": 0.18691964447498322, | |
| "learning_rate": 0.00019991240725474033, | |
| "loss": 1.2109, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.009250782943868947, | |
| "grad_norm": 0.188096821308136, | |
| "learning_rate": 0.00019990210222588624, | |
| "loss": 1.1778, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.009404962659600096, | |
| "grad_norm": 0.1527150571346283, | |
| "learning_rate": 0.00019989179719703218, | |
| "loss": 1.0977, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.009559142375331245, | |
| "grad_norm": 0.1705218255519867, | |
| "learning_rate": 0.0001998814921681781, | |
| "loss": 1.1333, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.009713322091062395, | |
| "grad_norm": 0.1888928860425949, | |
| "learning_rate": 0.00019987118713932401, | |
| "loss": 1.1843, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.009867501806793544, | |
| "grad_norm": 0.1778104603290558, | |
| "learning_rate": 0.00019986088211046993, | |
| "loss": 1.0766, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.010021681522524693, | |
| "grad_norm": 0.15807992219924927, | |
| "learning_rate": 0.00019985057708161584, | |
| "loss": 1.0449, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.010175861238255842, | |
| "grad_norm": 0.16706159710884094, | |
| "learning_rate": 0.00019984027205276176, | |
| "loss": 1.0644, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.01033004095398699, | |
| "grad_norm": 0.16455501317977905, | |
| "learning_rate": 0.00019982996702390767, | |
| "loss": 1.1479, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.010484220669718141, | |
| "grad_norm": 0.17258939146995544, | |
| "learning_rate": 0.0001998196619950536, | |
| "loss": 1.0614, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.01063840038544929, | |
| "grad_norm": 0.15501369535923004, | |
| "learning_rate": 0.0001998093569661995, | |
| "loss": 1.1045, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.010792580101180439, | |
| "grad_norm": 0.1534334272146225, | |
| "learning_rate": 0.00019979905193734542, | |
| "loss": 1.1035, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.010946759816911587, | |
| "grad_norm": 0.14120443165302277, | |
| "learning_rate": 0.00019978874690849136, | |
| "loss": 1.0618, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.011100939532642736, | |
| "grad_norm": 0.17808520793914795, | |
| "learning_rate": 0.00019977844187963728, | |
| "loss": 1.1687, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.011255119248373887, | |
| "grad_norm": 0.16697613894939423, | |
| "learning_rate": 0.0001997681368507832, | |
| "loss": 1.0979, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.011409298964105035, | |
| "grad_norm": 0.16491086781024933, | |
| "learning_rate": 0.0001997578318219291, | |
| "loss": 1.1219, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.011563478679836184, | |
| "grad_norm": 0.15342313051223755, | |
| "learning_rate": 0.00019974752679307502, | |
| "loss": 1.1169, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.011717658395567333, | |
| "grad_norm": 0.1539286971092224, | |
| "learning_rate": 0.00019973722176422093, | |
| "loss": 1.1288, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.011871838111298482, | |
| "grad_norm": 0.15605852007865906, | |
| "learning_rate": 0.00019972691673536688, | |
| "loss": 1.0445, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.012026017827029632, | |
| "grad_norm": 0.14324098825454712, | |
| "learning_rate": 0.0001997166117065128, | |
| "loss": 1.1309, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.012180197542760781, | |
| "grad_norm": 0.21045701205730438, | |
| "learning_rate": 0.0001997063066776587, | |
| "loss": 1.0946, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.01233437725849193, | |
| "grad_norm": 0.16019922494888306, | |
| "learning_rate": 0.00019969600164880462, | |
| "loss": 1.11, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.012488556974223079, | |
| "grad_norm": 0.15740078687667847, | |
| "learning_rate": 0.00019968569661995054, | |
| "loss": 1.112, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.012642736689954227, | |
| "grad_norm": 0.16974380612373352, | |
| "learning_rate": 0.00019967539159109648, | |
| "loss": 1.1279, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.012796916405685378, | |
| "grad_norm": 0.16405288875102997, | |
| "learning_rate": 0.0001996650865622424, | |
| "loss": 1.0952, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.012951096121416527, | |
| "grad_norm": 0.16120509803295135, | |
| "learning_rate": 0.0001996547815333883, | |
| "loss": 1.1203, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.013105275837147675, | |
| "grad_norm": 0.17402276396751404, | |
| "learning_rate": 0.00019964447650453422, | |
| "loss": 1.0991, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.013259455552878824, | |
| "grad_norm": 0.18349111080169678, | |
| "learning_rate": 0.00019963417147568014, | |
| "loss": 1.1394, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.013413635268609973, | |
| "grad_norm": 0.14613087475299835, | |
| "learning_rate": 0.00019962386644682608, | |
| "loss": 1.1357, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.013567814984341123, | |
| "grad_norm": 0.142988383769989, | |
| "learning_rate": 0.000199613561417972, | |
| "loss": 1.0169, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.013721994700072272, | |
| "grad_norm": 0.14817160367965698, | |
| "learning_rate": 0.0001996032563891179, | |
| "loss": 1.1238, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.013876174415803421, | |
| "grad_norm": 0.15391133725643158, | |
| "learning_rate": 0.00019959295136026382, | |
| "loss": 1.0712, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.01403035413153457, | |
| "grad_norm": 0.1766846477985382, | |
| "learning_rate": 0.00019958264633140974, | |
| "loss": 1.1422, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.014184533847265719, | |
| "grad_norm": 0.16789212822914124, | |
| "learning_rate": 0.00019957234130255565, | |
| "loss": 1.1266, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.014338713562996869, | |
| "grad_norm": 0.1527165323495865, | |
| "learning_rate": 0.00019956203627370157, | |
| "loss": 1.0667, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.014492893278728018, | |
| "grad_norm": 0.1772206574678421, | |
| "learning_rate": 0.00019955173124484748, | |
| "loss": 1.1182, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.014647072994459167, | |
| "grad_norm": 0.15008313953876495, | |
| "learning_rate": 0.0001995414262159934, | |
| "loss": 1.0382, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.014801252710190315, | |
| "grad_norm": 0.16365988552570343, | |
| "learning_rate": 0.00019953112118713931, | |
| "loss": 1.1262, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.014955432425921464, | |
| "grad_norm": 0.14952193200588226, | |
| "learning_rate": 0.00019952081615828526, | |
| "loss": 1.1245, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.015109612141652615, | |
| "grad_norm": 0.15425263345241547, | |
| "learning_rate": 0.00019951051112943117, | |
| "loss": 1.1452, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.015263791857383763, | |
| "grad_norm": 0.1567617654800415, | |
| "learning_rate": 0.00019950020610057709, | |
| "loss": 1.0392, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.015417971573114912, | |
| "grad_norm": 0.14292609691619873, | |
| "learning_rate": 0.000199489901071723, | |
| "loss": 1.0728, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.015417971573114912, | |
| "eval_loss": 1.1127630472183228, | |
| "eval_runtime": 185.2528, | |
| "eval_samples_per_second": 91.459, | |
| "eval_steps_per_second": 1.43, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.015572151288846061, | |
| "grad_norm": 0.15465517342090607, | |
| "learning_rate": 0.00019947959604286892, | |
| "loss": 1.0596, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.01572633100457721, | |
| "grad_norm": 0.16749607026576996, | |
| "learning_rate": 0.00019946929101401486, | |
| "loss": 1.1005, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.01588051072030836, | |
| "grad_norm": 0.15854287147521973, | |
| "learning_rate": 0.00019945898598516077, | |
| "loss": 1.0963, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.016034690436039507, | |
| "grad_norm": 0.1457831859588623, | |
| "learning_rate": 0.0001994486809563067, | |
| "loss": 1.1149, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.016188870151770656, | |
| "grad_norm": 0.15744629502296448, | |
| "learning_rate": 0.0001994383759274526, | |
| "loss": 1.0789, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.01634304986750181, | |
| "grad_norm": 0.13411423563957214, | |
| "learning_rate": 0.00019942807089859852, | |
| "loss": 1.0641, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.016497229583232957, | |
| "grad_norm": 0.1575399488210678, | |
| "learning_rate": 0.00019941776586974446, | |
| "loss": 1.0888, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.016651409298964106, | |
| "grad_norm": 0.14619529247283936, | |
| "learning_rate": 0.00019940746084089037, | |
| "loss": 1.081, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.016805589014695255, | |
| "grad_norm": 0.15578237175941467, | |
| "learning_rate": 0.0001993971558120363, | |
| "loss": 1.1434, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.016959768730426403, | |
| "grad_norm": 0.1516629308462143, | |
| "learning_rate": 0.0001993868507831822, | |
| "loss": 1.0909, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.017113948446157552, | |
| "grad_norm": 0.15613436698913574, | |
| "learning_rate": 0.00019937654575432812, | |
| "loss": 1.0999, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.0172681281618887, | |
| "grad_norm": 0.14825573563575745, | |
| "learning_rate": 0.00019936624072547406, | |
| "loss": 1.0827, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.01742230787761985, | |
| "grad_norm": 0.1624906212091446, | |
| "learning_rate": 0.00019935593569661998, | |
| "loss": 1.0856, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.017576487593351, | |
| "grad_norm": 0.1380940079689026, | |
| "learning_rate": 0.0001993456306677659, | |
| "loss": 1.0514, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.017730667309082147, | |
| "grad_norm": 0.13712120056152344, | |
| "learning_rate": 0.0001993353256389118, | |
| "loss": 1.0977, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.0178848470248133, | |
| "grad_norm": 0.1448957622051239, | |
| "learning_rate": 0.00019932502061005772, | |
| "loss": 1.0729, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.01803902674054445, | |
| "grad_norm": 0.13421876728534698, | |
| "learning_rate": 0.00019931471558120364, | |
| "loss": 1.0879, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.018193206456275597, | |
| "grad_norm": 0.16884732246398926, | |
| "learning_rate": 0.00019930441055234955, | |
| "loss": 1.1159, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.018347386172006746, | |
| "grad_norm": 0.14634890854358673, | |
| "learning_rate": 0.00019929410552349547, | |
| "loss": 1.0568, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.018501565887737895, | |
| "grad_norm": 0.16796648502349854, | |
| "learning_rate": 0.00019928380049464138, | |
| "loss": 1.0944, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.018655745603469043, | |
| "grad_norm": 0.13724717497825623, | |
| "learning_rate": 0.0001992734954657873, | |
| "loss": 1.0609, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.018809925319200192, | |
| "grad_norm": 0.14133594930171967, | |
| "learning_rate": 0.0001992631904369332, | |
| "loss": 1.0879, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.01896410503493134, | |
| "grad_norm": 0.1611246019601822, | |
| "learning_rate": 0.00019925288540807915, | |
| "loss": 1.0681, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.01911828475066249, | |
| "grad_norm": 0.17420877516269684, | |
| "learning_rate": 0.00019924258037922507, | |
| "loss": 1.1336, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.01927246446639364, | |
| "grad_norm": 0.13766029477119446, | |
| "learning_rate": 0.00019923227535037098, | |
| "loss": 1.075, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.01942664418212479, | |
| "grad_norm": 0.1691662222146988, | |
| "learning_rate": 0.0001992219703215169, | |
| "loss": 1.1369, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.01958082389785594, | |
| "grad_norm": 0.14959432184696198, | |
| "learning_rate": 0.0001992116652926628, | |
| "loss": 1.1129, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.01973500361358709, | |
| "grad_norm": 0.14996406435966492, | |
| "learning_rate": 0.00019920136026380875, | |
| "loss": 1.0304, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.019889183329318237, | |
| "grad_norm": 0.13211801648139954, | |
| "learning_rate": 0.00019919105523495467, | |
| "loss": 1.0652, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.020043363045049386, | |
| "grad_norm": 0.16041967272758484, | |
| "learning_rate": 0.00019918075020610058, | |
| "loss": 1.077, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.020197542760780535, | |
| "grad_norm": 0.1524546593427658, | |
| "learning_rate": 0.0001991704451772465, | |
| "loss": 1.1176, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.020351722476511683, | |
| "grad_norm": 0.16032540798187256, | |
| "learning_rate": 0.00019916014014839241, | |
| "loss": 1.0736, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.020505902192242832, | |
| "grad_norm": 0.17891019582748413, | |
| "learning_rate": 0.00019914983511953836, | |
| "loss": 1.1435, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.02066008190797398, | |
| "grad_norm": 0.14484059810638428, | |
| "learning_rate": 0.00019913953009068427, | |
| "loss": 1.0356, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.02081426162370513, | |
| "grad_norm": 0.14321155846118927, | |
| "learning_rate": 0.00019912922506183019, | |
| "loss": 1.0536, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.020968441339436282, | |
| "grad_norm": 0.17357808351516724, | |
| "learning_rate": 0.0001991189200329761, | |
| "loss": 1.171, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.02112262105516743, | |
| "grad_norm": 0.13990800082683563, | |
| "learning_rate": 0.00019910861500412202, | |
| "loss": 1.0946, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.02127680077089858, | |
| "grad_norm": 0.16634231805801392, | |
| "learning_rate": 0.00019909830997526796, | |
| "loss": 1.1029, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.02143098048662973, | |
| "grad_norm": 0.16322381794452667, | |
| "learning_rate": 0.00019908800494641387, | |
| "loss": 1.0688, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.021585160202360877, | |
| "grad_norm": 0.1652844250202179, | |
| "learning_rate": 0.0001990776999175598, | |
| "loss": 1.1237, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.021739339918092026, | |
| "grad_norm": 0.14457885921001434, | |
| "learning_rate": 0.0001990673948887057, | |
| "loss": 1.1995, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.021893519633823175, | |
| "grad_norm": 0.15549878776073456, | |
| "learning_rate": 0.00019905708985985162, | |
| "loss": 1.0475, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.022047699349554323, | |
| "grad_norm": 0.15715502202510834, | |
| "learning_rate": 0.00019904678483099756, | |
| "loss": 1.1211, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.022201879065285472, | |
| "grad_norm": 0.14022529125213623, | |
| "learning_rate": 0.00019903647980214347, | |
| "loss": 1.1056, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.02235605878101662, | |
| "grad_norm": 0.13293786346912384, | |
| "learning_rate": 0.0001990261747732894, | |
| "loss": 1.0877, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.022510238496747773, | |
| "grad_norm": 0.14625073969364166, | |
| "learning_rate": 0.0001990158697444353, | |
| "loss": 1.0375, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.022664418212478922, | |
| "grad_norm": 0.1417943835258484, | |
| "learning_rate": 0.0001990055647155812, | |
| "loss": 1.091, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.02281859792821007, | |
| "grad_norm": 0.1519964039325714, | |
| "learning_rate": 0.00019899525968672713, | |
| "loss": 1.0396, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.02297277764394122, | |
| "grad_norm": 0.1676655411720276, | |
| "learning_rate": 0.00019898495465787305, | |
| "loss": 1.1249, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.02312695735967237, | |
| "grad_norm": 0.1487220674753189, | |
| "learning_rate": 0.00019897464962901896, | |
| "loss": 1.1768, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.02312695735967237, | |
| "eval_loss": 1.1061022281646729, | |
| "eval_runtime": 185.239, | |
| "eval_samples_per_second": 91.466, | |
| "eval_steps_per_second": 1.431, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.023281137075403517, | |
| "grad_norm": 0.1399739533662796, | |
| "learning_rate": 0.00019896434460016488, | |
| "loss": 1.0962, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.023435316791134666, | |
| "grad_norm": 0.15282337367534637, | |
| "learning_rate": 0.0001989540395713108, | |
| "loss": 1.1688, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.023589496506865815, | |
| "grad_norm": 0.15459619462490082, | |
| "learning_rate": 0.00019894373454245674, | |
| "loss": 1.0216, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.023743676222596963, | |
| "grad_norm": 0.15799634158611298, | |
| "learning_rate": 0.00019893342951360265, | |
| "loss": 1.1429, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.023897855938328112, | |
| "grad_norm": 0.1343819946050644, | |
| "learning_rate": 0.00019892312448474857, | |
| "loss": 1.0959, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.024052035654059264, | |
| "grad_norm": 0.14791317284107208, | |
| "learning_rate": 0.00019891281945589448, | |
| "loss": 1.0636, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.024206215369790413, | |
| "grad_norm": 0.1442137360572815, | |
| "learning_rate": 0.0001989025144270404, | |
| "loss": 1.055, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.024360395085521562, | |
| "grad_norm": 0.14649145305156708, | |
| "learning_rate": 0.00019889220939818634, | |
| "loss": 1.0906, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.02451457480125271, | |
| "grad_norm": 0.14234665036201477, | |
| "learning_rate": 0.00019888190436933225, | |
| "loss": 1.0853, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.02466875451698386, | |
| "grad_norm": 0.1419668048620224, | |
| "learning_rate": 0.00019887159934047817, | |
| "loss": 1.0296, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.02482293423271501, | |
| "grad_norm": 0.14730845391750336, | |
| "learning_rate": 0.00019886129431162408, | |
| "loss": 1.0421, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.024977113948446157, | |
| "grad_norm": 0.1400081068277359, | |
| "learning_rate": 0.00019885098928277, | |
| "loss": 1.0291, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.025131293664177306, | |
| "grad_norm": 0.15542668104171753, | |
| "learning_rate": 0.0001988406842539159, | |
| "loss": 1.0597, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.025285473379908455, | |
| "grad_norm": 0.14521440863609314, | |
| "learning_rate": 0.00019883037922506185, | |
| "loss": 1.0491, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.025439653095639603, | |
| "grad_norm": 0.16224826872348785, | |
| "learning_rate": 0.00019882007419620777, | |
| "loss": 1.1031, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.025593832811370756, | |
| "grad_norm": 0.15028877556324005, | |
| "learning_rate": 0.00019880976916735368, | |
| "loss": 1.1154, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.025748012527101904, | |
| "grad_norm": 0.12962941825389862, | |
| "learning_rate": 0.0001987994641384996, | |
| "loss": 1.0363, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.025902192242833053, | |
| "grad_norm": 0.14908359944820404, | |
| "learning_rate": 0.0001987891591096455, | |
| "loss": 1.1513, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.026056371958564202, | |
| "grad_norm": 0.15441828966140747, | |
| "learning_rate": 0.00019877885408079146, | |
| "loss": 1.1303, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.02621055167429535, | |
| "grad_norm": 0.12669101357460022, | |
| "learning_rate": 0.00019876854905193737, | |
| "loss": 1.0875, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.0263647313900265, | |
| "grad_norm": 0.13190661370754242, | |
| "learning_rate": 0.00019875824402308329, | |
| "loss": 1.0778, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.02651891110575765, | |
| "grad_norm": 0.14043989777565002, | |
| "learning_rate": 0.0001987479389942292, | |
| "loss": 1.1011, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.026673090821488797, | |
| "grad_norm": 0.13694870471954346, | |
| "learning_rate": 0.00019873763396537512, | |
| "loss": 1.0532, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.026827270537219946, | |
| "grad_norm": 0.15089921653270721, | |
| "learning_rate": 0.00019872732893652103, | |
| "loss": 1.1292, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.026981450252951095, | |
| "grad_norm": 0.14839838445186615, | |
| "learning_rate": 0.00019871702390766694, | |
| "loss": 1.0275, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.027135629968682247, | |
| "grad_norm": 0.16198500990867615, | |
| "learning_rate": 0.00019870671887881286, | |
| "loss": 1.1453, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.027289809684413396, | |
| "grad_norm": 0.14694632589817047, | |
| "learning_rate": 0.00019869641384995877, | |
| "loss": 1.129, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.027443989400144544, | |
| "grad_norm": 0.16091379523277283, | |
| "learning_rate": 0.0001986861088211047, | |
| "loss": 1.1186, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.027598169115875693, | |
| "grad_norm": 0.144720658659935, | |
| "learning_rate": 0.00019867580379225063, | |
| "loss": 1.0224, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.027752348831606842, | |
| "grad_norm": 0.13851307332515717, | |
| "learning_rate": 0.00019866549876339655, | |
| "loss": 1.1421, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.02790652854733799, | |
| "grad_norm": 0.13124969601631165, | |
| "learning_rate": 0.00019865519373454246, | |
| "loss": 1.0938, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.02806070826306914, | |
| "grad_norm": 0.14723828434944153, | |
| "learning_rate": 0.00019864488870568838, | |
| "loss": 1.1335, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.02821488797880029, | |
| "grad_norm": 0.17669795453548431, | |
| "learning_rate": 0.0001986345836768343, | |
| "loss": 1.0765, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.028369067694531437, | |
| "grad_norm": 0.1457260102033615, | |
| "learning_rate": 0.00019862427864798023, | |
| "loss": 1.1073, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.028523247410262586, | |
| "grad_norm": 0.13594554364681244, | |
| "learning_rate": 0.00019861397361912615, | |
| "loss": 1.0587, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.028677427125993738, | |
| "grad_norm": 0.13798941671848297, | |
| "learning_rate": 0.00019860366859027206, | |
| "loss": 1.0833, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.028831606841724887, | |
| "grad_norm": 0.15587519109249115, | |
| "learning_rate": 0.00019859336356141798, | |
| "loss": 1.0287, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.028985786557456036, | |
| "grad_norm": 0.16585086286067963, | |
| "learning_rate": 0.0001985830585325639, | |
| "loss": 1.1786, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.029139966273187184, | |
| "grad_norm": 0.1444484293460846, | |
| "learning_rate": 0.00019857275350370983, | |
| "loss": 1.1793, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.029294145988918333, | |
| "grad_norm": 0.14413981139659882, | |
| "learning_rate": 0.00019856244847485575, | |
| "loss": 1.1141, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.029448325704649482, | |
| "grad_norm": 0.142032191157341, | |
| "learning_rate": 0.00019855214344600166, | |
| "loss": 1.1033, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.02960250542038063, | |
| "grad_norm": 0.1490195393562317, | |
| "learning_rate": 0.00019854183841714758, | |
| "loss": 1.1592, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.02975668513611178, | |
| "grad_norm": 0.1408643275499344, | |
| "learning_rate": 0.0001985315333882935, | |
| "loss": 1.1505, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.02991086485184293, | |
| "grad_norm": 0.12526237964630127, | |
| "learning_rate": 0.00019852122835943944, | |
| "loss": 1.1027, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.030065044567574077, | |
| "grad_norm": 0.1339711844921112, | |
| "learning_rate": 0.00019851092333058535, | |
| "loss": 1.1238, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.03021922428330523, | |
| "grad_norm": 0.13032345473766327, | |
| "learning_rate": 0.00019850061830173127, | |
| "loss": 1.1121, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.030373403999036378, | |
| "grad_norm": 0.15815846621990204, | |
| "learning_rate": 0.00019849031327287718, | |
| "loss": 1.168, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.030527583714767527, | |
| "grad_norm": 0.14245116710662842, | |
| "learning_rate": 0.0001984800082440231, | |
| "loss": 1.0436, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.030681763430498676, | |
| "grad_norm": 0.15660050511360168, | |
| "learning_rate": 0.000198469703215169, | |
| "loss": 1.158, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.030835943146229824, | |
| "grad_norm": 0.1654158979654312, | |
| "learning_rate": 0.00019845939818631493, | |
| "loss": 1.0802, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.030835943146229824, | |
| "eval_loss": 1.1026971340179443, | |
| "eval_runtime": 185.7295, | |
| "eval_samples_per_second": 91.224, | |
| "eval_steps_per_second": 1.427, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.030990122861960973, | |
| "grad_norm": 0.13845407962799072, | |
| "learning_rate": 0.00019844909315746084, | |
| "loss": 1.1055, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.031144302577692122, | |
| "grad_norm": 0.14852891862392426, | |
| "learning_rate": 0.00019843878812860676, | |
| "loss": 1.0983, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.031298482293423274, | |
| "grad_norm": 0.13408593833446503, | |
| "learning_rate": 0.00019842848309975267, | |
| "loss": 1.1063, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.03145266200915442, | |
| "grad_norm": 0.14041072130203247, | |
| "learning_rate": 0.00019841817807089859, | |
| "loss": 1.0327, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.03160684172488557, | |
| "grad_norm": 0.16119754314422607, | |
| "learning_rate": 0.00019840787304204453, | |
| "loss": 1.1, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.03176102144061672, | |
| "grad_norm": 0.14471223950386047, | |
| "learning_rate": 0.00019839756801319044, | |
| "loss": 1.0783, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.03191520115634787, | |
| "grad_norm": 0.15591050684452057, | |
| "learning_rate": 0.00019838726298433636, | |
| "loss": 1.1782, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.032069380872079015, | |
| "grad_norm": 0.1766556203365326, | |
| "learning_rate": 0.00019837695795548227, | |
| "loss": 1.1063, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.03222356058781017, | |
| "grad_norm": 0.16078630089759827, | |
| "learning_rate": 0.0001983666529266282, | |
| "loss": 1.0891, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.03237774030354131, | |
| "grad_norm": 0.13378402590751648, | |
| "learning_rate": 0.00019835634789777413, | |
| "loss": 1.074, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.032531920019272464, | |
| "grad_norm": 0.14526261389255524, | |
| "learning_rate": 0.00019834604286892004, | |
| "loss": 1.108, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.03268609973500362, | |
| "grad_norm": 0.1321713775396347, | |
| "learning_rate": 0.00019833573784006596, | |
| "loss": 1.019, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.03284027945073476, | |
| "grad_norm": 0.12685374915599823, | |
| "learning_rate": 0.00019832543281121187, | |
| "loss": 1.09, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.032994459166465914, | |
| "grad_norm": 0.13825605809688568, | |
| "learning_rate": 0.0001983151277823578, | |
| "loss": 1.1356, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.03314863888219706, | |
| "grad_norm": 0.13683827221393585, | |
| "learning_rate": 0.00019830482275350373, | |
| "loss": 1.1405, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.03330281859792821, | |
| "grad_norm": 0.16707143187522888, | |
| "learning_rate": 0.00019829451772464965, | |
| "loss": 1.1305, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.03345699831365936, | |
| "grad_norm": 0.11735045164823532, | |
| "learning_rate": 0.00019828421269579556, | |
| "loss": 1.0421, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.03361117802939051, | |
| "grad_norm": 0.1337989866733551, | |
| "learning_rate": 0.00019827390766694148, | |
| "loss": 1.0572, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.033765357745121655, | |
| "grad_norm": 0.17111611366271973, | |
| "learning_rate": 0.0001982636026380874, | |
| "loss": 1.1698, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.03391953746085281, | |
| "grad_norm": 0.13785259425640106, | |
| "learning_rate": 0.00019825329760923333, | |
| "loss": 1.056, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.03407371717658395, | |
| "grad_norm": 0.15061460435390472, | |
| "learning_rate": 0.00019824299258037925, | |
| "loss": 1.0963, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.034227896892315104, | |
| "grad_norm": 0.1231001690030098, | |
| "learning_rate": 0.00019823268755152516, | |
| "loss": 1.1264, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.03438207660804626, | |
| "grad_norm": 0.13752298057079315, | |
| "learning_rate": 0.00019822238252267108, | |
| "loss": 1.0672, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.0345362563237774, | |
| "grad_norm": 0.13519813120365143, | |
| "learning_rate": 0.000198212077493817, | |
| "loss": 1.0882, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.034690436039508554, | |
| "grad_norm": 0.140150785446167, | |
| "learning_rate": 0.0001982017724649629, | |
| "loss": 1.0572, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.0348446157552397, | |
| "grad_norm": 0.13910406827926636, | |
| "learning_rate": 0.00019819146743610882, | |
| "loss": 1.0762, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.03499879547097085, | |
| "grad_norm": 0.14587442576885223, | |
| "learning_rate": 0.00019818116240725474, | |
| "loss": 1.1232, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.035152975186702, | |
| "grad_norm": 0.14476893842220306, | |
| "learning_rate": 0.00019817085737840065, | |
| "loss": 1.1004, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.03530715490243315, | |
| "grad_norm": 0.13861101865768433, | |
| "learning_rate": 0.00019816055234954657, | |
| "loss": 1.0302, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.035461334618164295, | |
| "grad_norm": 0.14342686533927917, | |
| "learning_rate": 0.0001981502473206925, | |
| "loss": 1.1092, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.03561551433389545, | |
| "grad_norm": 0.11709775030612946, | |
| "learning_rate": 0.00019813994229183842, | |
| "loss": 1.0463, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.0357696940496266, | |
| "grad_norm": 0.15154917538166046, | |
| "learning_rate": 0.00019812963726298434, | |
| "loss": 1.0897, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.035923873765357744, | |
| "grad_norm": 0.16716259717941284, | |
| "learning_rate": 0.00019811933223413025, | |
| "loss": 1.1214, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.0360780534810889, | |
| "grad_norm": 0.13513320684432983, | |
| "learning_rate": 0.00019810902720527617, | |
| "loss": 1.0623, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.03623223319682004, | |
| "grad_norm": 0.15930432081222534, | |
| "learning_rate": 0.0001980987221764221, | |
| "loss": 1.1092, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.036386412912551194, | |
| "grad_norm": 0.13990509510040283, | |
| "learning_rate": 0.00019808841714756803, | |
| "loss": 1.1048, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.03654059262828234, | |
| "grad_norm": 0.18784300982952118, | |
| "learning_rate": 0.00019807811211871394, | |
| "loss": 1.1676, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.03669477234401349, | |
| "grad_norm": 0.152045339345932, | |
| "learning_rate": 0.00019806780708985986, | |
| "loss": 1.1303, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.03684895205974464, | |
| "grad_norm": 0.1409967988729477, | |
| "learning_rate": 0.00019805750206100577, | |
| "loss": 1.0972, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.03700313177547579, | |
| "grad_norm": 0.13838854432106018, | |
| "learning_rate": 0.0001980471970321517, | |
| "loss": 1.101, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.037157311491206935, | |
| "grad_norm": 0.1579430103302002, | |
| "learning_rate": 0.00019803689200329763, | |
| "loss": 1.1077, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.03731149120693809, | |
| "grad_norm": 0.15061910450458527, | |
| "learning_rate": 0.00019802658697444354, | |
| "loss": 1.1239, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.03746567092266924, | |
| "grad_norm": 0.16408291459083557, | |
| "learning_rate": 0.00019801628194558946, | |
| "loss": 1.0961, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.037619850638400384, | |
| "grad_norm": 0.15612424910068512, | |
| "learning_rate": 0.00019800597691673537, | |
| "loss": 1.1299, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.03777403035413154, | |
| "grad_norm": 0.14135530591011047, | |
| "learning_rate": 0.00019799567188788131, | |
| "loss": 1.0489, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.03792821006986268, | |
| "grad_norm": 0.13743548095226288, | |
| "learning_rate": 0.00019798536685902723, | |
| "loss": 1.0837, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.038082389785593834, | |
| "grad_norm": 0.157401442527771, | |
| "learning_rate": 0.00019797506183017314, | |
| "loss": 1.0573, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.03823656950132498, | |
| "grad_norm": 0.14982052147388458, | |
| "learning_rate": 0.00019796475680131906, | |
| "loss": 1.0839, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.03839074921705613, | |
| "grad_norm": 0.1347000151872635, | |
| "learning_rate": 0.00019795445177246497, | |
| "loss": 1.113, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.03854492893278728, | |
| "grad_norm": 0.14478904008865356, | |
| "learning_rate": 0.0001979441467436109, | |
| "loss": 1.0514, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03854492893278728, | |
| "eval_loss": 1.1000746488571167, | |
| "eval_runtime": 185.5217, | |
| "eval_samples_per_second": 91.326, | |
| "eval_steps_per_second": 1.428, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03869910864851843, | |
| "grad_norm": 0.14274291694164276, | |
| "learning_rate": 0.00019793384171475683, | |
| "loss": 1.0847, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 0.03885328836424958, | |
| "grad_norm": 0.14326965808868408, | |
| "learning_rate": 0.00019792353668590275, | |
| "loss": 1.0865, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 0.03900746807998073, | |
| "grad_norm": 0.1575518548488617, | |
| "learning_rate": 0.00019791323165704866, | |
| "loss": 1.1258, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 0.03916164779571188, | |
| "grad_norm": 0.14699862897396088, | |
| "learning_rate": 0.00019790292662819458, | |
| "loss": 1.1687, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 0.039315827511443024, | |
| "grad_norm": 0.1394687294960022, | |
| "learning_rate": 0.0001978926215993405, | |
| "loss": 1.1214, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.03947000722717418, | |
| "grad_norm": 0.14366985857486725, | |
| "learning_rate": 0.0001978823165704864, | |
| "loss": 1.0651, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.03962418694290532, | |
| "grad_norm": 0.14171218872070312, | |
| "learning_rate": 0.00019787201154163232, | |
| "loss": 1.1398, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 0.039778366658636474, | |
| "grad_norm": 0.13258612155914307, | |
| "learning_rate": 0.00019786170651277824, | |
| "loss": 1.1234, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 0.03993254637436762, | |
| "grad_norm": 0.17693160474300385, | |
| "learning_rate": 0.00019785140148392415, | |
| "loss": 1.1121, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 0.04008672609009877, | |
| "grad_norm": 0.143838569521904, | |
| "learning_rate": 0.00019784109645507006, | |
| "loss": 1.102, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.04024090580582992, | |
| "grad_norm": 0.14078038930892944, | |
| "learning_rate": 0.000197830791426216, | |
| "loss": 1.1044, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 0.04039508552156107, | |
| "grad_norm": 0.12367985397577286, | |
| "learning_rate": 0.00019782048639736192, | |
| "loss": 1.102, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 0.04054926523729222, | |
| "grad_norm": 0.136929452419281, | |
| "learning_rate": 0.00019781018136850784, | |
| "loss": 1.0802, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 0.04070344495302337, | |
| "grad_norm": 0.15831957757472992, | |
| "learning_rate": 0.00019779987633965375, | |
| "loss": 1.09, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 0.04085762466875452, | |
| "grad_norm": 0.15482452511787415, | |
| "learning_rate": 0.00019778957131079967, | |
| "loss": 1.0828, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.041011804384485664, | |
| "grad_norm": 0.13797122240066528, | |
| "learning_rate": 0.0001977792662819456, | |
| "loss": 1.1263, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 0.04116598410021682, | |
| "grad_norm": 0.18304814398288727, | |
| "learning_rate": 0.00019776896125309152, | |
| "loss": 1.0991, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 0.04132016381594796, | |
| "grad_norm": 0.1509987860918045, | |
| "learning_rate": 0.00019775865622423744, | |
| "loss": 1.0804, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 0.041474343531679114, | |
| "grad_norm": 0.13406258821487427, | |
| "learning_rate": 0.00019774835119538335, | |
| "loss": 1.0348, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 0.04162852324741026, | |
| "grad_norm": 0.1413736194372177, | |
| "learning_rate": 0.00019773804616652927, | |
| "loss": 1.066, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.04178270296314141, | |
| "grad_norm": 0.1451394259929657, | |
| "learning_rate": 0.0001977277411376752, | |
| "loss": 1.0485, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 0.041936882678872564, | |
| "grad_norm": 0.13275358080863953, | |
| "learning_rate": 0.00019771743610882113, | |
| "loss": 1.1164, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 0.04209106239460371, | |
| "grad_norm": 0.15869611501693726, | |
| "learning_rate": 0.00019770713107996704, | |
| "loss": 1.1361, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 0.04224524211033486, | |
| "grad_norm": 0.14091487228870392, | |
| "learning_rate": 0.00019769682605111295, | |
| "loss": 1.061, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 0.04239942182606601, | |
| "grad_norm": 0.13538867235183716, | |
| "learning_rate": 0.00019768652102225887, | |
| "loss": 1.0607, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.04255360154179716, | |
| "grad_norm": 0.15626317262649536, | |
| "learning_rate": 0.0001976762159934048, | |
| "loss": 1.0758, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 0.042707781257528304, | |
| "grad_norm": 0.1293731927871704, | |
| "learning_rate": 0.00019766591096455073, | |
| "loss": 1.0434, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 0.04286196097325946, | |
| "grad_norm": 0.13498535752296448, | |
| "learning_rate": 0.00019765560593569664, | |
| "loss": 1.0953, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 0.0430161406889906, | |
| "grad_norm": 0.14134527742862701, | |
| "learning_rate": 0.00019764530090684256, | |
| "loss": 1.1559, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 0.043170320404721754, | |
| "grad_norm": 0.13958705961704254, | |
| "learning_rate": 0.00019763499587798847, | |
| "loss": 1.2585, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.0433245001204529, | |
| "grad_norm": 0.2181047797203064, | |
| "learning_rate": 0.0001976246908491344, | |
| "loss": 1.0164, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 0.04347867983618405, | |
| "grad_norm": 0.1365436315536499, | |
| "learning_rate": 0.0001976143858202803, | |
| "loss": 1.124, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 0.043632859551915204, | |
| "grad_norm": 0.12809793651103973, | |
| "learning_rate": 0.00019760408079142622, | |
| "loss": 1.0378, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 0.04378703926764635, | |
| "grad_norm": 0.12341924756765366, | |
| "learning_rate": 0.00019759377576257213, | |
| "loss": 1.1091, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 0.0439412189833775, | |
| "grad_norm": 0.14291982352733612, | |
| "learning_rate": 0.00019758347073371805, | |
| "loss": 1.1366, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.04409539869910865, | |
| "grad_norm": 0.14486652612686157, | |
| "learning_rate": 0.000197573165704864, | |
| "loss": 1.0168, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 0.0442495784148398, | |
| "grad_norm": 0.1724916249513626, | |
| "learning_rate": 0.0001975628606760099, | |
| "loss": 1.1037, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 0.044403758130570944, | |
| "grad_norm": 0.13338427245616913, | |
| "learning_rate": 0.00019755255564715582, | |
| "loss": 1.0259, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.0445579378463021, | |
| "grad_norm": 0.1372508853673935, | |
| "learning_rate": 0.00019754225061830173, | |
| "loss": 1.0784, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 0.04471211756203324, | |
| "grad_norm": 0.11633725464344025, | |
| "learning_rate": 0.00019753194558944765, | |
| "loss": 1.0648, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.044866297277764394, | |
| "grad_norm": 0.14386776089668274, | |
| "learning_rate": 0.00019752164056059356, | |
| "loss": 1.0777, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 0.045020476993495546, | |
| "grad_norm": 0.14929193258285522, | |
| "learning_rate": 0.0001975113355317395, | |
| "loss": 1.1319, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 0.04517465670922669, | |
| "grad_norm": 0.1324220448732376, | |
| "learning_rate": 0.00019750103050288542, | |
| "loss": 1.0614, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 0.045328836424957844, | |
| "grad_norm": 0.1392926126718521, | |
| "learning_rate": 0.00019749072547403133, | |
| "loss": 1.142, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 0.04548301614068899, | |
| "grad_norm": 0.2632090151309967, | |
| "learning_rate": 0.00019748042044517725, | |
| "loss": 1.0159, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.04563719585642014, | |
| "grad_norm": 0.13699129223823547, | |
| "learning_rate": 0.00019747011541632316, | |
| "loss": 1.0778, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 0.04579137557215129, | |
| "grad_norm": 0.13768675923347473, | |
| "learning_rate": 0.0001974598103874691, | |
| "loss": 1.0719, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 0.04594555528788244, | |
| "grad_norm": 0.13458684086799622, | |
| "learning_rate": 0.00019744950535861502, | |
| "loss": 1.0145, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 0.046099735003613584, | |
| "grad_norm": 0.1772696077823639, | |
| "learning_rate": 0.00019743920032976094, | |
| "loss": 1.0629, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 0.04625391471934474, | |
| "grad_norm": 0.13998697698116302, | |
| "learning_rate": 0.00019742889530090685, | |
| "loss": 1.102, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04625391471934474, | |
| "eval_loss": 1.098169207572937, | |
| "eval_runtime": 185.5141, | |
| "eval_samples_per_second": 91.33, | |
| "eval_steps_per_second": 1.428, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04640809443507588, | |
| "grad_norm": 0.13928066194057465, | |
| "learning_rate": 0.00019741859027205277, | |
| "loss": 1.1527, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 0.046562274150807034, | |
| "grad_norm": 0.13011601567268372, | |
| "learning_rate": 0.0001974082852431987, | |
| "loss": 1.1259, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 0.046716453866538186, | |
| "grad_norm": 0.1306074559688568, | |
| "learning_rate": 0.00019739798021434462, | |
| "loss": 1.0951, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 0.04687063358226933, | |
| "grad_norm": 0.14797037839889526, | |
| "learning_rate": 0.00019738767518549054, | |
| "loss": 1.0321, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 0.047024813298000484, | |
| "grad_norm": 0.14849938452243805, | |
| "learning_rate": 0.00019737737015663645, | |
| "loss": 1.1096, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.04717899301373163, | |
| "grad_norm": 0.12060682475566864, | |
| "learning_rate": 0.00019736706512778237, | |
| "loss": 1.0652, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 0.04733317272946278, | |
| "grad_norm": 0.12754854559898376, | |
| "learning_rate": 0.00019735676009892828, | |
| "loss": 1.1097, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 0.04748735244519393, | |
| "grad_norm": 0.12162326276302338, | |
| "learning_rate": 0.0001973464550700742, | |
| "loss": 1.1087, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 0.04764153216092508, | |
| "grad_norm": 0.175630122423172, | |
| "learning_rate": 0.0001973361500412201, | |
| "loss": 1.0723, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 0.047795711876656224, | |
| "grad_norm": 0.15365472435951233, | |
| "learning_rate": 0.00019732584501236603, | |
| "loss": 1.1009, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.04794989159238738, | |
| "grad_norm": 0.13359837234020233, | |
| "learning_rate": 0.00019731553998351194, | |
| "loss": 1.0974, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 0.04810407130811853, | |
| "grad_norm": 0.1482960432767868, | |
| "learning_rate": 0.00019730523495465788, | |
| "loss": 1.1214, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.048258251023849674, | |
| "grad_norm": 0.1309668868780136, | |
| "learning_rate": 0.0001972949299258038, | |
| "loss": 1.0849, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 0.048412430739580826, | |
| "grad_norm": 0.1544414609670639, | |
| "learning_rate": 0.00019728462489694971, | |
| "loss": 1.092, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 0.04856661045531197, | |
| "grad_norm": 0.14907146990299225, | |
| "learning_rate": 0.00019727431986809563, | |
| "loss": 1.0671, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.048720790171043124, | |
| "grad_norm": 0.16943813860416412, | |
| "learning_rate": 0.00019726401483924154, | |
| "loss": 1.1433, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 0.04887496988677427, | |
| "grad_norm": 0.14070230722427368, | |
| "learning_rate": 0.00019725370981038749, | |
| "loss": 1.1613, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 0.04902914960250542, | |
| "grad_norm": 0.15507204830646515, | |
| "learning_rate": 0.0001972434047815334, | |
| "loss": 1.1286, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 0.04918332931823657, | |
| "grad_norm": 0.13587893545627594, | |
| "learning_rate": 0.00019723309975267932, | |
| "loss": 1.1094, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 0.04933750903396772, | |
| "grad_norm": 0.12399852275848389, | |
| "learning_rate": 0.00019722279472382523, | |
| "loss": 1.058, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.049491688749698864, | |
| "grad_norm": 0.12497518211603165, | |
| "learning_rate": 0.00019721248969497115, | |
| "loss": 1.0716, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 0.04964586846543002, | |
| "grad_norm": 0.15282607078552246, | |
| "learning_rate": 0.0001972021846661171, | |
| "loss": 1.0912, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 0.04980004818116117, | |
| "grad_norm": 0.14203013479709625, | |
| "learning_rate": 0.000197191879637263, | |
| "loss": 1.0846, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 0.049954227896892314, | |
| "grad_norm": 0.12308704853057861, | |
| "learning_rate": 0.00019718157460840892, | |
| "loss": 1.1202, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 0.050108407612623466, | |
| "grad_norm": 0.15226681530475616, | |
| "learning_rate": 0.00019717126957955483, | |
| "loss": 1.0626, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.05026258732835461, | |
| "grad_norm": 0.12636694312095642, | |
| "learning_rate": 0.00019716096455070075, | |
| "loss": 1.1086, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 0.050416767044085764, | |
| "grad_norm": 0.14969666302204132, | |
| "learning_rate": 0.0001971506595218467, | |
| "loss": 1.1602, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 0.05057094675981691, | |
| "grad_norm": 0.130833700299263, | |
| "learning_rate": 0.0001971403544929926, | |
| "loss": 1.0657, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 0.05072512647554806, | |
| "grad_norm": 0.1283751279115677, | |
| "learning_rate": 0.00019713004946413852, | |
| "loss": 1.0371, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 0.05087930619127921, | |
| "grad_norm": 0.11827697604894638, | |
| "learning_rate": 0.00019711974443528443, | |
| "loss": 1.0308, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.05103348590701036, | |
| "grad_norm": 0.12265590578317642, | |
| "learning_rate": 0.00019710943940643035, | |
| "loss": 1.1127, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 0.05118766562274151, | |
| "grad_norm": 0.13979150354862213, | |
| "learning_rate": 0.0001970991343775763, | |
| "loss": 1.1011, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 0.05134184533847266, | |
| "grad_norm": 0.1368461698293686, | |
| "learning_rate": 0.0001970888293487222, | |
| "loss": 1.0857, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 0.05149602505420381, | |
| "grad_norm": 0.13669301569461823, | |
| "learning_rate": 0.00019707852431986812, | |
| "loss": 1.0971, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 0.051650204769934954, | |
| "grad_norm": 0.12659449875354767, | |
| "learning_rate": 0.00019706821929101404, | |
| "loss": 1.0556, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.051804384485666106, | |
| "grad_norm": 0.14103113114833832, | |
| "learning_rate": 0.00019705791426215995, | |
| "loss": 1.0913, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 0.05195856420139725, | |
| "grad_norm": 0.16134017705917358, | |
| "learning_rate": 0.00019704760923330587, | |
| "loss": 1.0994, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 0.052112743917128404, | |
| "grad_norm": 0.12725086510181427, | |
| "learning_rate": 0.00019703730420445178, | |
| "loss": 1.1008, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 0.05226692363285955, | |
| "grad_norm": 0.12865908443927765, | |
| "learning_rate": 0.0001970269991755977, | |
| "loss": 1.0186, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 0.0524211033485907, | |
| "grad_norm": 0.1661859154701233, | |
| "learning_rate": 0.0001970166941467436, | |
| "loss": 1.068, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.05257528306432185, | |
| "grad_norm": 0.14370663464069366, | |
| "learning_rate": 0.00019700638911788953, | |
| "loss": 1.102, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 0.052729462780053, | |
| "grad_norm": 0.13285204768180847, | |
| "learning_rate": 0.00019699608408903544, | |
| "loss": 1.1055, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 0.05288364249578415, | |
| "grad_norm": 0.17762747406959534, | |
| "learning_rate": 0.00019698577906018138, | |
| "loss": 1.1601, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 0.0530378222115153, | |
| "grad_norm": 0.12693317234516144, | |
| "learning_rate": 0.0001969754740313273, | |
| "loss": 1.0494, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 0.05319200192724645, | |
| "grad_norm": 0.1302707940340042, | |
| "learning_rate": 0.0001969651690024732, | |
| "loss": 1.066, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.053346181642977594, | |
| "grad_norm": 0.11844471096992493, | |
| "learning_rate": 0.00019695486397361913, | |
| "loss": 1.0085, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 0.053500361358708746, | |
| "grad_norm": 0.12299422174692154, | |
| "learning_rate": 0.00019694455894476504, | |
| "loss": 1.0985, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 0.05365454107443989, | |
| "grad_norm": 0.1222420409321785, | |
| "learning_rate": 0.00019693425391591098, | |
| "loss": 1.0648, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 0.053808720790171044, | |
| "grad_norm": 0.13273879885673523, | |
| "learning_rate": 0.0001969239488870569, | |
| "loss": 1.1108, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 0.05396290050590219, | |
| "grad_norm": 0.13202215731143951, | |
| "learning_rate": 0.00019691364385820281, | |
| "loss": 1.1013, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.05396290050590219, | |
| "eval_loss": 1.0964874029159546, | |
| "eval_runtime": 185.3303, | |
| "eval_samples_per_second": 91.421, | |
| "eval_steps_per_second": 1.43, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.05411708022163334, | |
| "grad_norm": 0.13038010895252228, | |
| "learning_rate": 0.00019690333882934873, | |
| "loss": 1.0642, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 0.054271259937364494, | |
| "grad_norm": 0.18084144592285156, | |
| "learning_rate": 0.00019689303380049464, | |
| "loss": 1.0673, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.05442543965309564, | |
| "grad_norm": 0.18958036601543427, | |
| "learning_rate": 0.00019688272877164059, | |
| "loss": 1.0925, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 0.05457961936882679, | |
| "grad_norm": 0.13386841118335724, | |
| "learning_rate": 0.0001968724237427865, | |
| "loss": 1.0978, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 0.05473379908455794, | |
| "grad_norm": 0.1408504843711853, | |
| "learning_rate": 0.00019686211871393242, | |
| "loss": 1.1158, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.05488797880028909, | |
| "grad_norm": 0.12006545811891556, | |
| "learning_rate": 0.00019685181368507833, | |
| "loss": 1.0395, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 0.055042158516020234, | |
| "grad_norm": 0.13973191380500793, | |
| "learning_rate": 0.00019684150865622425, | |
| "loss": 1.0685, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 0.055196338231751386, | |
| "grad_norm": 0.14461107552051544, | |
| "learning_rate": 0.0001968312036273702, | |
| "loss": 1.0924, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 0.05535051794748253, | |
| "grad_norm": 0.13358595967292786, | |
| "learning_rate": 0.0001968208985985161, | |
| "loss": 1.0479, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 0.055504697663213684, | |
| "grad_norm": 0.13416843116283417, | |
| "learning_rate": 0.00019681059356966202, | |
| "loss": 1.0166, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.05565887737894483, | |
| "grad_norm": 0.15217959880828857, | |
| "learning_rate": 0.00019680028854080793, | |
| "loss": 1.0918, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 0.05581305709467598, | |
| "grad_norm": 0.13012762367725372, | |
| "learning_rate": 0.00019678998351195385, | |
| "loss": 1.0967, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 0.055967236810407134, | |
| "grad_norm": 0.13023535907268524, | |
| "learning_rate": 0.00019677967848309976, | |
| "loss": 1.0247, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 0.05612141652613828, | |
| "grad_norm": 0.13703665137290955, | |
| "learning_rate": 0.00019676937345424568, | |
| "loss": 1.0969, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 0.05627559624186943, | |
| "grad_norm": 0.12767066061496735, | |
| "learning_rate": 0.0001967590684253916, | |
| "loss": 1.08, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.05642977595760058, | |
| "grad_norm": 0.12238382548093796, | |
| "learning_rate": 0.0001967487633965375, | |
| "loss": 1.1233, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 0.05658395567333173, | |
| "grad_norm": 0.1356974095106125, | |
| "learning_rate": 0.00019673845836768342, | |
| "loss": 1.0439, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 0.056738135389062874, | |
| "grad_norm": 0.14199669659137726, | |
| "learning_rate": 0.00019672815333882936, | |
| "loss": 1.0753, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 0.056892315104794026, | |
| "grad_norm": 0.12904112040996552, | |
| "learning_rate": 0.00019671784830997528, | |
| "loss": 1.0749, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 0.05704649482052517, | |
| "grad_norm": 0.1235031932592392, | |
| "learning_rate": 0.0001967075432811212, | |
| "loss": 1.0275, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.057200674536256324, | |
| "grad_norm": 0.170023113489151, | |
| "learning_rate": 0.0001966972382522671, | |
| "loss": 1.1295, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 0.057354854251987476, | |
| "grad_norm": 0.15533532202243805, | |
| "learning_rate": 0.00019668693322341302, | |
| "loss": 1.0629, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 0.05750903396771862, | |
| "grad_norm": 0.1602126806974411, | |
| "learning_rate": 0.00019667662819455897, | |
| "loss": 1.1538, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 0.057663213683449774, | |
| "grad_norm": 0.16433580219745636, | |
| "learning_rate": 0.00019666632316570488, | |
| "loss": 1.1322, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 0.05781739339918092, | |
| "grad_norm": 0.13925233483314514, | |
| "learning_rate": 0.0001966560181368508, | |
| "loss": 1.083, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.05797157311491207, | |
| "grad_norm": 0.12234565615653992, | |
| "learning_rate": 0.0001966457131079967, | |
| "loss": 1.0113, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 0.05812575283064322, | |
| "grad_norm": 0.1425125002861023, | |
| "learning_rate": 0.00019663540807914262, | |
| "loss": 1.0762, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 0.05827993254637437, | |
| "grad_norm": 0.14309099316596985, | |
| "learning_rate": 0.00019662510305028854, | |
| "loss": 1.0633, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 0.058434112262105514, | |
| "grad_norm": 0.1381814330816269, | |
| "learning_rate": 0.00019661479802143448, | |
| "loss": 1.142, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 0.058588291977836666, | |
| "grad_norm": 0.15551595389842987, | |
| "learning_rate": 0.0001966044929925804, | |
| "loss": 1.026, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.05874247169356781, | |
| "grad_norm": 0.14606410264968872, | |
| "learning_rate": 0.0001965941879637263, | |
| "loss": 1.1265, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 0.058896651409298964, | |
| "grad_norm": 0.13017289340496063, | |
| "learning_rate": 0.00019658388293487223, | |
| "loss": 1.1051, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 0.059050831125030116, | |
| "grad_norm": 0.1500990092754364, | |
| "learning_rate": 0.00019657357790601814, | |
| "loss": 1.0948, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 0.05920501084076126, | |
| "grad_norm": 0.14307473599910736, | |
| "learning_rate": 0.00019656327287716408, | |
| "loss": 1.0667, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.059359190556492414, | |
| "grad_norm": 0.13513712584972382, | |
| "learning_rate": 0.00019655296784831, | |
| "loss": 1.0488, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.05951337027222356, | |
| "grad_norm": 0.13991938531398773, | |
| "learning_rate": 0.0001965426628194559, | |
| "loss": 1.0888, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 0.05966754998795471, | |
| "grad_norm": 0.15015999972820282, | |
| "learning_rate": 0.00019653235779060183, | |
| "loss": 1.0774, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 0.05982172970368586, | |
| "grad_norm": 0.16419099271297455, | |
| "learning_rate": 0.00019652205276174774, | |
| "loss": 1.0661, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 0.05997590941941701, | |
| "grad_norm": 0.12072901427745819, | |
| "learning_rate": 0.00019651174773289366, | |
| "loss": 1.0645, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 0.060130089135148154, | |
| "grad_norm": 0.13410696387290955, | |
| "learning_rate": 0.00019650144270403957, | |
| "loss": 1.0677, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.060284268850879306, | |
| "grad_norm": 0.13373896479606628, | |
| "learning_rate": 0.0001964911376751855, | |
| "loss": 1.0055, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 0.06043844856661046, | |
| "grad_norm": 0.13043928146362305, | |
| "learning_rate": 0.0001964808326463314, | |
| "loss": 1.0579, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 0.060592628282341604, | |
| "grad_norm": 0.13334155082702637, | |
| "learning_rate": 0.00019647052761747732, | |
| "loss": 1.0781, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 0.060746807998072756, | |
| "grad_norm": 0.14660002291202545, | |
| "learning_rate": 0.00019646022258862326, | |
| "loss": 1.1244, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 0.0609009877138039, | |
| "grad_norm": 0.1240791380405426, | |
| "learning_rate": 0.00019644991755976917, | |
| "loss": 1.0353, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.061055167429535054, | |
| "grad_norm": 0.12248943001031876, | |
| "learning_rate": 0.0001964396125309151, | |
| "loss": 1.1292, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 0.0612093471452662, | |
| "grad_norm": 0.1340823471546173, | |
| "learning_rate": 0.000196429307502061, | |
| "loss": 1.0764, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 0.06136352686099735, | |
| "grad_norm": 0.1297413557767868, | |
| "learning_rate": 0.00019641900247320692, | |
| "loss": 1.0998, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 0.0615177065767285, | |
| "grad_norm": 0.13512568175792694, | |
| "learning_rate": 0.00019640869744435286, | |
| "loss": 1.0349, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 0.06167188629245965, | |
| "grad_norm": 0.13964438438415527, | |
| "learning_rate": 0.00019639839241549878, | |
| "loss": 1.0543, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.06167188629245965, | |
| "eval_loss": 1.0952669382095337, | |
| "eval_runtime": 185.8383, | |
| "eval_samples_per_second": 91.171, | |
| "eval_steps_per_second": 1.426, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.061826066008190794, | |
| "grad_norm": 0.1318446695804596, | |
| "learning_rate": 0.0001963880873866447, | |
| "loss": 1.1469, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 0.061980245723921946, | |
| "grad_norm": 0.13778544962406158, | |
| "learning_rate": 0.0001963777823577906, | |
| "loss": 1.0361, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 0.0621344254396531, | |
| "grad_norm": 0.14804169535636902, | |
| "learning_rate": 0.00019636747732893652, | |
| "loss": 1.0537, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 0.062288605155384244, | |
| "grad_norm": 0.1363479495048523, | |
| "learning_rate": 0.00019635717230008246, | |
| "loss": 1.0819, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 0.062442784871115396, | |
| "grad_norm": 0.12277363240718842, | |
| "learning_rate": 0.00019634686727122838, | |
| "loss": 1.0629, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.06259696458684655, | |
| "grad_norm": 0.13027344644069672, | |
| "learning_rate": 0.0001963365622423743, | |
| "loss": 1.0544, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 0.0627511443025777, | |
| "grad_norm": 0.1274079531431198, | |
| "learning_rate": 0.0001963262572135202, | |
| "loss": 1.0685, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 0.06290532401830884, | |
| "grad_norm": 0.1349189281463623, | |
| "learning_rate": 0.00019631595218466612, | |
| "loss": 1.0289, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 0.06305950373403998, | |
| "grad_norm": 0.1265273541212082, | |
| "learning_rate": 0.00019630564715581206, | |
| "loss": 1.0765, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 0.06321368344977114, | |
| "grad_norm": 0.1393941193819046, | |
| "learning_rate": 0.00019629534212695798, | |
| "loss": 1.0918, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.06336786316550229, | |
| "grad_norm": 0.12475106865167618, | |
| "learning_rate": 0.0001962850370981039, | |
| "loss": 1.027, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 0.06352204288123343, | |
| "grad_norm": 0.13844382762908936, | |
| "learning_rate": 0.0001962747320692498, | |
| "loss": 1.1482, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 0.0636762225969646, | |
| "grad_norm": 0.1444624364376068, | |
| "learning_rate": 0.00019626442704039572, | |
| "loss": 1.0659, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 0.06383040231269574, | |
| "grad_norm": 0.13939915597438812, | |
| "learning_rate": 0.00019625412201154164, | |
| "loss": 1.0392, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 0.06398458202842688, | |
| "grad_norm": 0.12919913232326508, | |
| "learning_rate": 0.00019624381698268755, | |
| "loss": 1.0566, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.06413876174415803, | |
| "grad_norm": 0.1297498196363449, | |
| "learning_rate": 0.00019623351195383347, | |
| "loss": 1.058, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 0.06429294145988919, | |
| "grad_norm": 0.16311457753181458, | |
| "learning_rate": 0.00019622320692497938, | |
| "loss": 1.1175, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 0.06444712117562033, | |
| "grad_norm": 0.14434239268302917, | |
| "learning_rate": 0.0001962129018961253, | |
| "loss": 1.0966, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 0.06460130089135148, | |
| "grad_norm": 0.13500697910785675, | |
| "learning_rate": 0.00019620259686727121, | |
| "loss": 1.138, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 0.06475548060708262, | |
| "grad_norm": 0.13175781071186066, | |
| "learning_rate": 0.00019619229183841716, | |
| "loss": 1.0744, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.06490966032281378, | |
| "grad_norm": 0.142098531126976, | |
| "learning_rate": 0.00019618198680956307, | |
| "loss": 1.0686, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 0.06506384003854493, | |
| "grad_norm": 0.16844119131565094, | |
| "learning_rate": 0.00019617168178070899, | |
| "loss": 1.0992, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 0.06521801975427607, | |
| "grad_norm": 0.13562923669815063, | |
| "learning_rate": 0.0001961613767518549, | |
| "loss": 1.0749, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 0.06537219947000723, | |
| "grad_norm": 0.14538466930389404, | |
| "learning_rate": 0.00019615107172300082, | |
| "loss": 1.123, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 0.06552637918573838, | |
| "grad_norm": 0.13058879971504211, | |
| "learning_rate": 0.00019614076669414676, | |
| "loss": 1.0835, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.06568055890146952, | |
| "grad_norm": 0.1567140519618988, | |
| "learning_rate": 0.00019613046166529267, | |
| "loss": 1.1157, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 0.06583473861720067, | |
| "grad_norm": 0.12576104700565338, | |
| "learning_rate": 0.0001961201566364386, | |
| "loss": 1.0143, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 0.06598891833293183, | |
| "grad_norm": 0.13823091983795166, | |
| "learning_rate": 0.0001961098516075845, | |
| "loss": 1.0797, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 0.06614309804866297, | |
| "grad_norm": 0.12293639779090881, | |
| "learning_rate": 0.00019609954657873042, | |
| "loss": 1.0808, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 0.06629727776439412, | |
| "grad_norm": 0.13951502740383148, | |
| "learning_rate": 0.00019608924154987636, | |
| "loss": 1.076, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.06645145748012526, | |
| "grad_norm": 0.13900773227214813, | |
| "learning_rate": 0.00019607893652102227, | |
| "loss": 1.0846, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 0.06660563719585642, | |
| "grad_norm": 0.14335249364376068, | |
| "learning_rate": 0.0001960686314921682, | |
| "loss": 1.0639, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 0.06675981691158757, | |
| "grad_norm": 0.1712643951177597, | |
| "learning_rate": 0.0001960583264633141, | |
| "loss": 1.1411, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 0.06691399662731871, | |
| "grad_norm": 0.12118082493543625, | |
| "learning_rate": 0.00019604802143446002, | |
| "loss": 1.0807, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 0.06706817634304987, | |
| "grad_norm": 0.141808420419693, | |
| "learning_rate": 0.00019603771640560596, | |
| "loss": 1.0641, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.06722235605878102, | |
| "grad_norm": 0.14798308908939362, | |
| "learning_rate": 0.00019602741137675188, | |
| "loss": 1.073, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 0.06737653577451216, | |
| "grad_norm": 0.13768306374549866, | |
| "learning_rate": 0.0001960171063478978, | |
| "loss": 1.0735, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 0.06753071549024331, | |
| "grad_norm": 0.12452355027198792, | |
| "learning_rate": 0.0001960068013190437, | |
| "loss": 1.0509, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 0.06768489520597447, | |
| "grad_norm": 0.1402217000722885, | |
| "learning_rate": 0.00019599649629018962, | |
| "loss": 1.1157, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 0.06783907492170561, | |
| "grad_norm": 0.12509870529174805, | |
| "learning_rate": 0.00019598619126133556, | |
| "loss": 1.0516, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.06799325463743676, | |
| "grad_norm": 0.1574297547340393, | |
| "learning_rate": 0.00019597588623248148, | |
| "loss": 1.0823, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 0.0681474343531679, | |
| "grad_norm": 0.14185413718223572, | |
| "learning_rate": 0.0001959655812036274, | |
| "loss": 1.0444, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 0.06830161406889906, | |
| "grad_norm": 0.1380462348461151, | |
| "learning_rate": 0.0001959552761747733, | |
| "loss": 1.1066, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 0.06845579378463021, | |
| "grad_norm": 0.12986746430397034, | |
| "learning_rate": 0.00019594497114591922, | |
| "loss": 1.1006, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 0.06860997350036135, | |
| "grad_norm": 0.13894346356391907, | |
| "learning_rate": 0.00019593466611706514, | |
| "loss": 1.0569, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.06876415321609251, | |
| "grad_norm": 0.12822435796260834, | |
| "learning_rate": 0.00019592436108821105, | |
| "loss": 1.0696, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 0.06891833293182366, | |
| "grad_norm": 0.1369408816099167, | |
| "learning_rate": 0.00019591405605935697, | |
| "loss": 1.0691, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 0.0690725126475548, | |
| "grad_norm": 0.13459660112857819, | |
| "learning_rate": 0.00019590375103050288, | |
| "loss": 1.0801, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.06922669236328595, | |
| "grad_norm": 0.1299123764038086, | |
| "learning_rate": 0.0001958934460016488, | |
| "loss": 1.0885, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 0.06938087207901711, | |
| "grad_norm": 0.12562230229377747, | |
| "learning_rate": 0.00019588314097279474, | |
| "loss": 1.183, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.06938087207901711, | |
| "eval_loss": 1.0944268703460693, | |
| "eval_runtime": 185.3723, | |
| "eval_samples_per_second": 91.4, | |
| "eval_steps_per_second": 1.43, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.06953505179474825, | |
| "grad_norm": 0.13996927440166473, | |
| "learning_rate": 0.00019587283594394065, | |
| "loss": 1.0356, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 0.0696892315104794, | |
| "grad_norm": 0.128004252910614, | |
| "learning_rate": 0.00019586253091508657, | |
| "loss": 1.0343, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 0.06984341122621056, | |
| "grad_norm": 0.15650418400764465, | |
| "learning_rate": 0.00019585222588623248, | |
| "loss": 1.1138, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 0.0699975909419417, | |
| "grad_norm": 0.5840476751327515, | |
| "learning_rate": 0.0001958419208573784, | |
| "loss": 1.1785, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 0.07015177065767285, | |
| "grad_norm": 0.15330374240875244, | |
| "learning_rate": 0.00019583161582852434, | |
| "loss": 1.0243, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.070305950373404, | |
| "grad_norm": 0.1603543907403946, | |
| "learning_rate": 0.00019582131079967026, | |
| "loss": 1.1228, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 0.07046013008913515, | |
| "grad_norm": 0.14209845662117004, | |
| "learning_rate": 0.00019581100577081617, | |
| "loss": 1.0939, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 0.0706143098048663, | |
| "grad_norm": 0.16117019951343536, | |
| "learning_rate": 0.00019580070074196209, | |
| "loss": 1.1447, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 0.07076848952059744, | |
| "grad_norm": 0.14068694412708282, | |
| "learning_rate": 0.000195790395713108, | |
| "loss": 1.0642, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 0.07092266923632859, | |
| "grad_norm": 0.15248316526412964, | |
| "learning_rate": 0.00019578009068425394, | |
| "loss": 1.0162, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.07107684895205975, | |
| "grad_norm": 0.22734233736991882, | |
| "learning_rate": 0.00019576978565539986, | |
| "loss": 1.1123, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 0.0712310286677909, | |
| "grad_norm": 0.1393287032842636, | |
| "learning_rate": 0.00019575948062654577, | |
| "loss": 1.0862, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 0.07138520838352204, | |
| "grad_norm": 0.12911191582679749, | |
| "learning_rate": 0.0001957491755976917, | |
| "loss": 1.0651, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 0.0715393880992532, | |
| "grad_norm": 0.12298440933227539, | |
| "learning_rate": 0.0001957388705688376, | |
| "loss": 1.1227, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 0.07169356781498434, | |
| "grad_norm": 0.14941005408763885, | |
| "learning_rate": 0.00019572856553998352, | |
| "loss": 1.0989, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.07184774753071549, | |
| "grad_norm": 0.1411515325307846, | |
| "learning_rate": 0.00019571826051112946, | |
| "loss": 1.0816, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 0.07200192724644663, | |
| "grad_norm": 0.11999720335006714, | |
| "learning_rate": 0.00019570795548227537, | |
| "loss": 1.0306, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 0.0721561069621778, | |
| "grad_norm": 0.1500861495733261, | |
| "learning_rate": 0.0001956976504534213, | |
| "loss": 1.0678, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 0.07231028667790894, | |
| "grad_norm": 0.12102475017309189, | |
| "learning_rate": 0.0001956873454245672, | |
| "loss": 1.0534, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 0.07246446639364008, | |
| "grad_norm": 0.11554603278636932, | |
| "learning_rate": 0.00019567704039571312, | |
| "loss": 1.0535, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.07261864610937123, | |
| "grad_norm": 0.12290264666080475, | |
| "learning_rate": 0.00019566673536685903, | |
| "loss": 1.0738, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 0.07277282582510239, | |
| "grad_norm": 0.17740991711616516, | |
| "learning_rate": 0.00019565643033800495, | |
| "loss": 1.0811, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 0.07292700554083353, | |
| "grad_norm": 0.14767777919769287, | |
| "learning_rate": 0.00019564612530915086, | |
| "loss": 1.105, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 0.07308118525656468, | |
| "grad_norm": 0.13773177564144135, | |
| "learning_rate": 0.00019563582028029678, | |
| "loss": 1.0983, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 0.07323536497229584, | |
| "grad_norm": 0.13891370594501495, | |
| "learning_rate": 0.0001956255152514427, | |
| "loss": 1.1349, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.07338954468802698, | |
| "grad_norm": 0.14717017114162445, | |
| "learning_rate": 0.00019561521022258863, | |
| "loss": 1.134, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 0.07354372440375813, | |
| "grad_norm": 0.15095743536949158, | |
| "learning_rate": 0.00019560490519373455, | |
| "loss": 1.063, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 0.07369790411948927, | |
| "grad_norm": 0.12851206958293915, | |
| "learning_rate": 0.00019559460016488046, | |
| "loss": 1.1005, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 0.07385208383522043, | |
| "grad_norm": 0.13364006578922272, | |
| "learning_rate": 0.00019558429513602638, | |
| "loss": 1.0429, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 0.07400626355095158, | |
| "grad_norm": 0.1326039433479309, | |
| "learning_rate": 0.0001955739901071723, | |
| "loss": 1.1586, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.07416044326668272, | |
| "grad_norm": 0.13149486482143402, | |
| "learning_rate": 0.00019556368507831824, | |
| "loss": 1.109, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 0.07431462298241387, | |
| "grad_norm": 0.1189669519662857, | |
| "learning_rate": 0.00019555338004946415, | |
| "loss": 1.0462, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 0.07446880269814503, | |
| "grad_norm": 0.14341482520103455, | |
| "learning_rate": 0.00019554307502061007, | |
| "loss": 1.0623, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 0.07462298241387617, | |
| "grad_norm": 0.14133721590042114, | |
| "learning_rate": 0.00019553276999175598, | |
| "loss": 1.0945, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 0.07477716212960732, | |
| "grad_norm": 0.1351941078901291, | |
| "learning_rate": 0.0001955224649629019, | |
| "loss": 1.0327, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.07493134184533848, | |
| "grad_norm": 0.12836019694805145, | |
| "learning_rate": 0.00019551215993404784, | |
| "loss": 1.069, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 0.07508552156106962, | |
| "grad_norm": 0.13199055194854736, | |
| "learning_rate": 0.00019550185490519375, | |
| "loss": 1.0323, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 0.07523970127680077, | |
| "grad_norm": 0.14991353452205658, | |
| "learning_rate": 0.00019549154987633967, | |
| "loss": 1.0625, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 0.07539388099253191, | |
| "grad_norm": 0.13832435011863708, | |
| "learning_rate": 0.00019548124484748558, | |
| "loss": 1.1031, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 0.07554806070826307, | |
| "grad_norm": 0.12351599335670471, | |
| "learning_rate": 0.0001954709398186315, | |
| "loss": 1.0286, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.07570224042399422, | |
| "grad_norm": 0.12360050529241562, | |
| "learning_rate": 0.00019546063478977744, | |
| "loss": 1.0652, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 0.07585642013972536, | |
| "grad_norm": 0.13384872674942017, | |
| "learning_rate": 0.00019545032976092335, | |
| "loss": 1.1125, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 0.07601059985545652, | |
| "grad_norm": 0.13200527429580688, | |
| "learning_rate": 0.00019544002473206927, | |
| "loss": 1.0727, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 0.07616477957118767, | |
| "grad_norm": 0.143647700548172, | |
| "learning_rate": 0.00019542971970321518, | |
| "loss": 1.1207, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 0.07631895928691881, | |
| "grad_norm": 0.13605177402496338, | |
| "learning_rate": 0.0001954194146743611, | |
| "loss": 1.0225, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.07647313900264996, | |
| "grad_norm": 0.12646125257015228, | |
| "learning_rate": 0.00019540910964550701, | |
| "loss": 1.11, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 0.07662731871838112, | |
| "grad_norm": 0.132467120885849, | |
| "learning_rate": 0.00019539880461665293, | |
| "loss": 1.1092, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 0.07678149843411226, | |
| "grad_norm": 0.12461701035499573, | |
| "learning_rate": 0.00019538849958779884, | |
| "loss": 1.0854, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 0.07693567814984341, | |
| "grad_norm": 0.13430501520633698, | |
| "learning_rate": 0.00019537819455894476, | |
| "loss": 1.2, | |
| "step": 998 | |
| }, | |
| { | |
| "epoch": 0.07708985786557455, | |
| "grad_norm": 0.12623916566371918, | |
| "learning_rate": 0.00019536788953009067, | |
| "loss": 1.0522, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07708985786557455, | |
| "eval_loss": 1.0930616855621338, | |
| "eval_runtime": 185.4001, | |
| "eval_samples_per_second": 91.386, | |
| "eval_steps_per_second": 1.429, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07724403758130571, | |
| "grad_norm": 0.11760087311267853, | |
| "learning_rate": 0.00019535758450123662, | |
| "loss": 1.1566, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 0.07739821729703686, | |
| "grad_norm": 0.145633727312088, | |
| "learning_rate": 0.00019534727947238253, | |
| "loss": 1.094, | |
| "step": 1004 | |
| }, | |
| { | |
| "epoch": 0.077552397012768, | |
| "grad_norm": 0.1311633288860321, | |
| "learning_rate": 0.00019533697444352845, | |
| "loss": 1.0792, | |
| "step": 1006 | |
| }, | |
| { | |
| "epoch": 0.07770657672849916, | |
| "grad_norm": 0.12563548982143402, | |
| "learning_rate": 0.00019532666941467436, | |
| "loss": 1.0601, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 0.07786075644423031, | |
| "grad_norm": 0.14429886639118195, | |
| "learning_rate": 0.00019531636438582028, | |
| "loss": 1.0926, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.07801493615996145, | |
| "grad_norm": 0.13131891191005707, | |
| "learning_rate": 0.0001953060593569662, | |
| "loss": 1.1012, | |
| "step": 1012 | |
| }, | |
| { | |
| "epoch": 0.0781691158756926, | |
| "grad_norm": 0.14185300469398499, | |
| "learning_rate": 0.00019529575432811213, | |
| "loss": 1.1113, | |
| "step": 1014 | |
| }, | |
| { | |
| "epoch": 0.07832329559142376, | |
| "grad_norm": 0.14298418164253235, | |
| "learning_rate": 0.00019528544929925805, | |
| "loss": 1.0909, | |
| "step": 1016 | |
| }, | |
| { | |
| "epoch": 0.0784774753071549, | |
| "grad_norm": 0.1339821219444275, | |
| "learning_rate": 0.00019527514427040396, | |
| "loss": 1.0994, | |
| "step": 1018 | |
| }, | |
| { | |
| "epoch": 0.07863165502288605, | |
| "grad_norm": 0.1252928525209427, | |
| "learning_rate": 0.00019526483924154988, | |
| "loss": 1.0316, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.0787858347386172, | |
| "grad_norm": 0.1277703046798706, | |
| "learning_rate": 0.0001952545342126958, | |
| "loss": 1.1067, | |
| "step": 1022 | |
| }, | |
| { | |
| "epoch": 0.07894001445434835, | |
| "grad_norm": 0.12644124031066895, | |
| "learning_rate": 0.00019524422918384173, | |
| "loss": 1.0176, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.0790941941700795, | |
| "grad_norm": 0.13443627953529358, | |
| "learning_rate": 0.00019523392415498765, | |
| "loss": 1.0754, | |
| "step": 1026 | |
| }, | |
| { | |
| "epoch": 0.07924837388581064, | |
| "grad_norm": 0.1895609050989151, | |
| "learning_rate": 0.00019522361912613356, | |
| "loss": 1.0551, | |
| "step": 1028 | |
| }, | |
| { | |
| "epoch": 0.0794025536015418, | |
| "grad_norm": 0.1372397392988205, | |
| "learning_rate": 0.00019521331409727948, | |
| "loss": 1.0442, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.07955673331727295, | |
| "grad_norm": 0.14173942804336548, | |
| "learning_rate": 0.0001952030090684254, | |
| "loss": 1.0692, | |
| "step": 1032 | |
| }, | |
| { | |
| "epoch": 0.0797109130330041, | |
| "grad_norm": 0.12321804463863373, | |
| "learning_rate": 0.00019519270403957134, | |
| "loss": 1.0276, | |
| "step": 1034 | |
| }, | |
| { | |
| "epoch": 0.07986509274873524, | |
| "grad_norm": 0.12327130138874054, | |
| "learning_rate": 0.00019518239901071725, | |
| "loss": 1.0376, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 0.0800192724644664, | |
| "grad_norm": 0.12301841378211975, | |
| "learning_rate": 0.00019517209398186317, | |
| "loss": 1.0887, | |
| "step": 1038 | |
| }, | |
| { | |
| "epoch": 0.08017345218019754, | |
| "grad_norm": 0.1429559886455536, | |
| "learning_rate": 0.00019516178895300908, | |
| "loss": 1.0321, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.08032763189592869, | |
| "grad_norm": 0.13955366611480713, | |
| "learning_rate": 0.000195151483924155, | |
| "loss": 1.1081, | |
| "step": 1042 | |
| }, | |
| { | |
| "epoch": 0.08048181161165983, | |
| "grad_norm": 0.13553303480148315, | |
| "learning_rate": 0.00019514117889530094, | |
| "loss": 1.0252, | |
| "step": 1044 | |
| }, | |
| { | |
| "epoch": 0.080635991327391, | |
| "grad_norm": 0.14100225269794464, | |
| "learning_rate": 0.00019513087386644685, | |
| "loss": 1.1071, | |
| "step": 1046 | |
| }, | |
| { | |
| "epoch": 0.08079017104312214, | |
| "grad_norm": 0.14522643387317657, | |
| "learning_rate": 0.00019512056883759277, | |
| "loss": 1.0653, | |
| "step": 1048 | |
| }, | |
| { | |
| "epoch": 0.08094435075885328, | |
| "grad_norm": 0.14540371298789978, | |
| "learning_rate": 0.00019511026380873868, | |
| "loss": 1.01, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.08109853047458444, | |
| "grad_norm": 0.1459018737077713, | |
| "learning_rate": 0.0001950999587798846, | |
| "loss": 1.1147, | |
| "step": 1052 | |
| }, | |
| { | |
| "epoch": 0.08125271019031559, | |
| "grad_norm": 0.12590867280960083, | |
| "learning_rate": 0.0001950896537510305, | |
| "loss": 1.0685, | |
| "step": 1054 | |
| }, | |
| { | |
| "epoch": 0.08140688990604673, | |
| "grad_norm": 0.11943504959344864, | |
| "learning_rate": 0.00019507934872217643, | |
| "loss": 1.0854, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 0.08156106962177788, | |
| "grad_norm": 0.12039398401975632, | |
| "learning_rate": 0.00019506904369332234, | |
| "loss": 1.1397, | |
| "step": 1058 | |
| }, | |
| { | |
| "epoch": 0.08171524933750904, | |
| "grad_norm": 0.1411554217338562, | |
| "learning_rate": 0.00019505873866446826, | |
| "loss": 1.1271, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.08186942905324018, | |
| "grad_norm": 0.1402871012687683, | |
| "learning_rate": 0.00019504843363561417, | |
| "loss": 1.0425, | |
| "step": 1062 | |
| }, | |
| { | |
| "epoch": 0.08202360876897133, | |
| "grad_norm": 0.13545840978622437, | |
| "learning_rate": 0.00019503812860676011, | |
| "loss": 1.0571, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 0.08217778848470249, | |
| "grad_norm": 0.12789209187030792, | |
| "learning_rate": 0.00019502782357790603, | |
| "loss": 1.0596, | |
| "step": 1066 | |
| }, | |
| { | |
| "epoch": 0.08233196820043363, | |
| "grad_norm": 0.13018928468227386, | |
| "learning_rate": 0.00019501751854905194, | |
| "loss": 1.1188, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 0.08248614791616478, | |
| "grad_norm": 0.12482234835624695, | |
| "learning_rate": 0.00019500721352019786, | |
| "loss": 1.0831, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.08264032763189592, | |
| "grad_norm": 0.11897309869527817, | |
| "learning_rate": 0.00019499690849134377, | |
| "loss": 1.0658, | |
| "step": 1072 | |
| }, | |
| { | |
| "epoch": 0.08279450734762708, | |
| "grad_norm": 0.12954497337341309, | |
| "learning_rate": 0.00019498660346248972, | |
| "loss": 1.0204, | |
| "step": 1074 | |
| }, | |
| { | |
| "epoch": 0.08294868706335823, | |
| "grad_norm": 0.14220042526721954, | |
| "learning_rate": 0.00019497629843363563, | |
| "loss": 1.1101, | |
| "step": 1076 | |
| }, | |
| { | |
| "epoch": 0.08310286677908937, | |
| "grad_norm": 0.1631559580564499, | |
| "learning_rate": 0.00019496599340478155, | |
| "loss": 1.1352, | |
| "step": 1078 | |
| }, | |
| { | |
| "epoch": 0.08325704649482052, | |
| "grad_norm": 0.13439539074897766, | |
| "learning_rate": 0.00019495568837592746, | |
| "loss": 1.0108, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.08341122621055168, | |
| "grad_norm": 0.12389718741178513, | |
| "learning_rate": 0.00019494538334707338, | |
| "loss": 1.0155, | |
| "step": 1082 | |
| }, | |
| { | |
| "epoch": 0.08356540592628282, | |
| "grad_norm": 0.1241556853055954, | |
| "learning_rate": 0.00019493507831821932, | |
| "loss": 1.1428, | |
| "step": 1084 | |
| }, | |
| { | |
| "epoch": 0.08371958564201397, | |
| "grad_norm": 0.13087880611419678, | |
| "learning_rate": 0.00019492477328936523, | |
| "loss": 1.0876, | |
| "step": 1086 | |
| }, | |
| { | |
| "epoch": 0.08387376535774513, | |
| "grad_norm": 0.12431449443101883, | |
| "learning_rate": 0.00019491446826051115, | |
| "loss": 1.0758, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 0.08402794507347627, | |
| "grad_norm": 0.13807635009288788, | |
| "learning_rate": 0.00019490416323165706, | |
| "loss": 1.0902, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.08418212478920742, | |
| "grad_norm": 0.12751048803329468, | |
| "learning_rate": 0.00019489385820280298, | |
| "loss": 1.0732, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 0.08433630450493856, | |
| "grad_norm": 0.15594707429409027, | |
| "learning_rate": 0.00019488355317394892, | |
| "loss": 1.1115, | |
| "step": 1094 | |
| }, | |
| { | |
| "epoch": 0.08449048422066972, | |
| "grad_norm": 0.11647301912307739, | |
| "learning_rate": 0.00019487324814509483, | |
| "loss": 1.1592, | |
| "step": 1096 | |
| }, | |
| { | |
| "epoch": 0.08464466393640087, | |
| "grad_norm": 0.13609850406646729, | |
| "learning_rate": 0.00019486294311624075, | |
| "loss": 1.1139, | |
| "step": 1098 | |
| }, | |
| { | |
| "epoch": 0.08479884365213201, | |
| "grad_norm": 0.1234198659658432, | |
| "learning_rate": 0.00019485263808738666, | |
| "loss": 1.0682, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.08479884365213201, | |
| "eval_loss": 1.0920624732971191, | |
| "eval_runtime": 185.5142, | |
| "eval_samples_per_second": 91.33, | |
| "eval_steps_per_second": 1.428, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.08495302336786316, | |
| "grad_norm": 0.1375039666891098, | |
| "learning_rate": 0.00019484233305853258, | |
| "loss": 1.0585, | |
| "step": 1102 | |
| }, | |
| { | |
| "epoch": 0.08510720308359432, | |
| "grad_norm": 0.14471521973609924, | |
| "learning_rate": 0.0001948320280296785, | |
| "loss": 1.1115, | |
| "step": 1104 | |
| }, | |
| { | |
| "epoch": 0.08526138279932546, | |
| "grad_norm": 0.12425632029771805, | |
| "learning_rate": 0.0001948217230008244, | |
| "loss": 1.0501, | |
| "step": 1106 | |
| }, | |
| { | |
| "epoch": 0.08541556251505661, | |
| "grad_norm": 0.1161596029996872, | |
| "learning_rate": 0.00019481141797197032, | |
| "loss": 1.0182, | |
| "step": 1108 | |
| }, | |
| { | |
| "epoch": 0.08556974223078777, | |
| "grad_norm": 0.11700072139501572, | |
| "learning_rate": 0.00019480111294311624, | |
| "loss": 1.0579, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.08572392194651891, | |
| "grad_norm": 0.14330415427684784, | |
| "learning_rate": 0.00019479080791426215, | |
| "loss": 1.1211, | |
| "step": 1112 | |
| }, | |
| { | |
| "epoch": 0.08587810166225006, | |
| "grad_norm": 0.14039026200771332, | |
| "learning_rate": 0.00019478050288540807, | |
| "loss": 1.0826, | |
| "step": 1114 | |
| }, | |
| { | |
| "epoch": 0.0860322813779812, | |
| "grad_norm": 0.14031362533569336, | |
| "learning_rate": 0.000194770197856554, | |
| "loss": 1.0871, | |
| "step": 1116 | |
| }, | |
| { | |
| "epoch": 0.08618646109371236, | |
| "grad_norm": 0.12351037561893463, | |
| "learning_rate": 0.00019475989282769993, | |
| "loss": 1.001, | |
| "step": 1118 | |
| }, | |
| { | |
| "epoch": 0.08634064080944351, | |
| "grad_norm": 0.11667052656412125, | |
| "learning_rate": 0.00019474958779884584, | |
| "loss": 1.0421, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.08649482052517465, | |
| "grad_norm": 0.1489124447107315, | |
| "learning_rate": 0.00019473928276999175, | |
| "loss": 1.1644, | |
| "step": 1122 | |
| }, | |
| { | |
| "epoch": 0.0866490002409058, | |
| "grad_norm": 0.1338202804327011, | |
| "learning_rate": 0.00019472897774113767, | |
| "loss": 1.1239, | |
| "step": 1124 | |
| }, | |
| { | |
| "epoch": 0.08680317995663696, | |
| "grad_norm": 0.13266493380069733, | |
| "learning_rate": 0.0001947186727122836, | |
| "loss": 1.0839, | |
| "step": 1126 | |
| }, | |
| { | |
| "epoch": 0.0869573596723681, | |
| "grad_norm": 0.13726286590099335, | |
| "learning_rate": 0.00019470836768342953, | |
| "loss": 1.1325, | |
| "step": 1128 | |
| }, | |
| { | |
| "epoch": 0.08711153938809925, | |
| "grad_norm": 0.14077100157737732, | |
| "learning_rate": 0.00019469806265457544, | |
| "loss": 1.0429, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.08726571910383041, | |
| "grad_norm": 0.1362866312265396, | |
| "learning_rate": 0.00019468775762572136, | |
| "loss": 1.0715, | |
| "step": 1132 | |
| }, | |
| { | |
| "epoch": 0.08741989881956155, | |
| "grad_norm": 0.12472223490476608, | |
| "learning_rate": 0.00019467745259686727, | |
| "loss": 1.0503, | |
| "step": 1134 | |
| }, | |
| { | |
| "epoch": 0.0875740785352927, | |
| "grad_norm": 0.1350635141134262, | |
| "learning_rate": 0.0001946671475680132, | |
| "loss": 1.0498, | |
| "step": 1136 | |
| }, | |
| { | |
| "epoch": 0.08772825825102384, | |
| "grad_norm": 0.1424301117658615, | |
| "learning_rate": 0.00019465684253915913, | |
| "loss": 1.1589, | |
| "step": 1138 | |
| }, | |
| { | |
| "epoch": 0.087882437966755, | |
| "grad_norm": 0.12365067005157471, | |
| "learning_rate": 0.00019464653751030504, | |
| "loss": 1.1065, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.08803661768248615, | |
| "grad_norm": 0.16497495770454407, | |
| "learning_rate": 0.00019463623248145096, | |
| "loss": 1.0189, | |
| "step": 1142 | |
| }, | |
| { | |
| "epoch": 0.0881907973982173, | |
| "grad_norm": 0.1381298303604126, | |
| "learning_rate": 0.00019462592745259687, | |
| "loss": 1.0426, | |
| "step": 1144 | |
| }, | |
| { | |
| "epoch": 0.08834497711394845, | |
| "grad_norm": 0.15007291734218597, | |
| "learning_rate": 0.00019461562242374282, | |
| "loss": 1.1108, | |
| "step": 1146 | |
| }, | |
| { | |
| "epoch": 0.0884991568296796, | |
| "grad_norm": 0.19384606182575226, | |
| "learning_rate": 0.00019460531739488873, | |
| "loss": 1.0664, | |
| "step": 1148 | |
| }, | |
| { | |
| "epoch": 0.08865333654541074, | |
| "grad_norm": 0.12032177299261093, | |
| "learning_rate": 0.00019459501236603465, | |
| "loss": 1.018, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.08880751626114189, | |
| "grad_norm": 0.1197669506072998, | |
| "learning_rate": 0.00019458470733718056, | |
| "loss": 1.071, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 0.08896169597687305, | |
| "grad_norm": 0.12108784914016724, | |
| "learning_rate": 0.00019457440230832647, | |
| "loss": 1.0499, | |
| "step": 1154 | |
| }, | |
| { | |
| "epoch": 0.0891158756926042, | |
| "grad_norm": 0.1270270049571991, | |
| "learning_rate": 0.0001945640972794724, | |
| "loss": 1.1172, | |
| "step": 1156 | |
| }, | |
| { | |
| "epoch": 0.08927005540833534, | |
| "grad_norm": 0.13599786162376404, | |
| "learning_rate": 0.0001945537922506183, | |
| "loss": 1.103, | |
| "step": 1158 | |
| }, | |
| { | |
| "epoch": 0.08942423512406648, | |
| "grad_norm": 0.12051045894622803, | |
| "learning_rate": 0.00019454348722176422, | |
| "loss": 1.0905, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.08957841483979764, | |
| "grad_norm": 0.12117696553468704, | |
| "learning_rate": 0.00019453318219291013, | |
| "loss": 1.0611, | |
| "step": 1162 | |
| }, | |
| { | |
| "epoch": 0.08973259455552879, | |
| "grad_norm": 0.13710887730121613, | |
| "learning_rate": 0.00019452287716405605, | |
| "loss": 1.0242, | |
| "step": 1164 | |
| }, | |
| { | |
| "epoch": 0.08988677427125993, | |
| "grad_norm": 0.1160813644528389, | |
| "learning_rate": 0.000194512572135202, | |
| "loss": 1.0863, | |
| "step": 1166 | |
| }, | |
| { | |
| "epoch": 0.09004095398699109, | |
| "grad_norm": 0.1754099279642105, | |
| "learning_rate": 0.0001945022671063479, | |
| "loss": 1.0938, | |
| "step": 1168 | |
| }, | |
| { | |
| "epoch": 0.09019513370272224, | |
| "grad_norm": 0.1331128627061844, | |
| "learning_rate": 0.00019449196207749382, | |
| "loss": 1.0692, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.09034931341845338, | |
| "grad_norm": 0.13422611355781555, | |
| "learning_rate": 0.00019448165704863974, | |
| "loss": 1.0699, | |
| "step": 1172 | |
| }, | |
| { | |
| "epoch": 0.09050349313418453, | |
| "grad_norm": 0.12999802827835083, | |
| "learning_rate": 0.00019447135201978565, | |
| "loss": 1.0957, | |
| "step": 1174 | |
| }, | |
| { | |
| "epoch": 0.09065767284991569, | |
| "grad_norm": 0.13413815200328827, | |
| "learning_rate": 0.0001944610469909316, | |
| "loss": 1.0869, | |
| "step": 1176 | |
| }, | |
| { | |
| "epoch": 0.09081185256564683, | |
| "grad_norm": 0.12901006639003754, | |
| "learning_rate": 0.0001944507419620775, | |
| "loss": 1.0442, | |
| "step": 1178 | |
| }, | |
| { | |
| "epoch": 0.09096603228137798, | |
| "grad_norm": 0.11824194341897964, | |
| "learning_rate": 0.00019444043693322342, | |
| "loss": 1.0935, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.09112021199710912, | |
| "grad_norm": 0.14895616471767426, | |
| "learning_rate": 0.00019443013190436934, | |
| "loss": 1.0624, | |
| "step": 1182 | |
| }, | |
| { | |
| "epoch": 0.09127439171284028, | |
| "grad_norm": 0.13515722751617432, | |
| "learning_rate": 0.00019441982687551525, | |
| "loss": 1.0797, | |
| "step": 1184 | |
| }, | |
| { | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.13411575555801392, | |
| "learning_rate": 0.00019440952184666117, | |
| "loss": 1.0637, | |
| "step": 1186 | |
| }, | |
| { | |
| "epoch": 0.09158275114430257, | |
| "grad_norm": 0.12519463896751404, | |
| "learning_rate": 0.0001943992168178071, | |
| "loss": 1.0608, | |
| "step": 1188 | |
| }, | |
| { | |
| "epoch": 0.09173693086003373, | |
| "grad_norm": 0.1267428696155548, | |
| "learning_rate": 0.00019438891178895302, | |
| "loss": 1.0182, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.09189111057576488, | |
| "grad_norm": 0.13116560876369476, | |
| "learning_rate": 0.00019437860676009894, | |
| "loss": 1.1139, | |
| "step": 1192 | |
| }, | |
| { | |
| "epoch": 0.09204529029149602, | |
| "grad_norm": 0.14659713208675385, | |
| "learning_rate": 0.00019436830173124485, | |
| "loss": 1.1275, | |
| "step": 1194 | |
| }, | |
| { | |
| "epoch": 0.09219947000722717, | |
| "grad_norm": 0.12913885712623596, | |
| "learning_rate": 0.00019435799670239077, | |
| "loss": 1.0858, | |
| "step": 1196 | |
| }, | |
| { | |
| "epoch": 0.09235364972295833, | |
| "grad_norm": 0.12855856120586395, | |
| "learning_rate": 0.0001943476916735367, | |
| "loss": 1.0811, | |
| "step": 1198 | |
| }, | |
| { | |
| "epoch": 0.09250782943868947, | |
| "grad_norm": 0.1391747146844864, | |
| "learning_rate": 0.00019433738664468263, | |
| "loss": 1.0146, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.09250782943868947, | |
| "eval_loss": 1.0912913084030151, | |
| "eval_runtime": 185.3661, | |
| "eval_samples_per_second": 91.403, | |
| "eval_steps_per_second": 1.43, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.09266200915442062, | |
| "grad_norm": 0.13186782598495483, | |
| "learning_rate": 0.00019432708161582854, | |
| "loss": 1.1017, | |
| "step": 1202 | |
| }, | |
| { | |
| "epoch": 0.09281618887015176, | |
| "grad_norm": 0.12913943827152252, | |
| "learning_rate": 0.00019431677658697446, | |
| "loss": 1.1027, | |
| "step": 1204 | |
| }, | |
| { | |
| "epoch": 0.09297036858588292, | |
| "grad_norm": 0.1349743753671646, | |
| "learning_rate": 0.00019430647155812037, | |
| "loss": 1.1023, | |
| "step": 1206 | |
| }, | |
| { | |
| "epoch": 0.09312454830161407, | |
| "grad_norm": 0.12534667551517487, | |
| "learning_rate": 0.00019429616652926629, | |
| "loss": 1.0659, | |
| "step": 1208 | |
| }, | |
| { | |
| "epoch": 0.09327872801734521, | |
| "grad_norm": 0.11720700562000275, | |
| "learning_rate": 0.0001942858615004122, | |
| "loss": 1.0532, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.09343290773307637, | |
| "grad_norm": 0.1364222913980484, | |
| "learning_rate": 0.00019427555647155812, | |
| "loss": 1.0575, | |
| "step": 1212 | |
| }, | |
| { | |
| "epoch": 0.09358708744880752, | |
| "grad_norm": 0.15532977879047394, | |
| "learning_rate": 0.00019426525144270403, | |
| "loss": 1.1145, | |
| "step": 1214 | |
| }, | |
| { | |
| "epoch": 0.09374126716453866, | |
| "grad_norm": 0.1377478837966919, | |
| "learning_rate": 0.00019425494641384995, | |
| "loss": 1.0505, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 0.09389544688026981, | |
| "grad_norm": 0.1273409128189087, | |
| "learning_rate": 0.0001942446413849959, | |
| "loss": 1.0873, | |
| "step": 1218 | |
| }, | |
| { | |
| "epoch": 0.09404962659600097, | |
| "grad_norm": 0.11990435421466827, | |
| "learning_rate": 0.0001942343363561418, | |
| "loss": 1.0829, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.09420380631173211, | |
| "grad_norm": 0.14191892743110657, | |
| "learning_rate": 0.00019422403132728772, | |
| "loss": 1.0992, | |
| "step": 1222 | |
| }, | |
| { | |
| "epoch": 0.09435798602746326, | |
| "grad_norm": 0.14520397782325745, | |
| "learning_rate": 0.00019421372629843363, | |
| "loss": 1.0712, | |
| "step": 1224 | |
| }, | |
| { | |
| "epoch": 0.09451216574319442, | |
| "grad_norm": 0.13780727982521057, | |
| "learning_rate": 0.00019420342126957955, | |
| "loss": 0.9943, | |
| "step": 1226 | |
| }, | |
| { | |
| "epoch": 0.09466634545892556, | |
| "grad_norm": 0.13550738990306854, | |
| "learning_rate": 0.0001941931162407255, | |
| "loss": 1.1264, | |
| "step": 1228 | |
| }, | |
| { | |
| "epoch": 0.09482052517465671, | |
| "grad_norm": 0.12125276774168015, | |
| "learning_rate": 0.0001941828112118714, | |
| "loss": 1.1207, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.09497470489038785, | |
| "grad_norm": 0.14529301226139069, | |
| "learning_rate": 0.00019417250618301732, | |
| "loss": 1.144, | |
| "step": 1232 | |
| }, | |
| { | |
| "epoch": 0.09512888460611901, | |
| "grad_norm": 0.15477551519870758, | |
| "learning_rate": 0.00019416220115416323, | |
| "loss": 1.0568, | |
| "step": 1234 | |
| }, | |
| { | |
| "epoch": 0.09528306432185016, | |
| "grad_norm": 0.1299963742494583, | |
| "learning_rate": 0.00019415189612530915, | |
| "loss": 1.0235, | |
| "step": 1236 | |
| }, | |
| { | |
| "epoch": 0.0954372440375813, | |
| "grad_norm": 0.1372281014919281, | |
| "learning_rate": 0.0001941415910964551, | |
| "loss": 1.0764, | |
| "step": 1238 | |
| }, | |
| { | |
| "epoch": 0.09559142375331245, | |
| "grad_norm": 0.1247306764125824, | |
| "learning_rate": 0.000194131286067601, | |
| "loss": 1.1345, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.09574560346904361, | |
| "grad_norm": 0.1330571472644806, | |
| "learning_rate": 0.00019412098103874692, | |
| "loss": 1.1596, | |
| "step": 1242 | |
| }, | |
| { | |
| "epoch": 0.09589978318477475, | |
| "grad_norm": 0.15787385404109955, | |
| "learning_rate": 0.00019411067600989284, | |
| "loss": 1.1067, | |
| "step": 1244 | |
| }, | |
| { | |
| "epoch": 0.0960539629005059, | |
| "grad_norm": 0.12646274268627167, | |
| "learning_rate": 0.00019410037098103875, | |
| "loss": 1.0769, | |
| "step": 1246 | |
| }, | |
| { | |
| "epoch": 0.09620814261623706, | |
| "grad_norm": 0.16424262523651123, | |
| "learning_rate": 0.0001940900659521847, | |
| "loss": 1.0459, | |
| "step": 1248 | |
| }, | |
| { | |
| "epoch": 0.0963623223319682, | |
| "grad_norm": 0.1401062309741974, | |
| "learning_rate": 0.0001940797609233306, | |
| "loss": 1.1308, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.09651650204769935, | |
| "grad_norm": 0.13971561193466187, | |
| "learning_rate": 0.00019406945589447652, | |
| "loss": 1.1457, | |
| "step": 1252 | |
| }, | |
| { | |
| "epoch": 0.0966706817634305, | |
| "grad_norm": 0.13544687628746033, | |
| "learning_rate": 0.00019405915086562244, | |
| "loss": 1.0532, | |
| "step": 1254 | |
| }, | |
| { | |
| "epoch": 0.09682486147916165, | |
| "grad_norm": 0.13527531921863556, | |
| "learning_rate": 0.00019404884583676835, | |
| "loss": 1.0376, | |
| "step": 1256 | |
| }, | |
| { | |
| "epoch": 0.0969790411948928, | |
| "grad_norm": 0.1731848120689392, | |
| "learning_rate": 0.0001940385408079143, | |
| "loss": 1.2252, | |
| "step": 1258 | |
| }, | |
| { | |
| "epoch": 0.09713322091062394, | |
| "grad_norm": 0.13142083585262299, | |
| "learning_rate": 0.0001940282357790602, | |
| "loss": 1.0254, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.09728740062635509, | |
| "grad_norm": 0.13390247523784637, | |
| "learning_rate": 0.00019401793075020612, | |
| "loss": 1.0448, | |
| "step": 1262 | |
| }, | |
| { | |
| "epoch": 0.09744158034208625, | |
| "grad_norm": 0.15188650786876678, | |
| "learning_rate": 0.00019400762572135204, | |
| "loss": 1.1019, | |
| "step": 1264 | |
| }, | |
| { | |
| "epoch": 0.0975957600578174, | |
| "grad_norm": 0.14055617153644562, | |
| "learning_rate": 0.00019399732069249795, | |
| "loss": 1.0835, | |
| "step": 1266 | |
| }, | |
| { | |
| "epoch": 0.09774993977354854, | |
| "grad_norm": 0.12209255248308182, | |
| "learning_rate": 0.00019398701566364387, | |
| "loss": 1.0675, | |
| "step": 1268 | |
| }, | |
| { | |
| "epoch": 0.0979041194892797, | |
| "grad_norm": 0.14639706909656525, | |
| "learning_rate": 0.00019397671063478978, | |
| "loss": 1.049, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.09805829920501084, | |
| "grad_norm": 0.13672591745853424, | |
| "learning_rate": 0.0001939664056059357, | |
| "loss": 1.1057, | |
| "step": 1272 | |
| }, | |
| { | |
| "epoch": 0.09821247892074199, | |
| "grad_norm": 0.1522635966539383, | |
| "learning_rate": 0.00019395610057708161, | |
| "loss": 1.14, | |
| "step": 1274 | |
| }, | |
| { | |
| "epoch": 0.09836665863647313, | |
| "grad_norm": 0.13887491822242737, | |
| "learning_rate": 0.00019394579554822753, | |
| "loss": 1.069, | |
| "step": 1276 | |
| }, | |
| { | |
| "epoch": 0.09852083835220429, | |
| "grad_norm": 0.13854965567588806, | |
| "learning_rate": 0.00019393549051937344, | |
| "loss": 1.0704, | |
| "step": 1278 | |
| }, | |
| { | |
| "epoch": 0.09867501806793544, | |
| "grad_norm": 0.12839765846729279, | |
| "learning_rate": 0.00019392518549051939, | |
| "loss": 1.0512, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.09882919778366658, | |
| "grad_norm": 0.1270405352115631, | |
| "learning_rate": 0.0001939148804616653, | |
| "loss": 1.0251, | |
| "step": 1282 | |
| }, | |
| { | |
| "epoch": 0.09898337749939773, | |
| "grad_norm": 0.1269143521785736, | |
| "learning_rate": 0.00019390457543281122, | |
| "loss": 1.0433, | |
| "step": 1284 | |
| }, | |
| { | |
| "epoch": 0.09913755721512889, | |
| "grad_norm": 0.14292192459106445, | |
| "learning_rate": 0.00019389427040395713, | |
| "loss": 1.1507, | |
| "step": 1286 | |
| }, | |
| { | |
| "epoch": 0.09929173693086003, | |
| "grad_norm": 0.12512263655662537, | |
| "learning_rate": 0.00019388396537510305, | |
| "loss": 1.0918, | |
| "step": 1288 | |
| }, | |
| { | |
| "epoch": 0.09944591664659118, | |
| "grad_norm": 0.11927679181098938, | |
| "learning_rate": 0.000193873660346249, | |
| "loss": 1.0924, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.09960009636232234, | |
| "grad_norm": 0.13639990985393524, | |
| "learning_rate": 0.0001938633553173949, | |
| "loss": 1.1024, | |
| "step": 1292 | |
| }, | |
| { | |
| "epoch": 0.09975427607805348, | |
| "grad_norm": 0.142363503575325, | |
| "learning_rate": 0.00019385305028854082, | |
| "loss": 1.021, | |
| "step": 1294 | |
| }, | |
| { | |
| "epoch": 0.09990845579378463, | |
| "grad_norm": 0.1389359086751938, | |
| "learning_rate": 0.00019384274525968673, | |
| "loss": 1.0269, | |
| "step": 1296 | |
| }, | |
| { | |
| "epoch": 0.10006263550951577, | |
| "grad_norm": 0.15595073997974396, | |
| "learning_rate": 0.00019383244023083265, | |
| "loss": 1.0913, | |
| "step": 1298 | |
| }, | |
| { | |
| "epoch": 0.10021681522524693, | |
| "grad_norm": 0.1324295848608017, | |
| "learning_rate": 0.0001938221352019786, | |
| "loss": 1.1001, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.10021681522524693, | |
| "eval_loss": 1.0909266471862793, | |
| "eval_runtime": 185.4116, | |
| "eval_samples_per_second": 91.38, | |
| "eval_steps_per_second": 1.429, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.10037099494097808, | |
| "grad_norm": 0.139576256275177, | |
| "learning_rate": 0.0001938118301731245, | |
| "loss": 1.1147, | |
| "step": 1302 | |
| }, | |
| { | |
| "epoch": 0.10052517465670922, | |
| "grad_norm": 0.12854811549186707, | |
| "learning_rate": 0.00019380152514427042, | |
| "loss": 1.0973, | |
| "step": 1304 | |
| }, | |
| { | |
| "epoch": 0.10067935437244037, | |
| "grad_norm": 0.1245393380522728, | |
| "learning_rate": 0.00019379122011541633, | |
| "loss": 1.0485, | |
| "step": 1306 | |
| }, | |
| { | |
| "epoch": 0.10083353408817153, | |
| "grad_norm": 0.13261497020721436, | |
| "learning_rate": 0.00019378091508656225, | |
| "loss": 1.156, | |
| "step": 1308 | |
| }, | |
| { | |
| "epoch": 0.10098771380390267, | |
| "grad_norm": 0.1255144327878952, | |
| "learning_rate": 0.0001937706100577082, | |
| "loss": 1.0852, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.10114189351963382, | |
| "grad_norm": 0.1412706971168518, | |
| "learning_rate": 0.0001937603050288541, | |
| "loss": 1.0766, | |
| "step": 1312 | |
| }, | |
| { | |
| "epoch": 0.10129607323536498, | |
| "grad_norm": 0.1281047761440277, | |
| "learning_rate": 0.00019375000000000002, | |
| "loss": 1.0824, | |
| "step": 1314 | |
| }, | |
| { | |
| "epoch": 0.10145025295109612, | |
| "grad_norm": 0.13307350873947144, | |
| "learning_rate": 0.00019373969497114594, | |
| "loss": 1.0887, | |
| "step": 1316 | |
| }, | |
| { | |
| "epoch": 0.10160443266682727, | |
| "grad_norm": 0.1287691742181778, | |
| "learning_rate": 0.00019372938994229185, | |
| "loss": 1.0705, | |
| "step": 1318 | |
| }, | |
| { | |
| "epoch": 0.10175861238255841, | |
| "grad_norm": 0.1303441971540451, | |
| "learning_rate": 0.00019371908491343777, | |
| "loss": 1.1684, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.10191279209828957, | |
| "grad_norm": 0.13304616510868073, | |
| "learning_rate": 0.00019370877988458368, | |
| "loss": 1.0944, | |
| "step": 1322 | |
| }, | |
| { | |
| "epoch": 0.10206697181402072, | |
| "grad_norm": 0.13905592262744904, | |
| "learning_rate": 0.0001936984748557296, | |
| "loss": 1.0915, | |
| "step": 1324 | |
| }, | |
| { | |
| "epoch": 0.10222115152975186, | |
| "grad_norm": 0.13225632905960083, | |
| "learning_rate": 0.0001936881698268755, | |
| "loss": 1.0418, | |
| "step": 1326 | |
| }, | |
| { | |
| "epoch": 0.10237533124548302, | |
| "grad_norm": 0.1267402619123459, | |
| "learning_rate": 0.00019367786479802142, | |
| "loss": 1.0446, | |
| "step": 1328 | |
| }, | |
| { | |
| "epoch": 0.10252951096121417, | |
| "grad_norm": 0.1439935863018036, | |
| "learning_rate": 0.00019366755976916737, | |
| "loss": 1.0582, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.10268369067694531, | |
| "grad_norm": 0.1267223060131073, | |
| "learning_rate": 0.00019365725474031328, | |
| "loss": 1.0176, | |
| "step": 1332 | |
| }, | |
| { | |
| "epoch": 0.10283787039267646, | |
| "grad_norm": 0.1298942118883133, | |
| "learning_rate": 0.0001936469497114592, | |
| "loss": 1.0552, | |
| "step": 1334 | |
| }, | |
| { | |
| "epoch": 0.10299205010840762, | |
| "grad_norm": 0.13010933995246887, | |
| "learning_rate": 0.0001936366446826051, | |
| "loss": 1.0848, | |
| "step": 1336 | |
| }, | |
| { | |
| "epoch": 0.10314622982413876, | |
| "grad_norm": 0.13728559017181396, | |
| "learning_rate": 0.00019362633965375103, | |
| "loss": 1.0779, | |
| "step": 1338 | |
| }, | |
| { | |
| "epoch": 0.10330040953986991, | |
| "grad_norm": 0.13863548636436462, | |
| "learning_rate": 0.00019361603462489697, | |
| "loss": 1.0326, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.10345458925560105, | |
| "grad_norm": 0.12995532155036926, | |
| "learning_rate": 0.00019360572959604288, | |
| "loss": 1.1427, | |
| "step": 1342 | |
| }, | |
| { | |
| "epoch": 0.10360876897133221, | |
| "grad_norm": 0.13650789856910706, | |
| "learning_rate": 0.0001935954245671888, | |
| "loss": 1.0528, | |
| "step": 1344 | |
| }, | |
| { | |
| "epoch": 0.10376294868706336, | |
| "grad_norm": 0.1336941123008728, | |
| "learning_rate": 0.0001935851195383347, | |
| "loss": 1.1155, | |
| "step": 1346 | |
| }, | |
| { | |
| "epoch": 0.1039171284027945, | |
| "grad_norm": 0.13927003741264343, | |
| "learning_rate": 0.00019357481450948063, | |
| "loss": 1.0551, | |
| "step": 1348 | |
| }, | |
| { | |
| "epoch": 0.10407130811852566, | |
| "grad_norm": 0.14504994451999664, | |
| "learning_rate": 0.00019356450948062657, | |
| "loss": 1.1014, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.10422548783425681, | |
| "grad_norm": 0.15796230733394623, | |
| "learning_rate": 0.00019355420445177248, | |
| "loss": 1.2115, | |
| "step": 1352 | |
| }, | |
| { | |
| "epoch": 0.10437966754998795, | |
| "grad_norm": 0.1317984163761139, | |
| "learning_rate": 0.0001935438994229184, | |
| "loss": 1.0933, | |
| "step": 1354 | |
| }, | |
| { | |
| "epoch": 0.1045338472657191, | |
| "grad_norm": 0.13189563155174255, | |
| "learning_rate": 0.00019353359439406431, | |
| "loss": 1.0664, | |
| "step": 1356 | |
| }, | |
| { | |
| "epoch": 0.10468802698145026, | |
| "grad_norm": 0.1323234885931015, | |
| "learning_rate": 0.00019352328936521023, | |
| "loss": 1.0824, | |
| "step": 1358 | |
| }, | |
| { | |
| "epoch": 0.1048422066971814, | |
| "grad_norm": 0.13659097254276276, | |
| "learning_rate": 0.00019351298433635614, | |
| "loss": 1.0334, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.10499638641291255, | |
| "grad_norm": 0.11882172524929047, | |
| "learning_rate": 0.0001935026793075021, | |
| "loss": 1.0401, | |
| "step": 1362 | |
| }, | |
| { | |
| "epoch": 0.1051505661286437, | |
| "grad_norm": 0.13025067746639252, | |
| "learning_rate": 0.000193492374278648, | |
| "loss": 1.0838, | |
| "step": 1364 | |
| }, | |
| { | |
| "epoch": 0.10530474584437485, | |
| "grad_norm": 0.1249939501285553, | |
| "learning_rate": 0.00019348206924979392, | |
| "loss": 1.0349, | |
| "step": 1366 | |
| }, | |
| { | |
| "epoch": 0.105458925560106, | |
| "grad_norm": 0.12588031589984894, | |
| "learning_rate": 0.00019347176422093983, | |
| "loss": 1.079, | |
| "step": 1368 | |
| }, | |
| { | |
| "epoch": 0.10561310527583714, | |
| "grad_norm": 0.12548890709877014, | |
| "learning_rate": 0.00019346145919208575, | |
| "loss": 1.0062, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.1057672849915683, | |
| "grad_norm": 0.13328798115253448, | |
| "learning_rate": 0.00019345115416323166, | |
| "loss": 1.1154, | |
| "step": 1372 | |
| }, | |
| { | |
| "epoch": 0.10592146470729945, | |
| "grad_norm": 0.1443903148174286, | |
| "learning_rate": 0.00019344084913437758, | |
| "loss": 1.097, | |
| "step": 1374 | |
| }, | |
| { | |
| "epoch": 0.1060756444230306, | |
| "grad_norm": 0.12835648655891418, | |
| "learning_rate": 0.0001934305441055235, | |
| "loss": 1.0723, | |
| "step": 1376 | |
| }, | |
| { | |
| "epoch": 0.10622982413876174, | |
| "grad_norm": 0.13068312406539917, | |
| "learning_rate": 0.0001934202390766694, | |
| "loss": 1.1128, | |
| "step": 1378 | |
| }, | |
| { | |
| "epoch": 0.1063840038544929, | |
| "grad_norm": 0.13628961145877838, | |
| "learning_rate": 0.00019340993404781532, | |
| "loss": 1.1146, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.10653818357022404, | |
| "grad_norm": 0.12263484299182892, | |
| "learning_rate": 0.00019339962901896126, | |
| "loss": 1.0947, | |
| "step": 1382 | |
| }, | |
| { | |
| "epoch": 0.10669236328595519, | |
| "grad_norm": 0.12684424221515656, | |
| "learning_rate": 0.00019338932399010718, | |
| "loss": 1.059, | |
| "step": 1384 | |
| }, | |
| { | |
| "epoch": 0.10684654300168633, | |
| "grad_norm": 0.1421595960855484, | |
| "learning_rate": 0.0001933790189612531, | |
| "loss": 1.0688, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 0.10700072271741749, | |
| "grad_norm": 0.12416025251150131, | |
| "learning_rate": 0.000193368713932399, | |
| "loss": 1.0905, | |
| "step": 1388 | |
| }, | |
| { | |
| "epoch": 0.10715490243314864, | |
| "grad_norm": 0.1284332126379013, | |
| "learning_rate": 0.00019335840890354492, | |
| "loss": 1.0612, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.10730908214887978, | |
| "grad_norm": 0.1282491385936737, | |
| "learning_rate": 0.00019334810387469086, | |
| "loss": 1.0851, | |
| "step": 1392 | |
| }, | |
| { | |
| "epoch": 0.10746326186461094, | |
| "grad_norm": 0.13221289217472076, | |
| "learning_rate": 0.00019333779884583678, | |
| "loss": 1.0446, | |
| "step": 1394 | |
| }, | |
| { | |
| "epoch": 0.10761744158034209, | |
| "grad_norm": 0.12401736527681351, | |
| "learning_rate": 0.0001933274938169827, | |
| "loss": 1.0826, | |
| "step": 1396 | |
| }, | |
| { | |
| "epoch": 0.10777162129607323, | |
| "grad_norm": 0.14316771924495697, | |
| "learning_rate": 0.0001933171887881286, | |
| "loss": 1.1136, | |
| "step": 1398 | |
| }, | |
| { | |
| "epoch": 0.10792580101180438, | |
| "grad_norm": 0.17223364114761353, | |
| "learning_rate": 0.00019330688375927452, | |
| "loss": 1.0752, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.10792580101180438, | |
| "eval_loss": 1.0899540185928345, | |
| "eval_runtime": 185.3818, | |
| "eval_samples_per_second": 91.395, | |
| "eval_steps_per_second": 1.429, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.10807998072753554, | |
| "grad_norm": 0.15027141571044922, | |
| "learning_rate": 0.00019329657873042047, | |
| "loss": 1.0371, | |
| "step": 1402 | |
| }, | |
| { | |
| "epoch": 0.10823416044326668, | |
| "grad_norm": 0.19876505434513092, | |
| "learning_rate": 0.00019328627370156638, | |
| "loss": 1.0312, | |
| "step": 1404 | |
| }, | |
| { | |
| "epoch": 0.10838834015899783, | |
| "grad_norm": 0.1422131210565567, | |
| "learning_rate": 0.0001932759686727123, | |
| "loss": 1.0597, | |
| "step": 1406 | |
| }, | |
| { | |
| "epoch": 0.10854251987472899, | |
| "grad_norm": 0.13597753643989563, | |
| "learning_rate": 0.0001932656636438582, | |
| "loss": 1.0939, | |
| "step": 1408 | |
| }, | |
| { | |
| "epoch": 0.10869669959046013, | |
| "grad_norm": 0.16808953881263733, | |
| "learning_rate": 0.00019325535861500413, | |
| "loss": 1.1221, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.10885087930619128, | |
| "grad_norm": 0.14884881675243378, | |
| "learning_rate": 0.00019324505358615007, | |
| "loss": 1.1114, | |
| "step": 1412 | |
| }, | |
| { | |
| "epoch": 0.10900505902192242, | |
| "grad_norm": 0.12680503726005554, | |
| "learning_rate": 0.00019323474855729598, | |
| "loss": 1.1032, | |
| "step": 1414 | |
| }, | |
| { | |
| "epoch": 0.10915923873765358, | |
| "grad_norm": 0.13997766375541687, | |
| "learning_rate": 0.0001932244435284419, | |
| "loss": 1.0799, | |
| "step": 1416 | |
| }, | |
| { | |
| "epoch": 0.10931341845338473, | |
| "grad_norm": 0.1343669593334198, | |
| "learning_rate": 0.0001932141384995878, | |
| "loss": 1.0778, | |
| "step": 1418 | |
| }, | |
| { | |
| "epoch": 0.10946759816911587, | |
| "grad_norm": 0.12029851973056793, | |
| "learning_rate": 0.00019320383347073373, | |
| "loss": 1.1021, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.10962177788484702, | |
| "grad_norm": 0.1322990357875824, | |
| "learning_rate": 0.00019319352844187967, | |
| "loss": 1.1061, | |
| "step": 1422 | |
| }, | |
| { | |
| "epoch": 0.10977595760057818, | |
| "grad_norm": 0.13710594177246094, | |
| "learning_rate": 0.00019318322341302558, | |
| "loss": 1.0786, | |
| "step": 1424 | |
| }, | |
| { | |
| "epoch": 0.10993013731630932, | |
| "grad_norm": 0.11956049501895905, | |
| "learning_rate": 0.0001931729183841715, | |
| "loss": 1.0711, | |
| "step": 1426 | |
| }, | |
| { | |
| "epoch": 0.11008431703204047, | |
| "grad_norm": 0.139973446726799, | |
| "learning_rate": 0.00019316261335531741, | |
| "loss": 1.1162, | |
| "step": 1428 | |
| }, | |
| { | |
| "epoch": 0.11023849674777163, | |
| "grad_norm": 0.1525941640138626, | |
| "learning_rate": 0.00019315230832646333, | |
| "loss": 1.0572, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.11039267646350277, | |
| "grad_norm": 0.1349973976612091, | |
| "learning_rate": 0.00019314200329760924, | |
| "loss": 1.1048, | |
| "step": 1432 | |
| }, | |
| { | |
| "epoch": 0.11054685617923392, | |
| "grad_norm": 0.1305711269378662, | |
| "learning_rate": 0.00019313169826875516, | |
| "loss": 1.0841, | |
| "step": 1434 | |
| }, | |
| { | |
| "epoch": 0.11070103589496506, | |
| "grad_norm": 0.16756822168827057, | |
| "learning_rate": 0.00019312139323990107, | |
| "loss": 1.0736, | |
| "step": 1436 | |
| }, | |
| { | |
| "epoch": 0.11085521561069622, | |
| "grad_norm": 0.13367486000061035, | |
| "learning_rate": 0.000193111088211047, | |
| "loss": 1.0774, | |
| "step": 1438 | |
| }, | |
| { | |
| "epoch": 0.11100939532642737, | |
| "grad_norm": 0.12484605610370636, | |
| "learning_rate": 0.0001931007831821929, | |
| "loss": 1.1196, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.11116357504215851, | |
| "grad_norm": 0.14064739644527435, | |
| "learning_rate": 0.00019309047815333885, | |
| "loss": 1.1101, | |
| "step": 1442 | |
| }, | |
| { | |
| "epoch": 0.11131775475788966, | |
| "grad_norm": 0.1366916447877884, | |
| "learning_rate": 0.00019308017312448476, | |
| "loss": 1.111, | |
| "step": 1444 | |
| }, | |
| { | |
| "epoch": 0.11147193447362082, | |
| "grad_norm": 0.11520934104919434, | |
| "learning_rate": 0.00019306986809563068, | |
| "loss": 1.065, | |
| "step": 1446 | |
| }, | |
| { | |
| "epoch": 0.11162611418935196, | |
| "grad_norm": 0.15567731857299805, | |
| "learning_rate": 0.0001930595630667766, | |
| "loss": 1.1036, | |
| "step": 1448 | |
| }, | |
| { | |
| "epoch": 0.11178029390508311, | |
| "grad_norm": 0.13628730177879333, | |
| "learning_rate": 0.0001930492580379225, | |
| "loss": 1.0717, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.11193447362081427, | |
| "grad_norm": 0.1359964907169342, | |
| "learning_rate": 0.00019303895300906842, | |
| "loss": 1.0986, | |
| "step": 1452 | |
| }, | |
| { | |
| "epoch": 0.11208865333654541, | |
| "grad_norm": 0.16372162103652954, | |
| "learning_rate": 0.00019302864798021436, | |
| "loss": 1.0306, | |
| "step": 1454 | |
| }, | |
| { | |
| "epoch": 0.11224283305227656, | |
| "grad_norm": 0.1724134087562561, | |
| "learning_rate": 0.00019301834295136028, | |
| "loss": 1.0753, | |
| "step": 1456 | |
| }, | |
| { | |
| "epoch": 0.1123970127680077, | |
| "grad_norm": 0.13646383583545685, | |
| "learning_rate": 0.0001930080379225062, | |
| "loss": 1.0975, | |
| "step": 1458 | |
| }, | |
| { | |
| "epoch": 0.11255119248373886, | |
| "grad_norm": 0.1522134691476822, | |
| "learning_rate": 0.0001929977328936521, | |
| "loss": 1.1031, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.11270537219947001, | |
| "grad_norm": 0.13656160235404968, | |
| "learning_rate": 0.00019298742786479802, | |
| "loss": 1.0602, | |
| "step": 1462 | |
| }, | |
| { | |
| "epoch": 0.11285955191520115, | |
| "grad_norm": 0.14140130579471588, | |
| "learning_rate": 0.00019297712283594396, | |
| "loss": 1.1289, | |
| "step": 1464 | |
| }, | |
| { | |
| "epoch": 0.1130137316309323, | |
| "grad_norm": 0.1383032351732254, | |
| "learning_rate": 0.00019296681780708988, | |
| "loss": 1.0797, | |
| "step": 1466 | |
| }, | |
| { | |
| "epoch": 0.11316791134666346, | |
| "grad_norm": 0.15723556280136108, | |
| "learning_rate": 0.0001929565127782358, | |
| "loss": 1.1156, | |
| "step": 1468 | |
| }, | |
| { | |
| "epoch": 0.1133220910623946, | |
| "grad_norm": 0.13462230563163757, | |
| "learning_rate": 0.0001929462077493817, | |
| "loss": 1.0953, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.11347627077812575, | |
| "grad_norm": 0.14101319015026093, | |
| "learning_rate": 0.00019293590272052762, | |
| "loss": 1.1152, | |
| "step": 1472 | |
| }, | |
| { | |
| "epoch": 0.11363045049385691, | |
| "grad_norm": 0.13705132901668549, | |
| "learning_rate": 0.00019292559769167357, | |
| "loss": 1.0886, | |
| "step": 1474 | |
| }, | |
| { | |
| "epoch": 0.11378463020958805, | |
| "grad_norm": 0.1206672340631485, | |
| "learning_rate": 0.00019291529266281948, | |
| "loss": 1.0995, | |
| "step": 1476 | |
| }, | |
| { | |
| "epoch": 0.1139388099253192, | |
| "grad_norm": 0.13666383922100067, | |
| "learning_rate": 0.0001929049876339654, | |
| "loss": 1.058, | |
| "step": 1478 | |
| }, | |
| { | |
| "epoch": 0.11409298964105034, | |
| "grad_norm": 0.1265423446893692, | |
| "learning_rate": 0.0001928946826051113, | |
| "loss": 1.0676, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.1142471693567815, | |
| "grad_norm": 0.1528097242116928, | |
| "learning_rate": 0.00019288437757625723, | |
| "loss": 1.0675, | |
| "step": 1482 | |
| }, | |
| { | |
| "epoch": 0.11440134907251265, | |
| "grad_norm": 0.16541676223278046, | |
| "learning_rate": 0.00019287407254740314, | |
| "loss": 1.1539, | |
| "step": 1484 | |
| }, | |
| { | |
| "epoch": 0.1145555287882438, | |
| "grad_norm": 0.20383091270923615, | |
| "learning_rate": 0.00019286376751854906, | |
| "loss": 1.0472, | |
| "step": 1486 | |
| }, | |
| { | |
| "epoch": 0.11470970850397495, | |
| "grad_norm": 0.13806484639644623, | |
| "learning_rate": 0.00019285346248969497, | |
| "loss": 1.0408, | |
| "step": 1488 | |
| }, | |
| { | |
| "epoch": 0.1148638882197061, | |
| "grad_norm": 0.1251746118068695, | |
| "learning_rate": 0.00019284315746084089, | |
| "loss": 1.1207, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.11501806793543724, | |
| "grad_norm": 0.13218504190444946, | |
| "learning_rate": 0.0001928328524319868, | |
| "loss": 1.1131, | |
| "step": 1492 | |
| }, | |
| { | |
| "epoch": 0.11517224765116839, | |
| "grad_norm": 0.21616914868354797, | |
| "learning_rate": 0.00019282254740313274, | |
| "loss": 1.1103, | |
| "step": 1494 | |
| }, | |
| { | |
| "epoch": 0.11532642736689955, | |
| "grad_norm": 0.1437305361032486, | |
| "learning_rate": 0.00019281224237427866, | |
| "loss": 1.1243, | |
| "step": 1496 | |
| }, | |
| { | |
| "epoch": 0.11548060708263069, | |
| "grad_norm": 0.13094168901443481, | |
| "learning_rate": 0.00019280193734542457, | |
| "loss": 1.1012, | |
| "step": 1498 | |
| }, | |
| { | |
| "epoch": 0.11563478679836184, | |
| "grad_norm": 0.12384334206581116, | |
| "learning_rate": 0.0001927916323165705, | |
| "loss": 1.05, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.11563478679836184, | |
| "eval_loss": 1.0905406475067139, | |
| "eval_runtime": 185.4473, | |
| "eval_samples_per_second": 91.363, | |
| "eval_steps_per_second": 1.429, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.11578896651409298, | |
| "grad_norm": 0.12807106971740723, | |
| "learning_rate": 0.0001927813272877164, | |
| "loss": 1.0754, | |
| "step": 1502 | |
| }, | |
| { | |
| "epoch": 0.11594314622982414, | |
| "grad_norm": 0.12517131865024567, | |
| "learning_rate": 0.00019277102225886234, | |
| "loss": 1.1017, | |
| "step": 1504 | |
| }, | |
| { | |
| "epoch": 0.11609732594555529, | |
| "grad_norm": 0.1704496592283249, | |
| "learning_rate": 0.00019276071723000826, | |
| "loss": 1.098, | |
| "step": 1506 | |
| }, | |
| { | |
| "epoch": 0.11625150566128643, | |
| "grad_norm": 0.12152231484651566, | |
| "learning_rate": 0.00019275041220115417, | |
| "loss": 1.0738, | |
| "step": 1508 | |
| }, | |
| { | |
| "epoch": 0.11640568537701759, | |
| "grad_norm": 0.12952156364917755, | |
| "learning_rate": 0.0001927401071723001, | |
| "loss": 1.0479, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.11655986509274874, | |
| "grad_norm": 0.1499640941619873, | |
| "learning_rate": 0.000192729802143446, | |
| "loss": 1.1046, | |
| "step": 1512 | |
| }, | |
| { | |
| "epoch": 0.11671404480847988, | |
| "grad_norm": 0.1331593543291092, | |
| "learning_rate": 0.00019271949711459195, | |
| "loss": 1.1219, | |
| "step": 1514 | |
| }, | |
| { | |
| "epoch": 0.11686822452421103, | |
| "grad_norm": 0.1368558406829834, | |
| "learning_rate": 0.00019270919208573786, | |
| "loss": 1.1357, | |
| "step": 1516 | |
| }, | |
| { | |
| "epoch": 0.11702240423994219, | |
| "grad_norm": 0.12278290838003159, | |
| "learning_rate": 0.00019269888705688378, | |
| "loss": 1.1079, | |
| "step": 1518 | |
| }, | |
| { | |
| "epoch": 0.11717658395567333, | |
| "grad_norm": 0.11737775802612305, | |
| "learning_rate": 0.0001926885820280297, | |
| "loss": 1.1224, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.11733076367140448, | |
| "grad_norm": 0.13017341494560242, | |
| "learning_rate": 0.0001926782769991756, | |
| "loss": 1.0648, | |
| "step": 1522 | |
| }, | |
| { | |
| "epoch": 0.11748494338713562, | |
| "grad_norm": 0.11939583718776703, | |
| "learning_rate": 0.00019266797197032155, | |
| "loss": 1.0899, | |
| "step": 1524 | |
| }, | |
| { | |
| "epoch": 0.11763912310286678, | |
| "grad_norm": 0.12446755915880203, | |
| "learning_rate": 0.00019265766694146746, | |
| "loss": 1.0626, | |
| "step": 1526 | |
| }, | |
| { | |
| "epoch": 0.11779330281859793, | |
| "grad_norm": 0.13369430601596832, | |
| "learning_rate": 0.00019264736191261338, | |
| "loss": 1.0526, | |
| "step": 1528 | |
| }, | |
| { | |
| "epoch": 0.11794748253432907, | |
| "grad_norm": 0.13470736145973206, | |
| "learning_rate": 0.0001926370568837593, | |
| "loss": 1.0946, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.11810166225006023, | |
| "grad_norm": 0.14193174242973328, | |
| "learning_rate": 0.0001926267518549052, | |
| "loss": 1.1089, | |
| "step": 1532 | |
| }, | |
| { | |
| "epoch": 0.11825584196579138, | |
| "grad_norm": 0.14893026649951935, | |
| "learning_rate": 0.00019261644682605112, | |
| "loss": 1.0606, | |
| "step": 1534 | |
| }, | |
| { | |
| "epoch": 0.11841002168152252, | |
| "grad_norm": 0.20594976842403412, | |
| "learning_rate": 0.00019260614179719704, | |
| "loss": 1.0375, | |
| "step": 1536 | |
| }, | |
| { | |
| "epoch": 0.11856420139725367, | |
| "grad_norm": 0.15287873148918152, | |
| "learning_rate": 0.00019259583676834295, | |
| "loss": 1.1414, | |
| "step": 1538 | |
| }, | |
| { | |
| "epoch": 0.11871838111298483, | |
| "grad_norm": 0.1275177299976349, | |
| "learning_rate": 0.00019258553173948887, | |
| "loss": 1.1084, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.11887256082871597, | |
| "grad_norm": 0.20036157965660095, | |
| "learning_rate": 0.00019257522671063478, | |
| "loss": 1.1261, | |
| "step": 1542 | |
| }, | |
| { | |
| "epoch": 0.11902674054444712, | |
| "grad_norm": 0.14492087066173553, | |
| "learning_rate": 0.0001925649216817807, | |
| "loss": 1.1137, | |
| "step": 1544 | |
| }, | |
| { | |
| "epoch": 0.11918092026017826, | |
| "grad_norm": 0.1259312629699707, | |
| "learning_rate": 0.00019255461665292664, | |
| "loss": 1.0409, | |
| "step": 1546 | |
| }, | |
| { | |
| "epoch": 0.11933509997590942, | |
| "grad_norm": 0.1296795755624771, | |
| "learning_rate": 0.00019254431162407255, | |
| "loss": 1.0332, | |
| "step": 1548 | |
| }, | |
| { | |
| "epoch": 0.11948927969164057, | |
| "grad_norm": 0.13372276723384857, | |
| "learning_rate": 0.00019253400659521847, | |
| "loss": 1.1087, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.11964345940737171, | |
| "grad_norm": 0.14354725182056427, | |
| "learning_rate": 0.00019252370156636438, | |
| "loss": 1.0398, | |
| "step": 1552 | |
| }, | |
| { | |
| "epoch": 0.11979763912310287, | |
| "grad_norm": 0.1378318965435028, | |
| "learning_rate": 0.0001925133965375103, | |
| "loss": 1.0542, | |
| "step": 1554 | |
| }, | |
| { | |
| "epoch": 0.11995181883883402, | |
| "grad_norm": 0.12171255797147751, | |
| "learning_rate": 0.00019250309150865624, | |
| "loss": 1.0935, | |
| "step": 1556 | |
| }, | |
| { | |
| "epoch": 0.12010599855456516, | |
| "grad_norm": 0.11905664205551147, | |
| "learning_rate": 0.00019249278647980215, | |
| "loss": 1.0097, | |
| "step": 1558 | |
| }, | |
| { | |
| "epoch": 0.12026017827029631, | |
| "grad_norm": 0.12854760885238647, | |
| "learning_rate": 0.00019248248145094807, | |
| "loss": 1.1517, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.12041435798602747, | |
| "grad_norm": 0.247908353805542, | |
| "learning_rate": 0.00019247217642209398, | |
| "loss": 1.0876, | |
| "step": 1562 | |
| }, | |
| { | |
| "epoch": 0.12056853770175861, | |
| "grad_norm": 0.1441553235054016, | |
| "learning_rate": 0.0001924618713932399, | |
| "loss": 1.1414, | |
| "step": 1564 | |
| }, | |
| { | |
| "epoch": 0.12072271741748976, | |
| "grad_norm": 0.13307887315750122, | |
| "learning_rate": 0.00019245156636438584, | |
| "loss": 1.1012, | |
| "step": 1566 | |
| }, | |
| { | |
| "epoch": 0.12087689713322092, | |
| "grad_norm": 0.14192406833171844, | |
| "learning_rate": 0.00019244126133553176, | |
| "loss": 1.1418, | |
| "step": 1568 | |
| }, | |
| { | |
| "epoch": 0.12103107684895206, | |
| "grad_norm": 0.11530864983797073, | |
| "learning_rate": 0.00019243095630667767, | |
| "loss": 1.0776, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 0.12118525656468321, | |
| "grad_norm": 0.13385196030139923, | |
| "learning_rate": 0.00019242065127782359, | |
| "loss": 1.1311, | |
| "step": 1572 | |
| }, | |
| { | |
| "epoch": 0.12133943628041435, | |
| "grad_norm": 0.1308089643716812, | |
| "learning_rate": 0.0001924103462489695, | |
| "loss": 1.0625, | |
| "step": 1574 | |
| }, | |
| { | |
| "epoch": 0.12149361599614551, | |
| "grad_norm": 0.11851842701435089, | |
| "learning_rate": 0.00019240004122011544, | |
| "loss": 1.0182, | |
| "step": 1576 | |
| }, | |
| { | |
| "epoch": 0.12164779571187666, | |
| "grad_norm": 0.2496737688779831, | |
| "learning_rate": 0.00019238973619126136, | |
| "loss": 1.0746, | |
| "step": 1578 | |
| }, | |
| { | |
| "epoch": 0.1218019754276078, | |
| "grad_norm": 0.12962055206298828, | |
| "learning_rate": 0.00019237943116240727, | |
| "loss": 1.0245, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 0.12195615514333895, | |
| "grad_norm": 0.13170978426933289, | |
| "learning_rate": 0.0001923691261335532, | |
| "loss": 0.9897, | |
| "step": 1582 | |
| }, | |
| { | |
| "epoch": 0.12211033485907011, | |
| "grad_norm": 0.13226309418678284, | |
| "learning_rate": 0.0001923588211046991, | |
| "loss": 1.1035, | |
| "step": 1584 | |
| }, | |
| { | |
| "epoch": 0.12226451457480125, | |
| "grad_norm": 0.11901077628135681, | |
| "learning_rate": 0.00019234851607584502, | |
| "loss": 1.0084, | |
| "step": 1586 | |
| }, | |
| { | |
| "epoch": 0.1224186942905324, | |
| "grad_norm": 0.15274369716644287, | |
| "learning_rate": 0.00019233821104699093, | |
| "loss": 1.1436, | |
| "step": 1588 | |
| }, | |
| { | |
| "epoch": 0.12257287400626356, | |
| "grad_norm": 0.11832466721534729, | |
| "learning_rate": 0.00019232790601813685, | |
| "loss": 1.0179, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 0.1227270537219947, | |
| "grad_norm": 0.13038666546344757, | |
| "learning_rate": 0.00019231760098928276, | |
| "loss": 1.0779, | |
| "step": 1592 | |
| }, | |
| { | |
| "epoch": 0.12288123343772585, | |
| "grad_norm": 0.12837626039981842, | |
| "learning_rate": 0.00019230729596042868, | |
| "loss": 1.1404, | |
| "step": 1594 | |
| }, | |
| { | |
| "epoch": 0.123035413153457, | |
| "grad_norm": 0.1400509923696518, | |
| "learning_rate": 0.00019229699093157462, | |
| "loss": 1.1132, | |
| "step": 1596 | |
| }, | |
| { | |
| "epoch": 0.12318959286918815, | |
| "grad_norm": 0.13757595419883728, | |
| "learning_rate": 0.00019228668590272053, | |
| "loss": 1.0816, | |
| "step": 1598 | |
| }, | |
| { | |
| "epoch": 0.1233437725849193, | |
| "grad_norm": 0.12403321266174316, | |
| "learning_rate": 0.00019227638087386645, | |
| "loss": 1.039, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.1233437725849193, | |
| "eval_loss": 1.0888522863388062, | |
| "eval_runtime": 185.2371, | |
| "eval_samples_per_second": 91.467, | |
| "eval_steps_per_second": 1.431, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.12349795230065044, | |
| "grad_norm": 0.12380605190992355, | |
| "learning_rate": 0.00019226607584501236, | |
| "loss": 1.0903, | |
| "step": 1602 | |
| }, | |
| { | |
| "epoch": 0.12365213201638159, | |
| "grad_norm": 0.13564443588256836, | |
| "learning_rate": 0.00019225577081615828, | |
| "loss": 1.0768, | |
| "step": 1604 | |
| }, | |
| { | |
| "epoch": 0.12380631173211275, | |
| "grad_norm": 0.1533685177564621, | |
| "learning_rate": 0.00019224546578730422, | |
| "loss": 1.0852, | |
| "step": 1606 | |
| }, | |
| { | |
| "epoch": 0.12396049144784389, | |
| "grad_norm": 0.1163390502333641, | |
| "learning_rate": 0.00019223516075845014, | |
| "loss": 1.0574, | |
| "step": 1608 | |
| }, | |
| { | |
| "epoch": 0.12411467116357504, | |
| "grad_norm": 0.13867324590682983, | |
| "learning_rate": 0.00019222485572959605, | |
| "loss": 1.0992, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 0.1242688508793062, | |
| "grad_norm": 0.12759087979793549, | |
| "learning_rate": 0.00019221455070074197, | |
| "loss": 1.0738, | |
| "step": 1612 | |
| }, | |
| { | |
| "epoch": 0.12442303059503734, | |
| "grad_norm": 0.1237189844250679, | |
| "learning_rate": 0.00019220424567188788, | |
| "loss": 1.0974, | |
| "step": 1614 | |
| }, | |
| { | |
| "epoch": 0.12457721031076849, | |
| "grad_norm": 0.13331052660942078, | |
| "learning_rate": 0.00019219394064303382, | |
| "loss": 1.0917, | |
| "step": 1616 | |
| }, | |
| { | |
| "epoch": 0.12473139002649963, | |
| "grad_norm": 0.1290212869644165, | |
| "learning_rate": 0.00019218363561417974, | |
| "loss": 1.0696, | |
| "step": 1618 | |
| }, | |
| { | |
| "epoch": 0.12488556974223079, | |
| "grad_norm": 0.13309410214424133, | |
| "learning_rate": 0.00019217333058532565, | |
| "loss": 1.043, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 0.12503974945796192, | |
| "grad_norm": 0.13453248143196106, | |
| "learning_rate": 0.00019216302555647157, | |
| "loss": 1.0435, | |
| "step": 1622 | |
| }, | |
| { | |
| "epoch": 0.1251939291736931, | |
| "grad_norm": 0.11639372259378433, | |
| "learning_rate": 0.00019215272052761748, | |
| "loss": 1.0579, | |
| "step": 1624 | |
| }, | |
| { | |
| "epoch": 0.12534810888942424, | |
| "grad_norm": 0.13231517374515533, | |
| "learning_rate": 0.0001921424154987634, | |
| "loss": 1.1268, | |
| "step": 1626 | |
| }, | |
| { | |
| "epoch": 0.1255022886051554, | |
| "grad_norm": 0.1349351406097412, | |
| "learning_rate": 0.00019213211046990934, | |
| "loss": 1.1599, | |
| "step": 1628 | |
| }, | |
| { | |
| "epoch": 0.12565646832088653, | |
| "grad_norm": 0.13710346817970276, | |
| "learning_rate": 0.00019212180544105525, | |
| "loss": 1.0866, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 0.12581064803661768, | |
| "grad_norm": 0.14535072445869446, | |
| "learning_rate": 0.00019211150041220117, | |
| "loss": 1.0445, | |
| "step": 1632 | |
| }, | |
| { | |
| "epoch": 0.12596482775234882, | |
| "grad_norm": 0.11799806356430054, | |
| "learning_rate": 0.00019210119538334708, | |
| "loss": 1.0525, | |
| "step": 1634 | |
| }, | |
| { | |
| "epoch": 0.12611900746807997, | |
| "grad_norm": 0.13399624824523926, | |
| "learning_rate": 0.000192090890354493, | |
| "loss": 1.0246, | |
| "step": 1636 | |
| }, | |
| { | |
| "epoch": 0.12627318718381114, | |
| "grad_norm": 0.14404788613319397, | |
| "learning_rate": 0.00019208058532563894, | |
| "loss": 1.0582, | |
| "step": 1638 | |
| }, | |
| { | |
| "epoch": 0.1264273668995423, | |
| "grad_norm": 0.14395713806152344, | |
| "learning_rate": 0.00019207028029678486, | |
| "loss": 1.0686, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 0.12658154661527343, | |
| "grad_norm": 0.13249294459819794, | |
| "learning_rate": 0.00019205997526793077, | |
| "loss": 1.1286, | |
| "step": 1642 | |
| }, | |
| { | |
| "epoch": 0.12673572633100458, | |
| "grad_norm": 0.12791812419891357, | |
| "learning_rate": 0.00019204967023907669, | |
| "loss": 1.062, | |
| "step": 1644 | |
| }, | |
| { | |
| "epoch": 0.12688990604673572, | |
| "grad_norm": 0.12210959941148758, | |
| "learning_rate": 0.0001920393652102226, | |
| "loss": 1.0419, | |
| "step": 1646 | |
| }, | |
| { | |
| "epoch": 0.12704408576246687, | |
| "grad_norm": 0.13438813388347626, | |
| "learning_rate": 0.00019202906018136852, | |
| "loss": 1.0589, | |
| "step": 1648 | |
| }, | |
| { | |
| "epoch": 0.127198265478198, | |
| "grad_norm": 0.12953762710094452, | |
| "learning_rate": 0.00019201875515251443, | |
| "loss": 1.0128, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.1273524451939292, | |
| "grad_norm": 0.1318603903055191, | |
| "learning_rate": 0.00019200845012366035, | |
| "loss": 1.073, | |
| "step": 1652 | |
| }, | |
| { | |
| "epoch": 0.12750662490966033, | |
| "grad_norm": 0.12956051528453827, | |
| "learning_rate": 0.00019199814509480626, | |
| "loss": 1.0489, | |
| "step": 1654 | |
| }, | |
| { | |
| "epoch": 0.12766080462539148, | |
| "grad_norm": 0.13501368463039398, | |
| "learning_rate": 0.00019198784006595218, | |
| "loss": 1.0198, | |
| "step": 1656 | |
| }, | |
| { | |
| "epoch": 0.12781498434112262, | |
| "grad_norm": 0.13902342319488525, | |
| "learning_rate": 0.00019197753503709812, | |
| "loss": 1.0512, | |
| "step": 1658 | |
| }, | |
| { | |
| "epoch": 0.12796916405685377, | |
| "grad_norm": 0.15590503811836243, | |
| "learning_rate": 0.00019196723000824403, | |
| "loss": 1.1782, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 0.1281233437725849, | |
| "grad_norm": 0.13954932987689972, | |
| "learning_rate": 0.00019195692497938995, | |
| "loss": 1.0421, | |
| "step": 1662 | |
| }, | |
| { | |
| "epoch": 0.12827752348831606, | |
| "grad_norm": 0.11550859361886978, | |
| "learning_rate": 0.00019194661995053586, | |
| "loss": 1.086, | |
| "step": 1664 | |
| }, | |
| { | |
| "epoch": 0.1284317032040472, | |
| "grad_norm": 0.12175869196653366, | |
| "learning_rate": 0.00019193631492168178, | |
| "loss": 1.0704, | |
| "step": 1666 | |
| }, | |
| { | |
| "epoch": 0.12858588291977838, | |
| "grad_norm": 0.13503512740135193, | |
| "learning_rate": 0.00019192600989282772, | |
| "loss": 1.1166, | |
| "step": 1668 | |
| }, | |
| { | |
| "epoch": 0.12874006263550952, | |
| "grad_norm": 0.12849009037017822, | |
| "learning_rate": 0.00019191570486397363, | |
| "loss": 1.0315, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 0.12889424235124067, | |
| "grad_norm": 0.12484319508075714, | |
| "learning_rate": 0.00019190539983511955, | |
| "loss": 1.0737, | |
| "step": 1672 | |
| }, | |
| { | |
| "epoch": 0.1290484220669718, | |
| "grad_norm": 0.1364014446735382, | |
| "learning_rate": 0.00019189509480626546, | |
| "loss": 1.0619, | |
| "step": 1674 | |
| }, | |
| { | |
| "epoch": 0.12920260178270296, | |
| "grad_norm": 0.12930172681808472, | |
| "learning_rate": 0.00019188478977741138, | |
| "loss": 1.046, | |
| "step": 1676 | |
| }, | |
| { | |
| "epoch": 0.1293567814984341, | |
| "grad_norm": 0.13860805332660675, | |
| "learning_rate": 0.00019187448474855732, | |
| "loss": 1.0832, | |
| "step": 1678 | |
| }, | |
| { | |
| "epoch": 0.12951096121416525, | |
| "grad_norm": 0.1379111111164093, | |
| "learning_rate": 0.00019186417971970324, | |
| "loss": 1.1406, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 0.12966514092989642, | |
| "grad_norm": 0.1349123865365982, | |
| "learning_rate": 0.00019185387469084915, | |
| "loss": 1.1055, | |
| "step": 1682 | |
| }, | |
| { | |
| "epoch": 0.12981932064562757, | |
| "grad_norm": 0.13304142653942108, | |
| "learning_rate": 0.00019184356966199507, | |
| "loss": 1.0392, | |
| "step": 1684 | |
| }, | |
| { | |
| "epoch": 0.1299735003613587, | |
| "grad_norm": 0.12159105390310287, | |
| "learning_rate": 0.00019183326463314098, | |
| "loss": 1.0548, | |
| "step": 1686 | |
| }, | |
| { | |
| "epoch": 0.13012768007708986, | |
| "grad_norm": 0.12661418318748474, | |
| "learning_rate": 0.00019182295960428692, | |
| "loss": 1.0588, | |
| "step": 1688 | |
| }, | |
| { | |
| "epoch": 0.130281859792821, | |
| "grad_norm": 0.13691510260105133, | |
| "learning_rate": 0.00019181265457543284, | |
| "loss": 1.0854, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 0.13043603950855215, | |
| "grad_norm": 0.1401318609714508, | |
| "learning_rate": 0.00019180234954657875, | |
| "loss": 1.0864, | |
| "step": 1692 | |
| }, | |
| { | |
| "epoch": 0.1305902192242833, | |
| "grad_norm": 0.1355384737253189, | |
| "learning_rate": 0.00019179204451772467, | |
| "loss": 1.058, | |
| "step": 1694 | |
| }, | |
| { | |
| "epoch": 0.13074439894001447, | |
| "grad_norm": 0.13987474143505096, | |
| "learning_rate": 0.00019178173948887058, | |
| "loss": 1.06, | |
| "step": 1696 | |
| }, | |
| { | |
| "epoch": 0.1308985786557456, | |
| "grad_norm": 0.14350661635398865, | |
| "learning_rate": 0.0001917714344600165, | |
| "loss": 1.0731, | |
| "step": 1698 | |
| }, | |
| { | |
| "epoch": 0.13105275837147676, | |
| "grad_norm": 0.12443742901086807, | |
| "learning_rate": 0.0001917611294311624, | |
| "loss": 1.0987, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.13105275837147676, | |
| "eval_loss": 1.0880467891693115, | |
| "eval_runtime": 185.5457, | |
| "eval_samples_per_second": 91.314, | |
| "eval_steps_per_second": 1.428, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.1312069380872079, | |
| "grad_norm": 0.10956554859876633, | |
| "learning_rate": 0.00019175082440230833, | |
| "loss": 1.0393, | |
| "step": 1702 | |
| }, | |
| { | |
| "epoch": 0.13136111780293905, | |
| "grad_norm": 0.11846137791872025, | |
| "learning_rate": 0.00019174051937345424, | |
| "loss": 1.0998, | |
| "step": 1704 | |
| }, | |
| { | |
| "epoch": 0.1315152975186702, | |
| "grad_norm": 0.11894328892230988, | |
| "learning_rate": 0.00019173021434460016, | |
| "loss": 1.1007, | |
| "step": 1706 | |
| }, | |
| { | |
| "epoch": 0.13166947723440134, | |
| "grad_norm": 0.11090514808893204, | |
| "learning_rate": 0.00019171990931574607, | |
| "loss": 1.0343, | |
| "step": 1708 | |
| }, | |
| { | |
| "epoch": 0.1318236569501325, | |
| "grad_norm": 0.1276719868183136, | |
| "learning_rate": 0.000191709604286892, | |
| "loss": 1.0392, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 0.13197783666586366, | |
| "grad_norm": 0.12342885881662369, | |
| "learning_rate": 0.00019169929925803793, | |
| "loss": 1.063, | |
| "step": 1712 | |
| }, | |
| { | |
| "epoch": 0.1321320163815948, | |
| "grad_norm": 0.1237882748246193, | |
| "learning_rate": 0.00019168899422918384, | |
| "loss": 1.0558, | |
| "step": 1714 | |
| }, | |
| { | |
| "epoch": 0.13228619609732595, | |
| "grad_norm": 0.12958785891532898, | |
| "learning_rate": 0.00019167868920032976, | |
| "loss": 1.0493, | |
| "step": 1716 | |
| }, | |
| { | |
| "epoch": 0.1324403758130571, | |
| "grad_norm": 0.1181110367178917, | |
| "learning_rate": 0.00019166838417147567, | |
| "loss": 1.0668, | |
| "step": 1718 | |
| }, | |
| { | |
| "epoch": 0.13259455552878824, | |
| "grad_norm": 0.12053950875997543, | |
| "learning_rate": 0.00019165807914262162, | |
| "loss": 1.0392, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 0.13274873524451938, | |
| "grad_norm": 0.11725175380706787, | |
| "learning_rate": 0.00019164777411376753, | |
| "loss": 1.0188, | |
| "step": 1722 | |
| }, | |
| { | |
| "epoch": 0.13290291496025053, | |
| "grad_norm": 0.12475614994764328, | |
| "learning_rate": 0.00019163746908491344, | |
| "loss": 1.0134, | |
| "step": 1724 | |
| }, | |
| { | |
| "epoch": 0.1330570946759817, | |
| "grad_norm": 0.1231207475066185, | |
| "learning_rate": 0.00019162716405605936, | |
| "loss": 1.0309, | |
| "step": 1726 | |
| }, | |
| { | |
| "epoch": 0.13321127439171285, | |
| "grad_norm": 0.1269765943288803, | |
| "learning_rate": 0.00019161685902720527, | |
| "loss": 1.0918, | |
| "step": 1728 | |
| }, | |
| { | |
| "epoch": 0.133365454107444, | |
| "grad_norm": 0.12103556841611862, | |
| "learning_rate": 0.00019160655399835122, | |
| "loss": 1.0453, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 0.13351963382317514, | |
| "grad_norm": 0.12427771091461182, | |
| "learning_rate": 0.00019159624896949713, | |
| "loss": 1.1544, | |
| "step": 1732 | |
| }, | |
| { | |
| "epoch": 0.13367381353890628, | |
| "grad_norm": 0.13416282832622528, | |
| "learning_rate": 0.00019158594394064305, | |
| "loss": 1.0941, | |
| "step": 1734 | |
| }, | |
| { | |
| "epoch": 0.13382799325463743, | |
| "grad_norm": 0.13207705318927765, | |
| "learning_rate": 0.00019157563891178896, | |
| "loss": 1.0998, | |
| "step": 1736 | |
| }, | |
| { | |
| "epoch": 0.13398217297036857, | |
| "grad_norm": 0.1436687856912613, | |
| "learning_rate": 0.00019156533388293488, | |
| "loss": 1.0723, | |
| "step": 1738 | |
| }, | |
| { | |
| "epoch": 0.13413635268609975, | |
| "grad_norm": 0.1206304207444191, | |
| "learning_rate": 0.00019155502885408082, | |
| "loss": 1.0279, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 0.1342905324018309, | |
| "grad_norm": 0.12685900926589966, | |
| "learning_rate": 0.00019154472382522673, | |
| "loss": 1.0683, | |
| "step": 1742 | |
| }, | |
| { | |
| "epoch": 0.13444471211756204, | |
| "grad_norm": 0.12833228707313538, | |
| "learning_rate": 0.00019153441879637265, | |
| "loss": 1.0904, | |
| "step": 1744 | |
| }, | |
| { | |
| "epoch": 0.13459889183329318, | |
| "grad_norm": 0.12999312579631805, | |
| "learning_rate": 0.00019152411376751856, | |
| "loss": 1.0492, | |
| "step": 1746 | |
| }, | |
| { | |
| "epoch": 0.13475307154902433, | |
| "grad_norm": 0.13486912846565247, | |
| "learning_rate": 0.00019151380873866448, | |
| "loss": 1.101, | |
| "step": 1748 | |
| }, | |
| { | |
| "epoch": 0.13490725126475547, | |
| "grad_norm": 0.12793023884296417, | |
| "learning_rate": 0.0001915035037098104, | |
| "loss": 1.1135, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.13506143098048662, | |
| "grad_norm": 0.12652675807476044, | |
| "learning_rate": 0.0001914931986809563, | |
| "loss": 1.0902, | |
| "step": 1752 | |
| }, | |
| { | |
| "epoch": 0.1352156106962178, | |
| "grad_norm": 0.12431836873292923, | |
| "learning_rate": 0.00019148289365210222, | |
| "loss": 1.0922, | |
| "step": 1754 | |
| }, | |
| { | |
| "epoch": 0.13536979041194894, | |
| "grad_norm": 0.13665209710597992, | |
| "learning_rate": 0.00019147258862324814, | |
| "loss": 1.0584, | |
| "step": 1756 | |
| }, | |
| { | |
| "epoch": 0.13552397012768008, | |
| "grad_norm": 0.1355196088552475, | |
| "learning_rate": 0.00019146228359439405, | |
| "loss": 1.1199, | |
| "step": 1758 | |
| }, | |
| { | |
| "epoch": 0.13567814984341123, | |
| "grad_norm": 0.14115893840789795, | |
| "learning_rate": 0.00019145197856554, | |
| "loss": 1.0697, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 0.13583232955914237, | |
| "grad_norm": 0.13009534776210785, | |
| "learning_rate": 0.0001914416735366859, | |
| "loss": 1.1111, | |
| "step": 1762 | |
| }, | |
| { | |
| "epoch": 0.13598650927487352, | |
| "grad_norm": 0.12280994653701782, | |
| "learning_rate": 0.00019143136850783182, | |
| "loss": 1.0341, | |
| "step": 1764 | |
| }, | |
| { | |
| "epoch": 0.13614068899060466, | |
| "grad_norm": 0.15171582996845245, | |
| "learning_rate": 0.00019142106347897774, | |
| "loss": 1.1275, | |
| "step": 1766 | |
| }, | |
| { | |
| "epoch": 0.1362948687063358, | |
| "grad_norm": 0.15258526802062988, | |
| "learning_rate": 0.00019141075845012365, | |
| "loss": 1.0513, | |
| "step": 1768 | |
| }, | |
| { | |
| "epoch": 0.13644904842206698, | |
| "grad_norm": 0.132346972823143, | |
| "learning_rate": 0.0001914004534212696, | |
| "loss": 1.0878, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 0.13660322813779813, | |
| "grad_norm": 0.13237041234970093, | |
| "learning_rate": 0.0001913901483924155, | |
| "loss": 1.0845, | |
| "step": 1772 | |
| }, | |
| { | |
| "epoch": 0.13675740785352927, | |
| "grad_norm": 0.13837209343910217, | |
| "learning_rate": 0.00019137984336356143, | |
| "loss": 1.1221, | |
| "step": 1774 | |
| }, | |
| { | |
| "epoch": 0.13691158756926042, | |
| "grad_norm": 0.17590375244617462, | |
| "learning_rate": 0.00019136953833470734, | |
| "loss": 1.1963, | |
| "step": 1776 | |
| }, | |
| { | |
| "epoch": 0.13706576728499156, | |
| "grad_norm": 0.12898488342761993, | |
| "learning_rate": 0.00019135923330585326, | |
| "loss": 1.1306, | |
| "step": 1778 | |
| }, | |
| { | |
| "epoch": 0.1372199470007227, | |
| "grad_norm": 0.12428785115480423, | |
| "learning_rate": 0.0001913489282769992, | |
| "loss": 1.068, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 0.13737412671645385, | |
| "grad_norm": 0.12678809463977814, | |
| "learning_rate": 0.0001913386232481451, | |
| "loss": 1.0709, | |
| "step": 1782 | |
| }, | |
| { | |
| "epoch": 0.13752830643218503, | |
| "grad_norm": 0.1344168782234192, | |
| "learning_rate": 0.00019132831821929103, | |
| "loss": 1.1073, | |
| "step": 1784 | |
| }, | |
| { | |
| "epoch": 0.13768248614791617, | |
| "grad_norm": 0.14730733633041382, | |
| "learning_rate": 0.00019131801319043694, | |
| "loss": 1.0073, | |
| "step": 1786 | |
| }, | |
| { | |
| "epoch": 0.13783666586364732, | |
| "grad_norm": 0.13661792874336243, | |
| "learning_rate": 0.00019130770816158286, | |
| "loss": 1.0637, | |
| "step": 1788 | |
| }, | |
| { | |
| "epoch": 0.13799084557937846, | |
| "grad_norm": 0.1342434138059616, | |
| "learning_rate": 0.0001912974031327288, | |
| "loss": 1.1069, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 0.1381450252951096, | |
| "grad_norm": 0.11941581219434738, | |
| "learning_rate": 0.00019128709810387471, | |
| "loss": 1.1023, | |
| "step": 1792 | |
| }, | |
| { | |
| "epoch": 0.13829920501084075, | |
| "grad_norm": 0.13641759753227234, | |
| "learning_rate": 0.00019127679307502063, | |
| "loss": 1.0564, | |
| "step": 1794 | |
| }, | |
| { | |
| "epoch": 0.1384533847265719, | |
| "grad_norm": 0.11148608475923538, | |
| "learning_rate": 0.00019126648804616654, | |
| "loss": 1.0255, | |
| "step": 1796 | |
| }, | |
| { | |
| "epoch": 0.13860756444230307, | |
| "grad_norm": 0.1387186199426651, | |
| "learning_rate": 0.00019125618301731246, | |
| "loss": 1.0663, | |
| "step": 1798 | |
| }, | |
| { | |
| "epoch": 0.13876174415803422, | |
| "grad_norm": 0.12380651384592056, | |
| "learning_rate": 0.00019124587798845837, | |
| "loss": 1.1222, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.13876174415803422, | |
| "eval_loss": 1.0875153541564941, | |
| "eval_runtime": 185.4605, | |
| "eval_samples_per_second": 91.356, | |
| "eval_steps_per_second": 1.429, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.13891592387376536, | |
| "grad_norm": 0.13224369287490845, | |
| "learning_rate": 0.00019123557295960432, | |
| "loss": 1.0821, | |
| "step": 1802 | |
| }, | |
| { | |
| "epoch": 0.1390701035894965, | |
| "grad_norm": 0.13096244633197784, | |
| "learning_rate": 0.00019122526793075023, | |
| "loss": 1.0097, | |
| "step": 1804 | |
| }, | |
| { | |
| "epoch": 0.13922428330522765, | |
| "grad_norm": 0.11652527749538422, | |
| "learning_rate": 0.00019121496290189615, | |
| "loss": 1.0517, | |
| "step": 1806 | |
| }, | |
| { | |
| "epoch": 0.1393784630209588, | |
| "grad_norm": 0.13449358940124512, | |
| "learning_rate": 0.00019120465787304206, | |
| "loss": 1.0915, | |
| "step": 1808 | |
| }, | |
| { | |
| "epoch": 0.13953264273668994, | |
| "grad_norm": 0.11550068855285645, | |
| "learning_rate": 0.00019119435284418798, | |
| "loss": 1.0568, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 0.13968682245242112, | |
| "grad_norm": 0.13804587721824646, | |
| "learning_rate": 0.0001911840478153339, | |
| "loss": 1.0933, | |
| "step": 1812 | |
| }, | |
| { | |
| "epoch": 0.13984100216815226, | |
| "grad_norm": 0.12062159180641174, | |
| "learning_rate": 0.0001911737427864798, | |
| "loss": 1.0517, | |
| "step": 1814 | |
| }, | |
| { | |
| "epoch": 0.1399951818838834, | |
| "grad_norm": 0.12154779583215714, | |
| "learning_rate": 0.00019116343775762572, | |
| "loss": 1.0955, | |
| "step": 1816 | |
| }, | |
| { | |
| "epoch": 0.14014936159961455, | |
| "grad_norm": 0.11615799367427826, | |
| "learning_rate": 0.00019115313272877164, | |
| "loss": 0.968, | |
| "step": 1818 | |
| }, | |
| { | |
| "epoch": 0.1403035413153457, | |
| "grad_norm": 0.1207037940621376, | |
| "learning_rate": 0.00019114282769991755, | |
| "loss": 1.0896, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 0.14045772103107684, | |
| "grad_norm": 0.12750887870788574, | |
| "learning_rate": 0.0001911325226710635, | |
| "loss": 1.065, | |
| "step": 1822 | |
| }, | |
| { | |
| "epoch": 0.140611900746808, | |
| "grad_norm": 0.16391952335834503, | |
| "learning_rate": 0.0001911222176422094, | |
| "loss": 1.0232, | |
| "step": 1824 | |
| }, | |
| { | |
| "epoch": 0.14076608046253913, | |
| "grad_norm": 0.14626921713352203, | |
| "learning_rate": 0.00019111191261335532, | |
| "loss": 1.0375, | |
| "step": 1826 | |
| }, | |
| { | |
| "epoch": 0.1409202601782703, | |
| "grad_norm": 0.12393996119499207, | |
| "learning_rate": 0.00019110160758450124, | |
| "loss": 1.0345, | |
| "step": 1828 | |
| }, | |
| { | |
| "epoch": 0.14107443989400145, | |
| "grad_norm": 0.13275925815105438, | |
| "learning_rate": 0.00019109130255564715, | |
| "loss": 1.071, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 0.1412286196097326, | |
| "grad_norm": 0.1255485862493515, | |
| "learning_rate": 0.0001910809975267931, | |
| "loss": 1.1026, | |
| "step": 1832 | |
| }, | |
| { | |
| "epoch": 0.14138279932546374, | |
| "grad_norm": 0.13399668037891388, | |
| "learning_rate": 0.000191070692497939, | |
| "loss": 1.11, | |
| "step": 1834 | |
| }, | |
| { | |
| "epoch": 0.1415369790411949, | |
| "grad_norm": 0.13084925711154938, | |
| "learning_rate": 0.00019106038746908492, | |
| "loss": 1.0528, | |
| "step": 1836 | |
| }, | |
| { | |
| "epoch": 0.14169115875692603, | |
| "grad_norm": 0.15695689618587494, | |
| "learning_rate": 0.00019105008244023084, | |
| "loss": 1.1336, | |
| "step": 1838 | |
| }, | |
| { | |
| "epoch": 0.14184533847265718, | |
| "grad_norm": 0.13630808889865875, | |
| "learning_rate": 0.00019103977741137675, | |
| "loss": 1.0767, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 0.14199951818838835, | |
| "grad_norm": 0.11874844878911972, | |
| "learning_rate": 0.0001910294723825227, | |
| "loss": 1.0511, | |
| "step": 1842 | |
| }, | |
| { | |
| "epoch": 0.1421536979041195, | |
| "grad_norm": 0.11898507922887802, | |
| "learning_rate": 0.0001910191673536686, | |
| "loss": 1.0866, | |
| "step": 1844 | |
| }, | |
| { | |
| "epoch": 0.14230787761985064, | |
| "grad_norm": 0.1393211930990219, | |
| "learning_rate": 0.00019100886232481453, | |
| "loss": 1.0553, | |
| "step": 1846 | |
| }, | |
| { | |
| "epoch": 0.1424620573355818, | |
| "grad_norm": 0.1382310539484024, | |
| "learning_rate": 0.00019099855729596044, | |
| "loss": 1.07, | |
| "step": 1848 | |
| }, | |
| { | |
| "epoch": 0.14261623705131293, | |
| "grad_norm": 0.1471824198961258, | |
| "learning_rate": 0.00019098825226710636, | |
| "loss": 1.0893, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.14277041676704408, | |
| "grad_norm": 0.12706084549427032, | |
| "learning_rate": 0.0001909779472382523, | |
| "loss": 1.0848, | |
| "step": 1852 | |
| }, | |
| { | |
| "epoch": 0.14292459648277522, | |
| "grad_norm": 0.1324569135904312, | |
| "learning_rate": 0.0001909676422093982, | |
| "loss": 1.024, | |
| "step": 1854 | |
| }, | |
| { | |
| "epoch": 0.1430787761985064, | |
| "grad_norm": 0.11245544254779816, | |
| "learning_rate": 0.00019095733718054413, | |
| "loss": 1.0802, | |
| "step": 1856 | |
| }, | |
| { | |
| "epoch": 0.14323295591423754, | |
| "grad_norm": 0.15419217944145203, | |
| "learning_rate": 0.00019094703215169004, | |
| "loss": 1.1101, | |
| "step": 1858 | |
| }, | |
| { | |
| "epoch": 0.1433871356299687, | |
| "grad_norm": 0.1071443036198616, | |
| "learning_rate": 0.00019093672712283596, | |
| "loss": 1.0576, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 0.14354131534569983, | |
| "grad_norm": 0.1341090053319931, | |
| "learning_rate": 0.00019092642209398187, | |
| "loss": 1.0606, | |
| "step": 1862 | |
| }, | |
| { | |
| "epoch": 0.14369549506143098, | |
| "grad_norm": 0.11848092079162598, | |
| "learning_rate": 0.0001909161170651278, | |
| "loss": 1.0714, | |
| "step": 1864 | |
| }, | |
| { | |
| "epoch": 0.14384967477716212, | |
| "grad_norm": 0.12697815895080566, | |
| "learning_rate": 0.0001909058120362737, | |
| "loss": 1.092, | |
| "step": 1866 | |
| }, | |
| { | |
| "epoch": 0.14400385449289327, | |
| "grad_norm": 0.11891257762908936, | |
| "learning_rate": 0.00019089550700741962, | |
| "loss": 0.9649, | |
| "step": 1868 | |
| }, | |
| { | |
| "epoch": 0.14415803420862444, | |
| "grad_norm": 0.12616439163684845, | |
| "learning_rate": 0.00019088520197856553, | |
| "loss": 1.0962, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 0.1443122139243556, | |
| "grad_norm": 0.12141067534685135, | |
| "learning_rate": 0.00019087489694971147, | |
| "loss": 1.0838, | |
| "step": 1872 | |
| }, | |
| { | |
| "epoch": 0.14446639364008673, | |
| "grad_norm": 0.13279564678668976, | |
| "learning_rate": 0.0001908645919208574, | |
| "loss": 1.0484, | |
| "step": 1874 | |
| }, | |
| { | |
| "epoch": 0.14462057335581788, | |
| "grad_norm": 0.15748505294322968, | |
| "learning_rate": 0.0001908542868920033, | |
| "loss": 1.1433, | |
| "step": 1876 | |
| }, | |
| { | |
| "epoch": 0.14477475307154902, | |
| "grad_norm": 0.11593475937843323, | |
| "learning_rate": 0.00019084398186314922, | |
| "loss": 1.1483, | |
| "step": 1878 | |
| }, | |
| { | |
| "epoch": 0.14492893278728017, | |
| "grad_norm": 0.14499489963054657, | |
| "learning_rate": 0.00019083367683429513, | |
| "loss": 1.0782, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 0.1450831125030113, | |
| "grad_norm": 0.13570410013198853, | |
| "learning_rate": 0.00019082337180544105, | |
| "loss": 1.0989, | |
| "step": 1882 | |
| }, | |
| { | |
| "epoch": 0.14523729221874246, | |
| "grad_norm": 0.12810774147510529, | |
| "learning_rate": 0.000190813066776587, | |
| "loss": 1.0374, | |
| "step": 1884 | |
| }, | |
| { | |
| "epoch": 0.14539147193447363, | |
| "grad_norm": 0.11781581491231918, | |
| "learning_rate": 0.0001908027617477329, | |
| "loss": 1.0796, | |
| "step": 1886 | |
| }, | |
| { | |
| "epoch": 0.14554565165020478, | |
| "grad_norm": 0.12243229150772095, | |
| "learning_rate": 0.00019079245671887882, | |
| "loss": 1.0477, | |
| "step": 1888 | |
| }, | |
| { | |
| "epoch": 0.14569983136593592, | |
| "grad_norm": 0.1385030299425125, | |
| "learning_rate": 0.00019078215169002474, | |
| "loss": 1.0349, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 0.14585401108166707, | |
| "grad_norm": 0.12011386454105377, | |
| "learning_rate": 0.00019077184666117065, | |
| "loss": 1.0718, | |
| "step": 1892 | |
| }, | |
| { | |
| "epoch": 0.1460081907973982, | |
| "grad_norm": 0.12646062672138214, | |
| "learning_rate": 0.0001907615416323166, | |
| "loss": 1.1228, | |
| "step": 1894 | |
| }, | |
| { | |
| "epoch": 0.14616237051312936, | |
| "grad_norm": 0.1284620612859726, | |
| "learning_rate": 0.0001907512366034625, | |
| "loss": 1.079, | |
| "step": 1896 | |
| }, | |
| { | |
| "epoch": 0.1463165502288605, | |
| "grad_norm": 0.15374581515789032, | |
| "learning_rate": 0.00019074093157460842, | |
| "loss": 1.1147, | |
| "step": 1898 | |
| }, | |
| { | |
| "epoch": 0.14647072994459168, | |
| "grad_norm": 0.1325882524251938, | |
| "learning_rate": 0.00019073062654575434, | |
| "loss": 1.0404, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.14647072994459168, | |
| "eval_loss": 1.0869932174682617, | |
| "eval_runtime": 185.4754, | |
| "eval_samples_per_second": 91.349, | |
| "eval_steps_per_second": 1.429, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.14662490966032282, | |
| "grad_norm": 0.14041611552238464, | |
| "learning_rate": 0.00019072032151690025, | |
| "loss": 1.095, | |
| "step": 1902 | |
| }, | |
| { | |
| "epoch": 0.14677908937605397, | |
| "grad_norm": 0.14162160456180573, | |
| "learning_rate": 0.0001907100164880462, | |
| "loss": 1.1714, | |
| "step": 1904 | |
| }, | |
| { | |
| "epoch": 0.1469332690917851, | |
| "grad_norm": 0.12077832221984863, | |
| "learning_rate": 0.0001906997114591921, | |
| "loss": 1.1109, | |
| "step": 1906 | |
| }, | |
| { | |
| "epoch": 0.14708744880751626, | |
| "grad_norm": 0.1738968789577484, | |
| "learning_rate": 0.00019068940643033802, | |
| "loss": 1.0838, | |
| "step": 1908 | |
| }, | |
| { | |
| "epoch": 0.1472416285232474, | |
| "grad_norm": 0.13948039710521698, | |
| "learning_rate": 0.00019067910140148394, | |
| "loss": 1.0494, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 0.14739580823897855, | |
| "grad_norm": 0.21179239451885223, | |
| "learning_rate": 0.00019066879637262985, | |
| "loss": 1.0962, | |
| "step": 1912 | |
| }, | |
| { | |
| "epoch": 0.14754998795470972, | |
| "grad_norm": 0.12927787005901337, | |
| "learning_rate": 0.00019065849134377577, | |
| "loss": 1.1113, | |
| "step": 1914 | |
| }, | |
| { | |
| "epoch": 0.14770416767044087, | |
| "grad_norm": 0.1296701431274414, | |
| "learning_rate": 0.00019064818631492168, | |
| "loss": 1.0603, | |
| "step": 1916 | |
| }, | |
| { | |
| "epoch": 0.147858347386172, | |
| "grad_norm": 0.1282590925693512, | |
| "learning_rate": 0.0001906378812860676, | |
| "loss": 1.0594, | |
| "step": 1918 | |
| }, | |
| { | |
| "epoch": 0.14801252710190316, | |
| "grad_norm": 0.13304758071899414, | |
| "learning_rate": 0.0001906275762572135, | |
| "loss": 1.0784, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 0.1481667068176343, | |
| "grad_norm": 0.15661965310573578, | |
| "learning_rate": 0.00019061727122835943, | |
| "loss": 1.008, | |
| "step": 1922 | |
| }, | |
| { | |
| "epoch": 0.14832088653336545, | |
| "grad_norm": 0.12986873090267181, | |
| "learning_rate": 0.00019060696619950537, | |
| "loss": 1.0788, | |
| "step": 1924 | |
| }, | |
| { | |
| "epoch": 0.1484750662490966, | |
| "grad_norm": 0.1128251776099205, | |
| "learning_rate": 0.00019059666117065128, | |
| "loss": 1.1449, | |
| "step": 1926 | |
| }, | |
| { | |
| "epoch": 0.14862924596482774, | |
| "grad_norm": 0.13722160458564758, | |
| "learning_rate": 0.0001905863561417972, | |
| "loss": 1.0914, | |
| "step": 1928 | |
| }, | |
| { | |
| "epoch": 0.1487834256805589, | |
| "grad_norm": 0.1507786512374878, | |
| "learning_rate": 0.00019057605111294311, | |
| "loss": 1.0694, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 0.14893760539629006, | |
| "grad_norm": 0.1368752121925354, | |
| "learning_rate": 0.00019056574608408903, | |
| "loss": 1.0417, | |
| "step": 1932 | |
| }, | |
| { | |
| "epoch": 0.1490917851120212, | |
| "grad_norm": 0.12566259503364563, | |
| "learning_rate": 0.00019055544105523497, | |
| "loss": 1.0853, | |
| "step": 1934 | |
| }, | |
| { | |
| "epoch": 0.14924596482775235, | |
| "grad_norm": 0.12362397462129593, | |
| "learning_rate": 0.0001905451360263809, | |
| "loss": 1.1136, | |
| "step": 1936 | |
| }, | |
| { | |
| "epoch": 0.1494001445434835, | |
| "grad_norm": 0.12472514808177948, | |
| "learning_rate": 0.0001905348309975268, | |
| "loss": 1.0628, | |
| "step": 1938 | |
| }, | |
| { | |
| "epoch": 0.14955432425921464, | |
| "grad_norm": 0.1355161964893341, | |
| "learning_rate": 0.00019052452596867272, | |
| "loss": 1.1211, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 0.14970850397494578, | |
| "grad_norm": 0.13438721001148224, | |
| "learning_rate": 0.00019051422093981863, | |
| "loss": 1.0758, | |
| "step": 1942 | |
| }, | |
| { | |
| "epoch": 0.14986268369067696, | |
| "grad_norm": 0.11768204718828201, | |
| "learning_rate": 0.00019050391591096457, | |
| "loss": 1.0533, | |
| "step": 1944 | |
| }, | |
| { | |
| "epoch": 0.1500168634064081, | |
| "grad_norm": 0.13892577588558197, | |
| "learning_rate": 0.0001904936108821105, | |
| "loss": 1.1076, | |
| "step": 1946 | |
| }, | |
| { | |
| "epoch": 0.15017104312213925, | |
| "grad_norm": 0.1532358080148697, | |
| "learning_rate": 0.0001904833058532564, | |
| "loss": 1.0706, | |
| "step": 1948 | |
| }, | |
| { | |
| "epoch": 0.1503252228378704, | |
| "grad_norm": 0.13364464044570923, | |
| "learning_rate": 0.00019047300082440232, | |
| "loss": 1.1322, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.15047940255360154, | |
| "grad_norm": 0.12663134932518005, | |
| "learning_rate": 0.00019046269579554823, | |
| "loss": 1.0749, | |
| "step": 1952 | |
| }, | |
| { | |
| "epoch": 0.15063358226933268, | |
| "grad_norm": 0.1297607123851776, | |
| "learning_rate": 0.00019045239076669417, | |
| "loss": 1.0594, | |
| "step": 1954 | |
| }, | |
| { | |
| "epoch": 0.15078776198506383, | |
| "grad_norm": 0.11931920051574707, | |
| "learning_rate": 0.0001904420857378401, | |
| "loss": 1.0522, | |
| "step": 1956 | |
| }, | |
| { | |
| "epoch": 0.150941941700795, | |
| "grad_norm": 0.1334810107946396, | |
| "learning_rate": 0.000190431780708986, | |
| "loss": 1.0674, | |
| "step": 1958 | |
| }, | |
| { | |
| "epoch": 0.15109612141652615, | |
| "grad_norm": 0.12633340060710907, | |
| "learning_rate": 0.00019042147568013192, | |
| "loss": 1.0139, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 0.1512503011322573, | |
| "grad_norm": 0.12485836446285248, | |
| "learning_rate": 0.00019041117065127783, | |
| "loss": 1.0288, | |
| "step": 1962 | |
| }, | |
| { | |
| "epoch": 0.15140448084798844, | |
| "grad_norm": 0.10940799117088318, | |
| "learning_rate": 0.00019040086562242375, | |
| "loss": 1.0475, | |
| "step": 1964 | |
| }, | |
| { | |
| "epoch": 0.15155866056371958, | |
| "grad_norm": 0.12229325622320175, | |
| "learning_rate": 0.00019039056059356966, | |
| "loss": 1.0628, | |
| "step": 1966 | |
| }, | |
| { | |
| "epoch": 0.15171284027945073, | |
| "grad_norm": 0.14333505928516388, | |
| "learning_rate": 0.00019038025556471558, | |
| "loss": 1.0423, | |
| "step": 1968 | |
| }, | |
| { | |
| "epoch": 0.15186701999518187, | |
| "grad_norm": 0.12773017585277557, | |
| "learning_rate": 0.0001903699505358615, | |
| "loss": 1.1283, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 0.15202119971091305, | |
| "grad_norm": 0.11913473904132843, | |
| "learning_rate": 0.0001903596455070074, | |
| "loss": 1.0646, | |
| "step": 1972 | |
| }, | |
| { | |
| "epoch": 0.1521753794266442, | |
| "grad_norm": 0.13321518898010254, | |
| "learning_rate": 0.00019034934047815332, | |
| "loss": 1.0476, | |
| "step": 1974 | |
| }, | |
| { | |
| "epoch": 0.15232955914237534, | |
| "grad_norm": 0.1362799108028412, | |
| "learning_rate": 0.00019033903544929927, | |
| "loss": 1.0937, | |
| "step": 1976 | |
| }, | |
| { | |
| "epoch": 0.15248373885810648, | |
| "grad_norm": 0.13804180920124054, | |
| "learning_rate": 0.00019032873042044518, | |
| "loss": 1.113, | |
| "step": 1978 | |
| }, | |
| { | |
| "epoch": 0.15263791857383763, | |
| "grad_norm": 0.1774570494890213, | |
| "learning_rate": 0.0001903184253915911, | |
| "loss": 1.0795, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 0.15279209828956877, | |
| "grad_norm": 0.13106994330883026, | |
| "learning_rate": 0.000190308120362737, | |
| "loss": 1.098, | |
| "step": 1982 | |
| }, | |
| { | |
| "epoch": 0.15294627800529992, | |
| "grad_norm": 0.14435411989688873, | |
| "learning_rate": 0.00019029781533388293, | |
| "loss": 1.0814, | |
| "step": 1984 | |
| }, | |
| { | |
| "epoch": 0.15310045772103106, | |
| "grad_norm": 0.13178013265132904, | |
| "learning_rate": 0.00019028751030502887, | |
| "loss": 1.1002, | |
| "step": 1986 | |
| }, | |
| { | |
| "epoch": 0.15325463743676224, | |
| "grad_norm": 0.1283218264579773, | |
| "learning_rate": 0.00019027720527617478, | |
| "loss": 1.0749, | |
| "step": 1988 | |
| }, | |
| { | |
| "epoch": 0.15340881715249338, | |
| "grad_norm": 0.12113723158836365, | |
| "learning_rate": 0.0001902669002473207, | |
| "loss": 1.0831, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 0.15356299686822453, | |
| "grad_norm": 0.12649892270565033, | |
| "learning_rate": 0.0001902565952184666, | |
| "loss": 1.0166, | |
| "step": 1992 | |
| }, | |
| { | |
| "epoch": 0.15371717658395567, | |
| "grad_norm": 0.12823793292045593, | |
| "learning_rate": 0.00019024629018961253, | |
| "loss": 1.0273, | |
| "step": 1994 | |
| }, | |
| { | |
| "epoch": 0.15387135629968682, | |
| "grad_norm": 0.1291527897119522, | |
| "learning_rate": 0.00019023598516075847, | |
| "loss": 1.1092, | |
| "step": 1996 | |
| }, | |
| { | |
| "epoch": 0.15402553601541796, | |
| "grad_norm": 0.12588894367218018, | |
| "learning_rate": 0.00019022568013190438, | |
| "loss": 1.0627, | |
| "step": 1998 | |
| }, | |
| { | |
| "epoch": 0.1541797157311491, | |
| "grad_norm": 0.12996312975883484, | |
| "learning_rate": 0.0001902153751030503, | |
| "loss": 1.1196, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1541797157311491, | |
| "eval_loss": 1.0863893032073975, | |
| "eval_runtime": 185.3254, | |
| "eval_samples_per_second": 91.423, | |
| "eval_steps_per_second": 1.43, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.15433389544688028, | |
| "grad_norm": 0.14361834526062012, | |
| "learning_rate": 0.00019020507007419621, | |
| "loss": 1.1151, | |
| "step": 2002 | |
| }, | |
| { | |
| "epoch": 0.15448807516261143, | |
| "grad_norm": 0.12650837004184723, | |
| "learning_rate": 0.00019019476504534213, | |
| "loss": 1.1155, | |
| "step": 2004 | |
| }, | |
| { | |
| "epoch": 0.15464225487834257, | |
| "grad_norm": 0.13820499181747437, | |
| "learning_rate": 0.00019018446001648807, | |
| "loss": 1.1243, | |
| "step": 2006 | |
| }, | |
| { | |
| "epoch": 0.15479643459407372, | |
| "grad_norm": 0.13205693662166595, | |
| "learning_rate": 0.00019017415498763399, | |
| "loss": 1.0626, | |
| "step": 2008 | |
| }, | |
| { | |
| "epoch": 0.15495061430980486, | |
| "grad_norm": 0.13930106163024902, | |
| "learning_rate": 0.0001901638499587799, | |
| "loss": 1.1105, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 0.155104794025536, | |
| "grad_norm": 0.14711922407150269, | |
| "learning_rate": 0.00019015354492992582, | |
| "loss": 1.0556, | |
| "step": 2012 | |
| }, | |
| { | |
| "epoch": 0.15525897374126715, | |
| "grad_norm": 0.11909156292676926, | |
| "learning_rate": 0.00019014323990107173, | |
| "loss": 1.1025, | |
| "step": 2014 | |
| }, | |
| { | |
| "epoch": 0.15541315345699833, | |
| "grad_norm": 0.14099714159965515, | |
| "learning_rate": 0.00019013293487221767, | |
| "loss": 1.064, | |
| "step": 2016 | |
| }, | |
| { | |
| "epoch": 0.15556733317272947, | |
| "grad_norm": 0.11500216275453568, | |
| "learning_rate": 0.0001901226298433636, | |
| "loss": 1.1196, | |
| "step": 2018 | |
| }, | |
| { | |
| "epoch": 0.15572151288846062, | |
| "grad_norm": 0.12341683357954025, | |
| "learning_rate": 0.0001901123248145095, | |
| "loss": 1.0625, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 0.15587569260419176, | |
| "grad_norm": 0.1390669196844101, | |
| "learning_rate": 0.00019010201978565542, | |
| "loss": 1.0526, | |
| "step": 2022 | |
| }, | |
| { | |
| "epoch": 0.1560298723199229, | |
| "grad_norm": 0.13482992351055145, | |
| "learning_rate": 0.00019009171475680133, | |
| "loss": 1.1074, | |
| "step": 2024 | |
| }, | |
| { | |
| "epoch": 0.15618405203565405, | |
| "grad_norm": 0.12277045845985413, | |
| "learning_rate": 0.00019008140972794725, | |
| "loss": 1.0648, | |
| "step": 2026 | |
| }, | |
| { | |
| "epoch": 0.1563382317513852, | |
| "grad_norm": 0.13579949736595154, | |
| "learning_rate": 0.00019007110469909316, | |
| "loss": 1.1235, | |
| "step": 2028 | |
| }, | |
| { | |
| "epoch": 0.15649241146711637, | |
| "grad_norm": 0.14128637313842773, | |
| "learning_rate": 0.00019006079967023908, | |
| "loss": 1.0442, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 0.15664659118284752, | |
| "grad_norm": 0.13722474873065948, | |
| "learning_rate": 0.000190050494641385, | |
| "loss": 1.1215, | |
| "step": 2032 | |
| }, | |
| { | |
| "epoch": 0.15680077089857866, | |
| "grad_norm": 0.13500674068927765, | |
| "learning_rate": 0.0001900401896125309, | |
| "loss": 1.0776, | |
| "step": 2034 | |
| }, | |
| { | |
| "epoch": 0.1569549506143098, | |
| "grad_norm": 0.11917294561862946, | |
| "learning_rate": 0.00019002988458367685, | |
| "loss": 1.0698, | |
| "step": 2036 | |
| }, | |
| { | |
| "epoch": 0.15710913033004095, | |
| "grad_norm": 0.12245581299066544, | |
| "learning_rate": 0.00019001957955482276, | |
| "loss": 1.0166, | |
| "step": 2038 | |
| }, | |
| { | |
| "epoch": 0.1572633100457721, | |
| "grad_norm": 0.12556669116020203, | |
| "learning_rate": 0.00019000927452596868, | |
| "loss": 1.0846, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 0.15741748976150324, | |
| "grad_norm": 0.13316373527050018, | |
| "learning_rate": 0.0001899989694971146, | |
| "loss": 1.0566, | |
| "step": 2042 | |
| }, | |
| { | |
| "epoch": 0.1575716694772344, | |
| "grad_norm": 0.1296815425157547, | |
| "learning_rate": 0.0001899886644682605, | |
| "loss": 1.0824, | |
| "step": 2044 | |
| }, | |
| { | |
| "epoch": 0.15772584919296556, | |
| "grad_norm": 0.1288246214389801, | |
| "learning_rate": 0.00018997835943940645, | |
| "loss": 1.0974, | |
| "step": 2046 | |
| }, | |
| { | |
| "epoch": 0.1578800289086967, | |
| "grad_norm": 0.1185479462146759, | |
| "learning_rate": 0.00018996805441055237, | |
| "loss": 1.1443, | |
| "step": 2048 | |
| }, | |
| { | |
| "epoch": 0.15803420862442785, | |
| "grad_norm": 0.12504369020462036, | |
| "learning_rate": 0.00018995774938169828, | |
| "loss": 1.0899, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.158188388340159, | |
| "grad_norm": 0.1266452521085739, | |
| "learning_rate": 0.0001899474443528442, | |
| "loss": 1.0654, | |
| "step": 2052 | |
| }, | |
| { | |
| "epoch": 0.15834256805589014, | |
| "grad_norm": 0.13447126746177673, | |
| "learning_rate": 0.0001899371393239901, | |
| "loss": 1.0649, | |
| "step": 2054 | |
| }, | |
| { | |
| "epoch": 0.1584967477716213, | |
| "grad_norm": 0.1446131467819214, | |
| "learning_rate": 0.00018992683429513603, | |
| "loss": 1.1439, | |
| "step": 2056 | |
| }, | |
| { | |
| "epoch": 0.15865092748735243, | |
| "grad_norm": 0.12688389420509338, | |
| "learning_rate": 0.00018991652926628197, | |
| "loss": 1.0262, | |
| "step": 2058 | |
| }, | |
| { | |
| "epoch": 0.1588051072030836, | |
| "grad_norm": 0.12581713497638702, | |
| "learning_rate": 0.00018990622423742788, | |
| "loss": 1.0723, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 0.15895928691881475, | |
| "grad_norm": 0.15745951235294342, | |
| "learning_rate": 0.0001898959192085738, | |
| "loss": 1.1038, | |
| "step": 2062 | |
| }, | |
| { | |
| "epoch": 0.1591134666345459, | |
| "grad_norm": 0.14457587897777557, | |
| "learning_rate": 0.0001898856141797197, | |
| "loss": 1.1072, | |
| "step": 2064 | |
| }, | |
| { | |
| "epoch": 0.15926764635027704, | |
| "grad_norm": 0.11454683542251587, | |
| "learning_rate": 0.00018987530915086563, | |
| "loss": 1.0605, | |
| "step": 2066 | |
| }, | |
| { | |
| "epoch": 0.1594218260660082, | |
| "grad_norm": 0.1137547716498375, | |
| "learning_rate": 0.00018986500412201157, | |
| "loss": 1.0405, | |
| "step": 2068 | |
| }, | |
| { | |
| "epoch": 0.15957600578173933, | |
| "grad_norm": 0.1220378428697586, | |
| "learning_rate": 0.00018985469909315748, | |
| "loss": 1.086, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 0.15973018549747048, | |
| "grad_norm": 0.13579098880290985, | |
| "learning_rate": 0.0001898443940643034, | |
| "loss": 1.0334, | |
| "step": 2072 | |
| }, | |
| { | |
| "epoch": 0.15988436521320165, | |
| "grad_norm": 0.1529407948255539, | |
| "learning_rate": 0.00018983408903544931, | |
| "loss": 1.0614, | |
| "step": 2074 | |
| }, | |
| { | |
| "epoch": 0.1600385449289328, | |
| "grad_norm": 0.13769444823265076, | |
| "learning_rate": 0.00018982378400659523, | |
| "loss": 1.1212, | |
| "step": 2076 | |
| }, | |
| { | |
| "epoch": 0.16019272464466394, | |
| "grad_norm": 0.12095335125923157, | |
| "learning_rate": 0.00018981347897774114, | |
| "loss": 1.047, | |
| "step": 2078 | |
| }, | |
| { | |
| "epoch": 0.1603469043603951, | |
| "grad_norm": 0.12483233958482742, | |
| "learning_rate": 0.00018980317394888706, | |
| "loss": 1.0808, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 0.16050108407612623, | |
| "grad_norm": 0.12451382726430893, | |
| "learning_rate": 0.00018979286892003297, | |
| "loss": 1.1259, | |
| "step": 2082 | |
| }, | |
| { | |
| "epoch": 0.16065526379185738, | |
| "grad_norm": 0.12540730834007263, | |
| "learning_rate": 0.0001897825638911789, | |
| "loss": 1.0761, | |
| "step": 2084 | |
| }, | |
| { | |
| "epoch": 0.16080944350758852, | |
| "grad_norm": 0.12948516011238098, | |
| "learning_rate": 0.0001897722588623248, | |
| "loss": 1.0621, | |
| "step": 2086 | |
| }, | |
| { | |
| "epoch": 0.16096362322331967, | |
| "grad_norm": 0.1349886953830719, | |
| "learning_rate": 0.00018976195383347075, | |
| "loss": 1.0549, | |
| "step": 2088 | |
| }, | |
| { | |
| "epoch": 0.16111780293905084, | |
| "grad_norm": 0.1249813437461853, | |
| "learning_rate": 0.00018975164880461666, | |
| "loss": 1.0828, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 0.161271982654782, | |
| "grad_norm": 0.1299104243516922, | |
| "learning_rate": 0.00018974134377576258, | |
| "loss": 1.097, | |
| "step": 2092 | |
| }, | |
| { | |
| "epoch": 0.16142616237051313, | |
| "grad_norm": 0.13004744052886963, | |
| "learning_rate": 0.0001897310387469085, | |
| "loss": 1.0417, | |
| "step": 2094 | |
| }, | |
| { | |
| "epoch": 0.16158034208624428, | |
| "grad_norm": 0.11553830653429031, | |
| "learning_rate": 0.0001897207337180544, | |
| "loss": 1.0563, | |
| "step": 2096 | |
| }, | |
| { | |
| "epoch": 0.16173452180197542, | |
| "grad_norm": 0.12000396102666855, | |
| "learning_rate": 0.00018971042868920035, | |
| "loss": 1.077, | |
| "step": 2098 | |
| }, | |
| { | |
| "epoch": 0.16188870151770657, | |
| "grad_norm": 0.13707685470581055, | |
| "learning_rate": 0.00018970012366034626, | |
| "loss": 1.0994, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.16188870151770657, | |
| "eval_loss": 1.0858707427978516, | |
| "eval_runtime": 185.7188, | |
| "eval_samples_per_second": 91.229, | |
| "eval_steps_per_second": 1.427, | |
| "step": 2100 | |
| } | |
| ], | |
| "logging_steps": 2, | |
| "max_steps": 38916, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 3, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.132999221824717e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |