{ "best_global_step": 1420000, "best_metric": 0.013992251828312874, "best_model_checkpoint": "./results_selfies_decoder_only/checkpoint-1420000", "epoch": 0.0142, "eval_steps": 20000, "global_step": 1420000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1e-06, "grad_norm": 79.6296157836914, "learning_rate": 4.909469568363889e-06, "loss": 11.9356, "step": 100 }, { "epoch": 2e-06, "grad_norm": 143.89768981933594, "learning_rate": 5.7082530717588735e-06, "loss": 9.9587, "step": 200 }, { "epoch": 3e-06, "grad_norm": 175.45932006835938, "learning_rate": 6.1597324724647675e-06, "loss": 8.6366, "step": 300 }, { "epoch": 4e-06, "grad_norm": 174.09616088867188, "learning_rate": 6.480441893489666e-06, "loss": 7.1984, "step": 400 }, { "epoch": 5e-06, "grad_norm": 161.46743774414062, "learning_rate": 6.72770373030742e-06, "loss": 5.6376, "step": 500 }, { "epoch": 6e-06, "grad_norm": 129.3070831298828, "learning_rate": 6.928968702203138e-06, "loss": 4.0149, "step": 600 }, { "epoch": 7e-06, "grad_norm": 64.37738800048828, "learning_rate": 7.098695118435496e-06, "loss": 2.9002, "step": 700 }, { "epoch": 8e-06, "grad_norm": 13.311994552612305, "learning_rate": 7.245441208744192e-06, "loss": 2.4644, "step": 800 }, { "epoch": 9e-06, "grad_norm": 6.188905239105225, "learning_rate": 7.374694260092186e-06, "loss": 2.4164, "step": 900 }, { "epoch": 1e-05, "grad_norm": 6.606897354125977, "learning_rate": 7.490184136213188e-06, "loss": 2.3921, "step": 1000 }, { "epoch": 1.1e-05, "grad_norm": 8.123486518859863, "learning_rate": 7.594561876470854e-06, "loss": 2.3689, "step": 1100 }, { "epoch": 1.2e-05, "grad_norm": 5.709030628204346, "learning_rate": 7.689779403706943e-06, "loss": 2.34, "step": 1200 }, { "epoch": 1.3e-05, "grad_norm": 3.3269248008728027, "learning_rate": 7.77731560566605e-06, "loss": 2.3049, "step": 1300 }, { "epoch": 1.4e-05, "grad_norm": 4.203290939331055, "learning_rate": 7.858317824980115e-06, "loss": 2.2719, "step": 1400 }, { "epoch": 1.5e-05, "grad_norm": 6.1287617683410645, "learning_rate": 7.933694108632486e-06, "loss": 2.2579, "step": 1500 }, { "epoch": 1.6e-05, "grad_norm": 7.363707065582275, "learning_rate": 8.004175449116454e-06, "loss": 2.2264, "step": 1600 }, { "epoch": 1.7e-05, "grad_norm": 3.850295305252075, "learning_rate": 8.070359018994354e-06, "loss": 2.1867, "step": 1700 }, { "epoch": 1.8e-05, "grad_norm": 4.057143688201904, "learning_rate": 8.132738964622577e-06, "loss": 2.1609, "step": 1800 }, { "epoch": 1.9e-05, "grad_norm": 5.508134365081787, "learning_rate": 8.191728822112599e-06, "loss": 2.126, "step": 1900 }, { "epoch": 2e-05, "grad_norm": 4.244765758514404, "learning_rate": 8.247678150068523e-06, "loss": 2.0946, "step": 2000 }, { "epoch": 2.1e-05, "grad_norm": 4.900757789611816, "learning_rate": 8.300885082044178e-06, "loss": 2.0594, "step": 2100 }, { "epoch": 2.2e-05, "grad_norm": 7.236623764038086, "learning_rate": 8.351605943901763e-06, "loss": 2.0215, "step": 2200 }, { "epoch": 2.3e-05, "grad_norm": 4.497879981994629, "learning_rate": 8.400062722973494e-06, "loss": 1.9637, "step": 2300 }, { "epoch": 2.4e-05, "grad_norm": 3.534646511077881, "learning_rate": 8.446448940289436e-06, "loss": 1.8918, "step": 2400 }, { "epoch": 2.5e-05, "grad_norm": 5.800156116485596, "learning_rate": 8.490934318841267e-06, "loss": 1.8287, "step": 2500 }, { "epoch": 2.6e-05, "grad_norm": 6.477711200714111, "learning_rate": 8.533668532464562e-06, "loss": 1.744, "step": 2600 }, { "epoch": 2.7e-05, "grad_norm": 3.663442611694336, "learning_rate": 8.574784244409386e-06, "loss": 1.6117, "step": 2700 }, { "epoch": 2.8e-05, "grad_norm": 4.181194305419922, "learning_rate": 8.614399591221578e-06, "loss": 1.3685, "step": 2800 }, { "epoch": 2.9e-05, "grad_norm": 3.2756075859069824, "learning_rate": 8.652620229176643e-06, "loss": 1.0925, "step": 2900 }, { "epoch": 3e-05, "grad_norm": 4.282373905181885, "learning_rate": 8.689541032578294e-06, "loss": 0.844, "step": 3000 }, { "epoch": 3.1e-05, "grad_norm": 2.8236887454986572, "learning_rate": 8.725247512658261e-06, "loss": 0.6808, "step": 3100 }, { "epoch": 3.2e-05, "grad_norm": 2.3734543323516846, "learning_rate": 8.759817010483776e-06, "loss": 0.5684, "step": 3200 }, { "epoch": 3.3e-05, "grad_norm": 2.235910177230835, "learning_rate": 8.793319705735932e-06, "loss": 0.4913, "step": 3300 }, { "epoch": 3.4e-05, "grad_norm": 2.969576358795166, "learning_rate": 8.825819474445214e-06, "loss": 0.4361, "step": 3400 }, { "epoch": 3.5e-05, "grad_norm": 1.8202422857284546, "learning_rate": 8.857374622035448e-06, "loss": 0.3882, "step": 3500 }, { "epoch": 3.6e-05, "grad_norm": 1.740777611732483, "learning_rate": 8.888038512815182e-06, "loss": 0.3516, "step": 3600 }, { "epoch": 3.7e-05, "grad_norm": 2.0678975582122803, "learning_rate": 8.917860112989142e-06, "loss": 0.3264, "step": 3700 }, { "epoch": 3.8e-05, "grad_norm": 1.4011726379394531, "learning_rate": 8.946884461066085e-06, "loss": 0.3057, "step": 3800 }, { "epoch": 3.9e-05, "grad_norm": 1.2788996696472168, "learning_rate": 8.975153077009357e-06, "loss": 0.2898, "step": 3900 }, { "epoch": 4e-05, "grad_norm": 2.4466378688812256, "learning_rate": 9.002704319460057e-06, "loss": 0.2746, "step": 4000 }, { "epoch": 4.1e-05, "grad_norm": 1.8154698610305786, "learning_rate": 9.029573698745934e-06, "loss": 0.2613, "step": 4100 }, { "epoch": 4.2e-05, "grad_norm": 1.257989764213562, "learning_rate": 9.055794152084609e-06, "loss": 0.2529, "step": 4200 }, { "epoch": 4.3e-05, "grad_norm": 1.622819185256958, "learning_rate": 9.081396286331678e-06, "loss": 0.2423, "step": 4300 }, { "epoch": 4.4e-05, "grad_norm": 1.5510390996932983, "learning_rate": 9.106408592760968e-06, "loss": 0.2364, "step": 4400 }, { "epoch": 4.5e-05, "grad_norm": 1.1682218313217163, "learning_rate": 9.130857637656786e-06, "loss": 0.2311, "step": 4500 }, { "epoch": 4.6e-05, "grad_norm": 1.1299382448196411, "learning_rate": 9.154768231915053e-06, "loss": 0.2223, "step": 4600 }, { "epoch": 4.7e-05, "grad_norm": 0.9747074246406555, "learning_rate": 9.178163582367894e-06, "loss": 0.2177, "step": 4700 }, { "epoch": 4.8e-05, "grad_norm": 1.4106073379516602, "learning_rate": 9.201065427145363e-06, "loss": 0.2115, "step": 4800 }, { "epoch": 4.9e-05, "grad_norm": 1.4786523580551147, "learning_rate": 9.223494157053206e-06, "loss": 0.2083, "step": 4900 }, { "epoch": 5e-05, "grad_norm": 1.3448201417922974, "learning_rate": 9.245468924665305e-06, "loss": 0.2038, "step": 5000 }, { "epoch": 5.1e-05, "grad_norm": 1.2726072072982788, "learning_rate": 9.267007742593346e-06, "loss": 0.2004, "step": 5100 }, { "epoch": 5.2e-05, "grad_norm": 0.9162949919700623, "learning_rate": 9.288127572197123e-06, "loss": 0.1949, "step": 5200 }, { "epoch": 5.3e-05, "grad_norm": 1.2940577268600464, "learning_rate": 9.30884440383014e-06, "loss": 0.1913, "step": 5300 }, { "epoch": 5.4e-05, "grad_norm": 1.3474128246307373, "learning_rate": 9.329173329571589e-06, "loss": 0.1882, "step": 5400 }, { "epoch": 5.5e-05, "grad_norm": 1.3732306957244873, "learning_rate": 9.349128609273442e-06, "loss": 0.1832, "step": 5500 }, { "epoch": 5.6e-05, "grad_norm": 1.6505043506622314, "learning_rate": 9.368723730646682e-06, "loss": 0.1823, "step": 5600 }, { "epoch": 5.7e-05, "grad_norm": 0.9124543070793152, "learning_rate": 9.387971464020811e-06, "loss": 0.1795, "step": 5700 }, { "epoch": 5.8e-05, "grad_norm": 1.0410525798797607, "learning_rate": 9.406883912333433e-06, "loss": 0.1778, "step": 5800 }, { "epoch": 5.9e-05, "grad_norm": 1.0157634019851685, "learning_rate": 9.425472556839983e-06, "loss": 0.1749, "step": 5900 }, { "epoch": 6e-05, "grad_norm": 1.0557055473327637, "learning_rate": 9.443748298975912e-06, "loss": 0.1742, "step": 6000 }, { "epoch": 6.1e-05, "grad_norm": 1.1464877128601074, "learning_rate": 9.461721498753552e-06, "loss": 0.1714, "step": 6100 }, { "epoch": 6.2e-05, "grad_norm": 0.8783308863639832, "learning_rate": 9.479402010032261e-06, "loss": 0.1709, "step": 6200 }, { "epoch": 6.3e-05, "grad_norm": 0.965745210647583, "learning_rate": 9.496799212962515e-06, "loss": 0.1679, "step": 6300 }, { "epoch": 6.4e-05, "grad_norm": 0.9326144456863403, "learning_rate": 9.51392204387139e-06, "loss": 0.1654, "step": 6400 }, { "epoch": 6.5e-05, "grad_norm": 0.8788262009620667, "learning_rate": 9.530779022827808e-06, "loss": 0.1649, "step": 6500 }, { "epoch": 6.6e-05, "grad_norm": 1.3065279722213745, "learning_rate": 9.547378279100432e-06, "loss": 0.1616, "step": 6600 }, { "epoch": 6.7e-05, "grad_norm": 0.9988572001457214, "learning_rate": 9.563727574698575e-06, "loss": 0.1603, "step": 6700 }, { "epoch": 6.8e-05, "grad_norm": 1.0125445127487183, "learning_rate": 9.579834326166864e-06, "loss": 0.1594, "step": 6800 }, { "epoch": 6.9e-05, "grad_norm": 1.2627679109573364, "learning_rate": 9.595705624786722e-06, "loss": 0.1571, "step": 6900 }, { "epoch": 7e-05, "grad_norm": 0.9639262557029724, "learning_rate": 9.611348255322519e-06, "loss": 0.1563, "step": 7000 }, { "epoch": 7.1e-05, "grad_norm": 0.7676745653152466, "learning_rate": 9.626768713436342e-06, "loss": 0.1545, "step": 7100 }, { "epoch": 7.2e-05, "grad_norm": 1.1250144243240356, "learning_rate": 9.64197322188329e-06, "loss": 0.1539, "step": 7200 }, { "epoch": 7.3e-05, "grad_norm": 0.8999272584915161, "learning_rate": 9.65696774558836e-06, "loss": 0.1508, "step": 7300 }, { "epoch": 7.4e-05, "grad_norm": 1.0707303285598755, "learning_rate": 9.671758005696342e-06, "loss": 0.1516, "step": 7400 }, { "epoch": 7.5e-05, "grad_norm": 1.2484121322631836, "learning_rate": 9.686349492677527e-06, "loss": 0.149, "step": 7500 }, { "epoch": 7.6e-05, "grad_norm": 0.9610104560852051, "learning_rate": 9.700747478564381e-06, "loss": 0.1478, "step": 7600 }, { "epoch": 7.7e-05, "grad_norm": 1.1037452220916748, "learning_rate": 9.714957028387432e-06, "loss": 0.1484, "step": 7700 }, { "epoch": 7.8e-05, "grad_norm": 0.8117715716362, "learning_rate": 9.728983010872413e-06, "loss": 0.1452, "step": 7800 }, { "epoch": 7.9e-05, "grad_norm": 0.8357411623001099, "learning_rate": 9.742830108455235e-06, "loss": 0.1446, "step": 7900 }, { "epoch": 8e-05, "grad_norm": 1.31978178024292, "learning_rate": 9.756502826666329e-06, "loss": 0.1432, "step": 8000 }, { "epoch": 8.1e-05, "grad_norm": 0.8099932074546814, "learning_rate": 9.770005502931384e-06, "loss": 0.1428, "step": 8100 }, { "epoch": 8.2e-05, "grad_norm": 0.8183396458625793, "learning_rate": 9.783342314831557e-06, "loss": 0.143, "step": 8200 }, { "epoch": 8.3e-05, "grad_norm": 0.7853631377220154, "learning_rate": 9.796517287862453e-06, "loss": 0.1388, "step": 8300 }, { "epoch": 8.4e-05, "grad_norm": 0.8730000257492065, "learning_rate": 9.809534302727974e-06, "loss": 0.139, "step": 8400 }, { "epoch": 8.5e-05, "grad_norm": 0.7672497034072876, "learning_rate": 9.822397102202073e-06, "loss": 0.1386, "step": 8500 }, { "epoch": 8.6e-05, "grad_norm": 0.9393063187599182, "learning_rate": 9.835109297588812e-06, "loss": 0.1374, "step": 8600 }, { "epoch": 8.7e-05, "grad_norm": 0.8092203140258789, "learning_rate": 9.84767437480856e-06, "loss": 0.1366, "step": 8700 }, { "epoch": 8.8e-05, "grad_norm": 0.8519502878189087, "learning_rate": 9.860095700136045e-06, "loss": 0.1381, "step": 8800 }, { "epoch": 8.9e-05, "grad_norm": 1.196933627128601, "learning_rate": 9.872376525613871e-06, "loss": 0.1367, "step": 8900 }, { "epoch": 9e-05, "grad_norm": 0.739916980266571, "learning_rate": 9.884519994163267e-06, "loss": 0.134, "step": 9000 }, { "epoch": 9.1e-05, "grad_norm": 0.7095433473587036, "learning_rate": 9.896529144412197e-06, "loss": 0.1327, "step": 9100 }, { "epoch": 9.2e-05, "grad_norm": 0.9242630004882812, "learning_rate": 9.908406915259342e-06, "loss": 0.1309, "step": 9200 }, { "epoch": 9.3e-05, "grad_norm": 0.7990697026252747, "learning_rate": 9.920156150191122e-06, "loss": 0.1311, "step": 9300 }, { "epoch": 9.4e-05, "grad_norm": 0.6396790742874146, "learning_rate": 9.931663980665275e-06, "loss": 0.1324, "step": 9400 }, { "epoch": 9.5e-05, "grad_norm": 0.7568519115447998, "learning_rate": 9.943051116588463e-06, "loss": 0.1315, "step": 9500 }, { "epoch": 9.6e-05, "grad_norm": 0.82841557264328, "learning_rate": 9.954433296569367e-06, "loss": 0.1289, "step": 9600 }, { "epoch": 9.7e-05, "grad_norm": 0.8421795964241028, "learning_rate": 9.965697389764977e-06, "loss": 0.1295, "step": 9700 }, { "epoch": 9.8e-05, "grad_norm": 1.2406800985336304, "learning_rate": 9.976734901699378e-06, "loss": 0.1282, "step": 9800 }, { "epoch": 9.9e-05, "grad_norm": 0.7478300929069519, "learning_rate": 9.987771144361851e-06, "loss": 0.1284, "step": 9900 }, { "epoch": 0.0001, "grad_norm": 0.6323376297950745, "learning_rate": 9.998696334198274e-06, "loss": 0.1245, "step": 10000 }, { "epoch": 0.000101, "grad_norm": 1.0688190460205078, "learning_rate": 1e-05, "loss": 0.127, "step": 10100 }, { "epoch": 0.000102, "grad_norm": 0.573604166507721, "learning_rate": 1e-05, "loss": 0.1254, "step": 10200 }, { "epoch": 0.000103, "grad_norm": 0.9026819467544556, "learning_rate": 1e-05, "loss": 0.125, "step": 10300 }, { "epoch": 0.000104, "grad_norm": 0.8466529846191406, "learning_rate": 1e-05, "loss": 0.1254, "step": 10400 }, { "epoch": 0.000105, "grad_norm": 0.6952452659606934, "learning_rate": 1e-05, "loss": 0.1237, "step": 10500 }, { "epoch": 0.000106, "grad_norm": 0.6572213172912598, "learning_rate": 1e-05, "loss": 0.123, "step": 10600 }, { "epoch": 0.000107, "grad_norm": 0.589888334274292, "learning_rate": 1e-05, "loss": 0.1225, "step": 10700 }, { "epoch": 0.000108, "grad_norm": 0.7127901911735535, "learning_rate": 1e-05, "loss": 0.1217, "step": 10800 }, { "epoch": 0.000109, "grad_norm": 0.7307358384132385, "learning_rate": 1e-05, "loss": 0.1223, "step": 10900 }, { "epoch": 0.00011, "grad_norm": 0.729360818862915, "learning_rate": 1e-05, "loss": 0.119, "step": 11000 }, { "epoch": 0.000111, "grad_norm": 0.696792721748352, "learning_rate": 1e-05, "loss": 0.1199, "step": 11100 }, { "epoch": 0.000112, "grad_norm": 0.6785752773284912, "learning_rate": 1e-05, "loss": 0.1208, "step": 11200 }, { "epoch": 0.000113, "grad_norm": 0.8026683330535889, "learning_rate": 1e-05, "loss": 0.1199, "step": 11300 }, { "epoch": 0.000114, "grad_norm": 0.6693434119224548, "learning_rate": 1e-05, "loss": 0.1167, "step": 11400 }, { "epoch": 0.000115, "grad_norm": 0.83057701587677, "learning_rate": 1e-05, "loss": 0.118, "step": 11500 }, { "epoch": 0.000116, "grad_norm": 0.8229856491088867, "learning_rate": 1e-05, "loss": 0.1184, "step": 11600 }, { "epoch": 0.000117, "grad_norm": 0.8003752827644348, "learning_rate": 1e-05, "loss": 0.1169, "step": 11700 }, { "epoch": 0.000118, "grad_norm": 0.9748431444168091, "learning_rate": 1e-05, "loss": 0.1178, "step": 11800 }, { "epoch": 0.000119, "grad_norm": 0.7694656252861023, "learning_rate": 1e-05, "loss": 0.1168, "step": 11900 }, { "epoch": 0.00012, "grad_norm": 0.7737765312194824, "learning_rate": 1e-05, "loss": 0.1148, "step": 12000 }, { "epoch": 0.000121, "grad_norm": 0.7660036683082581, "learning_rate": 1e-05, "loss": 0.115, "step": 12100 }, { "epoch": 0.000122, "grad_norm": 0.6069872975349426, "learning_rate": 1e-05, "loss": 0.114, "step": 12200 }, { "epoch": 0.000123, "grad_norm": 0.6484584808349609, "learning_rate": 1e-05, "loss": 0.1135, "step": 12300 }, { "epoch": 0.000124, "grad_norm": 0.6783984899520874, "learning_rate": 1e-05, "loss": 0.1127, "step": 12400 }, { "epoch": 0.000125, "grad_norm": 0.6452168226242065, "learning_rate": 1e-05, "loss": 0.1141, "step": 12500 }, { "epoch": 0.000126, "grad_norm": 0.7753832936286926, "learning_rate": 1e-05, "loss": 0.1127, "step": 12600 }, { "epoch": 0.000127, "grad_norm": 0.7392111420631409, "learning_rate": 1e-05, "loss": 0.1119, "step": 12700 }, { "epoch": 0.000128, "grad_norm": 0.6474965810775757, "learning_rate": 1e-05, "loss": 0.1118, "step": 12800 }, { "epoch": 0.000129, "grad_norm": 0.6379477381706238, "learning_rate": 1e-05, "loss": 0.1124, "step": 12900 }, { "epoch": 0.00013, "grad_norm": 0.5817776322364807, "learning_rate": 1e-05, "loss": 0.1096, "step": 13000 }, { "epoch": 0.000131, "grad_norm": 0.5496137142181396, "learning_rate": 1e-05, "loss": 0.1114, "step": 13100 }, { "epoch": 0.000132, "grad_norm": 0.7300313115119934, "learning_rate": 1e-05, "loss": 0.1099, "step": 13200 }, { "epoch": 0.000133, "grad_norm": 0.6597778797149658, "learning_rate": 1e-05, "loss": 0.1107, "step": 13300 }, { "epoch": 0.000134, "grad_norm": 0.7793627381324768, "learning_rate": 1e-05, "loss": 0.1096, "step": 13400 }, { "epoch": 0.000135, "grad_norm": 0.5615856647491455, "learning_rate": 1e-05, "loss": 0.1074, "step": 13500 }, { "epoch": 0.000136, "grad_norm": 0.5467724204063416, "learning_rate": 1e-05, "loss": 0.1092, "step": 13600 }, { "epoch": 0.000137, "grad_norm": 0.9674643874168396, "learning_rate": 1e-05, "loss": 0.1092, "step": 13700 }, { "epoch": 0.000138, "grad_norm": 0.7385745644569397, "learning_rate": 1e-05, "loss": 0.1077, "step": 13800 }, { "epoch": 0.000139, "grad_norm": 0.5477937459945679, "learning_rate": 1e-05, "loss": 0.1073, "step": 13900 }, { "epoch": 0.00014, "grad_norm": 0.6651309132575989, "learning_rate": 1e-05, "loss": 0.1076, "step": 14000 }, { "epoch": 0.000141, "grad_norm": 0.6908727884292603, "learning_rate": 1e-05, "loss": 0.1063, "step": 14100 }, { "epoch": 0.000142, "grad_norm": 0.5261303782463074, "learning_rate": 1e-05, "loss": 0.1061, "step": 14200 }, { "epoch": 0.000143, "grad_norm": 0.6175729632377625, "learning_rate": 1e-05, "loss": 0.1059, "step": 14300 }, { "epoch": 0.000144, "grad_norm": 1.0094670057296753, "learning_rate": 1e-05, "loss": 0.1067, "step": 14400 }, { "epoch": 0.000145, "grad_norm": 1.1605887413024902, "learning_rate": 1e-05, "loss": 0.1046, "step": 14500 }, { "epoch": 0.000146, "grad_norm": 0.5776452422142029, "learning_rate": 1e-05, "loss": 0.1029, "step": 14600 }, { "epoch": 0.000147, "grad_norm": 0.682066798210144, "learning_rate": 1e-05, "loss": 0.1034, "step": 14700 }, { "epoch": 0.000148, "grad_norm": 0.6078820824623108, "learning_rate": 1e-05, "loss": 0.1031, "step": 14800 }, { "epoch": 0.000149, "grad_norm": 0.7544209361076355, "learning_rate": 1e-05, "loss": 0.1041, "step": 14900 }, { "epoch": 0.00015, "grad_norm": 0.6524618864059448, "learning_rate": 1e-05, "loss": 0.1028, "step": 15000 }, { "epoch": 0.000151, "grad_norm": 0.7535697221755981, "learning_rate": 1e-05, "loss": 0.1014, "step": 15100 }, { "epoch": 0.000152, "grad_norm": 0.536113440990448, "learning_rate": 1e-05, "loss": 0.1048, "step": 15200 }, { "epoch": 0.000153, "grad_norm": 0.5757768750190735, "learning_rate": 1e-05, "loss": 0.1026, "step": 15300 }, { "epoch": 0.000154, "grad_norm": 0.7588083744049072, "learning_rate": 1e-05, "loss": 0.1021, "step": 15400 }, { "epoch": 0.000155, "grad_norm": 0.7120155692100525, "learning_rate": 1e-05, "loss": 0.1007, "step": 15500 }, { "epoch": 0.000156, "grad_norm": 0.5009164810180664, "learning_rate": 1e-05, "loss": 0.1002, "step": 15600 }, { "epoch": 0.000157, "grad_norm": 0.5904166102409363, "learning_rate": 1e-05, "loss": 0.0993, "step": 15700 }, { "epoch": 0.000158, "grad_norm": 0.6884382367134094, "learning_rate": 1e-05, "loss": 0.0995, "step": 15800 }, { "epoch": 0.000159, "grad_norm": 0.5788587927818298, "learning_rate": 1e-05, "loss": 0.0996, "step": 15900 }, { "epoch": 0.00016, "grad_norm": 0.44828513264656067, "learning_rate": 1e-05, "loss": 0.0989, "step": 16000 }, { "epoch": 0.000161, "grad_norm": 0.5391846299171448, "learning_rate": 1e-05, "loss": 0.1005, "step": 16100 }, { "epoch": 0.000162, "grad_norm": 0.6277580261230469, "learning_rate": 1e-05, "loss": 0.0993, "step": 16200 }, { "epoch": 0.000163, "grad_norm": 0.4787243902683258, "learning_rate": 1e-05, "loss": 0.0987, "step": 16300 }, { "epoch": 0.000164, "grad_norm": 0.5492888689041138, "learning_rate": 1e-05, "loss": 0.098, "step": 16400 }, { "epoch": 0.000165, "grad_norm": 0.9008030891418457, "learning_rate": 1e-05, "loss": 0.0974, "step": 16500 }, { "epoch": 0.000166, "grad_norm": 0.5352482199668884, "learning_rate": 1e-05, "loss": 0.0972, "step": 16600 }, { "epoch": 0.000167, "grad_norm": 0.6650977730751038, "learning_rate": 1e-05, "loss": 0.0979, "step": 16700 }, { "epoch": 0.000168, "grad_norm": 1.0476378202438354, "learning_rate": 1e-05, "loss": 0.0972, "step": 16800 }, { "epoch": 0.000169, "grad_norm": 0.844245970249176, "learning_rate": 1e-05, "loss": 0.0971, "step": 16900 }, { "epoch": 0.00017, "grad_norm": 0.48450130224227905, "learning_rate": 1e-05, "loss": 0.0966, "step": 17000 }, { "epoch": 0.000171, "grad_norm": 0.5955687761306763, "learning_rate": 1e-05, "loss": 0.0977, "step": 17100 }, { "epoch": 0.000172, "grad_norm": 0.7060225009918213, "learning_rate": 1e-05, "loss": 0.0969, "step": 17200 }, { "epoch": 0.000173, "grad_norm": 0.7918571829795837, "learning_rate": 1e-05, "loss": 0.0951, "step": 17300 }, { "epoch": 0.000174, "grad_norm": 0.5289338231086731, "learning_rate": 1e-05, "loss": 0.0957, "step": 17400 }, { "epoch": 0.000175, "grad_norm": 0.731738805770874, "learning_rate": 1e-05, "loss": 0.0956, "step": 17500 }, { "epoch": 0.000176, "grad_norm": 0.5855299234390259, "learning_rate": 1e-05, "loss": 0.095, "step": 17600 }, { "epoch": 0.000177, "grad_norm": 0.980675995349884, "learning_rate": 1e-05, "loss": 0.0944, "step": 17700 }, { "epoch": 0.000178, "grad_norm": 0.5245928764343262, "learning_rate": 1e-05, "loss": 0.0961, "step": 17800 }, { "epoch": 0.000179, "grad_norm": 0.5417737364768982, "learning_rate": 1e-05, "loss": 0.0948, "step": 17900 }, { "epoch": 0.00018, "grad_norm": 0.5689075589179993, "learning_rate": 1e-05, "loss": 0.0938, "step": 18000 }, { "epoch": 0.000181, "grad_norm": 0.6176053285598755, "learning_rate": 1e-05, "loss": 0.0944, "step": 18100 }, { "epoch": 0.000182, "grad_norm": 0.5518333911895752, "learning_rate": 1e-05, "loss": 0.0934, "step": 18200 }, { "epoch": 0.000183, "grad_norm": 0.5716297626495361, "learning_rate": 1e-05, "loss": 0.0932, "step": 18300 }, { "epoch": 0.000184, "grad_norm": 0.5970026850700378, "learning_rate": 1e-05, "loss": 0.0921, "step": 18400 }, { "epoch": 0.000185, "grad_norm": 0.6821596622467041, "learning_rate": 1e-05, "loss": 0.0923, "step": 18500 }, { "epoch": 0.000186, "grad_norm": 0.5702884197235107, "learning_rate": 1e-05, "loss": 0.0928, "step": 18600 }, { "epoch": 0.000187, "grad_norm": 0.5844604969024658, "learning_rate": 1e-05, "loss": 0.0927, "step": 18700 }, { "epoch": 0.000188, "grad_norm": 0.5112538933753967, "learning_rate": 1e-05, "loss": 0.0921, "step": 18800 }, { "epoch": 0.000189, "grad_norm": 0.5456957817077637, "learning_rate": 1e-05, "loss": 0.0925, "step": 18900 }, { "epoch": 0.00019, "grad_norm": 0.6565130352973938, "learning_rate": 1e-05, "loss": 0.091, "step": 19000 }, { "epoch": 0.000191, "grad_norm": 0.5464591979980469, "learning_rate": 1e-05, "loss": 0.092, "step": 19100 }, { "epoch": 0.000192, "grad_norm": 0.4368850290775299, "learning_rate": 1e-05, "loss": 0.0911, "step": 19200 }, { "epoch": 0.000193, "grad_norm": 0.5311338305473328, "learning_rate": 1e-05, "loss": 0.0903, "step": 19300 }, { "epoch": 0.000194, "grad_norm": 0.6288972496986389, "learning_rate": 1e-05, "loss": 0.0899, "step": 19400 }, { "epoch": 0.000195, "grad_norm": 0.48520079255104065, "learning_rate": 1e-05, "loss": 0.0908, "step": 19500 }, { "epoch": 0.000196, "grad_norm": 0.678741991519928, "learning_rate": 1e-05, "loss": 0.0898, "step": 19600 }, { "epoch": 0.000197, "grad_norm": 0.5822370052337646, "learning_rate": 1e-05, "loss": 0.0899, "step": 19700 }, { "epoch": 0.000198, "grad_norm": 0.9428046941757202, "learning_rate": 1e-05, "loss": 0.0896, "step": 19800 }, { "epoch": 0.000199, "grad_norm": 0.5581244230270386, "learning_rate": 1e-05, "loss": 0.0902, "step": 19900 }, { "epoch": 0.0002, "grad_norm": 0.5948507189750671, "learning_rate": 1e-05, "loss": 0.0898, "step": 20000 }, { "epoch": 0.0002, "eval_loss": 0.07388898730278015, "eval_runtime": 212.5627, "eval_samples_per_second": 235.225, "eval_steps_per_second": 14.702, "step": 20000 }, { "epoch": 0.000201, "grad_norm": 0.6277576684951782, "learning_rate": 1e-05, "loss": 0.0887, "step": 20100 }, { "epoch": 0.000202, "grad_norm": 0.5984957814216614, "learning_rate": 1e-05, "loss": 0.0887, "step": 20200 }, { "epoch": 0.000203, "grad_norm": 0.5705910921096802, "learning_rate": 1e-05, "loss": 0.0873, "step": 20300 }, { "epoch": 0.000204, "grad_norm": 0.5601595044136047, "learning_rate": 1e-05, "loss": 0.0883, "step": 20400 }, { "epoch": 0.000205, "grad_norm": 0.6183778643608093, "learning_rate": 1e-05, "loss": 0.0897, "step": 20500 }, { "epoch": 0.000206, "grad_norm": 0.5629507303237915, "learning_rate": 1e-05, "loss": 0.0879, "step": 20600 }, { "epoch": 0.000207, "grad_norm": 1.0217275619506836, "learning_rate": 1e-05, "loss": 0.0872, "step": 20700 }, { "epoch": 0.000208, "grad_norm": 0.5833017230033875, "learning_rate": 1e-05, "loss": 0.0886, "step": 20800 }, { "epoch": 0.000209, "grad_norm": 0.48639556765556335, "learning_rate": 1e-05, "loss": 0.0872, "step": 20900 }, { "epoch": 0.00021, "grad_norm": 0.4580599069595337, "learning_rate": 1e-05, "loss": 0.0872, "step": 21000 }, { "epoch": 0.000211, "grad_norm": 0.6739394664764404, "learning_rate": 1e-05, "loss": 0.087, "step": 21100 }, { "epoch": 0.000212, "grad_norm": 0.581522524356842, "learning_rate": 1e-05, "loss": 0.0874, "step": 21200 }, { "epoch": 0.000213, "grad_norm": 0.5214967131614685, "learning_rate": 1e-05, "loss": 0.0865, "step": 21300 }, { "epoch": 0.000214, "grad_norm": 0.7569633722305298, "learning_rate": 1e-05, "loss": 0.0862, "step": 21400 }, { "epoch": 0.000215, "grad_norm": 0.6048968434333801, "learning_rate": 1e-05, "loss": 0.0863, "step": 21500 }, { "epoch": 0.000216, "grad_norm": 0.5842893123626709, "learning_rate": 1e-05, "loss": 0.0847, "step": 21600 }, { "epoch": 0.000217, "grad_norm": 0.584141731262207, "learning_rate": 1e-05, "loss": 0.0854, "step": 21700 }, { "epoch": 0.000218, "grad_norm": 0.6055395007133484, "learning_rate": 1e-05, "loss": 0.0855, "step": 21800 }, { "epoch": 0.000219, "grad_norm": 0.5281064510345459, "learning_rate": 1e-05, "loss": 0.0856, "step": 21900 }, { "epoch": 0.00022, "grad_norm": 0.582008421421051, "learning_rate": 1e-05, "loss": 0.0863, "step": 22000 }, { "epoch": 0.000221, "grad_norm": 0.6220390200614929, "learning_rate": 1e-05, "loss": 0.0854, "step": 22100 }, { "epoch": 0.000222, "grad_norm": 0.5058777332305908, "learning_rate": 1e-05, "loss": 0.0839, "step": 22200 }, { "epoch": 0.000223, "grad_norm": 0.48659491539001465, "learning_rate": 1e-05, "loss": 0.0838, "step": 22300 }, { "epoch": 0.000224, "grad_norm": 0.5176419615745544, "learning_rate": 1e-05, "loss": 0.0853, "step": 22400 }, { "epoch": 0.000225, "grad_norm": 0.5722528100013733, "learning_rate": 1e-05, "loss": 0.0852, "step": 22500 }, { "epoch": 0.000226, "grad_norm": 0.428993284702301, "learning_rate": 1e-05, "loss": 0.0844, "step": 22600 }, { "epoch": 0.000227, "grad_norm": 0.4709361493587494, "learning_rate": 1e-05, "loss": 0.0843, "step": 22700 }, { "epoch": 0.000228, "grad_norm": 0.5407422780990601, "learning_rate": 1e-05, "loss": 0.0838, "step": 22800 }, { "epoch": 0.000229, "grad_norm": 0.5861979126930237, "learning_rate": 1e-05, "loss": 0.0836, "step": 22900 }, { "epoch": 0.00023, "grad_norm": 0.5901806354522705, "learning_rate": 1e-05, "loss": 0.0829, "step": 23000 }, { "epoch": 0.000231, "grad_norm": 0.42989861965179443, "learning_rate": 1e-05, "loss": 0.0831, "step": 23100 }, { "epoch": 0.000232, "grad_norm": 0.43294867873191833, "learning_rate": 1e-05, "loss": 0.0836, "step": 23200 }, { "epoch": 0.000233, "grad_norm": 0.4844062924385071, "learning_rate": 1e-05, "loss": 0.082, "step": 23300 }, { "epoch": 0.000234, "grad_norm": 0.5185121297836304, "learning_rate": 1e-05, "loss": 0.0825, "step": 23400 }, { "epoch": 0.000235, "grad_norm": 0.5434885621070862, "learning_rate": 1e-05, "loss": 0.0819, "step": 23500 }, { "epoch": 0.000236, "grad_norm": 0.5804125070571899, "learning_rate": 1e-05, "loss": 0.0827, "step": 23600 }, { "epoch": 0.000237, "grad_norm": 0.38040152192115784, "learning_rate": 1e-05, "loss": 0.0813, "step": 23700 }, { "epoch": 0.000238, "grad_norm": 0.4954085350036621, "learning_rate": 1e-05, "loss": 0.0829, "step": 23800 }, { "epoch": 0.000239, "grad_norm": 0.4209844172000885, "learning_rate": 1e-05, "loss": 0.082, "step": 23900 }, { "epoch": 0.00024, "grad_norm": 0.6441807150840759, "learning_rate": 1e-05, "loss": 0.0825, "step": 24000 }, { "epoch": 0.000241, "grad_norm": 0.569752037525177, "learning_rate": 1e-05, "loss": 0.0823, "step": 24100 }, { "epoch": 0.000242, "grad_norm": 0.6879532933235168, "learning_rate": 1e-05, "loss": 0.0814, "step": 24200 }, { "epoch": 0.000243, "grad_norm": 0.4654333293437958, "learning_rate": 1e-05, "loss": 0.0807, "step": 24300 }, { "epoch": 0.000244, "grad_norm": 0.44748032093048096, "learning_rate": 1e-05, "loss": 0.0817, "step": 24400 }, { "epoch": 0.000245, "grad_norm": 0.4972628057003021, "learning_rate": 1e-05, "loss": 0.0822, "step": 24500 }, { "epoch": 0.000246, "grad_norm": 0.5613740682601929, "learning_rate": 1e-05, "loss": 0.0813, "step": 24600 }, { "epoch": 0.000247, "grad_norm": 0.5209745168685913, "learning_rate": 1e-05, "loss": 0.0811, "step": 24700 }, { "epoch": 0.000248, "grad_norm": 0.4822278916835785, "learning_rate": 1e-05, "loss": 0.0793, "step": 24800 }, { "epoch": 0.000249, "grad_norm": 0.4792374074459076, "learning_rate": 1e-05, "loss": 0.0812, "step": 24900 }, { "epoch": 0.00025, "grad_norm": 0.5684154033660889, "learning_rate": 1e-05, "loss": 0.0795, "step": 25000 }, { "epoch": 0.000251, "grad_norm": 0.532620906829834, "learning_rate": 1e-05, "loss": 0.0802, "step": 25100 }, { "epoch": 0.000252, "grad_norm": 0.520088791847229, "learning_rate": 1e-05, "loss": 0.0797, "step": 25200 }, { "epoch": 0.000253, "grad_norm": 0.501161515712738, "learning_rate": 1e-05, "loss": 0.0799, "step": 25300 }, { "epoch": 0.000254, "grad_norm": 0.6673761010169983, "learning_rate": 1e-05, "loss": 0.0786, "step": 25400 }, { "epoch": 0.000255, "grad_norm": 0.5167078971862793, "learning_rate": 1e-05, "loss": 0.0791, "step": 25500 }, { "epoch": 0.000256, "grad_norm": 0.4736845791339874, "learning_rate": 1e-05, "loss": 0.0792, "step": 25600 }, { "epoch": 0.000257, "grad_norm": 0.5831692218780518, "learning_rate": 1e-05, "loss": 0.079, "step": 25700 }, { "epoch": 0.000258, "grad_norm": 0.6056302189826965, "learning_rate": 1e-05, "loss": 0.0794, "step": 25800 }, { "epoch": 0.000259, "grad_norm": 0.5732226967811584, "learning_rate": 1e-05, "loss": 0.0784, "step": 25900 }, { "epoch": 0.00026, "grad_norm": 1.0038753747940063, "learning_rate": 1e-05, "loss": 0.0782, "step": 26000 }, { "epoch": 0.000261, "grad_norm": 0.5736016035079956, "learning_rate": 1e-05, "loss": 0.0785, "step": 26100 }, { "epoch": 0.000262, "grad_norm": 0.4046754539012909, "learning_rate": 1e-05, "loss": 0.0783, "step": 26200 }, { "epoch": 0.000263, "grad_norm": 0.4073411524295807, "learning_rate": 1e-05, "loss": 0.0782, "step": 26300 }, { "epoch": 0.000264, "grad_norm": 0.5220291018486023, "learning_rate": 1e-05, "loss": 0.0777, "step": 26400 }, { "epoch": 0.000265, "grad_norm": 0.3735087215900421, "learning_rate": 1e-05, "loss": 0.0771, "step": 26500 }, { "epoch": 0.000266, "grad_norm": 0.5236836075782776, "learning_rate": 1e-05, "loss": 0.0783, "step": 26600 }, { "epoch": 0.000267, "grad_norm": 0.5982091426849365, "learning_rate": 1e-05, "loss": 0.0778, "step": 26700 }, { "epoch": 0.000268, "grad_norm": 0.5543742179870605, "learning_rate": 1e-05, "loss": 0.0772, "step": 26800 }, { "epoch": 0.000269, "grad_norm": 0.5550549030303955, "learning_rate": 1e-05, "loss": 0.0771, "step": 26900 }, { "epoch": 0.00027, "grad_norm": 0.4392569959163666, "learning_rate": 1e-05, "loss": 0.0763, "step": 27000 }, { "epoch": 0.000271, "grad_norm": 0.5545178055763245, "learning_rate": 1e-05, "loss": 0.0766, "step": 27100 }, { "epoch": 0.000272, "grad_norm": 0.6181541085243225, "learning_rate": 1e-05, "loss": 0.0767, "step": 27200 }, { "epoch": 0.000273, "grad_norm": 0.5784180760383606, "learning_rate": 1e-05, "loss": 0.0762, "step": 27300 }, { "epoch": 0.000274, "grad_norm": 0.5900094509124756, "learning_rate": 1e-05, "loss": 0.077, "step": 27400 }, { "epoch": 0.000275, "grad_norm": 0.7064343094825745, "learning_rate": 1e-05, "loss": 0.0774, "step": 27500 }, { "epoch": 0.000276, "grad_norm": 0.7509303092956543, "learning_rate": 1e-05, "loss": 0.0768, "step": 27600 }, { "epoch": 0.000277, "grad_norm": 0.6275886297225952, "learning_rate": 1e-05, "loss": 0.076, "step": 27700 }, { "epoch": 0.000278, "grad_norm": 0.4593959450721741, "learning_rate": 1e-05, "loss": 0.0764, "step": 27800 }, { "epoch": 0.000279, "grad_norm": 0.46685513854026794, "learning_rate": 1e-05, "loss": 0.0764, "step": 27900 }, { "epoch": 0.00028, "grad_norm": 0.5562271475791931, "learning_rate": 1e-05, "loss": 0.0761, "step": 28000 }, { "epoch": 0.000281, "grad_norm": 0.40557411313056946, "learning_rate": 1e-05, "loss": 0.0748, "step": 28100 }, { "epoch": 0.000282, "grad_norm": 0.35687893629074097, "learning_rate": 1e-05, "loss": 0.0754, "step": 28200 }, { "epoch": 0.000283, "grad_norm": 0.41895216703414917, "learning_rate": 1e-05, "loss": 0.0751, "step": 28300 }, { "epoch": 0.000284, "grad_norm": 0.4409799575805664, "learning_rate": 1e-05, "loss": 0.0763, "step": 28400 }, { "epoch": 0.000285, "grad_norm": 0.40275201201438904, "learning_rate": 1e-05, "loss": 0.0756, "step": 28500 }, { "epoch": 0.000286, "grad_norm": 0.4333484470844269, "learning_rate": 1e-05, "loss": 0.0749, "step": 28600 }, { "epoch": 0.000287, "grad_norm": 0.4015273451805115, "learning_rate": 1e-05, "loss": 0.0747, "step": 28700 }, { "epoch": 0.000288, "grad_norm": 0.5252915024757385, "learning_rate": 1e-05, "loss": 0.0753, "step": 28800 }, { "epoch": 0.000289, "grad_norm": 0.5835415720939636, "learning_rate": 1e-05, "loss": 0.0756, "step": 28900 }, { "epoch": 0.00029, "grad_norm": 0.5033889412879944, "learning_rate": 1e-05, "loss": 0.0745, "step": 29000 }, { "epoch": 0.000291, "grad_norm": 0.4723038077354431, "learning_rate": 1e-05, "loss": 0.0742, "step": 29100 }, { "epoch": 0.000292, "grad_norm": 0.43267202377319336, "learning_rate": 1e-05, "loss": 0.075, "step": 29200 }, { "epoch": 0.000293, "grad_norm": 0.47075212001800537, "learning_rate": 1e-05, "loss": 0.075, "step": 29300 }, { "epoch": 0.000294, "grad_norm": 0.4467204213142395, "learning_rate": 1e-05, "loss": 0.0732, "step": 29400 }, { "epoch": 0.000295, "grad_norm": 0.4327106773853302, "learning_rate": 1e-05, "loss": 0.0739, "step": 29500 }, { "epoch": 0.000296, "grad_norm": 0.4779268205165863, "learning_rate": 1e-05, "loss": 0.0736, "step": 29600 }, { "epoch": 0.000297, "grad_norm": 0.6247934699058533, "learning_rate": 1e-05, "loss": 0.072, "step": 29700 }, { "epoch": 0.000298, "grad_norm": 0.7319450974464417, "learning_rate": 1e-05, "loss": 0.074, "step": 29800 }, { "epoch": 0.000299, "grad_norm": 0.5517095327377319, "learning_rate": 1e-05, "loss": 0.0731, "step": 29900 }, { "epoch": 0.0003, "grad_norm": 0.666907012462616, "learning_rate": 1e-05, "loss": 0.0729, "step": 30000 }, { "epoch": 0.000301, "grad_norm": 0.45552945137023926, "learning_rate": 1e-05, "loss": 0.0738, "step": 30100 }, { "epoch": 0.000302, "grad_norm": 0.43140140175819397, "learning_rate": 1e-05, "loss": 0.0725, "step": 30200 }, { "epoch": 0.000303, "grad_norm": 0.44867539405822754, "learning_rate": 1e-05, "loss": 0.0738, "step": 30300 }, { "epoch": 0.000304, "grad_norm": 0.4584346413612366, "learning_rate": 1e-05, "loss": 0.0733, "step": 30400 }, { "epoch": 0.000305, "grad_norm": 0.4556216597557068, "learning_rate": 1e-05, "loss": 0.0726, "step": 30500 }, { "epoch": 0.000306, "grad_norm": 0.5096617937088013, "learning_rate": 1e-05, "loss": 0.0726, "step": 30600 }, { "epoch": 0.000307, "grad_norm": 0.42534521222114563, "learning_rate": 1e-05, "loss": 0.0717, "step": 30700 }, { "epoch": 0.000308, "grad_norm": 0.5214276909828186, "learning_rate": 1e-05, "loss": 0.0727, "step": 30800 }, { "epoch": 0.000309, "grad_norm": 0.5053911805152893, "learning_rate": 1e-05, "loss": 0.0716, "step": 30900 }, { "epoch": 0.00031, "grad_norm": 0.45592135190963745, "learning_rate": 1e-05, "loss": 0.0728, "step": 31000 }, { "epoch": 0.000311, "grad_norm": 0.38146597146987915, "learning_rate": 1e-05, "loss": 0.0724, "step": 31100 }, { "epoch": 0.000312, "grad_norm": 0.4527932107448578, "learning_rate": 1e-05, "loss": 0.0719, "step": 31200 }, { "epoch": 0.000313, "grad_norm": 0.42920634150505066, "learning_rate": 1e-05, "loss": 0.0715, "step": 31300 }, { "epoch": 0.000314, "grad_norm": 0.47643887996673584, "learning_rate": 1e-05, "loss": 0.0714, "step": 31400 }, { "epoch": 0.000315, "grad_norm": 0.3669392466545105, "learning_rate": 1e-05, "loss": 0.0708, "step": 31500 }, { "epoch": 0.000316, "grad_norm": 0.41031989455223083, "learning_rate": 1e-05, "loss": 0.0715, "step": 31600 }, { "epoch": 0.000317, "grad_norm": 0.45500802993774414, "learning_rate": 1e-05, "loss": 0.0716, "step": 31700 }, { "epoch": 0.000318, "grad_norm": 0.4827985167503357, "learning_rate": 1e-05, "loss": 0.0715, "step": 31800 }, { "epoch": 0.000319, "grad_norm": 0.9082238674163818, "learning_rate": 1e-05, "loss": 0.0725, "step": 31900 }, { "epoch": 0.00032, "grad_norm": 0.4585539400577545, "learning_rate": 1e-05, "loss": 0.0713, "step": 32000 }, { "epoch": 0.000321, "grad_norm": 0.40137600898742676, "learning_rate": 1e-05, "loss": 0.0704, "step": 32100 }, { "epoch": 0.000322, "grad_norm": 0.588096559047699, "learning_rate": 1e-05, "loss": 0.0717, "step": 32200 }, { "epoch": 0.000323, "grad_norm": 0.43688681721687317, "learning_rate": 1e-05, "loss": 0.0716, "step": 32300 }, { "epoch": 0.000324, "grad_norm": 0.6022541522979736, "learning_rate": 1e-05, "loss": 0.0707, "step": 32400 }, { "epoch": 0.000325, "grad_norm": 0.5712875723838806, "learning_rate": 1e-05, "loss": 0.0699, "step": 32500 }, { "epoch": 0.000326, "grad_norm": 0.47729793190956116, "learning_rate": 1e-05, "loss": 0.0711, "step": 32600 }, { "epoch": 0.000327, "grad_norm": 0.45117008686065674, "learning_rate": 1e-05, "loss": 0.0699, "step": 32700 }, { "epoch": 0.000328, "grad_norm": 0.41114693880081177, "learning_rate": 1e-05, "loss": 0.0703, "step": 32800 }, { "epoch": 0.000329, "grad_norm": 0.5459929704666138, "learning_rate": 1e-05, "loss": 0.0705, "step": 32900 }, { "epoch": 0.00033, "grad_norm": 0.4436359405517578, "learning_rate": 1e-05, "loss": 0.0704, "step": 33000 }, { "epoch": 0.000331, "grad_norm": 0.39174625277519226, "learning_rate": 1e-05, "loss": 0.07, "step": 33100 }, { "epoch": 0.000332, "grad_norm": 0.5490001440048218, "learning_rate": 1e-05, "loss": 0.0694, "step": 33200 }, { "epoch": 0.000333, "grad_norm": 0.40828678011894226, "learning_rate": 1e-05, "loss": 0.0703, "step": 33300 }, { "epoch": 0.000334, "grad_norm": 0.49713191390037537, "learning_rate": 1e-05, "loss": 0.0697, "step": 33400 }, { "epoch": 0.000335, "grad_norm": 0.3616642355918884, "learning_rate": 1e-05, "loss": 0.0689, "step": 33500 }, { "epoch": 0.000336, "grad_norm": 0.39729055762290955, "learning_rate": 1e-05, "loss": 0.0699, "step": 33600 }, { "epoch": 0.000337, "grad_norm": 0.45994576811790466, "learning_rate": 1e-05, "loss": 0.0701, "step": 33700 }, { "epoch": 0.000338, "grad_norm": 0.39138391613960266, "learning_rate": 1e-05, "loss": 0.0702, "step": 33800 }, { "epoch": 0.000339, "grad_norm": 0.39784473180770874, "learning_rate": 1e-05, "loss": 0.0697, "step": 33900 }, { "epoch": 0.00034, "grad_norm": 0.38873356580734253, "learning_rate": 1e-05, "loss": 0.0697, "step": 34000 }, { "epoch": 0.000341, "grad_norm": 0.37431013584136963, "learning_rate": 1e-05, "loss": 0.0682, "step": 34100 }, { "epoch": 0.000342, "grad_norm": 0.4191737174987793, "learning_rate": 1e-05, "loss": 0.0697, "step": 34200 }, { "epoch": 0.000343, "grad_norm": 0.4395553171634674, "learning_rate": 1e-05, "loss": 0.0689, "step": 34300 }, { "epoch": 0.000344, "grad_norm": 0.44228512048721313, "learning_rate": 1e-05, "loss": 0.0693, "step": 34400 }, { "epoch": 0.000345, "grad_norm": 1.0667295455932617, "learning_rate": 1e-05, "loss": 0.0692, "step": 34500 }, { "epoch": 0.000346, "grad_norm": 0.42393195629119873, "learning_rate": 1e-05, "loss": 0.0688, "step": 34600 }, { "epoch": 0.000347, "grad_norm": 0.36522743105888367, "learning_rate": 1e-05, "loss": 0.0676, "step": 34700 }, { "epoch": 0.000348, "grad_norm": 0.42714449763298035, "learning_rate": 1e-05, "loss": 0.069, "step": 34800 }, { "epoch": 0.000349, "grad_norm": 0.5284758806228638, "learning_rate": 1e-05, "loss": 0.0694, "step": 34900 }, { "epoch": 0.00035, "grad_norm": 0.44011613726615906, "learning_rate": 1e-05, "loss": 0.0683, "step": 35000 }, { "epoch": 0.000351, "grad_norm": 0.5626910924911499, "learning_rate": 1e-05, "loss": 0.0686, "step": 35100 }, { "epoch": 0.000352, "grad_norm": 0.4532577395439148, "learning_rate": 1e-05, "loss": 0.0682, "step": 35200 }, { "epoch": 0.000353, "grad_norm": 0.37654298543930054, "learning_rate": 1e-05, "loss": 0.0678, "step": 35300 }, { "epoch": 0.000354, "grad_norm": 0.6779673099517822, "learning_rate": 1e-05, "loss": 0.0666, "step": 35400 }, { "epoch": 0.000355, "grad_norm": 0.3796120584011078, "learning_rate": 1e-05, "loss": 0.0669, "step": 35500 }, { "epoch": 0.000356, "grad_norm": 0.4905095398426056, "learning_rate": 1e-05, "loss": 0.0682, "step": 35600 }, { "epoch": 0.000357, "grad_norm": 0.39613139629364014, "learning_rate": 1e-05, "loss": 0.0672, "step": 35700 }, { "epoch": 0.000358, "grad_norm": 0.438360333442688, "learning_rate": 1e-05, "loss": 0.0676, "step": 35800 }, { "epoch": 0.000359, "grad_norm": 0.5236893892288208, "learning_rate": 1e-05, "loss": 0.0672, "step": 35900 }, { "epoch": 0.00036, "grad_norm": 0.5479728579521179, "learning_rate": 1e-05, "loss": 0.0678, "step": 36000 }, { "epoch": 0.000361, "grad_norm": 0.37572795152664185, "learning_rate": 1e-05, "loss": 0.0681, "step": 36100 }, { "epoch": 0.000362, "grad_norm": 0.5196158289909363, "learning_rate": 1e-05, "loss": 0.0674, "step": 36200 }, { "epoch": 0.000363, "grad_norm": 0.44885027408599854, "learning_rate": 1e-05, "loss": 0.0672, "step": 36300 }, { "epoch": 0.000364, "grad_norm": 0.42065951228141785, "learning_rate": 1e-05, "loss": 0.0684, "step": 36400 }, { "epoch": 0.000365, "grad_norm": 0.5113188624382019, "learning_rate": 1e-05, "loss": 0.0672, "step": 36500 }, { "epoch": 0.000366, "grad_norm": 0.35379719734191895, "learning_rate": 1e-05, "loss": 0.0671, "step": 36600 }, { "epoch": 0.000367, "grad_norm": 0.4362645745277405, "learning_rate": 1e-05, "loss": 0.0664, "step": 36700 }, { "epoch": 0.000368, "grad_norm": 0.4421688914299011, "learning_rate": 1e-05, "loss": 0.0685, "step": 36800 }, { "epoch": 0.000369, "grad_norm": 0.49405044317245483, "learning_rate": 1e-05, "loss": 0.0671, "step": 36900 }, { "epoch": 0.00037, "grad_norm": 0.42832764983177185, "learning_rate": 1e-05, "loss": 0.0661, "step": 37000 }, { "epoch": 0.000371, "grad_norm": 0.4356270134449005, "learning_rate": 1e-05, "loss": 0.0668, "step": 37100 }, { "epoch": 0.000372, "grad_norm": 0.40980684757232666, "learning_rate": 1e-05, "loss": 0.0664, "step": 37200 }, { "epoch": 0.000373, "grad_norm": 0.42890721559524536, "learning_rate": 1e-05, "loss": 0.0656, "step": 37300 }, { "epoch": 0.000374, "grad_norm": 0.45674893260002136, "learning_rate": 1e-05, "loss": 0.0664, "step": 37400 }, { "epoch": 0.000375, "grad_norm": 0.571071982383728, "learning_rate": 1e-05, "loss": 0.0671, "step": 37500 }, { "epoch": 0.000376, "grad_norm": 0.3623720407485962, "learning_rate": 1e-05, "loss": 0.0661, "step": 37600 }, { "epoch": 0.000377, "grad_norm": 0.34308719635009766, "learning_rate": 1e-05, "loss": 0.067, "step": 37700 }, { "epoch": 0.000378, "grad_norm": 0.4325920641422272, "learning_rate": 1e-05, "loss": 0.0669, "step": 37800 }, { "epoch": 0.000379, "grad_norm": 0.4445410370826721, "learning_rate": 1e-05, "loss": 0.0657, "step": 37900 }, { "epoch": 0.00038, "grad_norm": 0.5341411232948303, "learning_rate": 1e-05, "loss": 0.0652, "step": 38000 }, { "epoch": 0.000381, "grad_norm": 0.5067209005355835, "learning_rate": 1e-05, "loss": 0.0653, "step": 38100 }, { "epoch": 0.000382, "grad_norm": 0.4209887385368347, "learning_rate": 1e-05, "loss": 0.0645, "step": 38200 }, { "epoch": 0.000383, "grad_norm": 0.4327578842639923, "learning_rate": 1e-05, "loss": 0.0646, "step": 38300 }, { "epoch": 0.000384, "grad_norm": 0.4193218946456909, "learning_rate": 1e-05, "loss": 0.067, "step": 38400 }, { "epoch": 0.000385, "grad_norm": 0.4596867561340332, "learning_rate": 1e-05, "loss": 0.0654, "step": 38500 }, { "epoch": 0.000386, "grad_norm": 0.3950844705104828, "learning_rate": 1e-05, "loss": 0.0646, "step": 38600 }, { "epoch": 0.000387, "grad_norm": 0.4508519768714905, "learning_rate": 1e-05, "loss": 0.0664, "step": 38700 }, { "epoch": 0.000388, "grad_norm": 0.5605178475379944, "learning_rate": 1e-05, "loss": 0.0651, "step": 38800 }, { "epoch": 0.000389, "grad_norm": 0.539380669593811, "learning_rate": 1e-05, "loss": 0.0649, "step": 38900 }, { "epoch": 0.00039, "grad_norm": 0.48480191826820374, "learning_rate": 1e-05, "loss": 0.0661, "step": 39000 }, { "epoch": 0.000391, "grad_norm": 0.366256982088089, "learning_rate": 1e-05, "loss": 0.0647, "step": 39100 }, { "epoch": 0.000392, "grad_norm": 0.434061199426651, "learning_rate": 1e-05, "loss": 0.0644, "step": 39200 }, { "epoch": 0.000393, "grad_norm": 0.3698447048664093, "learning_rate": 1e-05, "loss": 0.0645, "step": 39300 }, { "epoch": 0.000394, "grad_norm": 0.4280993938446045, "learning_rate": 1e-05, "loss": 0.0642, "step": 39400 }, { "epoch": 0.000395, "grad_norm": 0.39769768714904785, "learning_rate": 1e-05, "loss": 0.0638, "step": 39500 }, { "epoch": 0.000396, "grad_norm": 0.42667558789253235, "learning_rate": 1e-05, "loss": 0.0634, "step": 39600 }, { "epoch": 0.000397, "grad_norm": 0.4679233431816101, "learning_rate": 1e-05, "loss": 0.0637, "step": 39700 }, { "epoch": 0.000398, "grad_norm": 0.33855000138282776, "learning_rate": 1e-05, "loss": 0.0647, "step": 39800 }, { "epoch": 0.000399, "grad_norm": 0.3238857388496399, "learning_rate": 1e-05, "loss": 0.0651, "step": 39900 }, { "epoch": 0.0004, "grad_norm": 0.40340402722358704, "learning_rate": 1e-05, "loss": 0.0643, "step": 40000 }, { "epoch": 0.0004, "eval_loss": 0.05428723990917206, "eval_runtime": 218.1426, "eval_samples_per_second": 229.208, "eval_steps_per_second": 14.325, "step": 40000 }, { "epoch": 0.000401, "grad_norm": 0.34717127680778503, "learning_rate": 1e-05, "loss": 0.0647, "step": 40100 }, { "epoch": 0.000402, "grad_norm": 0.3985711336135864, "learning_rate": 1e-05, "loss": 0.0645, "step": 40200 }, { "epoch": 0.000403, "grad_norm": 0.5190109014511108, "learning_rate": 1e-05, "loss": 0.0643, "step": 40300 }, { "epoch": 0.000404, "grad_norm": 0.42202332615852356, "learning_rate": 1e-05, "loss": 0.0647, "step": 40400 }, { "epoch": 0.000405, "grad_norm": 0.448262095451355, "learning_rate": 1e-05, "loss": 0.0631, "step": 40500 }, { "epoch": 0.000406, "grad_norm": 0.3727732002735138, "learning_rate": 1e-05, "loss": 0.0648, "step": 40600 }, { "epoch": 0.000407, "grad_norm": 0.35818856954574585, "learning_rate": 1e-05, "loss": 0.0645, "step": 40700 }, { "epoch": 0.000408, "grad_norm": 0.6731548309326172, "learning_rate": 1e-05, "loss": 0.0631, "step": 40800 }, { "epoch": 0.000409, "grad_norm": 0.45917606353759766, "learning_rate": 1e-05, "loss": 0.0646, "step": 40900 }, { "epoch": 0.00041, "grad_norm": 0.40148112177848816, "learning_rate": 1e-05, "loss": 0.0634, "step": 41000 }, { "epoch": 0.000411, "grad_norm": 0.4074353873729706, "learning_rate": 1e-05, "loss": 0.0636, "step": 41100 }, { "epoch": 0.000412, "grad_norm": 0.5032891631126404, "learning_rate": 1e-05, "loss": 0.0633, "step": 41200 }, { "epoch": 0.000413, "grad_norm": 0.5298800468444824, "learning_rate": 1e-05, "loss": 0.0634, "step": 41300 }, { "epoch": 0.000414, "grad_norm": 0.4057115614414215, "learning_rate": 1e-05, "loss": 0.0636, "step": 41400 }, { "epoch": 0.000415, "grad_norm": 0.397958368062973, "learning_rate": 1e-05, "loss": 0.0635, "step": 41500 }, { "epoch": 0.000416, "grad_norm": 0.41334208846092224, "learning_rate": 1e-05, "loss": 0.0633, "step": 41600 }, { "epoch": 0.000417, "grad_norm": 0.44163739681243896, "learning_rate": 1e-05, "loss": 0.0626, "step": 41700 }, { "epoch": 0.000418, "grad_norm": 0.39334794878959656, "learning_rate": 1e-05, "loss": 0.063, "step": 41800 }, { "epoch": 0.000419, "grad_norm": 0.5406715273857117, "learning_rate": 1e-05, "loss": 0.0636, "step": 41900 }, { "epoch": 0.00042, "grad_norm": 0.36068785190582275, "learning_rate": 1e-05, "loss": 0.0631, "step": 42000 }, { "epoch": 0.000421, "grad_norm": 0.3875243663787842, "learning_rate": 1e-05, "loss": 0.0639, "step": 42100 }, { "epoch": 0.000422, "grad_norm": 0.30413714051246643, "learning_rate": 1e-05, "loss": 0.0624, "step": 42200 }, { "epoch": 0.000423, "grad_norm": 0.31724658608436584, "learning_rate": 1e-05, "loss": 0.0629, "step": 42300 }, { "epoch": 0.000424, "grad_norm": 0.5136147141456604, "learning_rate": 1e-05, "loss": 0.0624, "step": 42400 }, { "epoch": 0.000425, "grad_norm": 0.35487043857574463, "learning_rate": 1e-05, "loss": 0.0626, "step": 42500 }, { "epoch": 0.000426, "grad_norm": 0.4076051414012909, "learning_rate": 1e-05, "loss": 0.0623, "step": 42600 }, { "epoch": 0.000427, "grad_norm": 0.470480352640152, "learning_rate": 1e-05, "loss": 0.0631, "step": 42700 }, { "epoch": 0.000428, "grad_norm": 0.42898502945899963, "learning_rate": 1e-05, "loss": 0.0626, "step": 42800 }, { "epoch": 0.000429, "grad_norm": 0.38925185799598694, "learning_rate": 1e-05, "loss": 0.0631, "step": 42900 }, { "epoch": 0.00043, "grad_norm": 0.40000417828559875, "learning_rate": 1e-05, "loss": 0.062, "step": 43000 }, { "epoch": 0.000431, "grad_norm": 0.39736539125442505, "learning_rate": 1e-05, "loss": 0.062, "step": 43100 }, { "epoch": 0.000432, "grad_norm": 0.41373589634895325, "learning_rate": 1e-05, "loss": 0.062, "step": 43200 }, { "epoch": 0.000433, "grad_norm": 0.4297386407852173, "learning_rate": 1e-05, "loss": 0.061, "step": 43300 }, { "epoch": 0.000434, "grad_norm": 0.9109225869178772, "learning_rate": 1e-05, "loss": 0.0617, "step": 43400 }, { "epoch": 0.000435, "grad_norm": 0.3536689281463623, "learning_rate": 1e-05, "loss": 0.0618, "step": 43500 }, { "epoch": 0.000436, "grad_norm": 0.39054861664772034, "learning_rate": 1e-05, "loss": 0.0614, "step": 43600 }, { "epoch": 0.000437, "grad_norm": 0.49647456407546997, "learning_rate": 1e-05, "loss": 0.0625, "step": 43700 }, { "epoch": 0.000438, "grad_norm": 0.4875349998474121, "learning_rate": 1e-05, "loss": 0.0619, "step": 43800 }, { "epoch": 0.000439, "grad_norm": 0.43423017859458923, "learning_rate": 1e-05, "loss": 0.0617, "step": 43900 }, { "epoch": 0.00044, "grad_norm": 0.3378565013408661, "learning_rate": 1e-05, "loss": 0.0613, "step": 44000 }, { "epoch": 0.000441, "grad_norm": 0.3839372396469116, "learning_rate": 1e-05, "loss": 0.0606, "step": 44100 }, { "epoch": 0.000442, "grad_norm": 0.39564248919487, "learning_rate": 1e-05, "loss": 0.0613, "step": 44200 }, { "epoch": 0.000443, "grad_norm": 0.4240016043186188, "learning_rate": 1e-05, "loss": 0.0611, "step": 44300 }, { "epoch": 0.000444, "grad_norm": 0.41239339113235474, "learning_rate": 1e-05, "loss": 0.061, "step": 44400 }, { "epoch": 0.000445, "grad_norm": 0.4169689416885376, "learning_rate": 1e-05, "loss": 0.0615, "step": 44500 }, { "epoch": 0.000446, "grad_norm": 0.38648462295532227, "learning_rate": 1e-05, "loss": 0.0608, "step": 44600 }, { "epoch": 0.000447, "grad_norm": 0.5468778014183044, "learning_rate": 1e-05, "loss": 0.0602, "step": 44700 }, { "epoch": 0.000448, "grad_norm": 0.40551289916038513, "learning_rate": 1e-05, "loss": 0.061, "step": 44800 }, { "epoch": 0.000449, "grad_norm": 0.41861075162887573, "learning_rate": 1e-05, "loss": 0.0596, "step": 44900 }, { "epoch": 0.00045, "grad_norm": 0.45969244837760925, "learning_rate": 1e-05, "loss": 0.0627, "step": 45000 }, { "epoch": 0.000451, "grad_norm": 0.39819177985191345, "learning_rate": 1e-05, "loss": 0.0608, "step": 45100 }, { "epoch": 0.000452, "grad_norm": 0.36020275950431824, "learning_rate": 1e-05, "loss": 0.0603, "step": 45200 }, { "epoch": 0.000453, "grad_norm": 0.31229498982429504, "learning_rate": 1e-05, "loss": 0.0598, "step": 45300 }, { "epoch": 0.000454, "grad_norm": 0.339152455329895, "learning_rate": 1e-05, "loss": 0.0606, "step": 45400 }, { "epoch": 0.000455, "grad_norm": 0.36252322793006897, "learning_rate": 1e-05, "loss": 0.0613, "step": 45500 }, { "epoch": 0.000456, "grad_norm": 0.3090766370296478, "learning_rate": 1e-05, "loss": 0.0598, "step": 45600 }, { "epoch": 0.000457, "grad_norm": 0.41197821497917175, "learning_rate": 1e-05, "loss": 0.0603, "step": 45700 }, { "epoch": 0.000458, "grad_norm": 0.43346548080444336, "learning_rate": 1e-05, "loss": 0.0609, "step": 45800 }, { "epoch": 0.000459, "grad_norm": 0.41119447350502014, "learning_rate": 1e-05, "loss": 0.0603, "step": 45900 }, { "epoch": 0.00046, "grad_norm": 0.46673718094825745, "learning_rate": 1e-05, "loss": 0.0595, "step": 46000 }, { "epoch": 0.000461, "grad_norm": 0.4427727460861206, "learning_rate": 1e-05, "loss": 0.0593, "step": 46100 }, { "epoch": 0.000462, "grad_norm": 0.36742228269577026, "learning_rate": 1e-05, "loss": 0.0596, "step": 46200 }, { "epoch": 0.000463, "grad_norm": 0.4321599304676056, "learning_rate": 1e-05, "loss": 0.0597, "step": 46300 }, { "epoch": 0.000464, "grad_norm": 0.4276648759841919, "learning_rate": 1e-05, "loss": 0.0594, "step": 46400 }, { "epoch": 0.000465, "grad_norm": 0.27002257108688354, "learning_rate": 1e-05, "loss": 0.0596, "step": 46500 }, { "epoch": 0.000466, "grad_norm": 0.5171845555305481, "learning_rate": 1e-05, "loss": 0.0601, "step": 46600 }, { "epoch": 0.000467, "grad_norm": 0.4415430724620819, "learning_rate": 1e-05, "loss": 0.0603, "step": 46700 }, { "epoch": 0.000468, "grad_norm": 0.4018206000328064, "learning_rate": 1e-05, "loss": 0.0588, "step": 46800 }, { "epoch": 0.000469, "grad_norm": 0.7742670178413391, "learning_rate": 1e-05, "loss": 0.0599, "step": 46900 }, { "epoch": 0.00047, "grad_norm": 0.46240389347076416, "learning_rate": 1e-05, "loss": 0.0599, "step": 47000 }, { "epoch": 0.000471, "grad_norm": 0.34471505880355835, "learning_rate": 1e-05, "loss": 0.0602, "step": 47100 }, { "epoch": 0.000472, "grad_norm": 0.3588685691356659, "learning_rate": 1e-05, "loss": 0.0596, "step": 47200 }, { "epoch": 0.000473, "grad_norm": 0.5853482484817505, "learning_rate": 1e-05, "loss": 0.0597, "step": 47300 }, { "epoch": 0.000474, "grad_norm": 0.6169067025184631, "learning_rate": 1e-05, "loss": 0.0594, "step": 47400 }, { "epoch": 0.000475, "grad_norm": 0.37326478958129883, "learning_rate": 1e-05, "loss": 0.0595, "step": 47500 }, { "epoch": 0.000476, "grad_norm": 0.4440366327762604, "learning_rate": 1e-05, "loss": 0.0593, "step": 47600 }, { "epoch": 0.000477, "grad_norm": 0.39765655994415283, "learning_rate": 1e-05, "loss": 0.0594, "step": 47700 }, { "epoch": 0.000478, "grad_norm": 0.2948087155818939, "learning_rate": 1e-05, "loss": 0.0594, "step": 47800 }, { "epoch": 0.000479, "grad_norm": 0.6331756711006165, "learning_rate": 1e-05, "loss": 0.0588, "step": 47900 }, { "epoch": 0.00048, "grad_norm": 0.3598988354206085, "learning_rate": 1e-05, "loss": 0.0591, "step": 48000 }, { "epoch": 0.000481, "grad_norm": 0.4574572443962097, "learning_rate": 1e-05, "loss": 0.0581, "step": 48100 }, { "epoch": 0.000482, "grad_norm": 0.3979628384113312, "learning_rate": 1e-05, "loss": 0.0584, "step": 48200 }, { "epoch": 0.000483, "grad_norm": 0.4021476209163666, "learning_rate": 1e-05, "loss": 0.0598, "step": 48300 }, { "epoch": 0.000484, "grad_norm": 0.402538001537323, "learning_rate": 1e-05, "loss": 0.0583, "step": 48400 }, { "epoch": 0.000485, "grad_norm": 0.39232808351516724, "learning_rate": 1e-05, "loss": 0.0592, "step": 48500 }, { "epoch": 0.000486, "grad_norm": 0.3039535582065582, "learning_rate": 1e-05, "loss": 0.0581, "step": 48600 }, { "epoch": 0.000487, "grad_norm": 0.3507876992225647, "learning_rate": 1e-05, "loss": 0.0583, "step": 48700 }, { "epoch": 0.000488, "grad_norm": 0.31425949931144714, "learning_rate": 1e-05, "loss": 0.0595, "step": 48800 }, { "epoch": 0.000489, "grad_norm": 0.42988407611846924, "learning_rate": 1e-05, "loss": 0.0579, "step": 48900 }, { "epoch": 0.00049, "grad_norm": 0.32415011525154114, "learning_rate": 1e-05, "loss": 0.0595, "step": 49000 }, { "epoch": 0.000491, "grad_norm": 0.38861942291259766, "learning_rate": 1e-05, "loss": 0.0586, "step": 49100 }, { "epoch": 0.000492, "grad_norm": 0.3301945924758911, "learning_rate": 1e-05, "loss": 0.0579, "step": 49200 }, { "epoch": 0.000493, "grad_norm": 0.45552974939346313, "learning_rate": 1e-05, "loss": 0.0579, "step": 49300 }, { "epoch": 0.000494, "grad_norm": 0.3537488281726837, "learning_rate": 1e-05, "loss": 0.0591, "step": 49400 }, { "epoch": 0.000495, "grad_norm": 0.4062543511390686, "learning_rate": 1e-05, "loss": 0.0591, "step": 49500 }, { "epoch": 0.000496, "grad_norm": 0.40616175532341003, "learning_rate": 1e-05, "loss": 0.0578, "step": 49600 }, { "epoch": 0.000497, "grad_norm": 0.3473791778087616, "learning_rate": 1e-05, "loss": 0.0591, "step": 49700 }, { "epoch": 0.000498, "grad_norm": 0.4690682888031006, "learning_rate": 1e-05, "loss": 0.0579, "step": 49800 }, { "epoch": 0.000499, "grad_norm": 0.34557801485061646, "learning_rate": 1e-05, "loss": 0.0581, "step": 49900 }, { "epoch": 0.0005, "grad_norm": 0.5246384739875793, "learning_rate": 1e-05, "loss": 0.0581, "step": 50000 }, { "epoch": 0.000501, "grad_norm": 0.356649249792099, "learning_rate": 1e-05, "loss": 0.0576, "step": 50100 }, { "epoch": 0.000502, "grad_norm": 0.44426289200782776, "learning_rate": 1e-05, "loss": 0.0583, "step": 50200 }, { "epoch": 0.000503, "grad_norm": 0.5504688024520874, "learning_rate": 1e-05, "loss": 0.0576, "step": 50300 }, { "epoch": 0.000504, "grad_norm": 0.43694159388542175, "learning_rate": 1e-05, "loss": 0.0576, "step": 50400 }, { "epoch": 0.000505, "grad_norm": 0.36993882060050964, "learning_rate": 1e-05, "loss": 0.0584, "step": 50500 }, { "epoch": 0.000506, "grad_norm": 0.3848142921924591, "learning_rate": 1e-05, "loss": 0.0575, "step": 50600 }, { "epoch": 0.000507, "grad_norm": 0.3087194263935089, "learning_rate": 1e-05, "loss": 0.0567, "step": 50700 }, { "epoch": 0.000508, "grad_norm": 0.39588144421577454, "learning_rate": 1e-05, "loss": 0.0581, "step": 50800 }, { "epoch": 0.000509, "grad_norm": 0.3697458505630493, "learning_rate": 1e-05, "loss": 0.0574, "step": 50900 }, { "epoch": 0.00051, "grad_norm": 0.4626401364803314, "learning_rate": 1e-05, "loss": 0.0578, "step": 51000 }, { "epoch": 0.000511, "grad_norm": 0.5205323100090027, "learning_rate": 1e-05, "loss": 0.0571, "step": 51100 }, { "epoch": 0.000512, "grad_norm": 0.3990142047405243, "learning_rate": 1e-05, "loss": 0.0566, "step": 51200 }, { "epoch": 0.000513, "grad_norm": 0.4172525703907013, "learning_rate": 1e-05, "loss": 0.0576, "step": 51300 }, { "epoch": 0.000514, "grad_norm": 0.4951062500476837, "learning_rate": 1e-05, "loss": 0.0577, "step": 51400 }, { "epoch": 0.000515, "grad_norm": 0.61896812915802, "learning_rate": 1e-05, "loss": 0.0573, "step": 51500 }, { "epoch": 0.000516, "grad_norm": 0.3298381268978119, "learning_rate": 1e-05, "loss": 0.0571, "step": 51600 }, { "epoch": 0.000517, "grad_norm": 0.39272406697273254, "learning_rate": 1e-05, "loss": 0.0572, "step": 51700 }, { "epoch": 0.000518, "grad_norm": 0.37638354301452637, "learning_rate": 1e-05, "loss": 0.0572, "step": 51800 }, { "epoch": 0.000519, "grad_norm": 0.40485790371894836, "learning_rate": 1e-05, "loss": 0.0559, "step": 51900 }, { "epoch": 0.00052, "grad_norm": 0.4324975907802582, "learning_rate": 1e-05, "loss": 0.0578, "step": 52000 }, { "epoch": 0.000521, "grad_norm": 0.38482868671417236, "learning_rate": 1e-05, "loss": 0.0572, "step": 52100 }, { "epoch": 0.000522, "grad_norm": 0.444576621055603, "learning_rate": 1e-05, "loss": 0.0571, "step": 52200 }, { "epoch": 0.000523, "grad_norm": 0.5325146317481995, "learning_rate": 1e-05, "loss": 0.0568, "step": 52300 }, { "epoch": 0.000524, "grad_norm": 0.340398371219635, "learning_rate": 1e-05, "loss": 0.0573, "step": 52400 }, { "epoch": 0.000525, "grad_norm": 0.5380735993385315, "learning_rate": 1e-05, "loss": 0.0576, "step": 52500 }, { "epoch": 0.000526, "grad_norm": 0.37018531560897827, "learning_rate": 1e-05, "loss": 0.0577, "step": 52600 }, { "epoch": 0.000527, "grad_norm": 0.3170204162597656, "learning_rate": 1e-05, "loss": 0.0569, "step": 52700 }, { "epoch": 0.000528, "grad_norm": 0.41752737760543823, "learning_rate": 1e-05, "loss": 0.0572, "step": 52800 }, { "epoch": 0.000529, "grad_norm": 0.38540002703666687, "learning_rate": 1e-05, "loss": 0.0565, "step": 52900 }, { "epoch": 0.00053, "grad_norm": 0.363292932510376, "learning_rate": 1e-05, "loss": 0.0567, "step": 53000 }, { "epoch": 0.000531, "grad_norm": 0.4280114769935608, "learning_rate": 1e-05, "loss": 0.0564, "step": 53100 }, { "epoch": 0.000532, "grad_norm": 0.34955087304115295, "learning_rate": 1e-05, "loss": 0.0573, "step": 53200 }, { "epoch": 0.000533, "grad_norm": 0.31237488985061646, "learning_rate": 1e-05, "loss": 0.0571, "step": 53300 }, { "epoch": 0.000534, "grad_norm": 0.4390700161457062, "learning_rate": 1e-05, "loss": 0.0572, "step": 53400 }, { "epoch": 0.000535, "grad_norm": 0.39538365602493286, "learning_rate": 1e-05, "loss": 0.0569, "step": 53500 }, { "epoch": 0.000536, "grad_norm": 0.3888275623321533, "learning_rate": 1e-05, "loss": 0.056, "step": 53600 }, { "epoch": 0.000537, "grad_norm": 0.33096790313720703, "learning_rate": 1e-05, "loss": 0.0554, "step": 53700 }, { "epoch": 0.000538, "grad_norm": 0.2942635715007782, "learning_rate": 1e-05, "loss": 0.0555, "step": 53800 }, { "epoch": 0.000539, "grad_norm": 0.3539855182170868, "learning_rate": 1e-05, "loss": 0.0559, "step": 53900 }, { "epoch": 0.00054, "grad_norm": 0.3039146065711975, "learning_rate": 1e-05, "loss": 0.0565, "step": 54000 }, { "epoch": 0.000541, "grad_norm": 0.3857875466346741, "learning_rate": 1e-05, "loss": 0.0567, "step": 54100 }, { "epoch": 0.000542, "grad_norm": 0.3531968295574188, "learning_rate": 1e-05, "loss": 0.0559, "step": 54200 }, { "epoch": 0.000543, "grad_norm": 0.3532152473926544, "learning_rate": 1e-05, "loss": 0.0563, "step": 54300 }, { "epoch": 0.000544, "grad_norm": 0.3237742483615875, "learning_rate": 1e-05, "loss": 0.0558, "step": 54400 }, { "epoch": 0.000545, "grad_norm": 0.48030340671539307, "learning_rate": 1e-05, "loss": 0.0555, "step": 54500 }, { "epoch": 0.000546, "grad_norm": 0.4738022983074188, "learning_rate": 1e-05, "loss": 0.0559, "step": 54600 }, { "epoch": 0.000547, "grad_norm": 0.39561378955841064, "learning_rate": 1e-05, "loss": 0.0555, "step": 54700 }, { "epoch": 0.000548, "grad_norm": 0.3880506455898285, "learning_rate": 1e-05, "loss": 0.0557, "step": 54800 }, { "epoch": 0.000549, "grad_norm": 0.35852280259132385, "learning_rate": 1e-05, "loss": 0.0548, "step": 54900 }, { "epoch": 0.00055, "grad_norm": 0.3947020173072815, "learning_rate": 1e-05, "loss": 0.0556, "step": 55000 }, { "epoch": 0.000551, "grad_norm": 0.4581582546234131, "learning_rate": 1e-05, "loss": 0.0556, "step": 55100 }, { "epoch": 0.000552, "grad_norm": 0.4208582043647766, "learning_rate": 1e-05, "loss": 0.0556, "step": 55200 }, { "epoch": 0.000553, "grad_norm": 0.42351165413856506, "learning_rate": 1e-05, "loss": 0.0561, "step": 55300 }, { "epoch": 0.000554, "grad_norm": 0.4377772808074951, "learning_rate": 1e-05, "loss": 0.0553, "step": 55400 }, { "epoch": 0.000555, "grad_norm": 0.5557478070259094, "learning_rate": 1e-05, "loss": 0.0554, "step": 55500 }, { "epoch": 0.000556, "grad_norm": 0.3564606308937073, "learning_rate": 1e-05, "loss": 0.0555, "step": 55600 }, { "epoch": 0.000557, "grad_norm": 0.28881803154945374, "learning_rate": 1e-05, "loss": 0.0554, "step": 55700 }, { "epoch": 0.000558, "grad_norm": 0.3655983507633209, "learning_rate": 1e-05, "loss": 0.0545, "step": 55800 }, { "epoch": 0.000559, "grad_norm": 0.39696529507637024, "learning_rate": 1e-05, "loss": 0.0564, "step": 55900 }, { "epoch": 0.00056, "grad_norm": 0.3657435178756714, "learning_rate": 1e-05, "loss": 0.0546, "step": 56000 }, { "epoch": 0.000561, "grad_norm": 0.39722418785095215, "learning_rate": 1e-05, "loss": 0.0553, "step": 56100 }, { "epoch": 0.000562, "grad_norm": 0.32188084721565247, "learning_rate": 1e-05, "loss": 0.0542, "step": 56200 }, { "epoch": 0.000563, "grad_norm": 0.4055284857749939, "learning_rate": 1e-05, "loss": 0.0551, "step": 56300 }, { "epoch": 0.000564, "grad_norm": 0.3120182454586029, "learning_rate": 1e-05, "loss": 0.0547, "step": 56400 }, { "epoch": 0.000565, "grad_norm": 0.3582640588283539, "learning_rate": 1e-05, "loss": 0.0555, "step": 56500 }, { "epoch": 0.000566, "grad_norm": 0.3979005217552185, "learning_rate": 1e-05, "loss": 0.0551, "step": 56600 }, { "epoch": 0.000567, "grad_norm": 0.43867969512939453, "learning_rate": 1e-05, "loss": 0.0548, "step": 56700 }, { "epoch": 0.000568, "grad_norm": 0.3772152364253998, "learning_rate": 1e-05, "loss": 0.0545, "step": 56800 }, { "epoch": 0.000569, "grad_norm": 0.4547737240791321, "learning_rate": 1e-05, "loss": 0.0549, "step": 56900 }, { "epoch": 0.00057, "grad_norm": 0.39923152327537537, "learning_rate": 1e-05, "loss": 0.0548, "step": 57000 }, { "epoch": 0.000571, "grad_norm": 0.4818025529384613, "learning_rate": 1e-05, "loss": 0.0547, "step": 57100 }, { "epoch": 0.000572, "grad_norm": 0.3292573392391205, "learning_rate": 1e-05, "loss": 0.0545, "step": 57200 }, { "epoch": 0.000573, "grad_norm": 0.3038542866706848, "learning_rate": 1e-05, "loss": 0.0545, "step": 57300 }, { "epoch": 0.000574, "grad_norm": 0.32784566283226013, "learning_rate": 1e-05, "loss": 0.0554, "step": 57400 }, { "epoch": 0.000575, "grad_norm": 0.33108627796173096, "learning_rate": 1e-05, "loss": 0.0548, "step": 57500 }, { "epoch": 0.000576, "grad_norm": 0.370631605386734, "learning_rate": 1e-05, "loss": 0.0549, "step": 57600 }, { "epoch": 0.000577, "grad_norm": 0.3862476050853729, "learning_rate": 1e-05, "loss": 0.0542, "step": 57700 }, { "epoch": 0.000578, "grad_norm": 0.3659495711326599, "learning_rate": 1e-05, "loss": 0.0544, "step": 57800 }, { "epoch": 0.000579, "grad_norm": 0.4000944197177887, "learning_rate": 1e-05, "loss": 0.0546, "step": 57900 }, { "epoch": 0.00058, "grad_norm": 0.43611007928848267, "learning_rate": 1e-05, "loss": 0.055, "step": 58000 }, { "epoch": 0.000581, "grad_norm": 0.3359104096889496, "learning_rate": 1e-05, "loss": 0.0545, "step": 58100 }, { "epoch": 0.000582, "grad_norm": 0.36203429102897644, "learning_rate": 1e-05, "loss": 0.0532, "step": 58200 }, { "epoch": 0.000583, "grad_norm": 0.4052965044975281, "learning_rate": 1e-05, "loss": 0.0548, "step": 58300 }, { "epoch": 0.000584, "grad_norm": 0.33484601974487305, "learning_rate": 1e-05, "loss": 0.0541, "step": 58400 }, { "epoch": 0.000585, "grad_norm": 0.3541145920753479, "learning_rate": 1e-05, "loss": 0.0538, "step": 58500 }, { "epoch": 0.000586, "grad_norm": 0.4506705701351166, "learning_rate": 1e-05, "loss": 0.0549, "step": 58600 }, { "epoch": 0.000587, "grad_norm": 0.383552610874176, "learning_rate": 1e-05, "loss": 0.0544, "step": 58700 }, { "epoch": 0.000588, "grad_norm": 0.5501090288162231, "learning_rate": 1e-05, "loss": 0.0542, "step": 58800 }, { "epoch": 0.000589, "grad_norm": 0.36635923385620117, "learning_rate": 1e-05, "loss": 0.0545, "step": 58900 }, { "epoch": 0.00059, "grad_norm": 0.3647475838661194, "learning_rate": 1e-05, "loss": 0.0545, "step": 59000 }, { "epoch": 0.000591, "grad_norm": 0.3209948241710663, "learning_rate": 1e-05, "loss": 0.0536, "step": 59100 }, { "epoch": 0.000592, "grad_norm": 0.28525248169898987, "learning_rate": 1e-05, "loss": 0.0533, "step": 59200 }, { "epoch": 0.000593, "grad_norm": 0.3613777756690979, "learning_rate": 1e-05, "loss": 0.053, "step": 59300 }, { "epoch": 0.000594, "grad_norm": 0.35602521896362305, "learning_rate": 1e-05, "loss": 0.0548, "step": 59400 }, { "epoch": 0.000595, "grad_norm": 0.3504873514175415, "learning_rate": 1e-05, "loss": 0.0539, "step": 59500 }, { "epoch": 0.000596, "grad_norm": 0.5655069947242737, "learning_rate": 1e-05, "loss": 0.0535, "step": 59600 }, { "epoch": 0.000597, "grad_norm": 0.34876635670661926, "learning_rate": 1e-05, "loss": 0.0537, "step": 59700 }, { "epoch": 0.000598, "grad_norm": 0.39395707845687866, "learning_rate": 1e-05, "loss": 0.054, "step": 59800 }, { "epoch": 0.000599, "grad_norm": 0.41300731897354126, "learning_rate": 1e-05, "loss": 0.0535, "step": 59900 }, { "epoch": 0.0006, "grad_norm": 0.34211117029190063, "learning_rate": 1e-05, "loss": 0.0536, "step": 60000 }, { "epoch": 0.0006, "eval_loss": 0.04531589522957802, "eval_runtime": 219.9002, "eval_samples_per_second": 227.376, "eval_steps_per_second": 14.211, "step": 60000 }, { "epoch": 0.000601, "grad_norm": 0.43069586157798767, "learning_rate": 1e-05, "loss": 0.0529, "step": 60100 }, { "epoch": 0.000602, "grad_norm": 0.33741074800491333, "learning_rate": 1e-05, "loss": 0.053, "step": 60200 }, { "epoch": 0.000603, "grad_norm": 0.3577895164489746, "learning_rate": 1e-05, "loss": 0.0532, "step": 60300 }, { "epoch": 0.000604, "grad_norm": 0.5071342587471008, "learning_rate": 1e-05, "loss": 0.0528, "step": 60400 }, { "epoch": 0.000605, "grad_norm": 0.4268397092819214, "learning_rate": 1e-05, "loss": 0.0532, "step": 60500 }, { "epoch": 0.000606, "grad_norm": 0.4457167983055115, "learning_rate": 1e-05, "loss": 0.0536, "step": 60600 }, { "epoch": 0.000607, "grad_norm": 0.32387426495552063, "learning_rate": 1e-05, "loss": 0.0537, "step": 60700 }, { "epoch": 0.000608, "grad_norm": 0.3494728207588196, "learning_rate": 1e-05, "loss": 0.0532, "step": 60800 }, { "epoch": 0.000609, "grad_norm": 0.35315635800361633, "learning_rate": 1e-05, "loss": 0.0536, "step": 60900 }, { "epoch": 0.00061, "grad_norm": 0.4636612832546234, "learning_rate": 1e-05, "loss": 0.0527, "step": 61000 }, { "epoch": 0.000611, "grad_norm": 0.32300132513046265, "learning_rate": 1e-05, "loss": 0.052, "step": 61100 }, { "epoch": 0.000612, "grad_norm": 0.2963297367095947, "learning_rate": 1e-05, "loss": 0.0535, "step": 61200 }, { "epoch": 0.000613, "grad_norm": 0.5265359878540039, "learning_rate": 1e-05, "loss": 0.0536, "step": 61300 }, { "epoch": 0.000614, "grad_norm": 0.3320706784725189, "learning_rate": 1e-05, "loss": 0.0536, "step": 61400 }, { "epoch": 0.000615, "grad_norm": 0.5555657148361206, "learning_rate": 1e-05, "loss": 0.0522, "step": 61500 }, { "epoch": 0.000616, "grad_norm": 0.43259820342063904, "learning_rate": 1e-05, "loss": 0.0534, "step": 61600 }, { "epoch": 0.000617, "grad_norm": 0.5858323574066162, "learning_rate": 1e-05, "loss": 0.0528, "step": 61700 }, { "epoch": 0.000618, "grad_norm": 0.2777584195137024, "learning_rate": 1e-05, "loss": 0.0529, "step": 61800 }, { "epoch": 0.000619, "grad_norm": 0.4293353855609894, "learning_rate": 1e-05, "loss": 0.053, "step": 61900 }, { "epoch": 0.00062, "grad_norm": 0.3417837917804718, "learning_rate": 1e-05, "loss": 0.0534, "step": 62000 }, { "epoch": 0.000621, "grad_norm": 0.4296804964542389, "learning_rate": 1e-05, "loss": 0.0533, "step": 62100 }, { "epoch": 0.000622, "grad_norm": 0.3573191463947296, "learning_rate": 1e-05, "loss": 0.0527, "step": 62200 }, { "epoch": 0.000623, "grad_norm": 0.5062029957771301, "learning_rate": 1e-05, "loss": 0.0517, "step": 62300 }, { "epoch": 0.000624, "grad_norm": 0.5318054556846619, "learning_rate": 1e-05, "loss": 0.0532, "step": 62400 }, { "epoch": 0.000625, "grad_norm": 0.3115977346897125, "learning_rate": 1e-05, "loss": 0.0514, "step": 62500 }, { "epoch": 0.000626, "grad_norm": 0.43097102642059326, "learning_rate": 1e-05, "loss": 0.0521, "step": 62600 }, { "epoch": 0.000627, "grad_norm": 0.3909270465373993, "learning_rate": 1e-05, "loss": 0.0529, "step": 62700 }, { "epoch": 0.000628, "grad_norm": 0.3545512855052948, "learning_rate": 1e-05, "loss": 0.0532, "step": 62800 }, { "epoch": 0.000629, "grad_norm": 0.2722860872745514, "learning_rate": 1e-05, "loss": 0.0526, "step": 62900 }, { "epoch": 0.00063, "grad_norm": 0.45007529854774475, "learning_rate": 1e-05, "loss": 0.0528, "step": 63000 }, { "epoch": 0.000631, "grad_norm": 0.38086941838264465, "learning_rate": 1e-05, "loss": 0.0514, "step": 63100 }, { "epoch": 0.000632, "grad_norm": 0.4122013747692108, "learning_rate": 1e-05, "loss": 0.0533, "step": 63200 }, { "epoch": 0.000633, "grad_norm": 0.3021005690097809, "learning_rate": 1e-05, "loss": 0.0529, "step": 63300 }, { "epoch": 0.000634, "grad_norm": 0.4840705990791321, "learning_rate": 1e-05, "loss": 0.0523, "step": 63400 }, { "epoch": 0.000635, "grad_norm": 0.5266294479370117, "learning_rate": 1e-05, "loss": 0.0523, "step": 63500 }, { "epoch": 0.000636, "grad_norm": 0.32081693410873413, "learning_rate": 1e-05, "loss": 0.0519, "step": 63600 }, { "epoch": 0.000637, "grad_norm": 0.3658919036388397, "learning_rate": 1e-05, "loss": 0.0523, "step": 63700 }, { "epoch": 0.000638, "grad_norm": 0.34707629680633545, "learning_rate": 1e-05, "loss": 0.052, "step": 63800 }, { "epoch": 0.000639, "grad_norm": 0.4076574742794037, "learning_rate": 1e-05, "loss": 0.0522, "step": 63900 }, { "epoch": 0.00064, "grad_norm": 0.4823018014431, "learning_rate": 1e-05, "loss": 0.0521, "step": 64000 }, { "epoch": 0.000641, "grad_norm": 0.32524824142456055, "learning_rate": 1e-05, "loss": 0.0516, "step": 64100 }, { "epoch": 0.000642, "grad_norm": 0.45501217246055603, "learning_rate": 1e-05, "loss": 0.0519, "step": 64200 }, { "epoch": 0.000643, "grad_norm": 0.2624323070049286, "learning_rate": 1e-05, "loss": 0.0517, "step": 64300 }, { "epoch": 0.000644, "grad_norm": 0.3485264778137207, "learning_rate": 1e-05, "loss": 0.0524, "step": 64400 }, { "epoch": 0.000645, "grad_norm": 0.4499082565307617, "learning_rate": 1e-05, "loss": 0.0524, "step": 64500 }, { "epoch": 0.000646, "grad_norm": 0.344684362411499, "learning_rate": 1e-05, "loss": 0.0517, "step": 64600 }, { "epoch": 0.000647, "grad_norm": 0.3188033998012543, "learning_rate": 1e-05, "loss": 0.0524, "step": 64700 }, { "epoch": 0.000648, "grad_norm": 0.2919550836086273, "learning_rate": 1e-05, "loss": 0.0517, "step": 64800 }, { "epoch": 0.000649, "grad_norm": 0.3938411772251129, "learning_rate": 1e-05, "loss": 0.0511, "step": 64900 }, { "epoch": 0.00065, "grad_norm": 0.3932763934135437, "learning_rate": 1e-05, "loss": 0.0519, "step": 65000 }, { "epoch": 0.000651, "grad_norm": 0.41117754578590393, "learning_rate": 1e-05, "loss": 0.0521, "step": 65100 }, { "epoch": 0.000652, "grad_norm": 0.40838417410850525, "learning_rate": 1e-05, "loss": 0.0512, "step": 65200 }, { "epoch": 0.000653, "grad_norm": 0.5048505067825317, "learning_rate": 1e-05, "loss": 0.0519, "step": 65300 }, { "epoch": 0.000654, "grad_norm": 0.3566140830516815, "learning_rate": 1e-05, "loss": 0.0517, "step": 65400 }, { "epoch": 0.000655, "grad_norm": 0.4652426838874817, "learning_rate": 1e-05, "loss": 0.0518, "step": 65500 }, { "epoch": 0.000656, "grad_norm": 0.35729262232780457, "learning_rate": 1e-05, "loss": 0.0523, "step": 65600 }, { "epoch": 0.000657, "grad_norm": 0.3700712323188782, "learning_rate": 1e-05, "loss": 0.0517, "step": 65700 }, { "epoch": 0.000658, "grad_norm": 0.3848475515842438, "learning_rate": 1e-05, "loss": 0.0514, "step": 65800 }, { "epoch": 0.000659, "grad_norm": 0.3264726996421814, "learning_rate": 1e-05, "loss": 0.0508, "step": 65900 }, { "epoch": 0.00066, "grad_norm": 0.35905686020851135, "learning_rate": 1e-05, "loss": 0.0508, "step": 66000 }, { "epoch": 0.000661, "grad_norm": 0.35630860924720764, "learning_rate": 1e-05, "loss": 0.0524, "step": 66100 }, { "epoch": 0.000662, "grad_norm": 0.3681713342666626, "learning_rate": 1e-05, "loss": 0.0512, "step": 66200 }, { "epoch": 0.000663, "grad_norm": 0.37034353613853455, "learning_rate": 1e-05, "loss": 0.0507, "step": 66300 }, { "epoch": 0.000664, "grad_norm": 0.352030873298645, "learning_rate": 1e-05, "loss": 0.0516, "step": 66400 }, { "epoch": 0.000665, "grad_norm": 0.35171565413475037, "learning_rate": 1e-05, "loss": 0.0521, "step": 66500 }, { "epoch": 0.000666, "grad_norm": 0.3107181191444397, "learning_rate": 1e-05, "loss": 0.0514, "step": 66600 }, { "epoch": 0.000667, "grad_norm": 0.39687761664390564, "learning_rate": 1e-05, "loss": 0.0513, "step": 66700 }, { "epoch": 0.000668, "grad_norm": 0.4727918803691864, "learning_rate": 1e-05, "loss": 0.0508, "step": 66800 }, { "epoch": 0.000669, "grad_norm": 0.36843380331993103, "learning_rate": 1e-05, "loss": 0.0514, "step": 66900 }, { "epoch": 0.00067, "grad_norm": 0.4955510199069977, "learning_rate": 1e-05, "loss": 0.0519, "step": 67000 }, { "epoch": 0.000671, "grad_norm": 0.3539014756679535, "learning_rate": 1e-05, "loss": 0.0515, "step": 67100 }, { "epoch": 0.000672, "grad_norm": 0.3970053195953369, "learning_rate": 1e-05, "loss": 0.0515, "step": 67200 }, { "epoch": 0.000673, "grad_norm": 0.38408026099205017, "learning_rate": 1e-05, "loss": 0.0504, "step": 67300 }, { "epoch": 0.000674, "grad_norm": 0.4743775427341461, "learning_rate": 1e-05, "loss": 0.0515, "step": 67400 }, { "epoch": 0.000675, "grad_norm": 0.42972061038017273, "learning_rate": 1e-05, "loss": 0.0509, "step": 67500 }, { "epoch": 0.000676, "grad_norm": 0.3206966519355774, "learning_rate": 1e-05, "loss": 0.0519, "step": 67600 }, { "epoch": 0.000677, "grad_norm": 0.33453166484832764, "learning_rate": 1e-05, "loss": 0.0505, "step": 67700 }, { "epoch": 0.000678, "grad_norm": 0.3362928032875061, "learning_rate": 1e-05, "loss": 0.051, "step": 67800 }, { "epoch": 0.000679, "grad_norm": 0.3097308874130249, "learning_rate": 1e-05, "loss": 0.0505, "step": 67900 }, { "epoch": 0.00068, "grad_norm": 0.39247313141822815, "learning_rate": 1e-05, "loss": 0.0506, "step": 68000 }, { "epoch": 0.000681, "grad_norm": 0.3802288770675659, "learning_rate": 1e-05, "loss": 0.0512, "step": 68100 }, { "epoch": 0.000682, "grad_norm": 0.39734145998954773, "learning_rate": 1e-05, "loss": 0.0514, "step": 68200 }, { "epoch": 0.000683, "grad_norm": 0.2548407018184662, "learning_rate": 1e-05, "loss": 0.0503, "step": 68300 }, { "epoch": 0.000684, "grad_norm": 0.3169516324996948, "learning_rate": 1e-05, "loss": 0.0513, "step": 68400 }, { "epoch": 0.000685, "grad_norm": 0.4066063463687897, "learning_rate": 1e-05, "loss": 0.0497, "step": 68500 }, { "epoch": 0.000686, "grad_norm": 0.3124655485153198, "learning_rate": 1e-05, "loss": 0.0498, "step": 68600 }, { "epoch": 0.000687, "grad_norm": 0.41603246331214905, "learning_rate": 1e-05, "loss": 0.0512, "step": 68700 }, { "epoch": 0.000688, "grad_norm": 0.37482598423957825, "learning_rate": 1e-05, "loss": 0.0498, "step": 68800 }, { "epoch": 0.000689, "grad_norm": 0.39417704939842224, "learning_rate": 1e-05, "loss": 0.0496, "step": 68900 }, { "epoch": 0.00069, "grad_norm": 0.4208214581012726, "learning_rate": 1e-05, "loss": 0.0509, "step": 69000 }, { "epoch": 0.000691, "grad_norm": 0.36657410860061646, "learning_rate": 1e-05, "loss": 0.0506, "step": 69100 }, { "epoch": 0.000692, "grad_norm": 0.35307565331459045, "learning_rate": 1e-05, "loss": 0.0506, "step": 69200 }, { "epoch": 0.000693, "grad_norm": 0.37397703528404236, "learning_rate": 1e-05, "loss": 0.0505, "step": 69300 }, { "epoch": 0.000694, "grad_norm": 0.4969698488712311, "learning_rate": 1e-05, "loss": 0.0512, "step": 69400 }, { "epoch": 0.000695, "grad_norm": 0.36873698234558105, "learning_rate": 1e-05, "loss": 0.0508, "step": 69500 }, { "epoch": 0.000696, "grad_norm": 0.38630130887031555, "learning_rate": 1e-05, "loss": 0.0505, "step": 69600 }, { "epoch": 0.000697, "grad_norm": 0.2929743826389313, "learning_rate": 1e-05, "loss": 0.0506, "step": 69700 }, { "epoch": 0.000698, "grad_norm": 0.35411337018013, "learning_rate": 1e-05, "loss": 0.0503, "step": 69800 }, { "epoch": 0.000699, "grad_norm": 0.3946784734725952, "learning_rate": 1e-05, "loss": 0.05, "step": 69900 }, { "epoch": 0.0007, "grad_norm": 0.3455301821231842, "learning_rate": 1e-05, "loss": 0.0497, "step": 70000 }, { "epoch": 0.000701, "grad_norm": 0.36670762300491333, "learning_rate": 1e-05, "loss": 0.0498, "step": 70100 }, { "epoch": 0.000702, "grad_norm": 0.3253152668476105, "learning_rate": 1e-05, "loss": 0.0499, "step": 70200 }, { "epoch": 0.000703, "grad_norm": 0.304723858833313, "learning_rate": 1e-05, "loss": 0.0506, "step": 70300 }, { "epoch": 0.000704, "grad_norm": 0.4939587712287903, "learning_rate": 1e-05, "loss": 0.0498, "step": 70400 }, { "epoch": 0.000705, "grad_norm": 0.26826539635658264, "learning_rate": 1e-05, "loss": 0.0506, "step": 70500 }, { "epoch": 0.000706, "grad_norm": 0.3034983277320862, "learning_rate": 1e-05, "loss": 0.0502, "step": 70600 }, { "epoch": 0.000707, "grad_norm": 0.3437539041042328, "learning_rate": 1e-05, "loss": 0.0504, "step": 70700 }, { "epoch": 0.000708, "grad_norm": 0.3020792603492737, "learning_rate": 1e-05, "loss": 0.0502, "step": 70800 }, { "epoch": 0.000709, "grad_norm": 0.2701978385448456, "learning_rate": 1e-05, "loss": 0.05, "step": 70900 }, { "epoch": 0.00071, "grad_norm": 0.33766886591911316, "learning_rate": 1e-05, "loss": 0.0498, "step": 71000 }, { "epoch": 0.000711, "grad_norm": 0.3468993008136749, "learning_rate": 1e-05, "loss": 0.0497, "step": 71100 }, { "epoch": 0.000712, "grad_norm": 0.43122342228889465, "learning_rate": 1e-05, "loss": 0.0496, "step": 71200 }, { "epoch": 0.000713, "grad_norm": 0.3271734416484833, "learning_rate": 1e-05, "loss": 0.0488, "step": 71300 }, { "epoch": 0.000714, "grad_norm": 0.2678506076335907, "learning_rate": 1e-05, "loss": 0.0491, "step": 71400 }, { "epoch": 0.000715, "grad_norm": 0.2575507164001465, "learning_rate": 1e-05, "loss": 0.0496, "step": 71500 }, { "epoch": 0.000716, "grad_norm": 0.30854716897010803, "learning_rate": 1e-05, "loss": 0.0499, "step": 71600 }, { "epoch": 0.000717, "grad_norm": 0.47582992911338806, "learning_rate": 1e-05, "loss": 0.0486, "step": 71700 }, { "epoch": 0.000718, "grad_norm": 0.36227524280548096, "learning_rate": 1e-05, "loss": 0.049, "step": 71800 }, { "epoch": 0.000719, "grad_norm": 0.355499267578125, "learning_rate": 1e-05, "loss": 0.0495, "step": 71900 }, { "epoch": 0.00072, "grad_norm": 0.4007318615913391, "learning_rate": 1e-05, "loss": 0.0491, "step": 72000 }, { "epoch": 0.000721, "grad_norm": 0.34989556670188904, "learning_rate": 1e-05, "loss": 0.0503, "step": 72100 }, { "epoch": 0.000722, "grad_norm": 0.31340816617012024, "learning_rate": 1e-05, "loss": 0.0487, "step": 72200 }, { "epoch": 0.000723, "grad_norm": 0.24752527475357056, "learning_rate": 1e-05, "loss": 0.0486, "step": 72300 }, { "epoch": 0.000724, "grad_norm": 0.31003397703170776, "learning_rate": 1e-05, "loss": 0.0488, "step": 72400 }, { "epoch": 0.000725, "grad_norm": 0.2911018431186676, "learning_rate": 1e-05, "loss": 0.0496, "step": 72500 }, { "epoch": 0.000726, "grad_norm": 0.5012894868850708, "learning_rate": 1e-05, "loss": 0.0497, "step": 72600 }, { "epoch": 0.000727, "grad_norm": 0.33135247230529785, "learning_rate": 1e-05, "loss": 0.0486, "step": 72700 }, { "epoch": 0.000728, "grad_norm": 0.33216941356658936, "learning_rate": 1e-05, "loss": 0.0499, "step": 72800 }, { "epoch": 0.000729, "grad_norm": 0.2781502902507782, "learning_rate": 1e-05, "loss": 0.0496, "step": 72900 }, { "epoch": 0.00073, "grad_norm": 0.39392635226249695, "learning_rate": 1e-05, "loss": 0.0491, "step": 73000 }, { "epoch": 0.000731, "grad_norm": 0.306863009929657, "learning_rate": 1e-05, "loss": 0.0495, "step": 73100 }, { "epoch": 0.000732, "grad_norm": 0.3702819347381592, "learning_rate": 1e-05, "loss": 0.0485, "step": 73200 }, { "epoch": 0.000733, "grad_norm": 0.3114405572414398, "learning_rate": 1e-05, "loss": 0.0489, "step": 73300 }, { "epoch": 0.000734, "grad_norm": 0.3184596300125122, "learning_rate": 1e-05, "loss": 0.0496, "step": 73400 }, { "epoch": 0.000735, "grad_norm": 0.5314540863037109, "learning_rate": 1e-05, "loss": 0.0491, "step": 73500 }, { "epoch": 0.000736, "grad_norm": 0.383344441652298, "learning_rate": 1e-05, "loss": 0.0492, "step": 73600 }, { "epoch": 0.000737, "grad_norm": 0.3960054814815521, "learning_rate": 1e-05, "loss": 0.049, "step": 73700 }, { "epoch": 0.000738, "grad_norm": 0.365721195936203, "learning_rate": 1e-05, "loss": 0.0491, "step": 73800 }, { "epoch": 0.000739, "grad_norm": 0.4698300361633301, "learning_rate": 1e-05, "loss": 0.0493, "step": 73900 }, { "epoch": 0.00074, "grad_norm": 0.29865899682044983, "learning_rate": 1e-05, "loss": 0.049, "step": 74000 }, { "epoch": 0.000741, "grad_norm": 0.38016796112060547, "learning_rate": 1e-05, "loss": 0.0487, "step": 74100 }, { "epoch": 0.000742, "grad_norm": 0.4035409092903137, "learning_rate": 1e-05, "loss": 0.0499, "step": 74200 }, { "epoch": 0.000743, "grad_norm": 0.33639755845069885, "learning_rate": 1e-05, "loss": 0.0489, "step": 74300 }, { "epoch": 0.000744, "grad_norm": 0.3183378279209137, "learning_rate": 1e-05, "loss": 0.0489, "step": 74400 }, { "epoch": 0.000745, "grad_norm": 0.3105449974536896, "learning_rate": 1e-05, "loss": 0.0488, "step": 74500 }, { "epoch": 0.000746, "grad_norm": 0.5107073783874512, "learning_rate": 1e-05, "loss": 0.0489, "step": 74600 }, { "epoch": 0.000747, "grad_norm": 0.28981220722198486, "learning_rate": 1e-05, "loss": 0.0477, "step": 74700 }, { "epoch": 0.000748, "grad_norm": 0.3311959505081177, "learning_rate": 1e-05, "loss": 0.0488, "step": 74800 }, { "epoch": 0.000749, "grad_norm": 0.3612598776817322, "learning_rate": 1e-05, "loss": 0.0479, "step": 74900 }, { "epoch": 0.00075, "grad_norm": 0.30859631299972534, "learning_rate": 1e-05, "loss": 0.0483, "step": 75000 }, { "epoch": 0.000751, "grad_norm": 0.3856201469898224, "learning_rate": 1e-05, "loss": 0.0481, "step": 75100 }, { "epoch": 0.000752, "grad_norm": 0.6222278475761414, "learning_rate": 1e-05, "loss": 0.0488, "step": 75200 }, { "epoch": 0.000753, "grad_norm": 0.4198635220527649, "learning_rate": 1e-05, "loss": 0.0481, "step": 75300 }, { "epoch": 0.000754, "grad_norm": 0.3192780911922455, "learning_rate": 1e-05, "loss": 0.0481, "step": 75400 }, { "epoch": 0.000755, "grad_norm": 0.3419158458709717, "learning_rate": 1e-05, "loss": 0.0489, "step": 75500 }, { "epoch": 0.000756, "grad_norm": 0.2644210755825043, "learning_rate": 1e-05, "loss": 0.0487, "step": 75600 }, { "epoch": 0.000757, "grad_norm": 0.4006797671318054, "learning_rate": 1e-05, "loss": 0.0485, "step": 75700 }, { "epoch": 0.000758, "grad_norm": 0.3275812268257141, "learning_rate": 1e-05, "loss": 0.0481, "step": 75800 }, { "epoch": 0.000759, "grad_norm": 0.35157835483551025, "learning_rate": 1e-05, "loss": 0.0486, "step": 75900 }, { "epoch": 0.00076, "grad_norm": 0.36288097500801086, "learning_rate": 1e-05, "loss": 0.0483, "step": 76000 }, { "epoch": 0.000761, "grad_norm": 0.47141945362091064, "learning_rate": 1e-05, "loss": 0.0483, "step": 76100 }, { "epoch": 0.000762, "grad_norm": 0.3608536422252655, "learning_rate": 1e-05, "loss": 0.048, "step": 76200 }, { "epoch": 0.000763, "grad_norm": 0.36795374751091003, "learning_rate": 1e-05, "loss": 0.0473, "step": 76300 }, { "epoch": 0.000764, "grad_norm": 0.3102205991744995, "learning_rate": 1e-05, "loss": 0.048, "step": 76400 }, { "epoch": 0.000765, "grad_norm": 0.3840140700340271, "learning_rate": 1e-05, "loss": 0.0488, "step": 76500 }, { "epoch": 0.000766, "grad_norm": 0.4733908176422119, "learning_rate": 1e-05, "loss": 0.0475, "step": 76600 }, { "epoch": 0.000767, "grad_norm": 0.41359737515449524, "learning_rate": 1e-05, "loss": 0.048, "step": 76700 }, { "epoch": 0.000768, "grad_norm": 0.31524357199668884, "learning_rate": 1e-05, "loss": 0.048, "step": 76800 }, { "epoch": 0.000769, "grad_norm": 0.28958508372306824, "learning_rate": 1e-05, "loss": 0.0487, "step": 76900 }, { "epoch": 0.00077, "grad_norm": 0.34593164920806885, "learning_rate": 1e-05, "loss": 0.0481, "step": 77000 }, { "epoch": 0.000771, "grad_norm": 0.31738200783729553, "learning_rate": 1e-05, "loss": 0.0487, "step": 77100 }, { "epoch": 0.000772, "grad_norm": 0.42067286372184753, "learning_rate": 1e-05, "loss": 0.047, "step": 77200 }, { "epoch": 0.000773, "grad_norm": 0.3165457248687744, "learning_rate": 1e-05, "loss": 0.0479, "step": 77300 }, { "epoch": 0.000774, "grad_norm": 0.29272714257240295, "learning_rate": 1e-05, "loss": 0.048, "step": 77400 }, { "epoch": 0.000775, "grad_norm": 0.31512001156806946, "learning_rate": 1e-05, "loss": 0.0482, "step": 77500 }, { "epoch": 0.000776, "grad_norm": 0.44647252559661865, "learning_rate": 1e-05, "loss": 0.0479, "step": 77600 }, { "epoch": 0.000777, "grad_norm": 0.3410949110984802, "learning_rate": 1e-05, "loss": 0.0481, "step": 77700 }, { "epoch": 0.000778, "grad_norm": 0.35861316323280334, "learning_rate": 1e-05, "loss": 0.0476, "step": 77800 }, { "epoch": 0.000779, "grad_norm": 0.37881168723106384, "learning_rate": 1e-05, "loss": 0.0466, "step": 77900 }, { "epoch": 0.00078, "grad_norm": 0.459828644990921, "learning_rate": 1e-05, "loss": 0.0484, "step": 78000 }, { "epoch": 0.000781, "grad_norm": 0.31915751099586487, "learning_rate": 1e-05, "loss": 0.0487, "step": 78100 }, { "epoch": 0.000782, "grad_norm": 0.3420863151550293, "learning_rate": 1e-05, "loss": 0.0474, "step": 78200 }, { "epoch": 0.000783, "grad_norm": 0.307573139667511, "learning_rate": 1e-05, "loss": 0.0483, "step": 78300 }, { "epoch": 0.000784, "grad_norm": 0.9490934610366821, "learning_rate": 1e-05, "loss": 0.0476, "step": 78400 }, { "epoch": 0.000785, "grad_norm": 0.274152547121048, "learning_rate": 1e-05, "loss": 0.0472, "step": 78500 }, { "epoch": 0.000786, "grad_norm": 0.31905946135520935, "learning_rate": 1e-05, "loss": 0.0478, "step": 78600 }, { "epoch": 0.000787, "grad_norm": 0.32099300622940063, "learning_rate": 1e-05, "loss": 0.0476, "step": 78700 }, { "epoch": 0.000788, "grad_norm": 0.2800377607345581, "learning_rate": 1e-05, "loss": 0.0475, "step": 78800 }, { "epoch": 0.000789, "grad_norm": 0.2727205157279968, "learning_rate": 1e-05, "loss": 0.0479, "step": 78900 }, { "epoch": 0.00079, "grad_norm": 0.2901556193828583, "learning_rate": 1e-05, "loss": 0.0472, "step": 79000 }, { "epoch": 0.000791, "grad_norm": 0.32120728492736816, "learning_rate": 1e-05, "loss": 0.0463, "step": 79100 }, { "epoch": 0.000792, "grad_norm": 0.3305732309818268, "learning_rate": 1e-05, "loss": 0.0475, "step": 79200 }, { "epoch": 0.000793, "grad_norm": 0.3053627908229828, "learning_rate": 1e-05, "loss": 0.0476, "step": 79300 }, { "epoch": 0.000794, "grad_norm": 0.35901638865470886, "learning_rate": 1e-05, "loss": 0.047, "step": 79400 }, { "epoch": 0.000795, "grad_norm": 0.36568284034729004, "learning_rate": 1e-05, "loss": 0.047, "step": 79500 }, { "epoch": 0.000796, "grad_norm": 0.32313764095306396, "learning_rate": 1e-05, "loss": 0.0476, "step": 79600 }, { "epoch": 0.000797, "grad_norm": 0.3958641290664673, "learning_rate": 1e-05, "loss": 0.0475, "step": 79700 }, { "epoch": 0.000798, "grad_norm": 0.2990384101867676, "learning_rate": 1e-05, "loss": 0.0472, "step": 79800 }, { "epoch": 0.000799, "grad_norm": 0.2650670111179352, "learning_rate": 1e-05, "loss": 0.0477, "step": 79900 }, { "epoch": 0.0008, "grad_norm": 0.30135366320610046, "learning_rate": 1e-05, "loss": 0.0473, "step": 80000 }, { "epoch": 0.0008, "eval_loss": 0.04017254337668419, "eval_runtime": 212.4631, "eval_samples_per_second": 235.335, "eval_steps_per_second": 14.708, "step": 80000 }, { "epoch": 0.000801, "grad_norm": 0.3213781416416168, "learning_rate": 1e-05, "loss": 0.0474, "step": 80100 }, { "epoch": 0.000802, "grad_norm": 0.3178090453147888, "learning_rate": 1e-05, "loss": 0.0465, "step": 80200 }, { "epoch": 0.000803, "grad_norm": 0.34021860361099243, "learning_rate": 1e-05, "loss": 0.047, "step": 80300 }, { "epoch": 0.000804, "grad_norm": 0.3322145640850067, "learning_rate": 1e-05, "loss": 0.0477, "step": 80400 }, { "epoch": 0.000805, "grad_norm": 0.2663530111312866, "learning_rate": 1e-05, "loss": 0.0467, "step": 80500 }, { "epoch": 0.000806, "grad_norm": 0.3427180349826813, "learning_rate": 1e-05, "loss": 0.0472, "step": 80600 }, { "epoch": 0.000807, "grad_norm": 0.36006274819374084, "learning_rate": 1e-05, "loss": 0.047, "step": 80700 }, { "epoch": 0.000808, "grad_norm": 0.3332625925540924, "learning_rate": 1e-05, "loss": 0.0464, "step": 80800 }, { "epoch": 0.000809, "grad_norm": 0.3925895094871521, "learning_rate": 1e-05, "loss": 0.0465, "step": 80900 }, { "epoch": 0.00081, "grad_norm": 0.2846932113170624, "learning_rate": 1e-05, "loss": 0.0465, "step": 81000 }, { "epoch": 0.000811, "grad_norm": 0.3175708055496216, "learning_rate": 1e-05, "loss": 0.0475, "step": 81100 }, { "epoch": 0.000812, "grad_norm": 0.3311077058315277, "learning_rate": 1e-05, "loss": 0.0463, "step": 81200 }, { "epoch": 0.000813, "grad_norm": 0.3333699703216553, "learning_rate": 1e-05, "loss": 0.0467, "step": 81300 }, { "epoch": 0.000814, "grad_norm": 0.28906020522117615, "learning_rate": 1e-05, "loss": 0.047, "step": 81400 }, { "epoch": 0.000815, "grad_norm": 0.3447752296924591, "learning_rate": 1e-05, "loss": 0.0462, "step": 81500 }, { "epoch": 0.000816, "grad_norm": 0.325057715177536, "learning_rate": 1e-05, "loss": 0.0472, "step": 81600 }, { "epoch": 0.000817, "grad_norm": 0.3964000642299652, "learning_rate": 1e-05, "loss": 0.0473, "step": 81700 }, { "epoch": 0.000818, "grad_norm": 0.3415572941303253, "learning_rate": 1e-05, "loss": 0.0465, "step": 81800 }, { "epoch": 0.000819, "grad_norm": 0.30907049775123596, "learning_rate": 1e-05, "loss": 0.0468, "step": 81900 }, { "epoch": 0.00082, "grad_norm": 0.4039738178253174, "learning_rate": 1e-05, "loss": 0.0469, "step": 82000 }, { "epoch": 0.000821, "grad_norm": 0.3790346682071686, "learning_rate": 1e-05, "loss": 0.0462, "step": 82100 }, { "epoch": 0.000822, "grad_norm": 0.36530008912086487, "learning_rate": 1e-05, "loss": 0.0453, "step": 82200 }, { "epoch": 0.000823, "grad_norm": 0.2553286552429199, "learning_rate": 1e-05, "loss": 0.0474, "step": 82300 }, { "epoch": 0.000824, "grad_norm": 0.42663460969924927, "learning_rate": 1e-05, "loss": 0.0466, "step": 82400 }, { "epoch": 0.000825, "grad_norm": 0.33019331097602844, "learning_rate": 1e-05, "loss": 0.0477, "step": 82500 }, { "epoch": 0.000826, "grad_norm": 0.3135083019733429, "learning_rate": 1e-05, "loss": 0.0462, "step": 82600 }, { "epoch": 0.000827, "grad_norm": 0.31327399611473083, "learning_rate": 1e-05, "loss": 0.0463, "step": 82700 }, { "epoch": 0.000828, "grad_norm": 0.35115787386894226, "learning_rate": 1e-05, "loss": 0.0453, "step": 82800 }, { "epoch": 0.000829, "grad_norm": 0.3061750829219818, "learning_rate": 1e-05, "loss": 0.0458, "step": 82900 }, { "epoch": 0.00083, "grad_norm": 0.2539336085319519, "learning_rate": 1e-05, "loss": 0.0459, "step": 83000 }, { "epoch": 0.000831, "grad_norm": 0.3080989420413971, "learning_rate": 1e-05, "loss": 0.0464, "step": 83100 }, { "epoch": 0.000832, "grad_norm": 0.26679521799087524, "learning_rate": 1e-05, "loss": 0.0468, "step": 83200 }, { "epoch": 0.000833, "grad_norm": 0.3923046290874481, "learning_rate": 1e-05, "loss": 0.0456, "step": 83300 }, { "epoch": 0.000834, "grad_norm": 0.30227354168891907, "learning_rate": 1e-05, "loss": 0.047, "step": 83400 }, { "epoch": 0.000835, "grad_norm": 0.3947966694831848, "learning_rate": 1e-05, "loss": 0.0464, "step": 83500 }, { "epoch": 0.000836, "grad_norm": 0.3319546580314636, "learning_rate": 1e-05, "loss": 0.0461, "step": 83600 }, { "epoch": 0.000837, "grad_norm": 0.32283881306648254, "learning_rate": 1e-05, "loss": 0.0458, "step": 83700 }, { "epoch": 0.000838, "grad_norm": 0.36611098051071167, "learning_rate": 1e-05, "loss": 0.0471, "step": 83800 }, { "epoch": 0.000839, "grad_norm": 0.35389381647109985, "learning_rate": 1e-05, "loss": 0.046, "step": 83900 }, { "epoch": 0.00084, "grad_norm": 0.5042591094970703, "learning_rate": 1e-05, "loss": 0.0456, "step": 84000 }, { "epoch": 0.000841, "grad_norm": 0.29622530937194824, "learning_rate": 1e-05, "loss": 0.0456, "step": 84100 }, { "epoch": 0.000842, "grad_norm": 0.30188027024269104, "learning_rate": 1e-05, "loss": 0.0466, "step": 84200 }, { "epoch": 0.000843, "grad_norm": 0.28298214077949524, "learning_rate": 1e-05, "loss": 0.046, "step": 84300 }, { "epoch": 0.000844, "grad_norm": 0.3248654007911682, "learning_rate": 1e-05, "loss": 0.0458, "step": 84400 }, { "epoch": 0.000845, "grad_norm": 0.4771508276462555, "learning_rate": 1e-05, "loss": 0.0463, "step": 84500 }, { "epoch": 0.000846, "grad_norm": 0.3331417739391327, "learning_rate": 1e-05, "loss": 0.0464, "step": 84600 }, { "epoch": 0.000847, "grad_norm": 0.31564566493034363, "learning_rate": 1e-05, "loss": 0.0459, "step": 84700 }, { "epoch": 0.000848, "grad_norm": 0.32385772466659546, "learning_rate": 1e-05, "loss": 0.0465, "step": 84800 }, { "epoch": 0.000849, "grad_norm": 0.49447697401046753, "learning_rate": 1e-05, "loss": 0.0465, "step": 84900 }, { "epoch": 0.00085, "grad_norm": 0.2832833230495453, "learning_rate": 1e-05, "loss": 0.0454, "step": 85000 }, { "epoch": 0.000851, "grad_norm": 0.3475983738899231, "learning_rate": 1e-05, "loss": 0.0463, "step": 85100 }, { "epoch": 0.000852, "grad_norm": 0.359330415725708, "learning_rate": 1e-05, "loss": 0.0461, "step": 85200 }, { "epoch": 0.000853, "grad_norm": 0.2958233058452606, "learning_rate": 1e-05, "loss": 0.046, "step": 85300 }, { "epoch": 0.000854, "grad_norm": 0.27185893058776855, "learning_rate": 1e-05, "loss": 0.0464, "step": 85400 }, { "epoch": 0.000855, "grad_norm": 0.4057115316390991, "learning_rate": 1e-05, "loss": 0.0468, "step": 85500 }, { "epoch": 0.000856, "grad_norm": 0.274044394493103, "learning_rate": 1e-05, "loss": 0.0455, "step": 85600 }, { "epoch": 0.000857, "grad_norm": 0.3238118290901184, "learning_rate": 1e-05, "loss": 0.0461, "step": 85700 }, { "epoch": 0.000858, "grad_norm": 0.3129659593105316, "learning_rate": 1e-05, "loss": 0.0456, "step": 85800 }, { "epoch": 0.000859, "grad_norm": 0.3373371362686157, "learning_rate": 1e-05, "loss": 0.0459, "step": 85900 }, { "epoch": 0.00086, "grad_norm": 0.30496785044670105, "learning_rate": 1e-05, "loss": 0.0454, "step": 86000 }, { "epoch": 0.000861, "grad_norm": 0.2749803364276886, "learning_rate": 1e-05, "loss": 0.0463, "step": 86100 }, { "epoch": 0.000862, "grad_norm": 0.3140694499015808, "learning_rate": 1e-05, "loss": 0.0458, "step": 86200 }, { "epoch": 0.000863, "grad_norm": 0.31033578515052795, "learning_rate": 1e-05, "loss": 0.0455, "step": 86300 }, { "epoch": 0.000864, "grad_norm": 0.4615796208381653, "learning_rate": 1e-05, "loss": 0.0453, "step": 86400 }, { "epoch": 0.000865, "grad_norm": 0.2911795973777771, "learning_rate": 1e-05, "loss": 0.0448, "step": 86500 }, { "epoch": 0.000866, "grad_norm": 0.3546088635921478, "learning_rate": 1e-05, "loss": 0.0459, "step": 86600 }, { "epoch": 0.000867, "grad_norm": 0.33922651410102844, "learning_rate": 1e-05, "loss": 0.0459, "step": 86700 }, { "epoch": 0.000868, "grad_norm": 0.31192243099212646, "learning_rate": 1e-05, "loss": 0.0455, "step": 86800 }, { "epoch": 0.000869, "grad_norm": 0.3535122573375702, "learning_rate": 1e-05, "loss": 0.0456, "step": 86900 }, { "epoch": 0.00087, "grad_norm": 0.3543219566345215, "learning_rate": 1e-05, "loss": 0.0457, "step": 87000 }, { "epoch": 0.000871, "grad_norm": 0.30386295914649963, "learning_rate": 1e-05, "loss": 0.0457, "step": 87100 }, { "epoch": 0.000872, "grad_norm": 0.2788744866847992, "learning_rate": 1e-05, "loss": 0.0449, "step": 87200 }, { "epoch": 0.000873, "grad_norm": 0.3341538608074188, "learning_rate": 1e-05, "loss": 0.0458, "step": 87300 }, { "epoch": 0.000874, "grad_norm": 0.39732661843299866, "learning_rate": 1e-05, "loss": 0.0447, "step": 87400 }, { "epoch": 0.000875, "grad_norm": 0.46283286809921265, "learning_rate": 1e-05, "loss": 0.0469, "step": 87500 }, { "epoch": 0.000876, "grad_norm": 0.5847726464271545, "learning_rate": 1e-05, "loss": 0.0456, "step": 87600 }, { "epoch": 0.000877, "grad_norm": 0.3262990415096283, "learning_rate": 1e-05, "loss": 0.0451, "step": 87700 }, { "epoch": 0.000878, "grad_norm": 0.3130342662334442, "learning_rate": 1e-05, "loss": 0.0452, "step": 87800 }, { "epoch": 0.000879, "grad_norm": 0.2922936975955963, "learning_rate": 1e-05, "loss": 0.0449, "step": 87900 }, { "epoch": 0.00088, "grad_norm": 0.2955271601676941, "learning_rate": 1e-05, "loss": 0.0456, "step": 88000 }, { "epoch": 0.000881, "grad_norm": 0.3558080494403839, "learning_rate": 1e-05, "loss": 0.0457, "step": 88100 }, { "epoch": 0.000882, "grad_norm": 0.3319706320762634, "learning_rate": 1e-05, "loss": 0.0454, "step": 88200 }, { "epoch": 0.000883, "grad_norm": 0.38755717873573303, "learning_rate": 1e-05, "loss": 0.0454, "step": 88300 }, { "epoch": 0.000884, "grad_norm": 0.31640440225601196, "learning_rate": 1e-05, "loss": 0.0455, "step": 88400 }, { "epoch": 0.000885, "grad_norm": 0.3367459774017334, "learning_rate": 1e-05, "loss": 0.0448, "step": 88500 }, { "epoch": 0.000886, "grad_norm": 0.320428729057312, "learning_rate": 1e-05, "loss": 0.0455, "step": 88600 }, { "epoch": 0.000887, "grad_norm": 0.3226354122161865, "learning_rate": 1e-05, "loss": 0.0455, "step": 88700 }, { "epoch": 0.000888, "grad_norm": 0.33319684863090515, "learning_rate": 1e-05, "loss": 0.0448, "step": 88800 }, { "epoch": 0.000889, "grad_norm": 0.3111964166164398, "learning_rate": 1e-05, "loss": 0.0449, "step": 88900 }, { "epoch": 0.00089, "grad_norm": 0.3649284243583679, "learning_rate": 1e-05, "loss": 0.0444, "step": 89000 }, { "epoch": 0.000891, "grad_norm": 0.3107942044734955, "learning_rate": 1e-05, "loss": 0.0445, "step": 89100 }, { "epoch": 0.000892, "grad_norm": 0.32820025086402893, "learning_rate": 1e-05, "loss": 0.0452, "step": 89200 }, { "epoch": 0.000893, "grad_norm": 0.32359620928764343, "learning_rate": 1e-05, "loss": 0.0446, "step": 89300 }, { "epoch": 0.000894, "grad_norm": 0.2796304225921631, "learning_rate": 1e-05, "loss": 0.0447, "step": 89400 }, { "epoch": 0.000895, "grad_norm": 0.32259878516197205, "learning_rate": 1e-05, "loss": 0.0452, "step": 89500 }, { "epoch": 0.000896, "grad_norm": 0.3799293339252472, "learning_rate": 1e-05, "loss": 0.0451, "step": 89600 }, { "epoch": 0.000897, "grad_norm": 0.3556264340877533, "learning_rate": 1e-05, "loss": 0.0452, "step": 89700 }, { "epoch": 0.000898, "grad_norm": 0.24393457174301147, "learning_rate": 1e-05, "loss": 0.0443, "step": 89800 }, { "epoch": 0.000899, "grad_norm": 0.5716689229011536, "learning_rate": 1e-05, "loss": 0.0448, "step": 89900 }, { "epoch": 0.0009, "grad_norm": 0.3054894506931305, "learning_rate": 1e-05, "loss": 0.0445, "step": 90000 }, { "epoch": 0.000901, "grad_norm": 0.2873828113079071, "learning_rate": 1e-05, "loss": 0.0446, "step": 90100 }, { "epoch": 0.000902, "grad_norm": 0.37362852692604065, "learning_rate": 1e-05, "loss": 0.0447, "step": 90200 }, { "epoch": 0.000903, "grad_norm": 0.3093227446079254, "learning_rate": 1e-05, "loss": 0.0456, "step": 90300 }, { "epoch": 0.000904, "grad_norm": 0.3574603497982025, "learning_rate": 1e-05, "loss": 0.0444, "step": 90400 }, { "epoch": 0.000905, "grad_norm": 0.40761929750442505, "learning_rate": 1e-05, "loss": 0.0446, "step": 90500 }, { "epoch": 0.000906, "grad_norm": 0.31372490525245667, "learning_rate": 1e-05, "loss": 0.0446, "step": 90600 }, { "epoch": 0.000907, "grad_norm": 0.29550039768218994, "learning_rate": 1e-05, "loss": 0.0446, "step": 90700 }, { "epoch": 0.000908, "grad_norm": 0.3591623604297638, "learning_rate": 1e-05, "loss": 0.0451, "step": 90800 }, { "epoch": 0.000909, "grad_norm": 0.36907652020454407, "learning_rate": 1e-05, "loss": 0.0446, "step": 90900 }, { "epoch": 0.00091, "grad_norm": 0.2970718443393707, "learning_rate": 1e-05, "loss": 0.0446, "step": 91000 }, { "epoch": 0.000911, "grad_norm": 0.3444265127182007, "learning_rate": 1e-05, "loss": 0.0436, "step": 91100 }, { "epoch": 0.000912, "grad_norm": 0.30564844608306885, "learning_rate": 1e-05, "loss": 0.0443, "step": 91200 }, { "epoch": 0.000913, "grad_norm": 0.26162412762641907, "learning_rate": 1e-05, "loss": 0.045, "step": 91300 }, { "epoch": 0.000914, "grad_norm": 0.25329187512397766, "learning_rate": 1e-05, "loss": 0.045, "step": 91400 }, { "epoch": 0.000915, "grad_norm": 0.3086632192134857, "learning_rate": 1e-05, "loss": 0.0439, "step": 91500 }, { "epoch": 0.000916, "grad_norm": 0.2906673848628998, "learning_rate": 1e-05, "loss": 0.0446, "step": 91600 }, { "epoch": 0.000917, "grad_norm": 0.29768794775009155, "learning_rate": 1e-05, "loss": 0.0449, "step": 91700 }, { "epoch": 0.000918, "grad_norm": 0.27093201875686646, "learning_rate": 1e-05, "loss": 0.045, "step": 91800 }, { "epoch": 0.000919, "grad_norm": 0.32795876264572144, "learning_rate": 1e-05, "loss": 0.044, "step": 91900 }, { "epoch": 0.00092, "grad_norm": 0.4039120376110077, "learning_rate": 1e-05, "loss": 0.0433, "step": 92000 }, { "epoch": 0.000921, "grad_norm": 0.3751141130924225, "learning_rate": 1e-05, "loss": 0.0445, "step": 92100 }, { "epoch": 0.000922, "grad_norm": 0.30629658699035645, "learning_rate": 1e-05, "loss": 0.0443, "step": 92200 }, { "epoch": 0.000923, "grad_norm": 0.27837151288986206, "learning_rate": 1e-05, "loss": 0.0439, "step": 92300 }, { "epoch": 0.000924, "grad_norm": 0.2930457592010498, "learning_rate": 1e-05, "loss": 0.0447, "step": 92400 }, { "epoch": 0.000925, "grad_norm": 0.3771900236606598, "learning_rate": 1e-05, "loss": 0.0439, "step": 92500 }, { "epoch": 0.000926, "grad_norm": 0.3665456771850586, "learning_rate": 1e-05, "loss": 0.0439, "step": 92600 }, { "epoch": 0.000927, "grad_norm": 0.28601157665252686, "learning_rate": 1e-05, "loss": 0.0437, "step": 92700 }, { "epoch": 0.000928, "grad_norm": 0.26007533073425293, "learning_rate": 1e-05, "loss": 0.045, "step": 92800 }, { "epoch": 0.000929, "grad_norm": 0.2893894612789154, "learning_rate": 1e-05, "loss": 0.0439, "step": 92900 }, { "epoch": 0.00093, "grad_norm": 0.3981125056743622, "learning_rate": 1e-05, "loss": 0.045, "step": 93000 }, { "epoch": 0.000931, "grad_norm": 0.320941299200058, "learning_rate": 1e-05, "loss": 0.0435, "step": 93100 }, { "epoch": 0.000932, "grad_norm": 0.2948634624481201, "learning_rate": 1e-05, "loss": 0.0431, "step": 93200 }, { "epoch": 0.000933, "grad_norm": 0.2815658748149872, "learning_rate": 1e-05, "loss": 0.0436, "step": 93300 }, { "epoch": 0.000934, "grad_norm": 0.3733885884284973, "learning_rate": 1e-05, "loss": 0.045, "step": 93400 }, { "epoch": 0.000935, "grad_norm": 0.287563294172287, "learning_rate": 1e-05, "loss": 0.0441, "step": 93500 }, { "epoch": 0.000936, "grad_norm": 0.2510587275028229, "learning_rate": 1e-05, "loss": 0.0437, "step": 93600 }, { "epoch": 0.000937, "grad_norm": 0.32831403613090515, "learning_rate": 1e-05, "loss": 0.0443, "step": 93700 }, { "epoch": 0.000938, "grad_norm": 0.2637534439563751, "learning_rate": 1e-05, "loss": 0.0449, "step": 93800 }, { "epoch": 0.000939, "grad_norm": 0.33846515417099, "learning_rate": 1e-05, "loss": 0.045, "step": 93900 }, { "epoch": 0.00094, "grad_norm": 0.30465173721313477, "learning_rate": 1e-05, "loss": 0.0442, "step": 94000 }, { "epoch": 0.000941, "grad_norm": 0.3105590045452118, "learning_rate": 1e-05, "loss": 0.0439, "step": 94100 }, { "epoch": 0.000942, "grad_norm": 0.32218441367149353, "learning_rate": 1e-05, "loss": 0.0437, "step": 94200 }, { "epoch": 0.000943, "grad_norm": 0.2730764150619507, "learning_rate": 1e-05, "loss": 0.0443, "step": 94300 }, { "epoch": 0.000944, "grad_norm": 0.32355883717536926, "learning_rate": 1e-05, "loss": 0.0432, "step": 94400 }, { "epoch": 0.000945, "grad_norm": 0.26898112893104553, "learning_rate": 1e-05, "loss": 0.0441, "step": 94500 }, { "epoch": 0.000946, "grad_norm": 0.29850703477859497, "learning_rate": 1e-05, "loss": 0.0441, "step": 94600 }, { "epoch": 0.000947, "grad_norm": 0.3214324414730072, "learning_rate": 1e-05, "loss": 0.0444, "step": 94700 }, { "epoch": 0.000948, "grad_norm": 0.3554806411266327, "learning_rate": 1e-05, "loss": 0.0439, "step": 94800 }, { "epoch": 0.000949, "grad_norm": 0.37169715762138367, "learning_rate": 1e-05, "loss": 0.0442, "step": 94900 }, { "epoch": 0.00095, "grad_norm": 0.5487441420555115, "learning_rate": 1e-05, "loss": 0.0432, "step": 95000 }, { "epoch": 0.000951, "grad_norm": 0.4752700924873352, "learning_rate": 1e-05, "loss": 0.0444, "step": 95100 }, { "epoch": 0.000952, "grad_norm": 0.2988210618495941, "learning_rate": 1e-05, "loss": 0.0435, "step": 95200 }, { "epoch": 0.000953, "grad_norm": 0.301700621843338, "learning_rate": 1e-05, "loss": 0.0438, "step": 95300 }, { "epoch": 0.000954, "grad_norm": 0.3373384475708008, "learning_rate": 1e-05, "loss": 0.0439, "step": 95400 }, { "epoch": 0.000955, "grad_norm": 0.31041210889816284, "learning_rate": 1e-05, "loss": 0.0445, "step": 95500 }, { "epoch": 0.000956, "grad_norm": 0.27122679352760315, "learning_rate": 1e-05, "loss": 0.0425, "step": 95600 }, { "epoch": 0.000957, "grad_norm": 1.2427619695663452, "learning_rate": 1e-05, "loss": 0.0434, "step": 95700 }, { "epoch": 0.000958, "grad_norm": 0.29347118735313416, "learning_rate": 1e-05, "loss": 0.0445, "step": 95800 }, { "epoch": 0.000959, "grad_norm": 0.3172023296356201, "learning_rate": 1e-05, "loss": 0.0439, "step": 95900 }, { "epoch": 0.00096, "grad_norm": 0.5564087629318237, "learning_rate": 1e-05, "loss": 0.0439, "step": 96000 }, { "epoch": 0.000961, "grad_norm": 0.32653549313545227, "learning_rate": 1e-05, "loss": 0.0434, "step": 96100 }, { "epoch": 0.000962, "grad_norm": 0.2726428806781769, "learning_rate": 1e-05, "loss": 0.043, "step": 96200 }, { "epoch": 0.000963, "grad_norm": 0.29731717705726624, "learning_rate": 1e-05, "loss": 0.044, "step": 96300 }, { "epoch": 0.000964, "grad_norm": 0.27266982197761536, "learning_rate": 1e-05, "loss": 0.0432, "step": 96400 }, { "epoch": 0.000965, "grad_norm": 0.42307907342910767, "learning_rate": 1e-05, "loss": 0.0434, "step": 96500 }, { "epoch": 0.000966, "grad_norm": 0.3745628297328949, "learning_rate": 1e-05, "loss": 0.0434, "step": 96600 }, { "epoch": 0.000967, "grad_norm": 0.3205169141292572, "learning_rate": 1e-05, "loss": 0.0433, "step": 96700 }, { "epoch": 0.000968, "grad_norm": 0.26576706767082214, "learning_rate": 1e-05, "loss": 0.0429, "step": 96800 }, { "epoch": 0.000969, "grad_norm": 0.2742008864879608, "learning_rate": 1e-05, "loss": 0.0436, "step": 96900 }, { "epoch": 0.00097, "grad_norm": 0.32406845688819885, "learning_rate": 1e-05, "loss": 0.0429, "step": 97000 }, { "epoch": 0.000971, "grad_norm": 0.36446699500083923, "learning_rate": 1e-05, "loss": 0.0436, "step": 97100 }, { "epoch": 0.000972, "grad_norm": 0.2812323570251465, "learning_rate": 1e-05, "loss": 0.0429, "step": 97200 }, { "epoch": 0.000973, "grad_norm": 0.3129689395427704, "learning_rate": 1e-05, "loss": 0.0431, "step": 97300 }, { "epoch": 0.000974, "grad_norm": 0.27977362275123596, "learning_rate": 1e-05, "loss": 0.0429, "step": 97400 }, { "epoch": 0.000975, "grad_norm": 0.3748459815979004, "learning_rate": 1e-05, "loss": 0.0431, "step": 97500 }, { "epoch": 0.000976, "grad_norm": 0.29172664880752563, "learning_rate": 1e-05, "loss": 0.0435, "step": 97600 }, { "epoch": 0.000977, "grad_norm": 0.28548872470855713, "learning_rate": 1e-05, "loss": 0.043, "step": 97700 }, { "epoch": 0.000978, "grad_norm": 0.31074953079223633, "learning_rate": 1e-05, "loss": 0.0429, "step": 97800 }, { "epoch": 0.000979, "grad_norm": 0.3147795498371124, "learning_rate": 1e-05, "loss": 0.0434, "step": 97900 }, { "epoch": 0.00098, "grad_norm": 0.2891879677772522, "learning_rate": 1e-05, "loss": 0.0429, "step": 98000 }, { "epoch": 0.000981, "grad_norm": 0.253547728061676, "learning_rate": 1e-05, "loss": 0.0433, "step": 98100 }, { "epoch": 0.000982, "grad_norm": 0.6176358461380005, "learning_rate": 1e-05, "loss": 0.0424, "step": 98200 }, { "epoch": 0.000983, "grad_norm": 0.37052324414253235, "learning_rate": 1e-05, "loss": 0.0439, "step": 98300 }, { "epoch": 0.000984, "grad_norm": 0.38643914461135864, "learning_rate": 1e-05, "loss": 0.0432, "step": 98400 }, { "epoch": 0.000985, "grad_norm": 0.28971168398857117, "learning_rate": 1e-05, "loss": 0.0434, "step": 98500 }, { "epoch": 0.000986, "grad_norm": 0.34310775995254517, "learning_rate": 1e-05, "loss": 0.0437, "step": 98600 }, { "epoch": 0.000987, "grad_norm": 0.2895912230014801, "learning_rate": 1e-05, "loss": 0.0432, "step": 98700 }, { "epoch": 0.000988, "grad_norm": 0.3242495656013489, "learning_rate": 1e-05, "loss": 0.0437, "step": 98800 }, { "epoch": 0.000989, "grad_norm": 0.8490259647369385, "learning_rate": 1e-05, "loss": 0.0425, "step": 98900 }, { "epoch": 0.00099, "grad_norm": 0.3516567051410675, "learning_rate": 1e-05, "loss": 0.0435, "step": 99000 }, { "epoch": 0.000991, "grad_norm": 0.2762267589569092, "learning_rate": 1e-05, "loss": 0.0434, "step": 99100 }, { "epoch": 0.000992, "grad_norm": 0.3795773684978485, "learning_rate": 1e-05, "loss": 0.0442, "step": 99200 }, { "epoch": 0.000993, "grad_norm": 0.24358591437339783, "learning_rate": 1e-05, "loss": 0.0433, "step": 99300 }, { "epoch": 0.000994, "grad_norm": 0.31998828053474426, "learning_rate": 1e-05, "loss": 0.0439, "step": 99400 }, { "epoch": 0.000995, "grad_norm": 0.24701328575611115, "learning_rate": 1e-05, "loss": 0.0434, "step": 99500 }, { "epoch": 0.000996, "grad_norm": 0.30590155720710754, "learning_rate": 1e-05, "loss": 0.0427, "step": 99600 }, { "epoch": 0.000997, "grad_norm": 0.3667706549167633, "learning_rate": 1e-05, "loss": 0.0425, "step": 99700 }, { "epoch": 0.000998, "grad_norm": 0.27431920170783997, "learning_rate": 1e-05, "loss": 0.0431, "step": 99800 }, { "epoch": 0.000999, "grad_norm": 0.3364028334617615, "learning_rate": 1e-05, "loss": 0.0429, "step": 99900 }, { "epoch": 0.001, "grad_norm": 0.3014712333679199, "learning_rate": 1e-05, "loss": 0.0425, "step": 100000 }, { "epoch": 0.001, "eval_loss": 0.03758490830659866, "eval_runtime": 212.1366, "eval_samples_per_second": 235.697, "eval_steps_per_second": 14.731, "step": 100000 }, { "epoch": 0.001001, "grad_norm": 0.3600820004940033, "learning_rate": 1e-05, "loss": 0.0431, "step": 100100 }, { "epoch": 0.001002, "grad_norm": 0.3437288701534271, "learning_rate": 1e-05, "loss": 0.0422, "step": 100200 }, { "epoch": 0.001003, "grad_norm": 0.4374743700027466, "learning_rate": 1e-05, "loss": 0.0431, "step": 100300 }, { "epoch": 0.001004, "grad_norm": 0.3906211853027344, "learning_rate": 1e-05, "loss": 0.043, "step": 100400 }, { "epoch": 0.001005, "grad_norm": 0.3004099428653717, "learning_rate": 1e-05, "loss": 0.0425, "step": 100500 }, { "epoch": 0.001006, "grad_norm": 0.3393288850784302, "learning_rate": 1e-05, "loss": 0.0424, "step": 100600 }, { "epoch": 0.001007, "grad_norm": 0.29948556423187256, "learning_rate": 1e-05, "loss": 0.0428, "step": 100700 }, { "epoch": 0.001008, "grad_norm": 0.26268288493156433, "learning_rate": 1e-05, "loss": 0.0418, "step": 100800 }, { "epoch": 0.001009, "grad_norm": 0.3090648055076599, "learning_rate": 1e-05, "loss": 0.0432, "step": 100900 }, { "epoch": 0.00101, "grad_norm": 0.3150724172592163, "learning_rate": 1e-05, "loss": 0.0421, "step": 101000 }, { "epoch": 0.001011, "grad_norm": 0.27027425169944763, "learning_rate": 1e-05, "loss": 0.0429, "step": 101100 }, { "epoch": 0.001012, "grad_norm": 0.2704673409461975, "learning_rate": 1e-05, "loss": 0.0429, "step": 101200 }, { "epoch": 0.001013, "grad_norm": 0.3278999924659729, "learning_rate": 1e-05, "loss": 0.0425, "step": 101300 }, { "epoch": 0.001014, "grad_norm": 0.3307479918003082, "learning_rate": 1e-05, "loss": 0.0429, "step": 101400 }, { "epoch": 0.001015, "grad_norm": 0.25723832845687866, "learning_rate": 1e-05, "loss": 0.0422, "step": 101500 }, { "epoch": 0.001016, "grad_norm": 0.2673822343349457, "learning_rate": 1e-05, "loss": 0.0419, "step": 101600 }, { "epoch": 0.001017, "grad_norm": 0.33123090863227844, "learning_rate": 1e-05, "loss": 0.042, "step": 101700 }, { "epoch": 0.001018, "grad_norm": 0.27778753638267517, "learning_rate": 1e-05, "loss": 0.0421, "step": 101800 }, { "epoch": 0.001019, "grad_norm": 0.3065844774246216, "learning_rate": 1e-05, "loss": 0.041, "step": 101900 }, { "epoch": 0.00102, "grad_norm": 0.2914925217628479, "learning_rate": 1e-05, "loss": 0.0414, "step": 102000 }, { "epoch": 0.001021, "grad_norm": 0.3135759234428406, "learning_rate": 1e-05, "loss": 0.0424, "step": 102100 }, { "epoch": 0.001022, "grad_norm": 0.3547412157058716, "learning_rate": 1e-05, "loss": 0.0428, "step": 102200 }, { "epoch": 0.001023, "grad_norm": 0.4109500050544739, "learning_rate": 1e-05, "loss": 0.0426, "step": 102300 }, { "epoch": 0.001024, "grad_norm": 0.3496847450733185, "learning_rate": 1e-05, "loss": 0.0426, "step": 102400 }, { "epoch": 0.001025, "grad_norm": 0.26087436079978943, "learning_rate": 1e-05, "loss": 0.0427, "step": 102500 }, { "epoch": 0.001026, "grad_norm": 0.2786722481250763, "learning_rate": 1e-05, "loss": 0.0424, "step": 102600 }, { "epoch": 0.001027, "grad_norm": 0.3218824565410614, "learning_rate": 1e-05, "loss": 0.0422, "step": 102700 }, { "epoch": 0.001028, "grad_norm": 0.27059468626976013, "learning_rate": 1e-05, "loss": 0.0422, "step": 102800 }, { "epoch": 0.001029, "grad_norm": 0.4380857050418854, "learning_rate": 1e-05, "loss": 0.0425, "step": 102900 }, { "epoch": 0.00103, "grad_norm": 0.3089819550514221, "learning_rate": 1e-05, "loss": 0.0427, "step": 103000 }, { "epoch": 0.001031, "grad_norm": 0.3011089265346527, "learning_rate": 1e-05, "loss": 0.042, "step": 103100 }, { "epoch": 0.001032, "grad_norm": 0.3286416828632355, "learning_rate": 1e-05, "loss": 0.0429, "step": 103200 }, { "epoch": 0.001033, "grad_norm": 0.24805861711502075, "learning_rate": 1e-05, "loss": 0.0417, "step": 103300 }, { "epoch": 0.001034, "grad_norm": 0.30277788639068604, "learning_rate": 1e-05, "loss": 0.0421, "step": 103400 }, { "epoch": 0.001035, "grad_norm": 0.25480252504348755, "learning_rate": 1e-05, "loss": 0.0424, "step": 103500 }, { "epoch": 0.001036, "grad_norm": 0.3024289906024933, "learning_rate": 1e-05, "loss": 0.0431, "step": 103600 }, { "epoch": 0.001037, "grad_norm": 0.36490127444267273, "learning_rate": 1e-05, "loss": 0.0427, "step": 103700 }, { "epoch": 0.001038, "grad_norm": 0.3333105444908142, "learning_rate": 1e-05, "loss": 0.0423, "step": 103800 }, { "epoch": 0.001039, "grad_norm": 0.4482794404029846, "learning_rate": 1e-05, "loss": 0.0427, "step": 103900 }, { "epoch": 0.00104, "grad_norm": 0.32482078671455383, "learning_rate": 1e-05, "loss": 0.042, "step": 104000 }, { "epoch": 0.001041, "grad_norm": 0.26272276043891907, "learning_rate": 1e-05, "loss": 0.0417, "step": 104100 }, { "epoch": 0.001042, "grad_norm": 0.35563990473747253, "learning_rate": 1e-05, "loss": 0.0409, "step": 104200 }, { "epoch": 0.001043, "grad_norm": 0.26554933190345764, "learning_rate": 1e-05, "loss": 0.0421, "step": 104300 }, { "epoch": 0.001044, "grad_norm": 0.4722619354724884, "learning_rate": 1e-05, "loss": 0.0419, "step": 104400 }, { "epoch": 0.001045, "grad_norm": 0.26303401589393616, "learning_rate": 1e-05, "loss": 0.0419, "step": 104500 }, { "epoch": 0.001046, "grad_norm": 0.25386306643486023, "learning_rate": 1e-05, "loss": 0.0418, "step": 104600 }, { "epoch": 0.001047, "grad_norm": 0.30301883816719055, "learning_rate": 1e-05, "loss": 0.042, "step": 104700 }, { "epoch": 0.001048, "grad_norm": 0.2415623962879181, "learning_rate": 1e-05, "loss": 0.0426, "step": 104800 }, { "epoch": 0.001049, "grad_norm": 0.2489190697669983, "learning_rate": 1e-05, "loss": 0.0424, "step": 104900 }, { "epoch": 0.00105, "grad_norm": 0.31197717785835266, "learning_rate": 1e-05, "loss": 0.0422, "step": 105000 }, { "epoch": 0.001051, "grad_norm": 0.4588204622268677, "learning_rate": 1e-05, "loss": 0.0419, "step": 105100 }, { "epoch": 0.001052, "grad_norm": 0.3295010030269623, "learning_rate": 1e-05, "loss": 0.0419, "step": 105200 }, { "epoch": 0.001053, "grad_norm": 0.44823509454727173, "learning_rate": 1e-05, "loss": 0.0425, "step": 105300 }, { "epoch": 0.001054, "grad_norm": 0.3709229826927185, "learning_rate": 1e-05, "loss": 0.0412, "step": 105400 }, { "epoch": 0.001055, "grad_norm": 0.42086902260780334, "learning_rate": 1e-05, "loss": 0.0414, "step": 105500 }, { "epoch": 0.001056, "grad_norm": 0.33265379071235657, "learning_rate": 1e-05, "loss": 0.0418, "step": 105600 }, { "epoch": 0.001057, "grad_norm": 0.36545461416244507, "learning_rate": 1e-05, "loss": 0.0416, "step": 105700 }, { "epoch": 0.001058, "grad_norm": 0.3076435327529907, "learning_rate": 1e-05, "loss": 0.0415, "step": 105800 }, { "epoch": 0.001059, "grad_norm": 0.3301655352115631, "learning_rate": 1e-05, "loss": 0.0416, "step": 105900 }, { "epoch": 0.00106, "grad_norm": 0.41150417923927307, "learning_rate": 1e-05, "loss": 0.0413, "step": 106000 }, { "epoch": 0.001061, "grad_norm": 0.2820735275745392, "learning_rate": 1e-05, "loss": 0.0424, "step": 106100 }, { "epoch": 0.001062, "grad_norm": 0.35754042863845825, "learning_rate": 1e-05, "loss": 0.0416, "step": 106200 }, { "epoch": 0.001063, "grad_norm": 0.23662069439888, "learning_rate": 1e-05, "loss": 0.0419, "step": 106300 }, { "epoch": 0.001064, "grad_norm": 0.43722280859947205, "learning_rate": 1e-05, "loss": 0.0421, "step": 106400 }, { "epoch": 0.001065, "grad_norm": 0.315915584564209, "learning_rate": 1e-05, "loss": 0.0413, "step": 106500 }, { "epoch": 0.001066, "grad_norm": 0.29390791058540344, "learning_rate": 1e-05, "loss": 0.0415, "step": 106600 }, { "epoch": 0.001067, "grad_norm": 0.32245537638664246, "learning_rate": 1e-05, "loss": 0.0414, "step": 106700 }, { "epoch": 0.001068, "grad_norm": 0.30486103892326355, "learning_rate": 1e-05, "loss": 0.0416, "step": 106800 }, { "epoch": 0.001069, "grad_norm": 0.3411523103713989, "learning_rate": 1e-05, "loss": 0.0416, "step": 106900 }, { "epoch": 0.00107, "grad_norm": 0.34101754426956177, "learning_rate": 1e-05, "loss": 0.0414, "step": 107000 }, { "epoch": 0.001071, "grad_norm": 0.2802806794643402, "learning_rate": 1e-05, "loss": 0.0414, "step": 107100 }, { "epoch": 0.001072, "grad_norm": 0.2565758228302002, "learning_rate": 1e-05, "loss": 0.0417, "step": 107200 }, { "epoch": 0.001073, "grad_norm": 0.5222147703170776, "learning_rate": 1e-05, "loss": 0.0412, "step": 107300 }, { "epoch": 0.001074, "grad_norm": 0.3869835138320923, "learning_rate": 1e-05, "loss": 0.041, "step": 107400 }, { "epoch": 0.001075, "grad_norm": 0.270320326089859, "learning_rate": 1e-05, "loss": 0.0413, "step": 107500 }, { "epoch": 0.001076, "grad_norm": 0.32045966386795044, "learning_rate": 1e-05, "loss": 0.0422, "step": 107600 }, { "epoch": 0.001077, "grad_norm": 0.2832574248313904, "learning_rate": 1e-05, "loss": 0.0419, "step": 107700 }, { "epoch": 0.001078, "grad_norm": 0.27641215920448303, "learning_rate": 1e-05, "loss": 0.0415, "step": 107800 }, { "epoch": 0.001079, "grad_norm": 0.32512393593788147, "learning_rate": 1e-05, "loss": 0.0422, "step": 107900 }, { "epoch": 0.00108, "grad_norm": 0.27362582087516785, "learning_rate": 1e-05, "loss": 0.0422, "step": 108000 }, { "epoch": 0.001081, "grad_norm": 0.24896275997161865, "learning_rate": 1e-05, "loss": 0.0419, "step": 108100 }, { "epoch": 0.001082, "grad_norm": 0.46317827701568604, "learning_rate": 1e-05, "loss": 0.0415, "step": 108200 }, { "epoch": 0.001083, "grad_norm": 0.2799619436264038, "learning_rate": 1e-05, "loss": 0.0408, "step": 108300 }, { "epoch": 0.001084, "grad_norm": 0.33896782994270325, "learning_rate": 1e-05, "loss": 0.0412, "step": 108400 }, { "epoch": 0.001085, "grad_norm": 0.2619785666465759, "learning_rate": 1e-05, "loss": 0.0415, "step": 108500 }, { "epoch": 0.001086, "grad_norm": 0.27988171577453613, "learning_rate": 1e-05, "loss": 0.0414, "step": 108600 }, { "epoch": 0.001087, "grad_norm": 0.3690216541290283, "learning_rate": 1e-05, "loss": 0.042, "step": 108700 }, { "epoch": 0.001088, "grad_norm": 0.2924842834472656, "learning_rate": 1e-05, "loss": 0.0414, "step": 108800 }, { "epoch": 0.001089, "grad_norm": 0.3515413999557495, "learning_rate": 1e-05, "loss": 0.0413, "step": 108900 }, { "epoch": 0.00109, "grad_norm": 0.3179611563682556, "learning_rate": 1e-05, "loss": 0.041, "step": 109000 }, { "epoch": 0.001091, "grad_norm": 0.28372862935066223, "learning_rate": 1e-05, "loss": 0.041, "step": 109100 }, { "epoch": 0.001092, "grad_norm": 0.30155834555625916, "learning_rate": 1e-05, "loss": 0.0415, "step": 109200 }, { "epoch": 0.001093, "grad_norm": 0.3081270754337311, "learning_rate": 1e-05, "loss": 0.0411, "step": 109300 }, { "epoch": 0.001094, "grad_norm": 0.32964420318603516, "learning_rate": 1e-05, "loss": 0.0407, "step": 109400 }, { "epoch": 0.001095, "grad_norm": 0.31642162799835205, "learning_rate": 1e-05, "loss": 0.0407, "step": 109500 }, { "epoch": 0.001096, "grad_norm": 0.23503410816192627, "learning_rate": 1e-05, "loss": 0.0408, "step": 109600 }, { "epoch": 0.001097, "grad_norm": 0.42588627338409424, "learning_rate": 1e-05, "loss": 0.0423, "step": 109700 }, { "epoch": 0.001098, "grad_norm": 0.3393394351005554, "learning_rate": 1e-05, "loss": 0.0408, "step": 109800 }, { "epoch": 0.001099, "grad_norm": 0.31233447790145874, "learning_rate": 1e-05, "loss": 0.041, "step": 109900 }, { "epoch": 0.0011, "grad_norm": 0.28448012471199036, "learning_rate": 1e-05, "loss": 0.0406, "step": 110000 }, { "epoch": 0.001101, "grad_norm": 0.24423803389072418, "learning_rate": 1e-05, "loss": 0.0413, "step": 110100 }, { "epoch": 0.001102, "grad_norm": 0.29771897196769714, "learning_rate": 1e-05, "loss": 0.0409, "step": 110200 }, { "epoch": 0.001103, "grad_norm": 0.3095645010471344, "learning_rate": 1e-05, "loss": 0.0403, "step": 110300 }, { "epoch": 0.001104, "grad_norm": 0.2559695541858673, "learning_rate": 1e-05, "loss": 0.0406, "step": 110400 }, { "epoch": 0.001105, "grad_norm": 0.4614701271057129, "learning_rate": 1e-05, "loss": 0.0414, "step": 110500 }, { "epoch": 0.001106, "grad_norm": 0.3098299205303192, "learning_rate": 1e-05, "loss": 0.0405, "step": 110600 }, { "epoch": 0.001107, "grad_norm": 0.3528378903865814, "learning_rate": 1e-05, "loss": 0.0415, "step": 110700 }, { "epoch": 0.001108, "grad_norm": 0.2913011312484741, "learning_rate": 1e-05, "loss": 0.0406, "step": 110800 }, { "epoch": 0.001109, "grad_norm": 0.3564182221889496, "learning_rate": 1e-05, "loss": 0.0413, "step": 110900 }, { "epoch": 0.00111, "grad_norm": 0.29625824093818665, "learning_rate": 1e-05, "loss": 0.0411, "step": 111000 }, { "epoch": 0.001111, "grad_norm": 0.2920742630958557, "learning_rate": 1e-05, "loss": 0.0409, "step": 111100 }, { "epoch": 0.001112, "grad_norm": 0.34061306715011597, "learning_rate": 1e-05, "loss": 0.0415, "step": 111200 }, { "epoch": 0.001113, "grad_norm": 0.27325764298439026, "learning_rate": 1e-05, "loss": 0.0409, "step": 111300 }, { "epoch": 0.001114, "grad_norm": 0.3565654754638672, "learning_rate": 1e-05, "loss": 0.0407, "step": 111400 }, { "epoch": 0.001115, "grad_norm": 0.27489152550697327, "learning_rate": 1e-05, "loss": 0.0409, "step": 111500 }, { "epoch": 0.001116, "grad_norm": 0.29584425687789917, "learning_rate": 1e-05, "loss": 0.0409, "step": 111600 }, { "epoch": 0.001117, "grad_norm": 0.289985328912735, "learning_rate": 1e-05, "loss": 0.0411, "step": 111700 }, { "epoch": 0.001118, "grad_norm": 0.2522970139980316, "learning_rate": 1e-05, "loss": 0.0394, "step": 111800 }, { "epoch": 0.001119, "grad_norm": 0.48581740260124207, "learning_rate": 1e-05, "loss": 0.0406, "step": 111900 }, { "epoch": 0.00112, "grad_norm": 0.3156406581401825, "learning_rate": 1e-05, "loss": 0.0409, "step": 112000 }, { "epoch": 0.001121, "grad_norm": 0.3361566960811615, "learning_rate": 1e-05, "loss": 0.0407, "step": 112100 }, { "epoch": 0.001122, "grad_norm": 0.33062365651130676, "learning_rate": 1e-05, "loss": 0.0406, "step": 112200 }, { "epoch": 0.001123, "grad_norm": 0.3271249532699585, "learning_rate": 1e-05, "loss": 0.0412, "step": 112300 }, { "epoch": 0.001124, "grad_norm": 0.4379163086414337, "learning_rate": 1e-05, "loss": 0.0404, "step": 112400 }, { "epoch": 0.001125, "grad_norm": 0.3108104467391968, "learning_rate": 1e-05, "loss": 0.0402, "step": 112500 }, { "epoch": 0.001126, "grad_norm": 0.25851625204086304, "learning_rate": 1e-05, "loss": 0.0416, "step": 112600 }, { "epoch": 0.001127, "grad_norm": 0.24953243136405945, "learning_rate": 1e-05, "loss": 0.0409, "step": 112700 }, { "epoch": 0.001128, "grad_norm": 0.23491275310516357, "learning_rate": 1e-05, "loss": 0.0412, "step": 112800 }, { "epoch": 0.001129, "grad_norm": 0.2658729553222656, "learning_rate": 1e-05, "loss": 0.0407, "step": 112900 }, { "epoch": 0.00113, "grad_norm": 0.31635645031929016, "learning_rate": 1e-05, "loss": 0.0407, "step": 113000 }, { "epoch": 0.001131, "grad_norm": 0.3112212121486664, "learning_rate": 1e-05, "loss": 0.0416, "step": 113100 }, { "epoch": 0.001132, "grad_norm": 0.34162387251853943, "learning_rate": 1e-05, "loss": 0.0411, "step": 113200 }, { "epoch": 0.001133, "grad_norm": 0.3131328225135803, "learning_rate": 1e-05, "loss": 0.0407, "step": 113300 }, { "epoch": 0.001134, "grad_norm": 0.2678537964820862, "learning_rate": 1e-05, "loss": 0.0409, "step": 113400 }, { "epoch": 0.001135, "grad_norm": 0.2659609913825989, "learning_rate": 1e-05, "loss": 0.0402, "step": 113500 }, { "epoch": 0.001136, "grad_norm": 0.2517271935939789, "learning_rate": 1e-05, "loss": 0.0405, "step": 113600 }, { "epoch": 0.001137, "grad_norm": 0.4701042175292969, "learning_rate": 1e-05, "loss": 0.0407, "step": 113700 }, { "epoch": 0.001138, "grad_norm": 0.2426786869764328, "learning_rate": 1e-05, "loss": 0.04, "step": 113800 }, { "epoch": 0.001139, "grad_norm": 0.42582666873931885, "learning_rate": 1e-05, "loss": 0.0404, "step": 113900 }, { "epoch": 0.00114, "grad_norm": 0.2715524733066559, "learning_rate": 1e-05, "loss": 0.0405, "step": 114000 }, { "epoch": 0.001141, "grad_norm": 0.31258702278137207, "learning_rate": 1e-05, "loss": 0.0409, "step": 114100 }, { "epoch": 0.001142, "grad_norm": 0.3225792944431305, "learning_rate": 1e-05, "loss": 0.0411, "step": 114200 }, { "epoch": 0.001143, "grad_norm": 0.25024500489234924, "learning_rate": 1e-05, "loss": 0.0406, "step": 114300 }, { "epoch": 0.001144, "grad_norm": 0.2840322256088257, "learning_rate": 1e-05, "loss": 0.0403, "step": 114400 }, { "epoch": 0.001145, "grad_norm": 0.30635979771614075, "learning_rate": 1e-05, "loss": 0.0398, "step": 114500 }, { "epoch": 0.001146, "grad_norm": 0.364570289850235, "learning_rate": 1e-05, "loss": 0.0414, "step": 114600 }, { "epoch": 0.001147, "grad_norm": 0.3620624840259552, "learning_rate": 1e-05, "loss": 0.04, "step": 114700 }, { "epoch": 0.001148, "grad_norm": 0.3164653480052948, "learning_rate": 1e-05, "loss": 0.0401, "step": 114800 }, { "epoch": 0.001149, "grad_norm": 0.2977074384689331, "learning_rate": 1e-05, "loss": 0.0404, "step": 114900 }, { "epoch": 0.00115, "grad_norm": 0.2921755909919739, "learning_rate": 1e-05, "loss": 0.041, "step": 115000 }, { "epoch": 0.001151, "grad_norm": 0.27668341994285583, "learning_rate": 1e-05, "loss": 0.0403, "step": 115100 }, { "epoch": 0.001152, "grad_norm": 0.333588182926178, "learning_rate": 1e-05, "loss": 0.0406, "step": 115200 }, { "epoch": 0.001153, "grad_norm": 0.26749905943870544, "learning_rate": 1e-05, "loss": 0.0403, "step": 115300 }, { "epoch": 0.001154, "grad_norm": 0.2865462899208069, "learning_rate": 1e-05, "loss": 0.0405, "step": 115400 }, { "epoch": 0.001155, "grad_norm": 0.29916709661483765, "learning_rate": 1e-05, "loss": 0.0398, "step": 115500 }, { "epoch": 0.001156, "grad_norm": 0.3161410987377167, "learning_rate": 1e-05, "loss": 0.0392, "step": 115600 }, { "epoch": 0.001157, "grad_norm": 0.29946115612983704, "learning_rate": 1e-05, "loss": 0.0407, "step": 115700 }, { "epoch": 0.001158, "grad_norm": 0.2623734474182129, "learning_rate": 1e-05, "loss": 0.0408, "step": 115800 }, { "epoch": 0.001159, "grad_norm": 0.2578590214252472, "learning_rate": 1e-05, "loss": 0.0401, "step": 115900 }, { "epoch": 0.00116, "grad_norm": 0.25838586688041687, "learning_rate": 1e-05, "loss": 0.0393, "step": 116000 }, { "epoch": 0.001161, "grad_norm": 0.3004215657711029, "learning_rate": 1e-05, "loss": 0.0394, "step": 116100 }, { "epoch": 0.001162, "grad_norm": 0.2695614993572235, "learning_rate": 1e-05, "loss": 0.0394, "step": 116200 }, { "epoch": 0.001163, "grad_norm": 0.31523579359054565, "learning_rate": 1e-05, "loss": 0.0408, "step": 116300 }, { "epoch": 0.001164, "grad_norm": 0.23632577061653137, "learning_rate": 1e-05, "loss": 0.0406, "step": 116400 }, { "epoch": 0.001165, "grad_norm": 0.3339804708957672, "learning_rate": 1e-05, "loss": 0.0405, "step": 116500 }, { "epoch": 0.001166, "grad_norm": 0.28059354424476624, "learning_rate": 1e-05, "loss": 0.0393, "step": 116600 }, { "epoch": 0.001167, "grad_norm": 0.2590678036212921, "learning_rate": 1e-05, "loss": 0.0402, "step": 116700 }, { "epoch": 0.001168, "grad_norm": 0.24643708765506744, "learning_rate": 1e-05, "loss": 0.0398, "step": 116800 }, { "epoch": 0.001169, "grad_norm": 0.2586285471916199, "learning_rate": 1e-05, "loss": 0.0395, "step": 116900 }, { "epoch": 0.00117, "grad_norm": 0.3024047911167145, "learning_rate": 1e-05, "loss": 0.0399, "step": 117000 }, { "epoch": 0.001171, "grad_norm": 0.30159011483192444, "learning_rate": 1e-05, "loss": 0.0403, "step": 117100 }, { "epoch": 0.001172, "grad_norm": 0.3137671649456024, "learning_rate": 1e-05, "loss": 0.0405, "step": 117200 }, { "epoch": 0.001173, "grad_norm": 0.3134880065917969, "learning_rate": 1e-05, "loss": 0.04, "step": 117300 }, { "epoch": 0.001174, "grad_norm": 0.28703588247299194, "learning_rate": 1e-05, "loss": 0.04, "step": 117400 }, { "epoch": 0.001175, "grad_norm": 0.2983036935329437, "learning_rate": 1e-05, "loss": 0.0406, "step": 117500 }, { "epoch": 0.001176, "grad_norm": 0.3457993268966675, "learning_rate": 1e-05, "loss": 0.04, "step": 117600 }, { "epoch": 0.001177, "grad_norm": 0.42219504714012146, "learning_rate": 1e-05, "loss": 0.0399, "step": 117700 }, { "epoch": 0.001178, "grad_norm": 0.25757643580436707, "learning_rate": 1e-05, "loss": 0.0397, "step": 117800 }, { "epoch": 0.001179, "grad_norm": 0.2792271673679352, "learning_rate": 1e-05, "loss": 0.0388, "step": 117900 }, { "epoch": 0.00118, "grad_norm": 0.2346213012933731, "learning_rate": 1e-05, "loss": 0.0399, "step": 118000 }, { "epoch": 0.001181, "grad_norm": 0.2374030202627182, "learning_rate": 1e-05, "loss": 0.0391, "step": 118100 }, { "epoch": 0.001182, "grad_norm": 0.4347716271877289, "learning_rate": 1e-05, "loss": 0.0403, "step": 118200 }, { "epoch": 0.001183, "grad_norm": 0.26426148414611816, "learning_rate": 1e-05, "loss": 0.0397, "step": 118300 }, { "epoch": 0.001184, "grad_norm": 0.22731809318065643, "learning_rate": 1e-05, "loss": 0.0406, "step": 118400 }, { "epoch": 0.001185, "grad_norm": 0.28672975301742554, "learning_rate": 1e-05, "loss": 0.0396, "step": 118500 }, { "epoch": 0.001186, "grad_norm": 0.3868153393268585, "learning_rate": 1e-05, "loss": 0.0404, "step": 118600 }, { "epoch": 0.001187, "grad_norm": 0.348033607006073, "learning_rate": 1e-05, "loss": 0.0404, "step": 118700 }, { "epoch": 0.001188, "grad_norm": 0.23416511714458466, "learning_rate": 1e-05, "loss": 0.0395, "step": 118800 }, { "epoch": 0.001189, "grad_norm": 0.5253860354423523, "learning_rate": 1e-05, "loss": 0.0402, "step": 118900 }, { "epoch": 0.00119, "grad_norm": 0.2285417914390564, "learning_rate": 1e-05, "loss": 0.0401, "step": 119000 }, { "epoch": 0.001191, "grad_norm": 0.38372063636779785, "learning_rate": 1e-05, "loss": 0.0392, "step": 119100 }, { "epoch": 0.001192, "grad_norm": 0.38462480902671814, "learning_rate": 1e-05, "loss": 0.0394, "step": 119200 }, { "epoch": 0.001193, "grad_norm": 0.2637360990047455, "learning_rate": 1e-05, "loss": 0.0398, "step": 119300 }, { "epoch": 0.001194, "grad_norm": 0.2682214081287384, "learning_rate": 1e-05, "loss": 0.0399, "step": 119400 }, { "epoch": 0.001195, "grad_norm": 0.29552289843559265, "learning_rate": 1e-05, "loss": 0.0398, "step": 119500 }, { "epoch": 0.001196, "grad_norm": 0.31739193201065063, "learning_rate": 1e-05, "loss": 0.0393, "step": 119600 }, { "epoch": 0.001197, "grad_norm": 0.3245454430580139, "learning_rate": 1e-05, "loss": 0.0402, "step": 119700 }, { "epoch": 0.001198, "grad_norm": 0.2770306468009949, "learning_rate": 1e-05, "loss": 0.0397, "step": 119800 }, { "epoch": 0.001199, "grad_norm": 0.2661876976490021, "learning_rate": 1e-05, "loss": 0.0395, "step": 119900 }, { "epoch": 0.0012, "grad_norm": 0.3718849718570709, "learning_rate": 1e-05, "loss": 0.0399, "step": 120000 }, { "epoch": 0.0012, "eval_loss": 0.033286646008491516, "eval_runtime": 173.058, "eval_samples_per_second": 288.921, "eval_steps_per_second": 18.058, "step": 120000 }, { "epoch": 0.001201, "grad_norm": 0.30990082025527954, "learning_rate": 1e-05, "loss": 0.0403, "step": 120100 }, { "epoch": 0.001202, "grad_norm": 0.3355345129966736, "learning_rate": 1e-05, "loss": 0.0389, "step": 120200 }, { "epoch": 0.001203, "grad_norm": 0.3418188691139221, "learning_rate": 1e-05, "loss": 0.0395, "step": 120300 }, { "epoch": 0.001204, "grad_norm": 0.2886827290058136, "learning_rate": 1e-05, "loss": 0.04, "step": 120400 }, { "epoch": 0.001205, "grad_norm": 0.373727411031723, "learning_rate": 1e-05, "loss": 0.0393, "step": 120500 }, { "epoch": 0.001206, "grad_norm": 0.29651013016700745, "learning_rate": 1e-05, "loss": 0.0389, "step": 120600 }, { "epoch": 0.001207, "grad_norm": 0.23777133226394653, "learning_rate": 1e-05, "loss": 0.0394, "step": 120700 }, { "epoch": 0.001208, "grad_norm": 0.3108882009983063, "learning_rate": 1e-05, "loss": 0.04, "step": 120800 }, { "epoch": 0.001209, "grad_norm": 0.40241652727127075, "learning_rate": 1e-05, "loss": 0.0389, "step": 120900 }, { "epoch": 0.00121, "grad_norm": 0.43079206347465515, "learning_rate": 1e-05, "loss": 0.0394, "step": 121000 }, { "epoch": 0.001211, "grad_norm": 0.27280592918395996, "learning_rate": 1e-05, "loss": 0.039, "step": 121100 }, { "epoch": 0.001212, "grad_norm": 0.27316394448280334, "learning_rate": 1e-05, "loss": 0.0391, "step": 121200 }, { "epoch": 0.001213, "grad_norm": 0.39037245512008667, "learning_rate": 1e-05, "loss": 0.0403, "step": 121300 }, { "epoch": 0.001214, "grad_norm": 0.3806420862674713, "learning_rate": 1e-05, "loss": 0.0393, "step": 121400 }, { "epoch": 0.001215, "grad_norm": 0.3074783384799957, "learning_rate": 1e-05, "loss": 0.0399, "step": 121500 }, { "epoch": 0.001216, "grad_norm": 0.28789782524108887, "learning_rate": 1e-05, "loss": 0.0401, "step": 121600 }, { "epoch": 0.001217, "grad_norm": 0.295183002948761, "learning_rate": 1e-05, "loss": 0.0392, "step": 121700 }, { "epoch": 0.001218, "grad_norm": 0.2700131833553314, "learning_rate": 1e-05, "loss": 0.0396, "step": 121800 }, { "epoch": 0.001219, "grad_norm": 0.4575229287147522, "learning_rate": 1e-05, "loss": 0.0396, "step": 121900 }, { "epoch": 0.00122, "grad_norm": 0.26459062099456787, "learning_rate": 1e-05, "loss": 0.0395, "step": 122000 }, { "epoch": 0.001221, "grad_norm": 0.25325772166252136, "learning_rate": 1e-05, "loss": 0.0392, "step": 122100 }, { "epoch": 0.001222, "grad_norm": 0.2668212652206421, "learning_rate": 1e-05, "loss": 0.0382, "step": 122200 }, { "epoch": 0.001223, "grad_norm": 0.21850472688674927, "learning_rate": 1e-05, "loss": 0.0393, "step": 122300 }, { "epoch": 0.001224, "grad_norm": 0.27605777978897095, "learning_rate": 1e-05, "loss": 0.0391, "step": 122400 }, { "epoch": 0.001225, "grad_norm": 0.3576308488845825, "learning_rate": 1e-05, "loss": 0.0388, "step": 122500 }, { "epoch": 0.001226, "grad_norm": 0.24588488042354584, "learning_rate": 1e-05, "loss": 0.0388, "step": 122600 }, { "epoch": 0.001227, "grad_norm": 0.26362577080726624, "learning_rate": 1e-05, "loss": 0.0395, "step": 122700 }, { "epoch": 0.001228, "grad_norm": 0.35819968581199646, "learning_rate": 1e-05, "loss": 0.0385, "step": 122800 }, { "epoch": 0.001229, "grad_norm": 0.2863531708717346, "learning_rate": 1e-05, "loss": 0.0387, "step": 122900 }, { "epoch": 0.00123, "grad_norm": 0.27627554535865784, "learning_rate": 1e-05, "loss": 0.0391, "step": 123000 }, { "epoch": 0.001231, "grad_norm": 0.2435821294784546, "learning_rate": 1e-05, "loss": 0.0386, "step": 123100 }, { "epoch": 0.001232, "grad_norm": 0.33664873242378235, "learning_rate": 1e-05, "loss": 0.0386, "step": 123200 }, { "epoch": 0.001233, "grad_norm": 0.28725332021713257, "learning_rate": 1e-05, "loss": 0.0392, "step": 123300 }, { "epoch": 0.001234, "grad_norm": 0.2943493127822876, "learning_rate": 1e-05, "loss": 0.0399, "step": 123400 }, { "epoch": 0.001235, "grad_norm": 0.39345744252204895, "learning_rate": 1e-05, "loss": 0.0388, "step": 123500 }, { "epoch": 0.001236, "grad_norm": 0.3676437735557556, "learning_rate": 1e-05, "loss": 0.0397, "step": 123600 }, { "epoch": 0.001237, "grad_norm": 0.30897441506385803, "learning_rate": 1e-05, "loss": 0.0391, "step": 123700 }, { "epoch": 0.001238, "grad_norm": 0.3308824300765991, "learning_rate": 1e-05, "loss": 0.0392, "step": 123800 }, { "epoch": 0.001239, "grad_norm": 0.3199542164802551, "learning_rate": 1e-05, "loss": 0.039, "step": 123900 }, { "epoch": 0.00124, "grad_norm": 0.2804358899593353, "learning_rate": 1e-05, "loss": 0.0389, "step": 124000 }, { "epoch": 0.001241, "grad_norm": 0.27048102021217346, "learning_rate": 1e-05, "loss": 0.0388, "step": 124100 }, { "epoch": 0.001242, "grad_norm": 0.3876795172691345, "learning_rate": 1e-05, "loss": 0.0391, "step": 124200 }, { "epoch": 0.001243, "grad_norm": 0.28901731967926025, "learning_rate": 1e-05, "loss": 0.0381, "step": 124300 }, { "epoch": 0.001244, "grad_norm": 0.29046908020973206, "learning_rate": 1e-05, "loss": 0.039, "step": 124400 }, { "epoch": 0.001245, "grad_norm": 0.3421037495136261, "learning_rate": 1e-05, "loss": 0.0394, "step": 124500 }, { "epoch": 0.001246, "grad_norm": 0.22454392910003662, "learning_rate": 1e-05, "loss": 0.0386, "step": 124600 }, { "epoch": 0.001247, "grad_norm": 0.24638046324253082, "learning_rate": 1e-05, "loss": 0.0387, "step": 124700 }, { "epoch": 0.001248, "grad_norm": 0.2888917028903961, "learning_rate": 1e-05, "loss": 0.0386, "step": 124800 }, { "epoch": 0.001249, "grad_norm": 0.5723574757575989, "learning_rate": 1e-05, "loss": 0.0389, "step": 124900 }, { "epoch": 0.00125, "grad_norm": 0.3907880187034607, "learning_rate": 1e-05, "loss": 0.0386, "step": 125000 }, { "epoch": 0.001251, "grad_norm": 0.3683117926120758, "learning_rate": 1e-05, "loss": 0.0392, "step": 125100 }, { "epoch": 0.001252, "grad_norm": 0.2724876403808594, "learning_rate": 1e-05, "loss": 0.0388, "step": 125200 }, { "epoch": 0.001253, "grad_norm": 0.2940162718296051, "learning_rate": 1e-05, "loss": 0.039, "step": 125300 }, { "epoch": 0.001254, "grad_norm": 0.31821408867836, "learning_rate": 1e-05, "loss": 0.039, "step": 125400 }, { "epoch": 0.001255, "grad_norm": 0.33350062370300293, "learning_rate": 1e-05, "loss": 0.0383, "step": 125500 }, { "epoch": 0.001256, "grad_norm": 0.4433381259441376, "learning_rate": 1e-05, "loss": 0.039, "step": 125600 }, { "epoch": 0.001257, "grad_norm": 0.23397083580493927, "learning_rate": 1e-05, "loss": 0.0382, "step": 125700 }, { "epoch": 0.001258, "grad_norm": 0.27680906653404236, "learning_rate": 1e-05, "loss": 0.0381, "step": 125800 }, { "epoch": 0.001259, "grad_norm": 0.2746189534664154, "learning_rate": 1e-05, "loss": 0.0389, "step": 125900 }, { "epoch": 0.00126, "grad_norm": 0.2551538646221161, "learning_rate": 1e-05, "loss": 0.0389, "step": 126000 }, { "epoch": 0.001261, "grad_norm": 0.30969518423080444, "learning_rate": 1e-05, "loss": 0.0386, "step": 126100 }, { "epoch": 0.001262, "grad_norm": 0.3261389434337616, "learning_rate": 1e-05, "loss": 0.0391, "step": 126200 }, { "epoch": 0.001263, "grad_norm": 0.31876540184020996, "learning_rate": 1e-05, "loss": 0.0389, "step": 126300 }, { "epoch": 0.001264, "grad_norm": 0.25880858302116394, "learning_rate": 1e-05, "loss": 0.0387, "step": 126400 }, { "epoch": 0.001265, "grad_norm": 0.2761210799217224, "learning_rate": 1e-05, "loss": 0.0391, "step": 126500 }, { "epoch": 0.001266, "grad_norm": 0.2946438789367676, "learning_rate": 1e-05, "loss": 0.0392, "step": 126600 }, { "epoch": 0.001267, "grad_norm": 0.4052491784095764, "learning_rate": 1e-05, "loss": 0.0394, "step": 126700 }, { "epoch": 0.001268, "grad_norm": 0.2996732294559479, "learning_rate": 1e-05, "loss": 0.0391, "step": 126800 }, { "epoch": 0.001269, "grad_norm": 0.3514162003993988, "learning_rate": 1e-05, "loss": 0.0395, "step": 126900 }, { "epoch": 0.00127, "grad_norm": 0.39903080463409424, "learning_rate": 1e-05, "loss": 0.0383, "step": 127000 }, { "epoch": 0.001271, "grad_norm": 0.40052351355552673, "learning_rate": 1e-05, "loss": 0.0385, "step": 127100 }, { "epoch": 0.001272, "grad_norm": 0.2332727313041687, "learning_rate": 1e-05, "loss": 0.0385, "step": 127200 }, { "epoch": 0.001273, "grad_norm": 0.35858142375946045, "learning_rate": 1e-05, "loss": 0.0381, "step": 127300 }, { "epoch": 0.001274, "grad_norm": 0.27403345704078674, "learning_rate": 1e-05, "loss": 0.0389, "step": 127400 }, { "epoch": 0.001275, "grad_norm": 0.3137023448944092, "learning_rate": 1e-05, "loss": 0.0392, "step": 127500 }, { "epoch": 0.001276, "grad_norm": 0.3528238534927368, "learning_rate": 1e-05, "loss": 0.0386, "step": 127600 }, { "epoch": 0.001277, "grad_norm": 0.2936684191226959, "learning_rate": 1e-05, "loss": 0.0386, "step": 127700 }, { "epoch": 0.001278, "grad_norm": 0.25667810440063477, "learning_rate": 1e-05, "loss": 0.0385, "step": 127800 }, { "epoch": 0.001279, "grad_norm": 0.41605308651924133, "learning_rate": 1e-05, "loss": 0.0384, "step": 127900 }, { "epoch": 0.00128, "grad_norm": 0.23532454669475555, "learning_rate": 1e-05, "loss": 0.0389, "step": 128000 }, { "epoch": 0.001281, "grad_norm": 0.2681902050971985, "learning_rate": 1e-05, "loss": 0.0384, "step": 128100 }, { "epoch": 0.001282, "grad_norm": 0.33489149808883667, "learning_rate": 1e-05, "loss": 0.0379, "step": 128200 }, { "epoch": 0.001283, "grad_norm": 0.39637380838394165, "learning_rate": 1e-05, "loss": 0.0386, "step": 128300 }, { "epoch": 0.001284, "grad_norm": 0.36724480986595154, "learning_rate": 1e-05, "loss": 0.0383, "step": 128400 }, { "epoch": 0.001285, "grad_norm": 0.23808501660823822, "learning_rate": 1e-05, "loss": 0.0379, "step": 128500 }, { "epoch": 0.001286, "grad_norm": 0.4511736333370209, "learning_rate": 1e-05, "loss": 0.0383, "step": 128600 }, { "epoch": 0.001287, "grad_norm": 0.3220783770084381, "learning_rate": 1e-05, "loss": 0.0392, "step": 128700 }, { "epoch": 0.001288, "grad_norm": 0.2793736755847931, "learning_rate": 1e-05, "loss": 0.0386, "step": 128800 }, { "epoch": 0.001289, "grad_norm": 0.24454036355018616, "learning_rate": 1e-05, "loss": 0.0386, "step": 128900 }, { "epoch": 0.00129, "grad_norm": 0.3350345492362976, "learning_rate": 1e-05, "loss": 0.0384, "step": 129000 }, { "epoch": 0.001291, "grad_norm": 0.21636413037776947, "learning_rate": 1e-05, "loss": 0.0387, "step": 129100 }, { "epoch": 0.001292, "grad_norm": 0.3326009213924408, "learning_rate": 1e-05, "loss": 0.0383, "step": 129200 }, { "epoch": 0.001293, "grad_norm": 0.2936112582683563, "learning_rate": 1e-05, "loss": 0.0388, "step": 129300 }, { "epoch": 0.001294, "grad_norm": 0.3892439007759094, "learning_rate": 1e-05, "loss": 0.0386, "step": 129400 }, { "epoch": 0.001295, "grad_norm": 0.41272836923599243, "learning_rate": 1e-05, "loss": 0.0382, "step": 129500 }, { "epoch": 0.001296, "grad_norm": 0.34827834367752075, "learning_rate": 1e-05, "loss": 0.0383, "step": 129600 }, { "epoch": 0.001297, "grad_norm": 0.5001487135887146, "learning_rate": 1e-05, "loss": 0.0385, "step": 129700 }, { "epoch": 0.001298, "grad_norm": 0.3431733548641205, "learning_rate": 1e-05, "loss": 0.0378, "step": 129800 }, { "epoch": 0.001299, "grad_norm": 0.3033442497253418, "learning_rate": 1e-05, "loss": 0.038, "step": 129900 }, { "epoch": 0.0013, "grad_norm": 0.4297935962677002, "learning_rate": 1e-05, "loss": 0.0381, "step": 130000 }, { "epoch": 0.001301, "grad_norm": 0.36328575015068054, "learning_rate": 1e-05, "loss": 0.039, "step": 130100 }, { "epoch": 0.001302, "grad_norm": 0.2821417450904846, "learning_rate": 1e-05, "loss": 0.0387, "step": 130200 }, { "epoch": 0.001303, "grad_norm": 0.32476913928985596, "learning_rate": 1e-05, "loss": 0.0384, "step": 130300 }, { "epoch": 0.001304, "grad_norm": 0.2892526090145111, "learning_rate": 1e-05, "loss": 0.0389, "step": 130400 }, { "epoch": 0.001305, "grad_norm": 0.32251548767089844, "learning_rate": 1e-05, "loss": 0.0377, "step": 130500 }, { "epoch": 0.001306, "grad_norm": 0.2658371925354004, "learning_rate": 1e-05, "loss": 0.0395, "step": 130600 }, { "epoch": 0.001307, "grad_norm": 0.2543509900569916, "learning_rate": 1e-05, "loss": 0.0381, "step": 130700 }, { "epoch": 0.001308, "grad_norm": 0.3639354705810547, "learning_rate": 1e-05, "loss": 0.0383, "step": 130800 }, { "epoch": 0.001309, "grad_norm": 0.31591033935546875, "learning_rate": 1e-05, "loss": 0.0382, "step": 130900 }, { "epoch": 0.00131, "grad_norm": 0.24314932525157928, "learning_rate": 1e-05, "loss": 0.038, "step": 131000 }, { "epoch": 0.001311, "grad_norm": 0.24956506490707397, "learning_rate": 1e-05, "loss": 0.0387, "step": 131100 }, { "epoch": 0.001312, "grad_norm": 0.3525024950504303, "learning_rate": 1e-05, "loss": 0.0379, "step": 131200 }, { "epoch": 0.001313, "grad_norm": 0.3611176013946533, "learning_rate": 1e-05, "loss": 0.0382, "step": 131300 }, { "epoch": 0.001314, "grad_norm": 0.33662885427474976, "learning_rate": 1e-05, "loss": 0.0379, "step": 131400 }, { "epoch": 0.001315, "grad_norm": 0.28874483704566956, "learning_rate": 1e-05, "loss": 0.0372, "step": 131500 }, { "epoch": 0.001316, "grad_norm": 0.25963303446769714, "learning_rate": 1e-05, "loss": 0.0383, "step": 131600 }, { "epoch": 0.001317, "grad_norm": 0.26510611176490784, "learning_rate": 1e-05, "loss": 0.0384, "step": 131700 }, { "epoch": 0.001318, "grad_norm": 0.2677308917045593, "learning_rate": 1e-05, "loss": 0.0377, "step": 131800 }, { "epoch": 0.001319, "grad_norm": 0.29066377878189087, "learning_rate": 1e-05, "loss": 0.0379, "step": 131900 }, { "epoch": 0.00132, "grad_norm": 0.48313379287719727, "learning_rate": 1e-05, "loss": 0.0379, "step": 132000 }, { "epoch": 0.001321, "grad_norm": 0.2672960162162781, "learning_rate": 1e-05, "loss": 0.0382, "step": 132100 }, { "epoch": 0.001322, "grad_norm": 0.30425071716308594, "learning_rate": 1e-05, "loss": 0.0382, "step": 132200 }, { "epoch": 0.001323, "grad_norm": 0.2751413881778717, "learning_rate": 1e-05, "loss": 0.0385, "step": 132300 }, { "epoch": 0.001324, "grad_norm": 0.25183311104774475, "learning_rate": 1e-05, "loss": 0.0384, "step": 132400 }, { "epoch": 0.001325, "grad_norm": 0.25614091753959656, "learning_rate": 1e-05, "loss": 0.038, "step": 132500 }, { "epoch": 0.001326, "grad_norm": 0.28427428007125854, "learning_rate": 1e-05, "loss": 0.0382, "step": 132600 }, { "epoch": 0.001327, "grad_norm": 0.3760775923728943, "learning_rate": 1e-05, "loss": 0.0382, "step": 132700 }, { "epoch": 0.001328, "grad_norm": 0.3421640694141388, "learning_rate": 1e-05, "loss": 0.0381, "step": 132800 }, { "epoch": 0.001329, "grad_norm": 0.29974809288978577, "learning_rate": 1e-05, "loss": 0.0376, "step": 132900 }, { "epoch": 0.00133, "grad_norm": 0.33247998356819153, "learning_rate": 1e-05, "loss": 0.0385, "step": 133000 }, { "epoch": 0.001331, "grad_norm": 0.25724130868911743, "learning_rate": 1e-05, "loss": 0.0383, "step": 133100 }, { "epoch": 0.001332, "grad_norm": 0.3182983994483948, "learning_rate": 1e-05, "loss": 0.0386, "step": 133200 }, { "epoch": 0.001333, "grad_norm": 0.28793081641197205, "learning_rate": 1e-05, "loss": 0.0374, "step": 133300 }, { "epoch": 0.001334, "grad_norm": 0.24577206373214722, "learning_rate": 1e-05, "loss": 0.0377, "step": 133400 }, { "epoch": 0.001335, "grad_norm": 0.3063439726829529, "learning_rate": 1e-05, "loss": 0.0378, "step": 133500 }, { "epoch": 0.001336, "grad_norm": 0.4100284278392792, "learning_rate": 1e-05, "loss": 0.0382, "step": 133600 }, { "epoch": 0.001337, "grad_norm": 0.22226127982139587, "learning_rate": 1e-05, "loss": 0.0378, "step": 133700 }, { "epoch": 0.001338, "grad_norm": 0.2896203398704529, "learning_rate": 1e-05, "loss": 0.0372, "step": 133800 }, { "epoch": 0.001339, "grad_norm": 0.3422582745552063, "learning_rate": 1e-05, "loss": 0.0376, "step": 133900 }, { "epoch": 0.00134, "grad_norm": 0.26845985651016235, "learning_rate": 1e-05, "loss": 0.0377, "step": 134000 }, { "epoch": 0.001341, "grad_norm": 0.2758520543575287, "learning_rate": 1e-05, "loss": 0.038, "step": 134100 }, { "epoch": 0.001342, "grad_norm": 0.31132134795188904, "learning_rate": 1e-05, "loss": 0.0379, "step": 134200 }, { "epoch": 0.001343, "grad_norm": 0.2832513153553009, "learning_rate": 1e-05, "loss": 0.0379, "step": 134300 }, { "epoch": 0.001344, "grad_norm": 0.34779369831085205, "learning_rate": 1e-05, "loss": 0.0381, "step": 134400 }, { "epoch": 0.001345, "grad_norm": 0.3126806318759918, "learning_rate": 1e-05, "loss": 0.0375, "step": 134500 }, { "epoch": 0.001346, "grad_norm": 0.34992459416389465, "learning_rate": 1e-05, "loss": 0.0374, "step": 134600 }, { "epoch": 0.001347, "grad_norm": 0.33344659209251404, "learning_rate": 1e-05, "loss": 0.0374, "step": 134700 }, { "epoch": 0.001348, "grad_norm": 0.3194992244243622, "learning_rate": 1e-05, "loss": 0.0378, "step": 134800 }, { "epoch": 0.001349, "grad_norm": 0.337230384349823, "learning_rate": 1e-05, "loss": 0.0379, "step": 134900 }, { "epoch": 0.00135, "grad_norm": 0.2927236557006836, "learning_rate": 1e-05, "loss": 0.0382, "step": 135000 }, { "epoch": 0.001351, "grad_norm": 0.2726239860057831, "learning_rate": 1e-05, "loss": 0.0377, "step": 135100 }, { "epoch": 0.001352, "grad_norm": 0.26833677291870117, "learning_rate": 1e-05, "loss": 0.0384, "step": 135200 }, { "epoch": 0.001353, "grad_norm": 0.2850685119628906, "learning_rate": 1e-05, "loss": 0.0374, "step": 135300 }, { "epoch": 0.001354, "grad_norm": 0.41030463576316833, "learning_rate": 1e-05, "loss": 0.0381, "step": 135400 }, { "epoch": 0.001355, "grad_norm": 0.2863805592060089, "learning_rate": 1e-05, "loss": 0.0383, "step": 135500 }, { "epoch": 0.001356, "grad_norm": 0.28887733817100525, "learning_rate": 1e-05, "loss": 0.0376, "step": 135600 }, { "epoch": 0.001357, "grad_norm": 0.2449544221162796, "learning_rate": 1e-05, "loss": 0.0369, "step": 135700 }, { "epoch": 0.001358, "grad_norm": 0.25260239839553833, "learning_rate": 1e-05, "loss": 0.0371, "step": 135800 }, { "epoch": 0.001359, "grad_norm": 0.279849648475647, "learning_rate": 1e-05, "loss": 0.0373, "step": 135900 }, { "epoch": 0.00136, "grad_norm": 0.3000042140483856, "learning_rate": 1e-05, "loss": 0.0366, "step": 136000 }, { "epoch": 0.001361, "grad_norm": 0.36749324202537537, "learning_rate": 1e-05, "loss": 0.037, "step": 136100 }, { "epoch": 0.001362, "grad_norm": 0.4402848780155182, "learning_rate": 1e-05, "loss": 0.0378, "step": 136200 }, { "epoch": 0.001363, "grad_norm": 0.302613765001297, "learning_rate": 1e-05, "loss": 0.0378, "step": 136300 }, { "epoch": 0.001364, "grad_norm": 0.28477007150650024, "learning_rate": 1e-05, "loss": 0.0378, "step": 136400 }, { "epoch": 0.001365, "grad_norm": 0.33985957503318787, "learning_rate": 1e-05, "loss": 0.0377, "step": 136500 }, { "epoch": 0.001366, "grad_norm": 0.23110279440879822, "learning_rate": 1e-05, "loss": 0.0372, "step": 136600 }, { "epoch": 0.001367, "grad_norm": 0.27974122762680054, "learning_rate": 1e-05, "loss": 0.038, "step": 136700 }, { "epoch": 0.001368, "grad_norm": 0.30741623044013977, "learning_rate": 1e-05, "loss": 0.038, "step": 136800 }, { "epoch": 0.001369, "grad_norm": 0.2733733057975769, "learning_rate": 1e-05, "loss": 0.0373, "step": 136900 }, { "epoch": 0.00137, "grad_norm": 0.26094895601272583, "learning_rate": 1e-05, "loss": 0.0372, "step": 137000 }, { "epoch": 0.001371, "grad_norm": 0.31036990880966187, "learning_rate": 1e-05, "loss": 0.0373, "step": 137100 }, { "epoch": 0.001372, "grad_norm": 0.29670894145965576, "learning_rate": 1e-05, "loss": 0.037, "step": 137200 }, { "epoch": 0.001373, "grad_norm": 0.32709282636642456, "learning_rate": 1e-05, "loss": 0.0369, "step": 137300 }, { "epoch": 0.001374, "grad_norm": 0.23582397401332855, "learning_rate": 1e-05, "loss": 0.0372, "step": 137400 }, { "epoch": 0.001375, "grad_norm": 0.2786724269390106, "learning_rate": 1e-05, "loss": 0.0372, "step": 137500 }, { "epoch": 0.001376, "grad_norm": 0.3406980633735657, "learning_rate": 1e-05, "loss": 0.0372, "step": 137600 }, { "epoch": 0.001377, "grad_norm": 0.2617824077606201, "learning_rate": 1e-05, "loss": 0.0371, "step": 137700 }, { "epoch": 0.001378, "grad_norm": 0.29655721783638, "learning_rate": 1e-05, "loss": 0.0373, "step": 137800 }, { "epoch": 0.001379, "grad_norm": 0.2712962031364441, "learning_rate": 1e-05, "loss": 0.0375, "step": 137900 }, { "epoch": 0.00138, "grad_norm": 0.27035683393478394, "learning_rate": 1e-05, "loss": 0.0377, "step": 138000 }, { "epoch": 0.001381, "grad_norm": 0.24407118558883667, "learning_rate": 1e-05, "loss": 0.0371, "step": 138100 }, { "epoch": 0.001382, "grad_norm": 0.34199485182762146, "learning_rate": 1e-05, "loss": 0.0373, "step": 138200 }, { "epoch": 0.001383, "grad_norm": 0.21995343267917633, "learning_rate": 1e-05, "loss": 0.037, "step": 138300 }, { "epoch": 0.001384, "grad_norm": 0.3194929361343384, "learning_rate": 1e-05, "loss": 0.0366, "step": 138400 }, { "epoch": 0.001385, "grad_norm": 0.26971960067749023, "learning_rate": 1e-05, "loss": 0.0377, "step": 138500 }, { "epoch": 0.001386, "grad_norm": 0.26891013979911804, "learning_rate": 1e-05, "loss": 0.0371, "step": 138600 }, { "epoch": 0.001387, "grad_norm": 0.2648075222969055, "learning_rate": 1e-05, "loss": 0.0371, "step": 138700 }, { "epoch": 0.001388, "grad_norm": 0.27340883016586304, "learning_rate": 1e-05, "loss": 0.0367, "step": 138800 }, { "epoch": 0.001389, "grad_norm": 0.3441414535045624, "learning_rate": 1e-05, "loss": 0.0367, "step": 138900 }, { "epoch": 0.00139, "grad_norm": 0.2464142143726349, "learning_rate": 1e-05, "loss": 0.0376, "step": 139000 }, { "epoch": 0.001391, "grad_norm": 0.2617616653442383, "learning_rate": 1e-05, "loss": 0.0368, "step": 139100 }, { "epoch": 0.001392, "grad_norm": 0.3562218248844147, "learning_rate": 1e-05, "loss": 0.0367, "step": 139200 }, { "epoch": 0.001393, "grad_norm": 0.4030774235725403, "learning_rate": 1e-05, "loss": 0.0375, "step": 139300 }, { "epoch": 0.001394, "grad_norm": 0.2560139298439026, "learning_rate": 1e-05, "loss": 0.0358, "step": 139400 }, { "epoch": 0.001395, "grad_norm": 0.2256765216588974, "learning_rate": 1e-05, "loss": 0.0359, "step": 139500 }, { "epoch": 0.001396, "grad_norm": 0.29446402192115784, "learning_rate": 1e-05, "loss": 0.0374, "step": 139600 }, { "epoch": 0.001397, "grad_norm": 0.24487186968326569, "learning_rate": 1e-05, "loss": 0.0369, "step": 139700 }, { "epoch": 0.001398, "grad_norm": 0.32879963517189026, "learning_rate": 1e-05, "loss": 0.0374, "step": 139800 }, { "epoch": 0.001399, "grad_norm": 0.28030383586883545, "learning_rate": 1e-05, "loss": 0.0371, "step": 139900 }, { "epoch": 0.0014, "grad_norm": 0.345354288816452, "learning_rate": 1e-05, "loss": 0.0363, "step": 140000 }, { "epoch": 0.0014, "eval_loss": 0.03141069784760475, "eval_runtime": 188.6669, "eval_samples_per_second": 265.017, "eval_steps_per_second": 16.564, "step": 140000 }, { "epoch": 0.001401, "grad_norm": 0.27843356132507324, "learning_rate": 1e-05, "loss": 0.0369, "step": 140100 }, { "epoch": 0.001402, "grad_norm": 0.3215148448944092, "learning_rate": 1e-05, "loss": 0.0369, "step": 140200 }, { "epoch": 0.001403, "grad_norm": 0.21244385838508606, "learning_rate": 1e-05, "loss": 0.0364, "step": 140300 }, { "epoch": 0.001404, "grad_norm": 0.21311864256858826, "learning_rate": 1e-05, "loss": 0.0372, "step": 140400 }, { "epoch": 0.001405, "grad_norm": 0.30552905797958374, "learning_rate": 1e-05, "loss": 0.037, "step": 140500 }, { "epoch": 0.001406, "grad_norm": 0.29784369468688965, "learning_rate": 1e-05, "loss": 0.0375, "step": 140600 }, { "epoch": 0.001407, "grad_norm": 0.2521277070045471, "learning_rate": 1e-05, "loss": 0.0366, "step": 140700 }, { "epoch": 0.001408, "grad_norm": 0.3006814420223236, "learning_rate": 1e-05, "loss": 0.0365, "step": 140800 }, { "epoch": 0.001409, "grad_norm": 0.2572740316390991, "learning_rate": 1e-05, "loss": 0.0367, "step": 140900 }, { "epoch": 0.00141, "grad_norm": 0.224111869931221, "learning_rate": 1e-05, "loss": 0.0374, "step": 141000 }, { "epoch": 0.001411, "grad_norm": 0.22797194123268127, "learning_rate": 1e-05, "loss": 0.0363, "step": 141100 }, { "epoch": 0.001412, "grad_norm": 0.3036273717880249, "learning_rate": 1e-05, "loss": 0.0365, "step": 141200 }, { "epoch": 0.001413, "grad_norm": 0.3766455054283142, "learning_rate": 1e-05, "loss": 0.0366, "step": 141300 }, { "epoch": 0.001414, "grad_norm": 0.2616423964500427, "learning_rate": 1e-05, "loss": 0.037, "step": 141400 }, { "epoch": 0.001415, "grad_norm": 0.307314932346344, "learning_rate": 1e-05, "loss": 0.0371, "step": 141500 }, { "epoch": 0.001416, "grad_norm": 0.26275721192359924, "learning_rate": 1e-05, "loss": 0.037, "step": 141600 }, { "epoch": 0.001417, "grad_norm": 0.2526923716068268, "learning_rate": 1e-05, "loss": 0.0376, "step": 141700 }, { "epoch": 0.001418, "grad_norm": 0.29048073291778564, "learning_rate": 1e-05, "loss": 0.0376, "step": 141800 }, { "epoch": 0.001419, "grad_norm": 0.38564181327819824, "learning_rate": 1e-05, "loss": 0.0369, "step": 141900 }, { "epoch": 0.00142, "grad_norm": 0.2564755082130432, "learning_rate": 1e-05, "loss": 0.036, "step": 142000 }, { "epoch": 0.001421, "grad_norm": 0.2981272041797638, "learning_rate": 1e-05, "loss": 0.0366, "step": 142100 }, { "epoch": 0.001422, "grad_norm": 0.32480481266975403, "learning_rate": 1e-05, "loss": 0.0366, "step": 142200 }, { "epoch": 0.001423, "grad_norm": 0.2913365364074707, "learning_rate": 1e-05, "loss": 0.037, "step": 142300 }, { "epoch": 0.001424, "grad_norm": 0.32469871640205383, "learning_rate": 1e-05, "loss": 0.0367, "step": 142400 }, { "epoch": 0.001425, "grad_norm": 0.32723814249038696, "learning_rate": 1e-05, "loss": 0.0369, "step": 142500 }, { "epoch": 0.001426, "grad_norm": 0.3579145073890686, "learning_rate": 1e-05, "loss": 0.0364, "step": 142600 }, { "epoch": 0.001427, "grad_norm": 0.2979782819747925, "learning_rate": 1e-05, "loss": 0.0366, "step": 142700 }, { "epoch": 0.001428, "grad_norm": 0.2589656710624695, "learning_rate": 1e-05, "loss": 0.0369, "step": 142800 }, { "epoch": 0.001429, "grad_norm": 0.29071253538131714, "learning_rate": 1e-05, "loss": 0.0371, "step": 142900 }, { "epoch": 0.00143, "grad_norm": 0.2934455871582031, "learning_rate": 1e-05, "loss": 0.0365, "step": 143000 }, { "epoch": 0.001431, "grad_norm": 0.3704672157764435, "learning_rate": 1e-05, "loss": 0.0359, "step": 143100 }, { "epoch": 0.001432, "grad_norm": 0.26006075739860535, "learning_rate": 1e-05, "loss": 0.0365, "step": 143200 }, { "epoch": 0.001433, "grad_norm": 0.29314252734184265, "learning_rate": 1e-05, "loss": 0.037, "step": 143300 }, { "epoch": 0.001434, "grad_norm": 0.24608591198921204, "learning_rate": 1e-05, "loss": 0.036, "step": 143400 }, { "epoch": 0.001435, "grad_norm": 0.2138095200061798, "learning_rate": 1e-05, "loss": 0.037, "step": 143500 }, { "epoch": 0.001436, "grad_norm": 0.3208875060081482, "learning_rate": 1e-05, "loss": 0.0367, "step": 143600 }, { "epoch": 0.001437, "grad_norm": 0.29256126284599304, "learning_rate": 1e-05, "loss": 0.0374, "step": 143700 }, { "epoch": 0.001438, "grad_norm": 0.26836228370666504, "learning_rate": 1e-05, "loss": 0.0364, "step": 143800 }, { "epoch": 0.001439, "grad_norm": 0.24618670344352722, "learning_rate": 1e-05, "loss": 0.0366, "step": 143900 }, { "epoch": 0.00144, "grad_norm": 0.3413907587528229, "learning_rate": 1e-05, "loss": 0.0367, "step": 144000 }, { "epoch": 0.001441, "grad_norm": 0.26727259159088135, "learning_rate": 1e-05, "loss": 0.0368, "step": 144100 }, { "epoch": 0.001442, "grad_norm": 0.2617424428462982, "learning_rate": 1e-05, "loss": 0.0365, "step": 144200 }, { "epoch": 0.001443, "grad_norm": 0.33920544385910034, "learning_rate": 1e-05, "loss": 0.0363, "step": 144300 }, { "epoch": 0.001444, "grad_norm": 0.2792336940765381, "learning_rate": 1e-05, "loss": 0.0367, "step": 144400 }, { "epoch": 0.001445, "grad_norm": 0.32061052322387695, "learning_rate": 1e-05, "loss": 0.037, "step": 144500 }, { "epoch": 0.001446, "grad_norm": 0.38173362612724304, "learning_rate": 1e-05, "loss": 0.0368, "step": 144600 }, { "epoch": 0.001447, "grad_norm": 0.25419801473617554, "learning_rate": 1e-05, "loss": 0.0367, "step": 144700 }, { "epoch": 0.001448, "grad_norm": 0.27371639013290405, "learning_rate": 1e-05, "loss": 0.0361, "step": 144800 }, { "epoch": 0.001449, "grad_norm": 0.337908536195755, "learning_rate": 1e-05, "loss": 0.037, "step": 144900 }, { "epoch": 0.00145, "grad_norm": 0.38578206300735474, "learning_rate": 1e-05, "loss": 0.0361, "step": 145000 }, { "epoch": 0.001451, "grad_norm": 0.28068986535072327, "learning_rate": 1e-05, "loss": 0.0368, "step": 145100 }, { "epoch": 0.001452, "grad_norm": 0.24464528262615204, "learning_rate": 1e-05, "loss": 0.0366, "step": 145200 }, { "epoch": 0.001453, "grad_norm": 0.3447825312614441, "learning_rate": 1e-05, "loss": 0.0366, "step": 145300 }, { "epoch": 0.001454, "grad_norm": 0.2901204824447632, "learning_rate": 1e-05, "loss": 0.0366, "step": 145400 }, { "epoch": 0.001455, "grad_norm": 0.2698995769023895, "learning_rate": 1e-05, "loss": 0.0364, "step": 145500 }, { "epoch": 0.001456, "grad_norm": 0.27128103375434875, "learning_rate": 1e-05, "loss": 0.0366, "step": 145600 }, { "epoch": 0.001457, "grad_norm": 0.22578035295009613, "learning_rate": 1e-05, "loss": 0.0367, "step": 145700 }, { "epoch": 0.001458, "grad_norm": 0.24039269983768463, "learning_rate": 1e-05, "loss": 0.0367, "step": 145800 }, { "epoch": 0.001459, "grad_norm": 0.28638139367103577, "learning_rate": 1e-05, "loss": 0.0364, "step": 145900 }, { "epoch": 0.00146, "grad_norm": 0.3537798821926117, "learning_rate": 1e-05, "loss": 0.0364, "step": 146000 }, { "epoch": 0.001461, "grad_norm": 0.322256863117218, "learning_rate": 1e-05, "loss": 0.0362, "step": 146100 }, { "epoch": 0.001462, "grad_norm": 0.34133994579315186, "learning_rate": 1e-05, "loss": 0.0355, "step": 146200 }, { "epoch": 0.001463, "grad_norm": 0.2470310777425766, "learning_rate": 1e-05, "loss": 0.0365, "step": 146300 }, { "epoch": 0.001464, "grad_norm": 0.26455312967300415, "learning_rate": 1e-05, "loss": 0.0357, "step": 146400 }, { "epoch": 0.001465, "grad_norm": 0.32307836413383484, "learning_rate": 1e-05, "loss": 0.0368, "step": 146500 }, { "epoch": 0.001466, "grad_norm": 0.2147117257118225, "learning_rate": 1e-05, "loss": 0.0355, "step": 146600 }, { "epoch": 0.001467, "grad_norm": 0.3612409234046936, "learning_rate": 1e-05, "loss": 0.0366, "step": 146700 }, { "epoch": 0.001468, "grad_norm": 0.28496691584587097, "learning_rate": 1e-05, "loss": 0.0359, "step": 146800 }, { "epoch": 0.001469, "grad_norm": 0.34731581807136536, "learning_rate": 1e-05, "loss": 0.036, "step": 146900 }, { "epoch": 0.00147, "grad_norm": 0.2987736165523529, "learning_rate": 1e-05, "loss": 0.0362, "step": 147000 }, { "epoch": 0.001471, "grad_norm": 0.3577730655670166, "learning_rate": 1e-05, "loss": 0.0358, "step": 147100 }, { "epoch": 0.001472, "grad_norm": 0.20653052628040314, "learning_rate": 1e-05, "loss": 0.0365, "step": 147200 }, { "epoch": 0.001473, "grad_norm": 0.36424019932746887, "learning_rate": 1e-05, "loss": 0.0367, "step": 147300 }, { "epoch": 0.001474, "grad_norm": 0.2508068382740021, "learning_rate": 1e-05, "loss": 0.0361, "step": 147400 }, { "epoch": 0.001475, "grad_norm": 0.2445961833000183, "learning_rate": 1e-05, "loss": 0.0364, "step": 147500 }, { "epoch": 0.001476, "grad_norm": 0.28082066774368286, "learning_rate": 1e-05, "loss": 0.0361, "step": 147600 }, { "epoch": 0.001477, "grad_norm": 0.24130816757678986, "learning_rate": 1e-05, "loss": 0.0361, "step": 147700 }, { "epoch": 0.001478, "grad_norm": 0.3385671079158783, "learning_rate": 1e-05, "loss": 0.0359, "step": 147800 }, { "epoch": 0.001479, "grad_norm": 0.23715735971927643, "learning_rate": 1e-05, "loss": 0.0365, "step": 147900 }, { "epoch": 0.00148, "grad_norm": 0.41885629296302795, "learning_rate": 1e-05, "loss": 0.036, "step": 148000 }, { "epoch": 0.001481, "grad_norm": 0.3805704116821289, "learning_rate": 1e-05, "loss": 0.0352, "step": 148100 }, { "epoch": 0.001482, "grad_norm": 0.25459396839141846, "learning_rate": 1e-05, "loss": 0.0358, "step": 148200 }, { "epoch": 0.001483, "grad_norm": 0.33226701617240906, "learning_rate": 1e-05, "loss": 0.0364, "step": 148300 }, { "epoch": 0.001484, "grad_norm": 0.268392413854599, "learning_rate": 1e-05, "loss": 0.0363, "step": 148400 }, { "epoch": 0.001485, "grad_norm": 0.2770199477672577, "learning_rate": 1e-05, "loss": 0.0358, "step": 148500 }, { "epoch": 0.001486, "grad_norm": 0.24576789140701294, "learning_rate": 1e-05, "loss": 0.036, "step": 148600 }, { "epoch": 0.001487, "grad_norm": 0.23976580798625946, "learning_rate": 1e-05, "loss": 0.0355, "step": 148700 }, { "epoch": 0.001488, "grad_norm": 0.25729691982269287, "learning_rate": 1e-05, "loss": 0.0368, "step": 148800 }, { "epoch": 0.001489, "grad_norm": 0.2572067975997925, "learning_rate": 1e-05, "loss": 0.0359, "step": 148900 }, { "epoch": 0.00149, "grad_norm": 0.35858389735221863, "learning_rate": 1e-05, "loss": 0.036, "step": 149000 }, { "epoch": 0.001491, "grad_norm": 0.25127366185188293, "learning_rate": 1e-05, "loss": 0.0372, "step": 149100 }, { "epoch": 0.001492, "grad_norm": 0.2475125789642334, "learning_rate": 1e-05, "loss": 0.0362, "step": 149200 }, { "epoch": 0.001493, "grad_norm": 0.2967037260532379, "learning_rate": 1e-05, "loss": 0.0364, "step": 149300 }, { "epoch": 0.001494, "grad_norm": 0.3322179913520813, "learning_rate": 1e-05, "loss": 0.0361, "step": 149400 }, { "epoch": 0.001495, "grad_norm": 0.2789592742919922, "learning_rate": 1e-05, "loss": 0.0362, "step": 149500 }, { "epoch": 0.001496, "grad_norm": 0.2685825526714325, "learning_rate": 1e-05, "loss": 0.0354, "step": 149600 }, { "epoch": 0.001497, "grad_norm": 0.3304698169231415, "learning_rate": 1e-05, "loss": 0.0361, "step": 149700 }, { "epoch": 0.001498, "grad_norm": 0.24894975125789642, "learning_rate": 1e-05, "loss": 0.0363, "step": 149800 }, { "epoch": 0.001499, "grad_norm": 0.31222957372665405, "learning_rate": 1e-05, "loss": 0.0365, "step": 149900 }, { "epoch": 0.0015, "grad_norm": 0.26425331830978394, "learning_rate": 1e-05, "loss": 0.0358, "step": 150000 }, { "epoch": 0.001501, "grad_norm": 0.23718877136707306, "learning_rate": 1e-05, "loss": 0.0363, "step": 150100 }, { "epoch": 0.001502, "grad_norm": 0.22689980268478394, "learning_rate": 1e-05, "loss": 0.0363, "step": 150200 }, { "epoch": 0.001503, "grad_norm": 0.270194947719574, "learning_rate": 1e-05, "loss": 0.0357, "step": 150300 }, { "epoch": 0.001504, "grad_norm": 0.38323286175727844, "learning_rate": 1e-05, "loss": 0.0364, "step": 150400 }, { "epoch": 0.001505, "grad_norm": 0.25549739599227905, "learning_rate": 1e-05, "loss": 0.0359, "step": 150500 }, { "epoch": 0.001506, "grad_norm": 0.25252965092658997, "learning_rate": 1e-05, "loss": 0.0353, "step": 150600 }, { "epoch": 0.001507, "grad_norm": 0.26133981347084045, "learning_rate": 1e-05, "loss": 0.0357, "step": 150700 }, { "epoch": 0.001508, "grad_norm": 0.3608594238758087, "learning_rate": 1e-05, "loss": 0.0364, "step": 150800 }, { "epoch": 0.001509, "grad_norm": 0.25991785526275635, "learning_rate": 1e-05, "loss": 0.0357, "step": 150900 }, { "epoch": 0.00151, "grad_norm": 0.27862784266471863, "learning_rate": 1e-05, "loss": 0.0361, "step": 151000 }, { "epoch": 0.001511, "grad_norm": 0.3046956956386566, "learning_rate": 1e-05, "loss": 0.0362, "step": 151100 }, { "epoch": 0.001512, "grad_norm": 0.2541850507259369, "learning_rate": 1e-05, "loss": 0.0362, "step": 151200 }, { "epoch": 0.001513, "grad_norm": 0.3113613724708557, "learning_rate": 1e-05, "loss": 0.0363, "step": 151300 }, { "epoch": 0.001514, "grad_norm": 0.23317605257034302, "learning_rate": 1e-05, "loss": 0.0355, "step": 151400 }, { "epoch": 0.001515, "grad_norm": 0.2493402510881424, "learning_rate": 1e-05, "loss": 0.0357, "step": 151500 }, { "epoch": 0.001516, "grad_norm": 0.2282235473394394, "learning_rate": 1e-05, "loss": 0.0354, "step": 151600 }, { "epoch": 0.001517, "grad_norm": 0.3015749752521515, "learning_rate": 1e-05, "loss": 0.0361, "step": 151700 }, { "epoch": 0.001518, "grad_norm": 0.4876458942890167, "learning_rate": 1e-05, "loss": 0.0355, "step": 151800 }, { "epoch": 0.001519, "grad_norm": 0.2479935884475708, "learning_rate": 1e-05, "loss": 0.036, "step": 151900 }, { "epoch": 0.00152, "grad_norm": 0.32084548473358154, "learning_rate": 1e-05, "loss": 0.0362, "step": 152000 }, { "epoch": 0.001521, "grad_norm": 0.2858216464519501, "learning_rate": 1e-05, "loss": 0.0357, "step": 152100 }, { "epoch": 0.001522, "grad_norm": 0.32674285769462585, "learning_rate": 1e-05, "loss": 0.0359, "step": 152200 }, { "epoch": 0.001523, "grad_norm": 0.33190637826919556, "learning_rate": 1e-05, "loss": 0.0352, "step": 152300 }, { "epoch": 0.001524, "grad_norm": 0.35373207926750183, "learning_rate": 1e-05, "loss": 0.0357, "step": 152400 }, { "epoch": 0.001525, "grad_norm": 0.19833964109420776, "learning_rate": 1e-05, "loss": 0.0369, "step": 152500 }, { "epoch": 0.001526, "grad_norm": 0.31542879343032837, "learning_rate": 1e-05, "loss": 0.0356, "step": 152600 }, { "epoch": 0.001527, "grad_norm": 0.2857697308063507, "learning_rate": 1e-05, "loss": 0.0352, "step": 152700 }, { "epoch": 0.001528, "grad_norm": 0.28824642300605774, "learning_rate": 1e-05, "loss": 0.0362, "step": 152800 }, { "epoch": 0.001529, "grad_norm": 0.2830536663532257, "learning_rate": 1e-05, "loss": 0.0355, "step": 152900 }, { "epoch": 0.00153, "grad_norm": 0.3028011620044708, "learning_rate": 1e-05, "loss": 0.0354, "step": 153000 }, { "epoch": 0.001531, "grad_norm": 0.25026169419288635, "learning_rate": 1e-05, "loss": 0.0352, "step": 153100 }, { "epoch": 0.001532, "grad_norm": 0.2843903601169586, "learning_rate": 1e-05, "loss": 0.0358, "step": 153200 }, { "epoch": 0.001533, "grad_norm": 0.26483601331710815, "learning_rate": 1e-05, "loss": 0.035, "step": 153300 }, { "epoch": 0.001534, "grad_norm": 0.23602305352687836, "learning_rate": 1e-05, "loss": 0.0357, "step": 153400 }, { "epoch": 0.001535, "grad_norm": 0.22439594566822052, "learning_rate": 1e-05, "loss": 0.0352, "step": 153500 }, { "epoch": 0.001536, "grad_norm": 0.3337893784046173, "learning_rate": 1e-05, "loss": 0.0361, "step": 153600 }, { "epoch": 0.001537, "grad_norm": 0.22755467891693115, "learning_rate": 1e-05, "loss": 0.0353, "step": 153700 }, { "epoch": 0.001538, "grad_norm": 0.24223746359348297, "learning_rate": 1e-05, "loss": 0.0354, "step": 153800 }, { "epoch": 0.001539, "grad_norm": 0.31616684794425964, "learning_rate": 1e-05, "loss": 0.0354, "step": 153900 }, { "epoch": 0.00154, "grad_norm": 0.19992034137248993, "learning_rate": 1e-05, "loss": 0.0358, "step": 154000 }, { "epoch": 0.001541, "grad_norm": 0.22452488541603088, "learning_rate": 1e-05, "loss": 0.0357, "step": 154100 }, { "epoch": 0.001542, "grad_norm": 0.29476243257522583, "learning_rate": 1e-05, "loss": 0.0356, "step": 154200 }, { "epoch": 0.001543, "grad_norm": 0.2583869397640228, "learning_rate": 1e-05, "loss": 0.035, "step": 154300 }, { "epoch": 0.001544, "grad_norm": 0.3268553912639618, "learning_rate": 1e-05, "loss": 0.036, "step": 154400 }, { "epoch": 0.001545, "grad_norm": 0.2054702490568161, "learning_rate": 1e-05, "loss": 0.0356, "step": 154500 }, { "epoch": 0.001546, "grad_norm": 0.31020477414131165, "learning_rate": 1e-05, "loss": 0.0349, "step": 154600 }, { "epoch": 0.001547, "grad_norm": 0.3033059537410736, "learning_rate": 1e-05, "loss": 0.035, "step": 154700 }, { "epoch": 0.001548, "grad_norm": 0.30400195717811584, "learning_rate": 1e-05, "loss": 0.0355, "step": 154800 }, { "epoch": 0.001549, "grad_norm": 0.2748301029205322, "learning_rate": 1e-05, "loss": 0.0354, "step": 154900 }, { "epoch": 0.00155, "grad_norm": 0.2413843721151352, "learning_rate": 1e-05, "loss": 0.0353, "step": 155000 }, { "epoch": 0.001551, "grad_norm": 0.2315683811903, "learning_rate": 1e-05, "loss": 0.0351, "step": 155100 }, { "epoch": 0.001552, "grad_norm": 0.4572807550430298, "learning_rate": 1e-05, "loss": 0.0357, "step": 155200 }, { "epoch": 0.001553, "grad_norm": 0.4190671741962433, "learning_rate": 1e-05, "loss": 0.0352, "step": 155300 }, { "epoch": 0.001554, "grad_norm": 0.34505701065063477, "learning_rate": 1e-05, "loss": 0.035, "step": 155400 }, { "epoch": 0.001555, "grad_norm": 0.3159753680229187, "learning_rate": 1e-05, "loss": 0.036, "step": 155500 }, { "epoch": 0.001556, "grad_norm": 0.3317139744758606, "learning_rate": 1e-05, "loss": 0.035, "step": 155600 }, { "epoch": 0.001557, "grad_norm": 0.2388676255941391, "learning_rate": 1e-05, "loss": 0.036, "step": 155700 }, { "epoch": 0.001558, "grad_norm": 0.26645150780677795, "learning_rate": 1e-05, "loss": 0.0362, "step": 155800 }, { "epoch": 0.001559, "grad_norm": 0.26671499013900757, "learning_rate": 1e-05, "loss": 0.0353, "step": 155900 }, { "epoch": 0.00156, "grad_norm": 0.22539860010147095, "learning_rate": 1e-05, "loss": 0.0355, "step": 156000 }, { "epoch": 0.001561, "grad_norm": 0.25375470519065857, "learning_rate": 1e-05, "loss": 0.0353, "step": 156100 }, { "epoch": 0.001562, "grad_norm": 0.24917878210544586, "learning_rate": 1e-05, "loss": 0.0364, "step": 156200 }, { "epoch": 0.001563, "grad_norm": 0.32847774028778076, "learning_rate": 1e-05, "loss": 0.0356, "step": 156300 }, { "epoch": 0.001564, "grad_norm": 0.2627456784248352, "learning_rate": 1e-05, "loss": 0.0352, "step": 156400 }, { "epoch": 0.001565, "grad_norm": 0.26376283168792725, "learning_rate": 1e-05, "loss": 0.0356, "step": 156500 }, { "epoch": 0.001566, "grad_norm": 0.25354859232902527, "learning_rate": 1e-05, "loss": 0.0357, "step": 156600 }, { "epoch": 0.001567, "grad_norm": 0.3617793321609497, "learning_rate": 1e-05, "loss": 0.0353, "step": 156700 }, { "epoch": 0.001568, "grad_norm": 0.250704824924469, "learning_rate": 1e-05, "loss": 0.0352, "step": 156800 }, { "epoch": 0.001569, "grad_norm": 0.24372927844524384, "learning_rate": 1e-05, "loss": 0.0357, "step": 156900 }, { "epoch": 0.00157, "grad_norm": 0.3011695444583893, "learning_rate": 1e-05, "loss": 0.0351, "step": 157000 }, { "epoch": 0.001571, "grad_norm": 0.3521178364753723, "learning_rate": 1e-05, "loss": 0.0357, "step": 157100 }, { "epoch": 0.001572, "grad_norm": 0.3228747248649597, "learning_rate": 1e-05, "loss": 0.0355, "step": 157200 }, { "epoch": 0.001573, "grad_norm": 0.24858912825584412, "learning_rate": 1e-05, "loss": 0.0352, "step": 157300 }, { "epoch": 0.001574, "grad_norm": 0.32230839133262634, "learning_rate": 1e-05, "loss": 0.0344, "step": 157400 }, { "epoch": 0.001575, "grad_norm": 0.31567245721817017, "learning_rate": 1e-05, "loss": 0.0353, "step": 157500 }, { "epoch": 0.001576, "grad_norm": 0.2233029454946518, "learning_rate": 1e-05, "loss": 0.0353, "step": 157600 }, { "epoch": 0.001577, "grad_norm": 0.2861238121986389, "learning_rate": 1e-05, "loss": 0.0352, "step": 157700 }, { "epoch": 0.001578, "grad_norm": 0.2539554834365845, "learning_rate": 1e-05, "loss": 0.0346, "step": 157800 }, { "epoch": 0.001579, "grad_norm": 0.25857779383659363, "learning_rate": 1e-05, "loss": 0.0352, "step": 157900 }, { "epoch": 0.00158, "grad_norm": 0.31999021768569946, "learning_rate": 1e-05, "loss": 0.0359, "step": 158000 }, { "epoch": 0.001581, "grad_norm": 0.26091596484184265, "learning_rate": 1e-05, "loss": 0.0354, "step": 158100 }, { "epoch": 0.001582, "grad_norm": 0.2694457173347473, "learning_rate": 1e-05, "loss": 0.0346, "step": 158200 }, { "epoch": 0.001583, "grad_norm": 0.29052361845970154, "learning_rate": 1e-05, "loss": 0.0354, "step": 158300 }, { "epoch": 0.001584, "grad_norm": 0.30961135029792786, "learning_rate": 1e-05, "loss": 0.0351, "step": 158400 }, { "epoch": 0.001585, "grad_norm": 0.3189465403556824, "learning_rate": 1e-05, "loss": 0.0348, "step": 158500 }, { "epoch": 0.001586, "grad_norm": 0.257111519575119, "learning_rate": 1e-05, "loss": 0.0355, "step": 158600 }, { "epoch": 0.001587, "grad_norm": 0.2427392154932022, "learning_rate": 1e-05, "loss": 0.0347, "step": 158700 }, { "epoch": 0.001588, "grad_norm": 0.31273630261421204, "learning_rate": 1e-05, "loss": 0.0354, "step": 158800 }, { "epoch": 0.001589, "grad_norm": 0.38570258021354675, "learning_rate": 1e-05, "loss": 0.0345, "step": 158900 }, { "epoch": 0.00159, "grad_norm": 0.4180574119091034, "learning_rate": 1e-05, "loss": 0.0353, "step": 159000 }, { "epoch": 0.001591, "grad_norm": 0.346671998500824, "learning_rate": 1e-05, "loss": 0.0347, "step": 159100 }, { "epoch": 0.001592, "grad_norm": 0.2895566523075104, "learning_rate": 1e-05, "loss": 0.035, "step": 159200 }, { "epoch": 0.001593, "grad_norm": 0.341688871383667, "learning_rate": 1e-05, "loss": 0.0351, "step": 159300 }, { "epoch": 0.001594, "grad_norm": 0.27643173933029175, "learning_rate": 1e-05, "loss": 0.035, "step": 159400 }, { "epoch": 0.001595, "grad_norm": 0.2622915208339691, "learning_rate": 1e-05, "loss": 0.0349, "step": 159500 }, { "epoch": 0.001596, "grad_norm": 0.22932672500610352, "learning_rate": 1e-05, "loss": 0.0353, "step": 159600 }, { "epoch": 0.001597, "grad_norm": 0.2503047287464142, "learning_rate": 1e-05, "loss": 0.0352, "step": 159700 }, { "epoch": 0.001598, "grad_norm": 0.3100641369819641, "learning_rate": 1e-05, "loss": 0.0358, "step": 159800 }, { "epoch": 0.001599, "grad_norm": 0.3228934109210968, "learning_rate": 1e-05, "loss": 0.0355, "step": 159900 }, { "epoch": 0.0016, "grad_norm": 0.2840133607387543, "learning_rate": 1e-05, "loss": 0.035, "step": 160000 }, { "epoch": 0.0016, "eval_loss": 0.030000435188412666, "eval_runtime": 187.0838, "eval_samples_per_second": 267.26, "eval_steps_per_second": 16.704, "step": 160000 }, { "epoch": 0.001601, "grad_norm": 0.26991984248161316, "learning_rate": 1e-05, "loss": 0.0351, "step": 160100 }, { "epoch": 0.001602, "grad_norm": 0.30078035593032837, "learning_rate": 1e-05, "loss": 0.0356, "step": 160200 }, { "epoch": 0.001603, "grad_norm": 0.285980761051178, "learning_rate": 1e-05, "loss": 0.0356, "step": 160300 }, { "epoch": 0.001604, "grad_norm": 0.28516823053359985, "learning_rate": 1e-05, "loss": 0.0354, "step": 160400 }, { "epoch": 0.001605, "grad_norm": 0.2768954336643219, "learning_rate": 1e-05, "loss": 0.0346, "step": 160500 }, { "epoch": 0.001606, "grad_norm": 0.2555577754974365, "learning_rate": 1e-05, "loss": 0.0349, "step": 160600 }, { "epoch": 0.001607, "grad_norm": 0.2399977445602417, "learning_rate": 1e-05, "loss": 0.034, "step": 160700 }, { "epoch": 0.001608, "grad_norm": 0.2901008129119873, "learning_rate": 1e-05, "loss": 0.0348, "step": 160800 }, { "epoch": 0.001609, "grad_norm": 0.31857243180274963, "learning_rate": 1e-05, "loss": 0.035, "step": 160900 }, { "epoch": 0.00161, "grad_norm": 0.3923068642616272, "learning_rate": 1e-05, "loss": 0.0357, "step": 161000 }, { "epoch": 0.001611, "grad_norm": 0.24952055513858795, "learning_rate": 1e-05, "loss": 0.0345, "step": 161100 }, { "epoch": 0.001612, "grad_norm": 0.287492036819458, "learning_rate": 1e-05, "loss": 0.0353, "step": 161200 }, { "epoch": 0.001613, "grad_norm": 0.21950365602970123, "learning_rate": 1e-05, "loss": 0.0347, "step": 161300 }, { "epoch": 0.001614, "grad_norm": 0.2931610941886902, "learning_rate": 1e-05, "loss": 0.0351, "step": 161400 }, { "epoch": 0.001615, "grad_norm": 0.2590772807598114, "learning_rate": 1e-05, "loss": 0.0346, "step": 161500 }, { "epoch": 0.001616, "grad_norm": 0.29332560300827026, "learning_rate": 1e-05, "loss": 0.0349, "step": 161600 }, { "epoch": 0.001617, "grad_norm": 0.2652427852153778, "learning_rate": 1e-05, "loss": 0.0348, "step": 161700 }, { "epoch": 0.001618, "grad_norm": 0.22862274944782257, "learning_rate": 1e-05, "loss": 0.0347, "step": 161800 }, { "epoch": 0.001619, "grad_norm": 0.20794892311096191, "learning_rate": 1e-05, "loss": 0.035, "step": 161900 }, { "epoch": 0.00162, "grad_norm": 0.2536805272102356, "learning_rate": 1e-05, "loss": 0.0343, "step": 162000 }, { "epoch": 0.001621, "grad_norm": 0.4602454900741577, "learning_rate": 1e-05, "loss": 0.0351, "step": 162100 }, { "epoch": 0.001622, "grad_norm": 0.21611171960830688, "learning_rate": 1e-05, "loss": 0.0347, "step": 162200 }, { "epoch": 0.001623, "grad_norm": 0.4412219524383545, "learning_rate": 1e-05, "loss": 0.0348, "step": 162300 }, { "epoch": 0.001624, "grad_norm": 0.21275532245635986, "learning_rate": 1e-05, "loss": 0.0348, "step": 162400 }, { "epoch": 0.001625, "grad_norm": 0.2872399687767029, "learning_rate": 1e-05, "loss": 0.0344, "step": 162500 }, { "epoch": 0.001626, "grad_norm": 0.24584607779979706, "learning_rate": 1e-05, "loss": 0.0346, "step": 162600 }, { "epoch": 0.001627, "grad_norm": 0.2837746739387512, "learning_rate": 1e-05, "loss": 0.0349, "step": 162700 }, { "epoch": 0.001628, "grad_norm": 0.27610549330711365, "learning_rate": 1e-05, "loss": 0.0348, "step": 162800 }, { "epoch": 0.001629, "grad_norm": 0.25236809253692627, "learning_rate": 1e-05, "loss": 0.035, "step": 162900 }, { "epoch": 0.00163, "grad_norm": 0.22216157615184784, "learning_rate": 1e-05, "loss": 0.0347, "step": 163000 }, { "epoch": 0.001631, "grad_norm": 0.299146831035614, "learning_rate": 1e-05, "loss": 0.0354, "step": 163100 }, { "epoch": 0.001632, "grad_norm": 0.26845693588256836, "learning_rate": 1e-05, "loss": 0.0353, "step": 163200 }, { "epoch": 0.001633, "grad_norm": 0.23686626553535461, "learning_rate": 1e-05, "loss": 0.0353, "step": 163300 }, { "epoch": 0.001634, "grad_norm": 0.2802309989929199, "learning_rate": 1e-05, "loss": 0.0345, "step": 163400 }, { "epoch": 0.001635, "grad_norm": 0.25246092677116394, "learning_rate": 1e-05, "loss": 0.0347, "step": 163500 }, { "epoch": 0.001636, "grad_norm": 0.37216630578041077, "learning_rate": 1e-05, "loss": 0.0346, "step": 163600 }, { "epoch": 0.001637, "grad_norm": 0.32333695888519287, "learning_rate": 1e-05, "loss": 0.0352, "step": 163700 }, { "epoch": 0.001638, "grad_norm": 0.3110905885696411, "learning_rate": 1e-05, "loss": 0.0348, "step": 163800 }, { "epoch": 0.001639, "grad_norm": 0.42455801367759705, "learning_rate": 1e-05, "loss": 0.0344, "step": 163900 }, { "epoch": 0.00164, "grad_norm": 0.24967637658119202, "learning_rate": 1e-05, "loss": 0.0347, "step": 164000 }, { "epoch": 0.001641, "grad_norm": 0.2895750105381012, "learning_rate": 1e-05, "loss": 0.0347, "step": 164100 }, { "epoch": 0.001642, "grad_norm": 0.33086949586868286, "learning_rate": 1e-05, "loss": 0.0346, "step": 164200 }, { "epoch": 0.001643, "grad_norm": 0.37601691484451294, "learning_rate": 1e-05, "loss": 0.0345, "step": 164300 }, { "epoch": 0.001644, "grad_norm": 0.2899239659309387, "learning_rate": 1e-05, "loss": 0.0347, "step": 164400 }, { "epoch": 0.001645, "grad_norm": 0.21798618137836456, "learning_rate": 1e-05, "loss": 0.0346, "step": 164500 }, { "epoch": 0.001646, "grad_norm": 0.31432250142097473, "learning_rate": 1e-05, "loss": 0.0345, "step": 164600 }, { "epoch": 0.001647, "grad_norm": 0.25909945368766785, "learning_rate": 1e-05, "loss": 0.0348, "step": 164700 }, { "epoch": 0.001648, "grad_norm": 0.25648802518844604, "learning_rate": 1e-05, "loss": 0.034, "step": 164800 }, { "epoch": 0.001649, "grad_norm": 0.31037193536758423, "learning_rate": 1e-05, "loss": 0.0348, "step": 164900 }, { "epoch": 0.00165, "grad_norm": 0.26846280694007874, "learning_rate": 1e-05, "loss": 0.0346, "step": 165000 }, { "epoch": 0.001651, "grad_norm": 0.3110648989677429, "learning_rate": 1e-05, "loss": 0.0343, "step": 165100 }, { "epoch": 0.001652, "grad_norm": 0.257804274559021, "learning_rate": 1e-05, "loss": 0.0349, "step": 165200 }, { "epoch": 0.001653, "grad_norm": 0.3462386727333069, "learning_rate": 1e-05, "loss": 0.0341, "step": 165300 }, { "epoch": 0.001654, "grad_norm": 0.2818313539028168, "learning_rate": 1e-05, "loss": 0.0345, "step": 165400 }, { "epoch": 0.001655, "grad_norm": 0.24017642438411713, "learning_rate": 1e-05, "loss": 0.035, "step": 165500 }, { "epoch": 0.001656, "grad_norm": 0.2607235312461853, "learning_rate": 1e-05, "loss": 0.0354, "step": 165600 }, { "epoch": 0.001657, "grad_norm": 0.22233878076076508, "learning_rate": 1e-05, "loss": 0.0349, "step": 165700 }, { "epoch": 0.001658, "grad_norm": 0.2199426293373108, "learning_rate": 1e-05, "loss": 0.0342, "step": 165800 }, { "epoch": 0.001659, "grad_norm": 0.25949713587760925, "learning_rate": 1e-05, "loss": 0.0347, "step": 165900 }, { "epoch": 0.00166, "grad_norm": 0.21830064058303833, "learning_rate": 1e-05, "loss": 0.0341, "step": 166000 }, { "epoch": 0.001661, "grad_norm": 0.2837875783443451, "learning_rate": 1e-05, "loss": 0.035, "step": 166100 }, { "epoch": 0.001662, "grad_norm": 0.24523448944091797, "learning_rate": 1e-05, "loss": 0.0347, "step": 166200 }, { "epoch": 0.001663, "grad_norm": 0.2355676293373108, "learning_rate": 1e-05, "loss": 0.0339, "step": 166300 }, { "epoch": 0.001664, "grad_norm": 0.42384350299835205, "learning_rate": 1e-05, "loss": 0.0337, "step": 166400 }, { "epoch": 0.001665, "grad_norm": 0.2698984146118164, "learning_rate": 1e-05, "loss": 0.0345, "step": 166500 }, { "epoch": 0.001666, "grad_norm": 0.34053897857666016, "learning_rate": 1e-05, "loss": 0.0346, "step": 166600 }, { "epoch": 0.001667, "grad_norm": 0.24192652106285095, "learning_rate": 1e-05, "loss": 0.034, "step": 166700 }, { "epoch": 0.001668, "grad_norm": 0.2774716019630432, "learning_rate": 1e-05, "loss": 0.0336, "step": 166800 }, { "epoch": 0.001669, "grad_norm": 0.2470908910036087, "learning_rate": 1e-05, "loss": 0.0341, "step": 166900 }, { "epoch": 0.00167, "grad_norm": 0.3088979721069336, "learning_rate": 1e-05, "loss": 0.0342, "step": 167000 }, { "epoch": 0.001671, "grad_norm": 0.29100170731544495, "learning_rate": 1e-05, "loss": 0.035, "step": 167100 }, { "epoch": 0.001672, "grad_norm": 0.3007741868495941, "learning_rate": 1e-05, "loss": 0.0343, "step": 167200 }, { "epoch": 0.001673, "grad_norm": 0.23604948818683624, "learning_rate": 1e-05, "loss": 0.0337, "step": 167300 }, { "epoch": 0.001674, "grad_norm": 0.363710880279541, "learning_rate": 1e-05, "loss": 0.0349, "step": 167400 }, { "epoch": 0.001675, "grad_norm": 0.23457486927509308, "learning_rate": 1e-05, "loss": 0.0343, "step": 167500 }, { "epoch": 0.001676, "grad_norm": 0.24869729578495026, "learning_rate": 1e-05, "loss": 0.0339, "step": 167600 }, { "epoch": 0.001677, "grad_norm": 0.2273103892803192, "learning_rate": 1e-05, "loss": 0.0346, "step": 167700 }, { "epoch": 0.001678, "grad_norm": 0.2258453518152237, "learning_rate": 1e-05, "loss": 0.0347, "step": 167800 }, { "epoch": 0.001679, "grad_norm": 0.31026628613471985, "learning_rate": 1e-05, "loss": 0.0338, "step": 167900 }, { "epoch": 0.00168, "grad_norm": 0.26028507947921753, "learning_rate": 1e-05, "loss": 0.0344, "step": 168000 }, { "epoch": 0.001681, "grad_norm": 0.20397011935710907, "learning_rate": 1e-05, "loss": 0.0338, "step": 168100 }, { "epoch": 0.001682, "grad_norm": 0.2919481098651886, "learning_rate": 1e-05, "loss": 0.0349, "step": 168200 }, { "epoch": 0.001683, "grad_norm": 0.3299901485443115, "learning_rate": 1e-05, "loss": 0.0338, "step": 168300 }, { "epoch": 0.001684, "grad_norm": 0.2661847770214081, "learning_rate": 1e-05, "loss": 0.0345, "step": 168400 }, { "epoch": 0.001685, "grad_norm": 0.20423321425914764, "learning_rate": 1e-05, "loss": 0.0339, "step": 168500 }, { "epoch": 0.001686, "grad_norm": 0.2971194386482239, "learning_rate": 1e-05, "loss": 0.0344, "step": 168600 }, { "epoch": 0.001687, "grad_norm": 0.2570949196815491, "learning_rate": 1e-05, "loss": 0.0342, "step": 168700 }, { "epoch": 0.001688, "grad_norm": 0.4466119706630707, "learning_rate": 1e-05, "loss": 0.0345, "step": 168800 }, { "epoch": 0.001689, "grad_norm": 0.32079559564590454, "learning_rate": 1e-05, "loss": 0.0342, "step": 168900 }, { "epoch": 0.00169, "grad_norm": 0.23981431126594543, "learning_rate": 1e-05, "loss": 0.0345, "step": 169000 }, { "epoch": 0.001691, "grad_norm": 0.42814046144485474, "learning_rate": 1e-05, "loss": 0.0343, "step": 169100 }, { "epoch": 0.001692, "grad_norm": 0.3155921995639801, "learning_rate": 1e-05, "loss": 0.0339, "step": 169200 }, { "epoch": 0.001693, "grad_norm": 0.2609718441963196, "learning_rate": 1e-05, "loss": 0.0337, "step": 169300 }, { "epoch": 0.001694, "grad_norm": 0.31464606523513794, "learning_rate": 1e-05, "loss": 0.0349, "step": 169400 }, { "epoch": 0.001695, "grad_norm": 0.26944229006767273, "learning_rate": 1e-05, "loss": 0.0346, "step": 169500 }, { "epoch": 0.001696, "grad_norm": 0.2509726881980896, "learning_rate": 1e-05, "loss": 0.0339, "step": 169600 }, { "epoch": 0.001697, "grad_norm": 0.22716431319713593, "learning_rate": 1e-05, "loss": 0.0341, "step": 169700 }, { "epoch": 0.001698, "grad_norm": 0.3855615258216858, "learning_rate": 1e-05, "loss": 0.0335, "step": 169800 }, { "epoch": 0.001699, "grad_norm": 0.3477301299571991, "learning_rate": 1e-05, "loss": 0.0344, "step": 169900 }, { "epoch": 0.0017, "grad_norm": 0.20858147740364075, "learning_rate": 1e-05, "loss": 0.0344, "step": 170000 }, { "epoch": 0.001701, "grad_norm": 0.3614365756511688, "learning_rate": 1e-05, "loss": 0.0338, "step": 170100 }, { "epoch": 0.001702, "grad_norm": 0.2969813048839569, "learning_rate": 1e-05, "loss": 0.0343, "step": 170200 }, { "epoch": 0.001703, "grad_norm": 0.2566182017326355, "learning_rate": 1e-05, "loss": 0.0338, "step": 170300 }, { "epoch": 0.001704, "grad_norm": 0.3033989667892456, "learning_rate": 1e-05, "loss": 0.0334, "step": 170400 }, { "epoch": 0.001705, "grad_norm": 0.289620578289032, "learning_rate": 1e-05, "loss": 0.0343, "step": 170500 }, { "epoch": 0.001706, "grad_norm": 0.2588507831096649, "learning_rate": 1e-05, "loss": 0.0346, "step": 170600 }, { "epoch": 0.001707, "grad_norm": 0.30555808544158936, "learning_rate": 1e-05, "loss": 0.0337, "step": 170700 }, { "epoch": 0.001708, "grad_norm": 0.33761274814605713, "learning_rate": 1e-05, "loss": 0.0341, "step": 170800 }, { "epoch": 0.001709, "grad_norm": 0.24097822606563568, "learning_rate": 1e-05, "loss": 0.0335, "step": 170900 }, { "epoch": 0.00171, "grad_norm": 0.22846506536006927, "learning_rate": 1e-05, "loss": 0.0341, "step": 171000 }, { "epoch": 0.001711, "grad_norm": 0.24729125201702118, "learning_rate": 1e-05, "loss": 0.0336, "step": 171100 }, { "epoch": 0.001712, "grad_norm": 0.4203731119632721, "learning_rate": 1e-05, "loss": 0.0344, "step": 171200 }, { "epoch": 0.001713, "grad_norm": 0.257803350687027, "learning_rate": 1e-05, "loss": 0.0337, "step": 171300 }, { "epoch": 0.001714, "grad_norm": 0.23512259125709534, "learning_rate": 1e-05, "loss": 0.0339, "step": 171400 }, { "epoch": 0.001715, "grad_norm": 0.2952919006347656, "learning_rate": 1e-05, "loss": 0.0348, "step": 171500 }, { "epoch": 0.001716, "grad_norm": 0.3079668879508972, "learning_rate": 1e-05, "loss": 0.0335, "step": 171600 }, { "epoch": 0.001717, "grad_norm": 0.24873563647270203, "learning_rate": 1e-05, "loss": 0.0343, "step": 171700 }, { "epoch": 0.001718, "grad_norm": 0.27448970079421997, "learning_rate": 1e-05, "loss": 0.0337, "step": 171800 }, { "epoch": 0.001719, "grad_norm": 0.2692762017250061, "learning_rate": 1e-05, "loss": 0.0344, "step": 171900 }, { "epoch": 0.00172, "grad_norm": 0.2886498272418976, "learning_rate": 1e-05, "loss": 0.0333, "step": 172000 }, { "epoch": 0.001721, "grad_norm": 0.27389976382255554, "learning_rate": 1e-05, "loss": 0.0335, "step": 172100 }, { "epoch": 0.001722, "grad_norm": 0.31537455320358276, "learning_rate": 1e-05, "loss": 0.0337, "step": 172200 }, { "epoch": 0.001723, "grad_norm": 0.25125744938850403, "learning_rate": 1e-05, "loss": 0.034, "step": 172300 }, { "epoch": 0.001724, "grad_norm": 0.2595754861831665, "learning_rate": 1e-05, "loss": 0.0341, "step": 172400 }, { "epoch": 0.001725, "grad_norm": 0.3856462836265564, "learning_rate": 1e-05, "loss": 0.0335, "step": 172500 }, { "epoch": 0.001726, "grad_norm": 0.31457358598709106, "learning_rate": 1e-05, "loss": 0.0338, "step": 172600 }, { "epoch": 0.001727, "grad_norm": 0.2751544415950775, "learning_rate": 1e-05, "loss": 0.0341, "step": 172700 }, { "epoch": 0.001728, "grad_norm": 0.2751900255680084, "learning_rate": 1e-05, "loss": 0.0332, "step": 172800 }, { "epoch": 0.001729, "grad_norm": 0.31339791417121887, "learning_rate": 1e-05, "loss": 0.0335, "step": 172900 }, { "epoch": 0.00173, "grad_norm": 0.29081615805625916, "learning_rate": 1e-05, "loss": 0.0336, "step": 173000 }, { "epoch": 0.001731, "grad_norm": 0.26912426948547363, "learning_rate": 1e-05, "loss": 0.0337, "step": 173100 }, { "epoch": 0.001732, "grad_norm": 0.23936721682548523, "learning_rate": 1e-05, "loss": 0.0338, "step": 173200 }, { "epoch": 0.001733, "grad_norm": 0.24531204998493195, "learning_rate": 1e-05, "loss": 0.0339, "step": 173300 }, { "epoch": 0.001734, "grad_norm": 0.255636602640152, "learning_rate": 1e-05, "loss": 0.0336, "step": 173400 }, { "epoch": 0.001735, "grad_norm": 0.39259791374206543, "learning_rate": 1e-05, "loss": 0.0335, "step": 173500 }, { "epoch": 0.001736, "grad_norm": 0.321489542722702, "learning_rate": 1e-05, "loss": 0.0342, "step": 173600 }, { "epoch": 0.001737, "grad_norm": 0.2807994782924652, "learning_rate": 1e-05, "loss": 0.0341, "step": 173700 }, { "epoch": 0.001738, "grad_norm": 0.22246520221233368, "learning_rate": 1e-05, "loss": 0.0338, "step": 173800 }, { "epoch": 0.001739, "grad_norm": 0.25296080112457275, "learning_rate": 1e-05, "loss": 0.0336, "step": 173900 }, { "epoch": 0.00174, "grad_norm": 0.2789648175239563, "learning_rate": 1e-05, "loss": 0.0332, "step": 174000 }, { "epoch": 0.001741, "grad_norm": 0.23489654064178467, "learning_rate": 1e-05, "loss": 0.033, "step": 174100 }, { "epoch": 0.001742, "grad_norm": 0.30469810962677, "learning_rate": 1e-05, "loss": 0.0341, "step": 174200 }, { "epoch": 0.001743, "grad_norm": 0.2674339711666107, "learning_rate": 1e-05, "loss": 0.0333, "step": 174300 }, { "epoch": 0.001744, "grad_norm": 0.27444738149642944, "learning_rate": 1e-05, "loss": 0.0337, "step": 174400 }, { "epoch": 0.001745, "grad_norm": 0.2845589518547058, "learning_rate": 1e-05, "loss": 0.0339, "step": 174500 }, { "epoch": 0.001746, "grad_norm": 0.2515588402748108, "learning_rate": 1e-05, "loss": 0.0332, "step": 174600 }, { "epoch": 0.001747, "grad_norm": 0.2958022356033325, "learning_rate": 1e-05, "loss": 0.034, "step": 174700 }, { "epoch": 0.001748, "grad_norm": 0.3303828537464142, "learning_rate": 1e-05, "loss": 0.034, "step": 174800 }, { "epoch": 0.001749, "grad_norm": 0.25713637471199036, "learning_rate": 1e-05, "loss": 0.0342, "step": 174900 }, { "epoch": 0.00175, "grad_norm": 0.30058687925338745, "learning_rate": 1e-05, "loss": 0.0338, "step": 175000 }, { "epoch": 0.001751, "grad_norm": 0.24354539811611176, "learning_rate": 1e-05, "loss": 0.034, "step": 175100 }, { "epoch": 0.001752, "grad_norm": 0.2782229781150818, "learning_rate": 1e-05, "loss": 0.0331, "step": 175200 }, { "epoch": 0.001753, "grad_norm": 0.29881346225738525, "learning_rate": 1e-05, "loss": 0.0334, "step": 175300 }, { "epoch": 0.001754, "grad_norm": 0.2080865055322647, "learning_rate": 1e-05, "loss": 0.0337, "step": 175400 }, { "epoch": 0.001755, "grad_norm": 0.30373525619506836, "learning_rate": 1e-05, "loss": 0.0338, "step": 175500 }, { "epoch": 0.001756, "grad_norm": 0.2750962972640991, "learning_rate": 1e-05, "loss": 0.0335, "step": 175600 }, { "epoch": 0.001757, "grad_norm": 0.26978927850723267, "learning_rate": 1e-05, "loss": 0.0334, "step": 175700 }, { "epoch": 0.001758, "grad_norm": 0.3102504312992096, "learning_rate": 1e-05, "loss": 0.0333, "step": 175800 }, { "epoch": 0.001759, "grad_norm": 0.2395389825105667, "learning_rate": 1e-05, "loss": 0.0336, "step": 175900 }, { "epoch": 0.00176, "grad_norm": 0.2935570776462555, "learning_rate": 1e-05, "loss": 0.0337, "step": 176000 }, { "epoch": 0.001761, "grad_norm": 0.34253615140914917, "learning_rate": 1e-05, "loss": 0.0332, "step": 176100 }, { "epoch": 0.001762, "grad_norm": 0.2153007537126541, "learning_rate": 1e-05, "loss": 0.0345, "step": 176200 }, { "epoch": 0.001763, "grad_norm": 0.20891909301280975, "learning_rate": 1e-05, "loss": 0.0334, "step": 176300 }, { "epoch": 0.001764, "grad_norm": 0.22327275574207306, "learning_rate": 1e-05, "loss": 0.0332, "step": 176400 }, { "epoch": 0.001765, "grad_norm": 0.3256758749485016, "learning_rate": 1e-05, "loss": 0.0337, "step": 176500 }, { "epoch": 0.001766, "grad_norm": 0.2020077258348465, "learning_rate": 1e-05, "loss": 0.0332, "step": 176600 }, { "epoch": 0.001767, "grad_norm": 0.2757721543312073, "learning_rate": 1e-05, "loss": 0.0339, "step": 176700 }, { "epoch": 0.001768, "grad_norm": 0.23078325390815735, "learning_rate": 1e-05, "loss": 0.0335, "step": 176800 }, { "epoch": 0.001769, "grad_norm": 0.22903937101364136, "learning_rate": 1e-05, "loss": 0.0337, "step": 176900 }, { "epoch": 0.00177, "grad_norm": 0.23434357345104218, "learning_rate": 1e-05, "loss": 0.0335, "step": 177000 }, { "epoch": 0.001771, "grad_norm": 0.19928239285945892, "learning_rate": 1e-05, "loss": 0.0336, "step": 177100 }, { "epoch": 0.001772, "grad_norm": 0.36052605509757996, "learning_rate": 1e-05, "loss": 0.0339, "step": 177200 }, { "epoch": 0.001773, "grad_norm": 0.2869988679885864, "learning_rate": 1e-05, "loss": 0.034, "step": 177300 }, { "epoch": 0.001774, "grad_norm": 0.2615896165370941, "learning_rate": 1e-05, "loss": 0.0335, "step": 177400 }, { "epoch": 0.001775, "grad_norm": 0.2378082126379013, "learning_rate": 1e-05, "loss": 0.0336, "step": 177500 }, { "epoch": 0.001776, "grad_norm": 0.2423420548439026, "learning_rate": 1e-05, "loss": 0.0332, "step": 177600 }, { "epoch": 0.001777, "grad_norm": 0.283549964427948, "learning_rate": 1e-05, "loss": 0.0329, "step": 177700 }, { "epoch": 0.001778, "grad_norm": 0.2543993592262268, "learning_rate": 1e-05, "loss": 0.0333, "step": 177800 }, { "epoch": 0.001779, "grad_norm": 0.2176186442375183, "learning_rate": 1e-05, "loss": 0.0338, "step": 177900 }, { "epoch": 0.00178, "grad_norm": 0.2954069972038269, "learning_rate": 1e-05, "loss": 0.0336, "step": 178000 }, { "epoch": 0.001781, "grad_norm": 0.2999890148639679, "learning_rate": 1e-05, "loss": 0.0339, "step": 178100 }, { "epoch": 0.001782, "grad_norm": 0.3116333484649658, "learning_rate": 1e-05, "loss": 0.0341, "step": 178200 }, { "epoch": 0.001783, "grad_norm": 0.23983174562454224, "learning_rate": 1e-05, "loss": 0.0334, "step": 178300 }, { "epoch": 0.001784, "grad_norm": 0.3012688159942627, "learning_rate": 1e-05, "loss": 0.0339, "step": 178400 }, { "epoch": 0.001785, "grad_norm": 0.28697216510772705, "learning_rate": 1e-05, "loss": 0.034, "step": 178500 }, { "epoch": 0.001786, "grad_norm": 0.27888059616088867, "learning_rate": 1e-05, "loss": 0.0338, "step": 178600 }, { "epoch": 0.001787, "grad_norm": 0.2618900239467621, "learning_rate": 1e-05, "loss": 0.0335, "step": 178700 }, { "epoch": 0.001788, "grad_norm": 0.2513633072376251, "learning_rate": 1e-05, "loss": 0.0336, "step": 178800 }, { "epoch": 0.001789, "grad_norm": 0.3682166039943695, "learning_rate": 1e-05, "loss": 0.0335, "step": 178900 }, { "epoch": 0.00179, "grad_norm": 0.22769583761692047, "learning_rate": 1e-05, "loss": 0.034, "step": 179000 }, { "epoch": 0.001791, "grad_norm": 0.23748233914375305, "learning_rate": 1e-05, "loss": 0.0328, "step": 179100 }, { "epoch": 0.001792, "grad_norm": 0.21373635530471802, "learning_rate": 1e-05, "loss": 0.0329, "step": 179200 }, { "epoch": 0.001793, "grad_norm": 0.24816814064979553, "learning_rate": 1e-05, "loss": 0.0337, "step": 179300 }, { "epoch": 0.001794, "grad_norm": 0.2681199014186859, "learning_rate": 1e-05, "loss": 0.0331, "step": 179400 }, { "epoch": 0.001795, "grad_norm": 0.2132350355386734, "learning_rate": 1e-05, "loss": 0.0329, "step": 179500 }, { "epoch": 0.001796, "grad_norm": 0.22535862028598785, "learning_rate": 1e-05, "loss": 0.0333, "step": 179600 }, { "epoch": 0.001797, "grad_norm": 0.2647210359573364, "learning_rate": 1e-05, "loss": 0.0335, "step": 179700 }, { "epoch": 0.001798, "grad_norm": 0.3285861015319824, "learning_rate": 1e-05, "loss": 0.0333, "step": 179800 }, { "epoch": 0.001799, "grad_norm": 0.5168442726135254, "learning_rate": 1e-05, "loss": 0.0329, "step": 179900 }, { "epoch": 0.0018, "grad_norm": 0.25108155608177185, "learning_rate": 1e-05, "loss": 0.0333, "step": 180000 }, { "epoch": 0.0018, "eval_loss": 0.02923957258462906, "eval_runtime": 190.5331, "eval_samples_per_second": 262.422, "eval_steps_per_second": 16.401, "step": 180000 }, { "epoch": 0.001801, "grad_norm": 0.2565983235836029, "learning_rate": 1e-05, "loss": 0.0334, "step": 180100 }, { "epoch": 0.001802, "grad_norm": 0.22116540372371674, "learning_rate": 1e-05, "loss": 0.0335, "step": 180200 }, { "epoch": 0.001803, "grad_norm": 0.2648037374019623, "learning_rate": 1e-05, "loss": 0.0334, "step": 180300 }, { "epoch": 0.001804, "grad_norm": 0.27524662017822266, "learning_rate": 1e-05, "loss": 0.0326, "step": 180400 }, { "epoch": 0.001805, "grad_norm": 0.23623907566070557, "learning_rate": 1e-05, "loss": 0.0332, "step": 180500 }, { "epoch": 0.001806, "grad_norm": 0.3295729458332062, "learning_rate": 1e-05, "loss": 0.0336, "step": 180600 }, { "epoch": 0.001807, "grad_norm": 0.5013869404792786, "learning_rate": 1e-05, "loss": 0.033, "step": 180700 }, { "epoch": 0.001808, "grad_norm": 0.21693319082260132, "learning_rate": 1e-05, "loss": 0.033, "step": 180800 }, { "epoch": 0.001809, "grad_norm": 0.3113457262516022, "learning_rate": 1e-05, "loss": 0.0333, "step": 180900 }, { "epoch": 0.00181, "grad_norm": 0.32292118668556213, "learning_rate": 1e-05, "loss": 0.0333, "step": 181000 }, { "epoch": 0.001811, "grad_norm": 0.2475491762161255, "learning_rate": 1e-05, "loss": 0.0332, "step": 181100 }, { "epoch": 0.001812, "grad_norm": 0.3540252447128296, "learning_rate": 1e-05, "loss": 0.0336, "step": 181200 }, { "epoch": 0.001813, "grad_norm": 0.2882533371448517, "learning_rate": 1e-05, "loss": 0.0335, "step": 181300 }, { "epoch": 0.001814, "grad_norm": 0.2918182611465454, "learning_rate": 1e-05, "loss": 0.0339, "step": 181400 }, { "epoch": 0.001815, "grad_norm": 0.32958486676216125, "learning_rate": 1e-05, "loss": 0.0331, "step": 181500 }, { "epoch": 0.001816, "grad_norm": 0.2181263566017151, "learning_rate": 1e-05, "loss": 0.0333, "step": 181600 }, { "epoch": 0.001817, "grad_norm": 0.24456720054149628, "learning_rate": 1e-05, "loss": 0.0331, "step": 181700 }, { "epoch": 0.001818, "grad_norm": 0.35883161425590515, "learning_rate": 1e-05, "loss": 0.0334, "step": 181800 }, { "epoch": 0.001819, "grad_norm": 0.2980010509490967, "learning_rate": 1e-05, "loss": 0.0337, "step": 181900 }, { "epoch": 0.00182, "grad_norm": 0.2375982254743576, "learning_rate": 1e-05, "loss": 0.0334, "step": 182000 }, { "epoch": 0.001821, "grad_norm": 0.3131486475467682, "learning_rate": 1e-05, "loss": 0.0333, "step": 182100 }, { "epoch": 0.001822, "grad_norm": 0.21573175489902496, "learning_rate": 1e-05, "loss": 0.0333, "step": 182200 }, { "epoch": 0.001823, "grad_norm": 0.2779366970062256, "learning_rate": 1e-05, "loss": 0.0334, "step": 182300 }, { "epoch": 0.001824, "grad_norm": 0.24103473126888275, "learning_rate": 1e-05, "loss": 0.0332, "step": 182400 }, { "epoch": 0.001825, "grad_norm": 0.219675213098526, "learning_rate": 1e-05, "loss": 0.0332, "step": 182500 }, { "epoch": 0.001826, "grad_norm": 0.2842915952205658, "learning_rate": 1e-05, "loss": 0.0327, "step": 182600 }, { "epoch": 0.001827, "grad_norm": 0.28422147035598755, "learning_rate": 1e-05, "loss": 0.0335, "step": 182700 }, { "epoch": 0.001828, "grad_norm": 0.3290017247200012, "learning_rate": 1e-05, "loss": 0.0326, "step": 182800 }, { "epoch": 0.001829, "grad_norm": 0.2743884027004242, "learning_rate": 1e-05, "loss": 0.0323, "step": 182900 }, { "epoch": 0.00183, "grad_norm": 0.2970394492149353, "learning_rate": 1e-05, "loss": 0.0337, "step": 183000 }, { "epoch": 0.001831, "grad_norm": 0.3129160404205322, "learning_rate": 1e-05, "loss": 0.0333, "step": 183100 }, { "epoch": 0.001832, "grad_norm": 0.27394020557403564, "learning_rate": 1e-05, "loss": 0.0327, "step": 183200 }, { "epoch": 0.001833, "grad_norm": 0.3385072350502014, "learning_rate": 1e-05, "loss": 0.0329, "step": 183300 }, { "epoch": 0.001834, "grad_norm": 0.19546039402484894, "learning_rate": 1e-05, "loss": 0.0334, "step": 183400 }, { "epoch": 0.001835, "grad_norm": 0.3378957211971283, "learning_rate": 1e-05, "loss": 0.0331, "step": 183500 }, { "epoch": 0.001836, "grad_norm": 0.2609913945198059, "learning_rate": 1e-05, "loss": 0.0329, "step": 183600 }, { "epoch": 0.001837, "grad_norm": 0.41809314489364624, "learning_rate": 1e-05, "loss": 0.0331, "step": 183700 }, { "epoch": 0.001838, "grad_norm": 0.28767791390419006, "learning_rate": 1e-05, "loss": 0.0339, "step": 183800 }, { "epoch": 0.001839, "grad_norm": 0.2566281855106354, "learning_rate": 1e-05, "loss": 0.033, "step": 183900 }, { "epoch": 0.00184, "grad_norm": 0.23523253202438354, "learning_rate": 1e-05, "loss": 0.0327, "step": 184000 }, { "epoch": 0.001841, "grad_norm": 0.332202672958374, "learning_rate": 1e-05, "loss": 0.0333, "step": 184100 }, { "epoch": 0.001842, "grad_norm": 0.26818403601646423, "learning_rate": 1e-05, "loss": 0.0329, "step": 184200 }, { "epoch": 0.001843, "grad_norm": 0.27122044563293457, "learning_rate": 1e-05, "loss": 0.0333, "step": 184300 }, { "epoch": 0.001844, "grad_norm": 0.27591198682785034, "learning_rate": 1e-05, "loss": 0.0337, "step": 184400 }, { "epoch": 0.001845, "grad_norm": 0.2676272392272949, "learning_rate": 1e-05, "loss": 0.0332, "step": 184500 }, { "epoch": 0.001846, "grad_norm": 0.31358590722084045, "learning_rate": 1e-05, "loss": 0.0332, "step": 184600 }, { "epoch": 0.001847, "grad_norm": 0.22878064215183258, "learning_rate": 1e-05, "loss": 0.0328, "step": 184700 }, { "epoch": 0.001848, "grad_norm": 0.24278567731380463, "learning_rate": 1e-05, "loss": 0.0332, "step": 184800 }, { "epoch": 0.001849, "grad_norm": 0.2703821659088135, "learning_rate": 1e-05, "loss": 0.0331, "step": 184900 }, { "epoch": 0.00185, "grad_norm": 0.27002954483032227, "learning_rate": 1e-05, "loss": 0.0334, "step": 185000 }, { "epoch": 0.001851, "grad_norm": 0.2585814893245697, "learning_rate": 1e-05, "loss": 0.0335, "step": 185100 }, { "epoch": 0.001852, "grad_norm": 0.28434547781944275, "learning_rate": 1e-05, "loss": 0.0331, "step": 185200 }, { "epoch": 0.001853, "grad_norm": 0.2478020042181015, "learning_rate": 1e-05, "loss": 0.0332, "step": 185300 }, { "epoch": 0.001854, "grad_norm": 0.2303665578365326, "learning_rate": 1e-05, "loss": 0.0331, "step": 185400 }, { "epoch": 0.001855, "grad_norm": 0.25849053263664246, "learning_rate": 1e-05, "loss": 0.0329, "step": 185500 }, { "epoch": 0.001856, "grad_norm": 0.3219095766544342, "learning_rate": 1e-05, "loss": 0.0333, "step": 185600 }, { "epoch": 0.001857, "grad_norm": 0.2839357554912567, "learning_rate": 1e-05, "loss": 0.0329, "step": 185700 }, { "epoch": 0.001858, "grad_norm": 0.3176965117454529, "learning_rate": 1e-05, "loss": 0.0322, "step": 185800 }, { "epoch": 0.001859, "grad_norm": 0.2577592730522156, "learning_rate": 1e-05, "loss": 0.0326, "step": 185900 }, { "epoch": 0.00186, "grad_norm": 0.33923929929733276, "learning_rate": 1e-05, "loss": 0.032, "step": 186000 }, { "epoch": 0.001861, "grad_norm": 0.3180791735649109, "learning_rate": 1e-05, "loss": 0.032, "step": 186100 }, { "epoch": 0.001862, "grad_norm": 0.22188317775726318, "learning_rate": 1e-05, "loss": 0.0326, "step": 186200 }, { "epoch": 0.001863, "grad_norm": 0.24470332264900208, "learning_rate": 1e-05, "loss": 0.0332, "step": 186300 }, { "epoch": 0.001864, "grad_norm": 0.40069741010665894, "learning_rate": 1e-05, "loss": 0.033, "step": 186400 }, { "epoch": 0.001865, "grad_norm": 0.21728770434856415, "learning_rate": 1e-05, "loss": 0.0325, "step": 186500 }, { "epoch": 0.001866, "grad_norm": 0.22420375049114227, "learning_rate": 1e-05, "loss": 0.033, "step": 186600 }, { "epoch": 0.001867, "grad_norm": 0.2249716818332672, "learning_rate": 1e-05, "loss": 0.0335, "step": 186700 }, { "epoch": 0.001868, "grad_norm": 0.2527987062931061, "learning_rate": 1e-05, "loss": 0.0331, "step": 186800 }, { "epoch": 0.001869, "grad_norm": 0.24008408188819885, "learning_rate": 1e-05, "loss": 0.0333, "step": 186900 }, { "epoch": 0.00187, "grad_norm": 0.254478394985199, "learning_rate": 1e-05, "loss": 0.0329, "step": 187000 }, { "epoch": 0.001871, "grad_norm": 0.2203466147184372, "learning_rate": 1e-05, "loss": 0.033, "step": 187100 }, { "epoch": 0.001872, "grad_norm": 0.2345256358385086, "learning_rate": 1e-05, "loss": 0.0327, "step": 187200 }, { "epoch": 0.001873, "grad_norm": 0.21138468384742737, "learning_rate": 1e-05, "loss": 0.0327, "step": 187300 }, { "epoch": 0.001874, "grad_norm": 0.23684906959533691, "learning_rate": 1e-05, "loss": 0.0328, "step": 187400 }, { "epoch": 0.001875, "grad_norm": 0.26282799243927, "learning_rate": 1e-05, "loss": 0.0324, "step": 187500 }, { "epoch": 0.001876, "grad_norm": 0.25969406962394714, "learning_rate": 1e-05, "loss": 0.0333, "step": 187600 }, { "epoch": 0.001877, "grad_norm": 0.2465859204530716, "learning_rate": 1e-05, "loss": 0.0325, "step": 187700 }, { "epoch": 0.001878, "grad_norm": 0.21989566087722778, "learning_rate": 1e-05, "loss": 0.0327, "step": 187800 }, { "epoch": 0.001879, "grad_norm": 0.31595125794410706, "learning_rate": 1e-05, "loss": 0.0323, "step": 187900 }, { "epoch": 0.00188, "grad_norm": 0.21837303042411804, "learning_rate": 1e-05, "loss": 0.0327, "step": 188000 }, { "epoch": 0.001881, "grad_norm": 0.27855274081230164, "learning_rate": 1e-05, "loss": 0.0331, "step": 188100 }, { "epoch": 0.001882, "grad_norm": 0.38418394327163696, "learning_rate": 1e-05, "loss": 0.033, "step": 188200 }, { "epoch": 0.001883, "grad_norm": 0.25750595331192017, "learning_rate": 1e-05, "loss": 0.0322, "step": 188300 }, { "epoch": 0.001884, "grad_norm": 0.26400867104530334, "learning_rate": 1e-05, "loss": 0.0328, "step": 188400 }, { "epoch": 0.001885, "grad_norm": 0.325977087020874, "learning_rate": 1e-05, "loss": 0.0321, "step": 188500 }, { "epoch": 0.001886, "grad_norm": 0.22428768873214722, "learning_rate": 1e-05, "loss": 0.0317, "step": 188600 }, { "epoch": 0.001887, "grad_norm": 0.305190771818161, "learning_rate": 1e-05, "loss": 0.033, "step": 188700 }, { "epoch": 0.001888, "grad_norm": 0.3786349892616272, "learning_rate": 1e-05, "loss": 0.0326, "step": 188800 }, { "epoch": 0.001889, "grad_norm": 0.23940782248973846, "learning_rate": 1e-05, "loss": 0.033, "step": 188900 }, { "epoch": 0.00189, "grad_norm": 0.4181540906429291, "learning_rate": 1e-05, "loss": 0.0328, "step": 189000 }, { "epoch": 0.001891, "grad_norm": 0.2620537281036377, "learning_rate": 1e-05, "loss": 0.0328, "step": 189100 }, { "epoch": 0.001892, "grad_norm": 0.2930894196033478, "learning_rate": 1e-05, "loss": 0.0322, "step": 189200 }, { "epoch": 0.001893, "grad_norm": 0.2942257821559906, "learning_rate": 1e-05, "loss": 0.033, "step": 189300 }, { "epoch": 0.001894, "grad_norm": 0.2582312226295471, "learning_rate": 1e-05, "loss": 0.033, "step": 189400 }, { "epoch": 0.001895, "grad_norm": 0.23726144433021545, "learning_rate": 1e-05, "loss": 0.0323, "step": 189500 }, { "epoch": 0.001896, "grad_norm": 0.2391706109046936, "learning_rate": 1e-05, "loss": 0.0324, "step": 189600 }, { "epoch": 0.001897, "grad_norm": 0.2739484906196594, "learning_rate": 1e-05, "loss": 0.0325, "step": 189700 }, { "epoch": 0.001898, "grad_norm": 0.23339012265205383, "learning_rate": 1e-05, "loss": 0.0331, "step": 189800 }, { "epoch": 0.001899, "grad_norm": 0.23878024518489838, "learning_rate": 1e-05, "loss": 0.0326, "step": 189900 }, { "epoch": 0.0019, "grad_norm": 0.2759780287742615, "learning_rate": 1e-05, "loss": 0.0325, "step": 190000 }, { "epoch": 0.001901, "grad_norm": 0.22625140845775604, "learning_rate": 1e-05, "loss": 0.032, "step": 190100 }, { "epoch": 0.001902, "grad_norm": 0.2552201747894287, "learning_rate": 1e-05, "loss": 0.0331, "step": 190200 }, { "epoch": 0.001903, "grad_norm": 0.31288185715675354, "learning_rate": 1e-05, "loss": 0.0325, "step": 190300 }, { "epoch": 0.001904, "grad_norm": 0.26151400804519653, "learning_rate": 1e-05, "loss": 0.0322, "step": 190400 }, { "epoch": 0.001905, "grad_norm": 0.26803916692733765, "learning_rate": 1e-05, "loss": 0.0331, "step": 190500 }, { "epoch": 0.001906, "grad_norm": 0.23230962455272675, "learning_rate": 1e-05, "loss": 0.0317, "step": 190600 }, { "epoch": 0.001907, "grad_norm": 0.2651875913143158, "learning_rate": 1e-05, "loss": 0.0326, "step": 190700 }, { "epoch": 0.001908, "grad_norm": 0.2327023446559906, "learning_rate": 1e-05, "loss": 0.033, "step": 190800 }, { "epoch": 0.001909, "grad_norm": 0.25830167531967163, "learning_rate": 1e-05, "loss": 0.0325, "step": 190900 }, { "epoch": 0.00191, "grad_norm": 0.23783588409423828, "learning_rate": 1e-05, "loss": 0.0328, "step": 191000 }, { "epoch": 0.001911, "grad_norm": 0.24176934361457825, "learning_rate": 1e-05, "loss": 0.0318, "step": 191100 }, { "epoch": 0.001912, "grad_norm": 0.27731117606163025, "learning_rate": 1e-05, "loss": 0.0321, "step": 191200 }, { "epoch": 0.001913, "grad_norm": 0.2152896225452423, "learning_rate": 1e-05, "loss": 0.0318, "step": 191300 }, { "epoch": 0.001914, "grad_norm": 0.27967289090156555, "learning_rate": 1e-05, "loss": 0.0316, "step": 191400 }, { "epoch": 0.001915, "grad_norm": 0.2576562464237213, "learning_rate": 1e-05, "loss": 0.0328, "step": 191500 }, { "epoch": 0.001916, "grad_norm": 0.24486824870109558, "learning_rate": 1e-05, "loss": 0.0316, "step": 191600 }, { "epoch": 0.001917, "grad_norm": 0.34429627656936646, "learning_rate": 1e-05, "loss": 0.0328, "step": 191700 }, { "epoch": 0.001918, "grad_norm": 0.37388914823532104, "learning_rate": 1e-05, "loss": 0.0325, "step": 191800 }, { "epoch": 0.001919, "grad_norm": 0.32336920499801636, "learning_rate": 1e-05, "loss": 0.0325, "step": 191900 }, { "epoch": 0.00192, "grad_norm": 0.3329275846481323, "learning_rate": 1e-05, "loss": 0.0321, "step": 192000 }, { "epoch": 0.001921, "grad_norm": 0.18974027037620544, "learning_rate": 1e-05, "loss": 0.0329, "step": 192100 }, { "epoch": 0.001922, "grad_norm": 0.2590031623840332, "learning_rate": 1e-05, "loss": 0.0327, "step": 192200 }, { "epoch": 0.001923, "grad_norm": 0.27270475029945374, "learning_rate": 1e-05, "loss": 0.0328, "step": 192300 }, { "epoch": 0.001924, "grad_norm": 0.21917562186717987, "learning_rate": 1e-05, "loss": 0.0325, "step": 192400 }, { "epoch": 0.001925, "grad_norm": 0.2148435115814209, "learning_rate": 1e-05, "loss": 0.0321, "step": 192500 }, { "epoch": 0.001926, "grad_norm": 0.2470512092113495, "learning_rate": 1e-05, "loss": 0.0324, "step": 192600 }, { "epoch": 0.001927, "grad_norm": 0.3500710725784302, "learning_rate": 1e-05, "loss": 0.0326, "step": 192700 }, { "epoch": 0.001928, "grad_norm": 0.23941197991371155, "learning_rate": 1e-05, "loss": 0.0321, "step": 192800 }, { "epoch": 0.001929, "grad_norm": 0.20990531146526337, "learning_rate": 1e-05, "loss": 0.0326, "step": 192900 }, { "epoch": 0.00193, "grad_norm": 0.31436702609062195, "learning_rate": 1e-05, "loss": 0.0326, "step": 193000 }, { "epoch": 0.001931, "grad_norm": 0.2531227469444275, "learning_rate": 1e-05, "loss": 0.0322, "step": 193100 }, { "epoch": 0.001932, "grad_norm": 0.43895450234413147, "learning_rate": 1e-05, "loss": 0.0323, "step": 193200 }, { "epoch": 0.001933, "grad_norm": 0.23874956369400024, "learning_rate": 1e-05, "loss": 0.0324, "step": 193300 }, { "epoch": 0.001934, "grad_norm": 0.27833685278892517, "learning_rate": 1e-05, "loss": 0.0326, "step": 193400 }, { "epoch": 0.001935, "grad_norm": 0.2543386220932007, "learning_rate": 1e-05, "loss": 0.0316, "step": 193500 }, { "epoch": 0.001936, "grad_norm": 0.21738335490226746, "learning_rate": 1e-05, "loss": 0.0317, "step": 193600 }, { "epoch": 0.001937, "grad_norm": 0.2733924090862274, "learning_rate": 1e-05, "loss": 0.0327, "step": 193700 }, { "epoch": 0.001938, "grad_norm": 0.29313164949417114, "learning_rate": 1e-05, "loss": 0.0321, "step": 193800 }, { "epoch": 0.001939, "grad_norm": 0.3227449655532837, "learning_rate": 1e-05, "loss": 0.0334, "step": 193900 }, { "epoch": 0.00194, "grad_norm": 0.2968357503414154, "learning_rate": 1e-05, "loss": 0.0317, "step": 194000 }, { "epoch": 0.001941, "grad_norm": 0.25908568501472473, "learning_rate": 1e-05, "loss": 0.0325, "step": 194100 }, { "epoch": 0.001942, "grad_norm": 0.24762418866157532, "learning_rate": 1e-05, "loss": 0.0319, "step": 194200 }, { "epoch": 0.001943, "grad_norm": 0.2765520513057709, "learning_rate": 1e-05, "loss": 0.0326, "step": 194300 }, { "epoch": 0.001944, "grad_norm": 0.21772415935993195, "learning_rate": 1e-05, "loss": 0.0323, "step": 194400 }, { "epoch": 0.001945, "grad_norm": 0.2840191721916199, "learning_rate": 1e-05, "loss": 0.0327, "step": 194500 }, { "epoch": 0.001946, "grad_norm": 0.2410508096218109, "learning_rate": 1e-05, "loss": 0.0321, "step": 194600 }, { "epoch": 0.001947, "grad_norm": 0.2957497239112854, "learning_rate": 1e-05, "loss": 0.0329, "step": 194700 }, { "epoch": 0.001948, "grad_norm": 0.2740709185600281, "learning_rate": 1e-05, "loss": 0.0319, "step": 194800 }, { "epoch": 0.001949, "grad_norm": 0.2569412589073181, "learning_rate": 1e-05, "loss": 0.0314, "step": 194900 }, { "epoch": 0.00195, "grad_norm": 0.236479252576828, "learning_rate": 1e-05, "loss": 0.0327, "step": 195000 }, { "epoch": 0.001951, "grad_norm": 0.2721831798553467, "learning_rate": 1e-05, "loss": 0.0322, "step": 195100 }, { "epoch": 0.001952, "grad_norm": 0.210642471909523, "learning_rate": 1e-05, "loss": 0.0329, "step": 195200 }, { "epoch": 0.001953, "grad_norm": 0.2308679223060608, "learning_rate": 1e-05, "loss": 0.0317, "step": 195300 }, { "epoch": 0.001954, "grad_norm": 0.3192179203033447, "learning_rate": 1e-05, "loss": 0.0313, "step": 195400 }, { "epoch": 0.001955, "grad_norm": 0.3236648738384247, "learning_rate": 1e-05, "loss": 0.0319, "step": 195500 }, { "epoch": 0.001956, "grad_norm": 0.20895443856716156, "learning_rate": 1e-05, "loss": 0.0323, "step": 195600 }, { "epoch": 0.001957, "grad_norm": 0.3253025412559509, "learning_rate": 1e-05, "loss": 0.0323, "step": 195700 }, { "epoch": 0.001958, "grad_norm": 0.22354339063167572, "learning_rate": 1e-05, "loss": 0.0315, "step": 195800 }, { "epoch": 0.001959, "grad_norm": 0.21652694046497345, "learning_rate": 1e-05, "loss": 0.0324, "step": 195900 }, { "epoch": 0.00196, "grad_norm": 0.2668282091617584, "learning_rate": 1e-05, "loss": 0.0321, "step": 196000 }, { "epoch": 0.001961, "grad_norm": 0.27519479393959045, "learning_rate": 1e-05, "loss": 0.0318, "step": 196100 }, { "epoch": 0.001962, "grad_norm": 0.21463170647621155, "learning_rate": 1e-05, "loss": 0.0318, "step": 196200 }, { "epoch": 0.001963, "grad_norm": 0.3027854263782501, "learning_rate": 1e-05, "loss": 0.0323, "step": 196300 }, { "epoch": 0.001964, "grad_norm": 0.18787100911140442, "learning_rate": 1e-05, "loss": 0.0318, "step": 196400 }, { "epoch": 0.001965, "grad_norm": 0.26937076449394226, "learning_rate": 1e-05, "loss": 0.0311, "step": 196500 }, { "epoch": 0.001966, "grad_norm": 0.2677050530910492, "learning_rate": 1e-05, "loss": 0.0325, "step": 196600 }, { "epoch": 0.001967, "grad_norm": 0.3546921908855438, "learning_rate": 1e-05, "loss": 0.0322, "step": 196700 }, { "epoch": 0.001968, "grad_norm": 0.2790840268135071, "learning_rate": 1e-05, "loss": 0.0315, "step": 196800 }, { "epoch": 0.001969, "grad_norm": 0.3081589639186859, "learning_rate": 1e-05, "loss": 0.0323, "step": 196900 }, { "epoch": 0.00197, "grad_norm": 0.27054348587989807, "learning_rate": 1e-05, "loss": 0.0321, "step": 197000 }, { "epoch": 0.001971, "grad_norm": 0.2294885367155075, "learning_rate": 1e-05, "loss": 0.0319, "step": 197100 }, { "epoch": 0.001972, "grad_norm": 0.32998618483543396, "learning_rate": 1e-05, "loss": 0.0313, "step": 197200 }, { "epoch": 0.001973, "grad_norm": 0.2631285786628723, "learning_rate": 1e-05, "loss": 0.0319, "step": 197300 }, { "epoch": 0.001974, "grad_norm": 0.2692215144634247, "learning_rate": 1e-05, "loss": 0.032, "step": 197400 }, { "epoch": 0.001975, "grad_norm": 0.20809699594974518, "learning_rate": 1e-05, "loss": 0.0318, "step": 197500 }, { "epoch": 0.001976, "grad_norm": 0.2499137669801712, "learning_rate": 1e-05, "loss": 0.0319, "step": 197600 }, { "epoch": 0.001977, "grad_norm": 0.38662397861480713, "learning_rate": 1e-05, "loss": 0.0324, "step": 197700 }, { "epoch": 0.001978, "grad_norm": 0.2577098608016968, "learning_rate": 1e-05, "loss": 0.0328, "step": 197800 }, { "epoch": 0.001979, "grad_norm": 0.31214746832847595, "learning_rate": 1e-05, "loss": 0.032, "step": 197900 }, { "epoch": 0.00198, "grad_norm": 0.271256685256958, "learning_rate": 1e-05, "loss": 0.0324, "step": 198000 }, { "epoch": 0.001981, "grad_norm": 0.3630572557449341, "learning_rate": 1e-05, "loss": 0.0312, "step": 198100 }, { "epoch": 0.001982, "grad_norm": 0.25808608531951904, "learning_rate": 1e-05, "loss": 0.0318, "step": 198200 }, { "epoch": 0.001983, "grad_norm": 0.22318284213542938, "learning_rate": 1e-05, "loss": 0.0323, "step": 198300 }, { "epoch": 0.001984, "grad_norm": 0.25587937235832214, "learning_rate": 1e-05, "loss": 0.0319, "step": 198400 }, { "epoch": 0.001985, "grad_norm": 0.2652595043182373, "learning_rate": 1e-05, "loss": 0.0316, "step": 198500 }, { "epoch": 0.001986, "grad_norm": 0.20676416158676147, "learning_rate": 1e-05, "loss": 0.0318, "step": 198600 }, { "epoch": 0.001987, "grad_norm": 0.2597635090351105, "learning_rate": 1e-05, "loss": 0.0325, "step": 198700 }, { "epoch": 0.001988, "grad_norm": 0.2545841634273529, "learning_rate": 1e-05, "loss": 0.0318, "step": 198800 }, { "epoch": 0.001989, "grad_norm": 0.2772659659385681, "learning_rate": 1e-05, "loss": 0.0322, "step": 198900 }, { "epoch": 0.00199, "grad_norm": 0.28483980894088745, "learning_rate": 1e-05, "loss": 0.0316, "step": 199000 }, { "epoch": 0.001991, "grad_norm": 0.3817993402481079, "learning_rate": 1e-05, "loss": 0.0318, "step": 199100 }, { "epoch": 0.001992, "grad_norm": 0.22074788808822632, "learning_rate": 1e-05, "loss": 0.0319, "step": 199200 }, { "epoch": 0.001993, "grad_norm": 0.23463021218776703, "learning_rate": 1e-05, "loss": 0.032, "step": 199300 }, { "epoch": 0.001994, "grad_norm": 0.30473804473876953, "learning_rate": 1e-05, "loss": 0.0321, "step": 199400 }, { "epoch": 0.001995, "grad_norm": 0.25269436836242676, "learning_rate": 1e-05, "loss": 0.0319, "step": 199500 }, { "epoch": 0.001996, "grad_norm": 0.28520408272743225, "learning_rate": 1e-05, "loss": 0.0321, "step": 199600 }, { "epoch": 0.001997, "grad_norm": 0.19588550925254822, "learning_rate": 1e-05, "loss": 0.0315, "step": 199700 }, { "epoch": 0.001998, "grad_norm": 0.25672486424446106, "learning_rate": 1e-05, "loss": 0.0308, "step": 199800 }, { "epoch": 0.001999, "grad_norm": 0.22884820401668549, "learning_rate": 1e-05, "loss": 0.0318, "step": 199900 }, { "epoch": 0.002, "grad_norm": 0.24550040066242218, "learning_rate": 1e-05, "loss": 0.0324, "step": 200000 }, { "epoch": 0.002, "eval_loss": 0.02840259298682213, "eval_runtime": 170.6321, "eval_samples_per_second": 293.028, "eval_steps_per_second": 18.314, "step": 200000 }, { "epoch": 0.002001, "grad_norm": 0.3713010251522064, "learning_rate": 1e-05, "loss": 0.0314, "step": 200100 }, { "epoch": 0.002002, "grad_norm": 0.294329434633255, "learning_rate": 1e-05, "loss": 0.0319, "step": 200200 }, { "epoch": 0.002003, "grad_norm": 0.2280438244342804, "learning_rate": 1e-05, "loss": 0.032, "step": 200300 }, { "epoch": 0.002004, "grad_norm": 0.22641757130622864, "learning_rate": 1e-05, "loss": 0.0321, "step": 200400 }, { "epoch": 0.002005, "grad_norm": 0.2424178421497345, "learning_rate": 1e-05, "loss": 0.0322, "step": 200500 }, { "epoch": 0.002006, "grad_norm": 0.23992714285850525, "learning_rate": 1e-05, "loss": 0.032, "step": 200600 }, { "epoch": 0.002007, "grad_norm": 0.23278282582759857, "learning_rate": 1e-05, "loss": 0.0321, "step": 200700 }, { "epoch": 0.002008, "grad_norm": 0.2505403757095337, "learning_rate": 1e-05, "loss": 0.0314, "step": 200800 }, { "epoch": 0.002009, "grad_norm": 0.26906564831733704, "learning_rate": 1e-05, "loss": 0.0314, "step": 200900 }, { "epoch": 0.00201, "grad_norm": 0.32744213938713074, "learning_rate": 1e-05, "loss": 0.0315, "step": 201000 }, { "epoch": 0.002011, "grad_norm": 0.268772155046463, "learning_rate": 1e-05, "loss": 0.0315, "step": 201100 }, { "epoch": 0.002012, "grad_norm": 0.265301913022995, "learning_rate": 1e-05, "loss": 0.0317, "step": 201200 }, { "epoch": 0.002013, "grad_norm": 0.28051915764808655, "learning_rate": 1e-05, "loss": 0.0319, "step": 201300 }, { "epoch": 0.002014, "grad_norm": 0.3532405495643616, "learning_rate": 1e-05, "loss": 0.0314, "step": 201400 }, { "epoch": 0.002015, "grad_norm": 0.2259407341480255, "learning_rate": 1e-05, "loss": 0.0319, "step": 201500 }, { "epoch": 0.002016, "grad_norm": 0.1892709881067276, "learning_rate": 1e-05, "loss": 0.032, "step": 201600 }, { "epoch": 0.002017, "grad_norm": 0.36434316635131836, "learning_rate": 1e-05, "loss": 0.0321, "step": 201700 }, { "epoch": 0.002018, "grad_norm": 0.2927830219268799, "learning_rate": 1e-05, "loss": 0.0321, "step": 201800 }, { "epoch": 0.002019, "grad_norm": 0.3251431882381439, "learning_rate": 1e-05, "loss": 0.0321, "step": 201900 }, { "epoch": 0.00202, "grad_norm": 0.36362919211387634, "learning_rate": 1e-05, "loss": 0.0318, "step": 202000 }, { "epoch": 0.002021, "grad_norm": 0.2283594310283661, "learning_rate": 1e-05, "loss": 0.0313, "step": 202100 }, { "epoch": 0.002022, "grad_norm": 0.27608364820480347, "learning_rate": 1e-05, "loss": 0.0316, "step": 202200 }, { "epoch": 0.002023, "grad_norm": 0.2059921771287918, "learning_rate": 1e-05, "loss": 0.0319, "step": 202300 }, { "epoch": 0.002024, "grad_norm": 0.26748126745224, "learning_rate": 1e-05, "loss": 0.031, "step": 202400 }, { "epoch": 0.002025, "grad_norm": 0.29838794469833374, "learning_rate": 1e-05, "loss": 0.0318, "step": 202500 }, { "epoch": 0.002026, "grad_norm": 0.24897854030132294, "learning_rate": 1e-05, "loss": 0.0318, "step": 202600 }, { "epoch": 0.002027, "grad_norm": 0.19141924381256104, "learning_rate": 1e-05, "loss": 0.0318, "step": 202700 }, { "epoch": 0.002028, "grad_norm": 0.31545889377593994, "learning_rate": 1e-05, "loss": 0.032, "step": 202800 }, { "epoch": 0.002029, "grad_norm": 0.2763608694076538, "learning_rate": 1e-05, "loss": 0.0316, "step": 202900 }, { "epoch": 0.00203, "grad_norm": 0.31953635811805725, "learning_rate": 1e-05, "loss": 0.032, "step": 203000 }, { "epoch": 0.002031, "grad_norm": 0.2509276866912842, "learning_rate": 1e-05, "loss": 0.0316, "step": 203100 }, { "epoch": 0.002032, "grad_norm": 0.21137331426143646, "learning_rate": 1e-05, "loss": 0.0313, "step": 203200 }, { "epoch": 0.002033, "grad_norm": 0.2176308035850525, "learning_rate": 1e-05, "loss": 0.0316, "step": 203300 }, { "epoch": 0.002034, "grad_norm": 0.19215302169322968, "learning_rate": 1e-05, "loss": 0.0316, "step": 203400 }, { "epoch": 0.002035, "grad_norm": 0.26203399896621704, "learning_rate": 1e-05, "loss": 0.032, "step": 203500 }, { "epoch": 0.002036, "grad_norm": 0.23399977385997772, "learning_rate": 1e-05, "loss": 0.0318, "step": 203600 }, { "epoch": 0.002037, "grad_norm": 0.22014564275741577, "learning_rate": 1e-05, "loss": 0.0316, "step": 203700 }, { "epoch": 0.002038, "grad_norm": 0.2181723266839981, "learning_rate": 1e-05, "loss": 0.0318, "step": 203800 }, { "epoch": 0.002039, "grad_norm": 0.21330033242702484, "learning_rate": 1e-05, "loss": 0.0319, "step": 203900 }, { "epoch": 0.00204, "grad_norm": 0.24238085746765137, "learning_rate": 1e-05, "loss": 0.0313, "step": 204000 }, { "epoch": 0.002041, "grad_norm": 0.2515721023082733, "learning_rate": 1e-05, "loss": 0.0316, "step": 204100 }, { "epoch": 0.002042, "grad_norm": 0.25421950221061707, "learning_rate": 1e-05, "loss": 0.0315, "step": 204200 }, { "epoch": 0.002043, "grad_norm": 0.24131140112876892, "learning_rate": 1e-05, "loss": 0.0319, "step": 204300 }, { "epoch": 0.002044, "grad_norm": 0.28161683678627014, "learning_rate": 1e-05, "loss": 0.0317, "step": 204400 }, { "epoch": 0.002045, "grad_norm": 0.24693188071250916, "learning_rate": 1e-05, "loss": 0.0313, "step": 204500 }, { "epoch": 0.002046, "grad_norm": 0.2773473560810089, "learning_rate": 1e-05, "loss": 0.0318, "step": 204600 }, { "epoch": 0.002047, "grad_norm": 0.2546924650669098, "learning_rate": 1e-05, "loss": 0.0314, "step": 204700 }, { "epoch": 0.002048, "grad_norm": 0.2774355709552765, "learning_rate": 1e-05, "loss": 0.0316, "step": 204800 }, { "epoch": 0.002049, "grad_norm": 0.29458382725715637, "learning_rate": 1e-05, "loss": 0.0319, "step": 204900 }, { "epoch": 0.00205, "grad_norm": 0.26015326380729675, "learning_rate": 1e-05, "loss": 0.0312, "step": 205000 }, { "epoch": 0.002051, "grad_norm": 0.26257333159446716, "learning_rate": 1e-05, "loss": 0.0321, "step": 205100 }, { "epoch": 0.002052, "grad_norm": 0.28353893756866455, "learning_rate": 1e-05, "loss": 0.0312, "step": 205200 }, { "epoch": 0.002053, "grad_norm": 0.23534321784973145, "learning_rate": 1e-05, "loss": 0.0313, "step": 205300 }, { "epoch": 0.002054, "grad_norm": 0.22987417876720428, "learning_rate": 1e-05, "loss": 0.0311, "step": 205400 }, { "epoch": 0.002055, "grad_norm": 0.28824928402900696, "learning_rate": 1e-05, "loss": 0.0319, "step": 205500 }, { "epoch": 0.002056, "grad_norm": 0.2876168489456177, "learning_rate": 1e-05, "loss": 0.0316, "step": 205600 }, { "epoch": 0.002057, "grad_norm": 0.22605589032173157, "learning_rate": 1e-05, "loss": 0.0312, "step": 205700 }, { "epoch": 0.002058, "grad_norm": 0.254177987575531, "learning_rate": 1e-05, "loss": 0.0319, "step": 205800 }, { "epoch": 0.002059, "grad_norm": 0.22825880348682404, "learning_rate": 1e-05, "loss": 0.0317, "step": 205900 }, { "epoch": 0.00206, "grad_norm": 0.25835809111595154, "learning_rate": 1e-05, "loss": 0.032, "step": 206000 }, { "epoch": 0.002061, "grad_norm": 0.23885509371757507, "learning_rate": 1e-05, "loss": 0.031, "step": 206100 }, { "epoch": 0.002062, "grad_norm": 0.20054255425930023, "learning_rate": 1e-05, "loss": 0.0316, "step": 206200 }, { "epoch": 0.002063, "grad_norm": 0.24497418105602264, "learning_rate": 1e-05, "loss": 0.032, "step": 206300 }, { "epoch": 0.002064, "grad_norm": 0.3156883716583252, "learning_rate": 1e-05, "loss": 0.0314, "step": 206400 }, { "epoch": 0.002065, "grad_norm": 0.21604397892951965, "learning_rate": 1e-05, "loss": 0.0318, "step": 206500 }, { "epoch": 0.002066, "grad_norm": 0.25790107250213623, "learning_rate": 1e-05, "loss": 0.0313, "step": 206600 }, { "epoch": 0.002067, "grad_norm": 0.21217142045497894, "learning_rate": 1e-05, "loss": 0.0307, "step": 206700 }, { "epoch": 0.002068, "grad_norm": 0.2312048077583313, "learning_rate": 1e-05, "loss": 0.0311, "step": 206800 }, { "epoch": 0.002069, "grad_norm": 0.3301689028739929, "learning_rate": 1e-05, "loss": 0.0315, "step": 206900 }, { "epoch": 0.00207, "grad_norm": 0.2856263816356659, "learning_rate": 1e-05, "loss": 0.0316, "step": 207000 }, { "epoch": 0.002071, "grad_norm": 0.20427465438842773, "learning_rate": 1e-05, "loss": 0.0318, "step": 207100 }, { "epoch": 0.002072, "grad_norm": 0.21189464628696442, "learning_rate": 1e-05, "loss": 0.031, "step": 207200 }, { "epoch": 0.002073, "grad_norm": 0.234110027551651, "learning_rate": 1e-05, "loss": 0.0311, "step": 207300 }, { "epoch": 0.002074, "grad_norm": 0.23344667255878448, "learning_rate": 1e-05, "loss": 0.0314, "step": 207400 }, { "epoch": 0.002075, "grad_norm": 0.27144289016723633, "learning_rate": 1e-05, "loss": 0.0312, "step": 207500 }, { "epoch": 0.002076, "grad_norm": 0.31398525834083557, "learning_rate": 1e-05, "loss": 0.0315, "step": 207600 }, { "epoch": 0.002077, "grad_norm": 0.24528735876083374, "learning_rate": 1e-05, "loss": 0.0319, "step": 207700 }, { "epoch": 0.002078, "grad_norm": 0.24895218014717102, "learning_rate": 1e-05, "loss": 0.0312, "step": 207800 }, { "epoch": 0.002079, "grad_norm": 0.20411962270736694, "learning_rate": 1e-05, "loss": 0.0312, "step": 207900 }, { "epoch": 0.00208, "grad_norm": 0.2445424497127533, "learning_rate": 1e-05, "loss": 0.031, "step": 208000 }, { "epoch": 0.002081, "grad_norm": 0.36851388216018677, "learning_rate": 1e-05, "loss": 0.0313, "step": 208100 }, { "epoch": 0.002082, "grad_norm": 0.29357439279556274, "learning_rate": 1e-05, "loss": 0.0318, "step": 208200 }, { "epoch": 0.002083, "grad_norm": 0.2965017557144165, "learning_rate": 1e-05, "loss": 0.0312, "step": 208300 }, { "epoch": 0.002084, "grad_norm": 0.2734137773513794, "learning_rate": 1e-05, "loss": 0.0318, "step": 208400 }, { "epoch": 0.002085, "grad_norm": 0.2179960161447525, "learning_rate": 1e-05, "loss": 0.0318, "step": 208500 }, { "epoch": 0.002086, "grad_norm": 0.27637144923210144, "learning_rate": 1e-05, "loss": 0.0309, "step": 208600 }, { "epoch": 0.002087, "grad_norm": 0.25545233488082886, "learning_rate": 1e-05, "loss": 0.0311, "step": 208700 }, { "epoch": 0.002088, "grad_norm": 0.28326815366744995, "learning_rate": 1e-05, "loss": 0.0311, "step": 208800 }, { "epoch": 0.002089, "grad_norm": 0.25906437635421753, "learning_rate": 1e-05, "loss": 0.0307, "step": 208900 }, { "epoch": 0.00209, "grad_norm": 0.4232887625694275, "learning_rate": 1e-05, "loss": 0.0316, "step": 209000 }, { "epoch": 0.002091, "grad_norm": 0.20014026761054993, "learning_rate": 1e-05, "loss": 0.031, "step": 209100 }, { "epoch": 0.002092, "grad_norm": 0.3111647665500641, "learning_rate": 1e-05, "loss": 0.0321, "step": 209200 }, { "epoch": 0.002093, "grad_norm": 0.27051740884780884, "learning_rate": 1e-05, "loss": 0.0314, "step": 209300 }, { "epoch": 0.002094, "grad_norm": 0.42275404930114746, "learning_rate": 1e-05, "loss": 0.0313, "step": 209400 }, { "epoch": 0.002095, "grad_norm": 0.28600335121154785, "learning_rate": 1e-05, "loss": 0.0315, "step": 209500 }, { "epoch": 0.002096, "grad_norm": 0.20048090815544128, "learning_rate": 1e-05, "loss": 0.0307, "step": 209600 }, { "epoch": 0.002097, "grad_norm": 0.2883336842060089, "learning_rate": 1e-05, "loss": 0.0316, "step": 209700 }, { "epoch": 0.002098, "grad_norm": 0.23330430686473846, "learning_rate": 1e-05, "loss": 0.0322, "step": 209800 }, { "epoch": 0.002099, "grad_norm": 0.24406182765960693, "learning_rate": 1e-05, "loss": 0.0314, "step": 209900 }, { "epoch": 0.0021, "grad_norm": 0.2691080570220947, "learning_rate": 1e-05, "loss": 0.0309, "step": 210000 }, { "epoch": 0.002101, "grad_norm": 0.24835684895515442, "learning_rate": 1e-05, "loss": 0.031, "step": 210100 }, { "epoch": 0.002102, "grad_norm": 0.25190868973731995, "learning_rate": 1e-05, "loss": 0.0317, "step": 210200 }, { "epoch": 0.002103, "grad_norm": 0.3989490270614624, "learning_rate": 1e-05, "loss": 0.0316, "step": 210300 }, { "epoch": 0.002104, "grad_norm": 0.3042777180671692, "learning_rate": 1e-05, "loss": 0.031, "step": 210400 }, { "epoch": 0.002105, "grad_norm": 0.24429039657115936, "learning_rate": 1e-05, "loss": 0.0309, "step": 210500 }, { "epoch": 0.002106, "grad_norm": 0.23127245903015137, "learning_rate": 1e-05, "loss": 0.0311, "step": 210600 }, { "epoch": 0.002107, "grad_norm": 0.2589346468448639, "learning_rate": 1e-05, "loss": 0.0312, "step": 210700 }, { "epoch": 0.002108, "grad_norm": 0.2769606113433838, "learning_rate": 1e-05, "loss": 0.031, "step": 210800 }, { "epoch": 0.002109, "grad_norm": 0.22139549255371094, "learning_rate": 1e-05, "loss": 0.0307, "step": 210900 }, { "epoch": 0.00211, "grad_norm": 0.22128580510616302, "learning_rate": 1e-05, "loss": 0.0315, "step": 211000 }, { "epoch": 0.002111, "grad_norm": 0.3315107524394989, "learning_rate": 1e-05, "loss": 0.0312, "step": 211100 }, { "epoch": 0.002112, "grad_norm": 0.2628828287124634, "learning_rate": 1e-05, "loss": 0.031, "step": 211200 }, { "epoch": 0.002113, "grad_norm": 0.21890178322792053, "learning_rate": 1e-05, "loss": 0.0313, "step": 211300 }, { "epoch": 0.002114, "grad_norm": 0.2530531585216522, "learning_rate": 1e-05, "loss": 0.031, "step": 211400 }, { "epoch": 0.002115, "grad_norm": 0.3045262098312378, "learning_rate": 1e-05, "loss": 0.0309, "step": 211500 }, { "epoch": 0.002116, "grad_norm": 0.40805113315582275, "learning_rate": 1e-05, "loss": 0.031, "step": 211600 }, { "epoch": 0.002117, "grad_norm": 0.301496297121048, "learning_rate": 1e-05, "loss": 0.0307, "step": 211700 }, { "epoch": 0.002118, "grad_norm": 0.20831428468227386, "learning_rate": 1e-05, "loss": 0.0318, "step": 211800 }, { "epoch": 0.002119, "grad_norm": 0.21850167214870453, "learning_rate": 1e-05, "loss": 0.0314, "step": 211900 }, { "epoch": 0.00212, "grad_norm": 0.4092355966567993, "learning_rate": 1e-05, "loss": 0.0313, "step": 212000 }, { "epoch": 0.002121, "grad_norm": 0.3496255874633789, "learning_rate": 1e-05, "loss": 0.0311, "step": 212100 }, { "epoch": 0.002122, "grad_norm": 0.2331617772579193, "learning_rate": 1e-05, "loss": 0.0309, "step": 212200 }, { "epoch": 0.002123, "grad_norm": 0.198782280087471, "learning_rate": 1e-05, "loss": 0.0316, "step": 212300 }, { "epoch": 0.002124, "grad_norm": 0.2033265084028244, "learning_rate": 1e-05, "loss": 0.0302, "step": 212400 }, { "epoch": 0.002125, "grad_norm": 0.2730502188205719, "learning_rate": 1e-05, "loss": 0.0307, "step": 212500 }, { "epoch": 0.002126, "grad_norm": 0.30204683542251587, "learning_rate": 1e-05, "loss": 0.031, "step": 212600 }, { "epoch": 0.002127, "grad_norm": 0.19519208371639252, "learning_rate": 1e-05, "loss": 0.0312, "step": 212700 }, { "epoch": 0.002128, "grad_norm": 0.31348133087158203, "learning_rate": 1e-05, "loss": 0.0304, "step": 212800 }, { "epoch": 0.002129, "grad_norm": 0.2733170688152313, "learning_rate": 1e-05, "loss": 0.0306, "step": 212900 }, { "epoch": 0.00213, "grad_norm": 0.28653883934020996, "learning_rate": 1e-05, "loss": 0.0315, "step": 213000 }, { "epoch": 0.002131, "grad_norm": 0.27193760871887207, "learning_rate": 1e-05, "loss": 0.0308, "step": 213100 }, { "epoch": 0.002132, "grad_norm": 0.26788580417633057, "learning_rate": 1e-05, "loss": 0.0312, "step": 213200 }, { "epoch": 0.002133, "grad_norm": 0.2696700394153595, "learning_rate": 1e-05, "loss": 0.031, "step": 213300 }, { "epoch": 0.002134, "grad_norm": 0.16829313337802887, "learning_rate": 1e-05, "loss": 0.0309, "step": 213400 }, { "epoch": 0.002135, "grad_norm": 0.21741607785224915, "learning_rate": 1e-05, "loss": 0.0313, "step": 213500 }, { "epoch": 0.002136, "grad_norm": 0.3716385066509247, "learning_rate": 1e-05, "loss": 0.0311, "step": 213600 }, { "epoch": 0.002137, "grad_norm": 0.3846241533756256, "learning_rate": 1e-05, "loss": 0.0304, "step": 213700 }, { "epoch": 0.002138, "grad_norm": 0.2845756411552429, "learning_rate": 1e-05, "loss": 0.0314, "step": 213800 }, { "epoch": 0.002139, "grad_norm": 0.2426975518465042, "learning_rate": 1e-05, "loss": 0.031, "step": 213900 }, { "epoch": 0.00214, "grad_norm": 0.2607607841491699, "learning_rate": 1e-05, "loss": 0.0304, "step": 214000 }, { "epoch": 0.002141, "grad_norm": 0.27649804949760437, "learning_rate": 1e-05, "loss": 0.0311, "step": 214100 }, { "epoch": 0.002142, "grad_norm": 0.21441884338855743, "learning_rate": 1e-05, "loss": 0.0306, "step": 214200 }, { "epoch": 0.002143, "grad_norm": 0.2090100646018982, "learning_rate": 1e-05, "loss": 0.0302, "step": 214300 }, { "epoch": 0.002144, "grad_norm": 0.2683175206184387, "learning_rate": 1e-05, "loss": 0.0309, "step": 214400 }, { "epoch": 0.002145, "grad_norm": 0.27921321988105774, "learning_rate": 1e-05, "loss": 0.0312, "step": 214500 }, { "epoch": 0.002146, "grad_norm": 0.21561023592948914, "learning_rate": 1e-05, "loss": 0.0311, "step": 214600 }, { "epoch": 0.002147, "grad_norm": 0.2376718819141388, "learning_rate": 1e-05, "loss": 0.0313, "step": 214700 }, { "epoch": 0.002148, "grad_norm": 0.3396456241607666, "learning_rate": 1e-05, "loss": 0.0313, "step": 214800 }, { "epoch": 0.002149, "grad_norm": 0.2552381157875061, "learning_rate": 1e-05, "loss": 0.0313, "step": 214900 }, { "epoch": 0.00215, "grad_norm": 0.22426673769950867, "learning_rate": 1e-05, "loss": 0.0307, "step": 215000 }, { "epoch": 0.002151, "grad_norm": 0.40186548233032227, "learning_rate": 1e-05, "loss": 0.0309, "step": 215100 }, { "epoch": 0.002152, "grad_norm": 0.2553323209285736, "learning_rate": 1e-05, "loss": 0.0306, "step": 215200 }, { "epoch": 0.002153, "grad_norm": 0.2309524416923523, "learning_rate": 1e-05, "loss": 0.0305, "step": 215300 }, { "epoch": 0.002154, "grad_norm": 0.2798805832862854, "learning_rate": 1e-05, "loss": 0.0311, "step": 215400 }, { "epoch": 0.002155, "grad_norm": 0.3938901722431183, "learning_rate": 1e-05, "loss": 0.0312, "step": 215500 }, { "epoch": 0.002156, "grad_norm": 0.2644768953323364, "learning_rate": 1e-05, "loss": 0.0309, "step": 215600 }, { "epoch": 0.002157, "grad_norm": 0.22452636063098907, "learning_rate": 1e-05, "loss": 0.0304, "step": 215700 }, { "epoch": 0.002158, "grad_norm": 0.23389554023742676, "learning_rate": 1e-05, "loss": 0.0311, "step": 215800 }, { "epoch": 0.002159, "grad_norm": 0.21474826335906982, "learning_rate": 1e-05, "loss": 0.0307, "step": 215900 }, { "epoch": 0.00216, "grad_norm": 0.2629447877407074, "learning_rate": 1e-05, "loss": 0.0303, "step": 216000 }, { "epoch": 0.002161, "grad_norm": 0.21196302771568298, "learning_rate": 1e-05, "loss": 0.0312, "step": 216100 }, { "epoch": 0.002162, "grad_norm": 0.17929323017597198, "learning_rate": 1e-05, "loss": 0.0313, "step": 216200 }, { "epoch": 0.002163, "grad_norm": 0.30359330773353577, "learning_rate": 1e-05, "loss": 0.0308, "step": 216300 }, { "epoch": 0.002164, "grad_norm": 0.21613185107707977, "learning_rate": 1e-05, "loss": 0.031, "step": 216400 }, { "epoch": 0.002165, "grad_norm": 0.27735695242881775, "learning_rate": 1e-05, "loss": 0.0307, "step": 216500 }, { "epoch": 0.002166, "grad_norm": 0.2872326672077179, "learning_rate": 1e-05, "loss": 0.0305, "step": 216600 }, { "epoch": 0.002167, "grad_norm": 0.42893916368484497, "learning_rate": 1e-05, "loss": 0.0304, "step": 216700 }, { "epoch": 0.002168, "grad_norm": 0.24062584340572357, "learning_rate": 1e-05, "loss": 0.0307, "step": 216800 }, { "epoch": 0.002169, "grad_norm": 0.2828642725944519, "learning_rate": 1e-05, "loss": 0.0305, "step": 216900 }, { "epoch": 0.00217, "grad_norm": 0.2787730097770691, "learning_rate": 1e-05, "loss": 0.031, "step": 217000 }, { "epoch": 0.002171, "grad_norm": 0.29275327920913696, "learning_rate": 1e-05, "loss": 0.0314, "step": 217100 }, { "epoch": 0.002172, "grad_norm": 0.30340468883514404, "learning_rate": 1e-05, "loss": 0.0306, "step": 217200 }, { "epoch": 0.002173, "grad_norm": 0.28298419713974, "learning_rate": 1e-05, "loss": 0.0307, "step": 217300 }, { "epoch": 0.002174, "grad_norm": 0.22536629438400269, "learning_rate": 1e-05, "loss": 0.0313, "step": 217400 }, { "epoch": 0.002175, "grad_norm": 0.3355392813682556, "learning_rate": 1e-05, "loss": 0.0311, "step": 217500 }, { "epoch": 0.002176, "grad_norm": 0.23939184844493866, "learning_rate": 1e-05, "loss": 0.0309, "step": 217600 }, { "epoch": 0.002177, "grad_norm": 0.29297277331352234, "learning_rate": 1e-05, "loss": 0.0304, "step": 217700 }, { "epoch": 0.002178, "grad_norm": 0.19180326163768768, "learning_rate": 1e-05, "loss": 0.0303, "step": 217800 }, { "epoch": 0.002179, "grad_norm": 0.4385153651237488, "learning_rate": 1e-05, "loss": 0.0312, "step": 217900 }, { "epoch": 0.00218, "grad_norm": 0.25987300276756287, "learning_rate": 1e-05, "loss": 0.0311, "step": 218000 }, { "epoch": 0.002181, "grad_norm": 0.19591198861598969, "learning_rate": 1e-05, "loss": 0.031, "step": 218100 }, { "epoch": 0.002182, "grad_norm": 0.25740036368370056, "learning_rate": 1e-05, "loss": 0.0311, "step": 218200 }, { "epoch": 0.002183, "grad_norm": 0.29761946201324463, "learning_rate": 1e-05, "loss": 0.0307, "step": 218300 }, { "epoch": 0.002184, "grad_norm": 0.2735758423805237, "learning_rate": 1e-05, "loss": 0.0314, "step": 218400 }, { "epoch": 0.002185, "grad_norm": 0.295699805021286, "learning_rate": 1e-05, "loss": 0.0303, "step": 218500 }, { "epoch": 0.002186, "grad_norm": 0.2229842245578766, "learning_rate": 1e-05, "loss": 0.0302, "step": 218600 }, { "epoch": 0.002187, "grad_norm": 0.27031978964805603, "learning_rate": 1e-05, "loss": 0.0302, "step": 218700 }, { "epoch": 0.002188, "grad_norm": 0.36505091190338135, "learning_rate": 1e-05, "loss": 0.03, "step": 218800 }, { "epoch": 0.002189, "grad_norm": 0.2590303421020508, "learning_rate": 1e-05, "loss": 0.0305, "step": 218900 }, { "epoch": 0.00219, "grad_norm": 0.22334624826908112, "learning_rate": 1e-05, "loss": 0.0307, "step": 219000 }, { "epoch": 0.002191, "grad_norm": 0.24113090336322784, "learning_rate": 1e-05, "loss": 0.0311, "step": 219100 }, { "epoch": 0.002192, "grad_norm": 0.21133147180080414, "learning_rate": 1e-05, "loss": 0.0303, "step": 219200 }, { "epoch": 0.002193, "grad_norm": 0.28187668323516846, "learning_rate": 1e-05, "loss": 0.0302, "step": 219300 }, { "epoch": 0.002194, "grad_norm": 0.22814737260341644, "learning_rate": 1e-05, "loss": 0.0307, "step": 219400 }, { "epoch": 0.002195, "grad_norm": 0.2847799062728882, "learning_rate": 1e-05, "loss": 0.0305, "step": 219500 }, { "epoch": 0.002196, "grad_norm": 0.28008124232292175, "learning_rate": 1e-05, "loss": 0.0304, "step": 219600 }, { "epoch": 0.002197, "grad_norm": 0.2556409537792206, "learning_rate": 1e-05, "loss": 0.031, "step": 219700 }, { "epoch": 0.002198, "grad_norm": 0.22865626215934753, "learning_rate": 1e-05, "loss": 0.0306, "step": 219800 }, { "epoch": 0.002199, "grad_norm": 0.22166462242603302, "learning_rate": 1e-05, "loss": 0.0298, "step": 219900 }, { "epoch": 0.0022, "grad_norm": 0.2650294005870819, "learning_rate": 1e-05, "loss": 0.0305, "step": 220000 }, { "epoch": 0.0022, "eval_loss": 0.0266337301582098, "eval_runtime": 167.6701, "eval_samples_per_second": 298.205, "eval_steps_per_second": 18.638, "step": 220000 }, { "epoch": 0.002201, "grad_norm": 0.278293251991272, "learning_rate": 1e-05, "loss": 0.0312, "step": 220100 }, { "epoch": 0.002202, "grad_norm": 0.20711100101470947, "learning_rate": 1e-05, "loss": 0.0299, "step": 220200 }, { "epoch": 0.002203, "grad_norm": 0.2676500678062439, "learning_rate": 1e-05, "loss": 0.0302, "step": 220300 }, { "epoch": 0.002204, "grad_norm": 0.42296257615089417, "learning_rate": 1e-05, "loss": 0.0309, "step": 220400 }, { "epoch": 0.002205, "grad_norm": 0.3330267667770386, "learning_rate": 1e-05, "loss": 0.0305, "step": 220500 }, { "epoch": 0.002206, "grad_norm": 0.3559701144695282, "learning_rate": 1e-05, "loss": 0.031, "step": 220600 }, { "epoch": 0.002207, "grad_norm": 0.28792017698287964, "learning_rate": 1e-05, "loss": 0.0309, "step": 220700 }, { "epoch": 0.002208, "grad_norm": 0.22370798885822296, "learning_rate": 1e-05, "loss": 0.0308, "step": 220800 }, { "epoch": 0.002209, "grad_norm": 0.22848130762577057, "learning_rate": 1e-05, "loss": 0.0306, "step": 220900 }, { "epoch": 0.00221, "grad_norm": 0.3187582790851593, "learning_rate": 1e-05, "loss": 0.0305, "step": 221000 }, { "epoch": 0.002211, "grad_norm": 0.21967394649982452, "learning_rate": 1e-05, "loss": 0.03, "step": 221100 }, { "epoch": 0.002212, "grad_norm": 0.22291195392608643, "learning_rate": 1e-05, "loss": 0.0307, "step": 221200 }, { "epoch": 0.002213, "grad_norm": 0.2493019849061966, "learning_rate": 1e-05, "loss": 0.0304, "step": 221300 }, { "epoch": 0.002214, "grad_norm": 0.32579416036605835, "learning_rate": 1e-05, "loss": 0.0307, "step": 221400 }, { "epoch": 0.002215, "grad_norm": 0.2671898603439331, "learning_rate": 1e-05, "loss": 0.0304, "step": 221500 }, { "epoch": 0.002216, "grad_norm": 0.21876537799835205, "learning_rate": 1e-05, "loss": 0.03, "step": 221600 }, { "epoch": 0.002217, "grad_norm": 0.1979455202817917, "learning_rate": 1e-05, "loss": 0.0304, "step": 221700 }, { "epoch": 0.002218, "grad_norm": 0.26235881447792053, "learning_rate": 1e-05, "loss": 0.0306, "step": 221800 }, { "epoch": 0.002219, "grad_norm": 0.2997877895832062, "learning_rate": 1e-05, "loss": 0.0309, "step": 221900 }, { "epoch": 0.00222, "grad_norm": 0.26717695593833923, "learning_rate": 1e-05, "loss": 0.0303, "step": 222000 }, { "epoch": 0.002221, "grad_norm": 0.2387426644563675, "learning_rate": 1e-05, "loss": 0.0304, "step": 222100 }, { "epoch": 0.002222, "grad_norm": 0.25602054595947266, "learning_rate": 1e-05, "loss": 0.0311, "step": 222200 }, { "epoch": 0.002223, "grad_norm": 0.24863144755363464, "learning_rate": 1e-05, "loss": 0.0306, "step": 222300 }, { "epoch": 0.002224, "grad_norm": 0.26705580949783325, "learning_rate": 1e-05, "loss": 0.0307, "step": 222400 }, { "epoch": 0.002225, "grad_norm": 0.1753436177968979, "learning_rate": 1e-05, "loss": 0.0304, "step": 222500 }, { "epoch": 0.002226, "grad_norm": 0.25405699014663696, "learning_rate": 1e-05, "loss": 0.0304, "step": 222600 }, { "epoch": 0.002227, "grad_norm": 0.23297768831253052, "learning_rate": 1e-05, "loss": 0.0302, "step": 222700 }, { "epoch": 0.002228, "grad_norm": 0.3197582960128784, "learning_rate": 1e-05, "loss": 0.0304, "step": 222800 }, { "epoch": 0.002229, "grad_norm": 0.2818903923034668, "learning_rate": 1e-05, "loss": 0.0304, "step": 222900 }, { "epoch": 0.00223, "grad_norm": 0.21863234043121338, "learning_rate": 1e-05, "loss": 0.0311, "step": 223000 }, { "epoch": 0.002231, "grad_norm": 0.2764856219291687, "learning_rate": 1e-05, "loss": 0.0311, "step": 223100 }, { "epoch": 0.002232, "grad_norm": 0.3021007776260376, "learning_rate": 1e-05, "loss": 0.0303, "step": 223200 }, { "epoch": 0.002233, "grad_norm": 0.2563741207122803, "learning_rate": 1e-05, "loss": 0.0302, "step": 223300 }, { "epoch": 0.002234, "grad_norm": 0.25011688470840454, "learning_rate": 1e-05, "loss": 0.0305, "step": 223400 }, { "epoch": 0.002235, "grad_norm": 0.21708010137081146, "learning_rate": 1e-05, "loss": 0.0301, "step": 223500 }, { "epoch": 0.002236, "grad_norm": 0.34240832924842834, "learning_rate": 1e-05, "loss": 0.0302, "step": 223600 }, { "epoch": 0.002237, "grad_norm": 0.2996751070022583, "learning_rate": 1e-05, "loss": 0.0303, "step": 223700 }, { "epoch": 0.002238, "grad_norm": 0.2609923779964447, "learning_rate": 1e-05, "loss": 0.03, "step": 223800 }, { "epoch": 0.002239, "grad_norm": 0.21548140048980713, "learning_rate": 1e-05, "loss": 0.0304, "step": 223900 }, { "epoch": 0.00224, "grad_norm": 0.26150786876678467, "learning_rate": 1e-05, "loss": 0.0307, "step": 224000 }, { "epoch": 0.002241, "grad_norm": 0.2945968806743622, "learning_rate": 1e-05, "loss": 0.0305, "step": 224100 }, { "epoch": 0.002242, "grad_norm": 0.2476995438337326, "learning_rate": 1e-05, "loss": 0.0298, "step": 224200 }, { "epoch": 0.002243, "grad_norm": 0.22437891364097595, "learning_rate": 1e-05, "loss": 0.0305, "step": 224300 }, { "epoch": 0.002244, "grad_norm": 0.21513286232948303, "learning_rate": 1e-05, "loss": 0.0295, "step": 224400 }, { "epoch": 0.002245, "grad_norm": 0.23618008196353912, "learning_rate": 1e-05, "loss": 0.0302, "step": 224500 }, { "epoch": 0.002246, "grad_norm": 0.23724280297756195, "learning_rate": 1e-05, "loss": 0.03, "step": 224600 }, { "epoch": 0.002247, "grad_norm": 0.24962781369686127, "learning_rate": 1e-05, "loss": 0.0311, "step": 224700 }, { "epoch": 0.002248, "grad_norm": 0.19250474870204926, "learning_rate": 1e-05, "loss": 0.0308, "step": 224800 }, { "epoch": 0.002249, "grad_norm": 0.2244672328233719, "learning_rate": 1e-05, "loss": 0.0303, "step": 224900 }, { "epoch": 0.00225, "grad_norm": 0.21727219223976135, "learning_rate": 1e-05, "loss": 0.0304, "step": 225000 }, { "epoch": 0.002251, "grad_norm": 0.24451684951782227, "learning_rate": 1e-05, "loss": 0.0311, "step": 225100 }, { "epoch": 0.002252, "grad_norm": 0.31561654806137085, "learning_rate": 1e-05, "loss": 0.0301, "step": 225200 }, { "epoch": 0.002253, "grad_norm": 0.26076367497444153, "learning_rate": 1e-05, "loss": 0.0305, "step": 225300 }, { "epoch": 0.002254, "grad_norm": 0.20927996933460236, "learning_rate": 1e-05, "loss": 0.03, "step": 225400 }, { "epoch": 0.002255, "grad_norm": 0.2806721031665802, "learning_rate": 1e-05, "loss": 0.0305, "step": 225500 }, { "epoch": 0.002256, "grad_norm": 0.21102814376354218, "learning_rate": 1e-05, "loss": 0.0307, "step": 225600 }, { "epoch": 0.002257, "grad_norm": 0.36105602979660034, "learning_rate": 1e-05, "loss": 0.0303, "step": 225700 }, { "epoch": 0.002258, "grad_norm": 0.2698630392551422, "learning_rate": 1e-05, "loss": 0.0296, "step": 225800 }, { "epoch": 0.002259, "grad_norm": 0.2897622287273407, "learning_rate": 1e-05, "loss": 0.0303, "step": 225900 }, { "epoch": 0.00226, "grad_norm": 0.20064426958560944, "learning_rate": 1e-05, "loss": 0.0306, "step": 226000 }, { "epoch": 0.002261, "grad_norm": 0.1857953816652298, "learning_rate": 1e-05, "loss": 0.0299, "step": 226100 }, { "epoch": 0.002262, "grad_norm": 0.2849755883216858, "learning_rate": 1e-05, "loss": 0.0304, "step": 226200 }, { "epoch": 0.002263, "grad_norm": 0.40963149070739746, "learning_rate": 1e-05, "loss": 0.0307, "step": 226300 }, { "epoch": 0.002264, "grad_norm": 0.23681430518627167, "learning_rate": 1e-05, "loss": 0.0303, "step": 226400 }, { "epoch": 0.002265, "grad_norm": 0.25645869970321655, "learning_rate": 1e-05, "loss": 0.0301, "step": 226500 }, { "epoch": 0.002266, "grad_norm": 0.30637863278388977, "learning_rate": 1e-05, "loss": 0.0303, "step": 226600 }, { "epoch": 0.002267, "grad_norm": 0.22620275616645813, "learning_rate": 1e-05, "loss": 0.0303, "step": 226700 }, { "epoch": 0.002268, "grad_norm": 0.32906070351600647, "learning_rate": 1e-05, "loss": 0.0301, "step": 226800 }, { "epoch": 0.002269, "grad_norm": 0.239684596657753, "learning_rate": 1e-05, "loss": 0.03, "step": 226900 }, { "epoch": 0.00227, "grad_norm": 0.24670958518981934, "learning_rate": 1e-05, "loss": 0.0311, "step": 227000 }, { "epoch": 0.002271, "grad_norm": 0.22786454856395721, "learning_rate": 1e-05, "loss": 0.0303, "step": 227100 }, { "epoch": 0.002272, "grad_norm": 0.2280299812555313, "learning_rate": 1e-05, "loss": 0.0308, "step": 227200 }, { "epoch": 0.002273, "grad_norm": 0.27017447352409363, "learning_rate": 1e-05, "loss": 0.0307, "step": 227300 }, { "epoch": 0.002274, "grad_norm": 0.32255983352661133, "learning_rate": 1e-05, "loss": 0.0302, "step": 227400 }, { "epoch": 0.002275, "grad_norm": 0.26582470536231995, "learning_rate": 1e-05, "loss": 0.0299, "step": 227500 }, { "epoch": 0.002276, "grad_norm": 0.3163512647151947, "learning_rate": 1e-05, "loss": 0.0308, "step": 227600 }, { "epoch": 0.002277, "grad_norm": 0.18314936757087708, "learning_rate": 1e-05, "loss": 0.0294, "step": 227700 }, { "epoch": 0.002278, "grad_norm": 0.2451498657464981, "learning_rate": 1e-05, "loss": 0.0303, "step": 227800 }, { "epoch": 0.002279, "grad_norm": 0.213813915848732, "learning_rate": 1e-05, "loss": 0.0302, "step": 227900 }, { "epoch": 0.00228, "grad_norm": 0.34764569997787476, "learning_rate": 1e-05, "loss": 0.03, "step": 228000 }, { "epoch": 0.002281, "grad_norm": 0.29473602771759033, "learning_rate": 1e-05, "loss": 0.0299, "step": 228100 }, { "epoch": 0.002282, "grad_norm": 0.2199229598045349, "learning_rate": 1e-05, "loss": 0.0298, "step": 228200 }, { "epoch": 0.002283, "grad_norm": 0.20880143344402313, "learning_rate": 1e-05, "loss": 0.0307, "step": 228300 }, { "epoch": 0.002284, "grad_norm": 0.2598123848438263, "learning_rate": 1e-05, "loss": 0.0296, "step": 228400 }, { "epoch": 0.002285, "grad_norm": 0.25579914450645447, "learning_rate": 1e-05, "loss": 0.0297, "step": 228500 }, { "epoch": 0.002286, "grad_norm": 0.2361450344324112, "learning_rate": 1e-05, "loss": 0.0298, "step": 228600 }, { "epoch": 0.002287, "grad_norm": 0.25956714153289795, "learning_rate": 1e-05, "loss": 0.0302, "step": 228700 }, { "epoch": 0.002288, "grad_norm": 0.3018101453781128, "learning_rate": 1e-05, "loss": 0.0305, "step": 228800 }, { "epoch": 0.002289, "grad_norm": 0.24171124398708344, "learning_rate": 1e-05, "loss": 0.0303, "step": 228900 }, { "epoch": 0.00229, "grad_norm": 0.24277599155902863, "learning_rate": 1e-05, "loss": 0.0299, "step": 229000 }, { "epoch": 0.002291, "grad_norm": 0.2649170756340027, "learning_rate": 1e-05, "loss": 0.0296, "step": 229100 }, { "epoch": 0.002292, "grad_norm": 0.24409988522529602, "learning_rate": 1e-05, "loss": 0.0299, "step": 229200 }, { "epoch": 0.002293, "grad_norm": 0.27241450548171997, "learning_rate": 1e-05, "loss": 0.0299, "step": 229300 }, { "epoch": 0.002294, "grad_norm": 0.41177889704704285, "learning_rate": 1e-05, "loss": 0.0302, "step": 229400 }, { "epoch": 0.002295, "grad_norm": 0.22077633440494537, "learning_rate": 1e-05, "loss": 0.0303, "step": 229500 }, { "epoch": 0.002296, "grad_norm": 0.2652680277824402, "learning_rate": 1e-05, "loss": 0.0296, "step": 229600 }, { "epoch": 0.002297, "grad_norm": 0.37857919931411743, "learning_rate": 1e-05, "loss": 0.03, "step": 229700 }, { "epoch": 0.002298, "grad_norm": 0.35918357968330383, "learning_rate": 1e-05, "loss": 0.0302, "step": 229800 }, { "epoch": 0.002299, "grad_norm": 0.3768517076969147, "learning_rate": 1e-05, "loss": 0.0292, "step": 229900 }, { "epoch": 0.0023, "grad_norm": 0.2535454332828522, "learning_rate": 1e-05, "loss": 0.0304, "step": 230000 }, { "epoch": 0.002301, "grad_norm": 0.21306835114955902, "learning_rate": 1e-05, "loss": 0.0305, "step": 230100 }, { "epoch": 0.002302, "grad_norm": 0.21607105433940887, "learning_rate": 1e-05, "loss": 0.0295, "step": 230200 }, { "epoch": 0.002303, "grad_norm": 0.31265753507614136, "learning_rate": 1e-05, "loss": 0.03, "step": 230300 }, { "epoch": 0.002304, "grad_norm": 0.20603011548519135, "learning_rate": 1e-05, "loss": 0.0304, "step": 230400 }, { "epoch": 0.002305, "grad_norm": 0.32483819127082825, "learning_rate": 1e-05, "loss": 0.0298, "step": 230500 }, { "epoch": 0.002306, "grad_norm": 0.3106380105018616, "learning_rate": 1e-05, "loss": 0.0298, "step": 230600 }, { "epoch": 0.002307, "grad_norm": 0.24206934869289398, "learning_rate": 1e-05, "loss": 0.0304, "step": 230700 }, { "epoch": 0.002308, "grad_norm": 0.22719141840934753, "learning_rate": 1e-05, "loss": 0.0302, "step": 230800 }, { "epoch": 0.002309, "grad_norm": 0.3945215046405792, "learning_rate": 1e-05, "loss": 0.0299, "step": 230900 }, { "epoch": 0.00231, "grad_norm": 0.3312758207321167, "learning_rate": 1e-05, "loss": 0.0301, "step": 231000 }, { "epoch": 0.002311, "grad_norm": 0.2840631306171417, "learning_rate": 1e-05, "loss": 0.03, "step": 231100 }, { "epoch": 0.002312, "grad_norm": 0.2904289662837982, "learning_rate": 1e-05, "loss": 0.0298, "step": 231200 }, { "epoch": 0.002313, "grad_norm": 0.2176913619041443, "learning_rate": 1e-05, "loss": 0.0298, "step": 231300 }, { "epoch": 0.002314, "grad_norm": 0.25382092595100403, "learning_rate": 1e-05, "loss": 0.0305, "step": 231400 }, { "epoch": 0.002315, "grad_norm": 0.24920842051506042, "learning_rate": 1e-05, "loss": 0.0298, "step": 231500 }, { "epoch": 0.002316, "grad_norm": 0.2275884598493576, "learning_rate": 1e-05, "loss": 0.0292, "step": 231600 }, { "epoch": 0.002317, "grad_norm": 0.16515184938907623, "learning_rate": 1e-05, "loss": 0.0302, "step": 231700 }, { "epoch": 0.002318, "grad_norm": 0.2481859028339386, "learning_rate": 1e-05, "loss": 0.0305, "step": 231800 }, { "epoch": 0.002319, "grad_norm": 0.20166514813899994, "learning_rate": 1e-05, "loss": 0.0301, "step": 231900 }, { "epoch": 0.00232, "grad_norm": 0.23703810572624207, "learning_rate": 1e-05, "loss": 0.0293, "step": 232000 }, { "epoch": 0.002321, "grad_norm": 0.21265992522239685, "learning_rate": 1e-05, "loss": 0.0302, "step": 232100 }, { "epoch": 0.002322, "grad_norm": 0.2748515009880066, "learning_rate": 1e-05, "loss": 0.0301, "step": 232200 }, { "epoch": 0.002323, "grad_norm": 0.1912570297718048, "learning_rate": 1e-05, "loss": 0.0294, "step": 232300 }, { "epoch": 0.002324, "grad_norm": 0.2582959830760956, "learning_rate": 1e-05, "loss": 0.0302, "step": 232400 }, { "epoch": 0.002325, "grad_norm": 0.2270803302526474, "learning_rate": 1e-05, "loss": 0.0299, "step": 232500 }, { "epoch": 0.002326, "grad_norm": 0.3403969407081604, "learning_rate": 1e-05, "loss": 0.0295, "step": 232600 }, { "epoch": 0.002327, "grad_norm": 0.25836843252182007, "learning_rate": 1e-05, "loss": 0.0295, "step": 232700 }, { "epoch": 0.002328, "grad_norm": 0.21023710072040558, "learning_rate": 1e-05, "loss": 0.0301, "step": 232800 }, { "epoch": 0.002329, "grad_norm": 0.2713964581489563, "learning_rate": 1e-05, "loss": 0.03, "step": 232900 }, { "epoch": 0.00233, "grad_norm": 0.1990632712841034, "learning_rate": 1e-05, "loss": 0.0301, "step": 233000 }, { "epoch": 0.002331, "grad_norm": 0.22848600149154663, "learning_rate": 1e-05, "loss": 0.0301, "step": 233100 }, { "epoch": 0.002332, "grad_norm": 0.26198697090148926, "learning_rate": 1e-05, "loss": 0.0295, "step": 233200 }, { "epoch": 0.002333, "grad_norm": 0.2515060603618622, "learning_rate": 1e-05, "loss": 0.0299, "step": 233300 }, { "epoch": 0.002334, "grad_norm": 0.21063554286956787, "learning_rate": 1e-05, "loss": 0.0299, "step": 233400 }, { "epoch": 0.002335, "grad_norm": 0.2577035129070282, "learning_rate": 1e-05, "loss": 0.0299, "step": 233500 }, { "epoch": 0.002336, "grad_norm": 0.28672710061073303, "learning_rate": 1e-05, "loss": 0.03, "step": 233600 }, { "epoch": 0.002337, "grad_norm": 0.4056204855442047, "learning_rate": 1e-05, "loss": 0.0299, "step": 233700 }, { "epoch": 0.002338, "grad_norm": 0.32512199878692627, "learning_rate": 1e-05, "loss": 0.0293, "step": 233800 }, { "epoch": 0.002339, "grad_norm": 0.25995564460754395, "learning_rate": 1e-05, "loss": 0.0294, "step": 233900 }, { "epoch": 0.00234, "grad_norm": 0.2763254940509796, "learning_rate": 1e-05, "loss": 0.0291, "step": 234000 }, { "epoch": 0.002341, "grad_norm": 0.2914525270462036, "learning_rate": 1e-05, "loss": 0.0296, "step": 234100 }, { "epoch": 0.002342, "grad_norm": 0.2813495397567749, "learning_rate": 1e-05, "loss": 0.0298, "step": 234200 }, { "epoch": 0.002343, "grad_norm": 0.23574820160865784, "learning_rate": 1e-05, "loss": 0.0296, "step": 234300 }, { "epoch": 0.002344, "grad_norm": 0.2505672872066498, "learning_rate": 1e-05, "loss": 0.0303, "step": 234400 }, { "epoch": 0.002345, "grad_norm": 0.25891348719596863, "learning_rate": 1e-05, "loss": 0.0295, "step": 234500 }, { "epoch": 0.002346, "grad_norm": 0.2680392861366272, "learning_rate": 1e-05, "loss": 0.0294, "step": 234600 }, { "epoch": 0.002347, "grad_norm": 0.2134384959936142, "learning_rate": 1e-05, "loss": 0.0291, "step": 234700 }, { "epoch": 0.002348, "grad_norm": 0.25188949704170227, "learning_rate": 1e-05, "loss": 0.0299, "step": 234800 }, { "epoch": 0.002349, "grad_norm": 0.2775766849517822, "learning_rate": 1e-05, "loss": 0.0292, "step": 234900 }, { "epoch": 0.00235, "grad_norm": 0.22586295008659363, "learning_rate": 1e-05, "loss": 0.0295, "step": 235000 }, { "epoch": 0.002351, "grad_norm": 0.2862175703048706, "learning_rate": 1e-05, "loss": 0.0306, "step": 235100 }, { "epoch": 0.002352, "grad_norm": 0.25473010540008545, "learning_rate": 1e-05, "loss": 0.0308, "step": 235200 }, { "epoch": 0.002353, "grad_norm": 0.21394848823547363, "learning_rate": 1e-05, "loss": 0.03, "step": 235300 }, { "epoch": 0.002354, "grad_norm": 0.2874073088169098, "learning_rate": 1e-05, "loss": 0.03, "step": 235400 }, { "epoch": 0.002355, "grad_norm": 0.3191010057926178, "learning_rate": 1e-05, "loss": 0.0295, "step": 235500 }, { "epoch": 0.002356, "grad_norm": 0.20335015654563904, "learning_rate": 1e-05, "loss": 0.0301, "step": 235600 }, { "epoch": 0.002357, "grad_norm": 0.29930636286735535, "learning_rate": 1e-05, "loss": 0.0298, "step": 235700 }, { "epoch": 0.002358, "grad_norm": 0.23620493710041046, "learning_rate": 1e-05, "loss": 0.0294, "step": 235800 }, { "epoch": 0.002359, "grad_norm": 0.42103874683380127, "learning_rate": 1e-05, "loss": 0.0293, "step": 235900 }, { "epoch": 0.00236, "grad_norm": 0.2203570455312729, "learning_rate": 1e-05, "loss": 0.0291, "step": 236000 }, { "epoch": 0.002361, "grad_norm": 0.2287958860397339, "learning_rate": 1e-05, "loss": 0.0308, "step": 236100 }, { "epoch": 0.002362, "grad_norm": 0.20824848115444183, "learning_rate": 1e-05, "loss": 0.03, "step": 236200 }, { "epoch": 0.002363, "grad_norm": 0.2804105281829834, "learning_rate": 1e-05, "loss": 0.0297, "step": 236300 }, { "epoch": 0.002364, "grad_norm": 0.2696130573749542, "learning_rate": 1e-05, "loss": 0.0296, "step": 236400 }, { "epoch": 0.002365, "grad_norm": 0.2526698410511017, "learning_rate": 1e-05, "loss": 0.0294, "step": 236500 }, { "epoch": 0.002366, "grad_norm": 0.3089619278907776, "learning_rate": 1e-05, "loss": 0.0301, "step": 236600 }, { "epoch": 0.002367, "grad_norm": 0.24289670586585999, "learning_rate": 1e-05, "loss": 0.0298, "step": 236700 }, { "epoch": 0.002368, "grad_norm": 0.28791072964668274, "learning_rate": 1e-05, "loss": 0.029, "step": 236800 }, { "epoch": 0.002369, "grad_norm": 0.2932698428630829, "learning_rate": 1e-05, "loss": 0.0296, "step": 236900 }, { "epoch": 0.00237, "grad_norm": 0.2269309014081955, "learning_rate": 1e-05, "loss": 0.0298, "step": 237000 }, { "epoch": 0.002371, "grad_norm": 0.2511924207210541, "learning_rate": 1e-05, "loss": 0.03, "step": 237100 }, { "epoch": 0.002372, "grad_norm": 0.25714829564094543, "learning_rate": 1e-05, "loss": 0.0297, "step": 237200 }, { "epoch": 0.002373, "grad_norm": 0.21061991155147552, "learning_rate": 1e-05, "loss": 0.0292, "step": 237300 }, { "epoch": 0.002374, "grad_norm": 0.1974622905254364, "learning_rate": 1e-05, "loss": 0.0297, "step": 237400 }, { "epoch": 0.002375, "grad_norm": 0.2432764172554016, "learning_rate": 1e-05, "loss": 0.0298, "step": 237500 }, { "epoch": 0.002376, "grad_norm": 1.0309339761734009, "learning_rate": 1e-05, "loss": 0.0303, "step": 237600 }, { "epoch": 0.002377, "grad_norm": 0.2751801908016205, "learning_rate": 1e-05, "loss": 0.0302, "step": 237700 }, { "epoch": 0.002378, "grad_norm": 0.1974973976612091, "learning_rate": 1e-05, "loss": 0.0295, "step": 237800 }, { "epoch": 0.002379, "grad_norm": 0.36366137862205505, "learning_rate": 1e-05, "loss": 0.0295, "step": 237900 }, { "epoch": 0.00238, "grad_norm": 0.25252091884613037, "learning_rate": 1e-05, "loss": 0.0291, "step": 238000 }, { "epoch": 0.002381, "grad_norm": 0.20120005309581757, "learning_rate": 1e-05, "loss": 0.0295, "step": 238100 }, { "epoch": 0.002382, "grad_norm": 0.20900118350982666, "learning_rate": 1e-05, "loss": 0.0291, "step": 238200 }, { "epoch": 0.002383, "grad_norm": 0.2689586281776428, "learning_rate": 1e-05, "loss": 0.0297, "step": 238300 }, { "epoch": 0.002384, "grad_norm": 0.2784338593482971, "learning_rate": 1e-05, "loss": 0.0292, "step": 238400 }, { "epoch": 0.002385, "grad_norm": 0.2410692721605301, "learning_rate": 1e-05, "loss": 0.0296, "step": 238500 }, { "epoch": 0.002386, "grad_norm": 0.2596879303455353, "learning_rate": 1e-05, "loss": 0.0297, "step": 238600 }, { "epoch": 0.002387, "grad_norm": 0.260688453912735, "learning_rate": 1e-05, "loss": 0.0294, "step": 238700 }, { "epoch": 0.002388, "grad_norm": 0.38807234168052673, "learning_rate": 1e-05, "loss": 0.03, "step": 238800 }, { "epoch": 0.002389, "grad_norm": 0.2262476235628128, "learning_rate": 1e-05, "loss": 0.029, "step": 238900 }, { "epoch": 0.00239, "grad_norm": 0.2794170677661896, "learning_rate": 1e-05, "loss": 0.0292, "step": 239000 }, { "epoch": 0.002391, "grad_norm": 0.16978326439857483, "learning_rate": 1e-05, "loss": 0.0296, "step": 239100 }, { "epoch": 0.002392, "grad_norm": 0.25167325139045715, "learning_rate": 1e-05, "loss": 0.0298, "step": 239200 }, { "epoch": 0.002393, "grad_norm": 0.2835477888584137, "learning_rate": 1e-05, "loss": 0.0298, "step": 239300 }, { "epoch": 0.002394, "grad_norm": 0.23926566541194916, "learning_rate": 1e-05, "loss": 0.03, "step": 239400 }, { "epoch": 0.002395, "grad_norm": 0.25200363993644714, "learning_rate": 1e-05, "loss": 0.0291, "step": 239500 }, { "epoch": 0.002396, "grad_norm": 0.3402925431728363, "learning_rate": 1e-05, "loss": 0.0295, "step": 239600 }, { "epoch": 0.002397, "grad_norm": 0.25247400999069214, "learning_rate": 1e-05, "loss": 0.0302, "step": 239700 }, { "epoch": 0.002398, "grad_norm": 0.2636953890323639, "learning_rate": 1e-05, "loss": 0.0291, "step": 239800 }, { "epoch": 0.002399, "grad_norm": 0.29208695888519287, "learning_rate": 1e-05, "loss": 0.0291, "step": 239900 }, { "epoch": 0.0024, "grad_norm": 0.20240412652492523, "learning_rate": 1e-05, "loss": 0.0299, "step": 240000 }, { "epoch": 0.0024, "eval_loss": 0.026435546576976776, "eval_runtime": 166.8966, "eval_samples_per_second": 299.587, "eval_steps_per_second": 18.724, "step": 240000 }, { "epoch": 0.002401, "grad_norm": 0.22942417860031128, "learning_rate": 1e-05, "loss": 0.0297, "step": 240100 }, { "epoch": 0.002402, "grad_norm": 0.2450142502784729, "learning_rate": 1e-05, "loss": 0.0294, "step": 240200 }, { "epoch": 0.002403, "grad_norm": 0.31019580364227295, "learning_rate": 1e-05, "loss": 0.0296, "step": 240300 }, { "epoch": 0.002404, "grad_norm": 0.24487605690956116, "learning_rate": 1e-05, "loss": 0.0292, "step": 240400 }, { "epoch": 0.002405, "grad_norm": 0.4314933717250824, "learning_rate": 1e-05, "loss": 0.0297, "step": 240500 }, { "epoch": 0.002406, "grad_norm": 0.20933778584003448, "learning_rate": 1e-05, "loss": 0.0298, "step": 240600 }, { "epoch": 0.002407, "grad_norm": 0.194622203707695, "learning_rate": 1e-05, "loss": 0.0297, "step": 240700 }, { "epoch": 0.002408, "grad_norm": 0.1910499483346939, "learning_rate": 1e-05, "loss": 0.0299, "step": 240800 }, { "epoch": 0.002409, "grad_norm": 0.20666643977165222, "learning_rate": 1e-05, "loss": 0.0295, "step": 240900 }, { "epoch": 0.00241, "grad_norm": 0.25058698654174805, "learning_rate": 1e-05, "loss": 0.0294, "step": 241000 }, { "epoch": 0.002411, "grad_norm": 0.21813885867595673, "learning_rate": 1e-05, "loss": 0.0296, "step": 241100 }, { "epoch": 0.002412, "grad_norm": 0.27979591488838196, "learning_rate": 1e-05, "loss": 0.0296, "step": 241200 }, { "epoch": 0.002413, "grad_norm": 0.2971433401107788, "learning_rate": 1e-05, "loss": 0.0298, "step": 241300 }, { "epoch": 0.002414, "grad_norm": 0.2413114458322525, "learning_rate": 1e-05, "loss": 0.0292, "step": 241400 }, { "epoch": 0.002415, "grad_norm": 0.23285312950611115, "learning_rate": 1e-05, "loss": 0.0293, "step": 241500 }, { "epoch": 0.002416, "grad_norm": 0.2382165938615799, "learning_rate": 1e-05, "loss": 0.0289, "step": 241600 }, { "epoch": 0.002417, "grad_norm": 0.26661425828933716, "learning_rate": 1e-05, "loss": 0.0292, "step": 241700 }, { "epoch": 0.002418, "grad_norm": 0.20393359661102295, "learning_rate": 1e-05, "loss": 0.029, "step": 241800 }, { "epoch": 0.002419, "grad_norm": 0.3801039159297943, "learning_rate": 1e-05, "loss": 0.0288, "step": 241900 }, { "epoch": 0.00242, "grad_norm": 0.2663302719593048, "learning_rate": 1e-05, "loss": 0.0295, "step": 242000 }, { "epoch": 0.002421, "grad_norm": 0.2549029588699341, "learning_rate": 1e-05, "loss": 0.0294, "step": 242100 }, { "epoch": 0.002422, "grad_norm": 0.32529813051223755, "learning_rate": 1e-05, "loss": 0.0294, "step": 242200 }, { "epoch": 0.002423, "grad_norm": 0.27202799916267395, "learning_rate": 1e-05, "loss": 0.0294, "step": 242300 }, { "epoch": 0.002424, "grad_norm": 0.3163006901741028, "learning_rate": 1e-05, "loss": 0.0301, "step": 242400 }, { "epoch": 0.002425, "grad_norm": 0.3038056790828705, "learning_rate": 1e-05, "loss": 0.0287, "step": 242500 }, { "epoch": 0.002426, "grad_norm": 0.30944833159446716, "learning_rate": 1e-05, "loss": 0.0297, "step": 242600 }, { "epoch": 0.002427, "grad_norm": 0.2842150330543518, "learning_rate": 1e-05, "loss": 0.0291, "step": 242700 }, { "epoch": 0.002428, "grad_norm": 0.23013119399547577, "learning_rate": 1e-05, "loss": 0.0292, "step": 242800 }, { "epoch": 0.002429, "grad_norm": 0.22545816004276276, "learning_rate": 1e-05, "loss": 0.0293, "step": 242900 }, { "epoch": 0.00243, "grad_norm": 0.3524695634841919, "learning_rate": 1e-05, "loss": 0.0293, "step": 243000 }, { "epoch": 0.002431, "grad_norm": 0.2935996949672699, "learning_rate": 1e-05, "loss": 0.0297, "step": 243100 }, { "epoch": 0.002432, "grad_norm": 0.17696090042591095, "learning_rate": 1e-05, "loss": 0.0288, "step": 243200 }, { "epoch": 0.002433, "grad_norm": 0.24564161896705627, "learning_rate": 1e-05, "loss": 0.0292, "step": 243300 }, { "epoch": 0.002434, "grad_norm": 0.3347853720188141, "learning_rate": 1e-05, "loss": 0.0289, "step": 243400 }, { "epoch": 0.002435, "grad_norm": 0.3246592879295349, "learning_rate": 1e-05, "loss": 0.0293, "step": 243500 }, { "epoch": 0.002436, "grad_norm": 0.2011803686618805, "learning_rate": 1e-05, "loss": 0.0296, "step": 243600 }, { "epoch": 0.002437, "grad_norm": 0.256133109331131, "learning_rate": 1e-05, "loss": 0.0297, "step": 243700 }, { "epoch": 0.002438, "grad_norm": 0.2820090353488922, "learning_rate": 1e-05, "loss": 0.0291, "step": 243800 }, { "epoch": 0.002439, "grad_norm": 0.17747530341148376, "learning_rate": 1e-05, "loss": 0.0289, "step": 243900 }, { "epoch": 0.00244, "grad_norm": 0.2025347203016281, "learning_rate": 1e-05, "loss": 0.0295, "step": 244000 }, { "epoch": 0.002441, "grad_norm": 0.2269492894411087, "learning_rate": 1e-05, "loss": 0.0294, "step": 244100 }, { "epoch": 0.002442, "grad_norm": 0.26530927419662476, "learning_rate": 1e-05, "loss": 0.0289, "step": 244200 }, { "epoch": 0.002443, "grad_norm": 0.2534483075141907, "learning_rate": 1e-05, "loss": 0.0293, "step": 244300 }, { "epoch": 0.002444, "grad_norm": 0.19797545671463013, "learning_rate": 1e-05, "loss": 0.0296, "step": 244400 }, { "epoch": 0.002445, "grad_norm": 0.26378655433654785, "learning_rate": 1e-05, "loss": 0.0295, "step": 244500 }, { "epoch": 0.002446, "grad_norm": 0.2022930085659027, "learning_rate": 1e-05, "loss": 0.0288, "step": 244600 }, { "epoch": 0.002447, "grad_norm": 0.19641751050949097, "learning_rate": 1e-05, "loss": 0.0292, "step": 244700 }, { "epoch": 0.002448, "grad_norm": 0.2373054027557373, "learning_rate": 1e-05, "loss": 0.0294, "step": 244800 }, { "epoch": 0.002449, "grad_norm": 0.22543640434741974, "learning_rate": 1e-05, "loss": 0.0293, "step": 244900 }, { "epoch": 0.00245, "grad_norm": 0.1844436228275299, "learning_rate": 1e-05, "loss": 0.0292, "step": 245000 }, { "epoch": 0.002451, "grad_norm": 0.2961357831954956, "learning_rate": 1e-05, "loss": 0.0291, "step": 245100 }, { "epoch": 0.002452, "grad_norm": 0.41860923171043396, "learning_rate": 1e-05, "loss": 0.029, "step": 245200 }, { "epoch": 0.002453, "grad_norm": 0.25561535358428955, "learning_rate": 1e-05, "loss": 0.0292, "step": 245300 }, { "epoch": 0.002454, "grad_norm": 0.23748420178890228, "learning_rate": 1e-05, "loss": 0.0293, "step": 245400 }, { "epoch": 0.002455, "grad_norm": 0.26495659351348877, "learning_rate": 1e-05, "loss": 0.0294, "step": 245500 }, { "epoch": 0.002456, "grad_norm": 0.24367013573646545, "learning_rate": 1e-05, "loss": 0.0283, "step": 245600 }, { "epoch": 0.002457, "grad_norm": 0.24345558881759644, "learning_rate": 1e-05, "loss": 0.0292, "step": 245700 }, { "epoch": 0.002458, "grad_norm": 0.22929994761943817, "learning_rate": 1e-05, "loss": 0.0289, "step": 245800 }, { "epoch": 0.002459, "grad_norm": 0.22716623544692993, "learning_rate": 1e-05, "loss": 0.0297, "step": 245900 }, { "epoch": 0.00246, "grad_norm": 0.24323037266731262, "learning_rate": 1e-05, "loss": 0.0292, "step": 246000 }, { "epoch": 0.002461, "grad_norm": 0.23903292417526245, "learning_rate": 1e-05, "loss": 0.0292, "step": 246100 }, { "epoch": 0.002462, "grad_norm": 0.4810551702976227, "learning_rate": 1e-05, "loss": 0.0288, "step": 246200 }, { "epoch": 0.002463, "grad_norm": 0.2350904494524002, "learning_rate": 1e-05, "loss": 0.0297, "step": 246300 }, { "epoch": 0.002464, "grad_norm": 0.2692091166973114, "learning_rate": 1e-05, "loss": 0.0289, "step": 246400 }, { "epoch": 0.002465, "grad_norm": 0.212846577167511, "learning_rate": 1e-05, "loss": 0.0297, "step": 246500 }, { "epoch": 0.002466, "grad_norm": 0.20752248167991638, "learning_rate": 1e-05, "loss": 0.0291, "step": 246600 }, { "epoch": 0.002467, "grad_norm": 0.4195525348186493, "learning_rate": 1e-05, "loss": 0.0297, "step": 246700 }, { "epoch": 0.002468, "grad_norm": 0.26601943373680115, "learning_rate": 1e-05, "loss": 0.0296, "step": 246800 }, { "epoch": 0.002469, "grad_norm": 0.352395236492157, "learning_rate": 1e-05, "loss": 0.0292, "step": 246900 }, { "epoch": 0.00247, "grad_norm": 0.22589793801307678, "learning_rate": 1e-05, "loss": 0.0296, "step": 247000 }, { "epoch": 0.002471, "grad_norm": 0.2576340138912201, "learning_rate": 1e-05, "loss": 0.0295, "step": 247100 }, { "epoch": 0.002472, "grad_norm": 0.24597634375095367, "learning_rate": 1e-05, "loss": 0.0291, "step": 247200 }, { "epoch": 0.002473, "grad_norm": 0.27076464891433716, "learning_rate": 1e-05, "loss": 0.0291, "step": 247300 }, { "epoch": 0.002474, "grad_norm": 0.21371306478977203, "learning_rate": 1e-05, "loss": 0.0297, "step": 247400 }, { "epoch": 0.002475, "grad_norm": 0.2787526547908783, "learning_rate": 1e-05, "loss": 0.0293, "step": 247500 }, { "epoch": 0.002476, "grad_norm": 0.2649995982646942, "learning_rate": 1e-05, "loss": 0.0288, "step": 247600 }, { "epoch": 0.002477, "grad_norm": 0.2270011454820633, "learning_rate": 1e-05, "loss": 0.0295, "step": 247700 }, { "epoch": 0.002478, "grad_norm": 0.24795164167881012, "learning_rate": 1e-05, "loss": 0.0302, "step": 247800 }, { "epoch": 0.002479, "grad_norm": 0.29243603348731995, "learning_rate": 1e-05, "loss": 0.0295, "step": 247900 }, { "epoch": 0.00248, "grad_norm": 0.25839880108833313, "learning_rate": 1e-05, "loss": 0.0296, "step": 248000 }, { "epoch": 0.002481, "grad_norm": 0.2395935207605362, "learning_rate": 1e-05, "loss": 0.0293, "step": 248100 }, { "epoch": 0.002482, "grad_norm": 0.2529207766056061, "learning_rate": 1e-05, "loss": 0.029, "step": 248200 }, { "epoch": 0.002483, "grad_norm": 0.36292538046836853, "learning_rate": 1e-05, "loss": 0.029, "step": 248300 }, { "epoch": 0.002484, "grad_norm": 0.2274818867444992, "learning_rate": 1e-05, "loss": 0.0291, "step": 248400 }, { "epoch": 0.002485, "grad_norm": 0.3007954955101013, "learning_rate": 1e-05, "loss": 0.0287, "step": 248500 }, { "epoch": 0.002486, "grad_norm": 0.20032091438770294, "learning_rate": 1e-05, "loss": 0.0294, "step": 248600 }, { "epoch": 0.002487, "grad_norm": 0.26916471123695374, "learning_rate": 1e-05, "loss": 0.029, "step": 248700 }, { "epoch": 0.002488, "grad_norm": 0.25720733404159546, "learning_rate": 1e-05, "loss": 0.0289, "step": 248800 }, { "epoch": 0.002489, "grad_norm": 0.2644922435283661, "learning_rate": 1e-05, "loss": 0.0291, "step": 248900 }, { "epoch": 0.00249, "grad_norm": 0.3281361758708954, "learning_rate": 1e-05, "loss": 0.0294, "step": 249000 }, { "epoch": 0.002491, "grad_norm": 0.28311294317245483, "learning_rate": 1e-05, "loss": 0.029, "step": 249100 }, { "epoch": 0.002492, "grad_norm": 0.22137269377708435, "learning_rate": 1e-05, "loss": 0.029, "step": 249200 }, { "epoch": 0.002493, "grad_norm": 0.29408779740333557, "learning_rate": 1e-05, "loss": 0.0292, "step": 249300 }, { "epoch": 0.002494, "grad_norm": 0.210012286901474, "learning_rate": 1e-05, "loss": 0.0288, "step": 249400 }, { "epoch": 0.002495, "grad_norm": 0.2690602242946625, "learning_rate": 1e-05, "loss": 0.0291, "step": 249500 }, { "epoch": 0.002496, "grad_norm": 0.2281748354434967, "learning_rate": 1e-05, "loss": 0.0294, "step": 249600 }, { "epoch": 0.002497, "grad_norm": 0.28414419293403625, "learning_rate": 1e-05, "loss": 0.0291, "step": 249700 }, { "epoch": 0.002498, "grad_norm": 0.6935933828353882, "learning_rate": 1e-05, "loss": 0.0295, "step": 249800 }, { "epoch": 0.002499, "grad_norm": 0.22387097775936127, "learning_rate": 1e-05, "loss": 0.0291, "step": 249900 }, { "epoch": 0.0025, "grad_norm": 0.2554456889629364, "learning_rate": 1e-05, "loss": 0.0295, "step": 250000 }, { "epoch": 0.002501, "grad_norm": 0.25942081212997437, "learning_rate": 1e-05, "loss": 0.0292, "step": 250100 }, { "epoch": 0.002502, "grad_norm": 0.251875102519989, "learning_rate": 1e-05, "loss": 0.029, "step": 250200 }, { "epoch": 0.002503, "grad_norm": 0.22729599475860596, "learning_rate": 1e-05, "loss": 0.0288, "step": 250300 }, { "epoch": 0.002504, "grad_norm": 0.23671074211597443, "learning_rate": 1e-05, "loss": 0.0291, "step": 250400 }, { "epoch": 0.002505, "grad_norm": 0.23543670773506165, "learning_rate": 1e-05, "loss": 0.0302, "step": 250500 }, { "epoch": 0.002506, "grad_norm": 0.20938076078891754, "learning_rate": 1e-05, "loss": 0.0286, "step": 250600 }, { "epoch": 0.002507, "grad_norm": 0.3242614269256592, "learning_rate": 1e-05, "loss": 0.0291, "step": 250700 }, { "epoch": 0.002508, "grad_norm": 0.22934499382972717, "learning_rate": 1e-05, "loss": 0.0292, "step": 250800 }, { "epoch": 0.002509, "grad_norm": 0.3817967474460602, "learning_rate": 1e-05, "loss": 0.0293, "step": 250900 }, { "epoch": 0.00251, "grad_norm": 0.24634303152561188, "learning_rate": 1e-05, "loss": 0.0289, "step": 251000 }, { "epoch": 0.002511, "grad_norm": 0.21884410083293915, "learning_rate": 1e-05, "loss": 0.0288, "step": 251100 }, { "epoch": 0.002512, "grad_norm": 0.22617746889591217, "learning_rate": 1e-05, "loss": 0.0285, "step": 251200 }, { "epoch": 0.002513, "grad_norm": 0.20297017693519592, "learning_rate": 1e-05, "loss": 0.0289, "step": 251300 }, { "epoch": 0.002514, "grad_norm": 0.25358349084854126, "learning_rate": 1e-05, "loss": 0.0291, "step": 251400 }, { "epoch": 0.002515, "grad_norm": 0.2551290988922119, "learning_rate": 1e-05, "loss": 0.0289, "step": 251500 }, { "epoch": 0.002516, "grad_norm": 0.21778014302253723, "learning_rate": 1e-05, "loss": 0.029, "step": 251600 }, { "epoch": 0.002517, "grad_norm": 0.2957081198692322, "learning_rate": 1e-05, "loss": 0.0295, "step": 251700 }, { "epoch": 0.002518, "grad_norm": 0.22073987126350403, "learning_rate": 1e-05, "loss": 0.0291, "step": 251800 }, { "epoch": 0.002519, "grad_norm": 0.2087729424238205, "learning_rate": 1e-05, "loss": 0.0287, "step": 251900 }, { "epoch": 0.00252, "grad_norm": 0.25924205780029297, "learning_rate": 1e-05, "loss": 0.0288, "step": 252000 }, { "epoch": 0.002521, "grad_norm": 0.21046370267868042, "learning_rate": 1e-05, "loss": 0.0287, "step": 252100 }, { "epoch": 0.002522, "grad_norm": 0.39730602502822876, "learning_rate": 1e-05, "loss": 0.0293, "step": 252200 }, { "epoch": 0.002523, "grad_norm": 0.21154016256332397, "learning_rate": 1e-05, "loss": 0.0293, "step": 252300 }, { "epoch": 0.002524, "grad_norm": 0.27598458528518677, "learning_rate": 1e-05, "loss": 0.0295, "step": 252400 }, { "epoch": 0.002525, "grad_norm": 0.2395186871290207, "learning_rate": 1e-05, "loss": 0.0292, "step": 252500 }, { "epoch": 0.002526, "grad_norm": 0.22632406651973724, "learning_rate": 1e-05, "loss": 0.0281, "step": 252600 }, { "epoch": 0.002527, "grad_norm": 0.235222727060318, "learning_rate": 1e-05, "loss": 0.0286, "step": 252700 }, { "epoch": 0.002528, "grad_norm": 0.22708271443843842, "learning_rate": 1e-05, "loss": 0.0299, "step": 252800 }, { "epoch": 0.002529, "grad_norm": 0.2553348243236542, "learning_rate": 1e-05, "loss": 0.0289, "step": 252900 }, { "epoch": 0.00253, "grad_norm": 0.27289512753486633, "learning_rate": 1e-05, "loss": 0.0288, "step": 253000 }, { "epoch": 0.002531, "grad_norm": 0.25373581051826477, "learning_rate": 1e-05, "loss": 0.0294, "step": 253100 }, { "epoch": 0.002532, "grad_norm": 0.24083180725574493, "learning_rate": 1e-05, "loss": 0.0291, "step": 253200 }, { "epoch": 0.002533, "grad_norm": 0.2436070442199707, "learning_rate": 1e-05, "loss": 0.0291, "step": 253300 }, { "epoch": 0.002534, "grad_norm": 0.19918474555015564, "learning_rate": 1e-05, "loss": 0.0286, "step": 253400 }, { "epoch": 0.002535, "grad_norm": 0.28222763538360596, "learning_rate": 1e-05, "loss": 0.0291, "step": 253500 }, { "epoch": 0.002536, "grad_norm": 0.23622412979602814, "learning_rate": 1e-05, "loss": 0.0291, "step": 253600 }, { "epoch": 0.002537, "grad_norm": 0.17536179721355438, "learning_rate": 1e-05, "loss": 0.0292, "step": 253700 }, { "epoch": 0.002538, "grad_norm": 0.39509597420692444, "learning_rate": 1e-05, "loss": 0.0286, "step": 253800 }, { "epoch": 0.002539, "grad_norm": 0.2821272313594818, "learning_rate": 1e-05, "loss": 0.0297, "step": 253900 }, { "epoch": 0.00254, "grad_norm": 0.3663839101791382, "learning_rate": 1e-05, "loss": 0.0292, "step": 254000 }, { "epoch": 0.002541, "grad_norm": 0.20842768251895905, "learning_rate": 1e-05, "loss": 0.0293, "step": 254100 }, { "epoch": 0.002542, "grad_norm": 0.20694629848003387, "learning_rate": 1e-05, "loss": 0.0289, "step": 254200 }, { "epoch": 0.002543, "grad_norm": 0.24385510385036469, "learning_rate": 1e-05, "loss": 0.0293, "step": 254300 }, { "epoch": 0.002544, "grad_norm": 0.287412166595459, "learning_rate": 1e-05, "loss": 0.0283, "step": 254400 }, { "epoch": 0.002545, "grad_norm": 0.28677842020988464, "learning_rate": 1e-05, "loss": 0.0282, "step": 254500 }, { "epoch": 0.002546, "grad_norm": 0.21103179454803467, "learning_rate": 1e-05, "loss": 0.0287, "step": 254600 }, { "epoch": 0.002547, "grad_norm": 0.20787681639194489, "learning_rate": 1e-05, "loss": 0.0289, "step": 254700 }, { "epoch": 0.002548, "grad_norm": 0.24427814781665802, "learning_rate": 1e-05, "loss": 0.0289, "step": 254800 }, { "epoch": 0.002549, "grad_norm": 0.2560745179653168, "learning_rate": 1e-05, "loss": 0.0291, "step": 254900 }, { "epoch": 0.00255, "grad_norm": 0.23485979437828064, "learning_rate": 1e-05, "loss": 0.029, "step": 255000 }, { "epoch": 0.002551, "grad_norm": 0.1755056381225586, "learning_rate": 1e-05, "loss": 0.0292, "step": 255100 }, { "epoch": 0.002552, "grad_norm": 0.27200135588645935, "learning_rate": 1e-05, "loss": 0.0289, "step": 255200 }, { "epoch": 0.002553, "grad_norm": 0.2216617912054062, "learning_rate": 1e-05, "loss": 0.0291, "step": 255300 }, { "epoch": 0.002554, "grad_norm": 0.21275901794433594, "learning_rate": 1e-05, "loss": 0.0282, "step": 255400 }, { "epoch": 0.002555, "grad_norm": 0.218951016664505, "learning_rate": 1e-05, "loss": 0.0285, "step": 255500 }, { "epoch": 0.002556, "grad_norm": 0.23367995023727417, "learning_rate": 1e-05, "loss": 0.0291, "step": 255600 }, { "epoch": 0.002557, "grad_norm": 0.18301744759082794, "learning_rate": 1e-05, "loss": 0.0287, "step": 255700 }, { "epoch": 0.002558, "grad_norm": 0.2318456768989563, "learning_rate": 1e-05, "loss": 0.0284, "step": 255800 }, { "epoch": 0.002559, "grad_norm": 0.26336345076560974, "learning_rate": 1e-05, "loss": 0.0288, "step": 255900 }, { "epoch": 0.00256, "grad_norm": 0.17314843833446503, "learning_rate": 1e-05, "loss": 0.029, "step": 256000 }, { "epoch": 0.002561, "grad_norm": 0.211277037858963, "learning_rate": 1e-05, "loss": 0.0282, "step": 256100 }, { "epoch": 0.002562, "grad_norm": 0.21661502122879028, "learning_rate": 1e-05, "loss": 0.0286, "step": 256200 }, { "epoch": 0.002563, "grad_norm": 0.23435957729816437, "learning_rate": 1e-05, "loss": 0.0291, "step": 256300 }, { "epoch": 0.002564, "grad_norm": 0.2809585928916931, "learning_rate": 1e-05, "loss": 0.0287, "step": 256400 }, { "epoch": 0.002565, "grad_norm": 0.21416081488132477, "learning_rate": 1e-05, "loss": 0.028, "step": 256500 }, { "epoch": 0.002566, "grad_norm": 0.22057950496673584, "learning_rate": 1e-05, "loss": 0.0286, "step": 256600 }, { "epoch": 0.002567, "grad_norm": 0.23020265996456146, "learning_rate": 1e-05, "loss": 0.0291, "step": 256700 }, { "epoch": 0.002568, "grad_norm": 0.29537150263786316, "learning_rate": 1e-05, "loss": 0.0288, "step": 256800 }, { "epoch": 0.002569, "grad_norm": 0.2267046719789505, "learning_rate": 1e-05, "loss": 0.0287, "step": 256900 }, { "epoch": 0.00257, "grad_norm": 0.23491130769252777, "learning_rate": 1e-05, "loss": 0.0288, "step": 257000 }, { "epoch": 0.002571, "grad_norm": 0.17680279910564423, "learning_rate": 1e-05, "loss": 0.029, "step": 257100 }, { "epoch": 0.002572, "grad_norm": 0.28517043590545654, "learning_rate": 1e-05, "loss": 0.0288, "step": 257200 }, { "epoch": 0.002573, "grad_norm": 0.23137113451957703, "learning_rate": 1e-05, "loss": 0.0293, "step": 257300 }, { "epoch": 0.002574, "grad_norm": 0.2595881521701813, "learning_rate": 1e-05, "loss": 0.0287, "step": 257400 }, { "epoch": 0.002575, "grad_norm": 0.31294557452201843, "learning_rate": 1e-05, "loss": 0.0285, "step": 257500 }, { "epoch": 0.002576, "grad_norm": 0.20776967704296112, "learning_rate": 1e-05, "loss": 0.0286, "step": 257600 }, { "epoch": 0.002577, "grad_norm": 0.24618488550186157, "learning_rate": 1e-05, "loss": 0.0292, "step": 257700 }, { "epoch": 0.002578, "grad_norm": 0.32695844769477844, "learning_rate": 1e-05, "loss": 0.0288, "step": 257800 }, { "epoch": 0.002579, "grad_norm": 0.27535203099250793, "learning_rate": 1e-05, "loss": 0.0285, "step": 257900 }, { "epoch": 0.00258, "grad_norm": 0.26305317878723145, "learning_rate": 1e-05, "loss": 0.0282, "step": 258000 }, { "epoch": 0.002581, "grad_norm": 0.314491868019104, "learning_rate": 1e-05, "loss": 0.029, "step": 258100 }, { "epoch": 0.002582, "grad_norm": 0.22846883535385132, "learning_rate": 1e-05, "loss": 0.0282, "step": 258200 }, { "epoch": 0.002583, "grad_norm": 0.28324589133262634, "learning_rate": 1e-05, "loss": 0.0285, "step": 258300 }, { "epoch": 0.002584, "grad_norm": 0.2766980230808258, "learning_rate": 1e-05, "loss": 0.0287, "step": 258400 }, { "epoch": 0.002585, "grad_norm": 0.2663501799106598, "learning_rate": 1e-05, "loss": 0.0274, "step": 258500 }, { "epoch": 0.002586, "grad_norm": 0.21719248592853546, "learning_rate": 1e-05, "loss": 0.0282, "step": 258600 }, { "epoch": 0.002587, "grad_norm": 0.25272127985954285, "learning_rate": 1e-05, "loss": 0.0289, "step": 258700 }, { "epoch": 0.002588, "grad_norm": 0.22183309495449066, "learning_rate": 1e-05, "loss": 0.0289, "step": 258800 }, { "epoch": 0.002589, "grad_norm": 0.22783738374710083, "learning_rate": 1e-05, "loss": 0.0291, "step": 258900 }, { "epoch": 0.00259, "grad_norm": 0.2521553039550781, "learning_rate": 1e-05, "loss": 0.0287, "step": 259000 }, { "epoch": 0.002591, "grad_norm": 0.22521817684173584, "learning_rate": 1e-05, "loss": 0.0287, "step": 259100 }, { "epoch": 0.002592, "grad_norm": 0.22021915018558502, "learning_rate": 1e-05, "loss": 0.0282, "step": 259200 }, { "epoch": 0.002593, "grad_norm": 0.2461334615945816, "learning_rate": 1e-05, "loss": 0.0283, "step": 259300 }, { "epoch": 0.002594, "grad_norm": 0.20055550336837769, "learning_rate": 1e-05, "loss": 0.0291, "step": 259400 }, { "epoch": 0.002595, "grad_norm": 0.23955921828746796, "learning_rate": 1e-05, "loss": 0.0291, "step": 259500 }, { "epoch": 0.002596, "grad_norm": 0.37329164147377014, "learning_rate": 1e-05, "loss": 0.0287, "step": 259600 }, { "epoch": 0.002597, "grad_norm": 0.20861756801605225, "learning_rate": 1e-05, "loss": 0.0286, "step": 259700 }, { "epoch": 0.002598, "grad_norm": 0.383291095495224, "learning_rate": 1e-05, "loss": 0.0283, "step": 259800 }, { "epoch": 0.002599, "grad_norm": 0.18318399786949158, "learning_rate": 1e-05, "loss": 0.0287, "step": 259900 }, { "epoch": 0.0026, "grad_norm": 0.24593296647071838, "learning_rate": 1e-05, "loss": 0.0288, "step": 260000 }, { "epoch": 0.0026, "eval_loss": 0.02653355523943901, "eval_runtime": 197.7341, "eval_samples_per_second": 252.865, "eval_steps_per_second": 15.804, "step": 260000 }, { "epoch": 0.002601, "grad_norm": 0.29775065183639526, "learning_rate": 1e-05, "loss": 0.0281, "step": 260100 }, { "epoch": 0.002602, "grad_norm": 0.2955515384674072, "learning_rate": 1e-05, "loss": 0.0283, "step": 260200 }, { "epoch": 0.002603, "grad_norm": 0.25055626034736633, "learning_rate": 1e-05, "loss": 0.0292, "step": 260300 }, { "epoch": 0.002604, "grad_norm": 0.21901902556419373, "learning_rate": 1e-05, "loss": 0.0281, "step": 260400 }, { "epoch": 0.002605, "grad_norm": 0.25109124183654785, "learning_rate": 1e-05, "loss": 0.0287, "step": 260500 }, { "epoch": 0.002606, "grad_norm": 0.25809720158576965, "learning_rate": 1e-05, "loss": 0.0296, "step": 260600 }, { "epoch": 0.002607, "grad_norm": 0.250294953584671, "learning_rate": 1e-05, "loss": 0.0291, "step": 260700 }, { "epoch": 0.002608, "grad_norm": 0.1993354707956314, "learning_rate": 1e-05, "loss": 0.0292, "step": 260800 }, { "epoch": 0.002609, "grad_norm": 0.2547664940357208, "learning_rate": 1e-05, "loss": 0.0282, "step": 260900 }, { "epoch": 0.00261, "grad_norm": 0.26779067516326904, "learning_rate": 1e-05, "loss": 0.0284, "step": 261000 }, { "epoch": 0.002611, "grad_norm": 0.29541540145874023, "learning_rate": 1e-05, "loss": 0.0288, "step": 261100 }, { "epoch": 0.002612, "grad_norm": 0.17150868475437164, "learning_rate": 1e-05, "loss": 0.0282, "step": 261200 }, { "epoch": 0.002613, "grad_norm": 0.32568323612213135, "learning_rate": 1e-05, "loss": 0.0288, "step": 261300 }, { "epoch": 0.002614, "grad_norm": 0.2487480491399765, "learning_rate": 1e-05, "loss": 0.0284, "step": 261400 }, { "epoch": 0.002615, "grad_norm": 0.2246582955121994, "learning_rate": 1e-05, "loss": 0.0282, "step": 261500 }, { "epoch": 0.002616, "grad_norm": 0.2725414037704468, "learning_rate": 1e-05, "loss": 0.0278, "step": 261600 }, { "epoch": 0.002617, "grad_norm": 0.18125036358833313, "learning_rate": 1e-05, "loss": 0.0284, "step": 261700 }, { "epoch": 0.002618, "grad_norm": 0.26665574312210083, "learning_rate": 1e-05, "loss": 0.0287, "step": 261800 }, { "epoch": 0.002619, "grad_norm": 0.3722626566886902, "learning_rate": 1e-05, "loss": 0.0288, "step": 261900 }, { "epoch": 0.00262, "grad_norm": 0.2064317911863327, "learning_rate": 1e-05, "loss": 0.0286, "step": 262000 }, { "epoch": 0.002621, "grad_norm": 0.26095050573349, "learning_rate": 1e-05, "loss": 0.0284, "step": 262100 }, { "epoch": 0.002622, "grad_norm": 0.23117844760417938, "learning_rate": 1e-05, "loss": 0.0289, "step": 262200 }, { "epoch": 0.002623, "grad_norm": 0.2140897512435913, "learning_rate": 1e-05, "loss": 0.0283, "step": 262300 }, { "epoch": 0.002624, "grad_norm": 0.22584262490272522, "learning_rate": 1e-05, "loss": 0.0287, "step": 262400 }, { "epoch": 0.002625, "grad_norm": 0.24476554989814758, "learning_rate": 1e-05, "loss": 0.0285, "step": 262500 }, { "epoch": 0.002626, "grad_norm": 0.26677626371383667, "learning_rate": 1e-05, "loss": 0.0292, "step": 262600 }, { "epoch": 0.002627, "grad_norm": 0.29065394401550293, "learning_rate": 1e-05, "loss": 0.0281, "step": 262700 }, { "epoch": 0.002628, "grad_norm": 0.24917228519916534, "learning_rate": 1e-05, "loss": 0.0286, "step": 262800 }, { "epoch": 0.002629, "grad_norm": 0.37361276149749756, "learning_rate": 1e-05, "loss": 0.0284, "step": 262900 }, { "epoch": 0.00263, "grad_norm": 0.22004340589046478, "learning_rate": 1e-05, "loss": 0.0293, "step": 263000 }, { "epoch": 0.002631, "grad_norm": 0.2763840854167938, "learning_rate": 1e-05, "loss": 0.0285, "step": 263100 }, { "epoch": 0.002632, "grad_norm": 0.25512170791625977, "learning_rate": 1e-05, "loss": 0.0286, "step": 263200 }, { "epoch": 0.002633, "grad_norm": 0.22262750566005707, "learning_rate": 1e-05, "loss": 0.0287, "step": 263300 }, { "epoch": 0.002634, "grad_norm": 0.16845664381980896, "learning_rate": 1e-05, "loss": 0.0278, "step": 263400 }, { "epoch": 0.002635, "grad_norm": 0.23096294701099396, "learning_rate": 1e-05, "loss": 0.0286, "step": 263500 }, { "epoch": 0.002636, "grad_norm": 0.20854704082012177, "learning_rate": 1e-05, "loss": 0.0286, "step": 263600 }, { "epoch": 0.002637, "grad_norm": 0.2030772715806961, "learning_rate": 1e-05, "loss": 0.0286, "step": 263700 }, { "epoch": 0.002638, "grad_norm": 0.28036969900131226, "learning_rate": 1e-05, "loss": 0.0285, "step": 263800 }, { "epoch": 0.002639, "grad_norm": 0.22589927911758423, "learning_rate": 1e-05, "loss": 0.0284, "step": 263900 }, { "epoch": 0.00264, "grad_norm": 0.523493230342865, "learning_rate": 1e-05, "loss": 0.0284, "step": 264000 }, { "epoch": 0.002641, "grad_norm": 0.27162832021713257, "learning_rate": 1e-05, "loss": 0.0282, "step": 264100 }, { "epoch": 0.002642, "grad_norm": 0.20301467180252075, "learning_rate": 1e-05, "loss": 0.0285, "step": 264200 }, { "epoch": 0.002643, "grad_norm": 0.24693261086940765, "learning_rate": 1e-05, "loss": 0.0285, "step": 264300 }, { "epoch": 0.002644, "grad_norm": 0.2571404278278351, "learning_rate": 1e-05, "loss": 0.0285, "step": 264400 }, { "epoch": 0.002645, "grad_norm": 0.26782047748565674, "learning_rate": 1e-05, "loss": 0.0283, "step": 264500 }, { "epoch": 0.002646, "grad_norm": 0.28514477610588074, "learning_rate": 1e-05, "loss": 0.0287, "step": 264600 }, { "epoch": 0.002647, "grad_norm": 0.2359791100025177, "learning_rate": 1e-05, "loss": 0.0282, "step": 264700 }, { "epoch": 0.002648, "grad_norm": 0.24601790308952332, "learning_rate": 1e-05, "loss": 0.0285, "step": 264800 }, { "epoch": 0.002649, "grad_norm": 0.33592164516448975, "learning_rate": 1e-05, "loss": 0.0281, "step": 264900 }, { "epoch": 0.00265, "grad_norm": 0.24267975986003876, "learning_rate": 1e-05, "loss": 0.0285, "step": 265000 }, { "epoch": 0.002651, "grad_norm": 0.39469656348228455, "learning_rate": 1e-05, "loss": 0.028, "step": 265100 }, { "epoch": 0.002652, "grad_norm": 0.26666972041130066, "learning_rate": 1e-05, "loss": 0.028, "step": 265200 }, { "epoch": 0.002653, "grad_norm": 0.2059994339942932, "learning_rate": 1e-05, "loss": 0.0286, "step": 265300 }, { "epoch": 0.002654, "grad_norm": 0.25155967473983765, "learning_rate": 1e-05, "loss": 0.0288, "step": 265400 }, { "epoch": 0.002655, "grad_norm": 0.19045403599739075, "learning_rate": 1e-05, "loss": 0.0278, "step": 265500 }, { "epoch": 0.002656, "grad_norm": 0.16286471486091614, "learning_rate": 1e-05, "loss": 0.0279, "step": 265600 }, { "epoch": 0.002657, "grad_norm": 0.2987249791622162, "learning_rate": 1e-05, "loss": 0.0289, "step": 265700 }, { "epoch": 0.002658, "grad_norm": 0.20036645233631134, "learning_rate": 1e-05, "loss": 0.0281, "step": 265800 }, { "epoch": 0.002659, "grad_norm": 0.37300941348075867, "learning_rate": 1e-05, "loss": 0.0284, "step": 265900 }, { "epoch": 0.00266, "grad_norm": 0.20264463126659393, "learning_rate": 1e-05, "loss": 0.0285, "step": 266000 }, { "epoch": 0.002661, "grad_norm": 0.38917914032936096, "learning_rate": 1e-05, "loss": 0.0287, "step": 266100 }, { "epoch": 0.002662, "grad_norm": 0.2535640597343445, "learning_rate": 1e-05, "loss": 0.0291, "step": 266200 }, { "epoch": 0.002663, "grad_norm": 0.36492958664894104, "learning_rate": 1e-05, "loss": 0.0284, "step": 266300 }, { "epoch": 0.002664, "grad_norm": 0.20698626339435577, "learning_rate": 1e-05, "loss": 0.0285, "step": 266400 }, { "epoch": 0.002665, "grad_norm": 0.24375395476818085, "learning_rate": 1e-05, "loss": 0.0286, "step": 266500 }, { "epoch": 0.002666, "grad_norm": 0.26449406147003174, "learning_rate": 1e-05, "loss": 0.0285, "step": 266600 }, { "epoch": 0.002667, "grad_norm": 0.24753186106681824, "learning_rate": 1e-05, "loss": 0.0285, "step": 266700 }, { "epoch": 0.002668, "grad_norm": 0.1877846121788025, "learning_rate": 1e-05, "loss": 0.0282, "step": 266800 }, { "epoch": 0.002669, "grad_norm": 0.25925901532173157, "learning_rate": 1e-05, "loss": 0.0282, "step": 266900 }, { "epoch": 0.00267, "grad_norm": 0.31501153111457825, "learning_rate": 1e-05, "loss": 0.0283, "step": 267000 }, { "epoch": 0.002671, "grad_norm": 0.2067796289920807, "learning_rate": 1e-05, "loss": 0.0289, "step": 267100 }, { "epoch": 0.002672, "grad_norm": 0.2567298412322998, "learning_rate": 1e-05, "loss": 0.0286, "step": 267200 }, { "epoch": 0.002673, "grad_norm": 0.24496303498744965, "learning_rate": 1e-05, "loss": 0.028, "step": 267300 }, { "epoch": 0.002674, "grad_norm": 0.263375461101532, "learning_rate": 1e-05, "loss": 0.0284, "step": 267400 }, { "epoch": 0.002675, "grad_norm": 0.2139652520418167, "learning_rate": 1e-05, "loss": 0.0286, "step": 267500 }, { "epoch": 0.002676, "grad_norm": 0.27458783984184265, "learning_rate": 1e-05, "loss": 0.0282, "step": 267600 }, { "epoch": 0.002677, "grad_norm": 0.21474601328372955, "learning_rate": 1e-05, "loss": 0.0285, "step": 267700 }, { "epoch": 0.002678, "grad_norm": 0.3362443149089813, "learning_rate": 1e-05, "loss": 0.0274, "step": 267800 }, { "epoch": 0.002679, "grad_norm": 0.2753203809261322, "learning_rate": 1e-05, "loss": 0.0278, "step": 267900 }, { "epoch": 0.00268, "grad_norm": 0.2563859224319458, "learning_rate": 1e-05, "loss": 0.0282, "step": 268000 }, { "epoch": 0.002681, "grad_norm": 0.22170598804950714, "learning_rate": 1e-05, "loss": 0.0285, "step": 268100 }, { "epoch": 0.002682, "grad_norm": 0.2440803050994873, "learning_rate": 1e-05, "loss": 0.0287, "step": 268200 }, { "epoch": 0.002683, "grad_norm": 0.2300553172826767, "learning_rate": 1e-05, "loss": 0.0281, "step": 268300 }, { "epoch": 0.002684, "grad_norm": 0.1753644496202469, "learning_rate": 1e-05, "loss": 0.0283, "step": 268400 }, { "epoch": 0.002685, "grad_norm": 0.27179086208343506, "learning_rate": 1e-05, "loss": 0.0285, "step": 268500 }, { "epoch": 0.002686, "grad_norm": 0.23684775829315186, "learning_rate": 1e-05, "loss": 0.0282, "step": 268600 }, { "epoch": 0.002687, "grad_norm": 0.24493831396102905, "learning_rate": 1e-05, "loss": 0.028, "step": 268700 }, { "epoch": 0.002688, "grad_norm": 0.24448445439338684, "learning_rate": 1e-05, "loss": 0.028, "step": 268800 }, { "epoch": 0.002689, "grad_norm": 0.20923858880996704, "learning_rate": 1e-05, "loss": 0.0282, "step": 268900 }, { "epoch": 0.00269, "grad_norm": 0.21955840289592743, "learning_rate": 1e-05, "loss": 0.0285, "step": 269000 }, { "epoch": 0.002691, "grad_norm": 0.2462361454963684, "learning_rate": 1e-05, "loss": 0.0278, "step": 269100 }, { "epoch": 0.002692, "grad_norm": 0.272726833820343, "learning_rate": 1e-05, "loss": 0.0276, "step": 269200 }, { "epoch": 0.002693, "grad_norm": 0.24248342216014862, "learning_rate": 1e-05, "loss": 0.0281, "step": 269300 }, { "epoch": 0.002694, "grad_norm": 0.2635791599750519, "learning_rate": 1e-05, "loss": 0.0281, "step": 269400 }, { "epoch": 0.002695, "grad_norm": 0.3549865484237671, "learning_rate": 1e-05, "loss": 0.0286, "step": 269500 }, { "epoch": 0.002696, "grad_norm": 0.25576189160346985, "learning_rate": 1e-05, "loss": 0.0285, "step": 269600 }, { "epoch": 0.002697, "grad_norm": 0.2796299457550049, "learning_rate": 1e-05, "loss": 0.0285, "step": 269700 }, { "epoch": 0.002698, "grad_norm": 0.22711028158664703, "learning_rate": 1e-05, "loss": 0.028, "step": 269800 }, { "epoch": 0.002699, "grad_norm": 0.22749483585357666, "learning_rate": 1e-05, "loss": 0.028, "step": 269900 }, { "epoch": 0.0027, "grad_norm": 0.19341079890727997, "learning_rate": 1e-05, "loss": 0.0277, "step": 270000 }, { "epoch": 0.002701, "grad_norm": 0.15243372321128845, "learning_rate": 1e-05, "loss": 0.0284, "step": 270100 }, { "epoch": 0.002702, "grad_norm": 0.21719197928905487, "learning_rate": 1e-05, "loss": 0.0285, "step": 270200 }, { "epoch": 0.002703, "grad_norm": 0.21595461666584015, "learning_rate": 1e-05, "loss": 0.0285, "step": 270300 }, { "epoch": 0.002704, "grad_norm": 0.2993905246257782, "learning_rate": 1e-05, "loss": 0.0287, "step": 270400 }, { "epoch": 0.002705, "grad_norm": 0.1957397162914276, "learning_rate": 1e-05, "loss": 0.0283, "step": 270500 }, { "epoch": 0.002706, "grad_norm": 0.23995761573314667, "learning_rate": 1e-05, "loss": 0.0282, "step": 270600 }, { "epoch": 0.002707, "grad_norm": 0.20960579812526703, "learning_rate": 1e-05, "loss": 0.0283, "step": 270700 }, { "epoch": 0.002708, "grad_norm": 0.23893938958644867, "learning_rate": 1e-05, "loss": 0.0278, "step": 270800 }, { "epoch": 0.002709, "grad_norm": 0.2676232159137726, "learning_rate": 1e-05, "loss": 0.0276, "step": 270900 }, { "epoch": 0.00271, "grad_norm": 0.1870114803314209, "learning_rate": 1e-05, "loss": 0.0275, "step": 271000 }, { "epoch": 0.002711, "grad_norm": 0.2273780256509781, "learning_rate": 1e-05, "loss": 0.0281, "step": 271100 }, { "epoch": 0.002712, "grad_norm": 0.21565380692481995, "learning_rate": 1e-05, "loss": 0.0281, "step": 271200 }, { "epoch": 0.002713, "grad_norm": 0.2061900496482849, "learning_rate": 1e-05, "loss": 0.0285, "step": 271300 }, { "epoch": 0.002714, "grad_norm": 0.4069661796092987, "learning_rate": 1e-05, "loss": 0.0279, "step": 271400 }, { "epoch": 0.002715, "grad_norm": 0.21646632254123688, "learning_rate": 1e-05, "loss": 0.0276, "step": 271500 }, { "epoch": 0.002716, "grad_norm": 0.2505447268486023, "learning_rate": 1e-05, "loss": 0.0281, "step": 271600 }, { "epoch": 0.002717, "grad_norm": 0.24439601600170135, "learning_rate": 1e-05, "loss": 0.0283, "step": 271700 }, { "epoch": 0.002718, "grad_norm": 0.23919597268104553, "learning_rate": 1e-05, "loss": 0.0283, "step": 271800 }, { "epoch": 0.002719, "grad_norm": 0.25718867778778076, "learning_rate": 1e-05, "loss": 0.0281, "step": 271900 }, { "epoch": 0.00272, "grad_norm": 0.17952463030815125, "learning_rate": 1e-05, "loss": 0.0287, "step": 272000 }, { "epoch": 0.002721, "grad_norm": 0.24895118176937103, "learning_rate": 1e-05, "loss": 0.0276, "step": 272100 }, { "epoch": 0.002722, "grad_norm": 0.25426140427589417, "learning_rate": 1e-05, "loss": 0.0281, "step": 272200 }, { "epoch": 0.002723, "grad_norm": 0.21902287006378174, "learning_rate": 1e-05, "loss": 0.0282, "step": 272300 }, { "epoch": 0.002724, "grad_norm": 0.30716031789779663, "learning_rate": 1e-05, "loss": 0.0281, "step": 272400 }, { "epoch": 0.002725, "grad_norm": 0.22142398357391357, "learning_rate": 1e-05, "loss": 0.0282, "step": 272500 }, { "epoch": 0.002726, "grad_norm": 0.26512256264686584, "learning_rate": 1e-05, "loss": 0.0279, "step": 272600 }, { "epoch": 0.002727, "grad_norm": 0.2289360910654068, "learning_rate": 1e-05, "loss": 0.028, "step": 272700 }, { "epoch": 0.002728, "grad_norm": 0.18942196667194366, "learning_rate": 1e-05, "loss": 0.0282, "step": 272800 }, { "epoch": 0.002729, "grad_norm": 0.25587841868400574, "learning_rate": 1e-05, "loss": 0.0285, "step": 272900 }, { "epoch": 0.00273, "grad_norm": 0.23681235313415527, "learning_rate": 1e-05, "loss": 0.0278, "step": 273000 }, { "epoch": 0.002731, "grad_norm": 0.28952503204345703, "learning_rate": 1e-05, "loss": 0.0276, "step": 273100 }, { "epoch": 0.002732, "grad_norm": 0.21536293625831604, "learning_rate": 1e-05, "loss": 0.0284, "step": 273200 }, { "epoch": 0.002733, "grad_norm": 0.174167662858963, "learning_rate": 1e-05, "loss": 0.0285, "step": 273300 }, { "epoch": 0.002734, "grad_norm": 0.25015413761138916, "learning_rate": 1e-05, "loss": 0.0276, "step": 273400 }, { "epoch": 0.002735, "grad_norm": 0.33368438482284546, "learning_rate": 1e-05, "loss": 0.0276, "step": 273500 }, { "epoch": 0.002736, "grad_norm": 0.31301718950271606, "learning_rate": 1e-05, "loss": 0.0282, "step": 273600 }, { "epoch": 0.002737, "grad_norm": 0.3514457046985626, "learning_rate": 1e-05, "loss": 0.0276, "step": 273700 }, { "epoch": 0.002738, "grad_norm": 0.2791360020637512, "learning_rate": 1e-05, "loss": 0.0284, "step": 273800 }, { "epoch": 0.002739, "grad_norm": 0.22541259229183197, "learning_rate": 1e-05, "loss": 0.0278, "step": 273900 }, { "epoch": 0.00274, "grad_norm": 0.2533418536186218, "learning_rate": 1e-05, "loss": 0.0281, "step": 274000 }, { "epoch": 0.002741, "grad_norm": 0.2923007011413574, "learning_rate": 1e-05, "loss": 0.0278, "step": 274100 }, { "epoch": 0.002742, "grad_norm": 0.3561795651912689, "learning_rate": 1e-05, "loss": 0.0281, "step": 274200 }, { "epoch": 0.002743, "grad_norm": 0.21111203730106354, "learning_rate": 1e-05, "loss": 0.0282, "step": 274300 }, { "epoch": 0.002744, "grad_norm": 0.3596116304397583, "learning_rate": 1e-05, "loss": 0.028, "step": 274400 }, { "epoch": 0.002745, "grad_norm": 0.22175128757953644, "learning_rate": 1e-05, "loss": 0.0284, "step": 274500 }, { "epoch": 0.002746, "grad_norm": 0.22931748628616333, "learning_rate": 1e-05, "loss": 0.0279, "step": 274600 }, { "epoch": 0.002747, "grad_norm": 0.19572947919368744, "learning_rate": 1e-05, "loss": 0.0275, "step": 274700 }, { "epoch": 0.002748, "grad_norm": 0.20988529920578003, "learning_rate": 1e-05, "loss": 0.0279, "step": 274800 }, { "epoch": 0.002749, "grad_norm": 0.20425492525100708, "learning_rate": 1e-05, "loss": 0.0276, "step": 274900 }, { "epoch": 0.00275, "grad_norm": 0.21900810301303864, "learning_rate": 1e-05, "loss": 0.0273, "step": 275000 }, { "epoch": 0.002751, "grad_norm": 0.2884361445903778, "learning_rate": 1e-05, "loss": 0.0285, "step": 275100 }, { "epoch": 0.002752, "grad_norm": 0.23148943483829498, "learning_rate": 1e-05, "loss": 0.028, "step": 275200 }, { "epoch": 0.002753, "grad_norm": 0.2911067306995392, "learning_rate": 1e-05, "loss": 0.0271, "step": 275300 }, { "epoch": 0.002754, "grad_norm": 0.27673661708831787, "learning_rate": 1e-05, "loss": 0.0282, "step": 275400 }, { "epoch": 0.002755, "grad_norm": 0.2373129427433014, "learning_rate": 1e-05, "loss": 0.0273, "step": 275500 }, { "epoch": 0.002756, "grad_norm": 0.23336221277713776, "learning_rate": 1e-05, "loss": 0.0282, "step": 275600 }, { "epoch": 0.002757, "grad_norm": 0.25763288140296936, "learning_rate": 1e-05, "loss": 0.0278, "step": 275700 }, { "epoch": 0.002758, "grad_norm": 0.23855599761009216, "learning_rate": 1e-05, "loss": 0.0277, "step": 275800 }, { "epoch": 0.002759, "grad_norm": 0.23386619985103607, "learning_rate": 1e-05, "loss": 0.028, "step": 275900 }, { "epoch": 0.00276, "grad_norm": 0.19982677698135376, "learning_rate": 1e-05, "loss": 0.0276, "step": 276000 }, { "epoch": 0.002761, "grad_norm": 0.25872841477394104, "learning_rate": 1e-05, "loss": 0.0277, "step": 276100 }, { "epoch": 0.002762, "grad_norm": 0.33352193236351013, "learning_rate": 1e-05, "loss": 0.0274, "step": 276200 }, { "epoch": 0.002763, "grad_norm": 0.2641548812389374, "learning_rate": 1e-05, "loss": 0.0276, "step": 276300 }, { "epoch": 0.002764, "grad_norm": 0.23768527805805206, "learning_rate": 1e-05, "loss": 0.0285, "step": 276400 }, { "epoch": 0.002765, "grad_norm": 0.24344487488269806, "learning_rate": 1e-05, "loss": 0.028, "step": 276500 }, { "epoch": 0.002766, "grad_norm": 0.2889992296695709, "learning_rate": 1e-05, "loss": 0.0278, "step": 276600 }, { "epoch": 0.002767, "grad_norm": 0.24662598967552185, "learning_rate": 1e-05, "loss": 0.0281, "step": 276700 }, { "epoch": 0.002768, "grad_norm": 0.2458844929933548, "learning_rate": 1e-05, "loss": 0.0284, "step": 276800 }, { "epoch": 0.002769, "grad_norm": 0.25185126066207886, "learning_rate": 1e-05, "loss": 0.028, "step": 276900 }, { "epoch": 0.00277, "grad_norm": 0.28331297636032104, "learning_rate": 1e-05, "loss": 0.0278, "step": 277000 }, { "epoch": 0.002771, "grad_norm": 0.22743336856365204, "learning_rate": 1e-05, "loss": 0.0277, "step": 277100 }, { "epoch": 0.002772, "grad_norm": 0.26660245656967163, "learning_rate": 1e-05, "loss": 0.0282, "step": 277200 }, { "epoch": 0.002773, "grad_norm": 0.2156538963317871, "learning_rate": 1e-05, "loss": 0.0272, "step": 277300 }, { "epoch": 0.002774, "grad_norm": 0.2089371383190155, "learning_rate": 1e-05, "loss": 0.0279, "step": 277400 }, { "epoch": 0.002775, "grad_norm": 0.22323653101921082, "learning_rate": 1e-05, "loss": 0.0277, "step": 277500 }, { "epoch": 0.002776, "grad_norm": 0.2258116751909256, "learning_rate": 1e-05, "loss": 0.0278, "step": 277600 }, { "epoch": 0.002777, "grad_norm": 0.24797260761260986, "learning_rate": 1e-05, "loss": 0.0277, "step": 277700 }, { "epoch": 0.002778, "grad_norm": 0.21658116579055786, "learning_rate": 1e-05, "loss": 0.0284, "step": 277800 }, { "epoch": 0.002779, "grad_norm": 0.18194548785686493, "learning_rate": 1e-05, "loss": 0.0275, "step": 277900 }, { "epoch": 0.00278, "grad_norm": 0.2846522629261017, "learning_rate": 1e-05, "loss": 0.0281, "step": 278000 }, { "epoch": 0.002781, "grad_norm": 0.18160736560821533, "learning_rate": 1e-05, "loss": 0.0277, "step": 278100 }, { "epoch": 0.002782, "grad_norm": 0.267779141664505, "learning_rate": 1e-05, "loss": 0.0282, "step": 278200 }, { "epoch": 0.002783, "grad_norm": 0.19080890715122223, "learning_rate": 1e-05, "loss": 0.0276, "step": 278300 }, { "epoch": 0.002784, "grad_norm": 0.24349220097064972, "learning_rate": 1e-05, "loss": 0.0276, "step": 278400 }, { "epoch": 0.002785, "grad_norm": 0.2594900131225586, "learning_rate": 1e-05, "loss": 0.0273, "step": 278500 }, { "epoch": 0.002786, "grad_norm": 0.25156500935554504, "learning_rate": 1e-05, "loss": 0.0276, "step": 278600 }, { "epoch": 0.002787, "grad_norm": 0.2475072294473648, "learning_rate": 1e-05, "loss": 0.0275, "step": 278700 }, { "epoch": 0.002788, "grad_norm": 0.22083261609077454, "learning_rate": 1e-05, "loss": 0.0281, "step": 278800 }, { "epoch": 0.002789, "grad_norm": 0.22958053648471832, "learning_rate": 1e-05, "loss": 0.0279, "step": 278900 }, { "epoch": 0.00279, "grad_norm": 0.22086703777313232, "learning_rate": 1e-05, "loss": 0.0285, "step": 279000 }, { "epoch": 0.002791, "grad_norm": 0.22358830273151398, "learning_rate": 1e-05, "loss": 0.0276, "step": 279100 }, { "epoch": 0.002792, "grad_norm": 0.20372091233730316, "learning_rate": 1e-05, "loss": 0.0277, "step": 279200 }, { "epoch": 0.002793, "grad_norm": 0.21077501773834229, "learning_rate": 1e-05, "loss": 0.0276, "step": 279300 }, { "epoch": 0.002794, "grad_norm": 0.2685248553752899, "learning_rate": 1e-05, "loss": 0.0278, "step": 279400 }, { "epoch": 0.002795, "grad_norm": 0.269519180059433, "learning_rate": 1e-05, "loss": 0.028, "step": 279500 }, { "epoch": 0.002796, "grad_norm": 0.20004260540008545, "learning_rate": 1e-05, "loss": 0.0275, "step": 279600 }, { "epoch": 0.002797, "grad_norm": 0.2146834433078766, "learning_rate": 1e-05, "loss": 0.0277, "step": 279700 }, { "epoch": 0.002798, "grad_norm": 0.2739872336387634, "learning_rate": 1e-05, "loss": 0.0278, "step": 279800 }, { "epoch": 0.002799, "grad_norm": 0.22957751154899597, "learning_rate": 1e-05, "loss": 0.028, "step": 279900 }, { "epoch": 0.0028, "grad_norm": 0.19748975336551666, "learning_rate": 1e-05, "loss": 0.0274, "step": 280000 }, { "epoch": 0.0028, "eval_loss": 0.024627605453133583, "eval_runtime": 170.3994, "eval_samples_per_second": 293.428, "eval_steps_per_second": 18.339, "step": 280000 }, { "epoch": 0.002801, "grad_norm": 0.29110708832740784, "learning_rate": 1e-05, "loss": 0.0277, "step": 280100 }, { "epoch": 0.002802, "grad_norm": 0.25498759746551514, "learning_rate": 1e-05, "loss": 0.0277, "step": 280200 }, { "epoch": 0.002803, "grad_norm": 0.18197031319141388, "learning_rate": 1e-05, "loss": 0.0279, "step": 280300 }, { "epoch": 0.002804, "grad_norm": 0.3786429464817047, "learning_rate": 1e-05, "loss": 0.0281, "step": 280400 }, { "epoch": 0.002805, "grad_norm": 0.286188006401062, "learning_rate": 1e-05, "loss": 0.0281, "step": 280500 }, { "epoch": 0.002806, "grad_norm": 0.22872619330883026, "learning_rate": 1e-05, "loss": 0.0277, "step": 280600 }, { "epoch": 0.002807, "grad_norm": 0.20527757704257965, "learning_rate": 1e-05, "loss": 0.0277, "step": 280700 }, { "epoch": 0.002808, "grad_norm": 0.253246009349823, "learning_rate": 1e-05, "loss": 0.0272, "step": 280800 }, { "epoch": 0.002809, "grad_norm": 0.25252291560173035, "learning_rate": 1e-05, "loss": 0.0277, "step": 280900 }, { "epoch": 0.00281, "grad_norm": 0.26235491037368774, "learning_rate": 1e-05, "loss": 0.028, "step": 281000 }, { "epoch": 0.002811, "grad_norm": 0.16507574915885925, "learning_rate": 1e-05, "loss": 0.0276, "step": 281100 }, { "epoch": 0.002812, "grad_norm": 0.1826183944940567, "learning_rate": 1e-05, "loss": 0.0278, "step": 281200 }, { "epoch": 0.002813, "grad_norm": 0.24809186160564423, "learning_rate": 1e-05, "loss": 0.0278, "step": 281300 }, { "epoch": 0.002814, "grad_norm": 0.179380863904953, "learning_rate": 1e-05, "loss": 0.0282, "step": 281400 }, { "epoch": 0.002815, "grad_norm": 0.22074542939662933, "learning_rate": 1e-05, "loss": 0.0276, "step": 281500 }, { "epoch": 0.002816, "grad_norm": 0.22184789180755615, "learning_rate": 1e-05, "loss": 0.0275, "step": 281600 }, { "epoch": 0.002817, "grad_norm": 0.21415428817272186, "learning_rate": 1e-05, "loss": 0.0274, "step": 281700 }, { "epoch": 0.002818, "grad_norm": 0.2831111550331116, "learning_rate": 1e-05, "loss": 0.0279, "step": 281800 }, { "epoch": 0.002819, "grad_norm": 0.19921649992465973, "learning_rate": 1e-05, "loss": 0.0282, "step": 281900 }, { "epoch": 0.00282, "grad_norm": 0.22805067896842957, "learning_rate": 1e-05, "loss": 0.0279, "step": 282000 }, { "epoch": 0.002821, "grad_norm": 0.187729611992836, "learning_rate": 1e-05, "loss": 0.0276, "step": 282100 }, { "epoch": 0.002822, "grad_norm": 0.2932172119617462, "learning_rate": 1e-05, "loss": 0.0276, "step": 282200 }, { "epoch": 0.002823, "grad_norm": 0.2212413102388382, "learning_rate": 1e-05, "loss": 0.0273, "step": 282300 }, { "epoch": 0.002824, "grad_norm": 0.298909455537796, "learning_rate": 1e-05, "loss": 0.0277, "step": 282400 }, { "epoch": 0.002825, "grad_norm": 0.3060329258441925, "learning_rate": 1e-05, "loss": 0.0278, "step": 282500 }, { "epoch": 0.002826, "grad_norm": 0.43418654799461365, "learning_rate": 1e-05, "loss": 0.0274, "step": 282600 }, { "epoch": 0.002827, "grad_norm": 0.23464730381965637, "learning_rate": 1e-05, "loss": 0.0278, "step": 282700 }, { "epoch": 0.002828, "grad_norm": 0.2053508162498474, "learning_rate": 1e-05, "loss": 0.0276, "step": 282800 }, { "epoch": 0.002829, "grad_norm": 0.4512549340724945, "learning_rate": 1e-05, "loss": 0.0275, "step": 282900 }, { "epoch": 0.00283, "grad_norm": 0.24391992390155792, "learning_rate": 1e-05, "loss": 0.027, "step": 283000 }, { "epoch": 0.002831, "grad_norm": 0.2407795786857605, "learning_rate": 1e-05, "loss": 0.0279, "step": 283100 }, { "epoch": 0.002832, "grad_norm": 0.22714544832706451, "learning_rate": 1e-05, "loss": 0.0276, "step": 283200 }, { "epoch": 0.002833, "grad_norm": 0.2195502370595932, "learning_rate": 1e-05, "loss": 0.0272, "step": 283300 }, { "epoch": 0.002834, "grad_norm": 0.19935554265975952, "learning_rate": 1e-05, "loss": 0.0274, "step": 283400 }, { "epoch": 0.002835, "grad_norm": 0.22562800347805023, "learning_rate": 1e-05, "loss": 0.0275, "step": 283500 }, { "epoch": 0.002836, "grad_norm": 0.2792660593986511, "learning_rate": 1e-05, "loss": 0.0283, "step": 283600 }, { "epoch": 0.002837, "grad_norm": 0.29119977355003357, "learning_rate": 1e-05, "loss": 0.0284, "step": 283700 }, { "epoch": 0.002838, "grad_norm": 0.1964394897222519, "learning_rate": 1e-05, "loss": 0.0278, "step": 283800 }, { "epoch": 0.002839, "grad_norm": 0.25419390201568604, "learning_rate": 1e-05, "loss": 0.0281, "step": 283900 }, { "epoch": 0.00284, "grad_norm": 0.26142075657844543, "learning_rate": 1e-05, "loss": 0.0272, "step": 284000 }, { "epoch": 0.002841, "grad_norm": 0.26091933250427246, "learning_rate": 1e-05, "loss": 0.0278, "step": 284100 }, { "epoch": 0.002842, "grad_norm": 0.22883933782577515, "learning_rate": 1e-05, "loss": 0.0275, "step": 284200 }, { "epoch": 0.002843, "grad_norm": 0.21179138123989105, "learning_rate": 1e-05, "loss": 0.0274, "step": 284300 }, { "epoch": 0.002844, "grad_norm": 0.328698068857193, "learning_rate": 1e-05, "loss": 0.0278, "step": 284400 }, { "epoch": 0.002845, "grad_norm": 0.24519240856170654, "learning_rate": 1e-05, "loss": 0.0277, "step": 284500 }, { "epoch": 0.002846, "grad_norm": 0.21933317184448242, "learning_rate": 1e-05, "loss": 0.0275, "step": 284600 }, { "epoch": 0.002847, "grad_norm": 0.1955527812242508, "learning_rate": 1e-05, "loss": 0.0277, "step": 284700 }, { "epoch": 0.002848, "grad_norm": 0.2167377769947052, "learning_rate": 1e-05, "loss": 0.0279, "step": 284800 }, { "epoch": 0.002849, "grad_norm": 0.15441304445266724, "learning_rate": 1e-05, "loss": 0.0274, "step": 284900 }, { "epoch": 0.00285, "grad_norm": 0.2721899449825287, "learning_rate": 1e-05, "loss": 0.0279, "step": 285000 }, { "epoch": 0.002851, "grad_norm": 0.2279968410730362, "learning_rate": 1e-05, "loss": 0.028, "step": 285100 }, { "epoch": 0.002852, "grad_norm": 0.23611266911029816, "learning_rate": 1e-05, "loss": 0.0282, "step": 285200 }, { "epoch": 0.002853, "grad_norm": 0.8160094022750854, "learning_rate": 1e-05, "loss": 0.0275, "step": 285300 }, { "epoch": 0.002854, "grad_norm": 0.2263754904270172, "learning_rate": 1e-05, "loss": 0.0276, "step": 285400 }, { "epoch": 0.002855, "grad_norm": 0.23015806078910828, "learning_rate": 1e-05, "loss": 0.0273, "step": 285500 }, { "epoch": 0.002856, "grad_norm": 0.23156525194644928, "learning_rate": 1e-05, "loss": 0.0275, "step": 285600 }, { "epoch": 0.002857, "grad_norm": 0.2046230286359787, "learning_rate": 1e-05, "loss": 0.0272, "step": 285700 }, { "epoch": 0.002858, "grad_norm": 0.2649511396884918, "learning_rate": 1e-05, "loss": 0.0272, "step": 285800 }, { "epoch": 0.002859, "grad_norm": 0.210232675075531, "learning_rate": 1e-05, "loss": 0.0272, "step": 285900 }, { "epoch": 0.00286, "grad_norm": 0.25300079584121704, "learning_rate": 1e-05, "loss": 0.0274, "step": 286000 }, { "epoch": 0.002861, "grad_norm": 0.3001638948917389, "learning_rate": 1e-05, "loss": 0.0272, "step": 286100 }, { "epoch": 0.002862, "grad_norm": 0.23934616148471832, "learning_rate": 1e-05, "loss": 0.0272, "step": 286200 }, { "epoch": 0.002863, "grad_norm": 0.2935727834701538, "learning_rate": 1e-05, "loss": 0.0277, "step": 286300 }, { "epoch": 0.002864, "grad_norm": 0.24871401488780975, "learning_rate": 1e-05, "loss": 0.0269, "step": 286400 }, { "epoch": 0.002865, "grad_norm": 0.19701388478279114, "learning_rate": 1e-05, "loss": 0.0273, "step": 286500 }, { "epoch": 0.002866, "grad_norm": 0.1963273137807846, "learning_rate": 1e-05, "loss": 0.0276, "step": 286600 }, { "epoch": 0.002867, "grad_norm": 0.3633527159690857, "learning_rate": 1e-05, "loss": 0.0276, "step": 286700 }, { "epoch": 0.002868, "grad_norm": 0.28304722905158997, "learning_rate": 1e-05, "loss": 0.0274, "step": 286800 }, { "epoch": 0.002869, "grad_norm": 0.27208212018013, "learning_rate": 1e-05, "loss": 0.0276, "step": 286900 }, { "epoch": 0.00287, "grad_norm": 0.29816144704818726, "learning_rate": 1e-05, "loss": 0.0272, "step": 287000 }, { "epoch": 0.002871, "grad_norm": 0.22301216423511505, "learning_rate": 1e-05, "loss": 0.0277, "step": 287100 }, { "epoch": 0.002872, "grad_norm": 0.18092040717601776, "learning_rate": 1e-05, "loss": 0.0277, "step": 287200 }, { "epoch": 0.002873, "grad_norm": 0.2192026674747467, "learning_rate": 1e-05, "loss": 0.0276, "step": 287300 }, { "epoch": 0.002874, "grad_norm": 0.23193123936653137, "learning_rate": 1e-05, "loss": 0.0275, "step": 287400 }, { "epoch": 0.002875, "grad_norm": 0.1992737501859665, "learning_rate": 1e-05, "loss": 0.028, "step": 287500 }, { "epoch": 0.002876, "grad_norm": 0.27904167771339417, "learning_rate": 1e-05, "loss": 0.028, "step": 287600 }, { "epoch": 0.002877, "grad_norm": 0.2376258373260498, "learning_rate": 1e-05, "loss": 0.0274, "step": 287700 }, { "epoch": 0.002878, "grad_norm": 0.20561841130256653, "learning_rate": 1e-05, "loss": 0.0277, "step": 287800 }, { "epoch": 0.002879, "grad_norm": 0.17662794888019562, "learning_rate": 1e-05, "loss": 0.0272, "step": 287900 }, { "epoch": 0.00288, "grad_norm": 0.20095893740653992, "learning_rate": 1e-05, "loss": 0.0273, "step": 288000 }, { "epoch": 0.002881, "grad_norm": 0.2308174967765808, "learning_rate": 1e-05, "loss": 0.0279, "step": 288100 }, { "epoch": 0.002882, "grad_norm": 0.2161286622285843, "learning_rate": 1e-05, "loss": 0.0271, "step": 288200 }, { "epoch": 0.002883, "grad_norm": 0.24040083587169647, "learning_rate": 1e-05, "loss": 0.0268, "step": 288300 }, { "epoch": 0.002884, "grad_norm": 0.26135537028312683, "learning_rate": 1e-05, "loss": 0.027, "step": 288400 }, { "epoch": 0.002885, "grad_norm": 0.17872558534145355, "learning_rate": 1e-05, "loss": 0.0272, "step": 288500 }, { "epoch": 0.002886, "grad_norm": 0.18746346235275269, "learning_rate": 1e-05, "loss": 0.0276, "step": 288600 }, { "epoch": 0.002887, "grad_norm": 0.3147880733013153, "learning_rate": 1e-05, "loss": 0.0272, "step": 288700 }, { "epoch": 0.002888, "grad_norm": 0.26670563220977783, "learning_rate": 1e-05, "loss": 0.0274, "step": 288800 }, { "epoch": 0.002889, "grad_norm": 0.22966842353343964, "learning_rate": 1e-05, "loss": 0.0275, "step": 288900 }, { "epoch": 0.00289, "grad_norm": 0.16269560158252716, "learning_rate": 1e-05, "loss": 0.0276, "step": 289000 }, { "epoch": 0.002891, "grad_norm": 0.18940922617912292, "learning_rate": 1e-05, "loss": 0.0278, "step": 289100 }, { "epoch": 0.002892, "grad_norm": 0.1950470507144928, "learning_rate": 1e-05, "loss": 0.0271, "step": 289200 }, { "epoch": 0.002893, "grad_norm": 0.2084319293498993, "learning_rate": 1e-05, "loss": 0.0279, "step": 289300 }, { "epoch": 0.002894, "grad_norm": 0.20374660193920135, "learning_rate": 1e-05, "loss": 0.0275, "step": 289400 }, { "epoch": 0.002895, "grad_norm": 0.24456292390823364, "learning_rate": 1e-05, "loss": 0.0276, "step": 289500 }, { "epoch": 0.002896, "grad_norm": 0.22250503301620483, "learning_rate": 1e-05, "loss": 0.0276, "step": 289600 }, { "epoch": 0.002897, "grad_norm": 0.23765628039836884, "learning_rate": 1e-05, "loss": 0.0272, "step": 289700 }, { "epoch": 0.002898, "grad_norm": 0.18752063810825348, "learning_rate": 1e-05, "loss": 0.0274, "step": 289800 }, { "epoch": 0.002899, "grad_norm": 0.2100832611322403, "learning_rate": 1e-05, "loss": 0.0275, "step": 289900 }, { "epoch": 0.0029, "grad_norm": 0.19592754542827606, "learning_rate": 1e-05, "loss": 0.0275, "step": 290000 }, { "epoch": 0.002901, "grad_norm": 0.30490902066230774, "learning_rate": 1e-05, "loss": 0.0275, "step": 290100 }, { "epoch": 0.002902, "grad_norm": 0.29172468185424805, "learning_rate": 1e-05, "loss": 0.0273, "step": 290200 }, { "epoch": 0.002903, "grad_norm": 0.19555823504924774, "learning_rate": 1e-05, "loss": 0.0271, "step": 290300 }, { "epoch": 0.002904, "grad_norm": 0.43527278304100037, "learning_rate": 1e-05, "loss": 0.0276, "step": 290400 }, { "epoch": 0.002905, "grad_norm": 0.27792835235595703, "learning_rate": 1e-05, "loss": 0.0275, "step": 290500 }, { "epoch": 0.002906, "grad_norm": 0.22308336198329926, "learning_rate": 1e-05, "loss": 0.0276, "step": 290600 }, { "epoch": 0.002907, "grad_norm": 0.29254406690597534, "learning_rate": 1e-05, "loss": 0.0274, "step": 290700 }, { "epoch": 0.002908, "grad_norm": 0.2527879774570465, "learning_rate": 1e-05, "loss": 0.027, "step": 290800 }, { "epoch": 0.002909, "grad_norm": 0.25059276819229126, "learning_rate": 1e-05, "loss": 0.0271, "step": 290900 }, { "epoch": 0.00291, "grad_norm": 0.18927282094955444, "learning_rate": 1e-05, "loss": 0.0278, "step": 291000 }, { "epoch": 0.002911, "grad_norm": 0.28786396980285645, "learning_rate": 1e-05, "loss": 0.028, "step": 291100 }, { "epoch": 0.002912, "grad_norm": 0.22418510913848877, "learning_rate": 1e-05, "loss": 0.0276, "step": 291200 }, { "epoch": 0.002913, "grad_norm": 0.20115984976291656, "learning_rate": 1e-05, "loss": 0.0274, "step": 291300 }, { "epoch": 0.002914, "grad_norm": 0.3326718211174011, "learning_rate": 1e-05, "loss": 0.0274, "step": 291400 }, { "epoch": 0.002915, "grad_norm": 0.26343655586242676, "learning_rate": 1e-05, "loss": 0.0275, "step": 291500 }, { "epoch": 0.002916, "grad_norm": 0.20978927612304688, "learning_rate": 1e-05, "loss": 0.0273, "step": 291600 }, { "epoch": 0.002917, "grad_norm": 0.2341114580631256, "learning_rate": 1e-05, "loss": 0.0275, "step": 291700 }, { "epoch": 0.002918, "grad_norm": 0.2952612340450287, "learning_rate": 1e-05, "loss": 0.0275, "step": 291800 }, { "epoch": 0.002919, "grad_norm": 0.19049865007400513, "learning_rate": 1e-05, "loss": 0.0278, "step": 291900 }, { "epoch": 0.00292, "grad_norm": 0.18844784796237946, "learning_rate": 1e-05, "loss": 0.0268, "step": 292000 }, { "epoch": 0.002921, "grad_norm": 0.27136629819869995, "learning_rate": 1e-05, "loss": 0.0267, "step": 292100 }, { "epoch": 0.002922, "grad_norm": 0.17616640031337738, "learning_rate": 1e-05, "loss": 0.0273, "step": 292200 }, { "epoch": 0.002923, "grad_norm": 0.30955901741981506, "learning_rate": 1e-05, "loss": 0.027, "step": 292300 }, { "epoch": 0.002924, "grad_norm": 0.19668197631835938, "learning_rate": 1e-05, "loss": 0.0274, "step": 292400 }, { "epoch": 0.002925, "grad_norm": 0.20238226652145386, "learning_rate": 1e-05, "loss": 0.0275, "step": 292500 }, { "epoch": 0.002926, "grad_norm": 0.3161505162715912, "learning_rate": 1e-05, "loss": 0.0272, "step": 292600 }, { "epoch": 0.002927, "grad_norm": 0.200543612241745, "learning_rate": 1e-05, "loss": 0.0265, "step": 292700 }, { "epoch": 0.002928, "grad_norm": 0.25139865279197693, "learning_rate": 1e-05, "loss": 0.0275, "step": 292800 }, { "epoch": 0.002929, "grad_norm": 0.321555495262146, "learning_rate": 1e-05, "loss": 0.0269, "step": 292900 }, { "epoch": 0.00293, "grad_norm": 0.2420373558998108, "learning_rate": 1e-05, "loss": 0.0274, "step": 293000 }, { "epoch": 0.002931, "grad_norm": 0.22611433267593384, "learning_rate": 1e-05, "loss": 0.0273, "step": 293100 }, { "epoch": 0.002932, "grad_norm": 0.21992428600788116, "learning_rate": 1e-05, "loss": 0.0269, "step": 293200 }, { "epoch": 0.002933, "grad_norm": 0.18786901235580444, "learning_rate": 1e-05, "loss": 0.0268, "step": 293300 }, { "epoch": 0.002934, "grad_norm": 0.24600224196910858, "learning_rate": 1e-05, "loss": 0.0272, "step": 293400 }, { "epoch": 0.002935, "grad_norm": 0.21427375078201294, "learning_rate": 1e-05, "loss": 0.0271, "step": 293500 }, { "epoch": 0.002936, "grad_norm": 0.32805758714675903, "learning_rate": 1e-05, "loss": 0.0276, "step": 293600 }, { "epoch": 0.002937, "grad_norm": 0.20788568258285522, "learning_rate": 1e-05, "loss": 0.0275, "step": 293700 }, { "epoch": 0.002938, "grad_norm": 0.20231610536575317, "learning_rate": 1e-05, "loss": 0.027, "step": 293800 }, { "epoch": 0.002939, "grad_norm": 0.24338646233081818, "learning_rate": 1e-05, "loss": 0.0276, "step": 293900 }, { "epoch": 0.00294, "grad_norm": 0.20058143138885498, "learning_rate": 1e-05, "loss": 0.0272, "step": 294000 }, { "epoch": 0.002941, "grad_norm": 0.18870879709720612, "learning_rate": 1e-05, "loss": 0.0271, "step": 294100 }, { "epoch": 0.002942, "grad_norm": 0.17903867363929749, "learning_rate": 1e-05, "loss": 0.0273, "step": 294200 }, { "epoch": 0.002943, "grad_norm": 0.31151649355888367, "learning_rate": 1e-05, "loss": 0.0271, "step": 294300 }, { "epoch": 0.002944, "grad_norm": 0.2723485827445984, "learning_rate": 1e-05, "loss": 0.027, "step": 294400 }, { "epoch": 0.002945, "grad_norm": 0.18464182317256927, "learning_rate": 1e-05, "loss": 0.0271, "step": 294500 }, { "epoch": 0.002946, "grad_norm": 0.27860406041145325, "learning_rate": 1e-05, "loss": 0.0276, "step": 294600 }, { "epoch": 0.002947, "grad_norm": 0.23383447527885437, "learning_rate": 1e-05, "loss": 0.0273, "step": 294700 }, { "epoch": 0.002948, "grad_norm": 0.21705061197280884, "learning_rate": 1e-05, "loss": 0.0271, "step": 294800 }, { "epoch": 0.002949, "grad_norm": 0.23613987863063812, "learning_rate": 1e-05, "loss": 0.0271, "step": 294900 }, { "epoch": 0.00295, "grad_norm": 0.23902781307697296, "learning_rate": 1e-05, "loss": 0.0266, "step": 295000 }, { "epoch": 0.002951, "grad_norm": 0.17160800099372864, "learning_rate": 1e-05, "loss": 0.028, "step": 295100 }, { "epoch": 0.002952, "grad_norm": 0.33970049023628235, "learning_rate": 1e-05, "loss": 0.0273, "step": 295200 }, { "epoch": 0.002953, "grad_norm": 0.18324890732765198, "learning_rate": 1e-05, "loss": 0.027, "step": 295300 }, { "epoch": 0.002954, "grad_norm": 0.32035091519355774, "learning_rate": 1e-05, "loss": 0.0273, "step": 295400 }, { "epoch": 0.002955, "grad_norm": 0.20964975655078888, "learning_rate": 1e-05, "loss": 0.0269, "step": 295500 }, { "epoch": 0.002956, "grad_norm": 0.178806871175766, "learning_rate": 1e-05, "loss": 0.0266, "step": 295600 }, { "epoch": 0.002957, "grad_norm": 0.24864766001701355, "learning_rate": 1e-05, "loss": 0.0277, "step": 295700 }, { "epoch": 0.002958, "grad_norm": 0.2710517346858978, "learning_rate": 1e-05, "loss": 0.0271, "step": 295800 }, { "epoch": 0.002959, "grad_norm": 0.19774356484413147, "learning_rate": 1e-05, "loss": 0.0276, "step": 295900 }, { "epoch": 0.00296, "grad_norm": 0.23951397836208344, "learning_rate": 1e-05, "loss": 0.0273, "step": 296000 }, { "epoch": 0.002961, "grad_norm": 0.3224353790283203, "learning_rate": 1e-05, "loss": 0.0278, "step": 296100 }, { "epoch": 0.002962, "grad_norm": 0.24180898070335388, "learning_rate": 1e-05, "loss": 0.0264, "step": 296200 }, { "epoch": 0.002963, "grad_norm": 0.23841708898544312, "learning_rate": 1e-05, "loss": 0.0269, "step": 296300 }, { "epoch": 0.002964, "grad_norm": 0.29968830943107605, "learning_rate": 1e-05, "loss": 0.0268, "step": 296400 }, { "epoch": 0.002965, "grad_norm": 0.23664282262325287, "learning_rate": 1e-05, "loss": 0.0274, "step": 296500 }, { "epoch": 0.002966, "grad_norm": 0.25976505875587463, "learning_rate": 1e-05, "loss": 0.0268, "step": 296600 }, { "epoch": 0.002967, "grad_norm": 0.260409951210022, "learning_rate": 1e-05, "loss": 0.0276, "step": 296700 }, { "epoch": 0.002968, "grad_norm": 0.20636500418186188, "learning_rate": 1e-05, "loss": 0.0271, "step": 296800 }, { "epoch": 0.002969, "grad_norm": 0.2109842300415039, "learning_rate": 1e-05, "loss": 0.0277, "step": 296900 }, { "epoch": 0.00297, "grad_norm": 0.22684448957443237, "learning_rate": 1e-05, "loss": 0.0273, "step": 297000 }, { "epoch": 0.002971, "grad_norm": 0.2617685794830322, "learning_rate": 1e-05, "loss": 0.0275, "step": 297100 }, { "epoch": 0.002972, "grad_norm": 0.25246742367744446, "learning_rate": 1e-05, "loss": 0.0272, "step": 297200 }, { "epoch": 0.002973, "grad_norm": 0.2259443700313568, "learning_rate": 1e-05, "loss": 0.0275, "step": 297300 }, { "epoch": 0.002974, "grad_norm": 0.2218732237815857, "learning_rate": 1e-05, "loss": 0.0274, "step": 297400 }, { "epoch": 0.002975, "grad_norm": 0.26809731125831604, "learning_rate": 1e-05, "loss": 0.0266, "step": 297500 }, { "epoch": 0.002976, "grad_norm": 0.23182938992977142, "learning_rate": 1e-05, "loss": 0.027, "step": 297600 }, { "epoch": 0.002977, "grad_norm": 0.2149820774793625, "learning_rate": 1e-05, "loss": 0.0274, "step": 297700 }, { "epoch": 0.002978, "grad_norm": 0.21874502301216125, "learning_rate": 1e-05, "loss": 0.0271, "step": 297800 }, { "epoch": 0.002979, "grad_norm": 0.20774713158607483, "learning_rate": 1e-05, "loss": 0.0271, "step": 297900 }, { "epoch": 0.00298, "grad_norm": 0.23886801302433014, "learning_rate": 1e-05, "loss": 0.0273, "step": 298000 }, { "epoch": 0.002981, "grad_norm": 0.3069617450237274, "learning_rate": 1e-05, "loss": 0.0268, "step": 298100 }, { "epoch": 0.002982, "grad_norm": 0.3166011869907379, "learning_rate": 1e-05, "loss": 0.0267, "step": 298200 }, { "epoch": 0.002983, "grad_norm": 0.19397036731243134, "learning_rate": 1e-05, "loss": 0.0277, "step": 298300 }, { "epoch": 0.002984, "grad_norm": 0.21146588027477264, "learning_rate": 1e-05, "loss": 0.0266, "step": 298400 }, { "epoch": 0.002985, "grad_norm": 0.18514686822891235, "learning_rate": 1e-05, "loss": 0.0275, "step": 298500 }, { "epoch": 0.002986, "grad_norm": 0.6392372846603394, "learning_rate": 1e-05, "loss": 0.0268, "step": 298600 }, { "epoch": 0.002987, "grad_norm": 0.19926829636096954, "learning_rate": 1e-05, "loss": 0.0265, "step": 298700 }, { "epoch": 0.002988, "grad_norm": 0.2598033845424652, "learning_rate": 1e-05, "loss": 0.027, "step": 298800 }, { "epoch": 0.002989, "grad_norm": 0.27204570174217224, "learning_rate": 1e-05, "loss": 0.0273, "step": 298900 }, { "epoch": 0.00299, "grad_norm": 0.20035503804683685, "learning_rate": 1e-05, "loss": 0.027, "step": 299000 }, { "epoch": 0.002991, "grad_norm": 0.176410973072052, "learning_rate": 1e-05, "loss": 0.027, "step": 299100 }, { "epoch": 0.002992, "grad_norm": 0.22518637776374817, "learning_rate": 1e-05, "loss": 0.0267, "step": 299200 }, { "epoch": 0.002993, "grad_norm": 0.19711561501026154, "learning_rate": 1e-05, "loss": 0.0269, "step": 299300 }, { "epoch": 0.002994, "grad_norm": 0.269014447927475, "learning_rate": 1e-05, "loss": 0.0272, "step": 299400 }, { "epoch": 0.002995, "grad_norm": 0.18528473377227783, "learning_rate": 1e-05, "loss": 0.0264, "step": 299500 }, { "epoch": 0.002996, "grad_norm": 0.21184545755386353, "learning_rate": 1e-05, "loss": 0.0275, "step": 299600 }, { "epoch": 0.002997, "grad_norm": 0.3106190264225006, "learning_rate": 1e-05, "loss": 0.0269, "step": 299700 }, { "epoch": 0.002998, "grad_norm": 0.264780193567276, "learning_rate": 1e-05, "loss": 0.0266, "step": 299800 }, { "epoch": 0.002999, "grad_norm": 0.1754496842622757, "learning_rate": 1e-05, "loss": 0.0271, "step": 299900 }, { "epoch": 0.003, "grad_norm": 0.22756840288639069, "learning_rate": 1e-05, "loss": 0.0272, "step": 300000 }, { "epoch": 0.003, "eval_loss": 0.02352895215153694, "eval_runtime": 192.1414, "eval_samples_per_second": 260.225, "eval_steps_per_second": 16.264, "step": 300000 }, { "epoch": 0.003001, "grad_norm": 0.24564534425735474, "learning_rate": 1e-05, "loss": 0.0266, "step": 300100 }, { "epoch": 0.003002, "grad_norm": 0.25609809160232544, "learning_rate": 1e-05, "loss": 0.0272, "step": 300200 }, { "epoch": 0.003003, "grad_norm": 0.2242080271244049, "learning_rate": 1e-05, "loss": 0.027, "step": 300300 }, { "epoch": 0.003004, "grad_norm": 0.1873653531074524, "learning_rate": 1e-05, "loss": 0.027, "step": 300400 }, { "epoch": 0.003005, "grad_norm": 0.2566228210926056, "learning_rate": 1e-05, "loss": 0.0272, "step": 300500 }, { "epoch": 0.003006, "grad_norm": 0.20143531262874603, "learning_rate": 1e-05, "loss": 0.027, "step": 300600 }, { "epoch": 0.003007, "grad_norm": 0.3359016180038452, "learning_rate": 1e-05, "loss": 0.0273, "step": 300700 }, { "epoch": 0.003008, "grad_norm": 0.25699713826179504, "learning_rate": 1e-05, "loss": 0.0273, "step": 300800 }, { "epoch": 0.003009, "grad_norm": 0.23692752420902252, "learning_rate": 1e-05, "loss": 0.027, "step": 300900 }, { "epoch": 0.00301, "grad_norm": 0.2653488516807556, "learning_rate": 1e-05, "loss": 0.0273, "step": 301000 }, { "epoch": 0.003011, "grad_norm": 0.21784615516662598, "learning_rate": 1e-05, "loss": 0.0268, "step": 301100 }, { "epoch": 0.003012, "grad_norm": 0.30190372467041016, "learning_rate": 1e-05, "loss": 0.0272, "step": 301200 }, { "epoch": 0.003013, "grad_norm": 0.22344371676445007, "learning_rate": 1e-05, "loss": 0.0265, "step": 301300 }, { "epoch": 0.003014, "grad_norm": 0.2584332823753357, "learning_rate": 1e-05, "loss": 0.027, "step": 301400 }, { "epoch": 0.003015, "grad_norm": 0.2040054053068161, "learning_rate": 1e-05, "loss": 0.0269, "step": 301500 }, { "epoch": 0.003016, "grad_norm": 0.19524583220481873, "learning_rate": 1e-05, "loss": 0.0271, "step": 301600 }, { "epoch": 0.003017, "grad_norm": 0.20526626706123352, "learning_rate": 1e-05, "loss": 0.0265, "step": 301700 }, { "epoch": 0.003018, "grad_norm": 0.23805655539035797, "learning_rate": 1e-05, "loss": 0.0266, "step": 301800 }, { "epoch": 0.003019, "grad_norm": 0.22942158579826355, "learning_rate": 1e-05, "loss": 0.0269, "step": 301900 }, { "epoch": 0.00302, "grad_norm": 0.2905440032482147, "learning_rate": 1e-05, "loss": 0.0268, "step": 302000 }, { "epoch": 0.003021, "grad_norm": 0.2355891317129135, "learning_rate": 1e-05, "loss": 0.0271, "step": 302100 }, { "epoch": 0.003022, "grad_norm": 0.22563762962818146, "learning_rate": 1e-05, "loss": 0.0278, "step": 302200 }, { "epoch": 0.003023, "grad_norm": 0.21531619131565094, "learning_rate": 1e-05, "loss": 0.0272, "step": 302300 }, { "epoch": 0.003024, "grad_norm": 0.22992458939552307, "learning_rate": 1e-05, "loss": 0.0268, "step": 302400 }, { "epoch": 0.003025, "grad_norm": 0.22688639163970947, "learning_rate": 1e-05, "loss": 0.0267, "step": 302500 }, { "epoch": 0.003026, "grad_norm": 0.23058293759822845, "learning_rate": 1e-05, "loss": 0.0267, "step": 302600 }, { "epoch": 0.003027, "grad_norm": 0.20802053809165955, "learning_rate": 1e-05, "loss": 0.0264, "step": 302700 }, { "epoch": 0.003028, "grad_norm": 0.20533227920532227, "learning_rate": 1e-05, "loss": 0.0272, "step": 302800 }, { "epoch": 0.003029, "grad_norm": 0.23919910192489624, "learning_rate": 1e-05, "loss": 0.0268, "step": 302900 }, { "epoch": 0.00303, "grad_norm": 0.19025136530399323, "learning_rate": 1e-05, "loss": 0.0273, "step": 303000 }, { "epoch": 0.003031, "grad_norm": 0.19433394074440002, "learning_rate": 1e-05, "loss": 0.027, "step": 303100 }, { "epoch": 0.003032, "grad_norm": 0.23145197331905365, "learning_rate": 1e-05, "loss": 0.0275, "step": 303200 }, { "epoch": 0.003033, "grad_norm": 0.23850464820861816, "learning_rate": 1e-05, "loss": 0.0267, "step": 303300 }, { "epoch": 0.003034, "grad_norm": 0.23646266758441925, "learning_rate": 1e-05, "loss": 0.0267, "step": 303400 }, { "epoch": 0.003035, "grad_norm": 0.19940128922462463, "learning_rate": 1e-05, "loss": 0.0268, "step": 303500 }, { "epoch": 0.003036, "grad_norm": 0.22018127143383026, "learning_rate": 1e-05, "loss": 0.0272, "step": 303600 }, { "epoch": 0.003037, "grad_norm": 0.2834889888763428, "learning_rate": 1e-05, "loss": 0.0271, "step": 303700 }, { "epoch": 0.003038, "grad_norm": 0.18782445788383484, "learning_rate": 1e-05, "loss": 0.0266, "step": 303800 }, { "epoch": 0.003039, "grad_norm": 0.16653797030448914, "learning_rate": 1e-05, "loss": 0.0268, "step": 303900 }, { "epoch": 0.00304, "grad_norm": 0.31914132833480835, "learning_rate": 1e-05, "loss": 0.0266, "step": 304000 }, { "epoch": 0.003041, "grad_norm": 0.24896857142448425, "learning_rate": 1e-05, "loss": 0.0266, "step": 304100 }, { "epoch": 0.003042, "grad_norm": 0.22040998935699463, "learning_rate": 1e-05, "loss": 0.0268, "step": 304200 }, { "epoch": 0.003043, "grad_norm": 0.22441326081752777, "learning_rate": 1e-05, "loss": 0.027, "step": 304300 }, { "epoch": 0.003044, "grad_norm": 0.27830228209495544, "learning_rate": 1e-05, "loss": 0.0268, "step": 304400 }, { "epoch": 0.003045, "grad_norm": 0.19093731045722961, "learning_rate": 1e-05, "loss": 0.0269, "step": 304500 }, { "epoch": 0.003046, "grad_norm": 0.2811295986175537, "learning_rate": 1e-05, "loss": 0.027, "step": 304600 }, { "epoch": 0.003047, "grad_norm": 0.21080484986305237, "learning_rate": 1e-05, "loss": 0.0265, "step": 304700 }, { "epoch": 0.003048, "grad_norm": 0.21469475328922272, "learning_rate": 1e-05, "loss": 0.0267, "step": 304800 }, { "epoch": 0.003049, "grad_norm": 0.16331468522548676, "learning_rate": 1e-05, "loss": 0.0271, "step": 304900 }, { "epoch": 0.00305, "grad_norm": 0.23459644615650177, "learning_rate": 1e-05, "loss": 0.027, "step": 305000 }, { "epoch": 0.003051, "grad_norm": 0.23355557024478912, "learning_rate": 1e-05, "loss": 0.0268, "step": 305100 }, { "epoch": 0.003052, "grad_norm": 0.2670382857322693, "learning_rate": 1e-05, "loss": 0.0269, "step": 305200 }, { "epoch": 0.003053, "grad_norm": 0.19511283934116364, "learning_rate": 1e-05, "loss": 0.0271, "step": 305300 }, { "epoch": 0.003054, "grad_norm": 0.2069941759109497, "learning_rate": 1e-05, "loss": 0.0267, "step": 305400 }, { "epoch": 0.003055, "grad_norm": 0.2806059420108795, "learning_rate": 1e-05, "loss": 0.0268, "step": 305500 }, { "epoch": 0.003056, "grad_norm": 0.2917216420173645, "learning_rate": 1e-05, "loss": 0.0274, "step": 305600 }, { "epoch": 0.003057, "grad_norm": 0.2090231031179428, "learning_rate": 1e-05, "loss": 0.0268, "step": 305700 }, { "epoch": 0.003058, "grad_norm": 0.2871715724468231, "learning_rate": 1e-05, "loss": 0.027, "step": 305800 }, { "epoch": 0.003059, "grad_norm": 0.19492240250110626, "learning_rate": 1e-05, "loss": 0.0266, "step": 305900 }, { "epoch": 0.00306, "grad_norm": 0.283270001411438, "learning_rate": 1e-05, "loss": 0.0264, "step": 306000 }, { "epoch": 0.003061, "grad_norm": 0.28721094131469727, "learning_rate": 1e-05, "loss": 0.0268, "step": 306100 }, { "epoch": 0.003062, "grad_norm": 0.22746601700782776, "learning_rate": 1e-05, "loss": 0.0267, "step": 306200 }, { "epoch": 0.003063, "grad_norm": 0.26987698674201965, "learning_rate": 1e-05, "loss": 0.0266, "step": 306300 }, { "epoch": 0.003064, "grad_norm": 0.2578005790710449, "learning_rate": 1e-05, "loss": 0.0268, "step": 306400 }, { "epoch": 0.003065, "grad_norm": 0.2263513058423996, "learning_rate": 1e-05, "loss": 0.0267, "step": 306500 }, { "epoch": 0.003066, "grad_norm": 0.18849994242191315, "learning_rate": 1e-05, "loss": 0.0265, "step": 306600 }, { "epoch": 0.003067, "grad_norm": 0.23584192991256714, "learning_rate": 1e-05, "loss": 0.0267, "step": 306700 }, { "epoch": 0.003068, "grad_norm": 0.27472710609436035, "learning_rate": 1e-05, "loss": 0.0259, "step": 306800 }, { "epoch": 0.003069, "grad_norm": 0.21822002530097961, "learning_rate": 1e-05, "loss": 0.0271, "step": 306900 }, { "epoch": 0.00307, "grad_norm": 0.2363082617521286, "learning_rate": 1e-05, "loss": 0.0259, "step": 307000 }, { "epoch": 0.003071, "grad_norm": 0.22444720566272736, "learning_rate": 1e-05, "loss": 0.0266, "step": 307100 }, { "epoch": 0.003072, "grad_norm": 0.3271840214729309, "learning_rate": 1e-05, "loss": 0.0266, "step": 307200 }, { "epoch": 0.003073, "grad_norm": 0.2745817303657532, "learning_rate": 1e-05, "loss": 0.0271, "step": 307300 }, { "epoch": 0.003074, "grad_norm": 0.25029456615448, "learning_rate": 1e-05, "loss": 0.0268, "step": 307400 }, { "epoch": 0.003075, "grad_norm": 0.2603447437286377, "learning_rate": 1e-05, "loss": 0.0256, "step": 307500 }, { "epoch": 0.003076, "grad_norm": 0.21267341077327728, "learning_rate": 1e-05, "loss": 0.027, "step": 307600 }, { "epoch": 0.003077, "grad_norm": 0.21115660667419434, "learning_rate": 1e-05, "loss": 0.0263, "step": 307700 }, { "epoch": 0.003078, "grad_norm": 0.21885636448860168, "learning_rate": 1e-05, "loss": 0.0262, "step": 307800 }, { "epoch": 0.003079, "grad_norm": 0.24273690581321716, "learning_rate": 1e-05, "loss": 0.0268, "step": 307900 }, { "epoch": 0.00308, "grad_norm": 0.1794435828924179, "learning_rate": 1e-05, "loss": 0.0267, "step": 308000 }, { "epoch": 0.003081, "grad_norm": 0.2202482521533966, "learning_rate": 1e-05, "loss": 0.0269, "step": 308100 }, { "epoch": 0.003082, "grad_norm": 0.20879529416561127, "learning_rate": 1e-05, "loss": 0.0268, "step": 308200 }, { "epoch": 0.003083, "grad_norm": 0.24758106470108032, "learning_rate": 1e-05, "loss": 0.0266, "step": 308300 }, { "epoch": 0.003084, "grad_norm": 0.24477562308311462, "learning_rate": 1e-05, "loss": 0.0264, "step": 308400 }, { "epoch": 0.003085, "grad_norm": 0.23816826939582825, "learning_rate": 1e-05, "loss": 0.0268, "step": 308500 }, { "epoch": 0.003086, "grad_norm": 0.1829647421836853, "learning_rate": 1e-05, "loss": 0.0263, "step": 308600 }, { "epoch": 0.003087, "grad_norm": 0.22289857268333435, "learning_rate": 1e-05, "loss": 0.0261, "step": 308700 }, { "epoch": 0.003088, "grad_norm": 0.1820111870765686, "learning_rate": 1e-05, "loss": 0.0273, "step": 308800 }, { "epoch": 0.003089, "grad_norm": 0.22784028947353363, "learning_rate": 1e-05, "loss": 0.027, "step": 308900 }, { "epoch": 0.00309, "grad_norm": 0.20845426619052887, "learning_rate": 1e-05, "loss": 0.0273, "step": 309000 }, { "epoch": 0.003091, "grad_norm": 0.27483031153678894, "learning_rate": 1e-05, "loss": 0.0269, "step": 309100 }, { "epoch": 0.003092, "grad_norm": 0.3367862403392792, "learning_rate": 1e-05, "loss": 0.0269, "step": 309200 }, { "epoch": 0.003093, "grad_norm": 0.2504436671733856, "learning_rate": 1e-05, "loss": 0.0264, "step": 309300 }, { "epoch": 0.003094, "grad_norm": 0.2724253237247467, "learning_rate": 1e-05, "loss": 0.0259, "step": 309400 }, { "epoch": 0.003095, "grad_norm": 0.280286580324173, "learning_rate": 1e-05, "loss": 0.0268, "step": 309500 }, { "epoch": 0.003096, "grad_norm": 0.2695227861404419, "learning_rate": 1e-05, "loss": 0.0268, "step": 309600 }, { "epoch": 0.003097, "grad_norm": 0.19506055116653442, "learning_rate": 1e-05, "loss": 0.0266, "step": 309700 }, { "epoch": 0.003098, "grad_norm": 0.259020060300827, "learning_rate": 1e-05, "loss": 0.0268, "step": 309800 }, { "epoch": 0.003099, "grad_norm": 0.3812705874443054, "learning_rate": 1e-05, "loss": 0.0271, "step": 309900 }, { "epoch": 0.0031, "grad_norm": 0.22959260642528534, "learning_rate": 1e-05, "loss": 0.0262, "step": 310000 }, { "epoch": 0.003101, "grad_norm": 0.21151551604270935, "learning_rate": 1e-05, "loss": 0.0265, "step": 310100 }, { "epoch": 0.003102, "grad_norm": 0.21742308139801025, "learning_rate": 1e-05, "loss": 0.0262, "step": 310200 }, { "epoch": 0.003103, "grad_norm": 0.23849885165691376, "learning_rate": 1e-05, "loss": 0.0267, "step": 310300 }, { "epoch": 0.003104, "grad_norm": 0.29556405544281006, "learning_rate": 1e-05, "loss": 0.0268, "step": 310400 }, { "epoch": 0.003105, "grad_norm": 0.232294499874115, "learning_rate": 1e-05, "loss": 0.0267, "step": 310500 }, { "epoch": 0.003106, "grad_norm": 0.24282053112983704, "learning_rate": 1e-05, "loss": 0.0264, "step": 310600 }, { "epoch": 0.003107, "grad_norm": 0.19696711003780365, "learning_rate": 1e-05, "loss": 0.0269, "step": 310700 }, { "epoch": 0.003108, "grad_norm": 0.26883387565612793, "learning_rate": 1e-05, "loss": 0.0271, "step": 310800 }, { "epoch": 0.003109, "grad_norm": 0.2640938460826874, "learning_rate": 1e-05, "loss": 0.0261, "step": 310900 }, { "epoch": 0.00311, "grad_norm": 0.22291681170463562, "learning_rate": 1e-05, "loss": 0.0264, "step": 311000 }, { "epoch": 0.003111, "grad_norm": 0.23356468975543976, "learning_rate": 1e-05, "loss": 0.0264, "step": 311100 }, { "epoch": 0.003112, "grad_norm": 0.21068906784057617, "learning_rate": 1e-05, "loss": 0.0265, "step": 311200 }, { "epoch": 0.003113, "grad_norm": 0.19301259517669678, "learning_rate": 1e-05, "loss": 0.0275, "step": 311300 }, { "epoch": 0.003114, "grad_norm": 0.18231357634067535, "learning_rate": 1e-05, "loss": 0.0269, "step": 311400 }, { "epoch": 0.003115, "grad_norm": 0.47011634707450867, "learning_rate": 1e-05, "loss": 0.0271, "step": 311500 }, { "epoch": 0.003116, "grad_norm": 0.18966491520404816, "learning_rate": 1e-05, "loss": 0.0268, "step": 311600 }, { "epoch": 0.003117, "grad_norm": 0.23438602685928345, "learning_rate": 1e-05, "loss": 0.0265, "step": 311700 }, { "epoch": 0.003118, "grad_norm": 0.2869526147842407, "learning_rate": 1e-05, "loss": 0.0262, "step": 311800 }, { "epoch": 0.003119, "grad_norm": 0.22105902433395386, "learning_rate": 1e-05, "loss": 0.0265, "step": 311900 }, { "epoch": 0.00312, "grad_norm": 0.18267607688903809, "learning_rate": 1e-05, "loss": 0.0265, "step": 312000 }, { "epoch": 0.003121, "grad_norm": 0.17117823660373688, "learning_rate": 1e-05, "loss": 0.0273, "step": 312100 }, { "epoch": 0.003122, "grad_norm": 0.23207755386829376, "learning_rate": 1e-05, "loss": 0.0269, "step": 312200 }, { "epoch": 0.003123, "grad_norm": 0.19023114442825317, "learning_rate": 1e-05, "loss": 0.0268, "step": 312300 }, { "epoch": 0.003124, "grad_norm": 0.1709338128566742, "learning_rate": 1e-05, "loss": 0.0266, "step": 312400 }, { "epoch": 0.003125, "grad_norm": 0.23958516120910645, "learning_rate": 1e-05, "loss": 0.0267, "step": 312500 }, { "epoch": 0.003126, "grad_norm": 0.22739177942276, "learning_rate": 1e-05, "loss": 0.0262, "step": 312600 }, { "epoch": 0.003127, "grad_norm": 0.21145591139793396, "learning_rate": 1e-05, "loss": 0.0265, "step": 312700 }, { "epoch": 0.003128, "grad_norm": 0.19892120361328125, "learning_rate": 1e-05, "loss": 0.0268, "step": 312800 }, { "epoch": 0.003129, "grad_norm": 0.2810048460960388, "learning_rate": 1e-05, "loss": 0.0262, "step": 312900 }, { "epoch": 0.00313, "grad_norm": 0.16679918766021729, "learning_rate": 1e-05, "loss": 0.0267, "step": 313000 }, { "epoch": 0.003131, "grad_norm": 0.22223927080631256, "learning_rate": 1e-05, "loss": 0.0267, "step": 313100 }, { "epoch": 0.003132, "grad_norm": 0.2631348967552185, "learning_rate": 1e-05, "loss": 0.0268, "step": 313200 }, { "epoch": 0.003133, "grad_norm": 0.19947989284992218, "learning_rate": 1e-05, "loss": 0.0261, "step": 313300 }, { "epoch": 0.003134, "grad_norm": 0.40083256363868713, "learning_rate": 1e-05, "loss": 0.0265, "step": 313400 }, { "epoch": 0.003135, "grad_norm": 0.2303778976202011, "learning_rate": 1e-05, "loss": 0.0271, "step": 313500 }, { "epoch": 0.003136, "grad_norm": 0.198317289352417, "learning_rate": 1e-05, "loss": 0.0268, "step": 313600 }, { "epoch": 0.003137, "grad_norm": 0.24342307448387146, "learning_rate": 1e-05, "loss": 0.0262, "step": 313700 }, { "epoch": 0.003138, "grad_norm": 0.3006582260131836, "learning_rate": 1e-05, "loss": 0.0266, "step": 313800 }, { "epoch": 0.003139, "grad_norm": 0.3339352011680603, "learning_rate": 1e-05, "loss": 0.0259, "step": 313900 }, { "epoch": 0.00314, "grad_norm": 0.2375064641237259, "learning_rate": 1e-05, "loss": 0.0264, "step": 314000 }, { "epoch": 0.003141, "grad_norm": 0.23990485072135925, "learning_rate": 1e-05, "loss": 0.0262, "step": 314100 }, { "epoch": 0.003142, "grad_norm": 0.22559918463230133, "learning_rate": 1e-05, "loss": 0.0262, "step": 314200 }, { "epoch": 0.003143, "grad_norm": 0.2196795791387558, "learning_rate": 1e-05, "loss": 0.0269, "step": 314300 }, { "epoch": 0.003144, "grad_norm": 0.2290625423192978, "learning_rate": 1e-05, "loss": 0.0269, "step": 314400 }, { "epoch": 0.003145, "grad_norm": 0.22628280520439148, "learning_rate": 1e-05, "loss": 0.0262, "step": 314500 }, { "epoch": 0.003146, "grad_norm": 0.3055602014064789, "learning_rate": 1e-05, "loss": 0.0265, "step": 314600 }, { "epoch": 0.003147, "grad_norm": 0.17805078625679016, "learning_rate": 1e-05, "loss": 0.0259, "step": 314700 }, { "epoch": 0.003148, "grad_norm": 0.2297162562608719, "learning_rate": 1e-05, "loss": 0.0263, "step": 314800 }, { "epoch": 0.003149, "grad_norm": 0.17962846159934998, "learning_rate": 1e-05, "loss": 0.026, "step": 314900 }, { "epoch": 0.00315, "grad_norm": 0.23840166628360748, "learning_rate": 1e-05, "loss": 0.0262, "step": 315000 }, { "epoch": 0.003151, "grad_norm": 0.2144237905740738, "learning_rate": 1e-05, "loss": 0.0258, "step": 315100 }, { "epoch": 0.003152, "grad_norm": 0.18694724142551422, "learning_rate": 1e-05, "loss": 0.0271, "step": 315200 }, { "epoch": 0.003153, "grad_norm": 0.20214436948299408, "learning_rate": 1e-05, "loss": 0.0262, "step": 315300 }, { "epoch": 0.003154, "grad_norm": 0.18802447617053986, "learning_rate": 1e-05, "loss": 0.0263, "step": 315400 }, { "epoch": 0.003155, "grad_norm": 0.23389141261577606, "learning_rate": 1e-05, "loss": 0.0268, "step": 315500 }, { "epoch": 0.003156, "grad_norm": 0.20924752950668335, "learning_rate": 1e-05, "loss": 0.0264, "step": 315600 }, { "epoch": 0.003157, "grad_norm": 0.27211233973503113, "learning_rate": 1e-05, "loss": 0.026, "step": 315700 }, { "epoch": 0.003158, "grad_norm": 0.3373160660266876, "learning_rate": 1e-05, "loss": 0.0265, "step": 315800 }, { "epoch": 0.003159, "grad_norm": 0.21824976801872253, "learning_rate": 1e-05, "loss": 0.0265, "step": 315900 }, { "epoch": 0.00316, "grad_norm": 0.20689469575881958, "learning_rate": 1e-05, "loss": 0.0265, "step": 316000 }, { "epoch": 0.003161, "grad_norm": 0.18488562107086182, "learning_rate": 1e-05, "loss": 0.0263, "step": 316100 }, { "epoch": 0.003162, "grad_norm": 0.2104141265153885, "learning_rate": 1e-05, "loss": 0.0265, "step": 316200 }, { "epoch": 0.003163, "grad_norm": 0.23483408987522125, "learning_rate": 1e-05, "loss": 0.0268, "step": 316300 }, { "epoch": 0.003164, "grad_norm": 0.2970333993434906, "learning_rate": 1e-05, "loss": 0.0261, "step": 316400 }, { "epoch": 0.003165, "grad_norm": 0.1965234875679016, "learning_rate": 1e-05, "loss": 0.0266, "step": 316500 }, { "epoch": 0.003166, "grad_norm": 0.18899615108966827, "learning_rate": 1e-05, "loss": 0.027, "step": 316600 }, { "epoch": 0.003167, "grad_norm": 0.23424047231674194, "learning_rate": 1e-05, "loss": 0.0261, "step": 316700 }, { "epoch": 0.003168, "grad_norm": 0.24671000242233276, "learning_rate": 1e-05, "loss": 0.0271, "step": 316800 }, { "epoch": 0.003169, "grad_norm": 0.20578643679618835, "learning_rate": 1e-05, "loss": 0.0266, "step": 316900 }, { "epoch": 0.00317, "grad_norm": 0.2361016422510147, "learning_rate": 1e-05, "loss": 0.0264, "step": 317000 }, { "epoch": 0.003171, "grad_norm": 0.2327994704246521, "learning_rate": 1e-05, "loss": 0.0266, "step": 317100 }, { "epoch": 0.003172, "grad_norm": 0.2904406189918518, "learning_rate": 1e-05, "loss": 0.0266, "step": 317200 }, { "epoch": 0.003173, "grad_norm": 0.2550638020038605, "learning_rate": 1e-05, "loss": 0.0268, "step": 317300 }, { "epoch": 0.003174, "grad_norm": 0.21902337670326233, "learning_rate": 1e-05, "loss": 0.0267, "step": 317400 }, { "epoch": 0.003175, "grad_norm": 0.21695446968078613, "learning_rate": 1e-05, "loss": 0.0266, "step": 317500 }, { "epoch": 0.003176, "grad_norm": 0.14159560203552246, "learning_rate": 1e-05, "loss": 0.0261, "step": 317600 }, { "epoch": 0.003177, "grad_norm": 0.18824447691440582, "learning_rate": 1e-05, "loss": 0.0264, "step": 317700 }, { "epoch": 0.003178, "grad_norm": 0.24776631593704224, "learning_rate": 1e-05, "loss": 0.027, "step": 317800 }, { "epoch": 0.003179, "grad_norm": 0.1772354394197464, "learning_rate": 1e-05, "loss": 0.0265, "step": 317900 }, { "epoch": 0.00318, "grad_norm": 0.23863814771175385, "learning_rate": 1e-05, "loss": 0.0263, "step": 318000 }, { "epoch": 0.003181, "grad_norm": 0.25460243225097656, "learning_rate": 1e-05, "loss": 0.0261, "step": 318100 }, { "epoch": 0.003182, "grad_norm": 0.3585200011730194, "learning_rate": 1e-05, "loss": 0.0266, "step": 318200 }, { "epoch": 0.003183, "grad_norm": 0.3048652708530426, "learning_rate": 1e-05, "loss": 0.0269, "step": 318300 }, { "epoch": 0.003184, "grad_norm": 0.23855812847614288, "learning_rate": 1e-05, "loss": 0.0263, "step": 318400 }, { "epoch": 0.003185, "grad_norm": 0.19755372405052185, "learning_rate": 1e-05, "loss": 0.0262, "step": 318500 }, { "epoch": 0.003186, "grad_norm": 0.1780814528465271, "learning_rate": 1e-05, "loss": 0.0261, "step": 318600 }, { "epoch": 0.003187, "grad_norm": 0.20584912598133087, "learning_rate": 1e-05, "loss": 0.027, "step": 318700 }, { "epoch": 0.003188, "grad_norm": 0.2166757434606552, "learning_rate": 1e-05, "loss": 0.0263, "step": 318800 }, { "epoch": 0.003189, "grad_norm": 0.1830921173095703, "learning_rate": 1e-05, "loss": 0.026, "step": 318900 }, { "epoch": 0.00319, "grad_norm": 0.22540132701396942, "learning_rate": 1e-05, "loss": 0.0267, "step": 319000 }, { "epoch": 0.003191, "grad_norm": 0.18830424547195435, "learning_rate": 1e-05, "loss": 0.0268, "step": 319100 }, { "epoch": 0.003192, "grad_norm": 0.2352602332830429, "learning_rate": 1e-05, "loss": 0.0262, "step": 319200 }, { "epoch": 0.003193, "grad_norm": 0.20206184685230255, "learning_rate": 1e-05, "loss": 0.0262, "step": 319300 }, { "epoch": 0.003194, "grad_norm": 0.25560662150382996, "learning_rate": 1e-05, "loss": 0.0262, "step": 319400 }, { "epoch": 0.003195, "grad_norm": 0.24250520765781403, "learning_rate": 1e-05, "loss": 0.026, "step": 319500 }, { "epoch": 0.003196, "grad_norm": 0.2175752967596054, "learning_rate": 1e-05, "loss": 0.0258, "step": 319600 }, { "epoch": 0.003197, "grad_norm": 0.27856600284576416, "learning_rate": 1e-05, "loss": 0.0264, "step": 319700 }, { "epoch": 0.003198, "grad_norm": 0.15467730164527893, "learning_rate": 1e-05, "loss": 0.0269, "step": 319800 }, { "epoch": 0.003199, "grad_norm": 0.21042077243328094, "learning_rate": 1e-05, "loss": 0.0259, "step": 319900 }, { "epoch": 0.0032, "grad_norm": 0.2572573125362396, "learning_rate": 1e-05, "loss": 0.0261, "step": 320000 }, { "epoch": 0.0032, "eval_loss": 0.025225354358553886, "eval_runtime": 169.5254, "eval_samples_per_second": 294.941, "eval_steps_per_second": 18.434, "step": 320000 }, { "epoch": 0.003201, "grad_norm": 0.2452288568019867, "learning_rate": 1e-05, "loss": 0.0263, "step": 320100 }, { "epoch": 0.003202, "grad_norm": 0.1922990083694458, "learning_rate": 1e-05, "loss": 0.0262, "step": 320200 }, { "epoch": 0.003203, "grad_norm": 0.23566807806491852, "learning_rate": 1e-05, "loss": 0.0265, "step": 320300 }, { "epoch": 0.003204, "grad_norm": 0.17829179763793945, "learning_rate": 1e-05, "loss": 0.0264, "step": 320400 }, { "epoch": 0.003205, "grad_norm": 0.1792110949754715, "learning_rate": 1e-05, "loss": 0.0264, "step": 320500 }, { "epoch": 0.003206, "grad_norm": 0.214288130402565, "learning_rate": 1e-05, "loss": 0.0263, "step": 320600 }, { "epoch": 0.003207, "grad_norm": 0.18739189207553864, "learning_rate": 1e-05, "loss": 0.0267, "step": 320700 }, { "epoch": 0.003208, "grad_norm": 0.20027951896190643, "learning_rate": 1e-05, "loss": 0.0265, "step": 320800 }, { "epoch": 0.003209, "grad_norm": 0.2532178461551666, "learning_rate": 1e-05, "loss": 0.026, "step": 320900 }, { "epoch": 0.00321, "grad_norm": 0.2729474902153015, "learning_rate": 1e-05, "loss": 0.0267, "step": 321000 }, { "epoch": 0.003211, "grad_norm": 0.2151164561510086, "learning_rate": 1e-05, "loss": 0.0261, "step": 321100 }, { "epoch": 0.003212, "grad_norm": 0.1697973608970642, "learning_rate": 1e-05, "loss": 0.0261, "step": 321200 }, { "epoch": 0.003213, "grad_norm": 0.22821971774101257, "learning_rate": 1e-05, "loss": 0.0263, "step": 321300 }, { "epoch": 0.003214, "grad_norm": 0.21168546378612518, "learning_rate": 1e-05, "loss": 0.0263, "step": 321400 }, { "epoch": 0.003215, "grad_norm": 0.21351784467697144, "learning_rate": 1e-05, "loss": 0.0264, "step": 321500 }, { "epoch": 0.003216, "grad_norm": 0.2001633644104004, "learning_rate": 1e-05, "loss": 0.0262, "step": 321600 }, { "epoch": 0.003217, "grad_norm": 0.16557571291923523, "learning_rate": 1e-05, "loss": 0.0265, "step": 321700 }, { "epoch": 0.003218, "grad_norm": 0.24777041375637054, "learning_rate": 1e-05, "loss": 0.0259, "step": 321800 }, { "epoch": 0.003219, "grad_norm": 0.2747800648212433, "learning_rate": 1e-05, "loss": 0.0261, "step": 321900 }, { "epoch": 0.00322, "grad_norm": 0.21875742077827454, "learning_rate": 1e-05, "loss": 0.0263, "step": 322000 }, { "epoch": 0.003221, "grad_norm": 0.18426688015460968, "learning_rate": 1e-05, "loss": 0.0258, "step": 322100 }, { "epoch": 0.003222, "grad_norm": 0.1793707013130188, "learning_rate": 1e-05, "loss": 0.026, "step": 322200 }, { "epoch": 0.003223, "grad_norm": 0.1896100789308548, "learning_rate": 1e-05, "loss": 0.0264, "step": 322300 }, { "epoch": 0.003224, "grad_norm": 0.49916690587997437, "learning_rate": 1e-05, "loss": 0.0264, "step": 322400 }, { "epoch": 0.003225, "grad_norm": 0.2023572474718094, "learning_rate": 1e-05, "loss": 0.0263, "step": 322500 }, { "epoch": 0.003226, "grad_norm": 0.256666898727417, "learning_rate": 1e-05, "loss": 0.0261, "step": 322600 }, { "epoch": 0.003227, "grad_norm": 0.19276836514472961, "learning_rate": 1e-05, "loss": 0.0259, "step": 322700 }, { "epoch": 0.003228, "grad_norm": 0.31227782368659973, "learning_rate": 1e-05, "loss": 0.0252, "step": 322800 }, { "epoch": 0.003229, "grad_norm": 0.24218299984931946, "learning_rate": 1e-05, "loss": 0.0261, "step": 322900 }, { "epoch": 0.00323, "grad_norm": 0.22524356842041016, "learning_rate": 1e-05, "loss": 0.0266, "step": 323000 }, { "epoch": 0.003231, "grad_norm": 0.24048413336277008, "learning_rate": 1e-05, "loss": 0.026, "step": 323100 }, { "epoch": 0.003232, "grad_norm": 0.3700028657913208, "learning_rate": 1e-05, "loss": 0.0264, "step": 323200 }, { "epoch": 0.003233, "grad_norm": 0.20020624995231628, "learning_rate": 1e-05, "loss": 0.0258, "step": 323300 }, { "epoch": 0.003234, "grad_norm": 0.26460763812065125, "learning_rate": 1e-05, "loss": 0.0258, "step": 323400 }, { "epoch": 0.003235, "grad_norm": 0.1909564584493637, "learning_rate": 1e-05, "loss": 0.0265, "step": 323500 }, { "epoch": 0.003236, "grad_norm": 0.29465991258621216, "learning_rate": 1e-05, "loss": 0.0255, "step": 323600 }, { "epoch": 0.003237, "grad_norm": 0.1740919053554535, "learning_rate": 1e-05, "loss": 0.0267, "step": 323700 }, { "epoch": 0.003238, "grad_norm": 0.20403610169887543, "learning_rate": 1e-05, "loss": 0.0259, "step": 323800 }, { "epoch": 0.003239, "grad_norm": 0.2600473463535309, "learning_rate": 1e-05, "loss": 0.0262, "step": 323900 }, { "epoch": 0.00324, "grad_norm": 0.20135332643985748, "learning_rate": 1e-05, "loss": 0.0258, "step": 324000 }, { "epoch": 0.003241, "grad_norm": 0.2004103660583496, "learning_rate": 1e-05, "loss": 0.0263, "step": 324100 }, { "epoch": 0.003242, "grad_norm": 0.24793416261672974, "learning_rate": 1e-05, "loss": 0.0263, "step": 324200 }, { "epoch": 0.003243, "grad_norm": 0.17162394523620605, "learning_rate": 1e-05, "loss": 0.0266, "step": 324300 }, { "epoch": 0.003244, "grad_norm": 0.20746000111103058, "learning_rate": 1e-05, "loss": 0.0262, "step": 324400 }, { "epoch": 0.003245, "grad_norm": 0.19739718735218048, "learning_rate": 1e-05, "loss": 0.0266, "step": 324500 }, { "epoch": 0.003246, "grad_norm": 0.25721651315689087, "learning_rate": 1e-05, "loss": 0.0261, "step": 324600 }, { "epoch": 0.003247, "grad_norm": 0.2535655200481415, "learning_rate": 1e-05, "loss": 0.0265, "step": 324700 }, { "epoch": 0.003248, "grad_norm": 0.19521620869636536, "learning_rate": 1e-05, "loss": 0.026, "step": 324800 }, { "epoch": 0.003249, "grad_norm": 0.29187333583831787, "learning_rate": 1e-05, "loss": 0.0257, "step": 324900 }, { "epoch": 0.00325, "grad_norm": 0.15727204084396362, "learning_rate": 1e-05, "loss": 0.026, "step": 325000 }, { "epoch": 0.003251, "grad_norm": 0.24963289499282837, "learning_rate": 1e-05, "loss": 0.0257, "step": 325100 }, { "epoch": 0.003252, "grad_norm": 0.17268462479114532, "learning_rate": 1e-05, "loss": 0.027, "step": 325200 }, { "epoch": 0.003253, "grad_norm": 0.2027566283941269, "learning_rate": 1e-05, "loss": 0.026, "step": 325300 }, { "epoch": 0.003254, "grad_norm": 0.22657033801078796, "learning_rate": 1e-05, "loss": 0.0261, "step": 325400 }, { "epoch": 0.003255, "grad_norm": 0.21635736525058746, "learning_rate": 1e-05, "loss": 0.0261, "step": 325500 }, { "epoch": 0.003256, "grad_norm": 0.2766045928001404, "learning_rate": 1e-05, "loss": 0.0262, "step": 325600 }, { "epoch": 0.003257, "grad_norm": 0.19436015188694, "learning_rate": 1e-05, "loss": 0.0262, "step": 325700 }, { "epoch": 0.003258, "grad_norm": 0.2767942249774933, "learning_rate": 1e-05, "loss": 0.0261, "step": 325800 }, { "epoch": 0.003259, "grad_norm": 0.22149808704853058, "learning_rate": 1e-05, "loss": 0.0261, "step": 325900 }, { "epoch": 0.00326, "grad_norm": 0.1768575757741928, "learning_rate": 1e-05, "loss": 0.0256, "step": 326000 }, { "epoch": 0.003261, "grad_norm": 0.2387610524892807, "learning_rate": 1e-05, "loss": 0.026, "step": 326100 }, { "epoch": 0.003262, "grad_norm": 0.22350601851940155, "learning_rate": 1e-05, "loss": 0.0266, "step": 326200 }, { "epoch": 0.003263, "grad_norm": 0.2076563835144043, "learning_rate": 1e-05, "loss": 0.0263, "step": 326300 }, { "epoch": 0.003264, "grad_norm": 0.21184878051280975, "learning_rate": 1e-05, "loss": 0.0262, "step": 326400 }, { "epoch": 0.003265, "grad_norm": 0.22706152498722076, "learning_rate": 1e-05, "loss": 0.0258, "step": 326500 }, { "epoch": 0.003266, "grad_norm": 0.21045376360416412, "learning_rate": 1e-05, "loss": 0.0256, "step": 326600 }, { "epoch": 0.003267, "grad_norm": 0.194964200258255, "learning_rate": 1e-05, "loss": 0.026, "step": 326700 }, { "epoch": 0.003268, "grad_norm": 0.18685370683670044, "learning_rate": 1e-05, "loss": 0.0258, "step": 326800 }, { "epoch": 0.003269, "grad_norm": 0.20090927183628082, "learning_rate": 1e-05, "loss": 0.0262, "step": 326900 }, { "epoch": 0.00327, "grad_norm": 0.3244345486164093, "learning_rate": 1e-05, "loss": 0.0259, "step": 327000 }, { "epoch": 0.003271, "grad_norm": 0.18388201296329498, "learning_rate": 1e-05, "loss": 0.0265, "step": 327100 }, { "epoch": 0.003272, "grad_norm": 0.21499456465244293, "learning_rate": 1e-05, "loss": 0.0268, "step": 327200 }, { "epoch": 0.003273, "grad_norm": 0.19232648611068726, "learning_rate": 1e-05, "loss": 0.0265, "step": 327300 }, { "epoch": 0.003274, "grad_norm": 0.21647612750530243, "learning_rate": 1e-05, "loss": 0.0265, "step": 327400 }, { "epoch": 0.003275, "grad_norm": 0.269710898399353, "learning_rate": 1e-05, "loss": 0.0267, "step": 327500 }, { "epoch": 0.003276, "grad_norm": 0.3085789382457733, "learning_rate": 1e-05, "loss": 0.026, "step": 327600 }, { "epoch": 0.003277, "grad_norm": 0.19371287524700165, "learning_rate": 1e-05, "loss": 0.0261, "step": 327700 }, { "epoch": 0.003278, "grad_norm": 0.18252968788146973, "learning_rate": 1e-05, "loss": 0.0264, "step": 327800 }, { "epoch": 0.003279, "grad_norm": 0.16958999633789062, "learning_rate": 1e-05, "loss": 0.0262, "step": 327900 }, { "epoch": 0.00328, "grad_norm": 0.1983850598335266, "learning_rate": 1e-05, "loss": 0.0262, "step": 328000 }, { "epoch": 0.003281, "grad_norm": 0.20766080915927887, "learning_rate": 1e-05, "loss": 0.0264, "step": 328100 }, { "epoch": 0.003282, "grad_norm": 0.3172662556171417, "learning_rate": 1e-05, "loss": 0.0258, "step": 328200 }, { "epoch": 0.003283, "grad_norm": 0.2901023328304291, "learning_rate": 1e-05, "loss": 0.026, "step": 328300 }, { "epoch": 0.003284, "grad_norm": 0.18460452556610107, "learning_rate": 1e-05, "loss": 0.026, "step": 328400 }, { "epoch": 0.003285, "grad_norm": 0.22881858050823212, "learning_rate": 1e-05, "loss": 0.0264, "step": 328500 }, { "epoch": 0.003286, "grad_norm": 0.2943063974380493, "learning_rate": 1e-05, "loss": 0.0258, "step": 328600 }, { "epoch": 0.003287, "grad_norm": 0.2572328448295593, "learning_rate": 1e-05, "loss": 0.0268, "step": 328700 }, { "epoch": 0.003288, "grad_norm": 0.2629999816417694, "learning_rate": 1e-05, "loss": 0.026, "step": 328800 }, { "epoch": 0.003289, "grad_norm": 0.5243650078773499, "learning_rate": 1e-05, "loss": 0.0262, "step": 328900 }, { "epoch": 0.00329, "grad_norm": 0.2574480473995209, "learning_rate": 1e-05, "loss": 0.0257, "step": 329000 }, { "epoch": 0.003291, "grad_norm": 0.21229717135429382, "learning_rate": 1e-05, "loss": 0.0261, "step": 329100 }, { "epoch": 0.003292, "grad_norm": 0.29522302746772766, "learning_rate": 1e-05, "loss": 0.0262, "step": 329200 }, { "epoch": 0.003293, "grad_norm": 0.20544525980949402, "learning_rate": 1e-05, "loss": 0.0257, "step": 329300 }, { "epoch": 0.003294, "grad_norm": 0.1784791350364685, "learning_rate": 1e-05, "loss": 0.0254, "step": 329400 }, { "epoch": 0.003295, "grad_norm": 0.19268448650836945, "learning_rate": 1e-05, "loss": 0.0265, "step": 329500 }, { "epoch": 0.003296, "grad_norm": 0.2304309457540512, "learning_rate": 1e-05, "loss": 0.0253, "step": 329600 }, { "epoch": 0.003297, "grad_norm": 0.21994438767433167, "learning_rate": 1e-05, "loss": 0.0259, "step": 329700 }, { "epoch": 0.003298, "grad_norm": 0.21115253865718842, "learning_rate": 1e-05, "loss": 0.0262, "step": 329800 }, { "epoch": 0.003299, "grad_norm": 0.24736624956130981, "learning_rate": 1e-05, "loss": 0.0256, "step": 329900 }, { "epoch": 0.0033, "grad_norm": 0.19289976358413696, "learning_rate": 1e-05, "loss": 0.0254, "step": 330000 }, { "epoch": 0.003301, "grad_norm": 0.24266310036182404, "learning_rate": 1e-05, "loss": 0.0255, "step": 330100 }, { "epoch": 0.003302, "grad_norm": 0.1794329285621643, "learning_rate": 1e-05, "loss": 0.0262, "step": 330200 }, { "epoch": 0.003303, "grad_norm": 0.2627459764480591, "learning_rate": 1e-05, "loss": 0.026, "step": 330300 }, { "epoch": 0.003304, "grad_norm": 0.2112918347120285, "learning_rate": 1e-05, "loss": 0.0258, "step": 330400 }, { "epoch": 0.003305, "grad_norm": 0.2609831690788269, "learning_rate": 1e-05, "loss": 0.0263, "step": 330500 }, { "epoch": 0.003306, "grad_norm": 0.19058114290237427, "learning_rate": 1e-05, "loss": 0.0258, "step": 330600 }, { "epoch": 0.003307, "grad_norm": 0.28339463472366333, "learning_rate": 1e-05, "loss": 0.0258, "step": 330700 }, { "epoch": 0.003308, "grad_norm": 0.19595804810523987, "learning_rate": 1e-05, "loss": 0.0257, "step": 330800 }, { "epoch": 0.003309, "grad_norm": 0.23688659071922302, "learning_rate": 1e-05, "loss": 0.0258, "step": 330900 }, { "epoch": 0.00331, "grad_norm": 0.18772047758102417, "learning_rate": 1e-05, "loss": 0.0255, "step": 331000 }, { "epoch": 0.003311, "grad_norm": 0.17910416424274445, "learning_rate": 1e-05, "loss": 0.0256, "step": 331100 }, { "epoch": 0.003312, "grad_norm": 0.22274568676948547, "learning_rate": 1e-05, "loss": 0.0263, "step": 331200 }, { "epoch": 0.003313, "grad_norm": 0.24044393002986908, "learning_rate": 1e-05, "loss": 0.0258, "step": 331300 }, { "epoch": 0.003314, "grad_norm": 0.2564503848552704, "learning_rate": 1e-05, "loss": 0.0264, "step": 331400 }, { "epoch": 0.003315, "grad_norm": 0.2059996873140335, "learning_rate": 1e-05, "loss": 0.0259, "step": 331500 }, { "epoch": 0.003316, "grad_norm": 0.18391820788383484, "learning_rate": 1e-05, "loss": 0.0257, "step": 331600 }, { "epoch": 0.003317, "grad_norm": 0.24225758016109467, "learning_rate": 1e-05, "loss": 0.0259, "step": 331700 }, { "epoch": 0.003318, "grad_norm": 0.16476255655288696, "learning_rate": 1e-05, "loss": 0.0262, "step": 331800 }, { "epoch": 0.003319, "grad_norm": 0.18719999492168427, "learning_rate": 1e-05, "loss": 0.0264, "step": 331900 }, { "epoch": 0.00332, "grad_norm": 0.2051406055688858, "learning_rate": 1e-05, "loss": 0.0259, "step": 332000 }, { "epoch": 0.003321, "grad_norm": 0.2580793797969818, "learning_rate": 1e-05, "loss": 0.0264, "step": 332100 }, { "epoch": 0.003322, "grad_norm": 0.2719079852104187, "learning_rate": 1e-05, "loss": 0.0259, "step": 332200 }, { "epoch": 0.003323, "grad_norm": 0.20152629911899567, "learning_rate": 1e-05, "loss": 0.0258, "step": 332300 }, { "epoch": 0.003324, "grad_norm": 0.24209684133529663, "learning_rate": 1e-05, "loss": 0.0261, "step": 332400 }, { "epoch": 0.003325, "grad_norm": 0.20891176164150238, "learning_rate": 1e-05, "loss": 0.0261, "step": 332500 }, { "epoch": 0.003326, "grad_norm": 0.24177920818328857, "learning_rate": 1e-05, "loss": 0.0262, "step": 332600 }, { "epoch": 0.003327, "grad_norm": 0.29511672258377075, "learning_rate": 1e-05, "loss": 0.0257, "step": 332700 }, { "epoch": 0.003328, "grad_norm": 0.24053826928138733, "learning_rate": 1e-05, "loss": 0.0259, "step": 332800 }, { "epoch": 0.003329, "grad_norm": 0.18710261583328247, "learning_rate": 1e-05, "loss": 0.0253, "step": 332900 }, { "epoch": 0.00333, "grad_norm": 0.23774990439414978, "learning_rate": 1e-05, "loss": 0.0255, "step": 333000 }, { "epoch": 0.003331, "grad_norm": 0.16255471110343933, "learning_rate": 1e-05, "loss": 0.026, "step": 333100 }, { "epoch": 0.003332, "grad_norm": 0.42311015725135803, "learning_rate": 1e-05, "loss": 0.0259, "step": 333200 }, { "epoch": 0.003333, "grad_norm": 0.22482597827911377, "learning_rate": 1e-05, "loss": 0.026, "step": 333300 }, { "epoch": 0.003334, "grad_norm": 0.2218495011329651, "learning_rate": 1e-05, "loss": 0.0257, "step": 333400 }, { "epoch": 0.003335, "grad_norm": 0.19828921556472778, "learning_rate": 1e-05, "loss": 0.0256, "step": 333500 }, { "epoch": 0.003336, "grad_norm": 0.1843634992837906, "learning_rate": 1e-05, "loss": 0.0264, "step": 333600 }, { "epoch": 0.003337, "grad_norm": 0.25892484188079834, "learning_rate": 1e-05, "loss": 0.0258, "step": 333700 }, { "epoch": 0.003338, "grad_norm": 0.23629409074783325, "learning_rate": 1e-05, "loss": 0.0253, "step": 333800 }, { "epoch": 0.003339, "grad_norm": 0.22419913113117218, "learning_rate": 1e-05, "loss": 0.0258, "step": 333900 }, { "epoch": 0.00334, "grad_norm": 0.2854868769645691, "learning_rate": 1e-05, "loss": 0.0259, "step": 334000 }, { "epoch": 0.003341, "grad_norm": 0.18664538860321045, "learning_rate": 1e-05, "loss": 0.0264, "step": 334100 }, { "epoch": 0.003342, "grad_norm": 0.22402864694595337, "learning_rate": 1e-05, "loss": 0.0265, "step": 334200 }, { "epoch": 0.003343, "grad_norm": 0.28049710392951965, "learning_rate": 1e-05, "loss": 0.0257, "step": 334300 }, { "epoch": 0.003344, "grad_norm": 0.18755163252353668, "learning_rate": 1e-05, "loss": 0.0252, "step": 334400 }, { "epoch": 0.003345, "grad_norm": 0.24187353253364563, "learning_rate": 1e-05, "loss": 0.026, "step": 334500 }, { "epoch": 0.003346, "grad_norm": 0.17730286717414856, "learning_rate": 1e-05, "loss": 0.0257, "step": 334600 }, { "epoch": 0.003347, "grad_norm": 0.20491741597652435, "learning_rate": 1e-05, "loss": 0.0257, "step": 334700 }, { "epoch": 0.003348, "grad_norm": 0.21280677616596222, "learning_rate": 1e-05, "loss": 0.0259, "step": 334800 }, { "epoch": 0.003349, "grad_norm": 0.22672335803508759, "learning_rate": 1e-05, "loss": 0.0257, "step": 334900 }, { "epoch": 0.00335, "grad_norm": 0.195067897439003, "learning_rate": 1e-05, "loss": 0.0256, "step": 335000 }, { "epoch": 0.003351, "grad_norm": 0.29459336400032043, "learning_rate": 1e-05, "loss": 0.0259, "step": 335100 }, { "epoch": 0.003352, "grad_norm": 0.26386862993240356, "learning_rate": 1e-05, "loss": 0.0264, "step": 335200 }, { "epoch": 0.003353, "grad_norm": 0.17338527739048004, "learning_rate": 1e-05, "loss": 0.0259, "step": 335300 }, { "epoch": 0.003354, "grad_norm": 0.20065303146839142, "learning_rate": 1e-05, "loss": 0.0258, "step": 335400 }, { "epoch": 0.003355, "grad_norm": 0.19764775037765503, "learning_rate": 1e-05, "loss": 0.0251, "step": 335500 }, { "epoch": 0.003356, "grad_norm": 0.2741958796977997, "learning_rate": 1e-05, "loss": 0.0255, "step": 335600 }, { "epoch": 0.003357, "grad_norm": 0.28739771246910095, "learning_rate": 1e-05, "loss": 0.0259, "step": 335700 }, { "epoch": 0.003358, "grad_norm": 0.18067622184753418, "learning_rate": 1e-05, "loss": 0.0254, "step": 335800 }, { "epoch": 0.003359, "grad_norm": 0.22345194220542908, "learning_rate": 1e-05, "loss": 0.0262, "step": 335900 }, { "epoch": 0.00336, "grad_norm": 0.2457740306854248, "learning_rate": 1e-05, "loss": 0.0261, "step": 336000 }, { "epoch": 0.003361, "grad_norm": 0.2562016546726227, "learning_rate": 1e-05, "loss": 0.0254, "step": 336100 }, { "epoch": 0.003362, "grad_norm": 0.22217346727848053, "learning_rate": 1e-05, "loss": 0.0258, "step": 336200 }, { "epoch": 0.003363, "grad_norm": 0.2666025757789612, "learning_rate": 1e-05, "loss": 0.0259, "step": 336300 }, { "epoch": 0.003364, "grad_norm": 0.2566533386707306, "learning_rate": 1e-05, "loss": 0.026, "step": 336400 }, { "epoch": 0.003365, "grad_norm": 0.21106812357902527, "learning_rate": 1e-05, "loss": 0.0259, "step": 336500 }, { "epoch": 0.003366, "grad_norm": 0.2081550657749176, "learning_rate": 1e-05, "loss": 0.0261, "step": 336600 }, { "epoch": 0.003367, "grad_norm": 0.25009188055992126, "learning_rate": 1e-05, "loss": 0.0252, "step": 336700 }, { "epoch": 0.003368, "grad_norm": 0.1966935098171234, "learning_rate": 1e-05, "loss": 0.0263, "step": 336800 }, { "epoch": 0.003369, "grad_norm": 0.22717143595218658, "learning_rate": 1e-05, "loss": 0.0261, "step": 336900 }, { "epoch": 0.00337, "grad_norm": 0.19842340052127838, "learning_rate": 1e-05, "loss": 0.0259, "step": 337000 }, { "epoch": 0.003371, "grad_norm": 0.19794026017189026, "learning_rate": 1e-05, "loss": 0.0259, "step": 337100 }, { "epoch": 0.003372, "grad_norm": 0.2157779335975647, "learning_rate": 1e-05, "loss": 0.0256, "step": 337200 }, { "epoch": 0.003373, "grad_norm": 0.2926463186740875, "learning_rate": 1e-05, "loss": 0.0257, "step": 337300 }, { "epoch": 0.003374, "grad_norm": 0.2731887996196747, "learning_rate": 1e-05, "loss": 0.0255, "step": 337400 }, { "epoch": 0.003375, "grad_norm": 0.20208173990249634, "learning_rate": 1e-05, "loss": 0.026, "step": 337500 }, { "epoch": 0.003376, "grad_norm": 0.19638560712337494, "learning_rate": 1e-05, "loss": 0.0255, "step": 337600 }, { "epoch": 0.003377, "grad_norm": 0.19940988719463348, "learning_rate": 1e-05, "loss": 0.0257, "step": 337700 }, { "epoch": 0.003378, "grad_norm": 0.21363966166973114, "learning_rate": 1e-05, "loss": 0.0252, "step": 337800 }, { "epoch": 0.003379, "grad_norm": 0.21082064509391785, "learning_rate": 1e-05, "loss": 0.0259, "step": 337900 }, { "epoch": 0.00338, "grad_norm": 0.19800077378749847, "learning_rate": 1e-05, "loss": 0.0259, "step": 338000 }, { "epoch": 0.003381, "grad_norm": 0.1880570948123932, "learning_rate": 1e-05, "loss": 0.026, "step": 338100 }, { "epoch": 0.003382, "grad_norm": 0.20191872119903564, "learning_rate": 1e-05, "loss": 0.0262, "step": 338200 }, { "epoch": 0.003383, "grad_norm": 0.31847622990608215, "learning_rate": 1e-05, "loss": 0.0262, "step": 338300 }, { "epoch": 0.003384, "grad_norm": 0.2615985870361328, "learning_rate": 1e-05, "loss": 0.0262, "step": 338400 }, { "epoch": 0.003385, "grad_norm": 0.18741188943386078, "learning_rate": 1e-05, "loss": 0.0252, "step": 338500 }, { "epoch": 0.003386, "grad_norm": 0.2529643177986145, "learning_rate": 1e-05, "loss": 0.0263, "step": 338600 }, { "epoch": 0.003387, "grad_norm": 0.19686651229858398, "learning_rate": 1e-05, "loss": 0.0254, "step": 338700 }, { "epoch": 0.003388, "grad_norm": 0.26150697469711304, "learning_rate": 1e-05, "loss": 0.0257, "step": 338800 }, { "epoch": 0.003389, "grad_norm": 0.22503972053527832, "learning_rate": 1e-05, "loss": 0.0258, "step": 338900 }, { "epoch": 0.00339, "grad_norm": 0.1803523153066635, "learning_rate": 1e-05, "loss": 0.025, "step": 339000 }, { "epoch": 0.003391, "grad_norm": 0.17200639843940735, "learning_rate": 1e-05, "loss": 0.0254, "step": 339100 }, { "epoch": 0.003392, "grad_norm": 0.28044119477272034, "learning_rate": 1e-05, "loss": 0.0256, "step": 339200 }, { "epoch": 0.003393, "grad_norm": 0.2753671407699585, "learning_rate": 1e-05, "loss": 0.0257, "step": 339300 }, { "epoch": 0.003394, "grad_norm": 0.20262108743190765, "learning_rate": 1e-05, "loss": 0.0259, "step": 339400 }, { "epoch": 0.003395, "grad_norm": 0.19933193922042847, "learning_rate": 1e-05, "loss": 0.0258, "step": 339500 }, { "epoch": 0.003396, "grad_norm": 0.2587679624557495, "learning_rate": 1e-05, "loss": 0.0256, "step": 339600 }, { "epoch": 0.003397, "grad_norm": 0.25301435589790344, "learning_rate": 1e-05, "loss": 0.0261, "step": 339700 }, { "epoch": 0.003398, "grad_norm": 0.2259465903043747, "learning_rate": 1e-05, "loss": 0.0259, "step": 339800 }, { "epoch": 0.003399, "grad_norm": 0.1816311776638031, "learning_rate": 1e-05, "loss": 0.0257, "step": 339900 }, { "epoch": 0.0034, "grad_norm": 0.1964031159877777, "learning_rate": 1e-05, "loss": 0.0258, "step": 340000 }, { "epoch": 0.0034, "eval_loss": 0.022262409329414368, "eval_runtime": 167.8122, "eval_samples_per_second": 297.952, "eval_steps_per_second": 18.622, "step": 340000 }, { "epoch": 0.003401, "grad_norm": 0.19455471634864807, "learning_rate": 1e-05, "loss": 0.0255, "step": 340100 }, { "epoch": 0.003402, "grad_norm": 0.20395009219646454, "learning_rate": 1e-05, "loss": 0.0256, "step": 340200 }, { "epoch": 0.003403, "grad_norm": 0.4013252258300781, "learning_rate": 1e-05, "loss": 0.0259, "step": 340300 }, { "epoch": 0.003404, "grad_norm": 0.12435686588287354, "learning_rate": 1e-05, "loss": 0.0258, "step": 340400 }, { "epoch": 0.003405, "grad_norm": 0.21206669509410858, "learning_rate": 1e-05, "loss": 0.0258, "step": 340500 }, { "epoch": 0.003406, "grad_norm": 0.22015072405338287, "learning_rate": 1e-05, "loss": 0.0261, "step": 340600 }, { "epoch": 0.003407, "grad_norm": 0.17829914391040802, "learning_rate": 1e-05, "loss": 0.0257, "step": 340700 }, { "epoch": 0.003408, "grad_norm": 0.30154353380203247, "learning_rate": 1e-05, "loss": 0.0259, "step": 340800 }, { "epoch": 0.003409, "grad_norm": 0.22691747546195984, "learning_rate": 1e-05, "loss": 0.0256, "step": 340900 }, { "epoch": 0.00341, "grad_norm": 0.24601145088672638, "learning_rate": 1e-05, "loss": 0.026, "step": 341000 }, { "epoch": 0.003411, "grad_norm": 0.18850694596767426, "learning_rate": 1e-05, "loss": 0.0255, "step": 341100 }, { "epoch": 0.003412, "grad_norm": 0.1777670979499817, "learning_rate": 1e-05, "loss": 0.0256, "step": 341200 }, { "epoch": 0.003413, "grad_norm": 0.16222679615020752, "learning_rate": 1e-05, "loss": 0.0256, "step": 341300 }, { "epoch": 0.003414, "grad_norm": 0.21548481285572052, "learning_rate": 1e-05, "loss": 0.0256, "step": 341400 }, { "epoch": 0.003415, "grad_norm": 0.21011929214000702, "learning_rate": 1e-05, "loss": 0.0254, "step": 341500 }, { "epoch": 0.003416, "grad_norm": 0.20526932179927826, "learning_rate": 1e-05, "loss": 0.025, "step": 341600 }, { "epoch": 0.003417, "grad_norm": 0.17155171930789948, "learning_rate": 1e-05, "loss": 0.0255, "step": 341700 }, { "epoch": 0.003418, "grad_norm": 0.20339873433113098, "learning_rate": 1e-05, "loss": 0.0254, "step": 341800 }, { "epoch": 0.003419, "grad_norm": 0.20680727064609528, "learning_rate": 1e-05, "loss": 0.0253, "step": 341900 }, { "epoch": 0.00342, "grad_norm": 0.24560657143592834, "learning_rate": 1e-05, "loss": 0.0257, "step": 342000 }, { "epoch": 0.003421, "grad_norm": 0.20514988899230957, "learning_rate": 1e-05, "loss": 0.0258, "step": 342100 }, { "epoch": 0.003422, "grad_norm": 0.16690513491630554, "learning_rate": 1e-05, "loss": 0.0258, "step": 342200 }, { "epoch": 0.003423, "grad_norm": 0.2593839466571808, "learning_rate": 1e-05, "loss": 0.0258, "step": 342300 }, { "epoch": 0.003424, "grad_norm": 0.15296988189220428, "learning_rate": 1e-05, "loss": 0.025, "step": 342400 }, { "epoch": 0.003425, "grad_norm": 0.2260589599609375, "learning_rate": 1e-05, "loss": 0.0257, "step": 342500 }, { "epoch": 0.003426, "grad_norm": 0.3480568528175354, "learning_rate": 1e-05, "loss": 0.0253, "step": 342600 }, { "epoch": 0.003427, "grad_norm": 0.2292780727148056, "learning_rate": 1e-05, "loss": 0.0255, "step": 342700 }, { "epoch": 0.003428, "grad_norm": 0.2946573495864868, "learning_rate": 1e-05, "loss": 0.0255, "step": 342800 }, { "epoch": 0.003429, "grad_norm": 0.26851218938827515, "learning_rate": 1e-05, "loss": 0.0255, "step": 342900 }, { "epoch": 0.00343, "grad_norm": 0.2180161476135254, "learning_rate": 1e-05, "loss": 0.0254, "step": 343000 }, { "epoch": 0.003431, "grad_norm": 0.24760955572128296, "learning_rate": 1e-05, "loss": 0.0256, "step": 343100 }, { "epoch": 0.003432, "grad_norm": 0.1669629067182541, "learning_rate": 1e-05, "loss": 0.0259, "step": 343200 }, { "epoch": 0.003433, "grad_norm": 0.21703343093395233, "learning_rate": 1e-05, "loss": 0.0251, "step": 343300 }, { "epoch": 0.003434, "grad_norm": 0.22390207648277283, "learning_rate": 1e-05, "loss": 0.0252, "step": 343400 }, { "epoch": 0.003435, "grad_norm": 0.2521446943283081, "learning_rate": 1e-05, "loss": 0.0257, "step": 343500 }, { "epoch": 0.003436, "grad_norm": 1.0345635414123535, "learning_rate": 1e-05, "loss": 0.0257, "step": 343600 }, { "epoch": 0.003437, "grad_norm": 0.19848418235778809, "learning_rate": 1e-05, "loss": 0.0255, "step": 343700 }, { "epoch": 0.003438, "grad_norm": 0.19186519086360931, "learning_rate": 1e-05, "loss": 0.0249, "step": 343800 }, { "epoch": 0.003439, "grad_norm": 0.22862061858177185, "learning_rate": 1e-05, "loss": 0.0261, "step": 343900 }, { "epoch": 0.00344, "grad_norm": 0.26299968361854553, "learning_rate": 1e-05, "loss": 0.0255, "step": 344000 }, { "epoch": 0.003441, "grad_norm": 0.22287769615650177, "learning_rate": 1e-05, "loss": 0.0254, "step": 344100 }, { "epoch": 0.003442, "grad_norm": 0.2176257073879242, "learning_rate": 1e-05, "loss": 0.0254, "step": 344200 }, { "epoch": 0.003443, "grad_norm": 0.2146892100572586, "learning_rate": 1e-05, "loss": 0.0255, "step": 344300 }, { "epoch": 0.003444, "grad_norm": 0.24948792159557343, "learning_rate": 1e-05, "loss": 0.0255, "step": 344400 }, { "epoch": 0.003445, "grad_norm": 0.1785217672586441, "learning_rate": 1e-05, "loss": 0.0258, "step": 344500 }, { "epoch": 0.003446, "grad_norm": 0.24420130252838135, "learning_rate": 1e-05, "loss": 0.0259, "step": 344600 }, { "epoch": 0.003447, "grad_norm": 0.2950577735900879, "learning_rate": 1e-05, "loss": 0.0251, "step": 344700 }, { "epoch": 0.003448, "grad_norm": 0.18404103815555573, "learning_rate": 1e-05, "loss": 0.026, "step": 344800 }, { "epoch": 0.003449, "grad_norm": 0.27592894434928894, "learning_rate": 1e-05, "loss": 0.0258, "step": 344900 }, { "epoch": 0.00345, "grad_norm": 0.20414221286773682, "learning_rate": 1e-05, "loss": 0.0257, "step": 345000 }, { "epoch": 0.003451, "grad_norm": 0.17545627057552338, "learning_rate": 1e-05, "loss": 0.0257, "step": 345100 }, { "epoch": 0.003452, "grad_norm": 0.23395714163780212, "learning_rate": 1e-05, "loss": 0.026, "step": 345200 }, { "epoch": 0.003453, "grad_norm": 0.19793027639389038, "learning_rate": 1e-05, "loss": 0.0251, "step": 345300 }, { "epoch": 0.003454, "grad_norm": 0.1931024044752121, "learning_rate": 1e-05, "loss": 0.0252, "step": 345400 }, { "epoch": 0.003455, "grad_norm": 0.24904154241085052, "learning_rate": 1e-05, "loss": 0.0257, "step": 345500 }, { "epoch": 0.003456, "grad_norm": 0.2803846001625061, "learning_rate": 1e-05, "loss": 0.0259, "step": 345600 }, { "epoch": 0.003457, "grad_norm": 0.3020590841770172, "learning_rate": 1e-05, "loss": 0.0248, "step": 345700 }, { "epoch": 0.003458, "grad_norm": 0.1645859181880951, "learning_rate": 1e-05, "loss": 0.0254, "step": 345800 }, { "epoch": 0.003459, "grad_norm": 0.2023889571428299, "learning_rate": 1e-05, "loss": 0.0258, "step": 345900 }, { "epoch": 0.00346, "grad_norm": 0.2420167624950409, "learning_rate": 1e-05, "loss": 0.0254, "step": 346000 }, { "epoch": 0.003461, "grad_norm": 0.23831528425216675, "learning_rate": 1e-05, "loss": 0.0256, "step": 346100 }, { "epoch": 0.003462, "grad_norm": 0.17657887935638428, "learning_rate": 1e-05, "loss": 0.0253, "step": 346200 }, { "epoch": 0.003463, "grad_norm": 0.21234019100666046, "learning_rate": 1e-05, "loss": 0.0258, "step": 346300 }, { "epoch": 0.003464, "grad_norm": 0.24785380065441132, "learning_rate": 1e-05, "loss": 0.0255, "step": 346400 }, { "epoch": 0.003465, "grad_norm": 0.1959632933139801, "learning_rate": 1e-05, "loss": 0.0255, "step": 346500 }, { "epoch": 0.003466, "grad_norm": 0.151380717754364, "learning_rate": 1e-05, "loss": 0.0254, "step": 346600 }, { "epoch": 0.003467, "grad_norm": 0.2139723300933838, "learning_rate": 1e-05, "loss": 0.0251, "step": 346700 }, { "epoch": 0.003468, "grad_norm": 0.16897976398468018, "learning_rate": 1e-05, "loss": 0.0257, "step": 346800 }, { "epoch": 0.003469, "grad_norm": 0.1994641125202179, "learning_rate": 1e-05, "loss": 0.0256, "step": 346900 }, { "epoch": 0.00347, "grad_norm": 0.1821015477180481, "learning_rate": 1e-05, "loss": 0.0258, "step": 347000 }, { "epoch": 0.003471, "grad_norm": 0.21710331737995148, "learning_rate": 1e-05, "loss": 0.0254, "step": 347100 }, { "epoch": 0.003472, "grad_norm": 0.22937925159931183, "learning_rate": 1e-05, "loss": 0.0264, "step": 347200 }, { "epoch": 0.003473, "grad_norm": 0.1922784447669983, "learning_rate": 1e-05, "loss": 0.0257, "step": 347300 }, { "epoch": 0.003474, "grad_norm": 0.17950443923473358, "learning_rate": 1e-05, "loss": 0.0253, "step": 347400 }, { "epoch": 0.003475, "grad_norm": 0.170990988612175, "learning_rate": 1e-05, "loss": 0.0257, "step": 347500 }, { "epoch": 0.003476, "grad_norm": 0.1855325698852539, "learning_rate": 1e-05, "loss": 0.0257, "step": 347600 }, { "epoch": 0.003477, "grad_norm": 0.2391960620880127, "learning_rate": 1e-05, "loss": 0.0255, "step": 347700 }, { "epoch": 0.003478, "grad_norm": 0.16121315956115723, "learning_rate": 1e-05, "loss": 0.0255, "step": 347800 }, { "epoch": 0.003479, "grad_norm": 0.2749963700771332, "learning_rate": 1e-05, "loss": 0.0256, "step": 347900 }, { "epoch": 0.00348, "grad_norm": 0.7174862027168274, "learning_rate": 1e-05, "loss": 0.0256, "step": 348000 }, { "epoch": 0.003481, "grad_norm": 0.23689448833465576, "learning_rate": 1e-05, "loss": 0.0256, "step": 348100 }, { "epoch": 0.003482, "grad_norm": 0.18093661963939667, "learning_rate": 1e-05, "loss": 0.0254, "step": 348200 }, { "epoch": 0.003483, "grad_norm": 0.20836983621120453, "learning_rate": 1e-05, "loss": 0.0261, "step": 348300 }, { "epoch": 0.003484, "grad_norm": 0.18827295303344727, "learning_rate": 1e-05, "loss": 0.0253, "step": 348400 }, { "epoch": 0.003485, "grad_norm": 0.15924328565597534, "learning_rate": 1e-05, "loss": 0.0255, "step": 348500 }, { "epoch": 0.003486, "grad_norm": 0.19388347864151, "learning_rate": 1e-05, "loss": 0.0256, "step": 348600 }, { "epoch": 0.003487, "grad_norm": 0.2514536380767822, "learning_rate": 1e-05, "loss": 0.0258, "step": 348700 }, { "epoch": 0.003488, "grad_norm": 0.27309146523475647, "learning_rate": 1e-05, "loss": 0.0248, "step": 348800 }, { "epoch": 0.003489, "grad_norm": 0.18778039515018463, "learning_rate": 1e-05, "loss": 0.0258, "step": 348900 }, { "epoch": 0.00349, "grad_norm": 0.22264915704727173, "learning_rate": 1e-05, "loss": 0.025, "step": 349000 }, { "epoch": 0.003491, "grad_norm": 0.22129440307617188, "learning_rate": 1e-05, "loss": 0.0255, "step": 349100 }, { "epoch": 0.003492, "grad_norm": 0.3384068012237549, "learning_rate": 1e-05, "loss": 0.0252, "step": 349200 }, { "epoch": 0.003493, "grad_norm": 0.20708250999450684, "learning_rate": 1e-05, "loss": 0.0256, "step": 349300 }, { "epoch": 0.003494, "grad_norm": 0.21864783763885498, "learning_rate": 1e-05, "loss": 0.0254, "step": 349400 }, { "epoch": 0.003495, "grad_norm": 0.21621127426624298, "learning_rate": 1e-05, "loss": 0.0257, "step": 349500 }, { "epoch": 0.003496, "grad_norm": 0.19588179886341095, "learning_rate": 1e-05, "loss": 0.0252, "step": 349600 }, { "epoch": 0.003497, "grad_norm": 0.28438800573349, "learning_rate": 1e-05, "loss": 0.0249, "step": 349700 }, { "epoch": 0.003498, "grad_norm": 0.2554504871368408, "learning_rate": 1e-05, "loss": 0.0256, "step": 349800 }, { "epoch": 0.003499, "grad_norm": 0.21558262407779694, "learning_rate": 1e-05, "loss": 0.0249, "step": 349900 }, { "epoch": 0.0035, "grad_norm": 0.2280120998620987, "learning_rate": 1e-05, "loss": 0.0258, "step": 350000 }, { "epoch": 0.003501, "grad_norm": 0.5311402678489685, "learning_rate": 1e-05, "loss": 0.0251, "step": 350100 }, { "epoch": 0.003502, "grad_norm": 0.16265453398227692, "learning_rate": 1e-05, "loss": 0.0256, "step": 350200 }, { "epoch": 0.003503, "grad_norm": 0.25673002004623413, "learning_rate": 1e-05, "loss": 0.0251, "step": 350300 }, { "epoch": 0.003504, "grad_norm": 0.18486860394477844, "learning_rate": 1e-05, "loss": 0.0258, "step": 350400 }, { "epoch": 0.003505, "grad_norm": 0.17345423996448517, "learning_rate": 1e-05, "loss": 0.025, "step": 350500 }, { "epoch": 0.003506, "grad_norm": 0.2084941565990448, "learning_rate": 1e-05, "loss": 0.0252, "step": 350600 }, { "epoch": 0.003507, "grad_norm": 0.19509057700634003, "learning_rate": 1e-05, "loss": 0.0261, "step": 350700 }, { "epoch": 0.003508, "grad_norm": 0.17678861320018768, "learning_rate": 1e-05, "loss": 0.026, "step": 350800 }, { "epoch": 0.003509, "grad_norm": 0.20327670872211456, "learning_rate": 1e-05, "loss": 0.0255, "step": 350900 }, { "epoch": 0.00351, "grad_norm": 0.3554207384586334, "learning_rate": 1e-05, "loss": 0.0253, "step": 351000 }, { "epoch": 0.003511, "grad_norm": 0.3405420482158661, "learning_rate": 1e-05, "loss": 0.0259, "step": 351100 }, { "epoch": 0.003512, "grad_norm": 0.21139048039913177, "learning_rate": 1e-05, "loss": 0.0256, "step": 351200 }, { "epoch": 0.003513, "grad_norm": 0.2039434164762497, "learning_rate": 1e-05, "loss": 0.0259, "step": 351300 }, { "epoch": 0.003514, "grad_norm": 0.23392628133296967, "learning_rate": 1e-05, "loss": 0.026, "step": 351400 }, { "epoch": 0.003515, "grad_norm": 0.20103168487548828, "learning_rate": 1e-05, "loss": 0.0257, "step": 351500 }, { "epoch": 0.003516, "grad_norm": 0.22161120176315308, "learning_rate": 1e-05, "loss": 0.0257, "step": 351600 }, { "epoch": 0.003517, "grad_norm": 0.2196168601512909, "learning_rate": 1e-05, "loss": 0.0249, "step": 351700 }, { "epoch": 0.003518, "grad_norm": 0.21681076288223267, "learning_rate": 1e-05, "loss": 0.0254, "step": 351800 }, { "epoch": 0.003519, "grad_norm": 0.20995111763477325, "learning_rate": 1e-05, "loss": 0.0246, "step": 351900 }, { "epoch": 0.00352, "grad_norm": 0.18594080209732056, "learning_rate": 1e-05, "loss": 0.0256, "step": 352000 }, { "epoch": 0.003521, "grad_norm": 0.2837679088115692, "learning_rate": 1e-05, "loss": 0.0253, "step": 352100 }, { "epoch": 0.003522, "grad_norm": 0.21709731221199036, "learning_rate": 1e-05, "loss": 0.0258, "step": 352200 }, { "epoch": 0.003523, "grad_norm": 0.2566852271556854, "learning_rate": 1e-05, "loss": 0.0253, "step": 352300 }, { "epoch": 0.003524, "grad_norm": 0.19111543893814087, "learning_rate": 1e-05, "loss": 0.0255, "step": 352400 }, { "epoch": 0.003525, "grad_norm": 0.191665381193161, "learning_rate": 1e-05, "loss": 0.0254, "step": 352500 }, { "epoch": 0.003526, "grad_norm": 0.2138933390378952, "learning_rate": 1e-05, "loss": 0.0252, "step": 352600 }, { "epoch": 0.003527, "grad_norm": 0.24859748780727386, "learning_rate": 1e-05, "loss": 0.0256, "step": 352700 }, { "epoch": 0.003528, "grad_norm": 0.18274061381816864, "learning_rate": 1e-05, "loss": 0.0251, "step": 352800 }, { "epoch": 0.003529, "grad_norm": 0.20358552038669586, "learning_rate": 1e-05, "loss": 0.0253, "step": 352900 }, { "epoch": 0.00353, "grad_norm": 0.21871551871299744, "learning_rate": 1e-05, "loss": 0.0259, "step": 353000 }, { "epoch": 0.003531, "grad_norm": 0.17301665246486664, "learning_rate": 1e-05, "loss": 0.025, "step": 353100 }, { "epoch": 0.003532, "grad_norm": 0.2632037103176117, "learning_rate": 1e-05, "loss": 0.0253, "step": 353200 }, { "epoch": 0.003533, "grad_norm": 0.24351949989795685, "learning_rate": 1e-05, "loss": 0.0254, "step": 353300 }, { "epoch": 0.003534, "grad_norm": 0.2732505798339844, "learning_rate": 1e-05, "loss": 0.0258, "step": 353400 }, { "epoch": 0.003535, "grad_norm": 0.16060833632946014, "learning_rate": 1e-05, "loss": 0.0253, "step": 353500 }, { "epoch": 0.003536, "grad_norm": 0.2200380563735962, "learning_rate": 1e-05, "loss": 0.0252, "step": 353600 }, { "epoch": 0.003537, "grad_norm": 0.13920234143733978, "learning_rate": 1e-05, "loss": 0.0253, "step": 353700 }, { "epoch": 0.003538, "grad_norm": 0.3616617023944855, "learning_rate": 1e-05, "loss": 0.0253, "step": 353800 }, { "epoch": 0.003539, "grad_norm": 0.2921803891658783, "learning_rate": 1e-05, "loss": 0.0258, "step": 353900 }, { "epoch": 0.00354, "grad_norm": 0.2523195445537567, "learning_rate": 1e-05, "loss": 0.0255, "step": 354000 }, { "epoch": 0.003541, "grad_norm": 0.21873869001865387, "learning_rate": 1e-05, "loss": 0.0251, "step": 354100 }, { "epoch": 0.003542, "grad_norm": 0.2708798050880432, "learning_rate": 1e-05, "loss": 0.0254, "step": 354200 }, { "epoch": 0.003543, "grad_norm": 0.27648934721946716, "learning_rate": 1e-05, "loss": 0.0254, "step": 354300 }, { "epoch": 0.003544, "grad_norm": 0.19729086756706238, "learning_rate": 1e-05, "loss": 0.0255, "step": 354400 }, { "epoch": 0.003545, "grad_norm": 0.21544203162193298, "learning_rate": 1e-05, "loss": 0.0253, "step": 354500 }, { "epoch": 0.003546, "grad_norm": 0.19345548748970032, "learning_rate": 1e-05, "loss": 0.0253, "step": 354600 }, { "epoch": 0.003547, "grad_norm": 0.24129854142665863, "learning_rate": 1e-05, "loss": 0.0251, "step": 354700 }, { "epoch": 0.003548, "grad_norm": 0.21000075340270996, "learning_rate": 1e-05, "loss": 0.0253, "step": 354800 }, { "epoch": 0.003549, "grad_norm": 0.2286650687456131, "learning_rate": 1e-05, "loss": 0.0255, "step": 354900 }, { "epoch": 0.00355, "grad_norm": 0.3658515214920044, "learning_rate": 1e-05, "loss": 0.0254, "step": 355000 }, { "epoch": 0.003551, "grad_norm": 0.2377588003873825, "learning_rate": 1e-05, "loss": 0.0257, "step": 355100 }, { "epoch": 0.003552, "grad_norm": 0.1848420351743698, "learning_rate": 1e-05, "loss": 0.0252, "step": 355200 }, { "epoch": 0.003553, "grad_norm": 0.2786361873149872, "learning_rate": 1e-05, "loss": 0.0253, "step": 355300 }, { "epoch": 0.003554, "grad_norm": 0.2290109246969223, "learning_rate": 1e-05, "loss": 0.0245, "step": 355400 }, { "epoch": 0.003555, "grad_norm": 0.1861780434846878, "learning_rate": 1e-05, "loss": 0.0253, "step": 355500 }, { "epoch": 0.003556, "grad_norm": 0.23654060065746307, "learning_rate": 1e-05, "loss": 0.0248, "step": 355600 }, { "epoch": 0.003557, "grad_norm": 0.23819495737552643, "learning_rate": 1e-05, "loss": 0.0257, "step": 355700 }, { "epoch": 0.003558, "grad_norm": 0.1833958923816681, "learning_rate": 1e-05, "loss": 0.0248, "step": 355800 }, { "epoch": 0.003559, "grad_norm": 0.19535568356513977, "learning_rate": 1e-05, "loss": 0.0249, "step": 355900 }, { "epoch": 0.00356, "grad_norm": 0.20449545979499817, "learning_rate": 1e-05, "loss": 0.025, "step": 356000 }, { "epoch": 0.003561, "grad_norm": 0.18818727135658264, "learning_rate": 1e-05, "loss": 0.0257, "step": 356100 }, { "epoch": 0.003562, "grad_norm": 0.1963980495929718, "learning_rate": 1e-05, "loss": 0.0254, "step": 356200 }, { "epoch": 0.003563, "grad_norm": 0.31191179156303406, "learning_rate": 1e-05, "loss": 0.0253, "step": 356300 }, { "epoch": 0.003564, "grad_norm": 0.2002289593219757, "learning_rate": 1e-05, "loss": 0.0252, "step": 356400 }, { "epoch": 0.003565, "grad_norm": 0.3184802830219269, "learning_rate": 1e-05, "loss": 0.0249, "step": 356500 }, { "epoch": 0.003566, "grad_norm": 0.2301105260848999, "learning_rate": 1e-05, "loss": 0.0246, "step": 356600 }, { "epoch": 0.003567, "grad_norm": 0.1690077930688858, "learning_rate": 1e-05, "loss": 0.0257, "step": 356700 }, { "epoch": 0.003568, "grad_norm": 0.22516180574893951, "learning_rate": 1e-05, "loss": 0.0252, "step": 356800 }, { "epoch": 0.003569, "grad_norm": 0.19704365730285645, "learning_rate": 1e-05, "loss": 0.0251, "step": 356900 }, { "epoch": 0.00357, "grad_norm": 0.3379906415939331, "learning_rate": 1e-05, "loss": 0.0254, "step": 357000 }, { "epoch": 0.003571, "grad_norm": 0.22231370210647583, "learning_rate": 1e-05, "loss": 0.0254, "step": 357100 }, { "epoch": 0.003572, "grad_norm": 0.1662837713956833, "learning_rate": 1e-05, "loss": 0.0251, "step": 357200 }, { "epoch": 0.003573, "grad_norm": 0.2702544629573822, "learning_rate": 1e-05, "loss": 0.0248, "step": 357300 }, { "epoch": 0.003574, "grad_norm": 0.24233821034431458, "learning_rate": 1e-05, "loss": 0.0257, "step": 357400 }, { "epoch": 0.003575, "grad_norm": 0.2584858536720276, "learning_rate": 1e-05, "loss": 0.0244, "step": 357500 }, { "epoch": 0.003576, "grad_norm": 0.28853365778923035, "learning_rate": 1e-05, "loss": 0.0248, "step": 357600 }, { "epoch": 0.003577, "grad_norm": 0.15685397386550903, "learning_rate": 1e-05, "loss": 0.0249, "step": 357700 }, { "epoch": 0.003578, "grad_norm": 0.18045689165592194, "learning_rate": 1e-05, "loss": 0.0247, "step": 357800 }, { "epoch": 0.003579, "grad_norm": 0.2153179794549942, "learning_rate": 1e-05, "loss": 0.0246, "step": 357900 }, { "epoch": 0.00358, "grad_norm": 0.22929222881793976, "learning_rate": 1e-05, "loss": 0.0248, "step": 358000 }, { "epoch": 0.003581, "grad_norm": 0.1616838574409485, "learning_rate": 1e-05, "loss": 0.0253, "step": 358100 }, { "epoch": 0.003582, "grad_norm": 0.2460157722234726, "learning_rate": 1e-05, "loss": 0.025, "step": 358200 }, { "epoch": 0.003583, "grad_norm": 0.19205032289028168, "learning_rate": 1e-05, "loss": 0.0256, "step": 358300 }, { "epoch": 0.003584, "grad_norm": 0.2592017948627472, "learning_rate": 1e-05, "loss": 0.025, "step": 358400 }, { "epoch": 0.003585, "grad_norm": 0.22000306844711304, "learning_rate": 1e-05, "loss": 0.0247, "step": 358500 }, { "epoch": 0.003586, "grad_norm": 0.24931752681732178, "learning_rate": 1e-05, "loss": 0.0256, "step": 358600 }, { "epoch": 0.003587, "grad_norm": 0.239908367395401, "learning_rate": 1e-05, "loss": 0.0253, "step": 358700 }, { "epoch": 0.003588, "grad_norm": 0.22263160347938538, "learning_rate": 1e-05, "loss": 0.025, "step": 358800 }, { "epoch": 0.003589, "grad_norm": 0.17370916903018951, "learning_rate": 1e-05, "loss": 0.0248, "step": 358900 }, { "epoch": 0.00359, "grad_norm": 0.19698089361190796, "learning_rate": 1e-05, "loss": 0.0252, "step": 359000 }, { "epoch": 0.003591, "grad_norm": 0.19412487745285034, "learning_rate": 1e-05, "loss": 0.0251, "step": 359100 }, { "epoch": 0.003592, "grad_norm": 0.24610573053359985, "learning_rate": 1e-05, "loss": 0.0252, "step": 359200 }, { "epoch": 0.003593, "grad_norm": 0.1572159081697464, "learning_rate": 1e-05, "loss": 0.0253, "step": 359300 }, { "epoch": 0.003594, "grad_norm": 0.2635226547718048, "learning_rate": 1e-05, "loss": 0.0253, "step": 359400 }, { "epoch": 0.003595, "grad_norm": 0.2390194684267044, "learning_rate": 1e-05, "loss": 0.0256, "step": 359500 }, { "epoch": 0.003596, "grad_norm": 0.1880943775177002, "learning_rate": 1e-05, "loss": 0.0252, "step": 359600 }, { "epoch": 0.003597, "grad_norm": 0.17608967423439026, "learning_rate": 1e-05, "loss": 0.0249, "step": 359700 }, { "epoch": 0.003598, "grad_norm": 0.3419440686702728, "learning_rate": 1e-05, "loss": 0.0251, "step": 359800 }, { "epoch": 0.003599, "grad_norm": 0.21480712294578552, "learning_rate": 1e-05, "loss": 0.0249, "step": 359900 }, { "epoch": 0.0036, "grad_norm": 0.20239529013633728, "learning_rate": 1e-05, "loss": 0.0246, "step": 360000 }, { "epoch": 0.0036, "eval_loss": 0.023067966103553772, "eval_runtime": 180.1965, "eval_samples_per_second": 277.475, "eval_steps_per_second": 17.342, "step": 360000 }, { "epoch": 0.003601, "grad_norm": 0.20302771031856537, "learning_rate": 1e-05, "loss": 0.0254, "step": 360100 }, { "epoch": 0.003602, "grad_norm": 0.24560332298278809, "learning_rate": 1e-05, "loss": 0.0254, "step": 360200 }, { "epoch": 0.003603, "grad_norm": 0.2584492862224579, "learning_rate": 1e-05, "loss": 0.0249, "step": 360300 }, { "epoch": 0.003604, "grad_norm": 0.2886830270290375, "learning_rate": 1e-05, "loss": 0.0252, "step": 360400 }, { "epoch": 0.003605, "grad_norm": 0.1543215811252594, "learning_rate": 1e-05, "loss": 0.0256, "step": 360500 }, { "epoch": 0.003606, "grad_norm": 0.1785227656364441, "learning_rate": 1e-05, "loss": 0.0248, "step": 360600 }, { "epoch": 0.003607, "grad_norm": 0.2761559784412384, "learning_rate": 1e-05, "loss": 0.0259, "step": 360700 }, { "epoch": 0.003608, "grad_norm": 0.262396901845932, "learning_rate": 1e-05, "loss": 0.0249, "step": 360800 }, { "epoch": 0.003609, "grad_norm": 0.1713998019695282, "learning_rate": 1e-05, "loss": 0.0246, "step": 360900 }, { "epoch": 0.00361, "grad_norm": 0.26273655891418457, "learning_rate": 1e-05, "loss": 0.0246, "step": 361000 }, { "epoch": 0.003611, "grad_norm": 0.19543206691741943, "learning_rate": 1e-05, "loss": 0.0248, "step": 361100 }, { "epoch": 0.003612, "grad_norm": 0.2542403042316437, "learning_rate": 1e-05, "loss": 0.0254, "step": 361200 }, { "epoch": 0.003613, "grad_norm": 0.17178753018379211, "learning_rate": 1e-05, "loss": 0.0252, "step": 361300 }, { "epoch": 0.003614, "grad_norm": 0.23939181864261627, "learning_rate": 1e-05, "loss": 0.0249, "step": 361400 }, { "epoch": 0.003615, "grad_norm": 0.25077226758003235, "learning_rate": 1e-05, "loss": 0.0248, "step": 361500 }, { "epoch": 0.003616, "grad_norm": 0.23185382783412933, "learning_rate": 1e-05, "loss": 0.0257, "step": 361600 }, { "epoch": 0.003617, "grad_norm": 0.22628186643123627, "learning_rate": 1e-05, "loss": 0.0252, "step": 361700 }, { "epoch": 0.003618, "grad_norm": 0.20310889184474945, "learning_rate": 1e-05, "loss": 0.0253, "step": 361800 }, { "epoch": 0.003619, "grad_norm": 0.16925017535686493, "learning_rate": 1e-05, "loss": 0.0249, "step": 361900 }, { "epoch": 0.00362, "grad_norm": 0.25221946835517883, "learning_rate": 1e-05, "loss": 0.0245, "step": 362000 }, { "epoch": 0.003621, "grad_norm": 0.2255512923002243, "learning_rate": 1e-05, "loss": 0.0253, "step": 362100 }, { "epoch": 0.003622, "grad_norm": 0.256612628698349, "learning_rate": 1e-05, "loss": 0.0246, "step": 362200 }, { "epoch": 0.003623, "grad_norm": 0.18089304864406586, "learning_rate": 1e-05, "loss": 0.0251, "step": 362300 }, { "epoch": 0.003624, "grad_norm": 0.19653820991516113, "learning_rate": 1e-05, "loss": 0.025, "step": 362400 }, { "epoch": 0.003625, "grad_norm": 0.2273004800081253, "learning_rate": 1e-05, "loss": 0.0251, "step": 362500 }, { "epoch": 0.003626, "grad_norm": 0.1996285617351532, "learning_rate": 1e-05, "loss": 0.0253, "step": 362600 }, { "epoch": 0.003627, "grad_norm": 0.17695675790309906, "learning_rate": 1e-05, "loss": 0.0249, "step": 362700 }, { "epoch": 0.003628, "grad_norm": 0.2135741412639618, "learning_rate": 1e-05, "loss": 0.0254, "step": 362800 }, { "epoch": 0.003629, "grad_norm": 0.24207369983196259, "learning_rate": 1e-05, "loss": 0.0249, "step": 362900 }, { "epoch": 0.00363, "grad_norm": 0.163295179605484, "learning_rate": 1e-05, "loss": 0.0252, "step": 363000 }, { "epoch": 0.003631, "grad_norm": 0.20321440696716309, "learning_rate": 1e-05, "loss": 0.0249, "step": 363100 }, { "epoch": 0.003632, "grad_norm": 0.18990954756736755, "learning_rate": 1e-05, "loss": 0.0253, "step": 363200 }, { "epoch": 0.003633, "grad_norm": 0.24028554558753967, "learning_rate": 1e-05, "loss": 0.025, "step": 363300 }, { "epoch": 0.003634, "grad_norm": 0.19837458431720734, "learning_rate": 1e-05, "loss": 0.0257, "step": 363400 }, { "epoch": 0.003635, "grad_norm": 0.19971373677253723, "learning_rate": 1e-05, "loss": 0.0244, "step": 363500 }, { "epoch": 0.003636, "grad_norm": 0.22919034957885742, "learning_rate": 1e-05, "loss": 0.0247, "step": 363600 }, { "epoch": 0.003637, "grad_norm": 0.1647142618894577, "learning_rate": 1e-05, "loss": 0.0261, "step": 363700 }, { "epoch": 0.003638, "grad_norm": 0.23795802891254425, "learning_rate": 1e-05, "loss": 0.0255, "step": 363800 }, { "epoch": 0.003639, "grad_norm": 0.2529671788215637, "learning_rate": 1e-05, "loss": 0.0248, "step": 363900 }, { "epoch": 0.00364, "grad_norm": 0.17827951908111572, "learning_rate": 1e-05, "loss": 0.025, "step": 364000 }, { "epoch": 0.003641, "grad_norm": 0.2079813927412033, "learning_rate": 1e-05, "loss": 0.0247, "step": 364100 }, { "epoch": 0.003642, "grad_norm": 0.17558550834655762, "learning_rate": 1e-05, "loss": 0.0249, "step": 364200 }, { "epoch": 0.003643, "grad_norm": 0.19689811766147614, "learning_rate": 1e-05, "loss": 0.0256, "step": 364300 }, { "epoch": 0.003644, "grad_norm": 0.20036867260932922, "learning_rate": 1e-05, "loss": 0.0243, "step": 364400 }, { "epoch": 0.003645, "grad_norm": 0.21646912395954132, "learning_rate": 1e-05, "loss": 0.0253, "step": 364500 }, { "epoch": 0.003646, "grad_norm": 0.20635046064853668, "learning_rate": 1e-05, "loss": 0.0252, "step": 364600 }, { "epoch": 0.003647, "grad_norm": 0.19094569981098175, "learning_rate": 1e-05, "loss": 0.0244, "step": 364700 }, { "epoch": 0.003648, "grad_norm": 0.22726359963417053, "learning_rate": 1e-05, "loss": 0.0246, "step": 364800 }, { "epoch": 0.003649, "grad_norm": 0.16395846009254456, "learning_rate": 1e-05, "loss": 0.0247, "step": 364900 }, { "epoch": 0.00365, "grad_norm": 0.20203112065792084, "learning_rate": 1e-05, "loss": 0.025, "step": 365000 }, { "epoch": 0.003651, "grad_norm": 0.1840880960226059, "learning_rate": 1e-05, "loss": 0.0251, "step": 365100 }, { "epoch": 0.003652, "grad_norm": 0.1993907243013382, "learning_rate": 1e-05, "loss": 0.025, "step": 365200 }, { "epoch": 0.003653, "grad_norm": 0.16416646540164948, "learning_rate": 1e-05, "loss": 0.0253, "step": 365300 }, { "epoch": 0.003654, "grad_norm": 0.19940659403800964, "learning_rate": 1e-05, "loss": 0.0252, "step": 365400 }, { "epoch": 0.003655, "grad_norm": 0.19726437330245972, "learning_rate": 1e-05, "loss": 0.0246, "step": 365500 }, { "epoch": 0.003656, "grad_norm": 0.23533031344413757, "learning_rate": 1e-05, "loss": 0.0245, "step": 365600 }, { "epoch": 0.003657, "grad_norm": 0.1882912665605545, "learning_rate": 1e-05, "loss": 0.0255, "step": 365700 }, { "epoch": 0.003658, "grad_norm": 0.20943276584148407, "learning_rate": 1e-05, "loss": 0.0253, "step": 365800 }, { "epoch": 0.003659, "grad_norm": 0.17838990688323975, "learning_rate": 1e-05, "loss": 0.0248, "step": 365900 }, { "epoch": 0.00366, "grad_norm": 0.21286004781723022, "learning_rate": 1e-05, "loss": 0.0248, "step": 366000 }, { "epoch": 0.003661, "grad_norm": 0.32256683707237244, "learning_rate": 1e-05, "loss": 0.025, "step": 366100 }, { "epoch": 0.003662, "grad_norm": 0.26632070541381836, "learning_rate": 1e-05, "loss": 0.0251, "step": 366200 }, { "epoch": 0.003663, "grad_norm": 0.22817842662334442, "learning_rate": 1e-05, "loss": 0.0248, "step": 366300 }, { "epoch": 0.003664, "grad_norm": 0.20168069005012512, "learning_rate": 1e-05, "loss": 0.0246, "step": 366400 }, { "epoch": 0.003665, "grad_norm": 0.20437271893024445, "learning_rate": 1e-05, "loss": 0.024, "step": 366500 }, { "epoch": 0.003666, "grad_norm": 0.274840772151947, "learning_rate": 1e-05, "loss": 0.0251, "step": 366600 }, { "epoch": 0.003667, "grad_norm": 0.2555960416793823, "learning_rate": 1e-05, "loss": 0.0248, "step": 366700 }, { "epoch": 0.003668, "grad_norm": 0.26440495252609253, "learning_rate": 1e-05, "loss": 0.0252, "step": 366800 }, { "epoch": 0.003669, "grad_norm": 0.22074559330940247, "learning_rate": 1e-05, "loss": 0.0251, "step": 366900 }, { "epoch": 0.00367, "grad_norm": 0.20398014783859253, "learning_rate": 1e-05, "loss": 0.0253, "step": 367000 }, { "epoch": 0.003671, "grad_norm": 0.30584216117858887, "learning_rate": 1e-05, "loss": 0.0246, "step": 367100 }, { "epoch": 0.003672, "grad_norm": 0.18554237484931946, "learning_rate": 1e-05, "loss": 0.0251, "step": 367200 }, { "epoch": 0.003673, "grad_norm": 0.198908269405365, "learning_rate": 1e-05, "loss": 0.0253, "step": 367300 }, { "epoch": 0.003674, "grad_norm": 0.4103855490684509, "learning_rate": 1e-05, "loss": 0.0248, "step": 367400 }, { "epoch": 0.003675, "grad_norm": 0.23869557678699493, "learning_rate": 1e-05, "loss": 0.025, "step": 367500 }, { "epoch": 0.003676, "grad_norm": 0.19531629979610443, "learning_rate": 1e-05, "loss": 0.025, "step": 367600 }, { "epoch": 0.003677, "grad_norm": 0.21761277318000793, "learning_rate": 1e-05, "loss": 0.0251, "step": 367700 }, { "epoch": 0.003678, "grad_norm": 0.2739604115486145, "learning_rate": 1e-05, "loss": 0.025, "step": 367800 }, { "epoch": 0.003679, "grad_norm": 0.19281220436096191, "learning_rate": 1e-05, "loss": 0.0245, "step": 367900 }, { "epoch": 0.00368, "grad_norm": 0.17426267266273499, "learning_rate": 1e-05, "loss": 0.0251, "step": 368000 }, { "epoch": 0.003681, "grad_norm": 0.2839365303516388, "learning_rate": 1e-05, "loss": 0.0252, "step": 368100 }, { "epoch": 0.003682, "grad_norm": 0.24606680870056152, "learning_rate": 1e-05, "loss": 0.0246, "step": 368200 }, { "epoch": 0.003683, "grad_norm": 0.20925748348236084, "learning_rate": 1e-05, "loss": 0.0254, "step": 368300 }, { "epoch": 0.003684, "grad_norm": 0.22718989849090576, "learning_rate": 1e-05, "loss": 0.0251, "step": 368400 }, { "epoch": 0.003685, "grad_norm": 0.19015607237815857, "learning_rate": 1e-05, "loss": 0.0248, "step": 368500 }, { "epoch": 0.003686, "grad_norm": 0.2183292657136917, "learning_rate": 1e-05, "loss": 0.0255, "step": 368600 }, { "epoch": 0.003687, "grad_norm": 0.38502976298332214, "learning_rate": 1e-05, "loss": 0.0244, "step": 368700 }, { "epoch": 0.003688, "grad_norm": 0.17979417741298676, "learning_rate": 1e-05, "loss": 0.0253, "step": 368800 }, { "epoch": 0.003689, "grad_norm": 0.40981394052505493, "learning_rate": 1e-05, "loss": 0.0253, "step": 368900 }, { "epoch": 0.00369, "grad_norm": 0.26270484924316406, "learning_rate": 1e-05, "loss": 0.0249, "step": 369000 }, { "epoch": 0.003691, "grad_norm": 0.2628958821296692, "learning_rate": 1e-05, "loss": 0.0251, "step": 369100 }, { "epoch": 0.003692, "grad_norm": 0.21812467277050018, "learning_rate": 1e-05, "loss": 0.0247, "step": 369200 }, { "epoch": 0.003693, "grad_norm": 0.2034122347831726, "learning_rate": 1e-05, "loss": 0.0249, "step": 369300 }, { "epoch": 0.003694, "grad_norm": 0.30062997341156006, "learning_rate": 1e-05, "loss": 0.0246, "step": 369400 }, { "epoch": 0.003695, "grad_norm": 0.17720966041088104, "learning_rate": 1e-05, "loss": 0.025, "step": 369500 }, { "epoch": 0.003696, "grad_norm": 0.1701696664094925, "learning_rate": 1e-05, "loss": 0.0245, "step": 369600 }, { "epoch": 0.003697, "grad_norm": 0.23631073534488678, "learning_rate": 1e-05, "loss": 0.0249, "step": 369700 }, { "epoch": 0.003698, "grad_norm": 0.20604847371578217, "learning_rate": 1e-05, "loss": 0.0244, "step": 369800 }, { "epoch": 0.003699, "grad_norm": 0.2382778376340866, "learning_rate": 1e-05, "loss": 0.0246, "step": 369900 }, { "epoch": 0.0037, "grad_norm": 0.2551063299179077, "learning_rate": 1e-05, "loss": 0.0255, "step": 370000 }, { "epoch": 0.003701, "grad_norm": 0.21359778940677643, "learning_rate": 1e-05, "loss": 0.025, "step": 370100 }, { "epoch": 0.003702, "grad_norm": 0.23730020225048065, "learning_rate": 1e-05, "loss": 0.0249, "step": 370200 }, { "epoch": 0.003703, "grad_norm": 0.1928272247314453, "learning_rate": 1e-05, "loss": 0.025, "step": 370300 }, { "epoch": 0.003704, "grad_norm": 0.19780988991260529, "learning_rate": 1e-05, "loss": 0.0253, "step": 370400 }, { "epoch": 0.003705, "grad_norm": 0.17017987370491028, "learning_rate": 1e-05, "loss": 0.0247, "step": 370500 }, { "epoch": 0.003706, "grad_norm": 0.23881371319293976, "learning_rate": 1e-05, "loss": 0.0251, "step": 370600 }, { "epoch": 0.003707, "grad_norm": 0.173597052693367, "learning_rate": 1e-05, "loss": 0.0249, "step": 370700 }, { "epoch": 0.003708, "grad_norm": 0.27132511138916016, "learning_rate": 1e-05, "loss": 0.0248, "step": 370800 }, { "epoch": 0.003709, "grad_norm": 0.2218388170003891, "learning_rate": 1e-05, "loss": 0.0255, "step": 370900 }, { "epoch": 0.00371, "grad_norm": 0.1846773773431778, "learning_rate": 1e-05, "loss": 0.0253, "step": 371000 }, { "epoch": 0.003711, "grad_norm": 0.19518236815929413, "learning_rate": 1e-05, "loss": 0.0251, "step": 371100 }, { "epoch": 0.003712, "grad_norm": 0.23849821090698242, "learning_rate": 1e-05, "loss": 0.0247, "step": 371200 }, { "epoch": 0.003713, "grad_norm": 0.17952241003513336, "learning_rate": 1e-05, "loss": 0.0245, "step": 371300 }, { "epoch": 0.003714, "grad_norm": 0.24987390637397766, "learning_rate": 1e-05, "loss": 0.0249, "step": 371400 }, { "epoch": 0.003715, "grad_norm": 0.19220592081546783, "learning_rate": 1e-05, "loss": 0.0246, "step": 371500 }, { "epoch": 0.003716, "grad_norm": 0.23864246904850006, "learning_rate": 1e-05, "loss": 0.0252, "step": 371600 }, { "epoch": 0.003717, "grad_norm": 0.22066572308540344, "learning_rate": 1e-05, "loss": 0.0252, "step": 371700 }, { "epoch": 0.003718, "grad_norm": 0.21779391169548035, "learning_rate": 1e-05, "loss": 0.0244, "step": 371800 }, { "epoch": 0.003719, "grad_norm": 0.24789579212665558, "learning_rate": 1e-05, "loss": 0.0253, "step": 371900 }, { "epoch": 0.00372, "grad_norm": 0.26520082354545593, "learning_rate": 1e-05, "loss": 0.0246, "step": 372000 }, { "epoch": 0.003721, "grad_norm": 0.23471081256866455, "learning_rate": 1e-05, "loss": 0.025, "step": 372100 }, { "epoch": 0.003722, "grad_norm": 0.2799331247806549, "learning_rate": 1e-05, "loss": 0.0247, "step": 372200 }, { "epoch": 0.003723, "grad_norm": 0.1931152641773224, "learning_rate": 1e-05, "loss": 0.0245, "step": 372300 }, { "epoch": 0.003724, "grad_norm": 0.2081960290670395, "learning_rate": 1e-05, "loss": 0.0245, "step": 372400 }, { "epoch": 0.003725, "grad_norm": 0.26411405205726624, "learning_rate": 1e-05, "loss": 0.0246, "step": 372500 }, { "epoch": 0.003726, "grad_norm": 0.19111527502536774, "learning_rate": 1e-05, "loss": 0.0253, "step": 372600 }, { "epoch": 0.003727, "grad_norm": 0.1998601257801056, "learning_rate": 1e-05, "loss": 0.0256, "step": 372700 }, { "epoch": 0.003728, "grad_norm": 0.26529842615127563, "learning_rate": 1e-05, "loss": 0.025, "step": 372800 }, { "epoch": 0.003729, "grad_norm": 0.224734365940094, "learning_rate": 1e-05, "loss": 0.0247, "step": 372900 }, { "epoch": 0.00373, "grad_norm": 0.32914549112319946, "learning_rate": 1e-05, "loss": 0.0248, "step": 373000 }, { "epoch": 0.003731, "grad_norm": 0.2233278751373291, "learning_rate": 1e-05, "loss": 0.0248, "step": 373100 }, { "epoch": 0.003732, "grad_norm": 0.1759822815656662, "learning_rate": 1e-05, "loss": 0.0246, "step": 373200 }, { "epoch": 0.003733, "grad_norm": 0.19412517547607422, "learning_rate": 1e-05, "loss": 0.025, "step": 373300 }, { "epoch": 0.003734, "grad_norm": 0.19160285592079163, "learning_rate": 1e-05, "loss": 0.0242, "step": 373400 }, { "epoch": 0.003735, "grad_norm": 0.20060423016548157, "learning_rate": 1e-05, "loss": 0.0244, "step": 373500 }, { "epoch": 0.003736, "grad_norm": 0.19575592875480652, "learning_rate": 1e-05, "loss": 0.0245, "step": 373600 }, { "epoch": 0.003737, "grad_norm": 0.2239302396774292, "learning_rate": 1e-05, "loss": 0.0246, "step": 373700 }, { "epoch": 0.003738, "grad_norm": 0.19177569448947906, "learning_rate": 1e-05, "loss": 0.0253, "step": 373800 }, { "epoch": 0.003739, "grad_norm": 0.26498982310295105, "learning_rate": 1e-05, "loss": 0.0252, "step": 373900 }, { "epoch": 0.00374, "grad_norm": 0.17887812852859497, "learning_rate": 1e-05, "loss": 0.0246, "step": 374000 }, { "epoch": 0.003741, "grad_norm": 0.22321709990501404, "learning_rate": 1e-05, "loss": 0.0256, "step": 374100 }, { "epoch": 0.003742, "grad_norm": 0.17493513226509094, "learning_rate": 1e-05, "loss": 0.0244, "step": 374200 }, { "epoch": 0.003743, "grad_norm": 0.2125990092754364, "learning_rate": 1e-05, "loss": 0.0246, "step": 374300 }, { "epoch": 0.003744, "grad_norm": 0.22383056581020355, "learning_rate": 1e-05, "loss": 0.0245, "step": 374400 }, { "epoch": 0.003745, "grad_norm": 0.19103258848190308, "learning_rate": 1e-05, "loss": 0.0249, "step": 374500 }, { "epoch": 0.003746, "grad_norm": 0.24592848122119904, "learning_rate": 1e-05, "loss": 0.0245, "step": 374600 }, { "epoch": 0.003747, "grad_norm": 0.21821942925453186, "learning_rate": 1e-05, "loss": 0.025, "step": 374700 }, { "epoch": 0.003748, "grad_norm": 0.21425941586494446, "learning_rate": 1e-05, "loss": 0.0247, "step": 374800 }, { "epoch": 0.003749, "grad_norm": 0.21215233206748962, "learning_rate": 1e-05, "loss": 0.0246, "step": 374900 }, { "epoch": 0.00375, "grad_norm": 0.23215535283088684, "learning_rate": 1e-05, "loss": 0.0249, "step": 375000 }, { "epoch": 0.003751, "grad_norm": 0.23559755086898804, "learning_rate": 1e-05, "loss": 0.0247, "step": 375100 }, { "epoch": 0.003752, "grad_norm": 0.18183013796806335, "learning_rate": 1e-05, "loss": 0.0248, "step": 375200 }, { "epoch": 0.003753, "grad_norm": 0.21647806465625763, "learning_rate": 1e-05, "loss": 0.0249, "step": 375300 }, { "epoch": 0.003754, "grad_norm": 0.1759694367647171, "learning_rate": 1e-05, "loss": 0.0251, "step": 375400 }, { "epoch": 0.003755, "grad_norm": 0.24636122584342957, "learning_rate": 1e-05, "loss": 0.024, "step": 375500 }, { "epoch": 0.003756, "grad_norm": 0.2067081332206726, "learning_rate": 1e-05, "loss": 0.0249, "step": 375600 }, { "epoch": 0.003757, "grad_norm": 0.2415318489074707, "learning_rate": 1e-05, "loss": 0.0243, "step": 375700 }, { "epoch": 0.003758, "grad_norm": 0.19353999197483063, "learning_rate": 1e-05, "loss": 0.0249, "step": 375800 }, { "epoch": 0.003759, "grad_norm": 0.2648521065711975, "learning_rate": 1e-05, "loss": 0.0241, "step": 375900 }, { "epoch": 0.00376, "grad_norm": 0.24222242832183838, "learning_rate": 1e-05, "loss": 0.0249, "step": 376000 }, { "epoch": 0.003761, "grad_norm": 0.29870426654815674, "learning_rate": 1e-05, "loss": 0.0245, "step": 376100 }, { "epoch": 0.003762, "grad_norm": 0.16651245951652527, "learning_rate": 1e-05, "loss": 0.0249, "step": 376200 }, { "epoch": 0.003763, "grad_norm": 0.1661238968372345, "learning_rate": 1e-05, "loss": 0.0247, "step": 376300 }, { "epoch": 0.003764, "grad_norm": 0.2628041207790375, "learning_rate": 1e-05, "loss": 0.0249, "step": 376400 }, { "epoch": 0.003765, "grad_norm": 0.2032880187034607, "learning_rate": 1e-05, "loss": 0.0244, "step": 376500 }, { "epoch": 0.003766, "grad_norm": 0.17853261530399323, "learning_rate": 1e-05, "loss": 0.0252, "step": 376600 }, { "epoch": 0.003767, "grad_norm": 0.2467428296804428, "learning_rate": 1e-05, "loss": 0.0245, "step": 376700 }, { "epoch": 0.003768, "grad_norm": 0.34390559792518616, "learning_rate": 1e-05, "loss": 0.0244, "step": 376800 }, { "epoch": 0.003769, "grad_norm": 0.2217007875442505, "learning_rate": 1e-05, "loss": 0.0248, "step": 376900 }, { "epoch": 0.00377, "grad_norm": 0.21123437583446503, "learning_rate": 1e-05, "loss": 0.0249, "step": 377000 }, { "epoch": 0.003771, "grad_norm": 0.18240004777908325, "learning_rate": 1e-05, "loss": 0.0249, "step": 377100 }, { "epoch": 0.003772, "grad_norm": 0.2816891372203827, "learning_rate": 1e-05, "loss": 0.0244, "step": 377200 }, { "epoch": 0.003773, "grad_norm": 0.2240234762430191, "learning_rate": 1e-05, "loss": 0.0247, "step": 377300 }, { "epoch": 0.003774, "grad_norm": 0.24389906227588654, "learning_rate": 1e-05, "loss": 0.0248, "step": 377400 }, { "epoch": 0.003775, "grad_norm": 0.20726659893989563, "learning_rate": 1e-05, "loss": 0.0243, "step": 377500 }, { "epoch": 0.003776, "grad_norm": 0.22083258628845215, "learning_rate": 1e-05, "loss": 0.0247, "step": 377600 }, { "epoch": 0.003777, "grad_norm": 0.3208354711532593, "learning_rate": 1e-05, "loss": 0.0246, "step": 377700 }, { "epoch": 0.003778, "grad_norm": 0.2520400583744049, "learning_rate": 1e-05, "loss": 0.0245, "step": 377800 }, { "epoch": 0.003779, "grad_norm": 0.1842058151960373, "learning_rate": 1e-05, "loss": 0.0244, "step": 377900 }, { "epoch": 0.00378, "grad_norm": 0.24821709096431732, "learning_rate": 1e-05, "loss": 0.0248, "step": 378000 }, { "epoch": 0.003781, "grad_norm": 0.18983368575572968, "learning_rate": 1e-05, "loss": 0.0246, "step": 378100 }, { "epoch": 0.003782, "grad_norm": 0.2608160078525543, "learning_rate": 1e-05, "loss": 0.0244, "step": 378200 }, { "epoch": 0.003783, "grad_norm": 0.18209455907344818, "learning_rate": 1e-05, "loss": 0.0246, "step": 378300 }, { "epoch": 0.003784, "grad_norm": 0.3501066267490387, "learning_rate": 1e-05, "loss": 0.0245, "step": 378400 }, { "epoch": 0.003785, "grad_norm": 0.25017502903938293, "learning_rate": 1e-05, "loss": 0.0251, "step": 378500 }, { "epoch": 0.003786, "grad_norm": 0.15793612599372864, "learning_rate": 1e-05, "loss": 0.0243, "step": 378600 }, { "epoch": 0.003787, "grad_norm": 0.21350181102752686, "learning_rate": 1e-05, "loss": 0.0247, "step": 378700 }, { "epoch": 0.003788, "grad_norm": 0.2721436619758606, "learning_rate": 1e-05, "loss": 0.0248, "step": 378800 }, { "epoch": 0.003789, "grad_norm": 0.21430189907550812, "learning_rate": 1e-05, "loss": 0.0249, "step": 378900 }, { "epoch": 0.00379, "grad_norm": 0.1756872534751892, "learning_rate": 1e-05, "loss": 0.0247, "step": 379000 }, { "epoch": 0.003791, "grad_norm": 0.19413214921951294, "learning_rate": 1e-05, "loss": 0.0244, "step": 379100 }, { "epoch": 0.003792, "grad_norm": 0.3006983697414398, "learning_rate": 1e-05, "loss": 0.0247, "step": 379200 }, { "epoch": 0.003793, "grad_norm": 0.1765180379152298, "learning_rate": 1e-05, "loss": 0.0247, "step": 379300 }, { "epoch": 0.003794, "grad_norm": 0.23979610204696655, "learning_rate": 1e-05, "loss": 0.0246, "step": 379400 }, { "epoch": 0.003795, "grad_norm": 0.23277638852596283, "learning_rate": 1e-05, "loss": 0.0242, "step": 379500 }, { "epoch": 0.003796, "grad_norm": 0.3477649986743927, "learning_rate": 1e-05, "loss": 0.0245, "step": 379600 }, { "epoch": 0.003797, "grad_norm": 0.2395758479833603, "learning_rate": 1e-05, "loss": 0.0249, "step": 379700 }, { "epoch": 0.003798, "grad_norm": 0.22816447913646698, "learning_rate": 1e-05, "loss": 0.0243, "step": 379800 }, { "epoch": 0.003799, "grad_norm": 0.3722003400325775, "learning_rate": 1e-05, "loss": 0.0243, "step": 379900 }, { "epoch": 0.0038, "grad_norm": 0.2531076967716217, "learning_rate": 1e-05, "loss": 0.0247, "step": 380000 }, { "epoch": 0.0038, "eval_loss": 0.023014426231384277, "eval_runtime": 170.0219, "eval_samples_per_second": 294.08, "eval_steps_per_second": 18.38, "step": 380000 }, { "epoch": 0.003801, "grad_norm": 0.2355535477399826, "learning_rate": 1e-05, "loss": 0.0241, "step": 380100 }, { "epoch": 0.003802, "grad_norm": 0.24661356210708618, "learning_rate": 1e-05, "loss": 0.0247, "step": 380200 }, { "epoch": 0.003803, "grad_norm": 0.20010404288768768, "learning_rate": 1e-05, "loss": 0.0246, "step": 380300 }, { "epoch": 0.003804, "grad_norm": 0.24043266475200653, "learning_rate": 1e-05, "loss": 0.0241, "step": 380400 }, { "epoch": 0.003805, "grad_norm": 0.237777441740036, "learning_rate": 1e-05, "loss": 0.0249, "step": 380500 }, { "epoch": 0.003806, "grad_norm": 0.4071030914783478, "learning_rate": 1e-05, "loss": 0.0248, "step": 380600 }, { "epoch": 0.003807, "grad_norm": 0.19982925057411194, "learning_rate": 1e-05, "loss": 0.025, "step": 380700 }, { "epoch": 0.003808, "grad_norm": 0.21891255676746368, "learning_rate": 1e-05, "loss": 0.025, "step": 380800 }, { "epoch": 0.003809, "grad_norm": 0.19348196685314178, "learning_rate": 1e-05, "loss": 0.0245, "step": 380900 }, { "epoch": 0.00381, "grad_norm": 0.19093260169029236, "learning_rate": 1e-05, "loss": 0.0248, "step": 381000 }, { "epoch": 0.003811, "grad_norm": 0.19013600051403046, "learning_rate": 1e-05, "loss": 0.0247, "step": 381100 }, { "epoch": 0.003812, "grad_norm": 0.2168656885623932, "learning_rate": 1e-05, "loss": 0.0252, "step": 381200 }, { "epoch": 0.003813, "grad_norm": 0.2309120148420334, "learning_rate": 1e-05, "loss": 0.0248, "step": 381300 }, { "epoch": 0.003814, "grad_norm": 0.2185588926076889, "learning_rate": 1e-05, "loss": 0.0247, "step": 381400 }, { "epoch": 0.003815, "grad_norm": 0.20840859413146973, "learning_rate": 1e-05, "loss": 0.0244, "step": 381500 }, { "epoch": 0.003816, "grad_norm": 0.16162413358688354, "learning_rate": 1e-05, "loss": 0.0244, "step": 381600 }, { "epoch": 0.003817, "grad_norm": 0.20517896115779877, "learning_rate": 1e-05, "loss": 0.024, "step": 381700 }, { "epoch": 0.003818, "grad_norm": 0.28155753016471863, "learning_rate": 1e-05, "loss": 0.0244, "step": 381800 }, { "epoch": 0.003819, "grad_norm": 0.20702910423278809, "learning_rate": 1e-05, "loss": 0.0247, "step": 381900 }, { "epoch": 0.00382, "grad_norm": 0.2714799642562866, "learning_rate": 1e-05, "loss": 0.0252, "step": 382000 }, { "epoch": 0.003821, "grad_norm": 0.21756194531917572, "learning_rate": 1e-05, "loss": 0.0246, "step": 382100 }, { "epoch": 0.003822, "grad_norm": 0.20111438632011414, "learning_rate": 1e-05, "loss": 0.0246, "step": 382200 }, { "epoch": 0.003823, "grad_norm": 0.23010657727718353, "learning_rate": 1e-05, "loss": 0.0246, "step": 382300 }, { "epoch": 0.003824, "grad_norm": 0.24687328934669495, "learning_rate": 1e-05, "loss": 0.0247, "step": 382400 }, { "epoch": 0.003825, "grad_norm": 0.17295311391353607, "learning_rate": 1e-05, "loss": 0.0243, "step": 382500 }, { "epoch": 0.003826, "grad_norm": 0.24085505306720734, "learning_rate": 1e-05, "loss": 0.0244, "step": 382600 }, { "epoch": 0.003827, "grad_norm": 0.2054484486579895, "learning_rate": 1e-05, "loss": 0.0241, "step": 382700 }, { "epoch": 0.003828, "grad_norm": 0.3595985770225525, "learning_rate": 1e-05, "loss": 0.0249, "step": 382800 }, { "epoch": 0.003829, "grad_norm": 0.24556712806224823, "learning_rate": 1e-05, "loss": 0.0247, "step": 382900 }, { "epoch": 0.00383, "grad_norm": 0.21064861118793488, "learning_rate": 1e-05, "loss": 0.0245, "step": 383000 }, { "epoch": 0.003831, "grad_norm": 0.23798032104969025, "learning_rate": 1e-05, "loss": 0.0246, "step": 383100 }, { "epoch": 0.003832, "grad_norm": 0.2243366241455078, "learning_rate": 1e-05, "loss": 0.0243, "step": 383200 }, { "epoch": 0.003833, "grad_norm": 0.18297971785068512, "learning_rate": 1e-05, "loss": 0.0244, "step": 383300 }, { "epoch": 0.003834, "grad_norm": 0.2381155639886856, "learning_rate": 1e-05, "loss": 0.0246, "step": 383400 }, { "epoch": 0.003835, "grad_norm": 0.1784166544675827, "learning_rate": 1e-05, "loss": 0.0246, "step": 383500 }, { "epoch": 0.003836, "grad_norm": 0.2214227318763733, "learning_rate": 1e-05, "loss": 0.0245, "step": 383600 }, { "epoch": 0.003837, "grad_norm": 0.22347001731395721, "learning_rate": 1e-05, "loss": 0.0245, "step": 383700 }, { "epoch": 0.003838, "grad_norm": 0.1696915328502655, "learning_rate": 1e-05, "loss": 0.0246, "step": 383800 }, { "epoch": 0.003839, "grad_norm": 0.21490193903446198, "learning_rate": 1e-05, "loss": 0.024, "step": 383900 }, { "epoch": 0.00384, "grad_norm": 0.22424010932445526, "learning_rate": 1e-05, "loss": 0.0244, "step": 384000 }, { "epoch": 0.003841, "grad_norm": 0.23634181916713715, "learning_rate": 1e-05, "loss": 0.0253, "step": 384100 }, { "epoch": 0.003842, "grad_norm": 0.273552268743515, "learning_rate": 1e-05, "loss": 0.0242, "step": 384200 }, { "epoch": 0.003843, "grad_norm": 0.28121131658554077, "learning_rate": 1e-05, "loss": 0.0249, "step": 384300 }, { "epoch": 0.003844, "grad_norm": 0.15125665068626404, "learning_rate": 1e-05, "loss": 0.0246, "step": 384400 }, { "epoch": 0.003845, "grad_norm": 0.2055521309375763, "learning_rate": 1e-05, "loss": 0.0249, "step": 384500 }, { "epoch": 0.003846, "grad_norm": 0.3157076835632324, "learning_rate": 1e-05, "loss": 0.0246, "step": 384600 }, { "epoch": 0.003847, "grad_norm": 0.156753808259964, "learning_rate": 1e-05, "loss": 0.0243, "step": 384700 }, { "epoch": 0.003848, "grad_norm": 0.21320556104183197, "learning_rate": 1e-05, "loss": 0.0249, "step": 384800 }, { "epoch": 0.003849, "grad_norm": 0.17284367978572845, "learning_rate": 1e-05, "loss": 0.0252, "step": 384900 }, { "epoch": 0.00385, "grad_norm": 0.19024644792079926, "learning_rate": 1e-05, "loss": 0.0244, "step": 385000 }, { "epoch": 0.003851, "grad_norm": 0.20551486313343048, "learning_rate": 1e-05, "loss": 0.0241, "step": 385100 }, { "epoch": 0.003852, "grad_norm": 0.18045629560947418, "learning_rate": 1e-05, "loss": 0.0241, "step": 385200 }, { "epoch": 0.003853, "grad_norm": 0.281615674495697, "learning_rate": 1e-05, "loss": 0.0242, "step": 385300 }, { "epoch": 0.003854, "grad_norm": 0.19528232514858246, "learning_rate": 1e-05, "loss": 0.0244, "step": 385400 }, { "epoch": 0.003855, "grad_norm": 0.1785202920436859, "learning_rate": 1e-05, "loss": 0.0241, "step": 385500 }, { "epoch": 0.003856, "grad_norm": 0.2536911964416504, "learning_rate": 1e-05, "loss": 0.0242, "step": 385600 }, { "epoch": 0.003857, "grad_norm": 0.21435323357582092, "learning_rate": 1e-05, "loss": 0.0243, "step": 385700 }, { "epoch": 0.003858, "grad_norm": 0.1920408010482788, "learning_rate": 1e-05, "loss": 0.0243, "step": 385800 }, { "epoch": 0.003859, "grad_norm": 0.29913750290870667, "learning_rate": 1e-05, "loss": 0.0246, "step": 385900 }, { "epoch": 0.00386, "grad_norm": 0.22562415897846222, "learning_rate": 1e-05, "loss": 0.0244, "step": 386000 }, { "epoch": 0.003861, "grad_norm": 0.1923309713602066, "learning_rate": 1e-05, "loss": 0.0248, "step": 386100 }, { "epoch": 0.003862, "grad_norm": 0.22699762880802155, "learning_rate": 1e-05, "loss": 0.0248, "step": 386200 }, { "epoch": 0.003863, "grad_norm": 0.15286476910114288, "learning_rate": 1e-05, "loss": 0.024, "step": 386300 }, { "epoch": 0.003864, "grad_norm": 0.17886394262313843, "learning_rate": 1e-05, "loss": 0.0244, "step": 386400 }, { "epoch": 0.003865, "grad_norm": 0.2340906411409378, "learning_rate": 1e-05, "loss": 0.0245, "step": 386500 }, { "epoch": 0.003866, "grad_norm": 0.19572274386882782, "learning_rate": 1e-05, "loss": 0.0246, "step": 386600 }, { "epoch": 0.003867, "grad_norm": 0.186109259724617, "learning_rate": 1e-05, "loss": 0.0246, "step": 386700 }, { "epoch": 0.003868, "grad_norm": 0.2537018954753876, "learning_rate": 1e-05, "loss": 0.0245, "step": 386800 }, { "epoch": 0.003869, "grad_norm": 0.18626907467842102, "learning_rate": 1e-05, "loss": 0.0249, "step": 386900 }, { "epoch": 0.00387, "grad_norm": 0.16478435695171356, "learning_rate": 1e-05, "loss": 0.0246, "step": 387000 }, { "epoch": 0.003871, "grad_norm": 0.20890071988105774, "learning_rate": 1e-05, "loss": 0.0247, "step": 387100 }, { "epoch": 0.003872, "grad_norm": 0.1843041479587555, "learning_rate": 1e-05, "loss": 0.0246, "step": 387200 }, { "epoch": 0.003873, "grad_norm": 0.1669585108757019, "learning_rate": 1e-05, "loss": 0.024, "step": 387300 }, { "epoch": 0.003874, "grad_norm": 0.1874060183763504, "learning_rate": 1e-05, "loss": 0.0239, "step": 387400 }, { "epoch": 0.003875, "grad_norm": 0.23247629404067993, "learning_rate": 1e-05, "loss": 0.0238, "step": 387500 }, { "epoch": 0.003876, "grad_norm": 0.17936177551746368, "learning_rate": 1e-05, "loss": 0.0244, "step": 387600 }, { "epoch": 0.003877, "grad_norm": 0.21438287198543549, "learning_rate": 1e-05, "loss": 0.0241, "step": 387700 }, { "epoch": 0.003878, "grad_norm": 0.2156336009502411, "learning_rate": 1e-05, "loss": 0.0245, "step": 387800 }, { "epoch": 0.003879, "grad_norm": 0.20199619233608246, "learning_rate": 1e-05, "loss": 0.0244, "step": 387900 }, { "epoch": 0.00388, "grad_norm": 0.220320925116539, "learning_rate": 1e-05, "loss": 0.0242, "step": 388000 }, { "epoch": 0.003881, "grad_norm": 0.3209114372730255, "learning_rate": 1e-05, "loss": 0.025, "step": 388100 }, { "epoch": 0.003882, "grad_norm": 0.19384653866291046, "learning_rate": 1e-05, "loss": 0.0245, "step": 388200 }, { "epoch": 0.003883, "grad_norm": 0.1904648244380951, "learning_rate": 1e-05, "loss": 0.0245, "step": 388300 }, { "epoch": 0.003884, "grad_norm": 0.1954963058233261, "learning_rate": 1e-05, "loss": 0.0245, "step": 388400 }, { "epoch": 0.003885, "grad_norm": 0.17553012073040009, "learning_rate": 1e-05, "loss": 0.0245, "step": 388500 }, { "epoch": 0.003886, "grad_norm": 0.19883398711681366, "learning_rate": 1e-05, "loss": 0.0241, "step": 388600 }, { "epoch": 0.003887, "grad_norm": 0.2783505618572235, "learning_rate": 1e-05, "loss": 0.0245, "step": 388700 }, { "epoch": 0.003888, "grad_norm": 0.20686647295951843, "learning_rate": 1e-05, "loss": 0.0246, "step": 388800 }, { "epoch": 0.003889, "grad_norm": 0.21686680614948273, "learning_rate": 1e-05, "loss": 0.0243, "step": 388900 }, { "epoch": 0.00389, "grad_norm": 0.23873895406723022, "learning_rate": 1e-05, "loss": 0.0247, "step": 389000 }, { "epoch": 0.003891, "grad_norm": 0.26515287160873413, "learning_rate": 1e-05, "loss": 0.0244, "step": 389100 }, { "epoch": 0.003892, "grad_norm": 0.16223591566085815, "learning_rate": 1e-05, "loss": 0.0245, "step": 389200 }, { "epoch": 0.003893, "grad_norm": 0.31088268756866455, "learning_rate": 1e-05, "loss": 0.0241, "step": 389300 }, { "epoch": 0.003894, "grad_norm": 0.2626400589942932, "learning_rate": 1e-05, "loss": 0.0247, "step": 389400 }, { "epoch": 0.003895, "grad_norm": 0.18556109070777893, "learning_rate": 1e-05, "loss": 0.0242, "step": 389500 }, { "epoch": 0.003896, "grad_norm": 0.24392524361610413, "learning_rate": 1e-05, "loss": 0.0251, "step": 389600 }, { "epoch": 0.003897, "grad_norm": 0.1873825341463089, "learning_rate": 1e-05, "loss": 0.0244, "step": 389700 }, { "epoch": 0.003898, "grad_norm": 0.18333490192890167, "learning_rate": 1e-05, "loss": 0.0248, "step": 389800 }, { "epoch": 0.003899, "grad_norm": 0.18217316269874573, "learning_rate": 1e-05, "loss": 0.0242, "step": 389900 }, { "epoch": 0.0039, "grad_norm": 0.15103434026241302, "learning_rate": 1e-05, "loss": 0.0248, "step": 390000 }, { "epoch": 0.003901, "grad_norm": 0.19246108829975128, "learning_rate": 1e-05, "loss": 0.0244, "step": 390100 }, { "epoch": 0.003902, "grad_norm": 0.26151350140571594, "learning_rate": 1e-05, "loss": 0.0244, "step": 390200 }, { "epoch": 0.003903, "grad_norm": 0.20223847031593323, "learning_rate": 1e-05, "loss": 0.0241, "step": 390300 }, { "epoch": 0.003904, "grad_norm": 0.2232682853937149, "learning_rate": 1e-05, "loss": 0.0243, "step": 390400 }, { "epoch": 0.003905, "grad_norm": 0.2576899230480194, "learning_rate": 1e-05, "loss": 0.0242, "step": 390500 }, { "epoch": 0.003906, "grad_norm": 0.2579266130924225, "learning_rate": 1e-05, "loss": 0.0244, "step": 390600 }, { "epoch": 0.003907, "grad_norm": 0.20377138257026672, "learning_rate": 1e-05, "loss": 0.0243, "step": 390700 }, { "epoch": 0.003908, "grad_norm": 0.22205713391304016, "learning_rate": 1e-05, "loss": 0.0241, "step": 390800 }, { "epoch": 0.003909, "grad_norm": 0.2162742018699646, "learning_rate": 1e-05, "loss": 0.0244, "step": 390900 }, { "epoch": 0.00391, "grad_norm": 0.30379900336265564, "learning_rate": 1e-05, "loss": 0.0248, "step": 391000 }, { "epoch": 0.003911, "grad_norm": 0.21567434072494507, "learning_rate": 1e-05, "loss": 0.0246, "step": 391100 }, { "epoch": 0.003912, "grad_norm": 0.19331777095794678, "learning_rate": 1e-05, "loss": 0.0247, "step": 391200 }, { "epoch": 0.003913, "grad_norm": 0.1569313257932663, "learning_rate": 1e-05, "loss": 0.0245, "step": 391300 }, { "epoch": 0.003914, "grad_norm": 0.2654237449169159, "learning_rate": 1e-05, "loss": 0.0245, "step": 391400 }, { "epoch": 0.003915, "grad_norm": 0.23599138855934143, "learning_rate": 1e-05, "loss": 0.0246, "step": 391500 }, { "epoch": 0.003916, "grad_norm": 0.1986730545759201, "learning_rate": 1e-05, "loss": 0.0244, "step": 391600 }, { "epoch": 0.003917, "grad_norm": 0.2396184504032135, "learning_rate": 1e-05, "loss": 0.0244, "step": 391700 }, { "epoch": 0.003918, "grad_norm": 0.19297581911087036, "learning_rate": 1e-05, "loss": 0.0238, "step": 391800 }, { "epoch": 0.003919, "grad_norm": 0.1976887285709381, "learning_rate": 1e-05, "loss": 0.0242, "step": 391900 }, { "epoch": 0.00392, "grad_norm": 0.20878970623016357, "learning_rate": 1e-05, "loss": 0.0245, "step": 392000 }, { "epoch": 0.003921, "grad_norm": 0.17915888130664825, "learning_rate": 1e-05, "loss": 0.0242, "step": 392100 }, { "epoch": 0.003922, "grad_norm": 0.2291068434715271, "learning_rate": 1e-05, "loss": 0.0237, "step": 392200 }, { "epoch": 0.003923, "grad_norm": 0.17525742948055267, "learning_rate": 1e-05, "loss": 0.0247, "step": 392300 }, { "epoch": 0.003924, "grad_norm": 0.2377382516860962, "learning_rate": 1e-05, "loss": 0.0245, "step": 392400 }, { "epoch": 0.003925, "grad_norm": 0.25245729088783264, "learning_rate": 1e-05, "loss": 0.0245, "step": 392500 }, { "epoch": 0.003926, "grad_norm": 0.1884782761335373, "learning_rate": 1e-05, "loss": 0.0242, "step": 392600 }, { "epoch": 0.003927, "grad_norm": 0.17751815915107727, "learning_rate": 1e-05, "loss": 0.0248, "step": 392700 }, { "epoch": 0.003928, "grad_norm": 0.15534016489982605, "learning_rate": 1e-05, "loss": 0.0243, "step": 392800 }, { "epoch": 0.003929, "grad_norm": 0.2107638120651245, "learning_rate": 1e-05, "loss": 0.0245, "step": 392900 }, { "epoch": 0.00393, "grad_norm": 0.15954922139644623, "learning_rate": 1e-05, "loss": 0.0246, "step": 393000 }, { "epoch": 0.003931, "grad_norm": 0.17600011825561523, "learning_rate": 1e-05, "loss": 0.0246, "step": 393100 }, { "epoch": 0.003932, "grad_norm": 0.1639776974916458, "learning_rate": 1e-05, "loss": 0.0246, "step": 393200 }, { "epoch": 0.003933, "grad_norm": 0.20656682550907135, "learning_rate": 1e-05, "loss": 0.0242, "step": 393300 }, { "epoch": 0.003934, "grad_norm": 0.2315010130405426, "learning_rate": 1e-05, "loss": 0.0239, "step": 393400 }, { "epoch": 0.003935, "grad_norm": 0.21008014678955078, "learning_rate": 1e-05, "loss": 0.0244, "step": 393500 }, { "epoch": 0.003936, "grad_norm": 0.23167107999324799, "learning_rate": 1e-05, "loss": 0.0242, "step": 393600 }, { "epoch": 0.003937, "grad_norm": 0.1700504720211029, "learning_rate": 1e-05, "loss": 0.0239, "step": 393700 }, { "epoch": 0.003938, "grad_norm": 0.2899870276451111, "learning_rate": 1e-05, "loss": 0.0239, "step": 393800 }, { "epoch": 0.003939, "grad_norm": 0.1761350929737091, "learning_rate": 1e-05, "loss": 0.0245, "step": 393900 }, { "epoch": 0.00394, "grad_norm": 0.1583409607410431, "learning_rate": 1e-05, "loss": 0.0244, "step": 394000 }, { "epoch": 0.003941, "grad_norm": 0.2389076054096222, "learning_rate": 1e-05, "loss": 0.0243, "step": 394100 }, { "epoch": 0.003942, "grad_norm": 0.2132047414779663, "learning_rate": 1e-05, "loss": 0.0235, "step": 394200 }, { "epoch": 0.003943, "grad_norm": 0.23401862382888794, "learning_rate": 1e-05, "loss": 0.0247, "step": 394300 }, { "epoch": 0.003944, "grad_norm": 0.21738748252391815, "learning_rate": 1e-05, "loss": 0.0239, "step": 394400 }, { "epoch": 0.003945, "grad_norm": 0.1965199112892151, "learning_rate": 1e-05, "loss": 0.0243, "step": 394500 }, { "epoch": 0.003946, "grad_norm": 0.18044523894786835, "learning_rate": 1e-05, "loss": 0.0241, "step": 394600 }, { "epoch": 0.003947, "grad_norm": 0.259910523891449, "learning_rate": 1e-05, "loss": 0.0248, "step": 394700 }, { "epoch": 0.003948, "grad_norm": 0.16778475046157837, "learning_rate": 1e-05, "loss": 0.0247, "step": 394800 }, { "epoch": 0.003949, "grad_norm": 0.2690548896789551, "learning_rate": 1e-05, "loss": 0.0242, "step": 394900 }, { "epoch": 0.00395, "grad_norm": 0.17565564811229706, "learning_rate": 1e-05, "loss": 0.0245, "step": 395000 }, { "epoch": 0.003951, "grad_norm": 0.2487185299396515, "learning_rate": 1e-05, "loss": 0.0241, "step": 395100 }, { "epoch": 0.003952, "grad_norm": 0.21471241116523743, "learning_rate": 1e-05, "loss": 0.0237, "step": 395200 }, { "epoch": 0.003953, "grad_norm": 0.20425668358802795, "learning_rate": 1e-05, "loss": 0.024, "step": 395300 }, { "epoch": 0.003954, "grad_norm": 0.24581287801265717, "learning_rate": 1e-05, "loss": 0.0247, "step": 395400 }, { "epoch": 0.003955, "grad_norm": 0.21950827538967133, "learning_rate": 1e-05, "loss": 0.0241, "step": 395500 }, { "epoch": 0.003956, "grad_norm": 0.21299898624420166, "learning_rate": 1e-05, "loss": 0.0243, "step": 395600 }, { "epoch": 0.003957, "grad_norm": 0.19356533885002136, "learning_rate": 1e-05, "loss": 0.0242, "step": 395700 }, { "epoch": 0.003958, "grad_norm": 0.1636401265859604, "learning_rate": 1e-05, "loss": 0.0245, "step": 395800 }, { "epoch": 0.003959, "grad_norm": 0.17646116018295288, "learning_rate": 1e-05, "loss": 0.024, "step": 395900 }, { "epoch": 0.00396, "grad_norm": 0.17995420098304749, "learning_rate": 1e-05, "loss": 0.0243, "step": 396000 }, { "epoch": 0.003961, "grad_norm": 0.14980532228946686, "learning_rate": 1e-05, "loss": 0.0241, "step": 396100 }, { "epoch": 0.003962, "grad_norm": 0.18425343930721283, "learning_rate": 1e-05, "loss": 0.0245, "step": 396200 }, { "epoch": 0.003963, "grad_norm": 0.18979232013225555, "learning_rate": 1e-05, "loss": 0.024, "step": 396300 }, { "epoch": 0.003964, "grad_norm": 0.25762420892715454, "learning_rate": 1e-05, "loss": 0.0237, "step": 396400 }, { "epoch": 0.003965, "grad_norm": 0.17355525493621826, "learning_rate": 1e-05, "loss": 0.0247, "step": 396500 }, { "epoch": 0.003966, "grad_norm": 0.14875204861164093, "learning_rate": 1e-05, "loss": 0.0241, "step": 396600 }, { "epoch": 0.003967, "grad_norm": 0.1550195962190628, "learning_rate": 1e-05, "loss": 0.0242, "step": 396700 }, { "epoch": 0.003968, "grad_norm": 0.23234505951404572, "learning_rate": 1e-05, "loss": 0.0243, "step": 396800 }, { "epoch": 0.003969, "grad_norm": 0.19327206909656525, "learning_rate": 1e-05, "loss": 0.0245, "step": 396900 }, { "epoch": 0.00397, "grad_norm": 0.2723594009876251, "learning_rate": 1e-05, "loss": 0.0245, "step": 397000 }, { "epoch": 0.003971, "grad_norm": 0.1802951842546463, "learning_rate": 1e-05, "loss": 0.024, "step": 397100 }, { "epoch": 0.003972, "grad_norm": 0.18921352922916412, "learning_rate": 1e-05, "loss": 0.0248, "step": 397200 }, { "epoch": 0.003973, "grad_norm": 0.2053617686033249, "learning_rate": 1e-05, "loss": 0.0243, "step": 397300 }, { "epoch": 0.003974, "grad_norm": 0.20489545166492462, "learning_rate": 1e-05, "loss": 0.0246, "step": 397400 }, { "epoch": 0.003975, "grad_norm": 0.18236985802650452, "learning_rate": 1e-05, "loss": 0.0242, "step": 397500 }, { "epoch": 0.003976, "grad_norm": 0.269508957862854, "learning_rate": 1e-05, "loss": 0.024, "step": 397600 }, { "epoch": 0.003977, "grad_norm": 0.17261308431625366, "learning_rate": 1e-05, "loss": 0.0244, "step": 397700 }, { "epoch": 0.003978, "grad_norm": 0.22099550068378448, "learning_rate": 1e-05, "loss": 0.0241, "step": 397800 }, { "epoch": 0.003979, "grad_norm": 0.19354794919490814, "learning_rate": 1e-05, "loss": 0.0241, "step": 397900 }, { "epoch": 0.00398, "grad_norm": 0.15845584869384766, "learning_rate": 1e-05, "loss": 0.0243, "step": 398000 }, { "epoch": 0.003981, "grad_norm": 0.18738186359405518, "learning_rate": 1e-05, "loss": 0.0241, "step": 398100 }, { "epoch": 0.003982, "grad_norm": 0.18495316803455353, "learning_rate": 1e-05, "loss": 0.024, "step": 398200 }, { "epoch": 0.003983, "grad_norm": 0.15310056507587433, "learning_rate": 1e-05, "loss": 0.024, "step": 398300 }, { "epoch": 0.003984, "grad_norm": 0.19418907165527344, "learning_rate": 1e-05, "loss": 0.0248, "step": 398400 }, { "epoch": 0.003985, "grad_norm": 0.14243687689304352, "learning_rate": 1e-05, "loss": 0.0244, "step": 398500 }, { "epoch": 0.003986, "grad_norm": 0.2533894181251526, "learning_rate": 1e-05, "loss": 0.024, "step": 398600 }, { "epoch": 0.003987, "grad_norm": 0.18048831820487976, "learning_rate": 1e-05, "loss": 0.0243, "step": 398700 }, { "epoch": 0.003988, "grad_norm": 0.21684424579143524, "learning_rate": 1e-05, "loss": 0.0247, "step": 398800 }, { "epoch": 0.003989, "grad_norm": 0.24964727461338043, "learning_rate": 1e-05, "loss": 0.0244, "step": 398900 }, { "epoch": 0.00399, "grad_norm": 0.24441255629062653, "learning_rate": 1e-05, "loss": 0.0242, "step": 399000 }, { "epoch": 0.003991, "grad_norm": 0.20440982282161713, "learning_rate": 1e-05, "loss": 0.0242, "step": 399100 }, { "epoch": 0.003992, "grad_norm": 0.2948802411556244, "learning_rate": 1e-05, "loss": 0.0243, "step": 399200 }, { "epoch": 0.003993, "grad_norm": 0.2583007514476776, "learning_rate": 1e-05, "loss": 0.0248, "step": 399300 }, { "epoch": 0.003994, "grad_norm": 0.3098246455192566, "learning_rate": 1e-05, "loss": 0.0243, "step": 399400 }, { "epoch": 0.003995, "grad_norm": 0.1914750635623932, "learning_rate": 1e-05, "loss": 0.0245, "step": 399500 }, { "epoch": 0.003996, "grad_norm": 0.22532373666763306, "learning_rate": 1e-05, "loss": 0.0244, "step": 399600 }, { "epoch": 0.003997, "grad_norm": 0.1786954700946808, "learning_rate": 1e-05, "loss": 0.0238, "step": 399700 }, { "epoch": 0.003998, "grad_norm": 0.16837936639785767, "learning_rate": 1e-05, "loss": 0.024, "step": 399800 }, { "epoch": 0.003999, "grad_norm": 0.2182445377111435, "learning_rate": 1e-05, "loss": 0.0242, "step": 399900 }, { "epoch": 0.004, "grad_norm": 0.17310243844985962, "learning_rate": 1e-05, "loss": 0.024, "step": 400000 }, { "epoch": 0.004, "eval_loss": 0.021325109526515007, "eval_runtime": 181.9813, "eval_samples_per_second": 274.754, "eval_steps_per_second": 17.172, "step": 400000 }, { "epoch": 0.004001, "grad_norm": 0.17864519357681274, "learning_rate": 1e-05, "loss": 0.0245, "step": 400100 }, { "epoch": 0.004002, "grad_norm": 0.21447062492370605, "learning_rate": 1e-05, "loss": 0.0238, "step": 400200 }, { "epoch": 0.004003, "grad_norm": 0.22690722346305847, "learning_rate": 1e-05, "loss": 0.0245, "step": 400300 }, { "epoch": 0.004004, "grad_norm": 0.22103351354599, "learning_rate": 1e-05, "loss": 0.0239, "step": 400400 }, { "epoch": 0.004005, "grad_norm": 0.25905048847198486, "learning_rate": 1e-05, "loss": 0.0241, "step": 400500 }, { "epoch": 0.004006, "grad_norm": 0.18882887065410614, "learning_rate": 1e-05, "loss": 0.0243, "step": 400600 }, { "epoch": 0.004007, "grad_norm": 0.21001237630844116, "learning_rate": 1e-05, "loss": 0.0243, "step": 400700 }, { "epoch": 0.004008, "grad_norm": 0.18807949125766754, "learning_rate": 1e-05, "loss": 0.0248, "step": 400800 }, { "epoch": 0.004009, "grad_norm": 0.19721829891204834, "learning_rate": 1e-05, "loss": 0.0241, "step": 400900 }, { "epoch": 0.00401, "grad_norm": 0.19580477476119995, "learning_rate": 1e-05, "loss": 0.0239, "step": 401000 }, { "epoch": 0.004011, "grad_norm": 0.30588701367378235, "learning_rate": 1e-05, "loss": 0.0244, "step": 401100 }, { "epoch": 0.004012, "grad_norm": 0.22376719117164612, "learning_rate": 1e-05, "loss": 0.024, "step": 401200 }, { "epoch": 0.004013, "grad_norm": 0.2030571550130844, "learning_rate": 1e-05, "loss": 0.0241, "step": 401300 }, { "epoch": 0.004014, "grad_norm": 0.22415898740291595, "learning_rate": 1e-05, "loss": 0.0242, "step": 401400 }, { "epoch": 0.004015, "grad_norm": 0.2560105621814728, "learning_rate": 1e-05, "loss": 0.0239, "step": 401500 }, { "epoch": 0.004016, "grad_norm": 0.18780073523521423, "learning_rate": 1e-05, "loss": 0.0243, "step": 401600 }, { "epoch": 0.004017, "grad_norm": 0.22121521830558777, "learning_rate": 1e-05, "loss": 0.0247, "step": 401700 }, { "epoch": 0.004018, "grad_norm": 0.17672856152057648, "learning_rate": 1e-05, "loss": 0.0242, "step": 401800 }, { "epoch": 0.004019, "grad_norm": 0.1632930487394333, "learning_rate": 1e-05, "loss": 0.0237, "step": 401900 }, { "epoch": 0.00402, "grad_norm": 0.20020559430122375, "learning_rate": 1e-05, "loss": 0.0236, "step": 402000 }, { "epoch": 0.004021, "grad_norm": 0.1854398399591446, "learning_rate": 1e-05, "loss": 0.0237, "step": 402100 }, { "epoch": 0.004022, "grad_norm": 0.2342918962240219, "learning_rate": 1e-05, "loss": 0.0245, "step": 402200 }, { "epoch": 0.004023, "grad_norm": 0.1739826649427414, "learning_rate": 1e-05, "loss": 0.0241, "step": 402300 }, { "epoch": 0.004024, "grad_norm": 0.254190057516098, "learning_rate": 1e-05, "loss": 0.0246, "step": 402400 }, { "epoch": 0.004025, "grad_norm": 0.17600515484809875, "learning_rate": 1e-05, "loss": 0.0239, "step": 402500 }, { "epoch": 0.004026, "grad_norm": 0.2600719928741455, "learning_rate": 1e-05, "loss": 0.0242, "step": 402600 }, { "epoch": 0.004027, "grad_norm": 0.20344164967536926, "learning_rate": 1e-05, "loss": 0.0244, "step": 402700 }, { "epoch": 0.004028, "grad_norm": 0.22224602103233337, "learning_rate": 1e-05, "loss": 0.0244, "step": 402800 }, { "epoch": 0.004029, "grad_norm": 0.19075195491313934, "learning_rate": 1e-05, "loss": 0.0239, "step": 402900 }, { "epoch": 0.00403, "grad_norm": 0.18946099281311035, "learning_rate": 1e-05, "loss": 0.0237, "step": 403000 }, { "epoch": 0.004031, "grad_norm": 0.17483866214752197, "learning_rate": 1e-05, "loss": 0.024, "step": 403100 }, { "epoch": 0.004032, "grad_norm": 0.2106839120388031, "learning_rate": 1e-05, "loss": 0.0243, "step": 403200 }, { "epoch": 0.004033, "grad_norm": 0.1726783961057663, "learning_rate": 1e-05, "loss": 0.0236, "step": 403300 }, { "epoch": 0.004034, "grad_norm": 0.26713061332702637, "learning_rate": 1e-05, "loss": 0.0242, "step": 403400 }, { "epoch": 0.004035, "grad_norm": 0.2140360325574875, "learning_rate": 1e-05, "loss": 0.0238, "step": 403500 }, { "epoch": 0.004036, "grad_norm": 0.2488483190536499, "learning_rate": 1e-05, "loss": 0.0242, "step": 403600 }, { "epoch": 0.004037, "grad_norm": 0.2509625554084778, "learning_rate": 1e-05, "loss": 0.0238, "step": 403700 }, { "epoch": 0.004038, "grad_norm": 0.20372413098812103, "learning_rate": 1e-05, "loss": 0.0247, "step": 403800 }, { "epoch": 0.004039, "grad_norm": 0.22386036813259125, "learning_rate": 1e-05, "loss": 0.0242, "step": 403900 }, { "epoch": 0.00404, "grad_norm": 0.19657465815544128, "learning_rate": 1e-05, "loss": 0.0241, "step": 404000 }, { "epoch": 0.004041, "grad_norm": 0.18385273218154907, "learning_rate": 1e-05, "loss": 0.0236, "step": 404100 }, { "epoch": 0.004042, "grad_norm": 0.22625063359737396, "learning_rate": 1e-05, "loss": 0.0242, "step": 404200 }, { "epoch": 0.004043, "grad_norm": 0.22416622936725616, "learning_rate": 1e-05, "loss": 0.0241, "step": 404300 }, { "epoch": 0.004044, "grad_norm": 0.2363739013671875, "learning_rate": 1e-05, "loss": 0.0232, "step": 404400 }, { "epoch": 0.004045, "grad_norm": 0.17753678560256958, "learning_rate": 1e-05, "loss": 0.0241, "step": 404500 }, { "epoch": 0.004046, "grad_norm": 0.27816784381866455, "learning_rate": 1e-05, "loss": 0.0243, "step": 404600 }, { "epoch": 0.004047, "grad_norm": 0.18444371223449707, "learning_rate": 1e-05, "loss": 0.0238, "step": 404700 }, { "epoch": 0.004048, "grad_norm": 0.1938232034444809, "learning_rate": 1e-05, "loss": 0.0249, "step": 404800 }, { "epoch": 0.004049, "grad_norm": 0.20411688089370728, "learning_rate": 1e-05, "loss": 0.0235, "step": 404900 }, { "epoch": 0.00405, "grad_norm": 0.1435740888118744, "learning_rate": 1e-05, "loss": 0.0241, "step": 405000 }, { "epoch": 0.004051, "grad_norm": 0.27717411518096924, "learning_rate": 1e-05, "loss": 0.0233, "step": 405100 }, { "epoch": 0.004052, "grad_norm": 0.2798541784286499, "learning_rate": 1e-05, "loss": 0.0236, "step": 405200 }, { "epoch": 0.004053, "grad_norm": 0.3026045560836792, "learning_rate": 1e-05, "loss": 0.0243, "step": 405300 }, { "epoch": 0.004054, "grad_norm": 0.2283545732498169, "learning_rate": 1e-05, "loss": 0.0242, "step": 405400 }, { "epoch": 0.004055, "grad_norm": 0.20050480961799622, "learning_rate": 1e-05, "loss": 0.0238, "step": 405500 }, { "epoch": 0.004056, "grad_norm": 0.20615510642528534, "learning_rate": 1e-05, "loss": 0.0239, "step": 405600 }, { "epoch": 0.004057, "grad_norm": 0.18706946074962616, "learning_rate": 1e-05, "loss": 0.0241, "step": 405700 }, { "epoch": 0.004058, "grad_norm": 0.19194181263446808, "learning_rate": 1e-05, "loss": 0.0241, "step": 405800 }, { "epoch": 0.004059, "grad_norm": 0.1784643828868866, "learning_rate": 1e-05, "loss": 0.024, "step": 405900 }, { "epoch": 0.00406, "grad_norm": 0.19931229948997498, "learning_rate": 1e-05, "loss": 0.0241, "step": 406000 }, { "epoch": 0.004061, "grad_norm": 0.2299507111310959, "learning_rate": 1e-05, "loss": 0.0238, "step": 406100 }, { "epoch": 0.004062, "grad_norm": 0.16718555986881256, "learning_rate": 1e-05, "loss": 0.0241, "step": 406200 }, { "epoch": 0.004063, "grad_norm": 0.2337353527545929, "learning_rate": 1e-05, "loss": 0.0236, "step": 406300 }, { "epoch": 0.004064, "grad_norm": 0.16645725071430206, "learning_rate": 1e-05, "loss": 0.0242, "step": 406400 }, { "epoch": 0.004065, "grad_norm": 0.19582144916057587, "learning_rate": 1e-05, "loss": 0.0239, "step": 406500 }, { "epoch": 0.004066, "grad_norm": 0.15095144510269165, "learning_rate": 1e-05, "loss": 0.0244, "step": 406600 }, { "epoch": 0.004067, "grad_norm": 0.2913278043270111, "learning_rate": 1e-05, "loss": 0.024, "step": 406700 }, { "epoch": 0.004068, "grad_norm": 0.2009751796722412, "learning_rate": 1e-05, "loss": 0.0239, "step": 406800 }, { "epoch": 0.004069, "grad_norm": 0.18142026662826538, "learning_rate": 1e-05, "loss": 0.0245, "step": 406900 }, { "epoch": 0.00407, "grad_norm": 0.1700320541858673, "learning_rate": 1e-05, "loss": 0.0243, "step": 407000 }, { "epoch": 0.004071, "grad_norm": 0.149856299161911, "learning_rate": 1e-05, "loss": 0.024, "step": 407100 }, { "epoch": 0.004072, "grad_norm": 0.16043926775455475, "learning_rate": 1e-05, "loss": 0.024, "step": 407200 }, { "epoch": 0.004073, "grad_norm": 0.2235875129699707, "learning_rate": 1e-05, "loss": 0.0241, "step": 407300 }, { "epoch": 0.004074, "grad_norm": 0.23072665929794312, "learning_rate": 1e-05, "loss": 0.0239, "step": 407400 }, { "epoch": 0.004075, "grad_norm": 0.2780907452106476, "learning_rate": 1e-05, "loss": 0.024, "step": 407500 }, { "epoch": 0.004076, "grad_norm": 0.17851833999156952, "learning_rate": 1e-05, "loss": 0.0234, "step": 407600 }, { "epoch": 0.004077, "grad_norm": 0.17850230634212494, "learning_rate": 1e-05, "loss": 0.0239, "step": 407700 }, { "epoch": 0.004078, "grad_norm": 0.23101787269115448, "learning_rate": 1e-05, "loss": 0.0236, "step": 407800 }, { "epoch": 0.004079, "grad_norm": 0.15315276384353638, "learning_rate": 1e-05, "loss": 0.0235, "step": 407900 }, { "epoch": 0.00408, "grad_norm": 0.3056657612323761, "learning_rate": 1e-05, "loss": 0.0235, "step": 408000 }, { "epoch": 0.004081, "grad_norm": 0.1600840538740158, "learning_rate": 1e-05, "loss": 0.0233, "step": 408100 }, { "epoch": 0.004082, "grad_norm": 0.2069026231765747, "learning_rate": 1e-05, "loss": 0.0234, "step": 408200 }, { "epoch": 0.004083, "grad_norm": 0.16931883990764618, "learning_rate": 1e-05, "loss": 0.0239, "step": 408300 }, { "epoch": 0.004084, "grad_norm": 0.16279153525829315, "learning_rate": 1e-05, "loss": 0.0245, "step": 408400 }, { "epoch": 0.004085, "grad_norm": 0.30323198437690735, "learning_rate": 1e-05, "loss": 0.0244, "step": 408500 }, { "epoch": 0.004086, "grad_norm": 0.15873537957668304, "learning_rate": 1e-05, "loss": 0.0238, "step": 408600 }, { "epoch": 0.004087, "grad_norm": 0.2052287757396698, "learning_rate": 1e-05, "loss": 0.0237, "step": 408700 }, { "epoch": 0.004088, "grad_norm": 0.24865420162677765, "learning_rate": 1e-05, "loss": 0.024, "step": 408800 }, { "epoch": 0.004089, "grad_norm": 0.20936857163906097, "learning_rate": 1e-05, "loss": 0.0231, "step": 408900 }, { "epoch": 0.00409, "grad_norm": 0.17382682859897614, "learning_rate": 1e-05, "loss": 0.0244, "step": 409000 }, { "epoch": 0.004091, "grad_norm": 0.15957964956760406, "learning_rate": 1e-05, "loss": 0.0241, "step": 409100 }, { "epoch": 0.004092, "grad_norm": 0.14949068427085876, "learning_rate": 1e-05, "loss": 0.0237, "step": 409200 }, { "epoch": 0.004093, "grad_norm": 0.2077893763780594, "learning_rate": 1e-05, "loss": 0.0241, "step": 409300 }, { "epoch": 0.004094, "grad_norm": 0.19283032417297363, "learning_rate": 1e-05, "loss": 0.0238, "step": 409400 }, { "epoch": 0.004095, "grad_norm": 0.15965749323368073, "learning_rate": 1e-05, "loss": 0.0233, "step": 409500 }, { "epoch": 0.004096, "grad_norm": 0.29571226239204407, "learning_rate": 1e-05, "loss": 0.0241, "step": 409600 }, { "epoch": 0.004097, "grad_norm": 0.15089602768421173, "learning_rate": 1e-05, "loss": 0.0243, "step": 409700 }, { "epoch": 0.004098, "grad_norm": 0.20679809153079987, "learning_rate": 1e-05, "loss": 0.0237, "step": 409800 }, { "epoch": 0.004099, "grad_norm": 0.2337440401315689, "learning_rate": 1e-05, "loss": 0.0242, "step": 409900 }, { "epoch": 0.0041, "grad_norm": 0.23629258573055267, "learning_rate": 1e-05, "loss": 0.024, "step": 410000 }, { "epoch": 0.004101, "grad_norm": 0.21103115379810333, "learning_rate": 1e-05, "loss": 0.0236, "step": 410100 }, { "epoch": 0.004102, "grad_norm": 0.17498356103897095, "learning_rate": 1e-05, "loss": 0.0236, "step": 410200 }, { "epoch": 0.004103, "grad_norm": 0.2011914849281311, "learning_rate": 1e-05, "loss": 0.024, "step": 410300 }, { "epoch": 0.004104, "grad_norm": 0.1928895264863968, "learning_rate": 1e-05, "loss": 0.0242, "step": 410400 }, { "epoch": 0.004105, "grad_norm": 0.18732555210590363, "learning_rate": 1e-05, "loss": 0.0236, "step": 410500 }, { "epoch": 0.004106, "grad_norm": 0.20819500088691711, "learning_rate": 1e-05, "loss": 0.0242, "step": 410600 }, { "epoch": 0.004107, "grad_norm": 0.15586677193641663, "learning_rate": 1e-05, "loss": 0.024, "step": 410700 }, { "epoch": 0.004108, "grad_norm": 0.2075367271900177, "learning_rate": 1e-05, "loss": 0.0239, "step": 410800 }, { "epoch": 0.004109, "grad_norm": 0.1591407209634781, "learning_rate": 1e-05, "loss": 0.0233, "step": 410900 }, { "epoch": 0.00411, "grad_norm": 0.2341272085905075, "learning_rate": 1e-05, "loss": 0.0242, "step": 411000 }, { "epoch": 0.004111, "grad_norm": 0.2315295785665512, "learning_rate": 1e-05, "loss": 0.0242, "step": 411100 }, { "epoch": 0.004112, "grad_norm": 0.1858806014060974, "learning_rate": 1e-05, "loss": 0.0244, "step": 411200 }, { "epoch": 0.004113, "grad_norm": 0.2048673778772354, "learning_rate": 1e-05, "loss": 0.0237, "step": 411300 }, { "epoch": 0.004114, "grad_norm": 0.19979144632816315, "learning_rate": 1e-05, "loss": 0.0233, "step": 411400 }, { "epoch": 0.004115, "grad_norm": 0.21563541889190674, "learning_rate": 1e-05, "loss": 0.0242, "step": 411500 }, { "epoch": 0.004116, "grad_norm": 0.1873602569103241, "learning_rate": 1e-05, "loss": 0.024, "step": 411600 }, { "epoch": 0.004117, "grad_norm": 0.2290479987859726, "learning_rate": 1e-05, "loss": 0.024, "step": 411700 }, { "epoch": 0.004118, "grad_norm": 0.24949702620506287, "learning_rate": 1e-05, "loss": 0.0243, "step": 411800 }, { "epoch": 0.004119, "grad_norm": 0.22323231399059296, "learning_rate": 1e-05, "loss": 0.0236, "step": 411900 }, { "epoch": 0.00412, "grad_norm": 0.1746332049369812, "learning_rate": 1e-05, "loss": 0.0239, "step": 412000 }, { "epoch": 0.004121, "grad_norm": 0.2160986214876175, "learning_rate": 1e-05, "loss": 0.0242, "step": 412100 }, { "epoch": 0.004122, "grad_norm": 0.26759248971939087, "learning_rate": 1e-05, "loss": 0.0239, "step": 412200 }, { "epoch": 0.004123, "grad_norm": 0.16025647521018982, "learning_rate": 1e-05, "loss": 0.0242, "step": 412300 }, { "epoch": 0.004124, "grad_norm": 0.1693701595067978, "learning_rate": 1e-05, "loss": 0.0238, "step": 412400 }, { "epoch": 0.004125, "grad_norm": 0.253498911857605, "learning_rate": 1e-05, "loss": 0.0239, "step": 412500 }, { "epoch": 0.004126, "grad_norm": 0.23547326028347015, "learning_rate": 1e-05, "loss": 0.0243, "step": 412600 }, { "epoch": 0.004127, "grad_norm": 0.23215053975582123, "learning_rate": 1e-05, "loss": 0.0242, "step": 412700 }, { "epoch": 0.004128, "grad_norm": 0.19575569033622742, "learning_rate": 1e-05, "loss": 0.0239, "step": 412800 }, { "epoch": 0.004129, "grad_norm": 0.3548726439476013, "learning_rate": 1e-05, "loss": 0.0241, "step": 412900 }, { "epoch": 0.00413, "grad_norm": 0.24185355007648468, "learning_rate": 1e-05, "loss": 0.0236, "step": 413000 }, { "epoch": 0.004131, "grad_norm": 0.16292066872119904, "learning_rate": 1e-05, "loss": 0.0236, "step": 413100 }, { "epoch": 0.004132, "grad_norm": 0.16266168653964996, "learning_rate": 1e-05, "loss": 0.0237, "step": 413200 }, { "epoch": 0.004133, "grad_norm": 0.1676439493894577, "learning_rate": 1e-05, "loss": 0.0238, "step": 413300 }, { "epoch": 0.004134, "grad_norm": 0.24161051213741302, "learning_rate": 1e-05, "loss": 0.0242, "step": 413400 }, { "epoch": 0.004135, "grad_norm": 0.25921371579170227, "learning_rate": 1e-05, "loss": 0.0238, "step": 413500 }, { "epoch": 0.004136, "grad_norm": 0.2421140819787979, "learning_rate": 1e-05, "loss": 0.0243, "step": 413600 }, { "epoch": 0.004137, "grad_norm": 0.18767842650413513, "learning_rate": 1e-05, "loss": 0.024, "step": 413700 }, { "epoch": 0.004138, "grad_norm": 0.17803381383419037, "learning_rate": 1e-05, "loss": 0.0243, "step": 413800 }, { "epoch": 0.004139, "grad_norm": 0.24695448577404022, "learning_rate": 1e-05, "loss": 0.024, "step": 413900 }, { "epoch": 0.00414, "grad_norm": 0.18784227967262268, "learning_rate": 1e-05, "loss": 0.0234, "step": 414000 }, { "epoch": 0.004141, "grad_norm": 0.20039895176887512, "learning_rate": 1e-05, "loss": 0.0236, "step": 414100 }, { "epoch": 0.004142, "grad_norm": 0.19771581888198853, "learning_rate": 1e-05, "loss": 0.0237, "step": 414200 }, { "epoch": 0.004143, "grad_norm": 0.1711001992225647, "learning_rate": 1e-05, "loss": 0.0237, "step": 414300 }, { "epoch": 0.004144, "grad_norm": 0.21709679067134857, "learning_rate": 1e-05, "loss": 0.0236, "step": 414400 }, { "epoch": 0.004145, "grad_norm": 0.2471849024295807, "learning_rate": 1e-05, "loss": 0.0239, "step": 414500 }, { "epoch": 0.004146, "grad_norm": 0.1705552190542221, "learning_rate": 1e-05, "loss": 0.0241, "step": 414600 }, { "epoch": 0.004147, "grad_norm": 0.19185632467269897, "learning_rate": 1e-05, "loss": 0.0237, "step": 414700 }, { "epoch": 0.004148, "grad_norm": 0.25090599060058594, "learning_rate": 1e-05, "loss": 0.0238, "step": 414800 }, { "epoch": 0.004149, "grad_norm": 0.19111108779907227, "learning_rate": 1e-05, "loss": 0.0239, "step": 414900 }, { "epoch": 0.00415, "grad_norm": 0.19888189435005188, "learning_rate": 1e-05, "loss": 0.0239, "step": 415000 }, { "epoch": 0.004151, "grad_norm": 0.20116466283798218, "learning_rate": 1e-05, "loss": 0.024, "step": 415100 }, { "epoch": 0.004152, "grad_norm": 0.16846789419651031, "learning_rate": 1e-05, "loss": 0.0242, "step": 415200 }, { "epoch": 0.004153, "grad_norm": 0.15874427556991577, "learning_rate": 1e-05, "loss": 0.0241, "step": 415300 }, { "epoch": 0.004154, "grad_norm": 0.2389228343963623, "learning_rate": 1e-05, "loss": 0.0233, "step": 415400 }, { "epoch": 0.004155, "grad_norm": 0.16489608585834503, "learning_rate": 1e-05, "loss": 0.0239, "step": 415500 }, { "epoch": 0.004156, "grad_norm": 0.19362910091876984, "learning_rate": 1e-05, "loss": 0.0234, "step": 415600 }, { "epoch": 0.004157, "grad_norm": 0.22064854204654694, "learning_rate": 1e-05, "loss": 0.0239, "step": 415700 }, { "epoch": 0.004158, "grad_norm": 0.29256850481033325, "learning_rate": 1e-05, "loss": 0.0239, "step": 415800 }, { "epoch": 0.004159, "grad_norm": 0.2016410231590271, "learning_rate": 1e-05, "loss": 0.0239, "step": 415900 }, { "epoch": 0.00416, "grad_norm": 0.20603743195533752, "learning_rate": 1e-05, "loss": 0.0237, "step": 416000 }, { "epoch": 0.004161, "grad_norm": 0.19934989511966705, "learning_rate": 1e-05, "loss": 0.0241, "step": 416100 }, { "epoch": 0.004162, "grad_norm": 0.16956506669521332, "learning_rate": 1e-05, "loss": 0.0235, "step": 416200 }, { "epoch": 0.004163, "grad_norm": 0.16828955709934235, "learning_rate": 1e-05, "loss": 0.024, "step": 416300 }, { "epoch": 0.004164, "grad_norm": 0.19737327098846436, "learning_rate": 1e-05, "loss": 0.0244, "step": 416400 }, { "epoch": 0.004165, "grad_norm": 0.20796439051628113, "learning_rate": 1e-05, "loss": 0.0238, "step": 416500 }, { "epoch": 0.004166, "grad_norm": 0.1688832938671112, "learning_rate": 1e-05, "loss": 0.0238, "step": 416600 }, { "epoch": 0.004167, "grad_norm": 0.1987214833498001, "learning_rate": 1e-05, "loss": 0.0238, "step": 416700 }, { "epoch": 0.004168, "grad_norm": 0.16946817934513092, "learning_rate": 1e-05, "loss": 0.0244, "step": 416800 }, { "epoch": 0.004169, "grad_norm": 0.2016819566488266, "learning_rate": 1e-05, "loss": 0.0242, "step": 416900 }, { "epoch": 0.00417, "grad_norm": 0.1842210292816162, "learning_rate": 1e-05, "loss": 0.0234, "step": 417000 }, { "epoch": 0.004171, "grad_norm": 0.28646185994148254, "learning_rate": 1e-05, "loss": 0.0235, "step": 417100 }, { "epoch": 0.004172, "grad_norm": 0.24527733027935028, "learning_rate": 1e-05, "loss": 0.0243, "step": 417200 }, { "epoch": 0.004173, "grad_norm": 0.15508246421813965, "learning_rate": 1e-05, "loss": 0.0234, "step": 417300 }, { "epoch": 0.004174, "grad_norm": 0.2016175389289856, "learning_rate": 1e-05, "loss": 0.0243, "step": 417400 }, { "epoch": 0.004175, "grad_norm": 0.16688264906406403, "learning_rate": 1e-05, "loss": 0.0235, "step": 417500 }, { "epoch": 0.004176, "grad_norm": 0.21296170353889465, "learning_rate": 1e-05, "loss": 0.0231, "step": 417600 }, { "epoch": 0.004177, "grad_norm": 0.21790237724781036, "learning_rate": 1e-05, "loss": 0.0238, "step": 417700 }, { "epoch": 0.004178, "grad_norm": 0.1724802553653717, "learning_rate": 1e-05, "loss": 0.0236, "step": 417800 }, { "epoch": 0.004179, "grad_norm": 0.27631035447120667, "learning_rate": 1e-05, "loss": 0.0239, "step": 417900 }, { "epoch": 0.00418, "grad_norm": 0.18844562768936157, "learning_rate": 1e-05, "loss": 0.0232, "step": 418000 }, { "epoch": 0.004181, "grad_norm": 0.17523527145385742, "learning_rate": 1e-05, "loss": 0.0234, "step": 418100 }, { "epoch": 0.004182, "grad_norm": 0.20500946044921875, "learning_rate": 1e-05, "loss": 0.0238, "step": 418200 }, { "epoch": 0.004183, "grad_norm": 0.21619221568107605, "learning_rate": 1e-05, "loss": 0.0232, "step": 418300 }, { "epoch": 0.004184, "grad_norm": 0.17848503589630127, "learning_rate": 1e-05, "loss": 0.024, "step": 418400 }, { "epoch": 0.004185, "grad_norm": 0.2213757485151291, "learning_rate": 1e-05, "loss": 0.0235, "step": 418500 }, { "epoch": 0.004186, "grad_norm": 0.23661598563194275, "learning_rate": 1e-05, "loss": 0.0238, "step": 418600 }, { "epoch": 0.004187, "grad_norm": 0.24121993780136108, "learning_rate": 1e-05, "loss": 0.024, "step": 418700 }, { "epoch": 0.004188, "grad_norm": 0.1613485962152481, "learning_rate": 1e-05, "loss": 0.0236, "step": 418800 }, { "epoch": 0.004189, "grad_norm": 0.2982117235660553, "learning_rate": 1e-05, "loss": 0.0237, "step": 418900 }, { "epoch": 0.00419, "grad_norm": 0.2035265862941742, "learning_rate": 1e-05, "loss": 0.0236, "step": 419000 }, { "epoch": 0.004191, "grad_norm": 0.22106562554836273, "learning_rate": 1e-05, "loss": 0.0235, "step": 419100 }, { "epoch": 0.004192, "grad_norm": 0.264619916677475, "learning_rate": 1e-05, "loss": 0.0234, "step": 419200 }, { "epoch": 0.004193, "grad_norm": 0.1567707359790802, "learning_rate": 1e-05, "loss": 0.0238, "step": 419300 }, { "epoch": 0.004194, "grad_norm": 0.2539576590061188, "learning_rate": 1e-05, "loss": 0.0238, "step": 419400 }, { "epoch": 0.004195, "grad_norm": 0.1844097077846527, "learning_rate": 1e-05, "loss": 0.0241, "step": 419500 }, { "epoch": 0.004196, "grad_norm": 0.18894335627555847, "learning_rate": 1e-05, "loss": 0.0238, "step": 419600 }, { "epoch": 0.004197, "grad_norm": 0.28224891424179077, "learning_rate": 1e-05, "loss": 0.0242, "step": 419700 }, { "epoch": 0.004198, "grad_norm": 0.16208012402057648, "learning_rate": 1e-05, "loss": 0.0241, "step": 419800 }, { "epoch": 0.004199, "grad_norm": 0.13610723614692688, "learning_rate": 1e-05, "loss": 0.0237, "step": 419900 }, { "epoch": 0.0042, "grad_norm": 0.1662692129611969, "learning_rate": 1e-05, "loss": 0.0232, "step": 420000 }, { "epoch": 0.0042, "eval_loss": 0.02153271809220314, "eval_runtime": 193.1882, "eval_samples_per_second": 258.815, "eval_steps_per_second": 16.176, "step": 420000 }, { "epoch": 0.004201, "grad_norm": 0.1924775242805481, "learning_rate": 1e-05, "loss": 0.024, "step": 420100 }, { "epoch": 0.004202, "grad_norm": 0.22987835109233856, "learning_rate": 1e-05, "loss": 0.0237, "step": 420200 }, { "epoch": 0.004203, "grad_norm": 0.1986425668001175, "learning_rate": 1e-05, "loss": 0.0242, "step": 420300 }, { "epoch": 0.004204, "grad_norm": 0.1781289428472519, "learning_rate": 1e-05, "loss": 0.0237, "step": 420400 }, { "epoch": 0.004205, "grad_norm": 0.20679186284542084, "learning_rate": 1e-05, "loss": 0.0237, "step": 420500 }, { "epoch": 0.004206, "grad_norm": 0.21408845484256744, "learning_rate": 1e-05, "loss": 0.0241, "step": 420600 }, { "epoch": 0.004207, "grad_norm": 0.20359979569911957, "learning_rate": 1e-05, "loss": 0.0235, "step": 420700 }, { "epoch": 0.004208, "grad_norm": 0.17168061435222626, "learning_rate": 1e-05, "loss": 0.0231, "step": 420800 }, { "epoch": 0.004209, "grad_norm": 0.2686200439929962, "learning_rate": 1e-05, "loss": 0.0236, "step": 420900 }, { "epoch": 0.00421, "grad_norm": 0.2675858438014984, "learning_rate": 1e-05, "loss": 0.0237, "step": 421000 }, { "epoch": 0.004211, "grad_norm": 0.19950535893440247, "learning_rate": 1e-05, "loss": 0.0236, "step": 421100 }, { "epoch": 0.004212, "grad_norm": 0.1846756935119629, "learning_rate": 1e-05, "loss": 0.0238, "step": 421200 }, { "epoch": 0.004213, "grad_norm": 0.2127295732498169, "learning_rate": 1e-05, "loss": 0.0238, "step": 421300 }, { "epoch": 0.004214, "grad_norm": 0.17740221321582794, "learning_rate": 1e-05, "loss": 0.0233, "step": 421400 }, { "epoch": 0.004215, "grad_norm": 0.17140015959739685, "learning_rate": 1e-05, "loss": 0.0242, "step": 421500 }, { "epoch": 0.004216, "grad_norm": 0.1850009262561798, "learning_rate": 1e-05, "loss": 0.0236, "step": 421600 }, { "epoch": 0.004217, "grad_norm": 0.21517637372016907, "learning_rate": 1e-05, "loss": 0.0233, "step": 421700 }, { "epoch": 0.004218, "grad_norm": 0.2141711711883545, "learning_rate": 1e-05, "loss": 0.0241, "step": 421800 }, { "epoch": 0.004219, "grad_norm": 0.29322028160095215, "learning_rate": 1e-05, "loss": 0.0235, "step": 421900 }, { "epoch": 0.00422, "grad_norm": 0.2434767633676529, "learning_rate": 1e-05, "loss": 0.0239, "step": 422000 }, { "epoch": 0.004221, "grad_norm": 0.17590637505054474, "learning_rate": 1e-05, "loss": 0.0236, "step": 422100 }, { "epoch": 0.004222, "grad_norm": 0.25584331154823303, "learning_rate": 1e-05, "loss": 0.024, "step": 422200 }, { "epoch": 0.004223, "grad_norm": 0.238374724984169, "learning_rate": 1e-05, "loss": 0.024, "step": 422300 }, { "epoch": 0.004224, "grad_norm": 0.1754980832338333, "learning_rate": 1e-05, "loss": 0.0237, "step": 422400 }, { "epoch": 0.004225, "grad_norm": 0.2889844477176666, "learning_rate": 1e-05, "loss": 0.0238, "step": 422500 }, { "epoch": 0.004226, "grad_norm": 0.22638972103595734, "learning_rate": 1e-05, "loss": 0.0239, "step": 422600 }, { "epoch": 0.004227, "grad_norm": 0.2191096544265747, "learning_rate": 1e-05, "loss": 0.0239, "step": 422700 }, { "epoch": 0.004228, "grad_norm": 0.19967034459114075, "learning_rate": 1e-05, "loss": 0.0237, "step": 422800 }, { "epoch": 0.004229, "grad_norm": 0.19170403480529785, "learning_rate": 1e-05, "loss": 0.0238, "step": 422900 }, { "epoch": 0.00423, "grad_norm": 0.22899439930915833, "learning_rate": 1e-05, "loss": 0.0238, "step": 423000 }, { "epoch": 0.004231, "grad_norm": 0.21077010035514832, "learning_rate": 1e-05, "loss": 0.0243, "step": 423100 }, { "epoch": 0.004232, "grad_norm": 0.20535080134868622, "learning_rate": 1e-05, "loss": 0.0233, "step": 423200 }, { "epoch": 0.004233, "grad_norm": 0.16150633990764618, "learning_rate": 1e-05, "loss": 0.0235, "step": 423300 }, { "epoch": 0.004234, "grad_norm": 0.21215292811393738, "learning_rate": 1e-05, "loss": 0.0234, "step": 423400 }, { "epoch": 0.004235, "grad_norm": 0.20256568491458893, "learning_rate": 1e-05, "loss": 0.0236, "step": 423500 }, { "epoch": 0.004236, "grad_norm": 0.18233750760555267, "learning_rate": 1e-05, "loss": 0.0232, "step": 423600 }, { "epoch": 0.004237, "grad_norm": 0.20284625887870789, "learning_rate": 1e-05, "loss": 0.0241, "step": 423700 }, { "epoch": 0.004238, "grad_norm": 0.18513615429401398, "learning_rate": 1e-05, "loss": 0.0237, "step": 423800 }, { "epoch": 0.004239, "grad_norm": 0.1844584345817566, "learning_rate": 1e-05, "loss": 0.0233, "step": 423900 }, { "epoch": 0.00424, "grad_norm": 0.20986753702163696, "learning_rate": 1e-05, "loss": 0.0239, "step": 424000 }, { "epoch": 0.004241, "grad_norm": 0.27762240171432495, "learning_rate": 1e-05, "loss": 0.0231, "step": 424100 }, { "epoch": 0.004242, "grad_norm": 0.27391934394836426, "learning_rate": 1e-05, "loss": 0.024, "step": 424200 }, { "epoch": 0.004243, "grad_norm": 0.1804317682981491, "learning_rate": 1e-05, "loss": 0.0233, "step": 424300 }, { "epoch": 0.004244, "grad_norm": 0.17361098527908325, "learning_rate": 1e-05, "loss": 0.0238, "step": 424400 }, { "epoch": 0.004245, "grad_norm": 0.254703164100647, "learning_rate": 1e-05, "loss": 0.0237, "step": 424500 }, { "epoch": 0.004246, "grad_norm": 0.15799972414970398, "learning_rate": 1e-05, "loss": 0.0236, "step": 424600 }, { "epoch": 0.004247, "grad_norm": 0.18377424776554108, "learning_rate": 1e-05, "loss": 0.0241, "step": 424700 }, { "epoch": 0.004248, "grad_norm": 0.2696676254272461, "learning_rate": 1e-05, "loss": 0.0235, "step": 424800 }, { "epoch": 0.004249, "grad_norm": 0.24274636805057526, "learning_rate": 1e-05, "loss": 0.0238, "step": 424900 }, { "epoch": 0.00425, "grad_norm": 0.2796351909637451, "learning_rate": 1e-05, "loss": 0.0233, "step": 425000 }, { "epoch": 0.004251, "grad_norm": 0.19691476225852966, "learning_rate": 1e-05, "loss": 0.0237, "step": 425100 }, { "epoch": 0.004252, "grad_norm": 0.20784056186676025, "learning_rate": 1e-05, "loss": 0.0236, "step": 425200 }, { "epoch": 0.004253, "grad_norm": 0.18475975096225739, "learning_rate": 1e-05, "loss": 0.0238, "step": 425300 }, { "epoch": 0.004254, "grad_norm": 0.19756901264190674, "learning_rate": 1e-05, "loss": 0.0234, "step": 425400 }, { "epoch": 0.004255, "grad_norm": 0.24494299292564392, "learning_rate": 1e-05, "loss": 0.0233, "step": 425500 }, { "epoch": 0.004256, "grad_norm": 0.31264087557792664, "learning_rate": 1e-05, "loss": 0.0229, "step": 425600 }, { "epoch": 0.004257, "grad_norm": 0.20705053210258484, "learning_rate": 1e-05, "loss": 0.0237, "step": 425700 }, { "epoch": 0.004258, "grad_norm": 0.3010505437850952, "learning_rate": 1e-05, "loss": 0.0238, "step": 425800 }, { "epoch": 0.004259, "grad_norm": 0.22312258183956146, "learning_rate": 1e-05, "loss": 0.0237, "step": 425900 }, { "epoch": 0.00426, "grad_norm": 0.3098558485507965, "learning_rate": 1e-05, "loss": 0.0236, "step": 426000 }, { "epoch": 0.004261, "grad_norm": 0.18097002804279327, "learning_rate": 1e-05, "loss": 0.024, "step": 426100 }, { "epoch": 0.004262, "grad_norm": 0.195201575756073, "learning_rate": 1e-05, "loss": 0.0233, "step": 426200 }, { "epoch": 0.004263, "grad_norm": 0.14585259556770325, "learning_rate": 1e-05, "loss": 0.0231, "step": 426300 }, { "epoch": 0.004264, "grad_norm": 0.2256711721420288, "learning_rate": 1e-05, "loss": 0.0238, "step": 426400 }, { "epoch": 0.004265, "grad_norm": 0.16565346717834473, "learning_rate": 1e-05, "loss": 0.024, "step": 426500 }, { "epoch": 0.004266, "grad_norm": 0.1964271366596222, "learning_rate": 1e-05, "loss": 0.0231, "step": 426600 }, { "epoch": 0.004267, "grad_norm": 0.18551132082939148, "learning_rate": 1e-05, "loss": 0.0233, "step": 426700 }, { "epoch": 0.004268, "grad_norm": 0.20699335634708405, "learning_rate": 1e-05, "loss": 0.0239, "step": 426800 }, { "epoch": 0.004269, "grad_norm": 0.22032226622104645, "learning_rate": 1e-05, "loss": 0.0235, "step": 426900 }, { "epoch": 0.00427, "grad_norm": 0.30567118525505066, "learning_rate": 1e-05, "loss": 0.0233, "step": 427000 }, { "epoch": 0.004271, "grad_norm": 0.233809232711792, "learning_rate": 1e-05, "loss": 0.0236, "step": 427100 }, { "epoch": 0.004272, "grad_norm": 0.2140359878540039, "learning_rate": 1e-05, "loss": 0.0238, "step": 427200 }, { "epoch": 0.004273, "grad_norm": 0.2033415287733078, "learning_rate": 1e-05, "loss": 0.0235, "step": 427300 }, { "epoch": 0.004274, "grad_norm": 0.20184923708438873, "learning_rate": 1e-05, "loss": 0.023, "step": 427400 }, { "epoch": 0.004275, "grad_norm": 0.2361277937889099, "learning_rate": 1e-05, "loss": 0.0237, "step": 427500 }, { "epoch": 0.004276, "grad_norm": 0.2430427074432373, "learning_rate": 1e-05, "loss": 0.024, "step": 427600 }, { "epoch": 0.004277, "grad_norm": 0.23499882221221924, "learning_rate": 1e-05, "loss": 0.0234, "step": 427700 }, { "epoch": 0.004278, "grad_norm": 0.23380815982818604, "learning_rate": 1e-05, "loss": 0.0239, "step": 427800 }, { "epoch": 0.004279, "grad_norm": 0.2226102650165558, "learning_rate": 1e-05, "loss": 0.0239, "step": 427900 }, { "epoch": 0.00428, "grad_norm": 0.17551952600479126, "learning_rate": 1e-05, "loss": 0.0233, "step": 428000 }, { "epoch": 0.004281, "grad_norm": 0.1724383533000946, "learning_rate": 1e-05, "loss": 0.023, "step": 428100 }, { "epoch": 0.004282, "grad_norm": 0.2332030087709427, "learning_rate": 1e-05, "loss": 0.0235, "step": 428200 }, { "epoch": 0.004283, "grad_norm": 0.19680607318878174, "learning_rate": 1e-05, "loss": 0.0236, "step": 428300 }, { "epoch": 0.004284, "grad_norm": 0.2692990005016327, "learning_rate": 1e-05, "loss": 0.0235, "step": 428400 }, { "epoch": 0.004285, "grad_norm": 0.25071054697036743, "learning_rate": 1e-05, "loss": 0.0234, "step": 428500 }, { "epoch": 0.004286, "grad_norm": 0.18852300941944122, "learning_rate": 1e-05, "loss": 0.0235, "step": 428600 }, { "epoch": 0.004287, "grad_norm": 0.22067157924175262, "learning_rate": 1e-05, "loss": 0.0238, "step": 428700 }, { "epoch": 0.004288, "grad_norm": 0.16732381284236908, "learning_rate": 1e-05, "loss": 0.0236, "step": 428800 }, { "epoch": 0.004289, "grad_norm": 0.17053505778312683, "learning_rate": 1e-05, "loss": 0.0234, "step": 428900 }, { "epoch": 0.00429, "grad_norm": 0.20293216407299042, "learning_rate": 1e-05, "loss": 0.0234, "step": 429000 }, { "epoch": 0.004291, "grad_norm": 0.27229127287864685, "learning_rate": 1e-05, "loss": 0.0234, "step": 429100 }, { "epoch": 0.004292, "grad_norm": 0.20203547179698944, "learning_rate": 1e-05, "loss": 0.024, "step": 429200 }, { "epoch": 0.004293, "grad_norm": 0.21692636609077454, "learning_rate": 1e-05, "loss": 0.0234, "step": 429300 }, { "epoch": 0.004294, "grad_norm": 0.19434820115566254, "learning_rate": 1e-05, "loss": 0.0233, "step": 429400 }, { "epoch": 0.004295, "grad_norm": 0.183962881565094, "learning_rate": 1e-05, "loss": 0.0237, "step": 429500 }, { "epoch": 0.004296, "grad_norm": 0.23288263380527496, "learning_rate": 1e-05, "loss": 0.0235, "step": 429600 }, { "epoch": 0.004297, "grad_norm": 0.204896941781044, "learning_rate": 1e-05, "loss": 0.0238, "step": 429700 }, { "epoch": 0.004298, "grad_norm": 0.23417586088180542, "learning_rate": 1e-05, "loss": 0.0235, "step": 429800 }, { "epoch": 0.004299, "grad_norm": 0.22970955073833466, "learning_rate": 1e-05, "loss": 0.0239, "step": 429900 }, { "epoch": 0.0043, "grad_norm": 0.1792362481355667, "learning_rate": 1e-05, "loss": 0.024, "step": 430000 }, { "epoch": 0.004301, "grad_norm": 0.16857630014419556, "learning_rate": 1e-05, "loss": 0.0233, "step": 430100 }, { "epoch": 0.004302, "grad_norm": 0.1974494308233261, "learning_rate": 1e-05, "loss": 0.0236, "step": 430200 }, { "epoch": 0.004303, "grad_norm": 0.23761752247810364, "learning_rate": 1e-05, "loss": 0.0235, "step": 430300 }, { "epoch": 0.004304, "grad_norm": 0.1752641350030899, "learning_rate": 1e-05, "loss": 0.0241, "step": 430400 }, { "epoch": 0.004305, "grad_norm": 0.19682177901268005, "learning_rate": 1e-05, "loss": 0.0234, "step": 430500 }, { "epoch": 0.004306, "grad_norm": 0.21573105454444885, "learning_rate": 1e-05, "loss": 0.0237, "step": 430600 }, { "epoch": 0.004307, "grad_norm": 0.2569282054901123, "learning_rate": 1e-05, "loss": 0.0232, "step": 430700 }, { "epoch": 0.004308, "grad_norm": 0.19441097974777222, "learning_rate": 1e-05, "loss": 0.0241, "step": 430800 }, { "epoch": 0.004309, "grad_norm": 0.17740872502326965, "learning_rate": 1e-05, "loss": 0.0242, "step": 430900 }, { "epoch": 0.00431, "grad_norm": 0.25713902711868286, "learning_rate": 1e-05, "loss": 0.0242, "step": 431000 }, { "epoch": 0.004311, "grad_norm": 0.16214464604854584, "learning_rate": 1e-05, "loss": 0.0236, "step": 431100 }, { "epoch": 0.004312, "grad_norm": 0.19039447605609894, "learning_rate": 1e-05, "loss": 0.0238, "step": 431200 }, { "epoch": 0.004313, "grad_norm": 0.21667931973934174, "learning_rate": 1e-05, "loss": 0.0239, "step": 431300 }, { "epoch": 0.004314, "grad_norm": 0.21574436128139496, "learning_rate": 1e-05, "loss": 0.0234, "step": 431400 }, { "epoch": 0.004315, "grad_norm": 0.18191376328468323, "learning_rate": 1e-05, "loss": 0.0237, "step": 431500 }, { "epoch": 0.004316, "grad_norm": 0.21535120904445648, "learning_rate": 1e-05, "loss": 0.0232, "step": 431600 }, { "epoch": 0.004317, "grad_norm": 0.3786921203136444, "learning_rate": 1e-05, "loss": 0.0237, "step": 431700 }, { "epoch": 0.004318, "grad_norm": 0.17714186012744904, "learning_rate": 1e-05, "loss": 0.0234, "step": 431800 }, { "epoch": 0.004319, "grad_norm": 0.16206806898117065, "learning_rate": 1e-05, "loss": 0.0233, "step": 431900 }, { "epoch": 0.00432, "grad_norm": 0.1697683483362198, "learning_rate": 1e-05, "loss": 0.0233, "step": 432000 }, { "epoch": 0.004321, "grad_norm": 0.3348475396633148, "learning_rate": 1e-05, "loss": 0.0233, "step": 432100 }, { "epoch": 0.004322, "grad_norm": 0.11804835498332977, "learning_rate": 1e-05, "loss": 0.0231, "step": 432200 }, { "epoch": 0.004323, "grad_norm": 0.283023864030838, "learning_rate": 1e-05, "loss": 0.0229, "step": 432300 }, { "epoch": 0.004324, "grad_norm": 0.25664088129997253, "learning_rate": 1e-05, "loss": 0.0233, "step": 432400 }, { "epoch": 0.004325, "grad_norm": 0.169874906539917, "learning_rate": 1e-05, "loss": 0.0232, "step": 432500 }, { "epoch": 0.004326, "grad_norm": 0.17657825350761414, "learning_rate": 1e-05, "loss": 0.0235, "step": 432600 }, { "epoch": 0.004327, "grad_norm": 0.17877300083637238, "learning_rate": 1e-05, "loss": 0.0236, "step": 432700 }, { "epoch": 0.004328, "grad_norm": 0.14666680991649628, "learning_rate": 1e-05, "loss": 0.0233, "step": 432800 }, { "epoch": 0.004329, "grad_norm": 0.20054639875888824, "learning_rate": 1e-05, "loss": 0.0233, "step": 432900 }, { "epoch": 0.00433, "grad_norm": 0.2674124836921692, "learning_rate": 1e-05, "loss": 0.0237, "step": 433000 }, { "epoch": 0.004331, "grad_norm": 0.24184325337409973, "learning_rate": 1e-05, "loss": 0.0236, "step": 433100 }, { "epoch": 0.004332, "grad_norm": 0.1694374829530716, "learning_rate": 1e-05, "loss": 0.0234, "step": 433200 }, { "epoch": 0.004333, "grad_norm": 0.19181030988693237, "learning_rate": 1e-05, "loss": 0.0236, "step": 433300 }, { "epoch": 0.004334, "grad_norm": 0.19022874534130096, "learning_rate": 1e-05, "loss": 0.0236, "step": 433400 }, { "epoch": 0.004335, "grad_norm": 0.20790456235408783, "learning_rate": 1e-05, "loss": 0.0227, "step": 433500 }, { "epoch": 0.004336, "grad_norm": 0.18109750747680664, "learning_rate": 1e-05, "loss": 0.0229, "step": 433600 }, { "epoch": 0.004337, "grad_norm": 0.13456806540489197, "learning_rate": 1e-05, "loss": 0.0233, "step": 433700 }, { "epoch": 0.004338, "grad_norm": 0.19162620604038239, "learning_rate": 1e-05, "loss": 0.0234, "step": 433800 }, { "epoch": 0.004339, "grad_norm": 0.2225160002708435, "learning_rate": 1e-05, "loss": 0.0242, "step": 433900 }, { "epoch": 0.00434, "grad_norm": 0.15667806565761566, "learning_rate": 1e-05, "loss": 0.0229, "step": 434000 }, { "epoch": 0.004341, "grad_norm": 0.1464381068944931, "learning_rate": 1e-05, "loss": 0.0235, "step": 434100 }, { "epoch": 0.004342, "grad_norm": 0.1858130544424057, "learning_rate": 1e-05, "loss": 0.0235, "step": 434200 }, { "epoch": 0.004343, "grad_norm": 0.22693192958831787, "learning_rate": 1e-05, "loss": 0.0235, "step": 434300 }, { "epoch": 0.004344, "grad_norm": 0.16937954723834991, "learning_rate": 1e-05, "loss": 0.0237, "step": 434400 }, { "epoch": 0.004345, "grad_norm": 0.1958421915769577, "learning_rate": 1e-05, "loss": 0.0238, "step": 434500 }, { "epoch": 0.004346, "grad_norm": 0.21297836303710938, "learning_rate": 1e-05, "loss": 0.0231, "step": 434600 }, { "epoch": 0.004347, "grad_norm": 0.21221713721752167, "learning_rate": 1e-05, "loss": 0.0234, "step": 434700 }, { "epoch": 0.004348, "grad_norm": 0.1933383196592331, "learning_rate": 1e-05, "loss": 0.023, "step": 434800 }, { "epoch": 0.004349, "grad_norm": 0.20156291127204895, "learning_rate": 1e-05, "loss": 0.0234, "step": 434900 }, { "epoch": 0.00435, "grad_norm": 0.17280252277851105, "learning_rate": 1e-05, "loss": 0.0232, "step": 435000 }, { "epoch": 0.004351, "grad_norm": 0.2166159600019455, "learning_rate": 1e-05, "loss": 0.0237, "step": 435100 }, { "epoch": 0.004352, "grad_norm": 0.16804145276546478, "learning_rate": 1e-05, "loss": 0.0234, "step": 435200 }, { "epoch": 0.004353, "grad_norm": 0.2535684108734131, "learning_rate": 1e-05, "loss": 0.0232, "step": 435300 }, { "epoch": 0.004354, "grad_norm": 0.22681118547916412, "learning_rate": 1e-05, "loss": 0.0232, "step": 435400 }, { "epoch": 0.004355, "grad_norm": 0.16846774518489838, "learning_rate": 1e-05, "loss": 0.0236, "step": 435500 }, { "epoch": 0.004356, "grad_norm": 0.175014466047287, "learning_rate": 1e-05, "loss": 0.0235, "step": 435600 }, { "epoch": 0.004357, "grad_norm": 0.2292056530714035, "learning_rate": 1e-05, "loss": 0.0235, "step": 435700 }, { "epoch": 0.004358, "grad_norm": 0.2194889634847641, "learning_rate": 1e-05, "loss": 0.0236, "step": 435800 }, { "epoch": 0.004359, "grad_norm": 0.19981488585472107, "learning_rate": 1e-05, "loss": 0.0235, "step": 435900 }, { "epoch": 0.00436, "grad_norm": 0.18299952149391174, "learning_rate": 1e-05, "loss": 0.0234, "step": 436000 }, { "epoch": 0.004361, "grad_norm": 0.14954215288162231, "learning_rate": 1e-05, "loss": 0.0237, "step": 436100 }, { "epoch": 0.004362, "grad_norm": 0.1807412952184677, "learning_rate": 1e-05, "loss": 0.0231, "step": 436200 }, { "epoch": 0.004363, "grad_norm": 0.16171984374523163, "learning_rate": 1e-05, "loss": 0.0238, "step": 436300 }, { "epoch": 0.004364, "grad_norm": 0.20767316222190857, "learning_rate": 1e-05, "loss": 0.0232, "step": 436400 }, { "epoch": 0.004365, "grad_norm": 0.16481776535511017, "learning_rate": 1e-05, "loss": 0.0231, "step": 436500 }, { "epoch": 0.004366, "grad_norm": 0.19073954224586487, "learning_rate": 1e-05, "loss": 0.0231, "step": 436600 }, { "epoch": 0.004367, "grad_norm": 0.18178704380989075, "learning_rate": 1e-05, "loss": 0.0238, "step": 436700 }, { "epoch": 0.004368, "grad_norm": 0.17812800407409668, "learning_rate": 1e-05, "loss": 0.0228, "step": 436800 }, { "epoch": 0.004369, "grad_norm": 0.2609589993953705, "learning_rate": 1e-05, "loss": 0.023, "step": 436900 }, { "epoch": 0.00437, "grad_norm": 0.2566321790218353, "learning_rate": 1e-05, "loss": 0.0238, "step": 437000 }, { "epoch": 0.004371, "grad_norm": 0.2780173122882843, "learning_rate": 1e-05, "loss": 0.0228, "step": 437100 }, { "epoch": 0.004372, "grad_norm": 0.2605515718460083, "learning_rate": 1e-05, "loss": 0.0234, "step": 437200 }, { "epoch": 0.004373, "grad_norm": 0.21113985776901245, "learning_rate": 1e-05, "loss": 0.0235, "step": 437300 }, { "epoch": 0.004374, "grad_norm": 0.16012415289878845, "learning_rate": 1e-05, "loss": 0.0235, "step": 437400 }, { "epoch": 0.004375, "grad_norm": 0.17792946100234985, "learning_rate": 1e-05, "loss": 0.0234, "step": 437500 }, { "epoch": 0.004376, "grad_norm": 0.21906466782093048, "learning_rate": 1e-05, "loss": 0.0236, "step": 437600 }, { "epoch": 0.004377, "grad_norm": 0.22951853275299072, "learning_rate": 1e-05, "loss": 0.0232, "step": 437700 }, { "epoch": 0.004378, "grad_norm": 0.2302633374929428, "learning_rate": 1e-05, "loss": 0.0231, "step": 437800 }, { "epoch": 0.004379, "grad_norm": 0.22421370446681976, "learning_rate": 1e-05, "loss": 0.0236, "step": 437900 }, { "epoch": 0.00438, "grad_norm": 0.19783838093280792, "learning_rate": 1e-05, "loss": 0.0233, "step": 438000 }, { "epoch": 0.004381, "grad_norm": 0.16764023900032043, "learning_rate": 1e-05, "loss": 0.0237, "step": 438100 }, { "epoch": 0.004382, "grad_norm": 0.23772922158241272, "learning_rate": 1e-05, "loss": 0.0231, "step": 438200 }, { "epoch": 0.004383, "grad_norm": 0.19395196437835693, "learning_rate": 1e-05, "loss": 0.0237, "step": 438300 }, { "epoch": 0.004384, "grad_norm": 0.21181148290634155, "learning_rate": 1e-05, "loss": 0.0226, "step": 438400 }, { "epoch": 0.004385, "grad_norm": 0.16714906692504883, "learning_rate": 1e-05, "loss": 0.0236, "step": 438500 }, { "epoch": 0.004386, "grad_norm": 0.17532610893249512, "learning_rate": 1e-05, "loss": 0.023, "step": 438600 }, { "epoch": 0.004387, "grad_norm": 0.1704893261194229, "learning_rate": 1e-05, "loss": 0.0233, "step": 438700 }, { "epoch": 0.004388, "grad_norm": 0.2081296741962433, "learning_rate": 1e-05, "loss": 0.0237, "step": 438800 }, { "epoch": 0.004389, "grad_norm": 0.1690738946199417, "learning_rate": 1e-05, "loss": 0.023, "step": 438900 }, { "epoch": 0.00439, "grad_norm": 0.19499215483665466, "learning_rate": 1e-05, "loss": 0.023, "step": 439000 }, { "epoch": 0.004391, "grad_norm": 0.24602189660072327, "learning_rate": 1e-05, "loss": 0.0234, "step": 439100 }, { "epoch": 0.004392, "grad_norm": 0.18142880499362946, "learning_rate": 1e-05, "loss": 0.0229, "step": 439200 }, { "epoch": 0.004393, "grad_norm": 0.1993546336889267, "learning_rate": 1e-05, "loss": 0.0235, "step": 439300 }, { "epoch": 0.004394, "grad_norm": 0.267570436000824, "learning_rate": 1e-05, "loss": 0.0229, "step": 439400 }, { "epoch": 0.004395, "grad_norm": 0.2170959860086441, "learning_rate": 1e-05, "loss": 0.0231, "step": 439500 }, { "epoch": 0.004396, "grad_norm": 0.18060381710529327, "learning_rate": 1e-05, "loss": 0.0233, "step": 439600 }, { "epoch": 0.004397, "grad_norm": 0.17942363023757935, "learning_rate": 1e-05, "loss": 0.024, "step": 439700 }, { "epoch": 0.004398, "grad_norm": 0.22946074604988098, "learning_rate": 1e-05, "loss": 0.0231, "step": 439800 }, { "epoch": 0.004399, "grad_norm": 0.24079564213752747, "learning_rate": 1e-05, "loss": 0.0236, "step": 439900 }, { "epoch": 0.0044, "grad_norm": 0.2132103592157364, "learning_rate": 1e-05, "loss": 0.0238, "step": 440000 }, { "epoch": 0.0044, "eval_loss": 0.020904023200273514, "eval_runtime": 180.6503, "eval_samples_per_second": 276.778, "eval_steps_per_second": 17.299, "step": 440000 }, { "epoch": 0.004401, "grad_norm": 0.23044341802597046, "learning_rate": 1e-05, "loss": 0.0231, "step": 440100 }, { "epoch": 0.004402, "grad_norm": 0.24813386797904968, "learning_rate": 1e-05, "loss": 0.0232, "step": 440200 }, { "epoch": 0.004403, "grad_norm": 0.3295495808124542, "learning_rate": 1e-05, "loss": 0.023, "step": 440300 }, { "epoch": 0.004404, "grad_norm": 0.2256755530834198, "learning_rate": 1e-05, "loss": 0.023, "step": 440400 }, { "epoch": 0.004405, "grad_norm": 0.18168839812278748, "learning_rate": 1e-05, "loss": 0.0237, "step": 440500 }, { "epoch": 0.004406, "grad_norm": 0.2610403299331665, "learning_rate": 1e-05, "loss": 0.0234, "step": 440600 }, { "epoch": 0.004407, "grad_norm": 0.18918107450008392, "learning_rate": 1e-05, "loss": 0.0232, "step": 440700 }, { "epoch": 0.004408, "grad_norm": 0.10917015373706818, "learning_rate": 1e-05, "loss": 0.0234, "step": 440800 }, { "epoch": 0.004409, "grad_norm": 0.24390384554862976, "learning_rate": 1e-05, "loss": 0.0231, "step": 440900 }, { "epoch": 0.00441, "grad_norm": 0.19046364724636078, "learning_rate": 1e-05, "loss": 0.0233, "step": 441000 }, { "epoch": 0.004411, "grad_norm": 0.2199350744485855, "learning_rate": 1e-05, "loss": 0.0232, "step": 441100 }, { "epoch": 0.004412, "grad_norm": 0.27372199296951294, "learning_rate": 1e-05, "loss": 0.0237, "step": 441200 }, { "epoch": 0.004413, "grad_norm": 0.18218539655208588, "learning_rate": 1e-05, "loss": 0.0233, "step": 441300 }, { "epoch": 0.004414, "grad_norm": 0.1716521978378296, "learning_rate": 1e-05, "loss": 0.0238, "step": 441400 }, { "epoch": 0.004415, "grad_norm": 0.15880508720874786, "learning_rate": 1e-05, "loss": 0.0233, "step": 441500 }, { "epoch": 0.004416, "grad_norm": 0.19665701687335968, "learning_rate": 1e-05, "loss": 0.0233, "step": 441600 }, { "epoch": 0.004417, "grad_norm": 0.18757113814353943, "learning_rate": 1e-05, "loss": 0.0232, "step": 441700 }, { "epoch": 0.004418, "grad_norm": 0.20283359289169312, "learning_rate": 1e-05, "loss": 0.023, "step": 441800 }, { "epoch": 0.004419, "grad_norm": 0.18775001168251038, "learning_rate": 1e-05, "loss": 0.024, "step": 441900 }, { "epoch": 0.00442, "grad_norm": 0.2023424506187439, "learning_rate": 1e-05, "loss": 0.0231, "step": 442000 }, { "epoch": 0.004421, "grad_norm": 0.24058109521865845, "learning_rate": 1e-05, "loss": 0.0232, "step": 442100 }, { "epoch": 0.004422, "grad_norm": 0.24987103044986725, "learning_rate": 1e-05, "loss": 0.023, "step": 442200 }, { "epoch": 0.004423, "grad_norm": 0.162273108959198, "learning_rate": 1e-05, "loss": 0.0237, "step": 442300 }, { "epoch": 0.004424, "grad_norm": 0.21134719252586365, "learning_rate": 1e-05, "loss": 0.0236, "step": 442400 }, { "epoch": 0.004425, "grad_norm": 0.2058124840259552, "learning_rate": 1e-05, "loss": 0.0229, "step": 442500 }, { "epoch": 0.004426, "grad_norm": 0.2522238492965698, "learning_rate": 1e-05, "loss": 0.0232, "step": 442600 }, { "epoch": 0.004427, "grad_norm": 0.5077947974205017, "learning_rate": 1e-05, "loss": 0.0233, "step": 442700 }, { "epoch": 0.004428, "grad_norm": 0.31110048294067383, "learning_rate": 1e-05, "loss": 0.0239, "step": 442800 }, { "epoch": 0.004429, "grad_norm": 0.18634116649627686, "learning_rate": 1e-05, "loss": 0.0234, "step": 442900 }, { "epoch": 0.00443, "grad_norm": 0.20208144187927246, "learning_rate": 1e-05, "loss": 0.0233, "step": 443000 }, { "epoch": 0.004431, "grad_norm": 0.1627720445394516, "learning_rate": 1e-05, "loss": 0.0231, "step": 443100 }, { "epoch": 0.004432, "grad_norm": 0.17021462321281433, "learning_rate": 1e-05, "loss": 0.0233, "step": 443200 }, { "epoch": 0.004433, "grad_norm": 0.24140915274620056, "learning_rate": 1e-05, "loss": 0.0234, "step": 443300 }, { "epoch": 0.004434, "grad_norm": 0.20896629989147186, "learning_rate": 1e-05, "loss": 0.0233, "step": 443400 }, { "epoch": 0.004435, "grad_norm": 0.23582100868225098, "learning_rate": 1e-05, "loss": 0.0231, "step": 443500 }, { "epoch": 0.004436, "grad_norm": 0.1991957128047943, "learning_rate": 1e-05, "loss": 0.0227, "step": 443600 }, { "epoch": 0.004437, "grad_norm": 0.2037353664636612, "learning_rate": 1e-05, "loss": 0.0231, "step": 443700 }, { "epoch": 0.004438, "grad_norm": 0.16649161279201508, "learning_rate": 1e-05, "loss": 0.0234, "step": 443800 }, { "epoch": 0.004439, "grad_norm": 0.14894193410873413, "learning_rate": 1e-05, "loss": 0.0234, "step": 443900 }, { "epoch": 0.00444, "grad_norm": 0.26870012283325195, "learning_rate": 1e-05, "loss": 0.0234, "step": 444000 }, { "epoch": 0.004441, "grad_norm": 0.198878675699234, "learning_rate": 1e-05, "loss": 0.0241, "step": 444100 }, { "epoch": 0.004442, "grad_norm": 0.17928649485111237, "learning_rate": 1e-05, "loss": 0.0234, "step": 444200 }, { "epoch": 0.004443, "grad_norm": 0.19134186208248138, "learning_rate": 1e-05, "loss": 0.0233, "step": 444300 }, { "epoch": 0.004444, "grad_norm": 0.19351203739643097, "learning_rate": 1e-05, "loss": 0.0235, "step": 444400 }, { "epoch": 0.004445, "grad_norm": 0.20061351358890533, "learning_rate": 1e-05, "loss": 0.024, "step": 444500 }, { "epoch": 0.004446, "grad_norm": 0.17882142961025238, "learning_rate": 1e-05, "loss": 0.0231, "step": 444600 }, { "epoch": 0.004447, "grad_norm": 0.3137259781360626, "learning_rate": 1e-05, "loss": 0.0234, "step": 444700 }, { "epoch": 0.004448, "grad_norm": 0.14606870710849762, "learning_rate": 1e-05, "loss": 0.0231, "step": 444800 }, { "epoch": 0.004449, "grad_norm": 0.1492360681295395, "learning_rate": 1e-05, "loss": 0.0238, "step": 444900 }, { "epoch": 0.00445, "grad_norm": 0.21933642029762268, "learning_rate": 1e-05, "loss": 0.0234, "step": 445000 }, { "epoch": 0.004451, "grad_norm": 0.20557262003421783, "learning_rate": 1e-05, "loss": 0.0227, "step": 445100 }, { "epoch": 0.004452, "grad_norm": 0.24514763057231903, "learning_rate": 1e-05, "loss": 0.0234, "step": 445200 }, { "epoch": 0.004453, "grad_norm": 0.2134215235710144, "learning_rate": 1e-05, "loss": 0.0227, "step": 445300 }, { "epoch": 0.004454, "grad_norm": 0.15273673832416534, "learning_rate": 1e-05, "loss": 0.0228, "step": 445400 }, { "epoch": 0.004455, "grad_norm": 0.18735869228839874, "learning_rate": 1e-05, "loss": 0.0226, "step": 445500 }, { "epoch": 0.004456, "grad_norm": 0.32106441259384155, "learning_rate": 1e-05, "loss": 0.0229, "step": 445600 }, { "epoch": 0.004457, "grad_norm": 0.170523002743721, "learning_rate": 1e-05, "loss": 0.0234, "step": 445700 }, { "epoch": 0.004458, "grad_norm": 0.26386621594429016, "learning_rate": 1e-05, "loss": 0.0233, "step": 445800 }, { "epoch": 0.004459, "grad_norm": 0.16597509384155273, "learning_rate": 1e-05, "loss": 0.0232, "step": 445900 }, { "epoch": 0.00446, "grad_norm": 0.18207789957523346, "learning_rate": 1e-05, "loss": 0.0227, "step": 446000 }, { "epoch": 0.004461, "grad_norm": 0.3322072923183441, "learning_rate": 1e-05, "loss": 0.0237, "step": 446100 }, { "epoch": 0.004462, "grad_norm": 0.14896336197853088, "learning_rate": 1e-05, "loss": 0.0228, "step": 446200 }, { "epoch": 0.004463, "grad_norm": 0.2081025093793869, "learning_rate": 1e-05, "loss": 0.0234, "step": 446300 }, { "epoch": 0.004464, "grad_norm": 0.21239423751831055, "learning_rate": 1e-05, "loss": 0.0232, "step": 446400 }, { "epoch": 0.004465, "grad_norm": 0.21632926166057587, "learning_rate": 1e-05, "loss": 0.023, "step": 446500 }, { "epoch": 0.004466, "grad_norm": 0.16434019804000854, "learning_rate": 1e-05, "loss": 0.0231, "step": 446600 }, { "epoch": 0.004467, "grad_norm": 0.24972814321517944, "learning_rate": 1e-05, "loss": 0.0233, "step": 446700 }, { "epoch": 0.004468, "grad_norm": 0.21741558611392975, "learning_rate": 1e-05, "loss": 0.023, "step": 446800 }, { "epoch": 0.004469, "grad_norm": 0.1760004609823227, "learning_rate": 1e-05, "loss": 0.0232, "step": 446900 }, { "epoch": 0.00447, "grad_norm": 0.21169045567512512, "learning_rate": 1e-05, "loss": 0.0228, "step": 447000 }, { "epoch": 0.004471, "grad_norm": 0.15478041768074036, "learning_rate": 1e-05, "loss": 0.0234, "step": 447100 }, { "epoch": 0.004472, "grad_norm": 0.20823664963245392, "learning_rate": 1e-05, "loss": 0.0232, "step": 447200 }, { "epoch": 0.004473, "grad_norm": 0.22482509911060333, "learning_rate": 1e-05, "loss": 0.0231, "step": 447300 }, { "epoch": 0.004474, "grad_norm": 0.1521388590335846, "learning_rate": 1e-05, "loss": 0.0229, "step": 447400 }, { "epoch": 0.004475, "grad_norm": 0.18999828398227692, "learning_rate": 1e-05, "loss": 0.0228, "step": 447500 }, { "epoch": 0.004476, "grad_norm": 0.24601712822914124, "learning_rate": 1e-05, "loss": 0.023, "step": 447600 }, { "epoch": 0.004477, "grad_norm": 0.17273949086666107, "learning_rate": 1e-05, "loss": 0.0235, "step": 447700 }, { "epoch": 0.004478, "grad_norm": 0.23145507276058197, "learning_rate": 1e-05, "loss": 0.023, "step": 447800 }, { "epoch": 0.004479, "grad_norm": 0.19401346147060394, "learning_rate": 1e-05, "loss": 0.0231, "step": 447900 }, { "epoch": 0.00448, "grad_norm": 0.2648124694824219, "learning_rate": 1e-05, "loss": 0.0228, "step": 448000 }, { "epoch": 0.004481, "grad_norm": 0.20703448355197906, "learning_rate": 1e-05, "loss": 0.0233, "step": 448100 }, { "epoch": 0.004482, "grad_norm": 0.18155379593372345, "learning_rate": 1e-05, "loss": 0.0232, "step": 448200 }, { "epoch": 0.004483, "grad_norm": 0.13702107965946198, "learning_rate": 1e-05, "loss": 0.0235, "step": 448300 }, { "epoch": 0.004484, "grad_norm": 0.19250327348709106, "learning_rate": 1e-05, "loss": 0.0235, "step": 448400 }, { "epoch": 0.004485, "grad_norm": 0.22122113406658173, "learning_rate": 1e-05, "loss": 0.0227, "step": 448500 }, { "epoch": 0.004486, "grad_norm": 0.2069425880908966, "learning_rate": 1e-05, "loss": 0.0232, "step": 448600 }, { "epoch": 0.004487, "grad_norm": 0.18438157439231873, "learning_rate": 1e-05, "loss": 0.0235, "step": 448700 }, { "epoch": 0.004488, "grad_norm": 0.2047736495733261, "learning_rate": 1e-05, "loss": 0.0231, "step": 448800 }, { "epoch": 0.004489, "grad_norm": 0.24017782509326935, "learning_rate": 1e-05, "loss": 0.0227, "step": 448900 }, { "epoch": 0.00449, "grad_norm": 0.23227883875370026, "learning_rate": 1e-05, "loss": 0.023, "step": 449000 }, { "epoch": 0.004491, "grad_norm": 0.28926998376846313, "learning_rate": 1e-05, "loss": 0.0232, "step": 449100 }, { "epoch": 0.004492, "grad_norm": 0.23910094797611237, "learning_rate": 1e-05, "loss": 0.0229, "step": 449200 }, { "epoch": 0.004493, "grad_norm": 0.3035074770450592, "learning_rate": 1e-05, "loss": 0.0229, "step": 449300 }, { "epoch": 0.004494, "grad_norm": 0.20144207775592804, "learning_rate": 1e-05, "loss": 0.0235, "step": 449400 }, { "epoch": 0.004495, "grad_norm": 0.18115662038326263, "learning_rate": 1e-05, "loss": 0.0229, "step": 449500 }, { "epoch": 0.004496, "grad_norm": 0.18513402342796326, "learning_rate": 1e-05, "loss": 0.0232, "step": 449600 }, { "epoch": 0.004497, "grad_norm": 0.20980636775493622, "learning_rate": 1e-05, "loss": 0.0233, "step": 449700 }, { "epoch": 0.004498, "grad_norm": 0.22925084829330444, "learning_rate": 1e-05, "loss": 0.0228, "step": 449800 }, { "epoch": 0.004499, "grad_norm": 0.20134945213794708, "learning_rate": 1e-05, "loss": 0.0233, "step": 449900 }, { "epoch": 0.0045, "grad_norm": 0.18172423541545868, "learning_rate": 1e-05, "loss": 0.0232, "step": 450000 }, { "epoch": 0.004501, "grad_norm": 0.17052100598812103, "learning_rate": 1e-05, "loss": 0.0229, "step": 450100 }, { "epoch": 0.004502, "grad_norm": 0.16894374787807465, "learning_rate": 1e-05, "loss": 0.0231, "step": 450200 }, { "epoch": 0.004503, "grad_norm": 0.20565585792064667, "learning_rate": 1e-05, "loss": 0.0232, "step": 450300 }, { "epoch": 0.004504, "grad_norm": 0.20105917751789093, "learning_rate": 1e-05, "loss": 0.0231, "step": 450400 }, { "epoch": 0.004505, "grad_norm": 0.15531715750694275, "learning_rate": 1e-05, "loss": 0.0232, "step": 450500 }, { "epoch": 0.004506, "grad_norm": 0.18183229863643646, "learning_rate": 1e-05, "loss": 0.0232, "step": 450600 }, { "epoch": 0.004507, "grad_norm": 0.25801900029182434, "learning_rate": 1e-05, "loss": 0.0227, "step": 450700 }, { "epoch": 0.004508, "grad_norm": 0.17992989718914032, "learning_rate": 1e-05, "loss": 0.0235, "step": 450800 }, { "epoch": 0.004509, "grad_norm": 0.23099379241466522, "learning_rate": 1e-05, "loss": 0.0231, "step": 450900 }, { "epoch": 0.00451, "grad_norm": 0.2047002613544464, "learning_rate": 1e-05, "loss": 0.0235, "step": 451000 }, { "epoch": 0.004511, "grad_norm": 0.1940610706806183, "learning_rate": 1e-05, "loss": 0.0228, "step": 451100 }, { "epoch": 0.004512, "grad_norm": 0.14289551973342896, "learning_rate": 1e-05, "loss": 0.0231, "step": 451200 }, { "epoch": 0.004513, "grad_norm": 0.16565260291099548, "learning_rate": 1e-05, "loss": 0.0232, "step": 451300 }, { "epoch": 0.004514, "grad_norm": 0.16273708641529083, "learning_rate": 1e-05, "loss": 0.0229, "step": 451400 }, { "epoch": 0.004515, "grad_norm": 0.18616001307964325, "learning_rate": 1e-05, "loss": 0.0229, "step": 451500 }, { "epoch": 0.004516, "grad_norm": 0.3159469664096832, "learning_rate": 1e-05, "loss": 0.0229, "step": 451600 }, { "epoch": 0.004517, "grad_norm": 0.17681588232517242, "learning_rate": 1e-05, "loss": 0.0229, "step": 451700 }, { "epoch": 0.004518, "grad_norm": 0.16114111244678497, "learning_rate": 1e-05, "loss": 0.0231, "step": 451800 }, { "epoch": 0.004519, "grad_norm": 0.21660533547401428, "learning_rate": 1e-05, "loss": 0.0233, "step": 451900 }, { "epoch": 0.00452, "grad_norm": 0.18272870779037476, "learning_rate": 1e-05, "loss": 0.0235, "step": 452000 }, { "epoch": 0.004521, "grad_norm": 0.23419484496116638, "learning_rate": 1e-05, "loss": 0.023, "step": 452100 }, { "epoch": 0.004522, "grad_norm": 0.20687219500541687, "learning_rate": 1e-05, "loss": 0.023, "step": 452200 }, { "epoch": 0.004523, "grad_norm": 0.15437601506710052, "learning_rate": 1e-05, "loss": 0.0231, "step": 452300 }, { "epoch": 0.004524, "grad_norm": 0.16934850811958313, "learning_rate": 1e-05, "loss": 0.0226, "step": 452400 }, { "epoch": 0.004525, "grad_norm": 0.22904974222183228, "learning_rate": 1e-05, "loss": 0.023, "step": 452500 }, { "epoch": 0.004526, "grad_norm": 0.1979958415031433, "learning_rate": 1e-05, "loss": 0.0232, "step": 452600 }, { "epoch": 0.004527, "grad_norm": 0.1708015352487564, "learning_rate": 1e-05, "loss": 0.0227, "step": 452700 }, { "epoch": 0.004528, "grad_norm": 0.17079056799411774, "learning_rate": 1e-05, "loss": 0.0228, "step": 452800 }, { "epoch": 0.004529, "grad_norm": 0.20083169639110565, "learning_rate": 1e-05, "loss": 0.0234, "step": 452900 }, { "epoch": 0.00453, "grad_norm": 0.18402160704135895, "learning_rate": 1e-05, "loss": 0.0229, "step": 453000 }, { "epoch": 0.004531, "grad_norm": 0.17755411565303802, "learning_rate": 1e-05, "loss": 0.0235, "step": 453100 }, { "epoch": 0.004532, "grad_norm": 0.18507404625415802, "learning_rate": 1e-05, "loss": 0.0231, "step": 453200 }, { "epoch": 0.004533, "grad_norm": 0.14741286635398865, "learning_rate": 1e-05, "loss": 0.0232, "step": 453300 }, { "epoch": 0.004534, "grad_norm": 0.14462244510650635, "learning_rate": 1e-05, "loss": 0.0232, "step": 453400 }, { "epoch": 0.004535, "grad_norm": 0.14444054663181305, "learning_rate": 1e-05, "loss": 0.023, "step": 453500 }, { "epoch": 0.004536, "grad_norm": 0.15837080776691437, "learning_rate": 1e-05, "loss": 0.0228, "step": 453600 }, { "epoch": 0.004537, "grad_norm": 0.21775290369987488, "learning_rate": 1e-05, "loss": 0.0233, "step": 453700 }, { "epoch": 0.004538, "grad_norm": 0.17842070758342743, "learning_rate": 1e-05, "loss": 0.0234, "step": 453800 }, { "epoch": 0.004539, "grad_norm": 0.19244998693466187, "learning_rate": 1e-05, "loss": 0.0225, "step": 453900 }, { "epoch": 0.00454, "grad_norm": 0.2087089717388153, "learning_rate": 1e-05, "loss": 0.0227, "step": 454000 }, { "epoch": 0.004541, "grad_norm": 0.1833614706993103, "learning_rate": 1e-05, "loss": 0.023, "step": 454100 }, { "epoch": 0.004542, "grad_norm": 0.15584298968315125, "learning_rate": 1e-05, "loss": 0.0224, "step": 454200 }, { "epoch": 0.004543, "grad_norm": 0.17314353585243225, "learning_rate": 1e-05, "loss": 0.0236, "step": 454300 }, { "epoch": 0.004544, "grad_norm": 0.16964980959892273, "learning_rate": 1e-05, "loss": 0.0229, "step": 454400 }, { "epoch": 0.004545, "grad_norm": 0.1869010031223297, "learning_rate": 1e-05, "loss": 0.0236, "step": 454500 }, { "epoch": 0.004546, "grad_norm": 0.17369498312473297, "learning_rate": 1e-05, "loss": 0.0232, "step": 454600 }, { "epoch": 0.004547, "grad_norm": 0.15386289358139038, "learning_rate": 1e-05, "loss": 0.0235, "step": 454700 }, { "epoch": 0.004548, "grad_norm": 0.2071363776922226, "learning_rate": 1e-05, "loss": 0.0231, "step": 454800 }, { "epoch": 0.004549, "grad_norm": 0.160829558968544, "learning_rate": 1e-05, "loss": 0.0227, "step": 454900 }, { "epoch": 0.00455, "grad_norm": 0.1610802710056305, "learning_rate": 1e-05, "loss": 0.0231, "step": 455000 }, { "epoch": 0.004551, "grad_norm": 0.16887809336185455, "learning_rate": 1e-05, "loss": 0.0225, "step": 455100 }, { "epoch": 0.004552, "grad_norm": 0.1909848153591156, "learning_rate": 1e-05, "loss": 0.0235, "step": 455200 }, { "epoch": 0.004553, "grad_norm": 0.19938939809799194, "learning_rate": 1e-05, "loss": 0.0232, "step": 455300 }, { "epoch": 0.004554, "grad_norm": 0.2050144523382187, "learning_rate": 1e-05, "loss": 0.0232, "step": 455400 }, { "epoch": 0.004555, "grad_norm": 0.1862177550792694, "learning_rate": 1e-05, "loss": 0.0234, "step": 455500 }, { "epoch": 0.004556, "grad_norm": 0.19525231420993805, "learning_rate": 1e-05, "loss": 0.0228, "step": 455600 }, { "epoch": 0.004557, "grad_norm": 0.16179819405078888, "learning_rate": 1e-05, "loss": 0.0227, "step": 455700 }, { "epoch": 0.004558, "grad_norm": 0.19049790501594543, "learning_rate": 1e-05, "loss": 0.0224, "step": 455800 }, { "epoch": 0.004559, "grad_norm": 0.16750475764274597, "learning_rate": 1e-05, "loss": 0.0233, "step": 455900 }, { "epoch": 0.00456, "grad_norm": 0.25337398052215576, "learning_rate": 1e-05, "loss": 0.0233, "step": 456000 }, { "epoch": 0.004561, "grad_norm": 0.25638455152511597, "learning_rate": 1e-05, "loss": 0.0232, "step": 456100 }, { "epoch": 0.004562, "grad_norm": 0.21213509142398834, "learning_rate": 1e-05, "loss": 0.0229, "step": 456200 }, { "epoch": 0.004563, "grad_norm": 0.18961049616336823, "learning_rate": 1e-05, "loss": 0.0231, "step": 456300 }, { "epoch": 0.004564, "grad_norm": 0.1659918874502182, "learning_rate": 1e-05, "loss": 0.0225, "step": 456400 }, { "epoch": 0.004565, "grad_norm": 0.1820361167192459, "learning_rate": 1e-05, "loss": 0.0232, "step": 456500 }, { "epoch": 0.004566, "grad_norm": 0.20252996683120728, "learning_rate": 1e-05, "loss": 0.0232, "step": 456600 }, { "epoch": 0.004567, "grad_norm": 0.13261383771896362, "learning_rate": 1e-05, "loss": 0.0227, "step": 456700 }, { "epoch": 0.004568, "grad_norm": 0.1791812628507614, "learning_rate": 1e-05, "loss": 0.0231, "step": 456800 }, { "epoch": 0.004569, "grad_norm": 0.14500634372234344, "learning_rate": 1e-05, "loss": 0.0233, "step": 456900 }, { "epoch": 0.00457, "grad_norm": 0.18092700839042664, "learning_rate": 1e-05, "loss": 0.0226, "step": 457000 }, { "epoch": 0.004571, "grad_norm": 0.2156936377286911, "learning_rate": 1e-05, "loss": 0.0227, "step": 457100 }, { "epoch": 0.004572, "grad_norm": 0.16848526895046234, "learning_rate": 1e-05, "loss": 0.0225, "step": 457200 }, { "epoch": 0.004573, "grad_norm": 0.15912885963916779, "learning_rate": 1e-05, "loss": 0.023, "step": 457300 }, { "epoch": 0.004574, "grad_norm": 0.19193251430988312, "learning_rate": 1e-05, "loss": 0.0232, "step": 457400 }, { "epoch": 0.004575, "grad_norm": 0.16344474256038666, "learning_rate": 1e-05, "loss": 0.0233, "step": 457500 }, { "epoch": 0.004576, "grad_norm": 0.2121238261461258, "learning_rate": 1e-05, "loss": 0.0231, "step": 457600 }, { "epoch": 0.004577, "grad_norm": 0.18239593505859375, "learning_rate": 1e-05, "loss": 0.0228, "step": 457700 }, { "epoch": 0.004578, "grad_norm": 0.3322307765483856, "learning_rate": 1e-05, "loss": 0.0229, "step": 457800 }, { "epoch": 0.004579, "grad_norm": 0.19922569394111633, "learning_rate": 1e-05, "loss": 0.0229, "step": 457900 }, { "epoch": 0.00458, "grad_norm": 0.1699572652578354, "learning_rate": 1e-05, "loss": 0.0233, "step": 458000 }, { "epoch": 0.004581, "grad_norm": 0.24469798803329468, "learning_rate": 1e-05, "loss": 0.0237, "step": 458100 }, { "epoch": 0.004582, "grad_norm": 0.18910102546215057, "learning_rate": 1e-05, "loss": 0.0231, "step": 458200 }, { "epoch": 0.004583, "grad_norm": 0.16399964690208435, "learning_rate": 1e-05, "loss": 0.0223, "step": 458300 }, { "epoch": 0.004584, "grad_norm": 0.1810591220855713, "learning_rate": 1e-05, "loss": 0.0227, "step": 458400 }, { "epoch": 0.004585, "grad_norm": 0.16085894405841827, "learning_rate": 1e-05, "loss": 0.0231, "step": 458500 }, { "epoch": 0.004586, "grad_norm": 0.13465441763401031, "learning_rate": 1e-05, "loss": 0.0233, "step": 458600 }, { "epoch": 0.004587, "grad_norm": 0.21779310703277588, "learning_rate": 1e-05, "loss": 0.0229, "step": 458700 }, { "epoch": 0.004588, "grad_norm": 0.1727190613746643, "learning_rate": 1e-05, "loss": 0.0227, "step": 458800 }, { "epoch": 0.004589, "grad_norm": 0.2394542247056961, "learning_rate": 1e-05, "loss": 0.0227, "step": 458900 }, { "epoch": 0.00459, "grad_norm": 0.23303960263729095, "learning_rate": 1e-05, "loss": 0.0226, "step": 459000 }, { "epoch": 0.004591, "grad_norm": 0.1708889603614807, "learning_rate": 1e-05, "loss": 0.0226, "step": 459100 }, { "epoch": 0.004592, "grad_norm": 0.19896560907363892, "learning_rate": 1e-05, "loss": 0.0228, "step": 459200 }, { "epoch": 0.004593, "grad_norm": 0.20611436665058136, "learning_rate": 1e-05, "loss": 0.0224, "step": 459300 }, { "epoch": 0.004594, "grad_norm": 0.16436883807182312, "learning_rate": 1e-05, "loss": 0.0233, "step": 459400 }, { "epoch": 0.004595, "grad_norm": 0.22044388949871063, "learning_rate": 1e-05, "loss": 0.0229, "step": 459500 }, { "epoch": 0.004596, "grad_norm": 0.19866138696670532, "learning_rate": 1e-05, "loss": 0.0225, "step": 459600 }, { "epoch": 0.004597, "grad_norm": 0.1547439694404602, "learning_rate": 1e-05, "loss": 0.0227, "step": 459700 }, { "epoch": 0.004598, "grad_norm": 0.18736763298511505, "learning_rate": 1e-05, "loss": 0.0237, "step": 459800 }, { "epoch": 0.004599, "grad_norm": 0.21086405217647552, "learning_rate": 1e-05, "loss": 0.0231, "step": 459900 }, { "epoch": 0.0046, "grad_norm": 0.15123756229877472, "learning_rate": 1e-05, "loss": 0.0235, "step": 460000 }, { "epoch": 0.0046, "eval_loss": 0.020319581031799316, "eval_runtime": 192.8338, "eval_samples_per_second": 259.291, "eval_steps_per_second": 16.206, "step": 460000 }, { "epoch": 0.004601, "grad_norm": 0.16817186772823334, "learning_rate": 1e-05, "loss": 0.0228, "step": 460100 }, { "epoch": 0.004602, "grad_norm": 0.2750777304172516, "learning_rate": 1e-05, "loss": 0.0229, "step": 460200 }, { "epoch": 0.004603, "grad_norm": 0.2069888561964035, "learning_rate": 1e-05, "loss": 0.0227, "step": 460300 }, { "epoch": 0.004604, "grad_norm": 0.2206188589334488, "learning_rate": 1e-05, "loss": 0.0229, "step": 460400 }, { "epoch": 0.004605, "grad_norm": 0.22614310681819916, "learning_rate": 1e-05, "loss": 0.0229, "step": 460500 }, { "epoch": 0.004606, "grad_norm": 0.18772999942302704, "learning_rate": 1e-05, "loss": 0.0233, "step": 460600 }, { "epoch": 0.004607, "grad_norm": 0.1715085804462433, "learning_rate": 1e-05, "loss": 0.0227, "step": 460700 }, { "epoch": 0.004608, "grad_norm": 0.1953740417957306, "learning_rate": 1e-05, "loss": 0.023, "step": 460800 }, { "epoch": 0.004609, "grad_norm": 0.18464332818984985, "learning_rate": 1e-05, "loss": 0.0227, "step": 460900 }, { "epoch": 0.00461, "grad_norm": 0.20896369218826294, "learning_rate": 1e-05, "loss": 0.0227, "step": 461000 }, { "epoch": 0.004611, "grad_norm": 0.22138327360153198, "learning_rate": 1e-05, "loss": 0.0229, "step": 461100 }, { "epoch": 0.004612, "grad_norm": 0.16991890966892242, "learning_rate": 1e-05, "loss": 0.023, "step": 461200 }, { "epoch": 0.004613, "grad_norm": 0.17407870292663574, "learning_rate": 1e-05, "loss": 0.0232, "step": 461300 }, { "epoch": 0.004614, "grad_norm": 0.21315792202949524, "learning_rate": 1e-05, "loss": 0.0232, "step": 461400 }, { "epoch": 0.004615, "grad_norm": 0.25159910321235657, "learning_rate": 1e-05, "loss": 0.0226, "step": 461500 }, { "epoch": 0.004616, "grad_norm": 0.18286053836345673, "learning_rate": 1e-05, "loss": 0.0225, "step": 461600 }, { "epoch": 0.004617, "grad_norm": 0.172820046544075, "learning_rate": 1e-05, "loss": 0.0233, "step": 461700 }, { "epoch": 0.004618, "grad_norm": 0.195301353931427, "learning_rate": 1e-05, "loss": 0.0226, "step": 461800 }, { "epoch": 0.004619, "grad_norm": 0.13693980872631073, "learning_rate": 1e-05, "loss": 0.0232, "step": 461900 }, { "epoch": 0.00462, "grad_norm": 0.19925057888031006, "learning_rate": 1e-05, "loss": 0.0227, "step": 462000 }, { "epoch": 0.004621, "grad_norm": 0.210720032453537, "learning_rate": 1e-05, "loss": 0.023, "step": 462100 }, { "epoch": 0.004622, "grad_norm": 0.16939611732959747, "learning_rate": 1e-05, "loss": 0.0228, "step": 462200 }, { "epoch": 0.004623, "grad_norm": 0.2567330598831177, "learning_rate": 1e-05, "loss": 0.023, "step": 462300 }, { "epoch": 0.004624, "grad_norm": 0.1622077077627182, "learning_rate": 1e-05, "loss": 0.0227, "step": 462400 }, { "epoch": 0.004625, "grad_norm": 0.22177843749523163, "learning_rate": 1e-05, "loss": 0.0228, "step": 462500 }, { "epoch": 0.004626, "grad_norm": 0.1724337935447693, "learning_rate": 1e-05, "loss": 0.0232, "step": 462600 }, { "epoch": 0.004627, "grad_norm": 0.17031732201576233, "learning_rate": 1e-05, "loss": 0.0237, "step": 462700 }, { "epoch": 0.004628, "grad_norm": 0.15635071694850922, "learning_rate": 1e-05, "loss": 0.0226, "step": 462800 }, { "epoch": 0.004629, "grad_norm": 0.1844763308763504, "learning_rate": 1e-05, "loss": 0.0229, "step": 462900 }, { "epoch": 0.00463, "grad_norm": 0.21699947118759155, "learning_rate": 1e-05, "loss": 0.0233, "step": 463000 }, { "epoch": 0.004631, "grad_norm": 0.17056338489055634, "learning_rate": 1e-05, "loss": 0.0229, "step": 463100 }, { "epoch": 0.004632, "grad_norm": 0.18890085816383362, "learning_rate": 1e-05, "loss": 0.0231, "step": 463200 }, { "epoch": 0.004633, "grad_norm": 0.17528116703033447, "learning_rate": 1e-05, "loss": 0.0229, "step": 463300 }, { "epoch": 0.004634, "grad_norm": 0.20424672961235046, "learning_rate": 1e-05, "loss": 0.0229, "step": 463400 }, { "epoch": 0.004635, "grad_norm": 0.17659494280815125, "learning_rate": 1e-05, "loss": 0.023, "step": 463500 }, { "epoch": 0.004636, "grad_norm": 0.18644417822360992, "learning_rate": 1e-05, "loss": 0.0226, "step": 463600 }, { "epoch": 0.004637, "grad_norm": 0.17174889147281647, "learning_rate": 1e-05, "loss": 0.023, "step": 463700 }, { "epoch": 0.004638, "grad_norm": 0.26294204592704773, "learning_rate": 1e-05, "loss": 0.0234, "step": 463800 }, { "epoch": 0.004639, "grad_norm": 0.18307162821292877, "learning_rate": 1e-05, "loss": 0.0229, "step": 463900 }, { "epoch": 0.00464, "grad_norm": 0.16525349020957947, "learning_rate": 1e-05, "loss": 0.0228, "step": 464000 }, { "epoch": 0.004641, "grad_norm": 0.20797957479953766, "learning_rate": 1e-05, "loss": 0.0221, "step": 464100 }, { "epoch": 0.004642, "grad_norm": 0.18369720876216888, "learning_rate": 1e-05, "loss": 0.0232, "step": 464200 }, { "epoch": 0.004643, "grad_norm": 0.2386540025472641, "learning_rate": 1e-05, "loss": 0.023, "step": 464300 }, { "epoch": 0.004644, "grad_norm": 0.17980162799358368, "learning_rate": 1e-05, "loss": 0.0233, "step": 464400 }, { "epoch": 0.004645, "grad_norm": 0.17866939306259155, "learning_rate": 1e-05, "loss": 0.0227, "step": 464500 }, { "epoch": 0.004646, "grad_norm": 0.14445056021213531, "learning_rate": 1e-05, "loss": 0.0231, "step": 464600 }, { "epoch": 0.004647, "grad_norm": 0.19206061959266663, "learning_rate": 1e-05, "loss": 0.0226, "step": 464700 }, { "epoch": 0.004648, "grad_norm": 0.1521332561969757, "learning_rate": 1e-05, "loss": 0.0222, "step": 464800 }, { "epoch": 0.004649, "grad_norm": 0.14740872383117676, "learning_rate": 1e-05, "loss": 0.0228, "step": 464900 }, { "epoch": 0.00465, "grad_norm": 0.21131455898284912, "learning_rate": 1e-05, "loss": 0.0227, "step": 465000 }, { "epoch": 0.004651, "grad_norm": 0.21261794865131378, "learning_rate": 1e-05, "loss": 0.0224, "step": 465100 }, { "epoch": 0.004652, "grad_norm": 0.2245393991470337, "learning_rate": 1e-05, "loss": 0.0229, "step": 465200 }, { "epoch": 0.004653, "grad_norm": 0.1859494298696518, "learning_rate": 1e-05, "loss": 0.0227, "step": 465300 }, { "epoch": 0.004654, "grad_norm": 0.19919073581695557, "learning_rate": 1e-05, "loss": 0.0232, "step": 465400 }, { "epoch": 0.004655, "grad_norm": 0.15790864825248718, "learning_rate": 1e-05, "loss": 0.0226, "step": 465500 }, { "epoch": 0.004656, "grad_norm": 0.1499752402305603, "learning_rate": 1e-05, "loss": 0.0229, "step": 465600 }, { "epoch": 0.004657, "grad_norm": 0.21955645084381104, "learning_rate": 1e-05, "loss": 0.0225, "step": 465700 }, { "epoch": 0.004658, "grad_norm": 0.202839195728302, "learning_rate": 1e-05, "loss": 0.0227, "step": 465800 }, { "epoch": 0.004659, "grad_norm": 0.19858711957931519, "learning_rate": 1e-05, "loss": 0.0233, "step": 465900 }, { "epoch": 0.00466, "grad_norm": 0.23552709817886353, "learning_rate": 1e-05, "loss": 0.0232, "step": 466000 }, { "epoch": 0.004661, "grad_norm": 0.21678686141967773, "learning_rate": 1e-05, "loss": 0.0231, "step": 466100 }, { "epoch": 0.004662, "grad_norm": 0.20443668961524963, "learning_rate": 1e-05, "loss": 0.0231, "step": 466200 }, { "epoch": 0.004663, "grad_norm": 0.22271481156349182, "learning_rate": 1e-05, "loss": 0.023, "step": 466300 }, { "epoch": 0.004664, "grad_norm": 0.15970873832702637, "learning_rate": 1e-05, "loss": 0.023, "step": 466400 }, { "epoch": 0.004665, "grad_norm": 0.26170989871025085, "learning_rate": 1e-05, "loss": 0.0232, "step": 466500 }, { "epoch": 0.004666, "grad_norm": 0.21848715841770172, "learning_rate": 1e-05, "loss": 0.0233, "step": 466600 }, { "epoch": 0.004667, "grad_norm": 0.1588781625032425, "learning_rate": 1e-05, "loss": 0.0229, "step": 466700 }, { "epoch": 0.004668, "grad_norm": 0.17810826003551483, "learning_rate": 1e-05, "loss": 0.0226, "step": 466800 }, { "epoch": 0.004669, "grad_norm": 0.25144585967063904, "learning_rate": 1e-05, "loss": 0.023, "step": 466900 }, { "epoch": 0.00467, "grad_norm": 0.19919374585151672, "learning_rate": 1e-05, "loss": 0.0231, "step": 467000 }, { "epoch": 0.004671, "grad_norm": 0.1947977989912033, "learning_rate": 1e-05, "loss": 0.0233, "step": 467100 }, { "epoch": 0.004672, "grad_norm": 0.22350119054317474, "learning_rate": 1e-05, "loss": 0.0229, "step": 467200 }, { "epoch": 0.004673, "grad_norm": 0.28167399764060974, "learning_rate": 1e-05, "loss": 0.0226, "step": 467300 }, { "epoch": 0.004674, "grad_norm": 0.19099701941013336, "learning_rate": 1e-05, "loss": 0.0232, "step": 467400 }, { "epoch": 0.004675, "grad_norm": 0.2532368302345276, "learning_rate": 1e-05, "loss": 0.0225, "step": 467500 }, { "epoch": 0.004676, "grad_norm": 0.20602421462535858, "learning_rate": 1e-05, "loss": 0.0228, "step": 467600 }, { "epoch": 0.004677, "grad_norm": 0.18218685686588287, "learning_rate": 1e-05, "loss": 0.0232, "step": 467700 }, { "epoch": 0.004678, "grad_norm": 0.22666633129119873, "learning_rate": 1e-05, "loss": 0.023, "step": 467800 }, { "epoch": 0.004679, "grad_norm": 0.19166722893714905, "learning_rate": 1e-05, "loss": 0.0222, "step": 467900 }, { "epoch": 0.00468, "grad_norm": 0.16342394053936005, "learning_rate": 1e-05, "loss": 0.0231, "step": 468000 }, { "epoch": 0.004681, "grad_norm": 0.1612776666879654, "learning_rate": 1e-05, "loss": 0.0232, "step": 468100 }, { "epoch": 0.004682, "grad_norm": 0.1556386500597, "learning_rate": 1e-05, "loss": 0.0228, "step": 468200 }, { "epoch": 0.004683, "grad_norm": 0.19825135171413422, "learning_rate": 1e-05, "loss": 0.0229, "step": 468300 }, { "epoch": 0.004684, "grad_norm": 0.2040782868862152, "learning_rate": 1e-05, "loss": 0.0224, "step": 468400 }, { "epoch": 0.004685, "grad_norm": 0.18644168972969055, "learning_rate": 1e-05, "loss": 0.0228, "step": 468500 }, { "epoch": 0.004686, "grad_norm": 0.2227136194705963, "learning_rate": 1e-05, "loss": 0.0229, "step": 468600 }, { "epoch": 0.004687, "grad_norm": 0.2368399202823639, "learning_rate": 1e-05, "loss": 0.0224, "step": 468700 }, { "epoch": 0.004688, "grad_norm": 0.17254802584648132, "learning_rate": 1e-05, "loss": 0.0228, "step": 468800 }, { "epoch": 0.004689, "grad_norm": 0.15059570968151093, "learning_rate": 1e-05, "loss": 0.0227, "step": 468900 }, { "epoch": 0.00469, "grad_norm": 0.23941633105278015, "learning_rate": 1e-05, "loss": 0.0224, "step": 469000 }, { "epoch": 0.004691, "grad_norm": 0.1751018464565277, "learning_rate": 1e-05, "loss": 0.0229, "step": 469100 }, { "epoch": 0.004692, "grad_norm": 0.17812445759773254, "learning_rate": 1e-05, "loss": 0.023, "step": 469200 }, { "epoch": 0.004693, "grad_norm": 0.2082817256450653, "learning_rate": 1e-05, "loss": 0.0228, "step": 469300 }, { "epoch": 0.004694, "grad_norm": 0.17749124765396118, "learning_rate": 1e-05, "loss": 0.0226, "step": 469400 }, { "epoch": 0.004695, "grad_norm": 0.26588693261146545, "learning_rate": 1e-05, "loss": 0.0231, "step": 469500 }, { "epoch": 0.004696, "grad_norm": 0.1590612679719925, "learning_rate": 1e-05, "loss": 0.0222, "step": 469600 }, { "epoch": 0.004697, "grad_norm": 0.16313524544239044, "learning_rate": 1e-05, "loss": 0.0226, "step": 469700 }, { "epoch": 0.004698, "grad_norm": 0.20323598384857178, "learning_rate": 1e-05, "loss": 0.0226, "step": 469800 }, { "epoch": 0.004699, "grad_norm": 0.1967225819826126, "learning_rate": 1e-05, "loss": 0.0232, "step": 469900 }, { "epoch": 0.0047, "grad_norm": 0.2295638918876648, "learning_rate": 1e-05, "loss": 0.023, "step": 470000 }, { "epoch": 0.004701, "grad_norm": 0.21652637422084808, "learning_rate": 1e-05, "loss": 0.0228, "step": 470100 }, { "epoch": 0.004702, "grad_norm": 0.18467332422733307, "learning_rate": 1e-05, "loss": 0.0224, "step": 470200 }, { "epoch": 0.004703, "grad_norm": 0.17680741846561432, "learning_rate": 1e-05, "loss": 0.0228, "step": 470300 }, { "epoch": 0.004704, "grad_norm": 0.19046202301979065, "learning_rate": 1e-05, "loss": 0.0225, "step": 470400 }, { "epoch": 0.004705, "grad_norm": 0.15697474777698517, "learning_rate": 1e-05, "loss": 0.0232, "step": 470500 }, { "epoch": 0.004706, "grad_norm": 0.18852907419204712, "learning_rate": 1e-05, "loss": 0.0228, "step": 470600 }, { "epoch": 0.004707, "grad_norm": 0.21793396770954132, "learning_rate": 1e-05, "loss": 0.0227, "step": 470700 }, { "epoch": 0.004708, "grad_norm": 0.220299631357193, "learning_rate": 1e-05, "loss": 0.0227, "step": 470800 }, { "epoch": 0.004709, "grad_norm": 0.2428463250398636, "learning_rate": 1e-05, "loss": 0.0223, "step": 470900 }, { "epoch": 0.00471, "grad_norm": 0.1908440738916397, "learning_rate": 1e-05, "loss": 0.0223, "step": 471000 }, { "epoch": 0.004711, "grad_norm": 0.22241054475307465, "learning_rate": 1e-05, "loss": 0.023, "step": 471100 }, { "epoch": 0.004712, "grad_norm": 0.22327132523059845, "learning_rate": 1e-05, "loss": 0.0231, "step": 471200 }, { "epoch": 0.004713, "grad_norm": 0.17235766351222992, "learning_rate": 1e-05, "loss": 0.0231, "step": 471300 }, { "epoch": 0.004714, "grad_norm": 0.16893352568149567, "learning_rate": 1e-05, "loss": 0.0233, "step": 471400 }, { "epoch": 0.004715, "grad_norm": 0.2238239049911499, "learning_rate": 1e-05, "loss": 0.0225, "step": 471500 }, { "epoch": 0.004716, "grad_norm": 0.15357500314712524, "learning_rate": 1e-05, "loss": 0.0226, "step": 471600 }, { "epoch": 0.004717, "grad_norm": 0.36644870042800903, "learning_rate": 1e-05, "loss": 0.0231, "step": 471700 }, { "epoch": 0.004718, "grad_norm": 0.21335767209529877, "learning_rate": 1e-05, "loss": 0.0228, "step": 471800 }, { "epoch": 0.004719, "grad_norm": 0.17705276608467102, "learning_rate": 1e-05, "loss": 0.0227, "step": 471900 }, { "epoch": 0.00472, "grad_norm": 0.16037040948867798, "learning_rate": 1e-05, "loss": 0.0225, "step": 472000 }, { "epoch": 0.004721, "grad_norm": 0.20449958741664886, "learning_rate": 1e-05, "loss": 0.0228, "step": 472100 }, { "epoch": 0.004722, "grad_norm": 0.17981202900409698, "learning_rate": 1e-05, "loss": 0.0228, "step": 472200 }, { "epoch": 0.004723, "grad_norm": 0.22864137589931488, "learning_rate": 1e-05, "loss": 0.0223, "step": 472300 }, { "epoch": 0.004724, "grad_norm": 0.2087819129228592, "learning_rate": 1e-05, "loss": 0.0224, "step": 472400 }, { "epoch": 0.004725, "grad_norm": 0.1650557816028595, "learning_rate": 1e-05, "loss": 0.0222, "step": 472500 }, { "epoch": 0.004726, "grad_norm": 0.1611696183681488, "learning_rate": 1e-05, "loss": 0.0226, "step": 472600 }, { "epoch": 0.004727, "grad_norm": 0.16798821091651917, "learning_rate": 1e-05, "loss": 0.0226, "step": 472700 }, { "epoch": 0.004728, "grad_norm": 0.2602728307247162, "learning_rate": 1e-05, "loss": 0.0229, "step": 472800 }, { "epoch": 0.004729, "grad_norm": 0.1646832972764969, "learning_rate": 1e-05, "loss": 0.0224, "step": 472900 }, { "epoch": 0.00473, "grad_norm": 0.16522882878780365, "learning_rate": 1e-05, "loss": 0.0226, "step": 473000 }, { "epoch": 0.004731, "grad_norm": 0.2600788474082947, "learning_rate": 1e-05, "loss": 0.0229, "step": 473100 }, { "epoch": 0.004732, "grad_norm": 0.20384302735328674, "learning_rate": 1e-05, "loss": 0.0229, "step": 473200 }, { "epoch": 0.004733, "grad_norm": 0.18546348810195923, "learning_rate": 1e-05, "loss": 0.0229, "step": 473300 }, { "epoch": 0.004734, "grad_norm": 0.20488139986991882, "learning_rate": 1e-05, "loss": 0.023, "step": 473400 }, { "epoch": 0.004735, "grad_norm": 0.171819806098938, "learning_rate": 1e-05, "loss": 0.0228, "step": 473500 }, { "epoch": 0.004736, "grad_norm": 0.13302037119865417, "learning_rate": 1e-05, "loss": 0.0225, "step": 473600 }, { "epoch": 0.004737, "grad_norm": 0.22026808559894562, "learning_rate": 1e-05, "loss": 0.0223, "step": 473700 }, { "epoch": 0.004738, "grad_norm": 0.2168174684047699, "learning_rate": 1e-05, "loss": 0.0224, "step": 473800 }, { "epoch": 0.004739, "grad_norm": 0.17939671874046326, "learning_rate": 1e-05, "loss": 0.023, "step": 473900 }, { "epoch": 0.00474, "grad_norm": 0.16538555920124054, "learning_rate": 1e-05, "loss": 0.0233, "step": 474000 }, { "epoch": 0.004741, "grad_norm": 0.18210729956626892, "learning_rate": 1e-05, "loss": 0.0227, "step": 474100 }, { "epoch": 0.004742, "grad_norm": 0.14740067720413208, "learning_rate": 1e-05, "loss": 0.0231, "step": 474200 }, { "epoch": 0.004743, "grad_norm": 0.19917045533657074, "learning_rate": 1e-05, "loss": 0.0226, "step": 474300 }, { "epoch": 0.004744, "grad_norm": 0.1822853684425354, "learning_rate": 1e-05, "loss": 0.0229, "step": 474400 }, { "epoch": 0.004745, "grad_norm": 0.28736069798469543, "learning_rate": 1e-05, "loss": 0.0225, "step": 474500 }, { "epoch": 0.004746, "grad_norm": 0.22132661938667297, "learning_rate": 1e-05, "loss": 0.0223, "step": 474600 }, { "epoch": 0.004747, "grad_norm": 0.17378349602222443, "learning_rate": 1e-05, "loss": 0.0227, "step": 474700 }, { "epoch": 0.004748, "grad_norm": 0.18301337957382202, "learning_rate": 1e-05, "loss": 0.0225, "step": 474800 }, { "epoch": 0.004749, "grad_norm": 0.19333066046237946, "learning_rate": 1e-05, "loss": 0.023, "step": 474900 }, { "epoch": 0.00475, "grad_norm": 0.12537643313407898, "learning_rate": 1e-05, "loss": 0.0225, "step": 475000 }, { "epoch": 0.004751, "grad_norm": 0.2377309799194336, "learning_rate": 1e-05, "loss": 0.0229, "step": 475100 }, { "epoch": 0.004752, "grad_norm": 0.17744170129299164, "learning_rate": 1e-05, "loss": 0.0229, "step": 475200 }, { "epoch": 0.004753, "grad_norm": 0.4057808816432953, "learning_rate": 1e-05, "loss": 0.0227, "step": 475300 }, { "epoch": 0.004754, "grad_norm": 0.18006765842437744, "learning_rate": 1e-05, "loss": 0.0223, "step": 475400 }, { "epoch": 0.004755, "grad_norm": 0.32279279828071594, "learning_rate": 1e-05, "loss": 0.0226, "step": 475500 }, { "epoch": 0.004756, "grad_norm": 0.22508996725082397, "learning_rate": 1e-05, "loss": 0.0227, "step": 475600 }, { "epoch": 0.004757, "grad_norm": 0.20366227626800537, "learning_rate": 1e-05, "loss": 0.0221, "step": 475700 }, { "epoch": 0.004758, "grad_norm": 0.22006303071975708, "learning_rate": 1e-05, "loss": 0.0222, "step": 475800 }, { "epoch": 0.004759, "grad_norm": 0.2136417031288147, "learning_rate": 1e-05, "loss": 0.0226, "step": 475900 }, { "epoch": 0.00476, "grad_norm": 0.20387357473373413, "learning_rate": 1e-05, "loss": 0.0225, "step": 476000 }, { "epoch": 0.004761, "grad_norm": 0.1369362324476242, "learning_rate": 1e-05, "loss": 0.0221, "step": 476100 }, { "epoch": 0.004762, "grad_norm": 0.18721480667591095, "learning_rate": 1e-05, "loss": 0.0226, "step": 476200 }, { "epoch": 0.004763, "grad_norm": 0.18347370624542236, "learning_rate": 1e-05, "loss": 0.0231, "step": 476300 }, { "epoch": 0.004764, "grad_norm": 0.19316598773002625, "learning_rate": 1e-05, "loss": 0.0225, "step": 476400 }, { "epoch": 0.004765, "grad_norm": 0.17184723913669586, "learning_rate": 1e-05, "loss": 0.0228, "step": 476500 }, { "epoch": 0.004766, "grad_norm": 0.17854614555835724, "learning_rate": 1e-05, "loss": 0.023, "step": 476600 }, { "epoch": 0.004767, "grad_norm": 0.20490407943725586, "learning_rate": 1e-05, "loss": 0.0231, "step": 476700 }, { "epoch": 0.004768, "grad_norm": 0.14648078382015228, "learning_rate": 1e-05, "loss": 0.0224, "step": 476800 }, { "epoch": 0.004769, "grad_norm": 0.22981663048267365, "learning_rate": 1e-05, "loss": 0.023, "step": 476900 }, { "epoch": 0.00477, "grad_norm": 0.19988170266151428, "learning_rate": 1e-05, "loss": 0.0226, "step": 477000 }, { "epoch": 0.004771, "grad_norm": 0.17057357728481293, "learning_rate": 1e-05, "loss": 0.0226, "step": 477100 }, { "epoch": 0.004772, "grad_norm": 0.1935354769229889, "learning_rate": 1e-05, "loss": 0.0228, "step": 477200 }, { "epoch": 0.004773, "grad_norm": 0.22836162149906158, "learning_rate": 1e-05, "loss": 0.0228, "step": 477300 }, { "epoch": 0.004774, "grad_norm": 0.18785761296749115, "learning_rate": 1e-05, "loss": 0.0227, "step": 477400 }, { "epoch": 0.004775, "grad_norm": 0.160059854388237, "learning_rate": 1e-05, "loss": 0.0227, "step": 477500 }, { "epoch": 0.004776, "grad_norm": 0.1951289176940918, "learning_rate": 1e-05, "loss": 0.0224, "step": 477600 }, { "epoch": 0.004777, "grad_norm": 0.2601816952228546, "learning_rate": 1e-05, "loss": 0.0222, "step": 477700 }, { "epoch": 0.004778, "grad_norm": 0.1900980919599533, "learning_rate": 1e-05, "loss": 0.0231, "step": 477800 }, { "epoch": 0.004779, "grad_norm": 0.2459518313407898, "learning_rate": 1e-05, "loss": 0.0232, "step": 477900 }, { "epoch": 0.00478, "grad_norm": 0.15587767958641052, "learning_rate": 1e-05, "loss": 0.023, "step": 478000 }, { "epoch": 0.004781, "grad_norm": 0.3509857654571533, "learning_rate": 1e-05, "loss": 0.0228, "step": 478100 }, { "epoch": 0.004782, "grad_norm": 0.22551119327545166, "learning_rate": 1e-05, "loss": 0.0227, "step": 478200 }, { "epoch": 0.004783, "grad_norm": 0.22277049720287323, "learning_rate": 1e-05, "loss": 0.0226, "step": 478300 }, { "epoch": 0.004784, "grad_norm": 0.20757439732551575, "learning_rate": 1e-05, "loss": 0.0229, "step": 478400 }, { "epoch": 0.004785, "grad_norm": 0.1619933545589447, "learning_rate": 1e-05, "loss": 0.0224, "step": 478500 }, { "epoch": 0.004786, "grad_norm": 0.1602727621793747, "learning_rate": 1e-05, "loss": 0.0222, "step": 478600 }, { "epoch": 0.004787, "grad_norm": 0.19456689059734344, "learning_rate": 1e-05, "loss": 0.0222, "step": 478700 }, { "epoch": 0.004788, "grad_norm": 0.17152246832847595, "learning_rate": 1e-05, "loss": 0.0227, "step": 478800 }, { "epoch": 0.004789, "grad_norm": 0.17568174004554749, "learning_rate": 1e-05, "loss": 0.0229, "step": 478900 }, { "epoch": 0.00479, "grad_norm": 0.2433381825685501, "learning_rate": 1e-05, "loss": 0.0228, "step": 479000 }, { "epoch": 0.004791, "grad_norm": 0.16553480923175812, "learning_rate": 1e-05, "loss": 0.0227, "step": 479100 }, { "epoch": 0.004792, "grad_norm": 0.2579858601093292, "learning_rate": 1e-05, "loss": 0.0223, "step": 479200 }, { "epoch": 0.004793, "grad_norm": 0.20410792529582977, "learning_rate": 1e-05, "loss": 0.0226, "step": 479300 }, { "epoch": 0.004794, "grad_norm": 0.18530359864234924, "learning_rate": 1e-05, "loss": 0.0227, "step": 479400 }, { "epoch": 0.004795, "grad_norm": 0.19660459458827972, "learning_rate": 1e-05, "loss": 0.0226, "step": 479500 }, { "epoch": 0.004796, "grad_norm": 0.17871466279029846, "learning_rate": 1e-05, "loss": 0.0225, "step": 479600 }, { "epoch": 0.004797, "grad_norm": 0.1992536336183548, "learning_rate": 1e-05, "loss": 0.0222, "step": 479700 }, { "epoch": 0.004798, "grad_norm": 0.15069518983364105, "learning_rate": 1e-05, "loss": 0.0226, "step": 479800 }, { "epoch": 0.004799, "grad_norm": 0.15981236100196838, "learning_rate": 1e-05, "loss": 0.0229, "step": 479900 }, { "epoch": 0.0048, "grad_norm": 0.15589135885238647, "learning_rate": 1e-05, "loss": 0.0228, "step": 480000 }, { "epoch": 0.0048, "eval_loss": 0.020210323855280876, "eval_runtime": 183.2691, "eval_samples_per_second": 272.823, "eval_steps_per_second": 17.051, "step": 480000 }, { "epoch": 0.004801, "grad_norm": 0.19858911633491516, "learning_rate": 1e-05, "loss": 0.0227, "step": 480100 }, { "epoch": 0.004802, "grad_norm": 0.15207539498806, "learning_rate": 1e-05, "loss": 0.0225, "step": 480200 }, { "epoch": 0.004803, "grad_norm": 0.1325957328081131, "learning_rate": 1e-05, "loss": 0.0224, "step": 480300 }, { "epoch": 0.004804, "grad_norm": 0.23294663429260254, "learning_rate": 1e-05, "loss": 0.0226, "step": 480400 }, { "epoch": 0.004805, "grad_norm": 0.18681181967258453, "learning_rate": 1e-05, "loss": 0.0221, "step": 480500 }, { "epoch": 0.004806, "grad_norm": 0.26766422390937805, "learning_rate": 1e-05, "loss": 0.0222, "step": 480600 }, { "epoch": 0.004807, "grad_norm": 0.15606385469436646, "learning_rate": 1e-05, "loss": 0.0227, "step": 480700 }, { "epoch": 0.004808, "grad_norm": 0.15690290927886963, "learning_rate": 1e-05, "loss": 0.0226, "step": 480800 }, { "epoch": 0.004809, "grad_norm": 0.1779525876045227, "learning_rate": 1e-05, "loss": 0.0228, "step": 480900 }, { "epoch": 0.00481, "grad_norm": 0.2627849280834198, "learning_rate": 1e-05, "loss": 0.0229, "step": 481000 }, { "epoch": 0.004811, "grad_norm": 0.21175305545330048, "learning_rate": 1e-05, "loss": 0.0221, "step": 481100 }, { "epoch": 0.004812, "grad_norm": 0.16684013605117798, "learning_rate": 1e-05, "loss": 0.0224, "step": 481200 }, { "epoch": 0.004813, "grad_norm": 0.20950034260749817, "learning_rate": 1e-05, "loss": 0.023, "step": 481300 }, { "epoch": 0.004814, "grad_norm": 0.20494452118873596, "learning_rate": 1e-05, "loss": 0.0222, "step": 481400 }, { "epoch": 0.004815, "grad_norm": 0.19432593882083893, "learning_rate": 1e-05, "loss": 0.0227, "step": 481500 }, { "epoch": 0.004816, "grad_norm": 0.26819002628326416, "learning_rate": 1e-05, "loss": 0.0227, "step": 481600 }, { "epoch": 0.004817, "grad_norm": 0.26799535751342773, "learning_rate": 1e-05, "loss": 0.0233, "step": 481700 }, { "epoch": 0.004818, "grad_norm": 0.20080631971359253, "learning_rate": 1e-05, "loss": 0.0222, "step": 481800 }, { "epoch": 0.004819, "grad_norm": 0.19530464708805084, "learning_rate": 1e-05, "loss": 0.023, "step": 481900 }, { "epoch": 0.00482, "grad_norm": 0.3117150664329529, "learning_rate": 1e-05, "loss": 0.0228, "step": 482000 }, { "epoch": 0.004821, "grad_norm": 0.21459464728832245, "learning_rate": 1e-05, "loss": 0.0225, "step": 482100 }, { "epoch": 0.004822, "grad_norm": 0.1681216061115265, "learning_rate": 1e-05, "loss": 0.0225, "step": 482200 }, { "epoch": 0.004823, "grad_norm": 0.2033972591161728, "learning_rate": 1e-05, "loss": 0.0226, "step": 482300 }, { "epoch": 0.004824, "grad_norm": 0.18456736207008362, "learning_rate": 1e-05, "loss": 0.0228, "step": 482400 }, { "epoch": 0.004825, "grad_norm": 0.13899816572666168, "learning_rate": 1e-05, "loss": 0.0225, "step": 482500 }, { "epoch": 0.004826, "grad_norm": 0.15683220326900482, "learning_rate": 1e-05, "loss": 0.0225, "step": 482600 }, { "epoch": 0.004827, "grad_norm": 0.1842149794101715, "learning_rate": 1e-05, "loss": 0.0219, "step": 482700 }, { "epoch": 0.004828, "grad_norm": 0.19302187860012054, "learning_rate": 1e-05, "loss": 0.0223, "step": 482800 }, { "epoch": 0.004829, "grad_norm": 0.17793028056621552, "learning_rate": 1e-05, "loss": 0.0224, "step": 482900 }, { "epoch": 0.00483, "grad_norm": 0.16678635776042938, "learning_rate": 1e-05, "loss": 0.0227, "step": 483000 }, { "epoch": 0.004831, "grad_norm": 0.12676487863063812, "learning_rate": 1e-05, "loss": 0.0225, "step": 483100 }, { "epoch": 0.004832, "grad_norm": 0.13489502668380737, "learning_rate": 1e-05, "loss": 0.0226, "step": 483200 }, { "epoch": 0.004833, "grad_norm": 0.18094728887081146, "learning_rate": 1e-05, "loss": 0.0227, "step": 483300 }, { "epoch": 0.004834, "grad_norm": 0.2660669684410095, "learning_rate": 1e-05, "loss": 0.0227, "step": 483400 }, { "epoch": 0.004835, "grad_norm": 0.2031784951686859, "learning_rate": 1e-05, "loss": 0.0226, "step": 483500 }, { "epoch": 0.004836, "grad_norm": 0.17592385411262512, "learning_rate": 1e-05, "loss": 0.0223, "step": 483600 }, { "epoch": 0.004837, "grad_norm": 0.14218981564044952, "learning_rate": 1e-05, "loss": 0.0224, "step": 483700 }, { "epoch": 0.004838, "grad_norm": 0.1639585942029953, "learning_rate": 1e-05, "loss": 0.0222, "step": 483800 }, { "epoch": 0.004839, "grad_norm": 0.16760849952697754, "learning_rate": 1e-05, "loss": 0.0227, "step": 483900 }, { "epoch": 0.00484, "grad_norm": 0.2298850119113922, "learning_rate": 1e-05, "loss": 0.023, "step": 484000 }, { "epoch": 0.004841, "grad_norm": 0.2601833641529083, "learning_rate": 1e-05, "loss": 0.0227, "step": 484100 }, { "epoch": 0.004842, "grad_norm": 0.20759740471839905, "learning_rate": 1e-05, "loss": 0.0224, "step": 484200 }, { "epoch": 0.004843, "grad_norm": 0.21301333606243134, "learning_rate": 1e-05, "loss": 0.0226, "step": 484300 }, { "epoch": 0.004844, "grad_norm": 0.20009183883666992, "learning_rate": 1e-05, "loss": 0.0225, "step": 484400 }, { "epoch": 0.004845, "grad_norm": 0.2169293761253357, "learning_rate": 1e-05, "loss": 0.0225, "step": 484500 }, { "epoch": 0.004846, "grad_norm": 0.16364678740501404, "learning_rate": 1e-05, "loss": 0.0228, "step": 484600 }, { "epoch": 0.004847, "grad_norm": 0.14871971309185028, "learning_rate": 1e-05, "loss": 0.0227, "step": 484700 }, { "epoch": 0.004848, "grad_norm": 0.18035338819026947, "learning_rate": 1e-05, "loss": 0.0229, "step": 484800 }, { "epoch": 0.004849, "grad_norm": 0.19805669784545898, "learning_rate": 1e-05, "loss": 0.0222, "step": 484900 }, { "epoch": 0.00485, "grad_norm": 0.20793002843856812, "learning_rate": 1e-05, "loss": 0.0224, "step": 485000 }, { "epoch": 0.004851, "grad_norm": 0.18014852702617645, "learning_rate": 1e-05, "loss": 0.023, "step": 485100 }, { "epoch": 0.004852, "grad_norm": 0.1944490373134613, "learning_rate": 1e-05, "loss": 0.0223, "step": 485200 }, { "epoch": 0.004853, "grad_norm": 0.16900306940078735, "learning_rate": 1e-05, "loss": 0.0223, "step": 485300 }, { "epoch": 0.004854, "grad_norm": 0.15113256871700287, "learning_rate": 1e-05, "loss": 0.0226, "step": 485400 }, { "epoch": 0.004855, "grad_norm": 0.1816157102584839, "learning_rate": 1e-05, "loss": 0.0222, "step": 485500 }, { "epoch": 0.004856, "grad_norm": 0.22360166907310486, "learning_rate": 1e-05, "loss": 0.0225, "step": 485600 }, { "epoch": 0.004857, "grad_norm": 0.1941595822572708, "learning_rate": 1e-05, "loss": 0.0222, "step": 485700 }, { "epoch": 0.004858, "grad_norm": 0.2051376849412918, "learning_rate": 1e-05, "loss": 0.0223, "step": 485800 }, { "epoch": 0.004859, "grad_norm": 0.16336752474308014, "learning_rate": 1e-05, "loss": 0.0226, "step": 485900 }, { "epoch": 0.00486, "grad_norm": 0.14895319938659668, "learning_rate": 1e-05, "loss": 0.0223, "step": 486000 }, { "epoch": 0.004861, "grad_norm": 0.17026206851005554, "learning_rate": 1e-05, "loss": 0.0228, "step": 486100 }, { "epoch": 0.004862, "grad_norm": 0.14094384014606476, "learning_rate": 1e-05, "loss": 0.0224, "step": 486200 }, { "epoch": 0.004863, "grad_norm": 0.1634945571422577, "learning_rate": 1e-05, "loss": 0.0227, "step": 486300 }, { "epoch": 0.004864, "grad_norm": 0.1666261851787567, "learning_rate": 1e-05, "loss": 0.0227, "step": 486400 }, { "epoch": 0.004865, "grad_norm": 0.16356317698955536, "learning_rate": 1e-05, "loss": 0.0225, "step": 486500 }, { "epoch": 0.004866, "grad_norm": 0.14626428484916687, "learning_rate": 1e-05, "loss": 0.0221, "step": 486600 }, { "epoch": 0.004867, "grad_norm": 0.1871759593486786, "learning_rate": 1e-05, "loss": 0.0219, "step": 486700 }, { "epoch": 0.004868, "grad_norm": 0.19877472519874573, "learning_rate": 1e-05, "loss": 0.0223, "step": 486800 }, { "epoch": 0.004869, "grad_norm": 0.14292922616004944, "learning_rate": 1e-05, "loss": 0.0225, "step": 486900 }, { "epoch": 0.00487, "grad_norm": 0.1764097958803177, "learning_rate": 1e-05, "loss": 0.0224, "step": 487000 }, { "epoch": 0.004871, "grad_norm": 0.17586185038089752, "learning_rate": 1e-05, "loss": 0.0228, "step": 487100 }, { "epoch": 0.004872, "grad_norm": 0.22949270904064178, "learning_rate": 1e-05, "loss": 0.0224, "step": 487200 }, { "epoch": 0.004873, "grad_norm": 0.1661129742860794, "learning_rate": 1e-05, "loss": 0.0222, "step": 487300 }, { "epoch": 0.004874, "grad_norm": 0.19552993774414062, "learning_rate": 1e-05, "loss": 0.0224, "step": 487400 }, { "epoch": 0.004875, "grad_norm": 0.18561182916164398, "learning_rate": 1e-05, "loss": 0.0221, "step": 487500 }, { "epoch": 0.004876, "grad_norm": 0.14551962912082672, "learning_rate": 1e-05, "loss": 0.0224, "step": 487600 }, { "epoch": 0.004877, "grad_norm": 0.21218594908714294, "learning_rate": 1e-05, "loss": 0.0226, "step": 487700 }, { "epoch": 0.004878, "grad_norm": 0.1844281256198883, "learning_rate": 1e-05, "loss": 0.0227, "step": 487800 }, { "epoch": 0.004879, "grad_norm": 0.1997205764055252, "learning_rate": 1e-05, "loss": 0.0226, "step": 487900 }, { "epoch": 0.00488, "grad_norm": 0.24793003499507904, "learning_rate": 1e-05, "loss": 0.0226, "step": 488000 }, { "epoch": 0.004881, "grad_norm": 0.19330953061580658, "learning_rate": 1e-05, "loss": 0.0229, "step": 488100 }, { "epoch": 0.004882, "grad_norm": 0.3183402419090271, "learning_rate": 1e-05, "loss": 0.0225, "step": 488200 }, { "epoch": 0.004883, "grad_norm": 0.1506684273481369, "learning_rate": 1e-05, "loss": 0.0225, "step": 488300 }, { "epoch": 0.004884, "grad_norm": 0.21010585129261017, "learning_rate": 1e-05, "loss": 0.0223, "step": 488400 }, { "epoch": 0.004885, "grad_norm": 0.17681162059307098, "learning_rate": 1e-05, "loss": 0.0224, "step": 488500 }, { "epoch": 0.004886, "grad_norm": 0.15867190062999725, "learning_rate": 1e-05, "loss": 0.023, "step": 488600 }, { "epoch": 0.004887, "grad_norm": 0.18228976428508759, "learning_rate": 1e-05, "loss": 0.0226, "step": 488700 }, { "epoch": 0.004888, "grad_norm": 0.16981647908687592, "learning_rate": 1e-05, "loss": 0.0225, "step": 488800 }, { "epoch": 0.004889, "grad_norm": 0.17589503526687622, "learning_rate": 1e-05, "loss": 0.0223, "step": 488900 }, { "epoch": 0.00489, "grad_norm": 0.26660430431365967, "learning_rate": 1e-05, "loss": 0.0226, "step": 489000 }, { "epoch": 0.004891, "grad_norm": 0.22372598946094513, "learning_rate": 1e-05, "loss": 0.0222, "step": 489100 }, { "epoch": 0.004892, "grad_norm": 0.2527467608451843, "learning_rate": 1e-05, "loss": 0.0229, "step": 489200 }, { "epoch": 0.004893, "grad_norm": 0.18553735315799713, "learning_rate": 1e-05, "loss": 0.0223, "step": 489300 }, { "epoch": 0.004894, "grad_norm": 0.2172551155090332, "learning_rate": 1e-05, "loss": 0.022, "step": 489400 }, { "epoch": 0.004895, "grad_norm": 0.16336023807525635, "learning_rate": 1e-05, "loss": 0.0225, "step": 489500 }, { "epoch": 0.004896, "grad_norm": 0.21395717561244965, "learning_rate": 1e-05, "loss": 0.0227, "step": 489600 }, { "epoch": 0.004897, "grad_norm": 0.33376699686050415, "learning_rate": 1e-05, "loss": 0.0221, "step": 489700 }, { "epoch": 0.004898, "grad_norm": 0.2199081927537918, "learning_rate": 1e-05, "loss": 0.0223, "step": 489800 }, { "epoch": 0.004899, "grad_norm": 0.2354394644498825, "learning_rate": 1e-05, "loss": 0.0221, "step": 489900 }, { "epoch": 0.0049, "grad_norm": 0.14878928661346436, "learning_rate": 1e-05, "loss": 0.0224, "step": 490000 }, { "epoch": 0.004901, "grad_norm": 0.15737982094287872, "learning_rate": 1e-05, "loss": 0.0227, "step": 490100 }, { "epoch": 0.004902, "grad_norm": 0.16013920307159424, "learning_rate": 1e-05, "loss": 0.0222, "step": 490200 }, { "epoch": 0.004903, "grad_norm": 0.25470688939094543, "learning_rate": 1e-05, "loss": 0.0225, "step": 490300 }, { "epoch": 0.004904, "grad_norm": 0.17360754311084747, "learning_rate": 1e-05, "loss": 0.0223, "step": 490400 }, { "epoch": 0.004905, "grad_norm": 0.23480123281478882, "learning_rate": 1e-05, "loss": 0.0228, "step": 490500 }, { "epoch": 0.004906, "grad_norm": 0.2206820845603943, "learning_rate": 1e-05, "loss": 0.0224, "step": 490600 }, { "epoch": 0.004907, "grad_norm": 0.15437212586402893, "learning_rate": 1e-05, "loss": 0.0226, "step": 490700 }, { "epoch": 0.004908, "grad_norm": 0.2217675745487213, "learning_rate": 1e-05, "loss": 0.0223, "step": 490800 }, { "epoch": 0.004909, "grad_norm": 0.14655490219593048, "learning_rate": 1e-05, "loss": 0.0221, "step": 490900 }, { "epoch": 0.00491, "grad_norm": 0.14409515261650085, "learning_rate": 1e-05, "loss": 0.0225, "step": 491000 }, { "epoch": 0.004911, "grad_norm": 0.2326953262090683, "learning_rate": 1e-05, "loss": 0.0224, "step": 491100 }, { "epoch": 0.004912, "grad_norm": 0.2596275508403778, "learning_rate": 1e-05, "loss": 0.0225, "step": 491200 }, { "epoch": 0.004913, "grad_norm": 0.24036499857902527, "learning_rate": 1e-05, "loss": 0.0226, "step": 491300 }, { "epoch": 0.004914, "grad_norm": 0.17784036695957184, "learning_rate": 1e-05, "loss": 0.0222, "step": 491400 }, { "epoch": 0.004915, "grad_norm": 0.15445251762866974, "learning_rate": 1e-05, "loss": 0.0225, "step": 491500 }, { "epoch": 0.004916, "grad_norm": 0.16181112825870514, "learning_rate": 1e-05, "loss": 0.0224, "step": 491600 }, { "epoch": 0.004917, "grad_norm": 0.18760506808757782, "learning_rate": 1e-05, "loss": 0.0228, "step": 491700 }, { "epoch": 0.004918, "grad_norm": 0.20751334726810455, "learning_rate": 1e-05, "loss": 0.0228, "step": 491800 }, { "epoch": 0.004919, "grad_norm": 0.17591220140457153, "learning_rate": 1e-05, "loss": 0.0224, "step": 491900 }, { "epoch": 0.00492, "grad_norm": 0.19821839034557343, "learning_rate": 1e-05, "loss": 0.0225, "step": 492000 }, { "epoch": 0.004921, "grad_norm": 0.2212182581424713, "learning_rate": 1e-05, "loss": 0.0225, "step": 492100 }, { "epoch": 0.004922, "grad_norm": 0.15731021761894226, "learning_rate": 1e-05, "loss": 0.0227, "step": 492200 }, { "epoch": 0.004923, "grad_norm": 0.14833572506904602, "learning_rate": 1e-05, "loss": 0.0223, "step": 492300 }, { "epoch": 0.004924, "grad_norm": 0.12140193581581116, "learning_rate": 1e-05, "loss": 0.0218, "step": 492400 }, { "epoch": 0.004925, "grad_norm": 0.252888023853302, "learning_rate": 1e-05, "loss": 0.0229, "step": 492500 }, { "epoch": 0.004926, "grad_norm": 0.15642037987709045, "learning_rate": 1e-05, "loss": 0.0228, "step": 492600 }, { "epoch": 0.004927, "grad_norm": 0.13275474309921265, "learning_rate": 1e-05, "loss": 0.0227, "step": 492700 }, { "epoch": 0.004928, "grad_norm": 0.16984188556671143, "learning_rate": 1e-05, "loss": 0.0224, "step": 492800 }, { "epoch": 0.004929, "grad_norm": 0.21277165412902832, "learning_rate": 1e-05, "loss": 0.0222, "step": 492900 }, { "epoch": 0.00493, "grad_norm": 0.1703546792268753, "learning_rate": 1e-05, "loss": 0.0232, "step": 493000 }, { "epoch": 0.004931, "grad_norm": 0.29064568877220154, "learning_rate": 1e-05, "loss": 0.0223, "step": 493100 }, { "epoch": 0.004932, "grad_norm": 0.14051178097724915, "learning_rate": 1e-05, "loss": 0.0222, "step": 493200 }, { "epoch": 0.004933, "grad_norm": 0.141187846660614, "learning_rate": 1e-05, "loss": 0.0223, "step": 493300 }, { "epoch": 0.004934, "grad_norm": 0.248707115650177, "learning_rate": 1e-05, "loss": 0.0227, "step": 493400 }, { "epoch": 0.004935, "grad_norm": 0.1633865088224411, "learning_rate": 1e-05, "loss": 0.0221, "step": 493500 }, { "epoch": 0.004936, "grad_norm": 0.23493026196956635, "learning_rate": 1e-05, "loss": 0.0224, "step": 493600 }, { "epoch": 0.004937, "grad_norm": 0.16851398348808289, "learning_rate": 1e-05, "loss": 0.0226, "step": 493700 }, { "epoch": 0.004938, "grad_norm": 0.21559743583202362, "learning_rate": 1e-05, "loss": 0.0224, "step": 493800 }, { "epoch": 0.004939, "grad_norm": 0.1903742551803589, "learning_rate": 1e-05, "loss": 0.0221, "step": 493900 }, { "epoch": 0.00494, "grad_norm": 0.22859756648540497, "learning_rate": 1e-05, "loss": 0.0227, "step": 494000 }, { "epoch": 0.004941, "grad_norm": 0.1816614419221878, "learning_rate": 1e-05, "loss": 0.0223, "step": 494100 }, { "epoch": 0.004942, "grad_norm": 0.1491163820028305, "learning_rate": 1e-05, "loss": 0.023, "step": 494200 }, { "epoch": 0.004943, "grad_norm": 0.1559378206729889, "learning_rate": 1e-05, "loss": 0.0219, "step": 494300 }, { "epoch": 0.004944, "grad_norm": 0.2327524721622467, "learning_rate": 1e-05, "loss": 0.0226, "step": 494400 }, { "epoch": 0.004945, "grad_norm": 0.15845568478107452, "learning_rate": 1e-05, "loss": 0.0222, "step": 494500 }, { "epoch": 0.004946, "grad_norm": 0.17711521685123444, "learning_rate": 1e-05, "loss": 0.0222, "step": 494600 }, { "epoch": 0.004947, "grad_norm": 0.23511902987957, "learning_rate": 1e-05, "loss": 0.0221, "step": 494700 }, { "epoch": 0.004948, "grad_norm": 0.18569150567054749, "learning_rate": 1e-05, "loss": 0.0226, "step": 494800 }, { "epoch": 0.004949, "grad_norm": 0.21922273933887482, "learning_rate": 1e-05, "loss": 0.0224, "step": 494900 }, { "epoch": 0.00495, "grad_norm": 0.1734301894903183, "learning_rate": 1e-05, "loss": 0.0225, "step": 495000 }, { "epoch": 0.004951, "grad_norm": 0.17675210535526276, "learning_rate": 1e-05, "loss": 0.0221, "step": 495100 }, { "epoch": 0.004952, "grad_norm": 0.1763714849948883, "learning_rate": 1e-05, "loss": 0.0223, "step": 495200 }, { "epoch": 0.004953, "grad_norm": 0.18216589093208313, "learning_rate": 1e-05, "loss": 0.0222, "step": 495300 }, { "epoch": 0.004954, "grad_norm": 0.1881098449230194, "learning_rate": 1e-05, "loss": 0.0229, "step": 495400 }, { "epoch": 0.004955, "grad_norm": 0.1618843525648117, "learning_rate": 1e-05, "loss": 0.0223, "step": 495500 }, { "epoch": 0.004956, "grad_norm": 0.20416833460330963, "learning_rate": 1e-05, "loss": 0.0224, "step": 495600 }, { "epoch": 0.004957, "grad_norm": 0.27172648906707764, "learning_rate": 1e-05, "loss": 0.0226, "step": 495700 }, { "epoch": 0.004958, "grad_norm": 0.22851955890655518, "learning_rate": 1e-05, "loss": 0.0221, "step": 495800 }, { "epoch": 0.004959, "grad_norm": 0.18228548765182495, "learning_rate": 1e-05, "loss": 0.0223, "step": 495900 }, { "epoch": 0.00496, "grad_norm": 0.1556166261434555, "learning_rate": 1e-05, "loss": 0.0228, "step": 496000 }, { "epoch": 0.004961, "grad_norm": 0.13907559216022491, "learning_rate": 1e-05, "loss": 0.0222, "step": 496100 }, { "epoch": 0.004962, "grad_norm": 0.1678418517112732, "learning_rate": 1e-05, "loss": 0.0228, "step": 496200 }, { "epoch": 0.004963, "grad_norm": 0.20859429240226746, "learning_rate": 1e-05, "loss": 0.0223, "step": 496300 }, { "epoch": 0.004964, "grad_norm": 0.14712657034397125, "learning_rate": 1e-05, "loss": 0.0226, "step": 496400 }, { "epoch": 0.004965, "grad_norm": 0.14531143009662628, "learning_rate": 1e-05, "loss": 0.0228, "step": 496500 }, { "epoch": 0.004966, "grad_norm": 0.20882773399353027, "learning_rate": 1e-05, "loss": 0.0223, "step": 496600 }, { "epoch": 0.004967, "grad_norm": 0.22087039053440094, "learning_rate": 1e-05, "loss": 0.0224, "step": 496700 }, { "epoch": 0.004968, "grad_norm": 0.2958511412143707, "learning_rate": 1e-05, "loss": 0.0225, "step": 496800 }, { "epoch": 0.004969, "grad_norm": 0.19788813591003418, "learning_rate": 1e-05, "loss": 0.0222, "step": 496900 }, { "epoch": 0.00497, "grad_norm": 0.176690474152565, "learning_rate": 1e-05, "loss": 0.0219, "step": 497000 }, { "epoch": 0.004971, "grad_norm": 0.17848649621009827, "learning_rate": 1e-05, "loss": 0.0225, "step": 497100 }, { "epoch": 0.004972, "grad_norm": 0.16314560174942017, "learning_rate": 1e-05, "loss": 0.0221, "step": 497200 }, { "epoch": 0.004973, "grad_norm": 0.1981799453496933, "learning_rate": 1e-05, "loss": 0.0226, "step": 497300 }, { "epoch": 0.004974, "grad_norm": 0.16922800242900848, "learning_rate": 1e-05, "loss": 0.0217, "step": 497400 }, { "epoch": 0.004975, "grad_norm": 0.20301394164562225, "learning_rate": 1e-05, "loss": 0.0219, "step": 497500 }, { "epoch": 0.004976, "grad_norm": 0.19125759601593018, "learning_rate": 1e-05, "loss": 0.022, "step": 497600 }, { "epoch": 0.004977, "grad_norm": 0.17234794795513153, "learning_rate": 1e-05, "loss": 0.0218, "step": 497700 }, { "epoch": 0.004978, "grad_norm": 0.16469094157218933, "learning_rate": 1e-05, "loss": 0.0223, "step": 497800 }, { "epoch": 0.004979, "grad_norm": 0.21150341629981995, "learning_rate": 1e-05, "loss": 0.0219, "step": 497900 }, { "epoch": 0.00498, "grad_norm": 0.16134443879127502, "learning_rate": 1e-05, "loss": 0.0225, "step": 498000 }, { "epoch": 0.004981, "grad_norm": 0.15701983869075775, "learning_rate": 1e-05, "loss": 0.0216, "step": 498100 }, { "epoch": 0.004982, "grad_norm": 0.21458804607391357, "learning_rate": 1e-05, "loss": 0.0224, "step": 498200 }, { "epoch": 0.004983, "grad_norm": 0.1757224202156067, "learning_rate": 1e-05, "loss": 0.0221, "step": 498300 }, { "epoch": 0.004984, "grad_norm": 0.23796291649341583, "learning_rate": 1e-05, "loss": 0.022, "step": 498400 }, { "epoch": 0.004985, "grad_norm": 0.18060126900672913, "learning_rate": 1e-05, "loss": 0.0226, "step": 498500 }, { "epoch": 0.004986, "grad_norm": 0.15155822038650513, "learning_rate": 1e-05, "loss": 0.0218, "step": 498600 }, { "epoch": 0.004987, "grad_norm": 0.1833479404449463, "learning_rate": 1e-05, "loss": 0.022, "step": 498700 }, { "epoch": 0.004988, "grad_norm": 0.1782234013080597, "learning_rate": 1e-05, "loss": 0.0228, "step": 498800 }, { "epoch": 0.004989, "grad_norm": 0.20433324575424194, "learning_rate": 1e-05, "loss": 0.0224, "step": 498900 }, { "epoch": 0.00499, "grad_norm": 0.17950913310050964, "learning_rate": 1e-05, "loss": 0.0222, "step": 499000 }, { "epoch": 0.004991, "grad_norm": 0.1907675415277481, "learning_rate": 1e-05, "loss": 0.0222, "step": 499100 }, { "epoch": 0.004992, "grad_norm": 0.14273685216903687, "learning_rate": 1e-05, "loss": 0.0216, "step": 499200 }, { "epoch": 0.004993, "grad_norm": 0.13647359609603882, "learning_rate": 1e-05, "loss": 0.0221, "step": 499300 }, { "epoch": 0.004994, "grad_norm": 0.2074493169784546, "learning_rate": 1e-05, "loss": 0.022, "step": 499400 }, { "epoch": 0.004995, "grad_norm": 0.2408534288406372, "learning_rate": 1e-05, "loss": 0.0221, "step": 499500 }, { "epoch": 0.004996, "grad_norm": 0.17320658266544342, "learning_rate": 1e-05, "loss": 0.0226, "step": 499600 }, { "epoch": 0.004997, "grad_norm": 0.1624217927455902, "learning_rate": 1e-05, "loss": 0.022, "step": 499700 }, { "epoch": 0.004998, "grad_norm": 0.3061617612838745, "learning_rate": 1e-05, "loss": 0.0225, "step": 499800 }, { "epoch": 0.004999, "grad_norm": 0.16439774632453918, "learning_rate": 1e-05, "loss": 0.0221, "step": 499900 }, { "epoch": 0.005, "grad_norm": 0.18290100991725922, "learning_rate": 1e-05, "loss": 0.0227, "step": 500000 }, { "epoch": 0.005, "eval_loss": 0.020368527621030807, "eval_runtime": 201.9393, "eval_samples_per_second": 247.599, "eval_steps_per_second": 15.475, "step": 500000 }, { "epoch": 0.005001, "grad_norm": 0.1730956882238388, "learning_rate": 1e-05, "loss": 0.0215, "step": 500100 }, { "epoch": 0.005002, "grad_norm": 0.18557944893836975, "learning_rate": 1e-05, "loss": 0.0224, "step": 500200 }, { "epoch": 0.005003, "grad_norm": 0.17831017076969147, "learning_rate": 1e-05, "loss": 0.0221, "step": 500300 }, { "epoch": 0.005004, "grad_norm": 0.17148609459400177, "learning_rate": 1e-05, "loss": 0.0225, "step": 500400 }, { "epoch": 0.005005, "grad_norm": 0.2087256759405136, "learning_rate": 1e-05, "loss": 0.0227, "step": 500500 }, { "epoch": 0.005006, "grad_norm": 0.17928588390350342, "learning_rate": 1e-05, "loss": 0.0221, "step": 500600 }, { "epoch": 0.005007, "grad_norm": 0.1668325811624527, "learning_rate": 1e-05, "loss": 0.0222, "step": 500700 }, { "epoch": 0.005008, "grad_norm": 0.15595051646232605, "learning_rate": 1e-05, "loss": 0.0224, "step": 500800 }, { "epoch": 0.005009, "grad_norm": 0.21958331763744354, "learning_rate": 1e-05, "loss": 0.0222, "step": 500900 }, { "epoch": 0.00501, "grad_norm": 0.16143953800201416, "learning_rate": 1e-05, "loss": 0.0221, "step": 501000 }, { "epoch": 0.005011, "grad_norm": 0.23131421208381653, "learning_rate": 1e-05, "loss": 0.0222, "step": 501100 }, { "epoch": 0.005012, "grad_norm": 0.178961381316185, "learning_rate": 1e-05, "loss": 0.0224, "step": 501200 }, { "epoch": 0.005013, "grad_norm": 0.1527274250984192, "learning_rate": 1e-05, "loss": 0.0222, "step": 501300 }, { "epoch": 0.005014, "grad_norm": 0.184279665350914, "learning_rate": 1e-05, "loss": 0.0216, "step": 501400 }, { "epoch": 0.005015, "grad_norm": 0.20106936991214752, "learning_rate": 1e-05, "loss": 0.0224, "step": 501500 }, { "epoch": 0.005016, "grad_norm": 0.22428947687149048, "learning_rate": 1e-05, "loss": 0.0222, "step": 501600 }, { "epoch": 0.005017, "grad_norm": 0.16428467631340027, "learning_rate": 1e-05, "loss": 0.0223, "step": 501700 }, { "epoch": 0.005018, "grad_norm": 0.16279509663581848, "learning_rate": 1e-05, "loss": 0.0219, "step": 501800 }, { "epoch": 0.005019, "grad_norm": 0.20206716656684875, "learning_rate": 1e-05, "loss": 0.0227, "step": 501900 }, { "epoch": 0.00502, "grad_norm": 0.1436138153076172, "learning_rate": 1e-05, "loss": 0.0228, "step": 502000 }, { "epoch": 0.005021, "grad_norm": 0.1346484124660492, "learning_rate": 1e-05, "loss": 0.0222, "step": 502100 }, { "epoch": 0.005022, "grad_norm": 0.16371528804302216, "learning_rate": 1e-05, "loss": 0.022, "step": 502200 }, { "epoch": 0.005023, "grad_norm": 0.16668391227722168, "learning_rate": 1e-05, "loss": 0.0225, "step": 502300 }, { "epoch": 0.005024, "grad_norm": 0.19581370055675507, "learning_rate": 1e-05, "loss": 0.0222, "step": 502400 }, { "epoch": 0.005025, "grad_norm": 0.1634637862443924, "learning_rate": 1e-05, "loss": 0.022, "step": 502500 }, { "epoch": 0.005026, "grad_norm": 0.16552488505840302, "learning_rate": 1e-05, "loss": 0.0221, "step": 502600 }, { "epoch": 0.005027, "grad_norm": 0.1647871881723404, "learning_rate": 1e-05, "loss": 0.0222, "step": 502700 }, { "epoch": 0.005028, "grad_norm": 0.13967177271842957, "learning_rate": 1e-05, "loss": 0.0221, "step": 502800 }, { "epoch": 0.005029, "grad_norm": 0.1962343007326126, "learning_rate": 1e-05, "loss": 0.0221, "step": 502900 }, { "epoch": 0.00503, "grad_norm": 0.1798674613237381, "learning_rate": 1e-05, "loss": 0.0225, "step": 503000 }, { "epoch": 0.005031, "grad_norm": 0.22816099226474762, "learning_rate": 1e-05, "loss": 0.0221, "step": 503100 }, { "epoch": 0.005032, "grad_norm": 0.2658965587615967, "learning_rate": 1e-05, "loss": 0.0222, "step": 503200 }, { "epoch": 0.005033, "grad_norm": 0.1709921956062317, "learning_rate": 1e-05, "loss": 0.0222, "step": 503300 }, { "epoch": 0.005034, "grad_norm": 0.15775492787361145, "learning_rate": 1e-05, "loss": 0.0217, "step": 503400 }, { "epoch": 0.005035, "grad_norm": 0.2154555469751358, "learning_rate": 1e-05, "loss": 0.0222, "step": 503500 }, { "epoch": 0.005036, "grad_norm": 0.18170669674873352, "learning_rate": 1e-05, "loss": 0.0223, "step": 503600 }, { "epoch": 0.005037, "grad_norm": 0.28320056200027466, "learning_rate": 1e-05, "loss": 0.0224, "step": 503700 }, { "epoch": 0.005038, "grad_norm": 0.1979909986257553, "learning_rate": 1e-05, "loss": 0.0226, "step": 503800 }, { "epoch": 0.005039, "grad_norm": 0.19887927174568176, "learning_rate": 1e-05, "loss": 0.0225, "step": 503900 }, { "epoch": 0.00504, "grad_norm": 0.17779967188835144, "learning_rate": 1e-05, "loss": 0.0226, "step": 504000 }, { "epoch": 0.005041, "grad_norm": 0.2151610255241394, "learning_rate": 1e-05, "loss": 0.0224, "step": 504100 }, { "epoch": 0.005042, "grad_norm": 0.16002508997917175, "learning_rate": 1e-05, "loss": 0.022, "step": 504200 }, { "epoch": 0.005043, "grad_norm": 0.16536933183670044, "learning_rate": 1e-05, "loss": 0.0218, "step": 504300 }, { "epoch": 0.005044, "grad_norm": 0.18222512304782867, "learning_rate": 1e-05, "loss": 0.0225, "step": 504400 }, { "epoch": 0.005045, "grad_norm": 0.23075509071350098, "learning_rate": 1e-05, "loss": 0.0218, "step": 504500 }, { "epoch": 0.005046, "grad_norm": 0.20390905439853668, "learning_rate": 1e-05, "loss": 0.0225, "step": 504600 }, { "epoch": 0.005047, "grad_norm": 0.1625085026025772, "learning_rate": 1e-05, "loss": 0.0219, "step": 504700 }, { "epoch": 0.005048, "grad_norm": 0.1744568794965744, "learning_rate": 1e-05, "loss": 0.0219, "step": 504800 }, { "epoch": 0.005049, "grad_norm": 0.18963667750358582, "learning_rate": 1e-05, "loss": 0.0222, "step": 504900 }, { "epoch": 0.00505, "grad_norm": 0.19075265526771545, "learning_rate": 1e-05, "loss": 0.0222, "step": 505000 }, { "epoch": 0.005051, "grad_norm": 0.19949012994766235, "learning_rate": 1e-05, "loss": 0.0223, "step": 505100 }, { "epoch": 0.005052, "grad_norm": 0.23749272525310516, "learning_rate": 1e-05, "loss": 0.0225, "step": 505200 }, { "epoch": 0.005053, "grad_norm": 0.18709872663021088, "learning_rate": 1e-05, "loss": 0.0223, "step": 505300 }, { "epoch": 0.005054, "grad_norm": 0.14609885215759277, "learning_rate": 1e-05, "loss": 0.0226, "step": 505400 }, { "epoch": 0.005055, "grad_norm": 0.20043879747390747, "learning_rate": 1e-05, "loss": 0.0227, "step": 505500 }, { "epoch": 0.005056, "grad_norm": 0.14228880405426025, "learning_rate": 1e-05, "loss": 0.0218, "step": 505600 }, { "epoch": 0.005057, "grad_norm": 0.2100621461868286, "learning_rate": 1e-05, "loss": 0.0222, "step": 505700 }, { "epoch": 0.005058, "grad_norm": 0.16085653007030487, "learning_rate": 1e-05, "loss": 0.0223, "step": 505800 }, { "epoch": 0.005059, "grad_norm": 0.19522270560264587, "learning_rate": 1e-05, "loss": 0.0222, "step": 505900 }, { "epoch": 0.00506, "grad_norm": 0.16659827530384064, "learning_rate": 1e-05, "loss": 0.0222, "step": 506000 }, { "epoch": 0.005061, "grad_norm": 0.26441490650177, "learning_rate": 1e-05, "loss": 0.0226, "step": 506100 }, { "epoch": 0.005062, "grad_norm": 0.19379980862140656, "learning_rate": 1e-05, "loss": 0.0217, "step": 506200 }, { "epoch": 0.005063, "grad_norm": 0.1608579307794571, "learning_rate": 1e-05, "loss": 0.0224, "step": 506300 }, { "epoch": 0.005064, "grad_norm": 0.2756195068359375, "learning_rate": 1e-05, "loss": 0.0223, "step": 506400 }, { "epoch": 0.005065, "grad_norm": 0.15984463691711426, "learning_rate": 1e-05, "loss": 0.0224, "step": 506500 }, { "epoch": 0.005066, "grad_norm": 0.1600152552127838, "learning_rate": 1e-05, "loss": 0.0223, "step": 506600 }, { "epoch": 0.005067, "grad_norm": 0.22252634167671204, "learning_rate": 1e-05, "loss": 0.0219, "step": 506700 }, { "epoch": 0.005068, "grad_norm": 0.16834428906440735, "learning_rate": 1e-05, "loss": 0.0225, "step": 506800 }, { "epoch": 0.005069, "grad_norm": 0.15329894423484802, "learning_rate": 1e-05, "loss": 0.0224, "step": 506900 }, { "epoch": 0.00507, "grad_norm": 0.20419053733348846, "learning_rate": 1e-05, "loss": 0.0217, "step": 507000 }, { "epoch": 0.005071, "grad_norm": 0.16447651386260986, "learning_rate": 1e-05, "loss": 0.0219, "step": 507100 }, { "epoch": 0.005072, "grad_norm": 0.17175589501857758, "learning_rate": 1e-05, "loss": 0.0224, "step": 507200 }, { "epoch": 0.005073, "grad_norm": 0.125766322016716, "learning_rate": 1e-05, "loss": 0.0224, "step": 507300 }, { "epoch": 0.005074, "grad_norm": 0.1601540595293045, "learning_rate": 1e-05, "loss": 0.0226, "step": 507400 }, { "epoch": 0.005075, "grad_norm": 0.13804326951503754, "learning_rate": 1e-05, "loss": 0.0221, "step": 507500 }, { "epoch": 0.005076, "grad_norm": 0.2553005814552307, "learning_rate": 1e-05, "loss": 0.0223, "step": 507600 }, { "epoch": 0.005077, "grad_norm": 0.19381675124168396, "learning_rate": 1e-05, "loss": 0.022, "step": 507700 }, { "epoch": 0.005078, "grad_norm": 0.2046412080526352, "learning_rate": 1e-05, "loss": 0.0222, "step": 507800 }, { "epoch": 0.005079, "grad_norm": 0.18402324616909027, "learning_rate": 1e-05, "loss": 0.0218, "step": 507900 }, { "epoch": 0.00508, "grad_norm": 0.20850765705108643, "learning_rate": 1e-05, "loss": 0.0225, "step": 508000 }, { "epoch": 0.005081, "grad_norm": 0.2958817780017853, "learning_rate": 1e-05, "loss": 0.0219, "step": 508100 }, { "epoch": 0.005082, "grad_norm": 0.22430017590522766, "learning_rate": 1e-05, "loss": 0.0218, "step": 508200 }, { "epoch": 0.005083, "grad_norm": 0.16673550009727478, "learning_rate": 1e-05, "loss": 0.0226, "step": 508300 }, { "epoch": 0.005084, "grad_norm": 0.2110511213541031, "learning_rate": 1e-05, "loss": 0.0216, "step": 508400 }, { "epoch": 0.005085, "grad_norm": 0.18565848469734192, "learning_rate": 1e-05, "loss": 0.022, "step": 508500 }, { "epoch": 0.005086, "grad_norm": 0.18346522748470306, "learning_rate": 1e-05, "loss": 0.0222, "step": 508600 }, { "epoch": 0.005087, "grad_norm": 0.23770560324192047, "learning_rate": 1e-05, "loss": 0.0224, "step": 508700 }, { "epoch": 0.005088, "grad_norm": 0.16799791157245636, "learning_rate": 1e-05, "loss": 0.0228, "step": 508800 }, { "epoch": 0.005089, "grad_norm": 0.15598882734775543, "learning_rate": 1e-05, "loss": 0.0221, "step": 508900 }, { "epoch": 0.00509, "grad_norm": 0.13898597657680511, "learning_rate": 1e-05, "loss": 0.0222, "step": 509000 }, { "epoch": 0.005091, "grad_norm": 0.23213304579257965, "learning_rate": 1e-05, "loss": 0.0222, "step": 509100 }, { "epoch": 0.005092, "grad_norm": 0.21119028329849243, "learning_rate": 1e-05, "loss": 0.0222, "step": 509200 }, { "epoch": 0.005093, "grad_norm": 0.202596053481102, "learning_rate": 1e-05, "loss": 0.0222, "step": 509300 }, { "epoch": 0.005094, "grad_norm": 0.23463867604732513, "learning_rate": 1e-05, "loss": 0.0225, "step": 509400 }, { "epoch": 0.005095, "grad_norm": 0.2577962279319763, "learning_rate": 1e-05, "loss": 0.0217, "step": 509500 }, { "epoch": 0.005096, "grad_norm": 0.18573550879955292, "learning_rate": 1e-05, "loss": 0.023, "step": 509600 }, { "epoch": 0.005097, "grad_norm": 0.15581920742988586, "learning_rate": 1e-05, "loss": 0.0225, "step": 509700 }, { "epoch": 0.005098, "grad_norm": 0.25723138451576233, "learning_rate": 1e-05, "loss": 0.022, "step": 509800 }, { "epoch": 0.005099, "grad_norm": 0.204372838139534, "learning_rate": 1e-05, "loss": 0.0221, "step": 509900 }, { "epoch": 0.0051, "grad_norm": 0.17827077209949493, "learning_rate": 1e-05, "loss": 0.0219, "step": 510000 }, { "epoch": 0.005101, "grad_norm": 0.15465669333934784, "learning_rate": 1e-05, "loss": 0.0221, "step": 510100 }, { "epoch": 0.005102, "grad_norm": 0.18871107697486877, "learning_rate": 1e-05, "loss": 0.0218, "step": 510200 }, { "epoch": 0.005103, "grad_norm": 0.19572384655475616, "learning_rate": 1e-05, "loss": 0.0219, "step": 510300 }, { "epoch": 0.005104, "grad_norm": 0.16794966161251068, "learning_rate": 1e-05, "loss": 0.0218, "step": 510400 }, { "epoch": 0.005105, "grad_norm": 0.14963942766189575, "learning_rate": 1e-05, "loss": 0.0223, "step": 510500 }, { "epoch": 0.005106, "grad_norm": 0.16831903159618378, "learning_rate": 1e-05, "loss": 0.0225, "step": 510600 }, { "epoch": 0.005107, "grad_norm": 0.15348482131958008, "learning_rate": 1e-05, "loss": 0.022, "step": 510700 }, { "epoch": 0.005108, "grad_norm": 0.14979903399944305, "learning_rate": 1e-05, "loss": 0.0219, "step": 510800 }, { "epoch": 0.005109, "grad_norm": 0.16510199010372162, "learning_rate": 1e-05, "loss": 0.0222, "step": 510900 }, { "epoch": 0.00511, "grad_norm": 0.16631245613098145, "learning_rate": 1e-05, "loss": 0.0219, "step": 511000 }, { "epoch": 0.005111, "grad_norm": 0.24561463296413422, "learning_rate": 1e-05, "loss": 0.0219, "step": 511100 }, { "epoch": 0.005112, "grad_norm": 0.2206975370645523, "learning_rate": 1e-05, "loss": 0.0222, "step": 511200 }, { "epoch": 0.005113, "grad_norm": 0.2151448279619217, "learning_rate": 1e-05, "loss": 0.022, "step": 511300 }, { "epoch": 0.005114, "grad_norm": 0.25566595792770386, "learning_rate": 1e-05, "loss": 0.0223, "step": 511400 }, { "epoch": 0.005115, "grad_norm": 0.17456898093223572, "learning_rate": 1e-05, "loss": 0.0222, "step": 511500 }, { "epoch": 0.005116, "grad_norm": 0.19400009512901306, "learning_rate": 1e-05, "loss": 0.0219, "step": 511600 }, { "epoch": 0.005117, "grad_norm": 0.1873835027217865, "learning_rate": 1e-05, "loss": 0.0223, "step": 511700 }, { "epoch": 0.005118, "grad_norm": 0.22489643096923828, "learning_rate": 1e-05, "loss": 0.022, "step": 511800 }, { "epoch": 0.005119, "grad_norm": 0.15325921773910522, "learning_rate": 1e-05, "loss": 0.0221, "step": 511900 }, { "epoch": 0.00512, "grad_norm": 0.18074475228786469, "learning_rate": 1e-05, "loss": 0.0218, "step": 512000 }, { "epoch": 0.005121, "grad_norm": 0.19041866064071655, "learning_rate": 1e-05, "loss": 0.0225, "step": 512100 }, { "epoch": 0.005122, "grad_norm": 0.15092197060585022, "learning_rate": 1e-05, "loss": 0.022, "step": 512200 }, { "epoch": 0.005123, "grad_norm": 0.24386700987815857, "learning_rate": 1e-05, "loss": 0.0219, "step": 512300 }, { "epoch": 0.005124, "grad_norm": 0.2866821587085724, "learning_rate": 1e-05, "loss": 0.0218, "step": 512400 }, { "epoch": 0.005125, "grad_norm": 0.19473174214363098, "learning_rate": 1e-05, "loss": 0.0223, "step": 512500 }, { "epoch": 0.005126, "grad_norm": 0.21506409347057343, "learning_rate": 1e-05, "loss": 0.0224, "step": 512600 }, { "epoch": 0.005127, "grad_norm": 0.18930940330028534, "learning_rate": 1e-05, "loss": 0.0219, "step": 512700 }, { "epoch": 0.005128, "grad_norm": 0.14341086149215698, "learning_rate": 1e-05, "loss": 0.0219, "step": 512800 }, { "epoch": 0.005129, "grad_norm": 0.3173481822013855, "learning_rate": 1e-05, "loss": 0.0222, "step": 512900 }, { "epoch": 0.00513, "grad_norm": 0.16340819001197815, "learning_rate": 1e-05, "loss": 0.0215, "step": 513000 }, { "epoch": 0.005131, "grad_norm": 0.211440771818161, "learning_rate": 1e-05, "loss": 0.0223, "step": 513100 }, { "epoch": 0.005132, "grad_norm": 0.23591311275959015, "learning_rate": 1e-05, "loss": 0.0222, "step": 513200 }, { "epoch": 0.005133, "grad_norm": 0.15995800495147705, "learning_rate": 1e-05, "loss": 0.0223, "step": 513300 }, { "epoch": 0.005134, "grad_norm": 0.14663799107074738, "learning_rate": 1e-05, "loss": 0.0221, "step": 513400 }, { "epoch": 0.005135, "grad_norm": 0.20810279250144958, "learning_rate": 1e-05, "loss": 0.0218, "step": 513500 }, { "epoch": 0.005136, "grad_norm": 0.15707986056804657, "learning_rate": 1e-05, "loss": 0.0214, "step": 513600 }, { "epoch": 0.005137, "grad_norm": 0.2074098438024521, "learning_rate": 1e-05, "loss": 0.0224, "step": 513700 }, { "epoch": 0.005138, "grad_norm": 0.17800050973892212, "learning_rate": 1e-05, "loss": 0.0225, "step": 513800 }, { "epoch": 0.005139, "grad_norm": 0.21399565041065216, "learning_rate": 1e-05, "loss": 0.0225, "step": 513900 }, { "epoch": 0.00514, "grad_norm": 0.11772659420967102, "learning_rate": 1e-05, "loss": 0.0217, "step": 514000 }, { "epoch": 0.005141, "grad_norm": 0.2189042717218399, "learning_rate": 1e-05, "loss": 0.0221, "step": 514100 }, { "epoch": 0.005142, "grad_norm": 0.17448017001152039, "learning_rate": 1e-05, "loss": 0.0219, "step": 514200 }, { "epoch": 0.005143, "grad_norm": 0.19326718151569366, "learning_rate": 1e-05, "loss": 0.0223, "step": 514300 }, { "epoch": 0.005144, "grad_norm": 0.14584589004516602, "learning_rate": 1e-05, "loss": 0.0219, "step": 514400 }, { "epoch": 0.005145, "grad_norm": 0.1901099979877472, "learning_rate": 1e-05, "loss": 0.0221, "step": 514500 }, { "epoch": 0.005146, "grad_norm": 0.20117680728435516, "learning_rate": 1e-05, "loss": 0.022, "step": 514600 }, { "epoch": 0.005147, "grad_norm": 0.18487168848514557, "learning_rate": 1e-05, "loss": 0.0222, "step": 514700 }, { "epoch": 0.005148, "grad_norm": 0.18408960103988647, "learning_rate": 1e-05, "loss": 0.0224, "step": 514800 }, { "epoch": 0.005149, "grad_norm": 0.19492191076278687, "learning_rate": 1e-05, "loss": 0.0214, "step": 514900 }, { "epoch": 0.00515, "grad_norm": 0.11989197880029678, "learning_rate": 1e-05, "loss": 0.0213, "step": 515000 }, { "epoch": 0.005151, "grad_norm": 0.1762372851371765, "learning_rate": 1e-05, "loss": 0.0218, "step": 515100 }, { "epoch": 0.005152, "grad_norm": 0.17749446630477905, "learning_rate": 1e-05, "loss": 0.0219, "step": 515200 }, { "epoch": 0.005153, "grad_norm": 0.26825639605522156, "learning_rate": 1e-05, "loss": 0.0216, "step": 515300 }, { "epoch": 0.005154, "grad_norm": 0.14416693150997162, "learning_rate": 1e-05, "loss": 0.0224, "step": 515400 }, { "epoch": 0.005155, "grad_norm": 0.21301916241645813, "learning_rate": 1e-05, "loss": 0.022, "step": 515500 }, { "epoch": 0.005156, "grad_norm": 0.15521948039531708, "learning_rate": 1e-05, "loss": 0.0219, "step": 515600 }, { "epoch": 0.005157, "grad_norm": 0.16343387961387634, "learning_rate": 1e-05, "loss": 0.022, "step": 515700 }, { "epoch": 0.005158, "grad_norm": 0.1579238772392273, "learning_rate": 1e-05, "loss": 0.0225, "step": 515800 }, { "epoch": 0.005159, "grad_norm": 0.21261273324489594, "learning_rate": 1e-05, "loss": 0.0218, "step": 515900 }, { "epoch": 0.00516, "grad_norm": 0.20317254960536957, "learning_rate": 1e-05, "loss": 0.0221, "step": 516000 }, { "epoch": 0.005161, "grad_norm": 0.2413691133260727, "learning_rate": 1e-05, "loss": 0.0223, "step": 516100 }, { "epoch": 0.005162, "grad_norm": 0.23964151740074158, "learning_rate": 1e-05, "loss": 0.0222, "step": 516200 }, { "epoch": 0.005163, "grad_norm": 0.1514037400484085, "learning_rate": 1e-05, "loss": 0.0216, "step": 516300 }, { "epoch": 0.005164, "grad_norm": 0.131228968501091, "learning_rate": 1e-05, "loss": 0.0215, "step": 516400 }, { "epoch": 0.005165, "grad_norm": 0.20677104592323303, "learning_rate": 1e-05, "loss": 0.0222, "step": 516500 }, { "epoch": 0.005166, "grad_norm": 0.17464856803417206, "learning_rate": 1e-05, "loss": 0.0216, "step": 516600 }, { "epoch": 0.005167, "grad_norm": 0.1511344313621521, "learning_rate": 1e-05, "loss": 0.0223, "step": 516700 }, { "epoch": 0.005168, "grad_norm": 0.1745045781135559, "learning_rate": 1e-05, "loss": 0.0217, "step": 516800 }, { "epoch": 0.005169, "grad_norm": 0.17399461567401886, "learning_rate": 1e-05, "loss": 0.0221, "step": 516900 }, { "epoch": 0.00517, "grad_norm": 0.23565885424613953, "learning_rate": 1e-05, "loss": 0.0215, "step": 517000 }, { "epoch": 0.005171, "grad_norm": 0.11355145275592804, "learning_rate": 1e-05, "loss": 0.0222, "step": 517100 }, { "epoch": 0.005172, "grad_norm": 0.15561695396900177, "learning_rate": 1e-05, "loss": 0.0215, "step": 517200 }, { "epoch": 0.005173, "grad_norm": 0.18234020471572876, "learning_rate": 1e-05, "loss": 0.0222, "step": 517300 }, { "epoch": 0.005174, "grad_norm": 0.20115217566490173, "learning_rate": 1e-05, "loss": 0.0223, "step": 517400 }, { "epoch": 0.005175, "grad_norm": 0.1983947902917862, "learning_rate": 1e-05, "loss": 0.0222, "step": 517500 }, { "epoch": 0.005176, "grad_norm": 0.2094590961933136, "learning_rate": 1e-05, "loss": 0.0223, "step": 517600 }, { "epoch": 0.005177, "grad_norm": 0.16946837306022644, "learning_rate": 1e-05, "loss": 0.0216, "step": 517700 }, { "epoch": 0.005178, "grad_norm": 0.23574481904506683, "learning_rate": 1e-05, "loss": 0.0225, "step": 517800 }, { "epoch": 0.005179, "grad_norm": 0.16094711422920227, "learning_rate": 1e-05, "loss": 0.0217, "step": 517900 }, { "epoch": 0.00518, "grad_norm": 0.21692928671836853, "learning_rate": 1e-05, "loss": 0.0221, "step": 518000 }, { "epoch": 0.005181, "grad_norm": 0.17372220754623413, "learning_rate": 1e-05, "loss": 0.0217, "step": 518100 }, { "epoch": 0.005182, "grad_norm": 0.16046153008937836, "learning_rate": 1e-05, "loss": 0.0222, "step": 518200 }, { "epoch": 0.005183, "grad_norm": 0.19216546416282654, "learning_rate": 1e-05, "loss": 0.0221, "step": 518300 }, { "epoch": 0.005184, "grad_norm": 0.20505785942077637, "learning_rate": 1e-05, "loss": 0.0219, "step": 518400 }, { "epoch": 0.005185, "grad_norm": 0.15830056369304657, "learning_rate": 1e-05, "loss": 0.0216, "step": 518500 }, { "epoch": 0.005186, "grad_norm": 0.19777771830558777, "learning_rate": 1e-05, "loss": 0.0219, "step": 518600 }, { "epoch": 0.005187, "grad_norm": 0.20698802173137665, "learning_rate": 1e-05, "loss": 0.0218, "step": 518700 }, { "epoch": 0.005188, "grad_norm": 0.28006476163864136, "learning_rate": 1e-05, "loss": 0.0221, "step": 518800 }, { "epoch": 0.005189, "grad_norm": 0.23197554051876068, "learning_rate": 1e-05, "loss": 0.0212, "step": 518900 }, { "epoch": 0.00519, "grad_norm": 0.1429504007101059, "learning_rate": 1e-05, "loss": 0.0223, "step": 519000 }, { "epoch": 0.005191, "grad_norm": 0.20026585459709167, "learning_rate": 1e-05, "loss": 0.0217, "step": 519100 }, { "epoch": 0.005192, "grad_norm": 0.2138204723596573, "learning_rate": 1e-05, "loss": 0.022, "step": 519200 }, { "epoch": 0.005193, "grad_norm": 0.16531355679035187, "learning_rate": 1e-05, "loss": 0.0217, "step": 519300 }, { "epoch": 0.005194, "grad_norm": 0.1625499427318573, "learning_rate": 1e-05, "loss": 0.0221, "step": 519400 }, { "epoch": 0.005195, "grad_norm": 0.19142067432403564, "learning_rate": 1e-05, "loss": 0.0215, "step": 519500 }, { "epoch": 0.005196, "grad_norm": 0.19860899448394775, "learning_rate": 1e-05, "loss": 0.0219, "step": 519600 }, { "epoch": 0.005197, "grad_norm": 0.1492001712322235, "learning_rate": 1e-05, "loss": 0.0223, "step": 519700 }, { "epoch": 0.005198, "grad_norm": 0.3342857360839844, "learning_rate": 1e-05, "loss": 0.022, "step": 519800 }, { "epoch": 0.005199, "grad_norm": 0.13614535331726074, "learning_rate": 1e-05, "loss": 0.0221, "step": 519900 }, { "epoch": 0.0052, "grad_norm": 0.18393531441688538, "learning_rate": 1e-05, "loss": 0.0221, "step": 520000 }, { "epoch": 0.0052, "eval_loss": 0.019574856385588646, "eval_runtime": 169.5606, "eval_samples_per_second": 294.88, "eval_steps_per_second": 18.43, "step": 520000 }, { "epoch": 0.005201, "grad_norm": 0.14153896272182465, "learning_rate": 1e-05, "loss": 0.022, "step": 520100 }, { "epoch": 0.005202, "grad_norm": 0.14659392833709717, "learning_rate": 1e-05, "loss": 0.0221, "step": 520200 }, { "epoch": 0.005203, "grad_norm": 0.19354002177715302, "learning_rate": 1e-05, "loss": 0.0219, "step": 520300 }, { "epoch": 0.005204, "grad_norm": 0.18926972150802612, "learning_rate": 1e-05, "loss": 0.0218, "step": 520400 }, { "epoch": 0.005205, "grad_norm": 0.22269970178604126, "learning_rate": 1e-05, "loss": 0.0218, "step": 520500 }, { "epoch": 0.005206, "grad_norm": 0.17003971338272095, "learning_rate": 1e-05, "loss": 0.0222, "step": 520600 }, { "epoch": 0.005207, "grad_norm": 0.1744411140680313, "learning_rate": 1e-05, "loss": 0.0219, "step": 520700 }, { "epoch": 0.005208, "grad_norm": 0.16201463341712952, "learning_rate": 1e-05, "loss": 0.0224, "step": 520800 }, { "epoch": 0.005209, "grad_norm": 0.16160345077514648, "learning_rate": 1e-05, "loss": 0.0216, "step": 520900 }, { "epoch": 0.00521, "grad_norm": 0.21760736405849457, "learning_rate": 1e-05, "loss": 0.0224, "step": 521000 }, { "epoch": 0.005211, "grad_norm": 0.22604085505008698, "learning_rate": 1e-05, "loss": 0.022, "step": 521100 }, { "epoch": 0.005212, "grad_norm": 0.16676940023899078, "learning_rate": 1e-05, "loss": 0.0223, "step": 521200 }, { "epoch": 0.005213, "grad_norm": 0.14291612803936005, "learning_rate": 1e-05, "loss": 0.0219, "step": 521300 }, { "epoch": 0.005214, "grad_norm": 0.14447656273841858, "learning_rate": 1e-05, "loss": 0.0217, "step": 521400 }, { "epoch": 0.005215, "grad_norm": 0.17757900059223175, "learning_rate": 1e-05, "loss": 0.0222, "step": 521500 }, { "epoch": 0.005216, "grad_norm": 0.21780630946159363, "learning_rate": 1e-05, "loss": 0.0224, "step": 521600 }, { "epoch": 0.005217, "grad_norm": 0.17200031876564026, "learning_rate": 1e-05, "loss": 0.0218, "step": 521700 }, { "epoch": 0.005218, "grad_norm": 0.26114028692245483, "learning_rate": 1e-05, "loss": 0.0223, "step": 521800 }, { "epoch": 0.005219, "grad_norm": 0.1326814740896225, "learning_rate": 1e-05, "loss": 0.0219, "step": 521900 }, { "epoch": 0.00522, "grad_norm": 0.20146451890468597, "learning_rate": 1e-05, "loss": 0.0215, "step": 522000 }, { "epoch": 0.005221, "grad_norm": 0.23365569114685059, "learning_rate": 1e-05, "loss": 0.0221, "step": 522100 }, { "epoch": 0.005222, "grad_norm": 0.1708264797925949, "learning_rate": 1e-05, "loss": 0.0219, "step": 522200 }, { "epoch": 0.005223, "grad_norm": 0.15147563815116882, "learning_rate": 1e-05, "loss": 0.0221, "step": 522300 }, { "epoch": 0.005224, "grad_norm": 0.17956939339637756, "learning_rate": 1e-05, "loss": 0.0213, "step": 522400 }, { "epoch": 0.005225, "grad_norm": 0.18070247769355774, "learning_rate": 1e-05, "loss": 0.022, "step": 522500 }, { "epoch": 0.005226, "grad_norm": 0.21242475509643555, "learning_rate": 1e-05, "loss": 0.0224, "step": 522600 }, { "epoch": 0.005227, "grad_norm": 0.2398556023836136, "learning_rate": 1e-05, "loss": 0.0212, "step": 522700 }, { "epoch": 0.005228, "grad_norm": 0.17049553990364075, "learning_rate": 1e-05, "loss": 0.022, "step": 522800 }, { "epoch": 0.005229, "grad_norm": 0.18438705801963806, "learning_rate": 1e-05, "loss": 0.0219, "step": 522900 }, { "epoch": 0.00523, "grad_norm": 0.18207408487796783, "learning_rate": 1e-05, "loss": 0.0218, "step": 523000 }, { "epoch": 0.005231, "grad_norm": 0.1390789896249771, "learning_rate": 1e-05, "loss": 0.0218, "step": 523100 }, { "epoch": 0.005232, "grad_norm": 0.17510849237442017, "learning_rate": 1e-05, "loss": 0.0221, "step": 523200 }, { "epoch": 0.005233, "grad_norm": 0.16017043590545654, "learning_rate": 1e-05, "loss": 0.0224, "step": 523300 }, { "epoch": 0.005234, "grad_norm": 0.20636077225208282, "learning_rate": 1e-05, "loss": 0.022, "step": 523400 }, { "epoch": 0.005235, "grad_norm": 0.18318480253219604, "learning_rate": 1e-05, "loss": 0.0215, "step": 523500 }, { "epoch": 0.005236, "grad_norm": 0.1799270659685135, "learning_rate": 1e-05, "loss": 0.0219, "step": 523600 }, { "epoch": 0.005237, "grad_norm": 0.1939309537410736, "learning_rate": 1e-05, "loss": 0.022, "step": 523700 }, { "epoch": 0.005238, "grad_norm": 0.17326873540878296, "learning_rate": 1e-05, "loss": 0.0223, "step": 523800 }, { "epoch": 0.005239, "grad_norm": 0.14989235997200012, "learning_rate": 1e-05, "loss": 0.0221, "step": 523900 }, { "epoch": 0.00524, "grad_norm": 0.16989783942699432, "learning_rate": 1e-05, "loss": 0.0225, "step": 524000 }, { "epoch": 0.005241, "grad_norm": 0.15059509873390198, "learning_rate": 1e-05, "loss": 0.0222, "step": 524100 }, { "epoch": 0.005242, "grad_norm": 0.2540434002876282, "learning_rate": 1e-05, "loss": 0.0221, "step": 524200 }, { "epoch": 0.005243, "grad_norm": 0.2519684433937073, "learning_rate": 1e-05, "loss": 0.0225, "step": 524300 }, { "epoch": 0.005244, "grad_norm": 0.1979115605354309, "learning_rate": 1e-05, "loss": 0.0219, "step": 524400 }, { "epoch": 0.005245, "grad_norm": 0.1637943536043167, "learning_rate": 1e-05, "loss": 0.0217, "step": 524500 }, { "epoch": 0.005246, "grad_norm": 0.18622051179409027, "learning_rate": 1e-05, "loss": 0.0221, "step": 524600 }, { "epoch": 0.005247, "grad_norm": 0.1880505234003067, "learning_rate": 1e-05, "loss": 0.0217, "step": 524700 }, { "epoch": 0.005248, "grad_norm": 0.20128799974918365, "learning_rate": 1e-05, "loss": 0.0224, "step": 524800 }, { "epoch": 0.005249, "grad_norm": 0.1761418730020523, "learning_rate": 1e-05, "loss": 0.0216, "step": 524900 }, { "epoch": 0.00525, "grad_norm": 0.23777350783348083, "learning_rate": 1e-05, "loss": 0.0215, "step": 525000 }, { "epoch": 0.005251, "grad_norm": 0.16149422526359558, "learning_rate": 1e-05, "loss": 0.0221, "step": 525100 }, { "epoch": 0.005252, "grad_norm": 0.18251881003379822, "learning_rate": 1e-05, "loss": 0.0224, "step": 525200 }, { "epoch": 0.005253, "grad_norm": 0.18976335227489471, "learning_rate": 1e-05, "loss": 0.0215, "step": 525300 }, { "epoch": 0.005254, "grad_norm": 0.15596811473369598, "learning_rate": 1e-05, "loss": 0.0223, "step": 525400 }, { "epoch": 0.005255, "grad_norm": 0.20789046585559845, "learning_rate": 1e-05, "loss": 0.0216, "step": 525500 }, { "epoch": 0.005256, "grad_norm": 0.14423248171806335, "learning_rate": 1e-05, "loss": 0.0216, "step": 525600 }, { "epoch": 0.005257, "grad_norm": 0.16381679475307465, "learning_rate": 1e-05, "loss": 0.022, "step": 525700 }, { "epoch": 0.005258, "grad_norm": 0.15940460562705994, "learning_rate": 1e-05, "loss": 0.0218, "step": 525800 }, { "epoch": 0.005259, "grad_norm": 0.2061987817287445, "learning_rate": 1e-05, "loss": 0.0214, "step": 525900 }, { "epoch": 0.00526, "grad_norm": 0.24444112181663513, "learning_rate": 1e-05, "loss": 0.0223, "step": 526000 }, { "epoch": 0.005261, "grad_norm": 0.18422581255435944, "learning_rate": 1e-05, "loss": 0.0222, "step": 526100 }, { "epoch": 0.005262, "grad_norm": 0.16724221408367157, "learning_rate": 1e-05, "loss": 0.0223, "step": 526200 }, { "epoch": 0.005263, "grad_norm": 0.2926560938358307, "learning_rate": 1e-05, "loss": 0.0217, "step": 526300 }, { "epoch": 0.005264, "grad_norm": 0.1952270120382309, "learning_rate": 1e-05, "loss": 0.022, "step": 526400 }, { "epoch": 0.005265, "grad_norm": 0.18297021090984344, "learning_rate": 1e-05, "loss": 0.0224, "step": 526500 }, { "epoch": 0.005266, "grad_norm": 0.15614698827266693, "learning_rate": 1e-05, "loss": 0.0215, "step": 526600 }, { "epoch": 0.005267, "grad_norm": 0.2529445290565491, "learning_rate": 1e-05, "loss": 0.0223, "step": 526700 }, { "epoch": 0.005268, "grad_norm": 0.23708373308181763, "learning_rate": 1e-05, "loss": 0.022, "step": 526800 }, { "epoch": 0.005269, "grad_norm": 0.12588979303836823, "learning_rate": 1e-05, "loss": 0.0216, "step": 526900 }, { "epoch": 0.00527, "grad_norm": 0.13134446740150452, "learning_rate": 1e-05, "loss": 0.0215, "step": 527000 }, { "epoch": 0.005271, "grad_norm": 0.24277684092521667, "learning_rate": 1e-05, "loss": 0.0217, "step": 527100 }, { "epoch": 0.005272, "grad_norm": 0.17399713397026062, "learning_rate": 1e-05, "loss": 0.0222, "step": 527200 }, { "epoch": 0.005273, "grad_norm": 0.20243705809116364, "learning_rate": 1e-05, "loss": 0.0222, "step": 527300 }, { "epoch": 0.005274, "grad_norm": 0.1747864931821823, "learning_rate": 1e-05, "loss": 0.0221, "step": 527400 }, { "epoch": 0.005275, "grad_norm": 0.1807374358177185, "learning_rate": 1e-05, "loss": 0.0224, "step": 527500 }, { "epoch": 0.005276, "grad_norm": 0.1972818523645401, "learning_rate": 1e-05, "loss": 0.0215, "step": 527600 }, { "epoch": 0.005277, "grad_norm": 0.2027268409729004, "learning_rate": 1e-05, "loss": 0.0215, "step": 527700 }, { "epoch": 0.005278, "grad_norm": 0.17621658742427826, "learning_rate": 1e-05, "loss": 0.022, "step": 527800 }, { "epoch": 0.005279, "grad_norm": 0.19458863139152527, "learning_rate": 1e-05, "loss": 0.0216, "step": 527900 }, { "epoch": 0.00528, "grad_norm": 0.20512939989566803, "learning_rate": 1e-05, "loss": 0.0217, "step": 528000 }, { "epoch": 0.005281, "grad_norm": 0.1876089721918106, "learning_rate": 1e-05, "loss": 0.0219, "step": 528100 }, { "epoch": 0.005282, "grad_norm": 0.16528773307800293, "learning_rate": 1e-05, "loss": 0.0219, "step": 528200 }, { "epoch": 0.005283, "grad_norm": 0.14041711390018463, "learning_rate": 1e-05, "loss": 0.0217, "step": 528300 }, { "epoch": 0.005284, "grad_norm": 0.17202499508857727, "learning_rate": 1e-05, "loss": 0.0221, "step": 528400 }, { "epoch": 0.005285, "grad_norm": 0.21118849515914917, "learning_rate": 1e-05, "loss": 0.0221, "step": 528500 }, { "epoch": 0.005286, "grad_norm": 0.18647988140583038, "learning_rate": 1e-05, "loss": 0.0215, "step": 528600 }, { "epoch": 0.005287, "grad_norm": 0.18588325381278992, "learning_rate": 1e-05, "loss": 0.022, "step": 528700 }, { "epoch": 0.005288, "grad_norm": 0.21061885356903076, "learning_rate": 1e-05, "loss": 0.0225, "step": 528800 }, { "epoch": 0.005289, "grad_norm": 0.17643000185489655, "learning_rate": 1e-05, "loss": 0.022, "step": 528900 }, { "epoch": 0.00529, "grad_norm": 0.1705438196659088, "learning_rate": 1e-05, "loss": 0.0217, "step": 529000 }, { "epoch": 0.005291, "grad_norm": 0.14537057280540466, "learning_rate": 1e-05, "loss": 0.0216, "step": 529100 }, { "epoch": 0.005292, "grad_norm": 0.1494985669851303, "learning_rate": 1e-05, "loss": 0.0218, "step": 529200 }, { "epoch": 0.005293, "grad_norm": 0.15359173715114594, "learning_rate": 1e-05, "loss": 0.0221, "step": 529300 }, { "epoch": 0.005294, "grad_norm": 0.1544751226902008, "learning_rate": 1e-05, "loss": 0.0216, "step": 529400 }, { "epoch": 0.005295, "grad_norm": 0.13357725739479065, "learning_rate": 1e-05, "loss": 0.0218, "step": 529500 }, { "epoch": 0.005296, "grad_norm": 0.25733187794685364, "learning_rate": 1e-05, "loss": 0.0224, "step": 529600 }, { "epoch": 0.005297, "grad_norm": 0.18889769911766052, "learning_rate": 1e-05, "loss": 0.022, "step": 529700 }, { "epoch": 0.005298, "grad_norm": 0.1301455944776535, "learning_rate": 1e-05, "loss": 0.022, "step": 529800 }, { "epoch": 0.005299, "grad_norm": 0.231567844748497, "learning_rate": 1e-05, "loss": 0.0223, "step": 529900 }, { "epoch": 0.0053, "grad_norm": 0.2154947966337204, "learning_rate": 1e-05, "loss": 0.0217, "step": 530000 }, { "epoch": 0.005301, "grad_norm": 0.16928498446941376, "learning_rate": 1e-05, "loss": 0.0217, "step": 530100 }, { "epoch": 0.005302, "grad_norm": 0.1706966906785965, "learning_rate": 1e-05, "loss": 0.0218, "step": 530200 }, { "epoch": 0.005303, "grad_norm": 0.1930660903453827, "learning_rate": 1e-05, "loss": 0.0221, "step": 530300 }, { "epoch": 0.005304, "grad_norm": 0.14719197154045105, "learning_rate": 1e-05, "loss": 0.0217, "step": 530400 }, { "epoch": 0.005305, "grad_norm": 0.1434042900800705, "learning_rate": 1e-05, "loss": 0.0214, "step": 530500 }, { "epoch": 0.005306, "grad_norm": 0.18489794433116913, "learning_rate": 1e-05, "loss": 0.0218, "step": 530600 }, { "epoch": 0.005307, "grad_norm": 0.26572152972221375, "learning_rate": 1e-05, "loss": 0.0215, "step": 530700 }, { "epoch": 0.005308, "grad_norm": 0.17698262631893158, "learning_rate": 1e-05, "loss": 0.0221, "step": 530800 }, { "epoch": 0.005309, "grad_norm": 0.21425434947013855, "learning_rate": 1e-05, "loss": 0.0216, "step": 530900 }, { "epoch": 0.00531, "grad_norm": 0.20463988184928894, "learning_rate": 1e-05, "loss": 0.0216, "step": 531000 }, { "epoch": 0.005311, "grad_norm": 0.17625992000102997, "learning_rate": 1e-05, "loss": 0.0219, "step": 531100 }, { "epoch": 0.005312, "grad_norm": 0.1764143407344818, "learning_rate": 1e-05, "loss": 0.022, "step": 531200 }, { "epoch": 0.005313, "grad_norm": 0.19569841027259827, "learning_rate": 1e-05, "loss": 0.0219, "step": 531300 }, { "epoch": 0.005314, "grad_norm": 0.1490900218486786, "learning_rate": 1e-05, "loss": 0.022, "step": 531400 }, { "epoch": 0.005315, "grad_norm": 0.15754862129688263, "learning_rate": 1e-05, "loss": 0.0216, "step": 531500 }, { "epoch": 0.005316, "grad_norm": 0.16856679320335388, "learning_rate": 1e-05, "loss": 0.0224, "step": 531600 }, { "epoch": 0.005317, "grad_norm": 0.1581338346004486, "learning_rate": 1e-05, "loss": 0.0213, "step": 531700 }, { "epoch": 0.005318, "grad_norm": 0.19169659912586212, "learning_rate": 1e-05, "loss": 0.0222, "step": 531800 }, { "epoch": 0.005319, "grad_norm": 0.15235714614391327, "learning_rate": 1e-05, "loss": 0.0218, "step": 531900 }, { "epoch": 0.00532, "grad_norm": 0.5544143915176392, "learning_rate": 1e-05, "loss": 0.0224, "step": 532000 }, { "epoch": 0.005321, "grad_norm": 0.16942846775054932, "learning_rate": 1e-05, "loss": 0.0215, "step": 532100 }, { "epoch": 0.005322, "grad_norm": 0.1605413407087326, "learning_rate": 1e-05, "loss": 0.0214, "step": 532200 }, { "epoch": 0.005323, "grad_norm": 0.14669598639011383, "learning_rate": 1e-05, "loss": 0.0221, "step": 532300 }, { "epoch": 0.005324, "grad_norm": 0.15687952935695648, "learning_rate": 1e-05, "loss": 0.0215, "step": 532400 }, { "epoch": 0.005325, "grad_norm": 0.2558515965938568, "learning_rate": 1e-05, "loss": 0.0216, "step": 532500 }, { "epoch": 0.005326, "grad_norm": 0.18853884935379028, "learning_rate": 1e-05, "loss": 0.022, "step": 532600 }, { "epoch": 0.005327, "grad_norm": 0.15048526227474213, "learning_rate": 1e-05, "loss": 0.0218, "step": 532700 }, { "epoch": 0.005328, "grad_norm": 0.17215827107429504, "learning_rate": 1e-05, "loss": 0.0218, "step": 532800 }, { "epoch": 0.005329, "grad_norm": 0.1648462861776352, "learning_rate": 1e-05, "loss": 0.022, "step": 532900 }, { "epoch": 0.00533, "grad_norm": 0.21057412028312683, "learning_rate": 1e-05, "loss": 0.0218, "step": 533000 }, { "epoch": 0.005331, "grad_norm": 0.14689110219478607, "learning_rate": 1e-05, "loss": 0.0219, "step": 533100 }, { "epoch": 0.005332, "grad_norm": 0.13163691759109497, "learning_rate": 1e-05, "loss": 0.0213, "step": 533200 }, { "epoch": 0.005333, "grad_norm": 0.20095907151699066, "learning_rate": 1e-05, "loss": 0.0221, "step": 533300 }, { "epoch": 0.005334, "grad_norm": 0.15936681628227234, "learning_rate": 1e-05, "loss": 0.0218, "step": 533400 }, { "epoch": 0.005335, "grad_norm": 0.17714296281337738, "learning_rate": 1e-05, "loss": 0.022, "step": 533500 }, { "epoch": 0.005336, "grad_norm": 0.19105540215969086, "learning_rate": 1e-05, "loss": 0.0215, "step": 533600 }, { "epoch": 0.005337, "grad_norm": 0.1695810854434967, "learning_rate": 1e-05, "loss": 0.0217, "step": 533700 }, { "epoch": 0.005338, "grad_norm": 0.17089644074440002, "learning_rate": 1e-05, "loss": 0.0216, "step": 533800 }, { "epoch": 0.005339, "grad_norm": 0.177419975399971, "learning_rate": 1e-05, "loss": 0.0223, "step": 533900 }, { "epoch": 0.00534, "grad_norm": 0.19726207852363586, "learning_rate": 1e-05, "loss": 0.0222, "step": 534000 }, { "epoch": 0.005341, "grad_norm": 0.16183669865131378, "learning_rate": 1e-05, "loss": 0.0222, "step": 534100 }, { "epoch": 0.005342, "grad_norm": 0.1784578561782837, "learning_rate": 1e-05, "loss": 0.0218, "step": 534200 }, { "epoch": 0.005343, "grad_norm": 0.1992739886045456, "learning_rate": 1e-05, "loss": 0.0221, "step": 534300 }, { "epoch": 0.005344, "grad_norm": 0.1646251380443573, "learning_rate": 1e-05, "loss": 0.0219, "step": 534400 }, { "epoch": 0.005345, "grad_norm": 0.16809546947479248, "learning_rate": 1e-05, "loss": 0.0217, "step": 534500 }, { "epoch": 0.005346, "grad_norm": 0.13951413333415985, "learning_rate": 1e-05, "loss": 0.0213, "step": 534600 }, { "epoch": 0.005347, "grad_norm": 0.155339777469635, "learning_rate": 1e-05, "loss": 0.0217, "step": 534700 }, { "epoch": 0.005348, "grad_norm": 0.16363966464996338, "learning_rate": 1e-05, "loss": 0.0212, "step": 534800 }, { "epoch": 0.005349, "grad_norm": 0.20155709981918335, "learning_rate": 1e-05, "loss": 0.0218, "step": 534900 }, { "epoch": 0.00535, "grad_norm": 0.18331249058246613, "learning_rate": 1e-05, "loss": 0.022, "step": 535000 }, { "epoch": 0.005351, "grad_norm": 0.19324883818626404, "learning_rate": 1e-05, "loss": 0.0217, "step": 535100 }, { "epoch": 0.005352, "grad_norm": 0.17284339666366577, "learning_rate": 1e-05, "loss": 0.022, "step": 535200 }, { "epoch": 0.005353, "grad_norm": 0.1740754395723343, "learning_rate": 1e-05, "loss": 0.0217, "step": 535300 }, { "epoch": 0.005354, "grad_norm": 0.18658244609832764, "learning_rate": 1e-05, "loss": 0.0215, "step": 535400 }, { "epoch": 0.005355, "grad_norm": 0.19476573169231415, "learning_rate": 1e-05, "loss": 0.0217, "step": 535500 }, { "epoch": 0.005356, "grad_norm": 0.1563701331615448, "learning_rate": 1e-05, "loss": 0.0221, "step": 535600 }, { "epoch": 0.005357, "grad_norm": 0.20779448747634888, "learning_rate": 1e-05, "loss": 0.0216, "step": 535700 }, { "epoch": 0.005358, "grad_norm": 0.16096803545951843, "learning_rate": 1e-05, "loss": 0.0218, "step": 535800 }, { "epoch": 0.005359, "grad_norm": 0.15284644067287445, "learning_rate": 1e-05, "loss": 0.0215, "step": 535900 }, { "epoch": 0.00536, "grad_norm": 0.1640775501728058, "learning_rate": 1e-05, "loss": 0.0214, "step": 536000 }, { "epoch": 0.005361, "grad_norm": 0.19330614805221558, "learning_rate": 1e-05, "loss": 0.022, "step": 536100 }, { "epoch": 0.005362, "grad_norm": 0.3002602159976959, "learning_rate": 1e-05, "loss": 0.0216, "step": 536200 }, { "epoch": 0.005363, "grad_norm": 0.21634088456630707, "learning_rate": 1e-05, "loss": 0.0217, "step": 536300 }, { "epoch": 0.005364, "grad_norm": 0.19622410833835602, "learning_rate": 1e-05, "loss": 0.0216, "step": 536400 }, { "epoch": 0.005365, "grad_norm": 0.13677527010440826, "learning_rate": 1e-05, "loss": 0.0218, "step": 536500 }, { "epoch": 0.005366, "grad_norm": 0.15425297617912292, "learning_rate": 1e-05, "loss": 0.0221, "step": 536600 }, { "epoch": 0.005367, "grad_norm": 0.15366023778915405, "learning_rate": 1e-05, "loss": 0.0221, "step": 536700 }, { "epoch": 0.005368, "grad_norm": 0.14171023666858673, "learning_rate": 1e-05, "loss": 0.0213, "step": 536800 }, { "epoch": 0.005369, "grad_norm": 0.16111312806606293, "learning_rate": 1e-05, "loss": 0.0216, "step": 536900 }, { "epoch": 0.00537, "grad_norm": 0.18236468732357025, "learning_rate": 1e-05, "loss": 0.0213, "step": 537000 }, { "epoch": 0.005371, "grad_norm": 0.2594356834888458, "learning_rate": 1e-05, "loss": 0.0219, "step": 537100 }, { "epoch": 0.005372, "grad_norm": 0.16822724044322968, "learning_rate": 1e-05, "loss": 0.0216, "step": 537200 }, { "epoch": 0.005373, "grad_norm": 0.1913599669933319, "learning_rate": 1e-05, "loss": 0.0215, "step": 537300 }, { "epoch": 0.005374, "grad_norm": 0.1969166249036789, "learning_rate": 1e-05, "loss": 0.0217, "step": 537400 }, { "epoch": 0.005375, "grad_norm": 0.23408100008964539, "learning_rate": 1e-05, "loss": 0.0208, "step": 537500 }, { "epoch": 0.005376, "grad_norm": 0.16076628863811493, "learning_rate": 1e-05, "loss": 0.0221, "step": 537600 }, { "epoch": 0.005377, "grad_norm": 0.16735385358333588, "learning_rate": 1e-05, "loss": 0.0214, "step": 537700 }, { "epoch": 0.005378, "grad_norm": 0.1583266705274582, "learning_rate": 1e-05, "loss": 0.0219, "step": 537800 }, { "epoch": 0.005379, "grad_norm": 0.2141857147216797, "learning_rate": 1e-05, "loss": 0.0213, "step": 537900 }, { "epoch": 0.00538, "grad_norm": 0.15018931031227112, "learning_rate": 1e-05, "loss": 0.0214, "step": 538000 }, { "epoch": 0.005381, "grad_norm": 0.17910566926002502, "learning_rate": 1e-05, "loss": 0.0213, "step": 538100 }, { "epoch": 0.005382, "grad_norm": 0.16434547305107117, "learning_rate": 1e-05, "loss": 0.0216, "step": 538200 }, { "epoch": 0.005383, "grad_norm": 0.13449890911579132, "learning_rate": 1e-05, "loss": 0.0215, "step": 538300 }, { "epoch": 0.005384, "grad_norm": 0.42232054471969604, "learning_rate": 1e-05, "loss": 0.0214, "step": 538400 }, { "epoch": 0.005385, "grad_norm": 0.19684070348739624, "learning_rate": 1e-05, "loss": 0.0222, "step": 538500 }, { "epoch": 0.005386, "grad_norm": 0.1957356482744217, "learning_rate": 1e-05, "loss": 0.0219, "step": 538600 }, { "epoch": 0.005387, "grad_norm": 0.20746099948883057, "learning_rate": 1e-05, "loss": 0.0218, "step": 538700 }, { "epoch": 0.005388, "grad_norm": 0.20959565043449402, "learning_rate": 1e-05, "loss": 0.0219, "step": 538800 }, { "epoch": 0.005389, "grad_norm": 0.19072017073631287, "learning_rate": 1e-05, "loss": 0.0217, "step": 538900 }, { "epoch": 0.00539, "grad_norm": 0.25133034586906433, "learning_rate": 1e-05, "loss": 0.0219, "step": 539000 }, { "epoch": 0.005391, "grad_norm": 0.23125721514225006, "learning_rate": 1e-05, "loss": 0.0212, "step": 539100 }, { "epoch": 0.005392, "grad_norm": 0.16979765892028809, "learning_rate": 1e-05, "loss": 0.0217, "step": 539200 }, { "epoch": 0.005393, "grad_norm": 0.2241716980934143, "learning_rate": 1e-05, "loss": 0.0215, "step": 539300 }, { "epoch": 0.005394, "grad_norm": 0.16183330118656158, "learning_rate": 1e-05, "loss": 0.0213, "step": 539400 }, { "epoch": 0.005395, "grad_norm": 0.2017919421195984, "learning_rate": 1e-05, "loss": 0.0218, "step": 539500 }, { "epoch": 0.005396, "grad_norm": 0.1689465194940567, "learning_rate": 1e-05, "loss": 0.0217, "step": 539600 }, { "epoch": 0.005397, "grad_norm": 0.20658539235591888, "learning_rate": 1e-05, "loss": 0.0217, "step": 539700 }, { "epoch": 0.005398, "grad_norm": 0.17172208428382874, "learning_rate": 1e-05, "loss": 0.0222, "step": 539800 }, { "epoch": 0.005399, "grad_norm": 0.17888139188289642, "learning_rate": 1e-05, "loss": 0.0218, "step": 539900 }, { "epoch": 0.0054, "grad_norm": 0.16492384672164917, "learning_rate": 1e-05, "loss": 0.0218, "step": 540000 }, { "epoch": 0.0054, "eval_loss": 0.02012992464005947, "eval_runtime": 189.7367, "eval_samples_per_second": 263.523, "eval_steps_per_second": 16.47, "step": 540000 }, { "epoch": 0.005401, "grad_norm": 0.1376647651195526, "learning_rate": 1e-05, "loss": 0.0219, "step": 540100 }, { "epoch": 0.005402, "grad_norm": 0.15598270297050476, "learning_rate": 1e-05, "loss": 0.0217, "step": 540200 }, { "epoch": 0.005403, "grad_norm": 0.1725093126296997, "learning_rate": 1e-05, "loss": 0.0213, "step": 540300 }, { "epoch": 0.005404, "grad_norm": 0.15124653279781342, "learning_rate": 1e-05, "loss": 0.0218, "step": 540400 }, { "epoch": 0.005405, "grad_norm": 0.2952737808227539, "learning_rate": 1e-05, "loss": 0.0216, "step": 540500 }, { "epoch": 0.005406, "grad_norm": 0.16719235479831696, "learning_rate": 1e-05, "loss": 0.022, "step": 540600 }, { "epoch": 0.005407, "grad_norm": 0.1878708302974701, "learning_rate": 1e-05, "loss": 0.0218, "step": 540700 }, { "epoch": 0.005408, "grad_norm": 0.18218517303466797, "learning_rate": 1e-05, "loss": 0.0214, "step": 540800 }, { "epoch": 0.005409, "grad_norm": 0.2127697914838791, "learning_rate": 1e-05, "loss": 0.0218, "step": 540900 }, { "epoch": 0.00541, "grad_norm": 0.20254480838775635, "learning_rate": 1e-05, "loss": 0.0211, "step": 541000 }, { "epoch": 0.005411, "grad_norm": 0.2159142941236496, "learning_rate": 1e-05, "loss": 0.0221, "step": 541100 }, { "epoch": 0.005412, "grad_norm": 0.16101154685020447, "learning_rate": 1e-05, "loss": 0.0215, "step": 541200 }, { "epoch": 0.005413, "grad_norm": 0.21399344503879547, "learning_rate": 1e-05, "loss": 0.0217, "step": 541300 }, { "epoch": 0.005414, "grad_norm": 0.20890845358371735, "learning_rate": 1e-05, "loss": 0.0214, "step": 541400 }, { "epoch": 0.005415, "grad_norm": 0.18512989580631256, "learning_rate": 1e-05, "loss": 0.0216, "step": 541500 }, { "epoch": 0.005416, "grad_norm": 0.19752629101276398, "learning_rate": 1e-05, "loss": 0.0218, "step": 541600 }, { "epoch": 0.005417, "grad_norm": 0.18725569546222687, "learning_rate": 1e-05, "loss": 0.0216, "step": 541700 }, { "epoch": 0.005418, "grad_norm": 0.16731654107570648, "learning_rate": 1e-05, "loss": 0.0216, "step": 541800 }, { "epoch": 0.005419, "grad_norm": 0.18347300589084625, "learning_rate": 1e-05, "loss": 0.0216, "step": 541900 }, { "epoch": 0.00542, "grad_norm": 0.13863952457904816, "learning_rate": 1e-05, "loss": 0.022, "step": 542000 }, { "epoch": 0.005421, "grad_norm": 0.1924634724855423, "learning_rate": 1e-05, "loss": 0.022, "step": 542100 }, { "epoch": 0.005422, "grad_norm": 0.18735885620117188, "learning_rate": 1e-05, "loss": 0.0223, "step": 542200 }, { "epoch": 0.005423, "grad_norm": 0.2867150604724884, "learning_rate": 1e-05, "loss": 0.0214, "step": 542300 }, { "epoch": 0.005424, "grad_norm": 0.15133988857269287, "learning_rate": 1e-05, "loss": 0.0218, "step": 542400 }, { "epoch": 0.005425, "grad_norm": 0.17198632657527924, "learning_rate": 1e-05, "loss": 0.0217, "step": 542500 }, { "epoch": 0.005426, "grad_norm": 0.193128764629364, "learning_rate": 1e-05, "loss": 0.0212, "step": 542600 }, { "epoch": 0.005427, "grad_norm": 0.18349269032478333, "learning_rate": 1e-05, "loss": 0.021, "step": 542700 }, { "epoch": 0.005428, "grad_norm": 0.16548287868499756, "learning_rate": 1e-05, "loss": 0.0217, "step": 542800 }, { "epoch": 0.005429, "grad_norm": 0.1488509476184845, "learning_rate": 1e-05, "loss": 0.0215, "step": 542900 }, { "epoch": 0.00543, "grad_norm": 0.14354580640792847, "learning_rate": 1e-05, "loss": 0.022, "step": 543000 }, { "epoch": 0.005431, "grad_norm": 0.15487037599086761, "learning_rate": 1e-05, "loss": 0.0215, "step": 543100 }, { "epoch": 0.005432, "grad_norm": 0.11392688751220703, "learning_rate": 1e-05, "loss": 0.0219, "step": 543200 }, { "epoch": 0.005433, "grad_norm": 0.24297688901424408, "learning_rate": 1e-05, "loss": 0.0214, "step": 543300 }, { "epoch": 0.005434, "grad_norm": 0.17184902727603912, "learning_rate": 1e-05, "loss": 0.0213, "step": 543400 }, { "epoch": 0.005435, "grad_norm": 0.19541209936141968, "learning_rate": 1e-05, "loss": 0.0217, "step": 543500 }, { "epoch": 0.005436, "grad_norm": 0.1626151204109192, "learning_rate": 1e-05, "loss": 0.0217, "step": 543600 }, { "epoch": 0.005437, "grad_norm": 0.1463102549314499, "learning_rate": 1e-05, "loss": 0.022, "step": 543700 }, { "epoch": 0.005438, "grad_norm": 0.12155080586671829, "learning_rate": 1e-05, "loss": 0.0219, "step": 543800 }, { "epoch": 0.005439, "grad_norm": 0.1948794573545456, "learning_rate": 1e-05, "loss": 0.0217, "step": 543900 }, { "epoch": 0.00544, "grad_norm": 0.2526201605796814, "learning_rate": 1e-05, "loss": 0.0217, "step": 544000 }, { "epoch": 0.005441, "grad_norm": 0.12036930024623871, "learning_rate": 1e-05, "loss": 0.022, "step": 544100 }, { "epoch": 0.005442, "grad_norm": 0.1562691181898117, "learning_rate": 1e-05, "loss": 0.0218, "step": 544200 }, { "epoch": 0.005443, "grad_norm": 0.18472075462341309, "learning_rate": 1e-05, "loss": 0.0214, "step": 544300 }, { "epoch": 0.005444, "grad_norm": 0.18149687349796295, "learning_rate": 1e-05, "loss": 0.0219, "step": 544400 }, { "epoch": 0.005445, "grad_norm": 0.15242432057857513, "learning_rate": 1e-05, "loss": 0.021, "step": 544500 }, { "epoch": 0.005446, "grad_norm": 0.1388390213251114, "learning_rate": 1e-05, "loss": 0.0214, "step": 544600 }, { "epoch": 0.005447, "grad_norm": 0.20885638892650604, "learning_rate": 1e-05, "loss": 0.0217, "step": 544700 }, { "epoch": 0.005448, "grad_norm": 0.17225933074951172, "learning_rate": 1e-05, "loss": 0.0212, "step": 544800 }, { "epoch": 0.005449, "grad_norm": 0.20919081568717957, "learning_rate": 1e-05, "loss": 0.0216, "step": 544900 }, { "epoch": 0.00545, "grad_norm": 0.19414174556732178, "learning_rate": 1e-05, "loss": 0.0217, "step": 545000 }, { "epoch": 0.005451, "grad_norm": 0.1559295356273651, "learning_rate": 1e-05, "loss": 0.0218, "step": 545100 }, { "epoch": 0.005452, "grad_norm": 0.24934794008731842, "learning_rate": 1e-05, "loss": 0.0221, "step": 545200 }, { "epoch": 0.005453, "grad_norm": 0.20909146964550018, "learning_rate": 1e-05, "loss": 0.0212, "step": 545300 }, { "epoch": 0.005454, "grad_norm": 0.16465140879154205, "learning_rate": 1e-05, "loss": 0.0221, "step": 545400 }, { "epoch": 0.005455, "grad_norm": 0.1953044980764389, "learning_rate": 1e-05, "loss": 0.0217, "step": 545500 }, { "epoch": 0.005456, "grad_norm": 0.15084321796894073, "learning_rate": 1e-05, "loss": 0.022, "step": 545600 }, { "epoch": 0.005457, "grad_norm": 0.14530053734779358, "learning_rate": 1e-05, "loss": 0.0216, "step": 545700 }, { "epoch": 0.005458, "grad_norm": 0.1483290195465088, "learning_rate": 1e-05, "loss": 0.0216, "step": 545800 }, { "epoch": 0.005459, "grad_norm": 0.20576933026313782, "learning_rate": 1e-05, "loss": 0.0215, "step": 545900 }, { "epoch": 0.00546, "grad_norm": 0.17957279086112976, "learning_rate": 1e-05, "loss": 0.0213, "step": 546000 }, { "epoch": 0.005461, "grad_norm": 0.19977599382400513, "learning_rate": 1e-05, "loss": 0.0213, "step": 546100 }, { "epoch": 0.005462, "grad_norm": 0.1793259084224701, "learning_rate": 1e-05, "loss": 0.0215, "step": 546200 }, { "epoch": 0.005463, "grad_norm": 0.15062139928340912, "learning_rate": 1e-05, "loss": 0.0219, "step": 546300 }, { "epoch": 0.005464, "grad_norm": 0.19402800500392914, "learning_rate": 1e-05, "loss": 0.0215, "step": 546400 }, { "epoch": 0.005465, "grad_norm": 0.26172611117362976, "learning_rate": 1e-05, "loss": 0.0219, "step": 546500 }, { "epoch": 0.005466, "grad_norm": 0.1614566147327423, "learning_rate": 1e-05, "loss": 0.0217, "step": 546600 }, { "epoch": 0.005467, "grad_norm": 0.19221346080303192, "learning_rate": 1e-05, "loss": 0.0214, "step": 546700 }, { "epoch": 0.005468, "grad_norm": 0.17863084375858307, "learning_rate": 1e-05, "loss": 0.0213, "step": 546800 }, { "epoch": 0.005469, "grad_norm": 0.19169104099273682, "learning_rate": 1e-05, "loss": 0.0221, "step": 546900 }, { "epoch": 0.00547, "grad_norm": 0.14476944506168365, "learning_rate": 1e-05, "loss": 0.0217, "step": 547000 }, { "epoch": 0.005471, "grad_norm": 0.17336702346801758, "learning_rate": 1e-05, "loss": 0.0213, "step": 547100 }, { "epoch": 0.005472, "grad_norm": 0.17259250581264496, "learning_rate": 1e-05, "loss": 0.0211, "step": 547200 }, { "epoch": 0.005473, "grad_norm": 0.20450477302074432, "learning_rate": 1e-05, "loss": 0.0214, "step": 547300 }, { "epoch": 0.005474, "grad_norm": 0.18811900913715363, "learning_rate": 1e-05, "loss": 0.0217, "step": 547400 }, { "epoch": 0.005475, "grad_norm": 0.21880902349948883, "learning_rate": 1e-05, "loss": 0.0214, "step": 547500 }, { "epoch": 0.005476, "grad_norm": 0.1726023405790329, "learning_rate": 1e-05, "loss": 0.0217, "step": 547600 }, { "epoch": 0.005477, "grad_norm": 0.19622670114040375, "learning_rate": 1e-05, "loss": 0.0214, "step": 547700 }, { "epoch": 0.005478, "grad_norm": 0.16259050369262695, "learning_rate": 1e-05, "loss": 0.0221, "step": 547800 }, { "epoch": 0.005479, "grad_norm": 0.1755434274673462, "learning_rate": 1e-05, "loss": 0.0217, "step": 547900 }, { "epoch": 0.00548, "grad_norm": 0.20546650886535645, "learning_rate": 1e-05, "loss": 0.0217, "step": 548000 }, { "epoch": 0.005481, "grad_norm": 0.13370300829410553, "learning_rate": 1e-05, "loss": 0.0216, "step": 548100 }, { "epoch": 0.005482, "grad_norm": 0.15013065934181213, "learning_rate": 1e-05, "loss": 0.022, "step": 548200 }, { "epoch": 0.005483, "grad_norm": 0.15599766373634338, "learning_rate": 1e-05, "loss": 0.0214, "step": 548300 }, { "epoch": 0.005484, "grad_norm": 0.1967044472694397, "learning_rate": 1e-05, "loss": 0.0216, "step": 548400 }, { "epoch": 0.005485, "grad_norm": 0.26215609908103943, "learning_rate": 1e-05, "loss": 0.0215, "step": 548500 }, { "epoch": 0.005486, "grad_norm": 0.2224382609128952, "learning_rate": 1e-05, "loss": 0.0213, "step": 548600 }, { "epoch": 0.005487, "grad_norm": 0.21233311295509338, "learning_rate": 1e-05, "loss": 0.0215, "step": 548700 }, { "epoch": 0.005488, "grad_norm": 0.17058499157428741, "learning_rate": 1e-05, "loss": 0.0216, "step": 548800 }, { "epoch": 0.005489, "grad_norm": 0.16729681193828583, "learning_rate": 1e-05, "loss": 0.0215, "step": 548900 }, { "epoch": 0.00549, "grad_norm": 0.2223873883485794, "learning_rate": 1e-05, "loss": 0.0215, "step": 549000 }, { "epoch": 0.005491, "grad_norm": 0.14064931869506836, "learning_rate": 1e-05, "loss": 0.0212, "step": 549100 }, { "epoch": 0.005492, "grad_norm": 0.2298833429813385, "learning_rate": 1e-05, "loss": 0.0215, "step": 549200 }, { "epoch": 0.005493, "grad_norm": 0.16844542324543, "learning_rate": 1e-05, "loss": 0.0212, "step": 549300 }, { "epoch": 0.005494, "grad_norm": 0.14503394067287445, "learning_rate": 1e-05, "loss": 0.0215, "step": 549400 }, { "epoch": 0.005495, "grad_norm": 0.19712984561920166, "learning_rate": 1e-05, "loss": 0.0218, "step": 549500 }, { "epoch": 0.005496, "grad_norm": 0.17435185611248016, "learning_rate": 1e-05, "loss": 0.021, "step": 549600 }, { "epoch": 0.005497, "grad_norm": 0.1759231686592102, "learning_rate": 1e-05, "loss": 0.0216, "step": 549700 }, { "epoch": 0.005498, "grad_norm": 0.21886299550533295, "learning_rate": 1e-05, "loss": 0.0216, "step": 549800 }, { "epoch": 0.005499, "grad_norm": 0.18715375661849976, "learning_rate": 1e-05, "loss": 0.0213, "step": 549900 }, { "epoch": 0.0055, "grad_norm": 0.5101802945137024, "learning_rate": 1e-05, "loss": 0.0215, "step": 550000 }, { "epoch": 0.005501, "grad_norm": 0.14757248759269714, "learning_rate": 1e-05, "loss": 0.022, "step": 550100 }, { "epoch": 0.005502, "grad_norm": 0.1538257598876953, "learning_rate": 1e-05, "loss": 0.0216, "step": 550200 }, { "epoch": 0.005503, "grad_norm": 0.18424300849437714, "learning_rate": 1e-05, "loss": 0.0219, "step": 550300 }, { "epoch": 0.005504, "grad_norm": 0.1385951042175293, "learning_rate": 1e-05, "loss": 0.0212, "step": 550400 }, { "epoch": 0.005505, "grad_norm": 0.1573139727115631, "learning_rate": 1e-05, "loss": 0.0211, "step": 550500 }, { "epoch": 0.005506, "grad_norm": 0.19755080342292786, "learning_rate": 1e-05, "loss": 0.0219, "step": 550600 }, { "epoch": 0.005507, "grad_norm": 0.26627489924430847, "learning_rate": 1e-05, "loss": 0.0222, "step": 550700 }, { "epoch": 0.005508, "grad_norm": 0.19467227160930634, "learning_rate": 1e-05, "loss": 0.0219, "step": 550800 }, { "epoch": 0.005509, "grad_norm": 0.18204249441623688, "learning_rate": 1e-05, "loss": 0.0216, "step": 550900 }, { "epoch": 0.00551, "grad_norm": 0.18634353578090668, "learning_rate": 1e-05, "loss": 0.0213, "step": 551000 }, { "epoch": 0.005511, "grad_norm": 0.18346403539180756, "learning_rate": 1e-05, "loss": 0.0216, "step": 551100 }, { "epoch": 0.005512, "grad_norm": 0.18032020330429077, "learning_rate": 1e-05, "loss": 0.0218, "step": 551200 }, { "epoch": 0.005513, "grad_norm": 0.13982774317264557, "learning_rate": 1e-05, "loss": 0.022, "step": 551300 }, { "epoch": 0.005514, "grad_norm": 0.20941148698329926, "learning_rate": 1e-05, "loss": 0.0213, "step": 551400 }, { "epoch": 0.005515, "grad_norm": 0.19179323315620422, "learning_rate": 1e-05, "loss": 0.0215, "step": 551500 }, { "epoch": 0.005516, "grad_norm": 0.23358038067817688, "learning_rate": 1e-05, "loss": 0.0216, "step": 551600 }, { "epoch": 0.005517, "grad_norm": 0.1772303283214569, "learning_rate": 1e-05, "loss": 0.022, "step": 551700 }, { "epoch": 0.005518, "grad_norm": 0.1620360165834427, "learning_rate": 1e-05, "loss": 0.0214, "step": 551800 }, { "epoch": 0.005519, "grad_norm": 0.21681056916713715, "learning_rate": 1e-05, "loss": 0.0214, "step": 551900 }, { "epoch": 0.00552, "grad_norm": 0.1262381374835968, "learning_rate": 1e-05, "loss": 0.0212, "step": 552000 }, { "epoch": 0.005521, "grad_norm": 0.19117802381515503, "learning_rate": 1e-05, "loss": 0.0214, "step": 552100 }, { "epoch": 0.005522, "grad_norm": 0.2543821334838867, "learning_rate": 1e-05, "loss": 0.022, "step": 552200 }, { "epoch": 0.005523, "grad_norm": 0.18348821997642517, "learning_rate": 1e-05, "loss": 0.0212, "step": 552300 }, { "epoch": 0.005524, "grad_norm": 0.19789254665374756, "learning_rate": 1e-05, "loss": 0.0217, "step": 552400 }, { "epoch": 0.005525, "grad_norm": 0.13670571148395538, "learning_rate": 1e-05, "loss": 0.0213, "step": 552500 }, { "epoch": 0.005526, "grad_norm": 0.22788432240486145, "learning_rate": 1e-05, "loss": 0.0215, "step": 552600 }, { "epoch": 0.005527, "grad_norm": 0.1577412337064743, "learning_rate": 1e-05, "loss": 0.0213, "step": 552700 }, { "epoch": 0.005528, "grad_norm": 0.17063653469085693, "learning_rate": 1e-05, "loss": 0.0212, "step": 552800 }, { "epoch": 0.005529, "grad_norm": 0.16492754220962524, "learning_rate": 1e-05, "loss": 0.0218, "step": 552900 }, { "epoch": 0.00553, "grad_norm": 0.14668181538581848, "learning_rate": 1e-05, "loss": 0.0215, "step": 553000 }, { "epoch": 0.005531, "grad_norm": 0.18860650062561035, "learning_rate": 1e-05, "loss": 0.0215, "step": 553100 }, { "epoch": 0.005532, "grad_norm": 0.17642806470394135, "learning_rate": 1e-05, "loss": 0.0216, "step": 553200 }, { "epoch": 0.005533, "grad_norm": 0.17048820853233337, "learning_rate": 1e-05, "loss": 0.0215, "step": 553300 }, { "epoch": 0.005534, "grad_norm": 0.19447720050811768, "learning_rate": 1e-05, "loss": 0.0217, "step": 553400 }, { "epoch": 0.005535, "grad_norm": 0.1877930909395218, "learning_rate": 1e-05, "loss": 0.0214, "step": 553500 }, { "epoch": 0.005536, "grad_norm": 0.1890811175107956, "learning_rate": 1e-05, "loss": 0.0213, "step": 553600 }, { "epoch": 0.005537, "grad_norm": 0.22177182137966156, "learning_rate": 1e-05, "loss": 0.0219, "step": 553700 }, { "epoch": 0.005538, "grad_norm": 0.15446998178958893, "learning_rate": 1e-05, "loss": 0.0215, "step": 553800 }, { "epoch": 0.005539, "grad_norm": 0.19936780631542206, "learning_rate": 1e-05, "loss": 0.0216, "step": 553900 }, { "epoch": 0.00554, "grad_norm": 0.21125032007694244, "learning_rate": 1e-05, "loss": 0.022, "step": 554000 }, { "epoch": 0.005541, "grad_norm": 0.21861998736858368, "learning_rate": 1e-05, "loss": 0.0213, "step": 554100 }, { "epoch": 0.005542, "grad_norm": 0.16528308391571045, "learning_rate": 1e-05, "loss": 0.0216, "step": 554200 }, { "epoch": 0.005543, "grad_norm": 0.18943755328655243, "learning_rate": 1e-05, "loss": 0.0214, "step": 554300 }, { "epoch": 0.005544, "grad_norm": 0.16291575133800507, "learning_rate": 1e-05, "loss": 0.0213, "step": 554400 }, { "epoch": 0.005545, "grad_norm": 0.1467743068933487, "learning_rate": 1e-05, "loss": 0.0218, "step": 554500 }, { "epoch": 0.005546, "grad_norm": 0.19955529272556305, "learning_rate": 1e-05, "loss": 0.0215, "step": 554600 }, { "epoch": 0.005547, "grad_norm": 0.19955365359783173, "learning_rate": 1e-05, "loss": 0.0217, "step": 554700 }, { "epoch": 0.005548, "grad_norm": 0.1565152108669281, "learning_rate": 1e-05, "loss": 0.0216, "step": 554800 }, { "epoch": 0.005549, "grad_norm": 0.16706077754497528, "learning_rate": 1e-05, "loss": 0.0216, "step": 554900 }, { "epoch": 0.00555, "grad_norm": 0.16815151274204254, "learning_rate": 1e-05, "loss": 0.0216, "step": 555000 }, { "epoch": 0.005551, "grad_norm": 0.352616548538208, "learning_rate": 1e-05, "loss": 0.0217, "step": 555100 }, { "epoch": 0.005552, "grad_norm": 0.2569660246372223, "learning_rate": 1e-05, "loss": 0.0214, "step": 555200 }, { "epoch": 0.005553, "grad_norm": 0.16380088031291962, "learning_rate": 1e-05, "loss": 0.0214, "step": 555300 }, { "epoch": 0.005554, "grad_norm": 0.24187463521957397, "learning_rate": 1e-05, "loss": 0.0217, "step": 555400 }, { "epoch": 0.005555, "grad_norm": 0.22565321624279022, "learning_rate": 1e-05, "loss": 0.0216, "step": 555500 }, { "epoch": 0.005556, "grad_norm": 0.17749769985675812, "learning_rate": 1e-05, "loss": 0.0213, "step": 555600 }, { "epoch": 0.005557, "grad_norm": 0.14612244069576263, "learning_rate": 1e-05, "loss": 0.0218, "step": 555700 }, { "epoch": 0.005558, "grad_norm": 0.13905346393585205, "learning_rate": 1e-05, "loss": 0.0213, "step": 555800 }, { "epoch": 0.005559, "grad_norm": 0.18477971851825714, "learning_rate": 1e-05, "loss": 0.0209, "step": 555900 }, { "epoch": 0.00556, "grad_norm": 0.2279455065727234, "learning_rate": 1e-05, "loss": 0.0215, "step": 556000 }, { "epoch": 0.005561, "grad_norm": 0.16901566088199615, "learning_rate": 1e-05, "loss": 0.0215, "step": 556100 }, { "epoch": 0.005562, "grad_norm": 0.16132889688014984, "learning_rate": 1e-05, "loss": 0.0215, "step": 556200 }, { "epoch": 0.005563, "grad_norm": 0.15984201431274414, "learning_rate": 1e-05, "loss": 0.0215, "step": 556300 }, { "epoch": 0.005564, "grad_norm": 0.22975924611091614, "learning_rate": 1e-05, "loss": 0.0208, "step": 556400 }, { "epoch": 0.005565, "grad_norm": 0.17653454840183258, "learning_rate": 1e-05, "loss": 0.0217, "step": 556500 }, { "epoch": 0.005566, "grad_norm": 0.13989302515983582, "learning_rate": 1e-05, "loss": 0.0211, "step": 556600 }, { "epoch": 0.005567, "grad_norm": 0.20093423128128052, "learning_rate": 1e-05, "loss": 0.0218, "step": 556700 }, { "epoch": 0.005568, "grad_norm": 0.20107205212116241, "learning_rate": 1e-05, "loss": 0.0215, "step": 556800 }, { "epoch": 0.005569, "grad_norm": 0.2590128481388092, "learning_rate": 1e-05, "loss": 0.0214, "step": 556900 }, { "epoch": 0.00557, "grad_norm": 0.2755796015262604, "learning_rate": 1e-05, "loss": 0.0216, "step": 557000 }, { "epoch": 0.005571, "grad_norm": 0.2038138061761856, "learning_rate": 1e-05, "loss": 0.0211, "step": 557100 }, { "epoch": 0.005572, "grad_norm": 0.17188778519630432, "learning_rate": 1e-05, "loss": 0.0211, "step": 557200 }, { "epoch": 0.005573, "grad_norm": 0.16210515797138214, "learning_rate": 1e-05, "loss": 0.0217, "step": 557300 }, { "epoch": 0.005574, "grad_norm": 0.15236018598079681, "learning_rate": 1e-05, "loss": 0.0214, "step": 557400 }, { "epoch": 0.005575, "grad_norm": 0.12892593443393707, "learning_rate": 1e-05, "loss": 0.0216, "step": 557500 }, { "epoch": 0.005576, "grad_norm": 0.1857982575893402, "learning_rate": 1e-05, "loss": 0.0213, "step": 557600 }, { "epoch": 0.005577, "grad_norm": 0.16898445785045624, "learning_rate": 1e-05, "loss": 0.0214, "step": 557700 }, { "epoch": 0.005578, "grad_norm": 0.1807907670736313, "learning_rate": 1e-05, "loss": 0.0213, "step": 557800 }, { "epoch": 0.005579, "grad_norm": 0.1360076367855072, "learning_rate": 1e-05, "loss": 0.0208, "step": 557900 }, { "epoch": 0.00558, "grad_norm": 0.27640506625175476, "learning_rate": 1e-05, "loss": 0.0214, "step": 558000 }, { "epoch": 0.005581, "grad_norm": 0.14093583822250366, "learning_rate": 1e-05, "loss": 0.0216, "step": 558100 }, { "epoch": 0.005582, "grad_norm": 0.1645306944847107, "learning_rate": 1e-05, "loss": 0.0214, "step": 558200 }, { "epoch": 0.005583, "grad_norm": 0.23434501886367798, "learning_rate": 1e-05, "loss": 0.0215, "step": 558300 }, { "epoch": 0.005584, "grad_norm": 0.22003774344921112, "learning_rate": 1e-05, "loss": 0.0215, "step": 558400 }, { "epoch": 0.005585, "grad_norm": 0.1776084154844284, "learning_rate": 1e-05, "loss": 0.0216, "step": 558500 }, { "epoch": 0.005586, "grad_norm": 0.24516430497169495, "learning_rate": 1e-05, "loss": 0.0215, "step": 558600 }, { "epoch": 0.005587, "grad_norm": 0.15869762003421783, "learning_rate": 1e-05, "loss": 0.0216, "step": 558700 }, { "epoch": 0.005588, "grad_norm": 0.2100542187690735, "learning_rate": 1e-05, "loss": 0.0209, "step": 558800 }, { "epoch": 0.005589, "grad_norm": 0.176955446600914, "learning_rate": 1e-05, "loss": 0.0214, "step": 558900 }, { "epoch": 0.00559, "grad_norm": 0.1364227831363678, "learning_rate": 1e-05, "loss": 0.0212, "step": 559000 }, { "epoch": 0.005591, "grad_norm": 0.16514813899993896, "learning_rate": 1e-05, "loss": 0.0215, "step": 559100 }, { "epoch": 0.005592, "grad_norm": 0.1537804901599884, "learning_rate": 1e-05, "loss": 0.0207, "step": 559200 }, { "epoch": 0.005593, "grad_norm": 0.18153861165046692, "learning_rate": 1e-05, "loss": 0.0212, "step": 559300 }, { "epoch": 0.005594, "grad_norm": 0.17260077595710754, "learning_rate": 1e-05, "loss": 0.0214, "step": 559400 }, { "epoch": 0.005595, "grad_norm": 0.15464262664318085, "learning_rate": 1e-05, "loss": 0.0211, "step": 559500 }, { "epoch": 0.005596, "grad_norm": 0.19560229778289795, "learning_rate": 1e-05, "loss": 0.0215, "step": 559600 }, { "epoch": 0.005597, "grad_norm": 0.15554594993591309, "learning_rate": 1e-05, "loss": 0.0212, "step": 559700 }, { "epoch": 0.005598, "grad_norm": 0.268970787525177, "learning_rate": 1e-05, "loss": 0.0213, "step": 559800 }, { "epoch": 0.005599, "grad_norm": 0.1767590194940567, "learning_rate": 1e-05, "loss": 0.0216, "step": 559900 }, { "epoch": 0.0056, "grad_norm": 0.16251994669437408, "learning_rate": 1e-05, "loss": 0.0217, "step": 560000 }, { "epoch": 0.0056, "eval_loss": 0.018635718151926994, "eval_runtime": 188.3689, "eval_samples_per_second": 265.437, "eval_steps_per_second": 16.59, "step": 560000 }, { "epoch": 0.005601, "grad_norm": 0.15570446848869324, "learning_rate": 1e-05, "loss": 0.0219, "step": 560100 }, { "epoch": 0.005602, "grad_norm": 0.16893988847732544, "learning_rate": 1e-05, "loss": 0.0218, "step": 560200 }, { "epoch": 0.005603, "grad_norm": 0.22165176272392273, "learning_rate": 1e-05, "loss": 0.0218, "step": 560300 }, { "epoch": 0.005604, "grad_norm": 0.2085549682378769, "learning_rate": 1e-05, "loss": 0.0214, "step": 560400 }, { "epoch": 0.005605, "grad_norm": 0.13417479395866394, "learning_rate": 1e-05, "loss": 0.0216, "step": 560500 }, { "epoch": 0.005606, "grad_norm": 0.21768851578235626, "learning_rate": 1e-05, "loss": 0.0214, "step": 560600 }, { "epoch": 0.005607, "grad_norm": 0.195266991853714, "learning_rate": 1e-05, "loss": 0.0221, "step": 560700 }, { "epoch": 0.005608, "grad_norm": 0.1376257687807083, "learning_rate": 1e-05, "loss": 0.0214, "step": 560800 }, { "epoch": 0.005609, "grad_norm": 0.17232754826545715, "learning_rate": 1e-05, "loss": 0.0217, "step": 560900 }, { "epoch": 0.00561, "grad_norm": 0.19726479053497314, "learning_rate": 1e-05, "loss": 0.0214, "step": 561000 }, { "epoch": 0.005611, "grad_norm": 0.16798220574855804, "learning_rate": 1e-05, "loss": 0.0212, "step": 561100 }, { "epoch": 0.005612, "grad_norm": 0.16480158269405365, "learning_rate": 1e-05, "loss": 0.0208, "step": 561200 }, { "epoch": 0.005613, "grad_norm": 0.14470480382442474, "learning_rate": 1e-05, "loss": 0.0219, "step": 561300 }, { "epoch": 0.005614, "grad_norm": 0.3047696053981781, "learning_rate": 1e-05, "loss": 0.0221, "step": 561400 }, { "epoch": 0.005615, "grad_norm": 0.1638990044593811, "learning_rate": 1e-05, "loss": 0.0211, "step": 561500 }, { "epoch": 0.005616, "grad_norm": 0.27864545583724976, "learning_rate": 1e-05, "loss": 0.0215, "step": 561600 }, { "epoch": 0.005617, "grad_norm": 0.1783299744129181, "learning_rate": 1e-05, "loss": 0.0216, "step": 561700 }, { "epoch": 0.005618, "grad_norm": 0.21028587222099304, "learning_rate": 1e-05, "loss": 0.0208, "step": 561800 }, { "epoch": 0.005619, "grad_norm": 0.17195391654968262, "learning_rate": 1e-05, "loss": 0.0213, "step": 561900 }, { "epoch": 0.00562, "grad_norm": 0.1919167935848236, "learning_rate": 1e-05, "loss": 0.0221, "step": 562000 }, { "epoch": 0.005621, "grad_norm": 0.18126846849918365, "learning_rate": 1e-05, "loss": 0.0214, "step": 562100 }, { "epoch": 0.005622, "grad_norm": 0.14388507604599, "learning_rate": 1e-05, "loss": 0.0216, "step": 562200 }, { "epoch": 0.005623, "grad_norm": 0.16689163446426392, "learning_rate": 1e-05, "loss": 0.0213, "step": 562300 }, { "epoch": 0.005624, "grad_norm": 0.1902565062046051, "learning_rate": 1e-05, "loss": 0.0211, "step": 562400 }, { "epoch": 0.005625, "grad_norm": 0.15596285462379456, "learning_rate": 1e-05, "loss": 0.0212, "step": 562500 }, { "epoch": 0.005626, "grad_norm": 0.19484767317771912, "learning_rate": 1e-05, "loss": 0.0212, "step": 562600 }, { "epoch": 0.005627, "grad_norm": 0.20800313353538513, "learning_rate": 1e-05, "loss": 0.0213, "step": 562700 }, { "epoch": 0.005628, "grad_norm": 0.23871885240077972, "learning_rate": 1e-05, "loss": 0.021, "step": 562800 }, { "epoch": 0.005629, "grad_norm": 0.19075509905815125, "learning_rate": 1e-05, "loss": 0.0212, "step": 562900 }, { "epoch": 0.00563, "grad_norm": 0.1459088772535324, "learning_rate": 1e-05, "loss": 0.0211, "step": 563000 }, { "epoch": 0.005631, "grad_norm": 0.1943489909172058, "learning_rate": 1e-05, "loss": 0.0212, "step": 563100 }, { "epoch": 0.005632, "grad_norm": 0.15154214203357697, "learning_rate": 1e-05, "loss": 0.0213, "step": 563200 }, { "epoch": 0.005633, "grad_norm": 0.15985813736915588, "learning_rate": 1e-05, "loss": 0.0209, "step": 563300 }, { "epoch": 0.005634, "grad_norm": 0.17054495215415955, "learning_rate": 1e-05, "loss": 0.0215, "step": 563400 }, { "epoch": 0.005635, "grad_norm": 0.15887859463691711, "learning_rate": 1e-05, "loss": 0.0212, "step": 563500 }, { "epoch": 0.005636, "grad_norm": 0.20461444556713104, "learning_rate": 1e-05, "loss": 0.0215, "step": 563600 }, { "epoch": 0.005637, "grad_norm": 0.16059096157550812, "learning_rate": 1e-05, "loss": 0.0215, "step": 563700 }, { "epoch": 0.005638, "grad_norm": 0.2243344932794571, "learning_rate": 1e-05, "loss": 0.0219, "step": 563800 }, { "epoch": 0.005639, "grad_norm": 0.19537338614463806, "learning_rate": 1e-05, "loss": 0.0215, "step": 563900 }, { "epoch": 0.00564, "grad_norm": 0.19301889836788177, "learning_rate": 1e-05, "loss": 0.0214, "step": 564000 }, { "epoch": 0.005641, "grad_norm": 0.20854875445365906, "learning_rate": 1e-05, "loss": 0.0211, "step": 564100 }, { "epoch": 0.005642, "grad_norm": 0.22897794842720032, "learning_rate": 1e-05, "loss": 0.0219, "step": 564200 }, { "epoch": 0.005643, "grad_norm": 0.13939480483531952, "learning_rate": 1e-05, "loss": 0.0214, "step": 564300 }, { "epoch": 0.005644, "grad_norm": 0.19302140176296234, "learning_rate": 1e-05, "loss": 0.021, "step": 564400 }, { "epoch": 0.005645, "grad_norm": 0.16157463192939758, "learning_rate": 1e-05, "loss": 0.0215, "step": 564500 }, { "epoch": 0.005646, "grad_norm": 0.20204028487205505, "learning_rate": 1e-05, "loss": 0.0206, "step": 564600 }, { "epoch": 0.005647, "grad_norm": 0.23535718023777008, "learning_rate": 1e-05, "loss": 0.0209, "step": 564700 }, { "epoch": 0.005648, "grad_norm": 0.2102852612733841, "learning_rate": 1e-05, "loss": 0.0211, "step": 564800 }, { "epoch": 0.005649, "grad_norm": 0.20921200513839722, "learning_rate": 1e-05, "loss": 0.0217, "step": 564900 }, { "epoch": 0.00565, "grad_norm": 0.20055699348449707, "learning_rate": 1e-05, "loss": 0.0217, "step": 565000 }, { "epoch": 0.005651, "grad_norm": 0.13918645679950714, "learning_rate": 1e-05, "loss": 0.0219, "step": 565100 }, { "epoch": 0.005652, "grad_norm": 0.21246376633644104, "learning_rate": 1e-05, "loss": 0.022, "step": 565200 }, { "epoch": 0.005653, "grad_norm": 0.16694696247577667, "learning_rate": 1e-05, "loss": 0.0211, "step": 565300 }, { "epoch": 0.005654, "grad_norm": 0.17357927560806274, "learning_rate": 1e-05, "loss": 0.0211, "step": 565400 }, { "epoch": 0.005655, "grad_norm": 0.2128274291753769, "learning_rate": 1e-05, "loss": 0.021, "step": 565500 }, { "epoch": 0.005656, "grad_norm": 0.19355826079845428, "learning_rate": 1e-05, "loss": 0.0217, "step": 565600 }, { "epoch": 0.005657, "grad_norm": 0.16454586386680603, "learning_rate": 1e-05, "loss": 0.0218, "step": 565700 }, { "epoch": 0.005658, "grad_norm": 0.1596595197916031, "learning_rate": 1e-05, "loss": 0.0214, "step": 565800 }, { "epoch": 0.005659, "grad_norm": 0.1753876656293869, "learning_rate": 1e-05, "loss": 0.0213, "step": 565900 }, { "epoch": 0.00566, "grad_norm": 0.2130473107099533, "learning_rate": 1e-05, "loss": 0.0214, "step": 566000 }, { "epoch": 0.005661, "grad_norm": 0.14769406616687775, "learning_rate": 1e-05, "loss": 0.0209, "step": 566100 }, { "epoch": 0.005662, "grad_norm": 0.2001980096101761, "learning_rate": 1e-05, "loss": 0.021, "step": 566200 }, { "epoch": 0.005663, "grad_norm": 0.2031766176223755, "learning_rate": 1e-05, "loss": 0.0206, "step": 566300 }, { "epoch": 0.005664, "grad_norm": 0.20344407856464386, "learning_rate": 1e-05, "loss": 0.0216, "step": 566400 }, { "epoch": 0.005665, "grad_norm": 0.14375163614749908, "learning_rate": 1e-05, "loss": 0.0213, "step": 566500 }, { "epoch": 0.005666, "grad_norm": 0.1607665717601776, "learning_rate": 1e-05, "loss": 0.0216, "step": 566600 }, { "epoch": 0.005667, "grad_norm": 0.16410745680332184, "learning_rate": 1e-05, "loss": 0.0213, "step": 566700 }, { "epoch": 0.005668, "grad_norm": 0.18102942407131195, "learning_rate": 1e-05, "loss": 0.0212, "step": 566800 }, { "epoch": 0.005669, "grad_norm": 0.18649420142173767, "learning_rate": 1e-05, "loss": 0.0208, "step": 566900 }, { "epoch": 0.00567, "grad_norm": 0.14621183276176453, "learning_rate": 1e-05, "loss": 0.0216, "step": 567000 }, { "epoch": 0.005671, "grad_norm": 0.19361375272274017, "learning_rate": 1e-05, "loss": 0.0215, "step": 567100 }, { "epoch": 0.005672, "grad_norm": 0.18602855503559113, "learning_rate": 1e-05, "loss": 0.0215, "step": 567200 }, { "epoch": 0.005673, "grad_norm": 0.21234677731990814, "learning_rate": 1e-05, "loss": 0.0211, "step": 567300 }, { "epoch": 0.005674, "grad_norm": 0.18931730091571808, "learning_rate": 1e-05, "loss": 0.0212, "step": 567400 }, { "epoch": 0.005675, "grad_norm": 0.20227065682411194, "learning_rate": 1e-05, "loss": 0.0213, "step": 567500 }, { "epoch": 0.005676, "grad_norm": 0.24090944230556488, "learning_rate": 1e-05, "loss": 0.0214, "step": 567600 }, { "epoch": 0.005677, "grad_norm": 0.2028038054704666, "learning_rate": 1e-05, "loss": 0.0218, "step": 567700 }, { "epoch": 0.005678, "grad_norm": 0.18757614493370056, "learning_rate": 1e-05, "loss": 0.0217, "step": 567800 }, { "epoch": 0.005679, "grad_norm": 0.16229186952114105, "learning_rate": 1e-05, "loss": 0.0213, "step": 567900 }, { "epoch": 0.00568, "grad_norm": 0.14897440373897552, "learning_rate": 1e-05, "loss": 0.0215, "step": 568000 }, { "epoch": 0.005681, "grad_norm": 0.17434576153755188, "learning_rate": 1e-05, "loss": 0.021, "step": 568100 }, { "epoch": 0.005682, "grad_norm": 0.16657991707324982, "learning_rate": 1e-05, "loss": 0.0213, "step": 568200 }, { "epoch": 0.005683, "grad_norm": 0.18529905378818512, "learning_rate": 1e-05, "loss": 0.0216, "step": 568300 }, { "epoch": 0.005684, "grad_norm": 0.22213485836982727, "learning_rate": 1e-05, "loss": 0.0214, "step": 568400 }, { "epoch": 0.005685, "grad_norm": 0.12817411124706268, "learning_rate": 1e-05, "loss": 0.0213, "step": 568500 }, { "epoch": 0.005686, "grad_norm": 0.18407602608203888, "learning_rate": 1e-05, "loss": 0.0212, "step": 568600 }, { "epoch": 0.005687, "grad_norm": 0.17404165863990784, "learning_rate": 1e-05, "loss": 0.0214, "step": 568700 }, { "epoch": 0.005688, "grad_norm": 0.19925899803638458, "learning_rate": 1e-05, "loss": 0.0212, "step": 568800 }, { "epoch": 0.005689, "grad_norm": 0.20248030126094818, "learning_rate": 1e-05, "loss": 0.0211, "step": 568900 }, { "epoch": 0.00569, "grad_norm": 0.17260795831680298, "learning_rate": 1e-05, "loss": 0.0214, "step": 569000 }, { "epoch": 0.005691, "grad_norm": 0.16223108768463135, "learning_rate": 1e-05, "loss": 0.0216, "step": 569100 }, { "epoch": 0.005692, "grad_norm": 0.20732015371322632, "learning_rate": 1e-05, "loss": 0.021, "step": 569200 }, { "epoch": 0.005693, "grad_norm": 0.20327803492546082, "learning_rate": 1e-05, "loss": 0.0215, "step": 569300 }, { "epoch": 0.005694, "grad_norm": 0.22293904423713684, "learning_rate": 1e-05, "loss": 0.0206, "step": 569400 }, { "epoch": 0.005695, "grad_norm": 0.16138111054897308, "learning_rate": 1e-05, "loss": 0.0212, "step": 569500 }, { "epoch": 0.005696, "grad_norm": 0.18290729820728302, "learning_rate": 1e-05, "loss": 0.0211, "step": 569600 }, { "epoch": 0.005697, "grad_norm": 0.17689462006092072, "learning_rate": 1e-05, "loss": 0.0215, "step": 569700 }, { "epoch": 0.005698, "grad_norm": 0.16690123081207275, "learning_rate": 1e-05, "loss": 0.0212, "step": 569800 }, { "epoch": 0.005699, "grad_norm": 0.18617787957191467, "learning_rate": 1e-05, "loss": 0.0212, "step": 569900 }, { "epoch": 0.0057, "grad_norm": 0.1900787502527237, "learning_rate": 1e-05, "loss": 0.0212, "step": 570000 }, { "epoch": 0.005701, "grad_norm": 0.17463304102420807, "learning_rate": 1e-05, "loss": 0.0214, "step": 570100 }, { "epoch": 0.005702, "grad_norm": 0.15382720530033112, "learning_rate": 1e-05, "loss": 0.0213, "step": 570200 }, { "epoch": 0.005703, "grad_norm": 0.16445161402225494, "learning_rate": 1e-05, "loss": 0.0217, "step": 570300 }, { "epoch": 0.005704, "grad_norm": 0.20721998810768127, "learning_rate": 1e-05, "loss": 0.0218, "step": 570400 }, { "epoch": 0.005705, "grad_norm": 0.14730340242385864, "learning_rate": 1e-05, "loss": 0.0215, "step": 570500 }, { "epoch": 0.005706, "grad_norm": 0.15686209499835968, "learning_rate": 1e-05, "loss": 0.0212, "step": 570600 }, { "epoch": 0.005707, "grad_norm": 0.1887214183807373, "learning_rate": 1e-05, "loss": 0.0217, "step": 570700 }, { "epoch": 0.005708, "grad_norm": 0.1400224268436432, "learning_rate": 1e-05, "loss": 0.0213, "step": 570800 }, { "epoch": 0.005709, "grad_norm": 0.16516083478927612, "learning_rate": 1e-05, "loss": 0.0213, "step": 570900 }, { "epoch": 0.00571, "grad_norm": 0.2565386891365051, "learning_rate": 1e-05, "loss": 0.0212, "step": 571000 }, { "epoch": 0.005711, "grad_norm": 0.15351442992687225, "learning_rate": 1e-05, "loss": 0.021, "step": 571100 }, { "epoch": 0.005712, "grad_norm": 0.26175186038017273, "learning_rate": 1e-05, "loss": 0.0211, "step": 571200 }, { "epoch": 0.005713, "grad_norm": 0.1648392528295517, "learning_rate": 1e-05, "loss": 0.0209, "step": 571300 }, { "epoch": 0.005714, "grad_norm": 0.18197698891162872, "learning_rate": 1e-05, "loss": 0.0208, "step": 571400 }, { "epoch": 0.005715, "grad_norm": 0.16569095849990845, "learning_rate": 1e-05, "loss": 0.0207, "step": 571500 }, { "epoch": 0.005716, "grad_norm": 0.19288933277130127, "learning_rate": 1e-05, "loss": 0.0212, "step": 571600 }, { "epoch": 0.005717, "grad_norm": 0.19233931601047516, "learning_rate": 1e-05, "loss": 0.0207, "step": 571700 }, { "epoch": 0.005718, "grad_norm": 0.187378391623497, "learning_rate": 1e-05, "loss": 0.0214, "step": 571800 }, { "epoch": 0.005719, "grad_norm": 0.14966048300266266, "learning_rate": 1e-05, "loss": 0.0213, "step": 571900 }, { "epoch": 0.00572, "grad_norm": 0.17621181905269623, "learning_rate": 1e-05, "loss": 0.0213, "step": 572000 }, { "epoch": 0.005721, "grad_norm": 0.19635890424251556, "learning_rate": 1e-05, "loss": 0.0208, "step": 572100 }, { "epoch": 0.005722, "grad_norm": 0.29109886288642883, "learning_rate": 1e-05, "loss": 0.0209, "step": 572200 }, { "epoch": 0.005723, "grad_norm": 0.20945581793785095, "learning_rate": 1e-05, "loss": 0.0211, "step": 572300 }, { "epoch": 0.005724, "grad_norm": 0.19952714443206787, "learning_rate": 1e-05, "loss": 0.0209, "step": 572400 }, { "epoch": 0.005725, "grad_norm": 0.22455601394176483, "learning_rate": 1e-05, "loss": 0.021, "step": 572500 }, { "epoch": 0.005726, "grad_norm": 0.19845525920391083, "learning_rate": 1e-05, "loss": 0.0212, "step": 572600 }, { "epoch": 0.005727, "grad_norm": 0.15179429948329926, "learning_rate": 1e-05, "loss": 0.0212, "step": 572700 }, { "epoch": 0.005728, "grad_norm": 0.15827392041683197, "learning_rate": 1e-05, "loss": 0.0211, "step": 572800 }, { "epoch": 0.005729, "grad_norm": 0.21893253922462463, "learning_rate": 1e-05, "loss": 0.0208, "step": 572900 }, { "epoch": 0.00573, "grad_norm": 0.12060569226741791, "learning_rate": 1e-05, "loss": 0.0209, "step": 573000 }, { "epoch": 0.005731, "grad_norm": 0.203363835811615, "learning_rate": 1e-05, "loss": 0.0209, "step": 573100 }, { "epoch": 0.005732, "grad_norm": 0.1569388508796692, "learning_rate": 1e-05, "loss": 0.0213, "step": 573200 }, { "epoch": 0.005733, "grad_norm": 0.19552889466285706, "learning_rate": 1e-05, "loss": 0.0209, "step": 573300 }, { "epoch": 0.005734, "grad_norm": 0.18278442323207855, "learning_rate": 1e-05, "loss": 0.0214, "step": 573400 }, { "epoch": 0.005735, "grad_norm": 0.16588835418224335, "learning_rate": 1e-05, "loss": 0.0211, "step": 573500 }, { "epoch": 0.005736, "grad_norm": 0.26829811930656433, "learning_rate": 1e-05, "loss": 0.0207, "step": 573600 }, { "epoch": 0.005737, "grad_norm": 0.15834948420524597, "learning_rate": 1e-05, "loss": 0.021, "step": 573700 }, { "epoch": 0.005738, "grad_norm": 0.18455740809440613, "learning_rate": 1e-05, "loss": 0.0213, "step": 573800 }, { "epoch": 0.005739, "grad_norm": 0.13424226641654968, "learning_rate": 1e-05, "loss": 0.0216, "step": 573900 }, { "epoch": 0.00574, "grad_norm": 0.170083686709404, "learning_rate": 1e-05, "loss": 0.0217, "step": 574000 }, { "epoch": 0.005741, "grad_norm": 0.18398723006248474, "learning_rate": 1e-05, "loss": 0.0213, "step": 574100 }, { "epoch": 0.005742, "grad_norm": 0.1410660296678543, "learning_rate": 1e-05, "loss": 0.0214, "step": 574200 }, { "epoch": 0.005743, "grad_norm": 0.13537146151065826, "learning_rate": 1e-05, "loss": 0.0216, "step": 574300 }, { "epoch": 0.005744, "grad_norm": 0.18581359088420868, "learning_rate": 1e-05, "loss": 0.0211, "step": 574400 }, { "epoch": 0.005745, "grad_norm": 0.16050602495670319, "learning_rate": 1e-05, "loss": 0.0213, "step": 574500 }, { "epoch": 0.005746, "grad_norm": 0.16277818381786346, "learning_rate": 1e-05, "loss": 0.0216, "step": 574600 }, { "epoch": 0.005747, "grad_norm": 0.17467674612998962, "learning_rate": 1e-05, "loss": 0.0209, "step": 574700 }, { "epoch": 0.005748, "grad_norm": 0.2636488378047943, "learning_rate": 1e-05, "loss": 0.0216, "step": 574800 }, { "epoch": 0.005749, "grad_norm": 0.28531333804130554, "learning_rate": 1e-05, "loss": 0.0216, "step": 574900 }, { "epoch": 0.00575, "grad_norm": 0.1723947823047638, "learning_rate": 1e-05, "loss": 0.0214, "step": 575000 }, { "epoch": 0.005751, "grad_norm": 0.1928778737783432, "learning_rate": 1e-05, "loss": 0.0215, "step": 575100 }, { "epoch": 0.005752, "grad_norm": 0.2202211320400238, "learning_rate": 1e-05, "loss": 0.021, "step": 575200 }, { "epoch": 0.005753, "grad_norm": 0.20531338453292847, "learning_rate": 1e-05, "loss": 0.0218, "step": 575300 }, { "epoch": 0.005754, "grad_norm": 0.23002421855926514, "learning_rate": 1e-05, "loss": 0.0214, "step": 575400 }, { "epoch": 0.005755, "grad_norm": 0.19433046877384186, "learning_rate": 1e-05, "loss": 0.0213, "step": 575500 }, { "epoch": 0.005756, "grad_norm": 0.15410813689231873, "learning_rate": 1e-05, "loss": 0.021, "step": 575600 }, { "epoch": 0.005757, "grad_norm": 0.2407412976026535, "learning_rate": 1e-05, "loss": 0.0212, "step": 575700 }, { "epoch": 0.005758, "grad_norm": 0.18670165538787842, "learning_rate": 1e-05, "loss": 0.0217, "step": 575800 }, { "epoch": 0.005759, "grad_norm": 0.20037966966629028, "learning_rate": 1e-05, "loss": 0.021, "step": 575900 }, { "epoch": 0.00576, "grad_norm": 0.30088871717453003, "learning_rate": 1e-05, "loss": 0.0212, "step": 576000 }, { "epoch": 0.005761, "grad_norm": 0.17094355821609497, "learning_rate": 1e-05, "loss": 0.0209, "step": 576100 }, { "epoch": 0.005762, "grad_norm": 0.2136581540107727, "learning_rate": 1e-05, "loss": 0.021, "step": 576200 }, { "epoch": 0.005763, "grad_norm": 0.14944732189178467, "learning_rate": 1e-05, "loss": 0.0215, "step": 576300 }, { "epoch": 0.005764, "grad_norm": 0.1570085734128952, "learning_rate": 1e-05, "loss": 0.0214, "step": 576400 }, { "epoch": 0.005765, "grad_norm": 0.1391046941280365, "learning_rate": 1e-05, "loss": 0.0208, "step": 576500 }, { "epoch": 0.005766, "grad_norm": 0.24515706300735474, "learning_rate": 1e-05, "loss": 0.021, "step": 576600 }, { "epoch": 0.005767, "grad_norm": 0.15838991105556488, "learning_rate": 1e-05, "loss": 0.0212, "step": 576700 }, { "epoch": 0.005768, "grad_norm": 0.15124092996120453, "learning_rate": 1e-05, "loss": 0.0216, "step": 576800 }, { "epoch": 0.005769, "grad_norm": 0.2676244080066681, "learning_rate": 1e-05, "loss": 0.0209, "step": 576900 }, { "epoch": 0.00577, "grad_norm": 0.2247081696987152, "learning_rate": 1e-05, "loss": 0.0208, "step": 577000 }, { "epoch": 0.005771, "grad_norm": 0.1722600907087326, "learning_rate": 1e-05, "loss": 0.0213, "step": 577100 }, { "epoch": 0.005772, "grad_norm": 0.17253977060317993, "learning_rate": 1e-05, "loss": 0.0209, "step": 577200 }, { "epoch": 0.005773, "grad_norm": 0.22492411732673645, "learning_rate": 1e-05, "loss": 0.021, "step": 577300 }, { "epoch": 0.005774, "grad_norm": 0.1452348530292511, "learning_rate": 1e-05, "loss": 0.0215, "step": 577400 }, { "epoch": 0.005775, "grad_norm": 0.19413377344608307, "learning_rate": 1e-05, "loss": 0.0208, "step": 577500 }, { "epoch": 0.005776, "grad_norm": 0.1365891546010971, "learning_rate": 1e-05, "loss": 0.0211, "step": 577600 }, { "epoch": 0.005777, "grad_norm": 0.17511922121047974, "learning_rate": 1e-05, "loss": 0.0212, "step": 577700 }, { "epoch": 0.005778, "grad_norm": 0.14899873733520508, "learning_rate": 1e-05, "loss": 0.0209, "step": 577800 }, { "epoch": 0.005779, "grad_norm": 0.16723939776420593, "learning_rate": 1e-05, "loss": 0.0206, "step": 577900 }, { "epoch": 0.00578, "grad_norm": 0.11718878895044327, "learning_rate": 1e-05, "loss": 0.021, "step": 578000 }, { "epoch": 0.005781, "grad_norm": 0.15980643033981323, "learning_rate": 1e-05, "loss": 0.0212, "step": 578100 }, { "epoch": 0.005782, "grad_norm": 0.23330242931842804, "learning_rate": 1e-05, "loss": 0.0214, "step": 578200 }, { "epoch": 0.005783, "grad_norm": 0.1613931953907013, "learning_rate": 1e-05, "loss": 0.0212, "step": 578300 }, { "epoch": 0.005784, "grad_norm": 0.1629764884710312, "learning_rate": 1e-05, "loss": 0.0217, "step": 578400 }, { "epoch": 0.005785, "grad_norm": 0.2037486433982849, "learning_rate": 1e-05, "loss": 0.0214, "step": 578500 }, { "epoch": 0.005786, "grad_norm": 0.18275220692157745, "learning_rate": 1e-05, "loss": 0.021, "step": 578600 }, { "epoch": 0.005787, "grad_norm": 0.1742374300956726, "learning_rate": 1e-05, "loss": 0.0215, "step": 578700 }, { "epoch": 0.005788, "grad_norm": 0.14989320933818817, "learning_rate": 1e-05, "loss": 0.0213, "step": 578800 }, { "epoch": 0.005789, "grad_norm": 0.170790433883667, "learning_rate": 1e-05, "loss": 0.0212, "step": 578900 }, { "epoch": 0.00579, "grad_norm": 0.13825495541095734, "learning_rate": 1e-05, "loss": 0.0208, "step": 579000 }, { "epoch": 0.005791, "grad_norm": 0.1686275601387024, "learning_rate": 1e-05, "loss": 0.0212, "step": 579100 }, { "epoch": 0.005792, "grad_norm": 0.1947414129972458, "learning_rate": 1e-05, "loss": 0.0208, "step": 579200 }, { "epoch": 0.005793, "grad_norm": 0.1432606428861618, "learning_rate": 1e-05, "loss": 0.0215, "step": 579300 }, { "epoch": 0.005794, "grad_norm": 0.14401490986347198, "learning_rate": 1e-05, "loss": 0.0215, "step": 579400 }, { "epoch": 0.005795, "grad_norm": 0.19076552987098694, "learning_rate": 1e-05, "loss": 0.0216, "step": 579500 }, { "epoch": 0.005796, "grad_norm": 0.2035866528749466, "learning_rate": 1e-05, "loss": 0.0211, "step": 579600 }, { "epoch": 0.005797, "grad_norm": 0.14898575842380524, "learning_rate": 1e-05, "loss": 0.0212, "step": 579700 }, { "epoch": 0.005798, "grad_norm": 0.15350264310836792, "learning_rate": 1e-05, "loss": 0.021, "step": 579800 }, { "epoch": 0.005799, "grad_norm": 0.14569734036922455, "learning_rate": 1e-05, "loss": 0.0212, "step": 579900 }, { "epoch": 0.0058, "grad_norm": 0.1544477790594101, "learning_rate": 1e-05, "loss": 0.021, "step": 580000 }, { "epoch": 0.0058, "eval_loss": 0.019604716449975967, "eval_runtime": 196.251, "eval_samples_per_second": 254.776, "eval_steps_per_second": 15.923, "step": 580000 }, { "epoch": 0.005801, "grad_norm": 0.20394912362098694, "learning_rate": 1e-05, "loss": 0.0213, "step": 580100 }, { "epoch": 0.005802, "grad_norm": 0.1718025952577591, "learning_rate": 1e-05, "loss": 0.0216, "step": 580200 }, { "epoch": 0.005803, "grad_norm": 0.13232600688934326, "learning_rate": 1e-05, "loss": 0.021, "step": 580300 }, { "epoch": 0.005804, "grad_norm": 0.18981923162937164, "learning_rate": 1e-05, "loss": 0.0212, "step": 580400 }, { "epoch": 0.005805, "grad_norm": 0.22414672374725342, "learning_rate": 1e-05, "loss": 0.0214, "step": 580500 }, { "epoch": 0.005806, "grad_norm": 0.21210485696792603, "learning_rate": 1e-05, "loss": 0.0212, "step": 580600 }, { "epoch": 0.005807, "grad_norm": 0.14878253638744354, "learning_rate": 1e-05, "loss": 0.0211, "step": 580700 }, { "epoch": 0.005808, "grad_norm": 0.17055225372314453, "learning_rate": 1e-05, "loss": 0.021, "step": 580800 }, { "epoch": 0.005809, "grad_norm": 0.2357247918844223, "learning_rate": 1e-05, "loss": 0.0212, "step": 580900 }, { "epoch": 0.00581, "grad_norm": 0.28162702918052673, "learning_rate": 1e-05, "loss": 0.0209, "step": 581000 }, { "epoch": 0.005811, "grad_norm": 0.19320203363895416, "learning_rate": 1e-05, "loss": 0.0211, "step": 581100 }, { "epoch": 0.005812, "grad_norm": 0.14156867563724518, "learning_rate": 1e-05, "loss": 0.0216, "step": 581200 }, { "epoch": 0.005813, "grad_norm": 0.177739679813385, "learning_rate": 1e-05, "loss": 0.0211, "step": 581300 }, { "epoch": 0.005814, "grad_norm": 0.14907996356487274, "learning_rate": 1e-05, "loss": 0.0208, "step": 581400 }, { "epoch": 0.005815, "grad_norm": 0.21898695826530457, "learning_rate": 1e-05, "loss": 0.021, "step": 581500 }, { "epoch": 0.005816, "grad_norm": 0.18825113773345947, "learning_rate": 1e-05, "loss": 0.0217, "step": 581600 }, { "epoch": 0.005817, "grad_norm": 0.17399638891220093, "learning_rate": 1e-05, "loss": 0.021, "step": 581700 }, { "epoch": 0.005818, "grad_norm": 0.14294734597206116, "learning_rate": 1e-05, "loss": 0.0212, "step": 581800 }, { "epoch": 0.005819, "grad_norm": 0.20822007954120636, "learning_rate": 1e-05, "loss": 0.0214, "step": 581900 }, { "epoch": 0.00582, "grad_norm": 0.1908642202615738, "learning_rate": 1e-05, "loss": 0.0214, "step": 582000 }, { "epoch": 0.005821, "grad_norm": 0.19040462374687195, "learning_rate": 1e-05, "loss": 0.021, "step": 582100 }, { "epoch": 0.005822, "grad_norm": 0.16242177784442902, "learning_rate": 1e-05, "loss": 0.0209, "step": 582200 }, { "epoch": 0.005823, "grad_norm": 0.1957864761352539, "learning_rate": 1e-05, "loss": 0.0213, "step": 582300 }, { "epoch": 0.005824, "grad_norm": 0.11807893216609955, "learning_rate": 1e-05, "loss": 0.0213, "step": 582400 }, { "epoch": 0.005825, "grad_norm": 0.20970138907432556, "learning_rate": 1e-05, "loss": 0.0211, "step": 582500 }, { "epoch": 0.005826, "grad_norm": 0.18850143253803253, "learning_rate": 1e-05, "loss": 0.0214, "step": 582600 }, { "epoch": 0.005827, "grad_norm": 0.16357116401195526, "learning_rate": 1e-05, "loss": 0.0209, "step": 582700 }, { "epoch": 0.005828, "grad_norm": 0.2032591700553894, "learning_rate": 1e-05, "loss": 0.0211, "step": 582800 }, { "epoch": 0.005829, "grad_norm": 0.13383959233760834, "learning_rate": 1e-05, "loss": 0.0208, "step": 582900 }, { "epoch": 0.00583, "grad_norm": 0.19072847068309784, "learning_rate": 1e-05, "loss": 0.0213, "step": 583000 }, { "epoch": 0.005831, "grad_norm": 0.1696120947599411, "learning_rate": 1e-05, "loss": 0.0211, "step": 583100 }, { "epoch": 0.005832, "grad_norm": 0.1509893536567688, "learning_rate": 1e-05, "loss": 0.0214, "step": 583200 }, { "epoch": 0.005833, "grad_norm": 0.19000139832496643, "learning_rate": 1e-05, "loss": 0.0208, "step": 583300 }, { "epoch": 0.005834, "grad_norm": 0.19925692677497864, "learning_rate": 1e-05, "loss": 0.0208, "step": 583400 }, { "epoch": 0.005835, "grad_norm": 0.23576998710632324, "learning_rate": 1e-05, "loss": 0.0209, "step": 583500 }, { "epoch": 0.005836, "grad_norm": 0.17778004705905914, "learning_rate": 1e-05, "loss": 0.0214, "step": 583600 }, { "epoch": 0.005837, "grad_norm": 0.1620296686887741, "learning_rate": 1e-05, "loss": 0.0208, "step": 583700 }, { "epoch": 0.005838, "grad_norm": 0.21948404610157013, "learning_rate": 1e-05, "loss": 0.021, "step": 583800 }, { "epoch": 0.005839, "grad_norm": 0.17229190468788147, "learning_rate": 1e-05, "loss": 0.0214, "step": 583900 }, { "epoch": 0.00584, "grad_norm": 0.2568354904651642, "learning_rate": 1e-05, "loss": 0.0206, "step": 584000 }, { "epoch": 0.005841, "grad_norm": 0.14954774081707, "learning_rate": 1e-05, "loss": 0.0209, "step": 584100 }, { "epoch": 0.005842, "grad_norm": 0.15897485613822937, "learning_rate": 1e-05, "loss": 0.0208, "step": 584200 }, { "epoch": 0.005843, "grad_norm": 0.15381944179534912, "learning_rate": 1e-05, "loss": 0.0211, "step": 584300 }, { "epoch": 0.005844, "grad_norm": 0.20824666321277618, "learning_rate": 1e-05, "loss": 0.0213, "step": 584400 }, { "epoch": 0.005845, "grad_norm": 0.18293826282024384, "learning_rate": 1e-05, "loss": 0.0209, "step": 584500 }, { "epoch": 0.005846, "grad_norm": 0.1783250868320465, "learning_rate": 1e-05, "loss": 0.0208, "step": 584600 }, { "epoch": 0.005847, "grad_norm": 0.1731746941804886, "learning_rate": 1e-05, "loss": 0.0213, "step": 584700 }, { "epoch": 0.005848, "grad_norm": 0.13026444613933563, "learning_rate": 1e-05, "loss": 0.0209, "step": 584800 }, { "epoch": 0.005849, "grad_norm": 0.11039680987596512, "learning_rate": 1e-05, "loss": 0.021, "step": 584900 }, { "epoch": 0.00585, "grad_norm": 0.1633644700050354, "learning_rate": 1e-05, "loss": 0.0213, "step": 585000 }, { "epoch": 0.005851, "grad_norm": 0.20298650860786438, "learning_rate": 1e-05, "loss": 0.0213, "step": 585100 }, { "epoch": 0.005852, "grad_norm": 0.19005431234836578, "learning_rate": 1e-05, "loss": 0.0206, "step": 585200 }, { "epoch": 0.005853, "grad_norm": 0.1393861621618271, "learning_rate": 1e-05, "loss": 0.0214, "step": 585300 }, { "epoch": 0.005854, "grad_norm": 0.1728292852640152, "learning_rate": 1e-05, "loss": 0.021, "step": 585400 }, { "epoch": 0.005855, "grad_norm": 0.13528987765312195, "learning_rate": 1e-05, "loss": 0.0209, "step": 585500 }, { "epoch": 0.005856, "grad_norm": 0.1587122678756714, "learning_rate": 1e-05, "loss": 0.0216, "step": 585600 }, { "epoch": 0.005857, "grad_norm": 0.18153144419193268, "learning_rate": 1e-05, "loss": 0.0204, "step": 585700 }, { "epoch": 0.005858, "grad_norm": 0.1569143831729889, "learning_rate": 1e-05, "loss": 0.0209, "step": 585800 }, { "epoch": 0.005859, "grad_norm": 0.22489206492900848, "learning_rate": 1e-05, "loss": 0.0208, "step": 585900 }, { "epoch": 0.00586, "grad_norm": 0.1559339463710785, "learning_rate": 1e-05, "loss": 0.0215, "step": 586000 }, { "epoch": 0.005861, "grad_norm": 0.1280537098646164, "learning_rate": 1e-05, "loss": 0.0205, "step": 586100 }, { "epoch": 0.005862, "grad_norm": 0.15204986929893494, "learning_rate": 1e-05, "loss": 0.0213, "step": 586200 }, { "epoch": 0.005863, "grad_norm": 0.1458340734243393, "learning_rate": 1e-05, "loss": 0.021, "step": 586300 }, { "epoch": 0.005864, "grad_norm": 0.1826070249080658, "learning_rate": 1e-05, "loss": 0.0208, "step": 586400 }, { "epoch": 0.005865, "grad_norm": 0.10834330320358276, "learning_rate": 1e-05, "loss": 0.0212, "step": 586500 }, { "epoch": 0.005866, "grad_norm": 0.2098936140537262, "learning_rate": 1e-05, "loss": 0.0212, "step": 586600 }, { "epoch": 0.005867, "grad_norm": 0.1675851196050644, "learning_rate": 1e-05, "loss": 0.0209, "step": 586700 }, { "epoch": 0.005868, "grad_norm": 0.2228291928768158, "learning_rate": 1e-05, "loss": 0.0212, "step": 586800 }, { "epoch": 0.005869, "grad_norm": 0.18493176996707916, "learning_rate": 1e-05, "loss": 0.0211, "step": 586900 }, { "epoch": 0.00587, "grad_norm": 0.17395280301570892, "learning_rate": 1e-05, "loss": 0.0207, "step": 587000 }, { "epoch": 0.005871, "grad_norm": 0.1433342695236206, "learning_rate": 1e-05, "loss": 0.0208, "step": 587100 }, { "epoch": 0.005872, "grad_norm": 0.15819013118743896, "learning_rate": 1e-05, "loss": 0.021, "step": 587200 }, { "epoch": 0.005873, "grad_norm": 0.2050904631614685, "learning_rate": 1e-05, "loss": 0.0204, "step": 587300 }, { "epoch": 0.005874, "grad_norm": 0.17648430168628693, "learning_rate": 1e-05, "loss": 0.0213, "step": 587400 }, { "epoch": 0.005875, "grad_norm": 0.17256104946136475, "learning_rate": 1e-05, "loss": 0.0205, "step": 587500 }, { "epoch": 0.005876, "grad_norm": 0.2002917379140854, "learning_rate": 1e-05, "loss": 0.0215, "step": 587600 }, { "epoch": 0.005877, "grad_norm": 0.1973036527633667, "learning_rate": 1e-05, "loss": 0.0212, "step": 587700 }, { "epoch": 0.005878, "grad_norm": 0.23588219285011292, "learning_rate": 1e-05, "loss": 0.0213, "step": 587800 }, { "epoch": 0.005879, "grad_norm": 0.16127312183380127, "learning_rate": 1e-05, "loss": 0.0216, "step": 587900 }, { "epoch": 0.00588, "grad_norm": 0.14321866631507874, "learning_rate": 1e-05, "loss": 0.0208, "step": 588000 }, { "epoch": 0.005881, "grad_norm": 0.18183167278766632, "learning_rate": 1e-05, "loss": 0.0203, "step": 588100 }, { "epoch": 0.005882, "grad_norm": 0.2473861426115036, "learning_rate": 1e-05, "loss": 0.0214, "step": 588200 }, { "epoch": 0.005883, "grad_norm": 0.16056878864765167, "learning_rate": 1e-05, "loss": 0.0213, "step": 588300 }, { "epoch": 0.005884, "grad_norm": 0.2516213655471802, "learning_rate": 1e-05, "loss": 0.0213, "step": 588400 }, { "epoch": 0.005885, "grad_norm": 0.1557886302471161, "learning_rate": 1e-05, "loss": 0.0211, "step": 588500 }, { "epoch": 0.005886, "grad_norm": 0.16567711532115936, "learning_rate": 1e-05, "loss": 0.0207, "step": 588600 }, { "epoch": 0.005887, "grad_norm": 0.1997353732585907, "learning_rate": 1e-05, "loss": 0.0209, "step": 588700 }, { "epoch": 0.005888, "grad_norm": 0.1847914457321167, "learning_rate": 1e-05, "loss": 0.0207, "step": 588800 }, { "epoch": 0.005889, "grad_norm": 0.2764768600463867, "learning_rate": 1e-05, "loss": 0.0208, "step": 588900 }, { "epoch": 0.00589, "grad_norm": 0.18396319448947906, "learning_rate": 1e-05, "loss": 0.021, "step": 589000 }, { "epoch": 0.005891, "grad_norm": 0.23451435565948486, "learning_rate": 1e-05, "loss": 0.0209, "step": 589100 }, { "epoch": 0.005892, "grad_norm": 0.2046915739774704, "learning_rate": 1e-05, "loss": 0.0211, "step": 589200 }, { "epoch": 0.005893, "grad_norm": 0.22041521966457367, "learning_rate": 1e-05, "loss": 0.0214, "step": 589300 }, { "epoch": 0.005894, "grad_norm": 0.17514733970165253, "learning_rate": 1e-05, "loss": 0.0209, "step": 589400 }, { "epoch": 0.005895, "grad_norm": 0.20346695184707642, "learning_rate": 1e-05, "loss": 0.021, "step": 589500 }, { "epoch": 0.005896, "grad_norm": 0.18584932386875153, "learning_rate": 1e-05, "loss": 0.0206, "step": 589600 }, { "epoch": 0.005897, "grad_norm": 0.22433744370937347, "learning_rate": 1e-05, "loss": 0.0209, "step": 589700 }, { "epoch": 0.005898, "grad_norm": 0.12199508398771286, "learning_rate": 1e-05, "loss": 0.0208, "step": 589800 }, { "epoch": 0.005899, "grad_norm": 0.223799929022789, "learning_rate": 1e-05, "loss": 0.0213, "step": 589900 }, { "epoch": 0.0059, "grad_norm": 0.15070270001888275, "learning_rate": 1e-05, "loss": 0.0208, "step": 590000 }, { "epoch": 0.005901, "grad_norm": 0.17144007980823517, "learning_rate": 1e-05, "loss": 0.0211, "step": 590100 }, { "epoch": 0.005902, "grad_norm": 0.12870094180107117, "learning_rate": 1e-05, "loss": 0.0208, "step": 590200 }, { "epoch": 0.005903, "grad_norm": 0.1982165277004242, "learning_rate": 1e-05, "loss": 0.0214, "step": 590300 }, { "epoch": 0.005904, "grad_norm": 0.15241596102714539, "learning_rate": 1e-05, "loss": 0.0213, "step": 590400 }, { "epoch": 0.005905, "grad_norm": 0.17031265795230865, "learning_rate": 1e-05, "loss": 0.0213, "step": 590500 }, { "epoch": 0.005906, "grad_norm": 0.14449633657932281, "learning_rate": 1e-05, "loss": 0.0212, "step": 590600 }, { "epoch": 0.005907, "grad_norm": 0.21342654526233673, "learning_rate": 1e-05, "loss": 0.0205, "step": 590700 }, { "epoch": 0.005908, "grad_norm": 0.18231931328773499, "learning_rate": 1e-05, "loss": 0.0212, "step": 590800 }, { "epoch": 0.005909, "grad_norm": 0.19250182807445526, "learning_rate": 1e-05, "loss": 0.0214, "step": 590900 }, { "epoch": 0.00591, "grad_norm": 0.3286546468734741, "learning_rate": 1e-05, "loss": 0.0209, "step": 591000 }, { "epoch": 0.005911, "grad_norm": 0.18192987143993378, "learning_rate": 1e-05, "loss": 0.0212, "step": 591100 }, { "epoch": 0.005912, "grad_norm": 0.23460052907466888, "learning_rate": 1e-05, "loss": 0.0209, "step": 591200 }, { "epoch": 0.005913, "grad_norm": 0.2641129791736603, "learning_rate": 1e-05, "loss": 0.0209, "step": 591300 }, { "epoch": 0.005914, "grad_norm": 0.2107561230659485, "learning_rate": 1e-05, "loss": 0.0206, "step": 591400 }, { "epoch": 0.005915, "grad_norm": 0.19154031574726105, "learning_rate": 1e-05, "loss": 0.0208, "step": 591500 }, { "epoch": 0.005916, "grad_norm": 0.14417146146297455, "learning_rate": 1e-05, "loss": 0.0209, "step": 591600 }, { "epoch": 0.005917, "grad_norm": 0.17162244021892548, "learning_rate": 1e-05, "loss": 0.0208, "step": 591700 }, { "epoch": 0.005918, "grad_norm": 0.2153385728597641, "learning_rate": 1e-05, "loss": 0.0209, "step": 591800 }, { "epoch": 0.005919, "grad_norm": 0.2140119969844818, "learning_rate": 1e-05, "loss": 0.0211, "step": 591900 }, { "epoch": 0.00592, "grad_norm": 0.17804037034511566, "learning_rate": 1e-05, "loss": 0.0207, "step": 592000 }, { "epoch": 0.005921, "grad_norm": 0.20471513271331787, "learning_rate": 1e-05, "loss": 0.021, "step": 592100 }, { "epoch": 0.005922, "grad_norm": 0.17570330202579498, "learning_rate": 1e-05, "loss": 0.0212, "step": 592200 }, { "epoch": 0.005923, "grad_norm": 0.14662623405456543, "learning_rate": 1e-05, "loss": 0.0214, "step": 592300 }, { "epoch": 0.005924, "grad_norm": 0.141471728682518, "learning_rate": 1e-05, "loss": 0.0207, "step": 592400 }, { "epoch": 0.005925, "grad_norm": 0.16483379900455475, "learning_rate": 1e-05, "loss": 0.0204, "step": 592500 }, { "epoch": 0.005926, "grad_norm": 0.16892318427562714, "learning_rate": 1e-05, "loss": 0.0214, "step": 592600 }, { "epoch": 0.005927, "grad_norm": 0.10870113223791122, "learning_rate": 1e-05, "loss": 0.0208, "step": 592700 }, { "epoch": 0.005928, "grad_norm": 0.18551918864250183, "learning_rate": 1e-05, "loss": 0.0214, "step": 592800 }, { "epoch": 0.005929, "grad_norm": 0.14654423296451569, "learning_rate": 1e-05, "loss": 0.0213, "step": 592900 }, { "epoch": 0.00593, "grad_norm": 0.20150382816791534, "learning_rate": 1e-05, "loss": 0.021, "step": 593000 }, { "epoch": 0.005931, "grad_norm": 0.17567545175552368, "learning_rate": 1e-05, "loss": 0.021, "step": 593100 }, { "epoch": 0.005932, "grad_norm": 0.16860084235668182, "learning_rate": 1e-05, "loss": 0.0213, "step": 593200 }, { "epoch": 0.005933, "grad_norm": 0.19154928624629974, "learning_rate": 1e-05, "loss": 0.0209, "step": 593300 }, { "epoch": 0.005934, "grad_norm": 0.1621568351984024, "learning_rate": 1e-05, "loss": 0.0207, "step": 593400 }, { "epoch": 0.005935, "grad_norm": 0.28719133138656616, "learning_rate": 1e-05, "loss": 0.021, "step": 593500 }, { "epoch": 0.005936, "grad_norm": 0.16984814405441284, "learning_rate": 1e-05, "loss": 0.0211, "step": 593600 }, { "epoch": 0.005937, "grad_norm": 0.1657189130783081, "learning_rate": 1e-05, "loss": 0.0211, "step": 593700 }, { "epoch": 0.005938, "grad_norm": 0.12272246927022934, "learning_rate": 1e-05, "loss": 0.0209, "step": 593800 }, { "epoch": 0.005939, "grad_norm": 0.16553889214992523, "learning_rate": 1e-05, "loss": 0.0212, "step": 593900 }, { "epoch": 0.00594, "grad_norm": 0.1800086349248886, "learning_rate": 1e-05, "loss": 0.021, "step": 594000 }, { "epoch": 0.005941, "grad_norm": 0.13346464931964874, "learning_rate": 1e-05, "loss": 0.0206, "step": 594100 }, { "epoch": 0.005942, "grad_norm": 0.15473546087741852, "learning_rate": 1e-05, "loss": 0.0209, "step": 594200 }, { "epoch": 0.005943, "grad_norm": 0.14423659443855286, "learning_rate": 1e-05, "loss": 0.0212, "step": 594300 }, { "epoch": 0.005944, "grad_norm": 0.21053585410118103, "learning_rate": 1e-05, "loss": 0.0208, "step": 594400 }, { "epoch": 0.005945, "grad_norm": 0.18162794411182404, "learning_rate": 1e-05, "loss": 0.0209, "step": 594500 }, { "epoch": 0.005946, "grad_norm": 0.15467818081378937, "learning_rate": 1e-05, "loss": 0.0211, "step": 594600 }, { "epoch": 0.005947, "grad_norm": 0.2023337483406067, "learning_rate": 1e-05, "loss": 0.0207, "step": 594700 }, { "epoch": 0.005948, "grad_norm": 0.1545305848121643, "learning_rate": 1e-05, "loss": 0.0211, "step": 594800 }, { "epoch": 0.005949, "grad_norm": 0.1720171421766281, "learning_rate": 1e-05, "loss": 0.0206, "step": 594900 }, { "epoch": 0.00595, "grad_norm": 0.16523860394954681, "learning_rate": 1e-05, "loss": 0.021, "step": 595000 }, { "epoch": 0.005951, "grad_norm": 0.21775977313518524, "learning_rate": 1e-05, "loss": 0.0211, "step": 595100 }, { "epoch": 0.005952, "grad_norm": 0.21623627841472626, "learning_rate": 1e-05, "loss": 0.0211, "step": 595200 }, { "epoch": 0.005953, "grad_norm": 0.16431090235710144, "learning_rate": 1e-05, "loss": 0.0208, "step": 595300 }, { "epoch": 0.005954, "grad_norm": 0.1542142778635025, "learning_rate": 1e-05, "loss": 0.0211, "step": 595400 }, { "epoch": 0.005955, "grad_norm": 0.2312825620174408, "learning_rate": 1e-05, "loss": 0.0216, "step": 595500 }, { "epoch": 0.005956, "grad_norm": 0.16911114752292633, "learning_rate": 1e-05, "loss": 0.021, "step": 595600 }, { "epoch": 0.005957, "grad_norm": 0.3033979833126068, "learning_rate": 1e-05, "loss": 0.0209, "step": 595700 }, { "epoch": 0.005958, "grad_norm": 0.1702926605939865, "learning_rate": 1e-05, "loss": 0.0207, "step": 595800 }, { "epoch": 0.005959, "grad_norm": 0.14844830334186554, "learning_rate": 1e-05, "loss": 0.021, "step": 595900 }, { "epoch": 0.00596, "grad_norm": 0.18650585412979126, "learning_rate": 1e-05, "loss": 0.0211, "step": 596000 }, { "epoch": 0.005961, "grad_norm": 0.16671159863471985, "learning_rate": 1e-05, "loss": 0.0211, "step": 596100 }, { "epoch": 0.005962, "grad_norm": 0.2010623812675476, "learning_rate": 1e-05, "loss": 0.0207, "step": 596200 }, { "epoch": 0.005963, "grad_norm": 0.1901353895664215, "learning_rate": 1e-05, "loss": 0.021, "step": 596300 }, { "epoch": 0.005964, "grad_norm": 0.1923867166042328, "learning_rate": 1e-05, "loss": 0.021, "step": 596400 }, { "epoch": 0.005965, "grad_norm": 0.2142161726951599, "learning_rate": 1e-05, "loss": 0.021, "step": 596500 }, { "epoch": 0.005966, "grad_norm": 0.18099001049995422, "learning_rate": 1e-05, "loss": 0.0207, "step": 596600 }, { "epoch": 0.005967, "grad_norm": 0.168930321931839, "learning_rate": 1e-05, "loss": 0.0207, "step": 596700 }, { "epoch": 0.005968, "grad_norm": 0.20115914940834045, "learning_rate": 1e-05, "loss": 0.0206, "step": 596800 }, { "epoch": 0.005969, "grad_norm": 0.2624085247516632, "learning_rate": 1e-05, "loss": 0.0206, "step": 596900 }, { "epoch": 0.00597, "grad_norm": 0.22696545720100403, "learning_rate": 1e-05, "loss": 0.0208, "step": 597000 }, { "epoch": 0.005971, "grad_norm": 0.16387973725795746, "learning_rate": 1e-05, "loss": 0.0209, "step": 597100 }, { "epoch": 0.005972, "grad_norm": 0.15778015553951263, "learning_rate": 1e-05, "loss": 0.0208, "step": 597200 }, { "epoch": 0.005973, "grad_norm": 0.1426464170217514, "learning_rate": 1e-05, "loss": 0.0206, "step": 597300 }, { "epoch": 0.005974, "grad_norm": 0.2028937190771103, "learning_rate": 1e-05, "loss": 0.0208, "step": 597400 }, { "epoch": 0.005975, "grad_norm": 0.15281732380390167, "learning_rate": 1e-05, "loss": 0.0207, "step": 597500 }, { "epoch": 0.005976, "grad_norm": 0.1580214649438858, "learning_rate": 1e-05, "loss": 0.0213, "step": 597600 }, { "epoch": 0.005977, "grad_norm": 0.28952741622924805, "learning_rate": 1e-05, "loss": 0.0211, "step": 597700 }, { "epoch": 0.005978, "grad_norm": 0.17582052946090698, "learning_rate": 1e-05, "loss": 0.0211, "step": 597800 }, { "epoch": 0.005979, "grad_norm": 0.18102188408374786, "learning_rate": 1e-05, "loss": 0.021, "step": 597900 }, { "epoch": 0.00598, "grad_norm": 0.17440645396709442, "learning_rate": 1e-05, "loss": 0.0207, "step": 598000 }, { "epoch": 0.005981, "grad_norm": 0.21589994430541992, "learning_rate": 1e-05, "loss": 0.0213, "step": 598100 }, { "epoch": 0.005982, "grad_norm": 0.19440950453281403, "learning_rate": 1e-05, "loss": 0.0208, "step": 598200 }, { "epoch": 0.005983, "grad_norm": 0.16933834552764893, "learning_rate": 1e-05, "loss": 0.0211, "step": 598300 }, { "epoch": 0.005984, "grad_norm": 0.18784812092781067, "learning_rate": 1e-05, "loss": 0.0208, "step": 598400 }, { "epoch": 0.005985, "grad_norm": 0.18001176416873932, "learning_rate": 1e-05, "loss": 0.0214, "step": 598500 }, { "epoch": 0.005986, "grad_norm": 0.13449415564537048, "learning_rate": 1e-05, "loss": 0.0204, "step": 598600 }, { "epoch": 0.005987, "grad_norm": 0.2236074060201645, "learning_rate": 1e-05, "loss": 0.0213, "step": 598700 }, { "epoch": 0.005988, "grad_norm": 0.19698598980903625, "learning_rate": 1e-05, "loss": 0.021, "step": 598800 }, { "epoch": 0.005989, "grad_norm": 0.16076615452766418, "learning_rate": 1e-05, "loss": 0.0203, "step": 598900 }, { "epoch": 0.00599, "grad_norm": 0.12988390028476715, "learning_rate": 1e-05, "loss": 0.0207, "step": 599000 }, { "epoch": 0.005991, "grad_norm": 0.2562560737133026, "learning_rate": 1e-05, "loss": 0.021, "step": 599100 }, { "epoch": 0.005992, "grad_norm": 0.1437900960445404, "learning_rate": 1e-05, "loss": 0.0214, "step": 599200 }, { "epoch": 0.005993, "grad_norm": 0.2185530811548233, "learning_rate": 1e-05, "loss": 0.0206, "step": 599300 }, { "epoch": 0.005994, "grad_norm": 0.21971367299556732, "learning_rate": 1e-05, "loss": 0.021, "step": 599400 }, { "epoch": 0.005995, "grad_norm": 0.23873236775398254, "learning_rate": 1e-05, "loss": 0.0213, "step": 599500 }, { "epoch": 0.005996, "grad_norm": 0.1298198401927948, "learning_rate": 1e-05, "loss": 0.0205, "step": 599600 }, { "epoch": 0.005997, "grad_norm": 0.1951810121536255, "learning_rate": 1e-05, "loss": 0.0209, "step": 599700 }, { "epoch": 0.005998, "grad_norm": 0.19229735434055328, "learning_rate": 1e-05, "loss": 0.021, "step": 599800 }, { "epoch": 0.005999, "grad_norm": 0.17428280413150787, "learning_rate": 1e-05, "loss": 0.0215, "step": 599900 }, { "epoch": 0.006, "grad_norm": 0.1422475278377533, "learning_rate": 1e-05, "loss": 0.0205, "step": 600000 }, { "epoch": 0.006, "eval_loss": 0.018452569842338562, "eval_runtime": 189.4436, "eval_samples_per_second": 263.931, "eval_steps_per_second": 16.496, "step": 600000 }, { "epoch": 0.006001, "grad_norm": 0.17196759581565857, "learning_rate": 1e-05, "loss": 0.0209, "step": 600100 }, { "epoch": 0.006002, "grad_norm": 0.23099541664123535, "learning_rate": 1e-05, "loss": 0.0203, "step": 600200 }, { "epoch": 0.006003, "grad_norm": 0.2012059986591339, "learning_rate": 1e-05, "loss": 0.0212, "step": 600300 }, { "epoch": 0.006004, "grad_norm": 0.18296268582344055, "learning_rate": 1e-05, "loss": 0.0206, "step": 600400 }, { "epoch": 0.006005, "grad_norm": 0.198968306183815, "learning_rate": 1e-05, "loss": 0.0208, "step": 600500 }, { "epoch": 0.006006, "grad_norm": 0.16781292855739594, "learning_rate": 1e-05, "loss": 0.0205, "step": 600600 }, { "epoch": 0.006007, "grad_norm": 0.20592443645000458, "learning_rate": 1e-05, "loss": 0.0207, "step": 600700 }, { "epoch": 0.006008, "grad_norm": 0.1739392727613449, "learning_rate": 1e-05, "loss": 0.0213, "step": 600800 }, { "epoch": 0.006009, "grad_norm": 0.1772734820842743, "learning_rate": 1e-05, "loss": 0.0213, "step": 600900 }, { "epoch": 0.00601, "grad_norm": 0.19868122041225433, "learning_rate": 1e-05, "loss": 0.0211, "step": 601000 }, { "epoch": 0.006011, "grad_norm": 0.1607896089553833, "learning_rate": 1e-05, "loss": 0.0212, "step": 601100 }, { "epoch": 0.006012, "grad_norm": 0.15844649076461792, "learning_rate": 1e-05, "loss": 0.0212, "step": 601200 }, { "epoch": 0.006013, "grad_norm": 0.1556393951177597, "learning_rate": 1e-05, "loss": 0.0209, "step": 601300 }, { "epoch": 0.006014, "grad_norm": 0.1675998717546463, "learning_rate": 1e-05, "loss": 0.0203, "step": 601400 }, { "epoch": 0.006015, "grad_norm": 0.1623888462781906, "learning_rate": 1e-05, "loss": 0.0206, "step": 601500 }, { "epoch": 0.006016, "grad_norm": 0.1570730358362198, "learning_rate": 1e-05, "loss": 0.0205, "step": 601600 }, { "epoch": 0.006017, "grad_norm": 0.1592717170715332, "learning_rate": 1e-05, "loss": 0.0208, "step": 601700 }, { "epoch": 0.006018, "grad_norm": 0.17408688366413116, "learning_rate": 1e-05, "loss": 0.0212, "step": 601800 }, { "epoch": 0.006019, "grad_norm": 0.19918905198574066, "learning_rate": 1e-05, "loss": 0.0214, "step": 601900 }, { "epoch": 0.00602, "grad_norm": 0.1566506326198578, "learning_rate": 1e-05, "loss": 0.0209, "step": 602000 }, { "epoch": 0.006021, "grad_norm": 0.168969064950943, "learning_rate": 1e-05, "loss": 0.0204, "step": 602100 }, { "epoch": 0.006022, "grad_norm": 0.12540443241596222, "learning_rate": 1e-05, "loss": 0.0207, "step": 602200 }, { "epoch": 0.006023, "grad_norm": 0.18774083256721497, "learning_rate": 1e-05, "loss": 0.0213, "step": 602300 }, { "epoch": 0.006024, "grad_norm": 0.13456584513187408, "learning_rate": 1e-05, "loss": 0.0209, "step": 602400 }, { "epoch": 0.006025, "grad_norm": 0.16856783628463745, "learning_rate": 1e-05, "loss": 0.0214, "step": 602500 }, { "epoch": 0.006026, "grad_norm": 0.24269159138202667, "learning_rate": 1e-05, "loss": 0.0208, "step": 602600 }, { "epoch": 0.006027, "grad_norm": 0.13302558660507202, "learning_rate": 1e-05, "loss": 0.021, "step": 602700 }, { "epoch": 0.006028, "grad_norm": 0.22404703497886658, "learning_rate": 1e-05, "loss": 0.0209, "step": 602800 }, { "epoch": 0.006029, "grad_norm": 0.22005099058151245, "learning_rate": 1e-05, "loss": 0.0208, "step": 602900 }, { "epoch": 0.00603, "grad_norm": 0.17096078395843506, "learning_rate": 1e-05, "loss": 0.0208, "step": 603000 }, { "epoch": 0.006031, "grad_norm": 0.2275574505329132, "learning_rate": 1e-05, "loss": 0.0207, "step": 603100 }, { "epoch": 0.006032, "grad_norm": 0.16457614302635193, "learning_rate": 1e-05, "loss": 0.0211, "step": 603200 }, { "epoch": 0.006033, "grad_norm": 0.17625626921653748, "learning_rate": 1e-05, "loss": 0.0208, "step": 603300 }, { "epoch": 0.006034, "grad_norm": 0.15284883975982666, "learning_rate": 1e-05, "loss": 0.0206, "step": 603400 }, { "epoch": 0.006035, "grad_norm": 0.17357847094535828, "learning_rate": 1e-05, "loss": 0.0206, "step": 603500 }, { "epoch": 0.006036, "grad_norm": 0.1421729326248169, "learning_rate": 1e-05, "loss": 0.0211, "step": 603600 }, { "epoch": 0.006037, "grad_norm": 0.178543820977211, "learning_rate": 1e-05, "loss": 0.0211, "step": 603700 }, { "epoch": 0.006038, "grad_norm": 0.18071125447750092, "learning_rate": 1e-05, "loss": 0.0212, "step": 603800 }, { "epoch": 0.006039, "grad_norm": 0.18797177076339722, "learning_rate": 1e-05, "loss": 0.0204, "step": 603900 }, { "epoch": 0.00604, "grad_norm": 0.19586630165576935, "learning_rate": 1e-05, "loss": 0.0207, "step": 604000 }, { "epoch": 0.006041, "grad_norm": 0.25218459963798523, "learning_rate": 1e-05, "loss": 0.021, "step": 604100 }, { "epoch": 0.006042, "grad_norm": 0.14411720633506775, "learning_rate": 1e-05, "loss": 0.0211, "step": 604200 }, { "epoch": 0.006043, "grad_norm": 0.16786764562129974, "learning_rate": 1e-05, "loss": 0.0208, "step": 604300 }, { "epoch": 0.006044, "grad_norm": 0.20318549871444702, "learning_rate": 1e-05, "loss": 0.0209, "step": 604400 }, { "epoch": 0.006045, "grad_norm": 0.17685849964618683, "learning_rate": 1e-05, "loss": 0.0208, "step": 604500 }, { "epoch": 0.006046, "grad_norm": 0.18327860534191132, "learning_rate": 1e-05, "loss": 0.0204, "step": 604600 }, { "epoch": 0.006047, "grad_norm": 0.15164539217948914, "learning_rate": 1e-05, "loss": 0.0209, "step": 604700 }, { "epoch": 0.006048, "grad_norm": 0.19477126002311707, "learning_rate": 1e-05, "loss": 0.0207, "step": 604800 }, { "epoch": 0.006049, "grad_norm": 0.17479729652404785, "learning_rate": 1e-05, "loss": 0.0208, "step": 604900 }, { "epoch": 0.00605, "grad_norm": 0.13099528849124908, "learning_rate": 1e-05, "loss": 0.021, "step": 605000 }, { "epoch": 0.006051, "grad_norm": 0.22696848213672638, "learning_rate": 1e-05, "loss": 0.0205, "step": 605100 }, { "epoch": 0.006052, "grad_norm": 0.17529743909835815, "learning_rate": 1e-05, "loss": 0.0205, "step": 605200 }, { "epoch": 0.006053, "grad_norm": 0.121192067861557, "learning_rate": 1e-05, "loss": 0.0206, "step": 605300 }, { "epoch": 0.006054, "grad_norm": 0.2850170135498047, "learning_rate": 1e-05, "loss": 0.0206, "step": 605400 }, { "epoch": 0.006055, "grad_norm": 0.17221790552139282, "learning_rate": 1e-05, "loss": 0.0209, "step": 605500 }, { "epoch": 0.006056, "grad_norm": 0.18437904119491577, "learning_rate": 1e-05, "loss": 0.0211, "step": 605600 }, { "epoch": 0.006057, "grad_norm": 0.17681542038917542, "learning_rate": 1e-05, "loss": 0.0213, "step": 605700 }, { "epoch": 0.006058, "grad_norm": 0.13671673834323883, "learning_rate": 1e-05, "loss": 0.0207, "step": 605800 }, { "epoch": 0.006059, "grad_norm": 0.23387286067008972, "learning_rate": 1e-05, "loss": 0.0205, "step": 605900 }, { "epoch": 0.00606, "grad_norm": 0.1939520686864853, "learning_rate": 1e-05, "loss": 0.02, "step": 606000 }, { "epoch": 0.006061, "grad_norm": 0.19357256591320038, "learning_rate": 1e-05, "loss": 0.0207, "step": 606100 }, { "epoch": 0.006062, "grad_norm": 0.16154052317142487, "learning_rate": 1e-05, "loss": 0.0207, "step": 606200 }, { "epoch": 0.006063, "grad_norm": 0.17146046459674835, "learning_rate": 1e-05, "loss": 0.0207, "step": 606300 }, { "epoch": 0.006064, "grad_norm": 0.1576167345046997, "learning_rate": 1e-05, "loss": 0.0205, "step": 606400 }, { "epoch": 0.006065, "grad_norm": 0.16342003643512726, "learning_rate": 1e-05, "loss": 0.021, "step": 606500 }, { "epoch": 0.006066, "grad_norm": 0.1955215036869049, "learning_rate": 1e-05, "loss": 0.0209, "step": 606600 }, { "epoch": 0.006067, "grad_norm": 0.13196547329425812, "learning_rate": 1e-05, "loss": 0.02, "step": 606700 }, { "epoch": 0.006068, "grad_norm": 0.19242723286151886, "learning_rate": 1e-05, "loss": 0.0211, "step": 606800 }, { "epoch": 0.006069, "grad_norm": 0.17468036711215973, "learning_rate": 1e-05, "loss": 0.0208, "step": 606900 }, { "epoch": 0.00607, "grad_norm": 0.19051644206047058, "learning_rate": 1e-05, "loss": 0.0208, "step": 607000 }, { "epoch": 0.006071, "grad_norm": 0.18340909481048584, "learning_rate": 1e-05, "loss": 0.0206, "step": 607100 }, { "epoch": 0.006072, "grad_norm": 0.1477491408586502, "learning_rate": 1e-05, "loss": 0.021, "step": 607200 }, { "epoch": 0.006073, "grad_norm": 0.17110177874565125, "learning_rate": 1e-05, "loss": 0.021, "step": 607300 }, { "epoch": 0.006074, "grad_norm": 0.17124208807945251, "learning_rate": 1e-05, "loss": 0.0207, "step": 607400 }, { "epoch": 0.006075, "grad_norm": 0.1883884221315384, "learning_rate": 1e-05, "loss": 0.0211, "step": 607500 }, { "epoch": 0.006076, "grad_norm": 0.20916259288787842, "learning_rate": 1e-05, "loss": 0.0209, "step": 607600 }, { "epoch": 0.006077, "grad_norm": 0.1330225169658661, "learning_rate": 1e-05, "loss": 0.0205, "step": 607700 }, { "epoch": 0.006078, "grad_norm": 0.1503976732492447, "learning_rate": 1e-05, "loss": 0.0209, "step": 607800 }, { "epoch": 0.006079, "grad_norm": 0.1838831901550293, "learning_rate": 1e-05, "loss": 0.021, "step": 607900 }, { "epoch": 0.00608, "grad_norm": 0.16155873239040375, "learning_rate": 1e-05, "loss": 0.0211, "step": 608000 }, { "epoch": 0.006081, "grad_norm": 0.18437570333480835, "learning_rate": 1e-05, "loss": 0.0207, "step": 608100 }, { "epoch": 0.006082, "grad_norm": 0.17497193813323975, "learning_rate": 1e-05, "loss": 0.0207, "step": 608200 }, { "epoch": 0.006083, "grad_norm": 0.21737024188041687, "learning_rate": 1e-05, "loss": 0.0205, "step": 608300 }, { "epoch": 0.006084, "grad_norm": 0.18087609112262726, "learning_rate": 1e-05, "loss": 0.0205, "step": 608400 }, { "epoch": 0.006085, "grad_norm": 0.16963231563568115, "learning_rate": 1e-05, "loss": 0.0208, "step": 608500 }, { "epoch": 0.006086, "grad_norm": 0.18508322536945343, "learning_rate": 1e-05, "loss": 0.0208, "step": 608600 }, { "epoch": 0.006087, "grad_norm": 0.1665726900100708, "learning_rate": 1e-05, "loss": 0.0208, "step": 608700 }, { "epoch": 0.006088, "grad_norm": 0.1975094974040985, "learning_rate": 1e-05, "loss": 0.0212, "step": 608800 }, { "epoch": 0.006089, "grad_norm": 0.18321271240711212, "learning_rate": 1e-05, "loss": 0.0209, "step": 608900 }, { "epoch": 0.00609, "grad_norm": 0.16590441763401031, "learning_rate": 1e-05, "loss": 0.0206, "step": 609000 }, { "epoch": 0.006091, "grad_norm": 0.17669829726219177, "learning_rate": 1e-05, "loss": 0.0207, "step": 609100 }, { "epoch": 0.006092, "grad_norm": 0.1613623946905136, "learning_rate": 1e-05, "loss": 0.0207, "step": 609200 }, { "epoch": 0.006093, "grad_norm": 0.18893131613731384, "learning_rate": 1e-05, "loss": 0.0203, "step": 609300 }, { "epoch": 0.006094, "grad_norm": 0.13646847009658813, "learning_rate": 1e-05, "loss": 0.021, "step": 609400 }, { "epoch": 0.006095, "grad_norm": 0.19916051626205444, "learning_rate": 1e-05, "loss": 0.0206, "step": 609500 }, { "epoch": 0.006096, "grad_norm": 0.17676009237766266, "learning_rate": 1e-05, "loss": 0.0204, "step": 609600 }, { "epoch": 0.006097, "grad_norm": 0.20779237151145935, "learning_rate": 1e-05, "loss": 0.0211, "step": 609700 }, { "epoch": 0.006098, "grad_norm": 0.20416632294654846, "learning_rate": 1e-05, "loss": 0.0209, "step": 609800 }, { "epoch": 0.006099, "grad_norm": 0.13783608376979828, "learning_rate": 1e-05, "loss": 0.0203, "step": 609900 }, { "epoch": 0.0061, "grad_norm": 0.21013471484184265, "learning_rate": 1e-05, "loss": 0.021, "step": 610000 }, { "epoch": 0.006101, "grad_norm": 0.1449415534734726, "learning_rate": 1e-05, "loss": 0.0209, "step": 610100 }, { "epoch": 0.006102, "grad_norm": 0.26383140683174133, "learning_rate": 1e-05, "loss": 0.021, "step": 610200 }, { "epoch": 0.006103, "grad_norm": 0.17034047842025757, "learning_rate": 1e-05, "loss": 0.0211, "step": 610300 }, { "epoch": 0.006104, "grad_norm": 0.2659900188446045, "learning_rate": 1e-05, "loss": 0.0209, "step": 610400 }, { "epoch": 0.006105, "grad_norm": 0.18808291852474213, "learning_rate": 1e-05, "loss": 0.0209, "step": 610500 }, { "epoch": 0.006106, "grad_norm": 0.1523335576057434, "learning_rate": 1e-05, "loss": 0.0206, "step": 610600 }, { "epoch": 0.006107, "grad_norm": 0.14844276010990143, "learning_rate": 1e-05, "loss": 0.0211, "step": 610700 }, { "epoch": 0.006108, "grad_norm": 0.1371431052684784, "learning_rate": 1e-05, "loss": 0.0211, "step": 610800 }, { "epoch": 0.006109, "grad_norm": 0.14327044785022736, "learning_rate": 1e-05, "loss": 0.0209, "step": 610900 }, { "epoch": 0.00611, "grad_norm": 0.16486528515815735, "learning_rate": 1e-05, "loss": 0.0201, "step": 611000 }, { "epoch": 0.006111, "grad_norm": 0.14682529866695404, "learning_rate": 1e-05, "loss": 0.0208, "step": 611100 }, { "epoch": 0.006112, "grad_norm": 0.2160116285085678, "learning_rate": 1e-05, "loss": 0.0207, "step": 611200 }, { "epoch": 0.006113, "grad_norm": 0.17636795341968536, "learning_rate": 1e-05, "loss": 0.0208, "step": 611300 }, { "epoch": 0.006114, "grad_norm": 0.1674317866563797, "learning_rate": 1e-05, "loss": 0.021, "step": 611400 }, { "epoch": 0.006115, "grad_norm": 0.18244175612926483, "learning_rate": 1e-05, "loss": 0.0206, "step": 611500 }, { "epoch": 0.006116, "grad_norm": 0.1731281578540802, "learning_rate": 1e-05, "loss": 0.0207, "step": 611600 }, { "epoch": 0.006117, "grad_norm": 0.1590263992547989, "learning_rate": 1e-05, "loss": 0.0203, "step": 611700 }, { "epoch": 0.006118, "grad_norm": 0.17740023136138916, "learning_rate": 1e-05, "loss": 0.0208, "step": 611800 }, { "epoch": 0.006119, "grad_norm": 0.17359939217567444, "learning_rate": 1e-05, "loss": 0.0209, "step": 611900 }, { "epoch": 0.00612, "grad_norm": 0.17537863552570343, "learning_rate": 1e-05, "loss": 0.0207, "step": 612000 }, { "epoch": 0.006121, "grad_norm": 0.1834016740322113, "learning_rate": 1e-05, "loss": 0.0206, "step": 612100 }, { "epoch": 0.006122, "grad_norm": 0.17944513261318207, "learning_rate": 1e-05, "loss": 0.0206, "step": 612200 }, { "epoch": 0.006123, "grad_norm": 0.18157365918159485, "learning_rate": 1e-05, "loss": 0.0206, "step": 612300 }, { "epoch": 0.006124, "grad_norm": 0.1474217176437378, "learning_rate": 1e-05, "loss": 0.0209, "step": 612400 }, { "epoch": 0.006125, "grad_norm": 0.210081547498703, "learning_rate": 1e-05, "loss": 0.0211, "step": 612500 }, { "epoch": 0.006126, "grad_norm": 0.15931028127670288, "learning_rate": 1e-05, "loss": 0.0205, "step": 612600 }, { "epoch": 0.006127, "grad_norm": 0.13182587921619415, "learning_rate": 1e-05, "loss": 0.0205, "step": 612700 }, { "epoch": 0.006128, "grad_norm": 0.17065902054309845, "learning_rate": 1e-05, "loss": 0.0208, "step": 612800 }, { "epoch": 0.006129, "grad_norm": 0.1590055227279663, "learning_rate": 1e-05, "loss": 0.0211, "step": 612900 }, { "epoch": 0.00613, "grad_norm": 0.17475511133670807, "learning_rate": 1e-05, "loss": 0.0208, "step": 613000 }, { "epoch": 0.006131, "grad_norm": 0.15689612925052643, "learning_rate": 1e-05, "loss": 0.0208, "step": 613100 }, { "epoch": 0.006132, "grad_norm": 0.12376229465007782, "learning_rate": 1e-05, "loss": 0.0207, "step": 613200 }, { "epoch": 0.006133, "grad_norm": 0.19186542928218842, "learning_rate": 1e-05, "loss": 0.0211, "step": 613300 }, { "epoch": 0.006134, "grad_norm": 0.23478734493255615, "learning_rate": 1e-05, "loss": 0.0212, "step": 613400 }, { "epoch": 0.006135, "grad_norm": 0.13116003572940826, "learning_rate": 1e-05, "loss": 0.0209, "step": 613500 }, { "epoch": 0.006136, "grad_norm": 0.2222750037908554, "learning_rate": 1e-05, "loss": 0.021, "step": 613600 }, { "epoch": 0.006137, "grad_norm": 0.16376562416553497, "learning_rate": 1e-05, "loss": 0.0215, "step": 613700 }, { "epoch": 0.006138, "grad_norm": 0.24403315782546997, "learning_rate": 1e-05, "loss": 0.0206, "step": 613800 }, { "epoch": 0.006139, "grad_norm": 0.16004310548305511, "learning_rate": 1e-05, "loss": 0.0204, "step": 613900 }, { "epoch": 0.00614, "grad_norm": 0.23121671378612518, "learning_rate": 1e-05, "loss": 0.0207, "step": 614000 }, { "epoch": 0.006141, "grad_norm": 0.199758380651474, "learning_rate": 1e-05, "loss": 0.0208, "step": 614100 }, { "epoch": 0.006142, "grad_norm": 0.20181946456432343, "learning_rate": 1e-05, "loss": 0.0213, "step": 614200 }, { "epoch": 0.006143, "grad_norm": 0.17570605874061584, "learning_rate": 1e-05, "loss": 0.0206, "step": 614300 }, { "epoch": 0.006144, "grad_norm": 0.18154391646385193, "learning_rate": 1e-05, "loss": 0.0202, "step": 614400 }, { "epoch": 0.006145, "grad_norm": 0.24315692484378815, "learning_rate": 1e-05, "loss": 0.0213, "step": 614500 }, { "epoch": 0.006146, "grad_norm": 0.21047353744506836, "learning_rate": 1e-05, "loss": 0.0211, "step": 614600 }, { "epoch": 0.006147, "grad_norm": 0.19623228907585144, "learning_rate": 1e-05, "loss": 0.0206, "step": 614700 }, { "epoch": 0.006148, "grad_norm": 0.16538067162036896, "learning_rate": 1e-05, "loss": 0.0207, "step": 614800 }, { "epoch": 0.006149, "grad_norm": 0.16716209053993225, "learning_rate": 1e-05, "loss": 0.0204, "step": 614900 }, { "epoch": 0.00615, "grad_norm": 0.13130436837673187, "learning_rate": 1e-05, "loss": 0.0205, "step": 615000 }, { "epoch": 0.006151, "grad_norm": 0.18497566878795624, "learning_rate": 1e-05, "loss": 0.0206, "step": 615100 }, { "epoch": 0.006152, "grad_norm": 0.1628239005804062, "learning_rate": 1e-05, "loss": 0.0205, "step": 615200 }, { "epoch": 0.006153, "grad_norm": 0.15599554777145386, "learning_rate": 1e-05, "loss": 0.0209, "step": 615300 }, { "epoch": 0.006154, "grad_norm": 0.1726304590702057, "learning_rate": 1e-05, "loss": 0.0205, "step": 615400 }, { "epoch": 0.006155, "grad_norm": 0.12341421097517014, "learning_rate": 1e-05, "loss": 0.0202, "step": 615500 }, { "epoch": 0.006156, "grad_norm": 0.19271093606948853, "learning_rate": 1e-05, "loss": 0.0203, "step": 615600 }, { "epoch": 0.006157, "grad_norm": 0.19675372540950775, "learning_rate": 1e-05, "loss": 0.0207, "step": 615700 }, { "epoch": 0.006158, "grad_norm": 0.18193106353282928, "learning_rate": 1e-05, "loss": 0.021, "step": 615800 }, { "epoch": 0.006159, "grad_norm": 0.1516302227973938, "learning_rate": 1e-05, "loss": 0.0205, "step": 615900 }, { "epoch": 0.00616, "grad_norm": 0.14575359225273132, "learning_rate": 1e-05, "loss": 0.0207, "step": 616000 }, { "epoch": 0.006161, "grad_norm": 0.12307685613632202, "learning_rate": 1e-05, "loss": 0.0208, "step": 616100 }, { "epoch": 0.006162, "grad_norm": 0.1657484769821167, "learning_rate": 1e-05, "loss": 0.0205, "step": 616200 }, { "epoch": 0.006163, "grad_norm": 0.22150327265262604, "learning_rate": 1e-05, "loss": 0.0202, "step": 616300 }, { "epoch": 0.006164, "grad_norm": 0.1383359432220459, "learning_rate": 1e-05, "loss": 0.0207, "step": 616400 }, { "epoch": 0.006165, "grad_norm": 0.1775946468114853, "learning_rate": 1e-05, "loss": 0.0209, "step": 616500 }, { "epoch": 0.006166, "grad_norm": 0.16290658712387085, "learning_rate": 1e-05, "loss": 0.0212, "step": 616600 }, { "epoch": 0.006167, "grad_norm": 0.132952481508255, "learning_rate": 1e-05, "loss": 0.0212, "step": 616700 }, { "epoch": 0.006168, "grad_norm": 0.22549273073673248, "learning_rate": 1e-05, "loss": 0.0214, "step": 616800 }, { "epoch": 0.006169, "grad_norm": 0.21856795251369476, "learning_rate": 1e-05, "loss": 0.0208, "step": 616900 }, { "epoch": 0.00617, "grad_norm": 0.16409693658351898, "learning_rate": 1e-05, "loss": 0.0205, "step": 617000 }, { "epoch": 0.006171, "grad_norm": 0.22301363945007324, "learning_rate": 1e-05, "loss": 0.0211, "step": 617100 }, { "epoch": 0.006172, "grad_norm": 0.14941802620887756, "learning_rate": 1e-05, "loss": 0.0208, "step": 617200 }, { "epoch": 0.006173, "grad_norm": 0.2576025724411011, "learning_rate": 1e-05, "loss": 0.0207, "step": 617300 }, { "epoch": 0.006174, "grad_norm": 0.1788252294063568, "learning_rate": 1e-05, "loss": 0.0201, "step": 617400 }, { "epoch": 0.006175, "grad_norm": 0.22080563008785248, "learning_rate": 1e-05, "loss": 0.0205, "step": 617500 }, { "epoch": 0.006176, "grad_norm": 0.17260755598545074, "learning_rate": 1e-05, "loss": 0.0205, "step": 617600 }, { "epoch": 0.006177, "grad_norm": 0.25393009185791016, "learning_rate": 1e-05, "loss": 0.0203, "step": 617700 }, { "epoch": 0.006178, "grad_norm": 0.16901758313179016, "learning_rate": 1e-05, "loss": 0.0206, "step": 617800 }, { "epoch": 0.006179, "grad_norm": 0.20933352410793304, "learning_rate": 1e-05, "loss": 0.0211, "step": 617900 }, { "epoch": 0.00618, "grad_norm": 0.15239204466342926, "learning_rate": 1e-05, "loss": 0.0206, "step": 618000 }, { "epoch": 0.006181, "grad_norm": 0.15393957495689392, "learning_rate": 1e-05, "loss": 0.0206, "step": 618100 }, { "epoch": 0.006182, "grad_norm": 0.31556379795074463, "learning_rate": 1e-05, "loss": 0.0209, "step": 618200 }, { "epoch": 0.006183, "grad_norm": 0.1625487208366394, "learning_rate": 1e-05, "loss": 0.0209, "step": 618300 }, { "epoch": 0.006184, "grad_norm": 0.18269258737564087, "learning_rate": 1e-05, "loss": 0.021, "step": 618400 }, { "epoch": 0.006185, "grad_norm": 0.15331485867500305, "learning_rate": 1e-05, "loss": 0.0206, "step": 618500 }, { "epoch": 0.006186, "grad_norm": 0.16239917278289795, "learning_rate": 1e-05, "loss": 0.0206, "step": 618600 }, { "epoch": 0.006187, "grad_norm": 0.17707592248916626, "learning_rate": 1e-05, "loss": 0.02, "step": 618700 }, { "epoch": 0.006188, "grad_norm": 0.15611903369426727, "learning_rate": 1e-05, "loss": 0.0208, "step": 618800 }, { "epoch": 0.006189, "grad_norm": 0.1300312876701355, "learning_rate": 1e-05, "loss": 0.0209, "step": 618900 }, { "epoch": 0.00619, "grad_norm": 0.2046944797039032, "learning_rate": 1e-05, "loss": 0.0206, "step": 619000 }, { "epoch": 0.006191, "grad_norm": 0.1595475971698761, "learning_rate": 1e-05, "loss": 0.0209, "step": 619100 }, { "epoch": 0.006192, "grad_norm": 0.16280104219913483, "learning_rate": 1e-05, "loss": 0.021, "step": 619200 }, { "epoch": 0.006193, "grad_norm": 0.158903568983078, "learning_rate": 1e-05, "loss": 0.0202, "step": 619300 }, { "epoch": 0.006194, "grad_norm": 0.1867748498916626, "learning_rate": 1e-05, "loss": 0.0206, "step": 619400 }, { "epoch": 0.006195, "grad_norm": 0.166880264878273, "learning_rate": 1e-05, "loss": 0.0208, "step": 619500 }, { "epoch": 0.006196, "grad_norm": 0.26208382844924927, "learning_rate": 1e-05, "loss": 0.0204, "step": 619600 }, { "epoch": 0.006197, "grad_norm": 0.2072513997554779, "learning_rate": 1e-05, "loss": 0.0205, "step": 619700 }, { "epoch": 0.006198, "grad_norm": 0.1845964640378952, "learning_rate": 1e-05, "loss": 0.0206, "step": 619800 }, { "epoch": 0.006199, "grad_norm": 0.1490764319896698, "learning_rate": 1e-05, "loss": 0.0211, "step": 619900 }, { "epoch": 0.0062, "grad_norm": 0.18843403458595276, "learning_rate": 1e-05, "loss": 0.0204, "step": 620000 }, { "epoch": 0.0062, "eval_loss": 0.01788323000073433, "eval_runtime": 197.7449, "eval_samples_per_second": 252.851, "eval_steps_per_second": 15.803, "step": 620000 }, { "epoch": 0.006201, "grad_norm": 0.1462770253419876, "learning_rate": 1e-05, "loss": 0.0207, "step": 620100 }, { "epoch": 0.006202, "grad_norm": 0.18814799189567566, "learning_rate": 1e-05, "loss": 0.0208, "step": 620200 }, { "epoch": 0.006203, "grad_norm": 0.11819654703140259, "learning_rate": 1e-05, "loss": 0.0206, "step": 620300 }, { "epoch": 0.006204, "grad_norm": 0.3259127140045166, "learning_rate": 1e-05, "loss": 0.0202, "step": 620400 }, { "epoch": 0.006205, "grad_norm": 0.14926981925964355, "learning_rate": 1e-05, "loss": 0.0201, "step": 620500 }, { "epoch": 0.006206, "grad_norm": 0.18142277002334595, "learning_rate": 1e-05, "loss": 0.0206, "step": 620600 }, { "epoch": 0.006207, "grad_norm": 0.1754610538482666, "learning_rate": 1e-05, "loss": 0.0206, "step": 620700 }, { "epoch": 0.006208, "grad_norm": 0.21610939502716064, "learning_rate": 1e-05, "loss": 0.0208, "step": 620800 }, { "epoch": 0.006209, "grad_norm": 0.17833587527275085, "learning_rate": 1e-05, "loss": 0.0209, "step": 620900 }, { "epoch": 0.00621, "grad_norm": 0.18031160533428192, "learning_rate": 1e-05, "loss": 0.0209, "step": 621000 }, { "epoch": 0.006211, "grad_norm": 0.15142470598220825, "learning_rate": 1e-05, "loss": 0.0207, "step": 621100 }, { "epoch": 0.006212, "grad_norm": 0.18367096781730652, "learning_rate": 1e-05, "loss": 0.0206, "step": 621200 }, { "epoch": 0.006213, "grad_norm": 0.1606319546699524, "learning_rate": 1e-05, "loss": 0.0204, "step": 621300 }, { "epoch": 0.006214, "grad_norm": 0.16809490323066711, "learning_rate": 1e-05, "loss": 0.0207, "step": 621400 }, { "epoch": 0.006215, "grad_norm": 0.20047175884246826, "learning_rate": 1e-05, "loss": 0.0205, "step": 621500 }, { "epoch": 0.006216, "grad_norm": 0.2520972788333893, "learning_rate": 1e-05, "loss": 0.0209, "step": 621600 }, { "epoch": 0.006217, "grad_norm": 0.12199167162179947, "learning_rate": 1e-05, "loss": 0.0205, "step": 621700 }, { "epoch": 0.006218, "grad_norm": 0.1430024653673172, "learning_rate": 1e-05, "loss": 0.0213, "step": 621800 }, { "epoch": 0.006219, "grad_norm": 0.16720230877399445, "learning_rate": 1e-05, "loss": 0.0206, "step": 621900 }, { "epoch": 0.00622, "grad_norm": 0.15908372402191162, "learning_rate": 1e-05, "loss": 0.0207, "step": 622000 }, { "epoch": 0.006221, "grad_norm": 0.20048238337039948, "learning_rate": 1e-05, "loss": 0.0208, "step": 622100 }, { "epoch": 0.006222, "grad_norm": 0.2578120231628418, "learning_rate": 1e-05, "loss": 0.0205, "step": 622200 }, { "epoch": 0.006223, "grad_norm": 0.13364656269550323, "learning_rate": 1e-05, "loss": 0.0204, "step": 622300 }, { "epoch": 0.006224, "grad_norm": 0.14300642907619476, "learning_rate": 1e-05, "loss": 0.0209, "step": 622400 }, { "epoch": 0.006225, "grad_norm": 0.1737307757139206, "learning_rate": 1e-05, "loss": 0.0202, "step": 622500 }, { "epoch": 0.006226, "grad_norm": 0.2363063097000122, "learning_rate": 1e-05, "loss": 0.0209, "step": 622600 }, { "epoch": 0.006227, "grad_norm": 0.20755797624588013, "learning_rate": 1e-05, "loss": 0.021, "step": 622700 }, { "epoch": 0.006228, "grad_norm": 0.1755916178226471, "learning_rate": 1e-05, "loss": 0.0207, "step": 622800 }, { "epoch": 0.006229, "grad_norm": 0.17243927717208862, "learning_rate": 1e-05, "loss": 0.0207, "step": 622900 }, { "epoch": 0.00623, "grad_norm": 0.1850782334804535, "learning_rate": 1e-05, "loss": 0.0207, "step": 623000 }, { "epoch": 0.006231, "grad_norm": 0.14892683923244476, "learning_rate": 1e-05, "loss": 0.0202, "step": 623100 }, { "epoch": 0.006232, "grad_norm": 0.1559371054172516, "learning_rate": 1e-05, "loss": 0.0202, "step": 623200 }, { "epoch": 0.006233, "grad_norm": 0.1689717024564743, "learning_rate": 1e-05, "loss": 0.0207, "step": 623300 }, { "epoch": 0.006234, "grad_norm": 0.21855004131793976, "learning_rate": 1e-05, "loss": 0.0204, "step": 623400 }, { "epoch": 0.006235, "grad_norm": 0.1809195727109909, "learning_rate": 1e-05, "loss": 0.0203, "step": 623500 }, { "epoch": 0.006236, "grad_norm": 0.316110223531723, "learning_rate": 1e-05, "loss": 0.0211, "step": 623600 }, { "epoch": 0.006237, "grad_norm": 0.15413588285446167, "learning_rate": 1e-05, "loss": 0.02, "step": 623700 }, { "epoch": 0.006238, "grad_norm": 0.15998321771621704, "learning_rate": 1e-05, "loss": 0.0205, "step": 623800 }, { "epoch": 0.006239, "grad_norm": 0.15528589487075806, "learning_rate": 1e-05, "loss": 0.0207, "step": 623900 }, { "epoch": 0.00624, "grad_norm": 0.21136295795440674, "learning_rate": 1e-05, "loss": 0.0209, "step": 624000 }, { "epoch": 0.006241, "grad_norm": 0.2086833268404007, "learning_rate": 1e-05, "loss": 0.0204, "step": 624100 }, { "epoch": 0.006242, "grad_norm": 0.20254147052764893, "learning_rate": 1e-05, "loss": 0.0206, "step": 624200 }, { "epoch": 0.006243, "grad_norm": 0.1793130487203598, "learning_rate": 1e-05, "loss": 0.021, "step": 624300 }, { "epoch": 0.006244, "grad_norm": 0.2895253300666809, "learning_rate": 1e-05, "loss": 0.0205, "step": 624400 }, { "epoch": 0.006245, "grad_norm": 0.16376417875289917, "learning_rate": 1e-05, "loss": 0.0207, "step": 624500 }, { "epoch": 0.006246, "grad_norm": 0.19478416442871094, "learning_rate": 1e-05, "loss": 0.0211, "step": 624600 }, { "epoch": 0.006247, "grad_norm": 0.15253058075904846, "learning_rate": 1e-05, "loss": 0.0204, "step": 624700 }, { "epoch": 0.006248, "grad_norm": 0.16693215072155, "learning_rate": 1e-05, "loss": 0.0207, "step": 624800 }, { "epoch": 0.006249, "grad_norm": 0.15478134155273438, "learning_rate": 1e-05, "loss": 0.0207, "step": 624900 }, { "epoch": 0.00625, "grad_norm": 0.1643378734588623, "learning_rate": 1e-05, "loss": 0.0207, "step": 625000 }, { "epoch": 0.006251, "grad_norm": 0.13849836587905884, "learning_rate": 1e-05, "loss": 0.0206, "step": 625100 }, { "epoch": 0.006252, "grad_norm": 0.15022775530815125, "learning_rate": 1e-05, "loss": 0.0205, "step": 625200 }, { "epoch": 0.006253, "grad_norm": 0.18263398110866547, "learning_rate": 1e-05, "loss": 0.0204, "step": 625300 }, { "epoch": 0.006254, "grad_norm": 0.1934930384159088, "learning_rate": 1e-05, "loss": 0.0208, "step": 625400 }, { "epoch": 0.006255, "grad_norm": 0.17685295641422272, "learning_rate": 1e-05, "loss": 0.0204, "step": 625500 }, { "epoch": 0.006256, "grad_norm": 0.16564826667308807, "learning_rate": 1e-05, "loss": 0.0205, "step": 625600 }, { "epoch": 0.006257, "grad_norm": 0.19013115763664246, "learning_rate": 1e-05, "loss": 0.0204, "step": 625700 }, { "epoch": 0.006258, "grad_norm": 0.12741400301456451, "learning_rate": 1e-05, "loss": 0.0204, "step": 625800 }, { "epoch": 0.006259, "grad_norm": 0.14832721650600433, "learning_rate": 1e-05, "loss": 0.0204, "step": 625900 }, { "epoch": 0.00626, "grad_norm": 0.22682805359363556, "learning_rate": 1e-05, "loss": 0.0211, "step": 626000 }, { "epoch": 0.006261, "grad_norm": 0.22576116025447845, "learning_rate": 1e-05, "loss": 0.0209, "step": 626100 }, { "epoch": 0.006262, "grad_norm": 0.15454883873462677, "learning_rate": 1e-05, "loss": 0.0208, "step": 626200 }, { "epoch": 0.006263, "grad_norm": 0.13202384114265442, "learning_rate": 1e-05, "loss": 0.0202, "step": 626300 }, { "epoch": 0.006264, "grad_norm": 0.15123504400253296, "learning_rate": 1e-05, "loss": 0.0212, "step": 626400 }, { "epoch": 0.006265, "grad_norm": 0.18022994697093964, "learning_rate": 1e-05, "loss": 0.0205, "step": 626500 }, { "epoch": 0.006266, "grad_norm": 0.17538362741470337, "learning_rate": 1e-05, "loss": 0.021, "step": 626600 }, { "epoch": 0.006267, "grad_norm": 0.18334327638149261, "learning_rate": 1e-05, "loss": 0.0202, "step": 626700 }, { "epoch": 0.006268, "grad_norm": 0.16116712987422943, "learning_rate": 1e-05, "loss": 0.0203, "step": 626800 }, { "epoch": 0.006269, "grad_norm": 0.17540879547595978, "learning_rate": 1e-05, "loss": 0.0203, "step": 626900 }, { "epoch": 0.00627, "grad_norm": 0.13409671187400818, "learning_rate": 1e-05, "loss": 0.0204, "step": 627000 }, { "epoch": 0.006271, "grad_norm": 0.1731218546628952, "learning_rate": 1e-05, "loss": 0.0202, "step": 627100 }, { "epoch": 0.006272, "grad_norm": 0.1834629774093628, "learning_rate": 1e-05, "loss": 0.0209, "step": 627200 }, { "epoch": 0.006273, "grad_norm": 0.1684240996837616, "learning_rate": 1e-05, "loss": 0.0203, "step": 627300 }, { "epoch": 0.006274, "grad_norm": 0.2246481031179428, "learning_rate": 1e-05, "loss": 0.0205, "step": 627400 }, { "epoch": 0.006275, "grad_norm": 0.16877411305904388, "learning_rate": 1e-05, "loss": 0.0205, "step": 627500 }, { "epoch": 0.006276, "grad_norm": 0.13547740876674652, "learning_rate": 1e-05, "loss": 0.021, "step": 627600 }, { "epoch": 0.006277, "grad_norm": 0.18718191981315613, "learning_rate": 1e-05, "loss": 0.0204, "step": 627700 }, { "epoch": 0.006278, "grad_norm": 0.13189975917339325, "learning_rate": 1e-05, "loss": 0.0207, "step": 627800 }, { "epoch": 0.006279, "grad_norm": 0.1637977957725525, "learning_rate": 1e-05, "loss": 0.0198, "step": 627900 }, { "epoch": 0.00628, "grad_norm": 0.18798291683197021, "learning_rate": 1e-05, "loss": 0.0202, "step": 628000 }, { "epoch": 0.006281, "grad_norm": 0.22419075667858124, "learning_rate": 1e-05, "loss": 0.0206, "step": 628100 }, { "epoch": 0.006282, "grad_norm": 0.1824210286140442, "learning_rate": 1e-05, "loss": 0.0209, "step": 628200 }, { "epoch": 0.006283, "grad_norm": 0.1836768090724945, "learning_rate": 1e-05, "loss": 0.021, "step": 628300 }, { "epoch": 0.006284, "grad_norm": 0.15528874099254608, "learning_rate": 1e-05, "loss": 0.0204, "step": 628400 }, { "epoch": 0.006285, "grad_norm": 0.1439627707004547, "learning_rate": 1e-05, "loss": 0.0204, "step": 628500 }, { "epoch": 0.006286, "grad_norm": 0.13729362189769745, "learning_rate": 1e-05, "loss": 0.0208, "step": 628600 }, { "epoch": 0.006287, "grad_norm": 0.27046337723731995, "learning_rate": 1e-05, "loss": 0.0204, "step": 628700 }, { "epoch": 0.006288, "grad_norm": 0.17771591246128082, "learning_rate": 1e-05, "loss": 0.0205, "step": 628800 }, { "epoch": 0.006289, "grad_norm": 0.17224524915218353, "learning_rate": 1e-05, "loss": 0.021, "step": 628900 }, { "epoch": 0.00629, "grad_norm": 0.23151062428951263, "learning_rate": 1e-05, "loss": 0.0204, "step": 629000 }, { "epoch": 0.006291, "grad_norm": 0.18143443763256073, "learning_rate": 1e-05, "loss": 0.0206, "step": 629100 }, { "epoch": 0.006292, "grad_norm": 0.2545838952064514, "learning_rate": 1e-05, "loss": 0.0203, "step": 629200 }, { "epoch": 0.006293, "grad_norm": 0.2568891942501068, "learning_rate": 1e-05, "loss": 0.0206, "step": 629300 }, { "epoch": 0.006294, "grad_norm": 0.2561025023460388, "learning_rate": 1e-05, "loss": 0.0209, "step": 629400 }, { "epoch": 0.006295, "grad_norm": 0.19841808080673218, "learning_rate": 1e-05, "loss": 0.0208, "step": 629500 }, { "epoch": 0.006296, "grad_norm": 0.1398191601037979, "learning_rate": 1e-05, "loss": 0.0205, "step": 629600 }, { "epoch": 0.006297, "grad_norm": 0.16436535120010376, "learning_rate": 1e-05, "loss": 0.02, "step": 629700 }, { "epoch": 0.006298, "grad_norm": 0.1320326179265976, "learning_rate": 1e-05, "loss": 0.0209, "step": 629800 }, { "epoch": 0.006299, "grad_norm": 0.1412450671195984, "learning_rate": 1e-05, "loss": 0.0207, "step": 629900 }, { "epoch": 0.0063, "grad_norm": 0.17193448543548584, "learning_rate": 1e-05, "loss": 0.0205, "step": 630000 }, { "epoch": 0.006301, "grad_norm": 0.1636325865983963, "learning_rate": 1e-05, "loss": 0.0206, "step": 630100 }, { "epoch": 0.006302, "grad_norm": 0.16577878594398499, "learning_rate": 1e-05, "loss": 0.0204, "step": 630200 }, { "epoch": 0.006303, "grad_norm": 0.17676712572574615, "learning_rate": 1e-05, "loss": 0.0208, "step": 630300 }, { "epoch": 0.006304, "grad_norm": 0.14208926260471344, "learning_rate": 1e-05, "loss": 0.0198, "step": 630400 }, { "epoch": 0.006305, "grad_norm": 0.15631751716136932, "learning_rate": 1e-05, "loss": 0.0207, "step": 630500 }, { "epoch": 0.006306, "grad_norm": 0.14985963702201843, "learning_rate": 1e-05, "loss": 0.02, "step": 630600 }, { "epoch": 0.006307, "grad_norm": 0.1592445820569992, "learning_rate": 1e-05, "loss": 0.0208, "step": 630700 }, { "epoch": 0.006308, "grad_norm": 0.17726677656173706, "learning_rate": 1e-05, "loss": 0.0204, "step": 630800 }, { "epoch": 0.006309, "grad_norm": 0.1526213139295578, "learning_rate": 1e-05, "loss": 0.0208, "step": 630900 }, { "epoch": 0.00631, "grad_norm": 0.15547491610050201, "learning_rate": 1e-05, "loss": 0.0202, "step": 631000 }, { "epoch": 0.006311, "grad_norm": 0.18483692407608032, "learning_rate": 1e-05, "loss": 0.0206, "step": 631100 }, { "epoch": 0.006312, "grad_norm": 0.16877049207687378, "learning_rate": 1e-05, "loss": 0.0208, "step": 631200 }, { "epoch": 0.006313, "grad_norm": 0.1625966578722, "learning_rate": 1e-05, "loss": 0.0206, "step": 631300 }, { "epoch": 0.006314, "grad_norm": 0.1405232697725296, "learning_rate": 1e-05, "loss": 0.0202, "step": 631400 }, { "epoch": 0.006315, "grad_norm": 0.18327200412750244, "learning_rate": 1e-05, "loss": 0.0205, "step": 631500 }, { "epoch": 0.006316, "grad_norm": 0.14698877930641174, "learning_rate": 1e-05, "loss": 0.0202, "step": 631600 }, { "epoch": 0.006317, "grad_norm": 0.21016637980937958, "learning_rate": 1e-05, "loss": 0.0203, "step": 631700 }, { "epoch": 0.006318, "grad_norm": 0.15751433372497559, "learning_rate": 1e-05, "loss": 0.0206, "step": 631800 }, { "epoch": 0.006319, "grad_norm": 0.14980503916740417, "learning_rate": 1e-05, "loss": 0.0204, "step": 631900 }, { "epoch": 0.00632, "grad_norm": 0.14578846096992493, "learning_rate": 1e-05, "loss": 0.0206, "step": 632000 }, { "epoch": 0.006321, "grad_norm": 0.14859062433242798, "learning_rate": 1e-05, "loss": 0.0204, "step": 632100 }, { "epoch": 0.006322, "grad_norm": 0.1284855753183365, "learning_rate": 1e-05, "loss": 0.0206, "step": 632200 }, { "epoch": 0.006323, "grad_norm": 0.13883289694786072, "learning_rate": 1e-05, "loss": 0.0207, "step": 632300 }, { "epoch": 0.006324, "grad_norm": 0.21725109219551086, "learning_rate": 1e-05, "loss": 0.0207, "step": 632400 }, { "epoch": 0.006325, "grad_norm": 0.16109156608581543, "learning_rate": 1e-05, "loss": 0.0208, "step": 632500 }, { "epoch": 0.006326, "grad_norm": 0.15577630698680878, "learning_rate": 1e-05, "loss": 0.0206, "step": 632600 }, { "epoch": 0.006327, "grad_norm": 0.1924273669719696, "learning_rate": 1e-05, "loss": 0.0211, "step": 632700 }, { "epoch": 0.006328, "grad_norm": 0.1811286211013794, "learning_rate": 1e-05, "loss": 0.0201, "step": 632800 }, { "epoch": 0.006329, "grad_norm": 0.19118653237819672, "learning_rate": 1e-05, "loss": 0.0203, "step": 632900 }, { "epoch": 0.00633, "grad_norm": 0.18658238649368286, "learning_rate": 1e-05, "loss": 0.0211, "step": 633000 }, { "epoch": 0.006331, "grad_norm": 0.21828998625278473, "learning_rate": 1e-05, "loss": 0.0205, "step": 633100 }, { "epoch": 0.006332, "grad_norm": 0.24038179218769073, "learning_rate": 1e-05, "loss": 0.0212, "step": 633200 }, { "epoch": 0.006333, "grad_norm": 0.11484485119581223, "learning_rate": 1e-05, "loss": 0.0207, "step": 633300 }, { "epoch": 0.006334, "grad_norm": 0.2555901110172272, "learning_rate": 1e-05, "loss": 0.0206, "step": 633400 }, { "epoch": 0.006335, "grad_norm": 0.3074251413345337, "learning_rate": 1e-05, "loss": 0.0205, "step": 633500 }, { "epoch": 0.006336, "grad_norm": 0.23159249126911163, "learning_rate": 1e-05, "loss": 0.0207, "step": 633600 }, { "epoch": 0.006337, "grad_norm": 0.21987789869308472, "learning_rate": 1e-05, "loss": 0.0205, "step": 633700 }, { "epoch": 0.006338, "grad_norm": 0.19619743525981903, "learning_rate": 1e-05, "loss": 0.0208, "step": 633800 }, { "epoch": 0.006339, "grad_norm": 0.15879008173942566, "learning_rate": 1e-05, "loss": 0.0209, "step": 633900 }, { "epoch": 0.00634, "grad_norm": 0.15518790483474731, "learning_rate": 1e-05, "loss": 0.0205, "step": 634000 }, { "epoch": 0.006341, "grad_norm": 0.1262803077697754, "learning_rate": 1e-05, "loss": 0.0208, "step": 634100 }, { "epoch": 0.006342, "grad_norm": 0.11978044360876083, "learning_rate": 1e-05, "loss": 0.0204, "step": 634200 }, { "epoch": 0.006343, "grad_norm": 0.2704891860485077, "learning_rate": 1e-05, "loss": 0.0207, "step": 634300 }, { "epoch": 0.006344, "grad_norm": 0.1460079848766327, "learning_rate": 1e-05, "loss": 0.0201, "step": 634400 }, { "epoch": 0.006345, "grad_norm": 0.19704869389533997, "learning_rate": 1e-05, "loss": 0.021, "step": 634500 }, { "epoch": 0.006346, "grad_norm": 0.15804685652256012, "learning_rate": 1e-05, "loss": 0.0207, "step": 634600 }, { "epoch": 0.006347, "grad_norm": 0.19264966249465942, "learning_rate": 1e-05, "loss": 0.0204, "step": 634700 }, { "epoch": 0.006348, "grad_norm": 0.18282215297222137, "learning_rate": 1e-05, "loss": 0.0206, "step": 634800 }, { "epoch": 0.006349, "grad_norm": 0.18462611734867096, "learning_rate": 1e-05, "loss": 0.0204, "step": 634900 }, { "epoch": 0.00635, "grad_norm": 0.1677350252866745, "learning_rate": 1e-05, "loss": 0.0203, "step": 635000 }, { "epoch": 0.006351, "grad_norm": 0.17078031599521637, "learning_rate": 1e-05, "loss": 0.02, "step": 635100 }, { "epoch": 0.006352, "grad_norm": 0.14908196032047272, "learning_rate": 1e-05, "loss": 0.0205, "step": 635200 }, { "epoch": 0.006353, "grad_norm": 0.20884005725383759, "learning_rate": 1e-05, "loss": 0.0207, "step": 635300 }, { "epoch": 0.006354, "grad_norm": 0.1492428183555603, "learning_rate": 1e-05, "loss": 0.0206, "step": 635400 }, { "epoch": 0.006355, "grad_norm": 0.15688101947307587, "learning_rate": 1e-05, "loss": 0.0207, "step": 635500 }, { "epoch": 0.006356, "grad_norm": 0.14295794069766998, "learning_rate": 1e-05, "loss": 0.0208, "step": 635600 }, { "epoch": 0.006357, "grad_norm": 0.13696974515914917, "learning_rate": 1e-05, "loss": 0.0207, "step": 635700 }, { "epoch": 0.006358, "grad_norm": 0.17785494029521942, "learning_rate": 1e-05, "loss": 0.0205, "step": 635800 }, { "epoch": 0.006359, "grad_norm": 0.16919831931591034, "learning_rate": 1e-05, "loss": 0.0205, "step": 635900 }, { "epoch": 0.00636, "grad_norm": 0.1848812699317932, "learning_rate": 1e-05, "loss": 0.0209, "step": 636000 }, { "epoch": 0.006361, "grad_norm": 0.1341562420129776, "learning_rate": 1e-05, "loss": 0.0209, "step": 636100 }, { "epoch": 0.006362, "grad_norm": 0.16889269649982452, "learning_rate": 1e-05, "loss": 0.0204, "step": 636200 }, { "epoch": 0.006363, "grad_norm": 0.20790629088878632, "learning_rate": 1e-05, "loss": 0.0203, "step": 636300 }, { "epoch": 0.006364, "grad_norm": 0.15961268544197083, "learning_rate": 1e-05, "loss": 0.0207, "step": 636400 }, { "epoch": 0.006365, "grad_norm": 0.1555992066860199, "learning_rate": 1e-05, "loss": 0.0205, "step": 636500 }, { "epoch": 0.006366, "grad_norm": 0.16711263358592987, "learning_rate": 1e-05, "loss": 0.0212, "step": 636600 }, { "epoch": 0.006367, "grad_norm": 0.14228585362434387, "learning_rate": 1e-05, "loss": 0.0206, "step": 636700 }, { "epoch": 0.006368, "grad_norm": 0.19654864072799683, "learning_rate": 1e-05, "loss": 0.0206, "step": 636800 }, { "epoch": 0.006369, "grad_norm": 0.11401879787445068, "learning_rate": 1e-05, "loss": 0.0204, "step": 636900 }, { "epoch": 0.00637, "grad_norm": 0.19158397614955902, "learning_rate": 1e-05, "loss": 0.0207, "step": 637000 }, { "epoch": 0.006371, "grad_norm": 0.16668088734149933, "learning_rate": 1e-05, "loss": 0.0202, "step": 637100 }, { "epoch": 0.006372, "grad_norm": 0.2549144923686981, "learning_rate": 1e-05, "loss": 0.0202, "step": 637200 }, { "epoch": 0.006373, "grad_norm": 0.1701173335313797, "learning_rate": 1e-05, "loss": 0.0207, "step": 637300 }, { "epoch": 0.006374, "grad_norm": 0.16716811060905457, "learning_rate": 1e-05, "loss": 0.0203, "step": 637400 }, { "epoch": 0.006375, "grad_norm": 0.2069680243730545, "learning_rate": 1e-05, "loss": 0.0201, "step": 637500 }, { "epoch": 0.006376, "grad_norm": 0.17075026035308838, "learning_rate": 1e-05, "loss": 0.0202, "step": 637600 }, { "epoch": 0.006377, "grad_norm": 0.15178462862968445, "learning_rate": 1e-05, "loss": 0.0201, "step": 637700 }, { "epoch": 0.006378, "grad_norm": 0.14519314467906952, "learning_rate": 1e-05, "loss": 0.0206, "step": 637800 }, { "epoch": 0.006379, "grad_norm": 0.14812511205673218, "learning_rate": 1e-05, "loss": 0.0206, "step": 637900 }, { "epoch": 0.00638, "grad_norm": 0.2162003517150879, "learning_rate": 1e-05, "loss": 0.0202, "step": 638000 }, { "epoch": 0.006381, "grad_norm": 0.1388694941997528, "learning_rate": 1e-05, "loss": 0.0206, "step": 638100 }, { "epoch": 0.006382, "grad_norm": 0.14472214877605438, "learning_rate": 1e-05, "loss": 0.0211, "step": 638200 }, { "epoch": 0.006383, "grad_norm": 0.1827509105205536, "learning_rate": 1e-05, "loss": 0.0206, "step": 638300 }, { "epoch": 0.006384, "grad_norm": 0.22983801364898682, "learning_rate": 1e-05, "loss": 0.0206, "step": 638400 }, { "epoch": 0.006385, "grad_norm": 0.1100093200802803, "learning_rate": 1e-05, "loss": 0.0205, "step": 638500 }, { "epoch": 0.006386, "grad_norm": 0.1992758810520172, "learning_rate": 1e-05, "loss": 0.0208, "step": 638600 }, { "epoch": 0.006387, "grad_norm": 0.24549764394760132, "learning_rate": 1e-05, "loss": 0.0206, "step": 638700 }, { "epoch": 0.006388, "grad_norm": 0.1391640454530716, "learning_rate": 1e-05, "loss": 0.0201, "step": 638800 }, { "epoch": 0.006389, "grad_norm": 0.17405273020267487, "learning_rate": 1e-05, "loss": 0.0206, "step": 638900 }, { "epoch": 0.00639, "grad_norm": 0.2853308916091919, "learning_rate": 1e-05, "loss": 0.0208, "step": 639000 }, { "epoch": 0.006391, "grad_norm": 0.19122876226902008, "learning_rate": 1e-05, "loss": 0.0206, "step": 639100 }, { "epoch": 0.006392, "grad_norm": 0.21674185991287231, "learning_rate": 1e-05, "loss": 0.0205, "step": 639200 }, { "epoch": 0.006393, "grad_norm": 0.17131651937961578, "learning_rate": 1e-05, "loss": 0.0201, "step": 639300 }, { "epoch": 0.006394, "grad_norm": 0.1323380172252655, "learning_rate": 1e-05, "loss": 0.0209, "step": 639400 }, { "epoch": 0.006395, "grad_norm": 0.16229090094566345, "learning_rate": 1e-05, "loss": 0.0203, "step": 639500 }, { "epoch": 0.006396, "grad_norm": 0.15275035798549652, "learning_rate": 1e-05, "loss": 0.0205, "step": 639600 }, { "epoch": 0.006397, "grad_norm": 0.2869251072406769, "learning_rate": 1e-05, "loss": 0.0204, "step": 639700 }, { "epoch": 0.006398, "grad_norm": 0.1396133005619049, "learning_rate": 1e-05, "loss": 0.0201, "step": 639800 }, { "epoch": 0.006399, "grad_norm": 0.18514420092105865, "learning_rate": 1e-05, "loss": 0.0203, "step": 639900 }, { "epoch": 0.0064, "grad_norm": 0.16034948825836182, "learning_rate": 1e-05, "loss": 0.0211, "step": 640000 }, { "epoch": 0.0064, "eval_loss": 0.018128113821148872, "eval_runtime": 196.0109, "eval_samples_per_second": 255.088, "eval_steps_per_second": 15.943, "step": 640000 }, { "epoch": 0.006401, "grad_norm": 0.27226555347442627, "learning_rate": 1e-05, "loss": 0.0207, "step": 640100 }, { "epoch": 0.006402, "grad_norm": 0.15670019388198853, "learning_rate": 1e-05, "loss": 0.0207, "step": 640200 }, { "epoch": 0.006403, "grad_norm": 0.16855992376804352, "learning_rate": 1e-05, "loss": 0.0208, "step": 640300 }, { "epoch": 0.006404, "grad_norm": 0.18897554278373718, "learning_rate": 1e-05, "loss": 0.0203, "step": 640400 }, { "epoch": 0.006405, "grad_norm": 0.18621215224266052, "learning_rate": 1e-05, "loss": 0.02, "step": 640500 }, { "epoch": 0.006406, "grad_norm": 0.1663270741701126, "learning_rate": 1e-05, "loss": 0.0205, "step": 640600 }, { "epoch": 0.006407, "grad_norm": 0.20483660697937012, "learning_rate": 1e-05, "loss": 0.0199, "step": 640700 }, { "epoch": 0.006408, "grad_norm": 0.1991611272096634, "learning_rate": 1e-05, "loss": 0.0207, "step": 640800 }, { "epoch": 0.006409, "grad_norm": 0.16673803329467773, "learning_rate": 1e-05, "loss": 0.0204, "step": 640900 }, { "epoch": 0.00641, "grad_norm": 0.15167705714702606, "learning_rate": 1e-05, "loss": 0.0209, "step": 641000 }, { "epoch": 0.006411, "grad_norm": 0.1503002792596817, "learning_rate": 1e-05, "loss": 0.0207, "step": 641100 }, { "epoch": 0.006412, "grad_norm": 0.19581353664398193, "learning_rate": 1e-05, "loss": 0.0199, "step": 641200 }, { "epoch": 0.006413, "grad_norm": 0.189825177192688, "learning_rate": 1e-05, "loss": 0.0207, "step": 641300 }, { "epoch": 0.006414, "grad_norm": 0.13540461659431458, "learning_rate": 1e-05, "loss": 0.0201, "step": 641400 }, { "epoch": 0.006415, "grad_norm": 0.2152574509382248, "learning_rate": 1e-05, "loss": 0.0204, "step": 641500 }, { "epoch": 0.006416, "grad_norm": 0.1427013874053955, "learning_rate": 1e-05, "loss": 0.0201, "step": 641600 }, { "epoch": 0.006417, "grad_norm": 0.14145411550998688, "learning_rate": 1e-05, "loss": 0.0206, "step": 641700 }, { "epoch": 0.006418, "grad_norm": 0.19091065227985382, "learning_rate": 1e-05, "loss": 0.0204, "step": 641800 }, { "epoch": 0.006419, "grad_norm": 0.1324782818555832, "learning_rate": 1e-05, "loss": 0.0205, "step": 641900 }, { "epoch": 0.00642, "grad_norm": 0.13394670188426971, "learning_rate": 1e-05, "loss": 0.02, "step": 642000 }, { "epoch": 0.006421, "grad_norm": 0.15740849077701569, "learning_rate": 1e-05, "loss": 0.0206, "step": 642100 }, { "epoch": 0.006422, "grad_norm": 0.18082816898822784, "learning_rate": 1e-05, "loss": 0.0207, "step": 642200 }, { "epoch": 0.006423, "grad_norm": 0.2524728775024414, "learning_rate": 1e-05, "loss": 0.0205, "step": 642300 }, { "epoch": 0.006424, "grad_norm": 0.163295716047287, "learning_rate": 1e-05, "loss": 0.0205, "step": 642400 }, { "epoch": 0.006425, "grad_norm": 0.16587752103805542, "learning_rate": 1e-05, "loss": 0.0202, "step": 642500 }, { "epoch": 0.006426, "grad_norm": 0.1931290477514267, "learning_rate": 1e-05, "loss": 0.0206, "step": 642600 }, { "epoch": 0.006427, "grad_norm": 0.21909335255622864, "learning_rate": 1e-05, "loss": 0.0205, "step": 642700 }, { "epoch": 0.006428, "grad_norm": 0.17513719201087952, "learning_rate": 1e-05, "loss": 0.0203, "step": 642800 }, { "epoch": 0.006429, "grad_norm": 0.17197421193122864, "learning_rate": 1e-05, "loss": 0.0202, "step": 642900 }, { "epoch": 0.00643, "grad_norm": 0.1589643508195877, "learning_rate": 1e-05, "loss": 0.0206, "step": 643000 }, { "epoch": 0.006431, "grad_norm": 0.15744620561599731, "learning_rate": 1e-05, "loss": 0.0204, "step": 643100 }, { "epoch": 0.006432, "grad_norm": 0.1556015908718109, "learning_rate": 1e-05, "loss": 0.0208, "step": 643200 }, { "epoch": 0.006433, "grad_norm": 0.14711152017116547, "learning_rate": 1e-05, "loss": 0.0203, "step": 643300 }, { "epoch": 0.006434, "grad_norm": 0.17876087129116058, "learning_rate": 1e-05, "loss": 0.0199, "step": 643400 }, { "epoch": 0.006435, "grad_norm": 0.2215213030576706, "learning_rate": 1e-05, "loss": 0.0204, "step": 643500 }, { "epoch": 0.006436, "grad_norm": 0.2430855929851532, "learning_rate": 1e-05, "loss": 0.0205, "step": 643600 }, { "epoch": 0.006437, "grad_norm": 0.1748047173023224, "learning_rate": 1e-05, "loss": 0.0203, "step": 643700 }, { "epoch": 0.006438, "grad_norm": 0.1968008428812027, "learning_rate": 1e-05, "loss": 0.0207, "step": 643800 }, { "epoch": 0.006439, "grad_norm": 0.17545250058174133, "learning_rate": 1e-05, "loss": 0.0199, "step": 643900 }, { "epoch": 0.00644, "grad_norm": 0.14405779540538788, "learning_rate": 1e-05, "loss": 0.0208, "step": 644000 }, { "epoch": 0.006441, "grad_norm": 0.20840738713741302, "learning_rate": 1e-05, "loss": 0.0201, "step": 644100 }, { "epoch": 0.006442, "grad_norm": 0.20736609399318695, "learning_rate": 1e-05, "loss": 0.0209, "step": 644200 }, { "epoch": 0.006443, "grad_norm": 0.23055143654346466, "learning_rate": 1e-05, "loss": 0.0203, "step": 644300 }, { "epoch": 0.006444, "grad_norm": 0.10373374074697495, "learning_rate": 1e-05, "loss": 0.0206, "step": 644400 }, { "epoch": 0.006445, "grad_norm": 0.1930067092180252, "learning_rate": 1e-05, "loss": 0.0207, "step": 644500 }, { "epoch": 0.006446, "grad_norm": 0.14136070013046265, "learning_rate": 1e-05, "loss": 0.0202, "step": 644600 }, { "epoch": 0.006447, "grad_norm": 0.1951899379491806, "learning_rate": 1e-05, "loss": 0.0199, "step": 644700 }, { "epoch": 0.006448, "grad_norm": 0.15971501171588898, "learning_rate": 1e-05, "loss": 0.0207, "step": 644800 }, { "epoch": 0.006449, "grad_norm": 0.16493745148181915, "learning_rate": 1e-05, "loss": 0.0204, "step": 644900 }, { "epoch": 0.00645, "grad_norm": 0.14479073882102966, "learning_rate": 1e-05, "loss": 0.0203, "step": 645000 }, { "epoch": 0.006451, "grad_norm": 0.21224628388881683, "learning_rate": 1e-05, "loss": 0.0206, "step": 645100 }, { "epoch": 0.006452, "grad_norm": 0.19966262578964233, "learning_rate": 1e-05, "loss": 0.0204, "step": 645200 }, { "epoch": 0.006453, "grad_norm": 0.1942337602376938, "learning_rate": 1e-05, "loss": 0.02, "step": 645300 }, { "epoch": 0.006454, "grad_norm": 0.17580920457839966, "learning_rate": 1e-05, "loss": 0.0202, "step": 645400 }, { "epoch": 0.006455, "grad_norm": 0.14621631801128387, "learning_rate": 1e-05, "loss": 0.0205, "step": 645500 }, { "epoch": 0.006456, "grad_norm": 0.200029194355011, "learning_rate": 1e-05, "loss": 0.0206, "step": 645600 }, { "epoch": 0.006457, "grad_norm": 0.2088559865951538, "learning_rate": 1e-05, "loss": 0.0201, "step": 645700 }, { "epoch": 0.006458, "grad_norm": 0.1508663296699524, "learning_rate": 1e-05, "loss": 0.0203, "step": 645800 }, { "epoch": 0.006459, "grad_norm": 0.21805408596992493, "learning_rate": 1e-05, "loss": 0.0205, "step": 645900 }, { "epoch": 0.00646, "grad_norm": 0.16349124908447266, "learning_rate": 1e-05, "loss": 0.0205, "step": 646000 }, { "epoch": 0.006461, "grad_norm": 0.12002424150705338, "learning_rate": 1e-05, "loss": 0.021, "step": 646100 }, { "epoch": 0.006462, "grad_norm": 0.16143880784511566, "learning_rate": 1e-05, "loss": 0.0205, "step": 646200 }, { "epoch": 0.006463, "grad_norm": 0.12535026669502258, "learning_rate": 1e-05, "loss": 0.0205, "step": 646300 }, { "epoch": 0.006464, "grad_norm": 0.20654162764549255, "learning_rate": 1e-05, "loss": 0.0205, "step": 646400 }, { "epoch": 0.006465, "grad_norm": 0.36099380254745483, "learning_rate": 1e-05, "loss": 0.0203, "step": 646500 }, { "epoch": 0.006466, "grad_norm": 0.1571231633424759, "learning_rate": 1e-05, "loss": 0.0205, "step": 646600 }, { "epoch": 0.006467, "grad_norm": 0.1578260362148285, "learning_rate": 1e-05, "loss": 0.0205, "step": 646700 }, { "epoch": 0.006468, "grad_norm": 0.16705867648124695, "learning_rate": 1e-05, "loss": 0.0205, "step": 646800 }, { "epoch": 0.006469, "grad_norm": 0.17041517794132233, "learning_rate": 1e-05, "loss": 0.0202, "step": 646900 }, { "epoch": 0.00647, "grad_norm": 0.16607269644737244, "learning_rate": 1e-05, "loss": 0.0205, "step": 647000 }, { "epoch": 0.006471, "grad_norm": 0.15130296349525452, "learning_rate": 1e-05, "loss": 0.0205, "step": 647100 }, { "epoch": 0.006472, "grad_norm": 0.1516292244195938, "learning_rate": 1e-05, "loss": 0.0201, "step": 647200 }, { "epoch": 0.006473, "grad_norm": 0.14971823990345, "learning_rate": 1e-05, "loss": 0.0206, "step": 647300 }, { "epoch": 0.006474, "grad_norm": 0.21615122258663177, "learning_rate": 1e-05, "loss": 0.0204, "step": 647400 }, { "epoch": 0.006475, "grad_norm": 0.17679740488529205, "learning_rate": 1e-05, "loss": 0.0201, "step": 647500 }, { "epoch": 0.006476, "grad_norm": 0.15894071757793427, "learning_rate": 1e-05, "loss": 0.0207, "step": 647600 }, { "epoch": 0.006477, "grad_norm": 0.15966901183128357, "learning_rate": 1e-05, "loss": 0.0203, "step": 647700 }, { "epoch": 0.006478, "grad_norm": 0.18190616369247437, "learning_rate": 1e-05, "loss": 0.0204, "step": 647800 }, { "epoch": 0.006479, "grad_norm": 0.13850374519824982, "learning_rate": 1e-05, "loss": 0.0205, "step": 647900 }, { "epoch": 0.00648, "grad_norm": 0.19216924905776978, "learning_rate": 1e-05, "loss": 0.0206, "step": 648000 }, { "epoch": 0.006481, "grad_norm": 0.18179981410503387, "learning_rate": 1e-05, "loss": 0.0202, "step": 648100 }, { "epoch": 0.006482, "grad_norm": 0.1811554729938507, "learning_rate": 1e-05, "loss": 0.0201, "step": 648200 }, { "epoch": 0.006483, "grad_norm": 0.13436846435070038, "learning_rate": 1e-05, "loss": 0.0202, "step": 648300 }, { "epoch": 0.006484, "grad_norm": 0.23776569962501526, "learning_rate": 1e-05, "loss": 0.0205, "step": 648400 }, { "epoch": 0.006485, "grad_norm": 0.15701788663864136, "learning_rate": 1e-05, "loss": 0.0204, "step": 648500 }, { "epoch": 0.006486, "grad_norm": 0.16649216413497925, "learning_rate": 1e-05, "loss": 0.0205, "step": 648600 }, { "epoch": 0.006487, "grad_norm": 0.22083057463169098, "learning_rate": 1e-05, "loss": 0.0206, "step": 648700 }, { "epoch": 0.006488, "grad_norm": 0.14907263219356537, "learning_rate": 1e-05, "loss": 0.0201, "step": 648800 }, { "epoch": 0.006489, "grad_norm": 0.2100508064031601, "learning_rate": 1e-05, "loss": 0.0205, "step": 648900 }, { "epoch": 0.00649, "grad_norm": 0.1676941215991974, "learning_rate": 1e-05, "loss": 0.0201, "step": 649000 }, { "epoch": 0.006491, "grad_norm": 0.19891075789928436, "learning_rate": 1e-05, "loss": 0.0204, "step": 649100 }, { "epoch": 0.006492, "grad_norm": 0.1801123172044754, "learning_rate": 1e-05, "loss": 0.0199, "step": 649200 }, { "epoch": 0.006493, "grad_norm": 0.1552482396364212, "learning_rate": 1e-05, "loss": 0.0203, "step": 649300 }, { "epoch": 0.006494, "grad_norm": 0.17009006440639496, "learning_rate": 1e-05, "loss": 0.0199, "step": 649400 }, { "epoch": 0.006495, "grad_norm": 0.13383033871650696, "learning_rate": 1e-05, "loss": 0.0204, "step": 649500 }, { "epoch": 0.006496, "grad_norm": 0.1461203396320343, "learning_rate": 1e-05, "loss": 0.0204, "step": 649600 }, { "epoch": 0.006497, "grad_norm": 0.16020086407661438, "learning_rate": 1e-05, "loss": 0.0207, "step": 649700 }, { "epoch": 0.006498, "grad_norm": 0.17471490800380707, "learning_rate": 1e-05, "loss": 0.0207, "step": 649800 }, { "epoch": 0.006499, "grad_norm": 0.20858030021190643, "learning_rate": 1e-05, "loss": 0.0201, "step": 649900 }, { "epoch": 0.0065, "grad_norm": 0.2552523910999298, "learning_rate": 1e-05, "loss": 0.0199, "step": 650000 }, { "epoch": 0.006501, "grad_norm": 0.15563809871673584, "learning_rate": 1e-05, "loss": 0.0201, "step": 650100 }, { "epoch": 0.006502, "grad_norm": 0.18283623456954956, "learning_rate": 1e-05, "loss": 0.0201, "step": 650200 }, { "epoch": 0.006503, "grad_norm": 0.17954513430595398, "learning_rate": 1e-05, "loss": 0.0206, "step": 650300 }, { "epoch": 0.006504, "grad_norm": 0.14711281657218933, "learning_rate": 1e-05, "loss": 0.0204, "step": 650400 }, { "epoch": 0.006505, "grad_norm": 0.16177405416965485, "learning_rate": 1e-05, "loss": 0.0204, "step": 650500 }, { "epoch": 0.006506, "grad_norm": 0.21085205674171448, "learning_rate": 1e-05, "loss": 0.0205, "step": 650600 }, { "epoch": 0.006507, "grad_norm": 0.2108727991580963, "learning_rate": 1e-05, "loss": 0.0208, "step": 650700 }, { "epoch": 0.006508, "grad_norm": 0.1738152801990509, "learning_rate": 1e-05, "loss": 0.0202, "step": 650800 }, { "epoch": 0.006509, "grad_norm": 0.1687711477279663, "learning_rate": 1e-05, "loss": 0.0196, "step": 650900 }, { "epoch": 0.00651, "grad_norm": 0.1785842329263687, "learning_rate": 1e-05, "loss": 0.0209, "step": 651000 }, { "epoch": 0.006511, "grad_norm": 0.16801589727401733, "learning_rate": 1e-05, "loss": 0.0207, "step": 651100 }, { "epoch": 0.006512, "grad_norm": 0.16775786876678467, "learning_rate": 1e-05, "loss": 0.0204, "step": 651200 }, { "epoch": 0.006513, "grad_norm": 0.21811623871326447, "learning_rate": 1e-05, "loss": 0.0203, "step": 651300 }, { "epoch": 0.006514, "grad_norm": 0.1276593655347824, "learning_rate": 1e-05, "loss": 0.0201, "step": 651400 }, { "epoch": 0.006515, "grad_norm": 0.15154752135276794, "learning_rate": 1e-05, "loss": 0.0198, "step": 651500 }, { "epoch": 0.006516, "grad_norm": 0.25107502937316895, "learning_rate": 1e-05, "loss": 0.021, "step": 651600 }, { "epoch": 0.006517, "grad_norm": 0.1775975376367569, "learning_rate": 1e-05, "loss": 0.0206, "step": 651700 }, { "epoch": 0.006518, "grad_norm": 0.15581607818603516, "learning_rate": 1e-05, "loss": 0.0204, "step": 651800 }, { "epoch": 0.006519, "grad_norm": 0.12727780640125275, "learning_rate": 1e-05, "loss": 0.0204, "step": 651900 }, { "epoch": 0.00652, "grad_norm": 0.11711833626031876, "learning_rate": 1e-05, "loss": 0.0202, "step": 652000 }, { "epoch": 0.006521, "grad_norm": 0.17599228024482727, "learning_rate": 1e-05, "loss": 0.02, "step": 652100 }, { "epoch": 0.006522, "grad_norm": 0.18165330588817596, "learning_rate": 1e-05, "loss": 0.0207, "step": 652200 }, { "epoch": 0.006523, "grad_norm": 0.19742879271507263, "learning_rate": 1e-05, "loss": 0.0199, "step": 652300 }, { "epoch": 0.006524, "grad_norm": 0.20131251215934753, "learning_rate": 1e-05, "loss": 0.0206, "step": 652400 }, { "epoch": 0.006525, "grad_norm": 0.1612449437379837, "learning_rate": 1e-05, "loss": 0.0207, "step": 652500 }, { "epoch": 0.006526, "grad_norm": 0.23194269835948944, "learning_rate": 1e-05, "loss": 0.0202, "step": 652600 }, { "epoch": 0.006527, "grad_norm": 0.12471341341733932, "learning_rate": 1e-05, "loss": 0.0201, "step": 652700 }, { "epoch": 0.006528, "grad_norm": 0.185779869556427, "learning_rate": 1e-05, "loss": 0.0202, "step": 652800 }, { "epoch": 0.006529, "grad_norm": 0.16452230513095856, "learning_rate": 1e-05, "loss": 0.0203, "step": 652900 }, { "epoch": 0.00653, "grad_norm": 0.15533314645290375, "learning_rate": 1e-05, "loss": 0.0201, "step": 653000 }, { "epoch": 0.006531, "grad_norm": 0.19425828754901886, "learning_rate": 1e-05, "loss": 0.0201, "step": 653100 }, { "epoch": 0.006532, "grad_norm": 0.13635459542274475, "learning_rate": 1e-05, "loss": 0.0201, "step": 653200 }, { "epoch": 0.006533, "grad_norm": 0.1499633640050888, "learning_rate": 1e-05, "loss": 0.0205, "step": 653300 }, { "epoch": 0.006534, "grad_norm": 0.16718940436840057, "learning_rate": 1e-05, "loss": 0.0201, "step": 653400 }, { "epoch": 0.006535, "grad_norm": 0.17429183423519135, "learning_rate": 1e-05, "loss": 0.0202, "step": 653500 }, { "epoch": 0.006536, "grad_norm": 0.14245007932186127, "learning_rate": 1e-05, "loss": 0.0203, "step": 653600 }, { "epoch": 0.006537, "grad_norm": 0.23801180720329285, "learning_rate": 1e-05, "loss": 0.0202, "step": 653700 }, { "epoch": 0.006538, "grad_norm": 0.13414740562438965, "learning_rate": 1e-05, "loss": 0.0204, "step": 653800 }, { "epoch": 0.006539, "grad_norm": 0.13431763648986816, "learning_rate": 1e-05, "loss": 0.0199, "step": 653900 }, { "epoch": 0.00654, "grad_norm": 0.13990314304828644, "learning_rate": 1e-05, "loss": 0.0204, "step": 654000 }, { "epoch": 0.006541, "grad_norm": 0.18024654686450958, "learning_rate": 1e-05, "loss": 0.0205, "step": 654100 }, { "epoch": 0.006542, "grad_norm": 0.14947643876075745, "learning_rate": 1e-05, "loss": 0.0203, "step": 654200 }, { "epoch": 0.006543, "grad_norm": 0.15789124369621277, "learning_rate": 1e-05, "loss": 0.02, "step": 654300 }, { "epoch": 0.006544, "grad_norm": 0.15005327761173248, "learning_rate": 1e-05, "loss": 0.0206, "step": 654400 }, { "epoch": 0.006545, "grad_norm": 0.13772642612457275, "learning_rate": 1e-05, "loss": 0.0202, "step": 654500 }, { "epoch": 0.006546, "grad_norm": 0.14275844395160675, "learning_rate": 1e-05, "loss": 0.0208, "step": 654600 }, { "epoch": 0.006547, "grad_norm": 0.15244492888450623, "learning_rate": 1e-05, "loss": 0.0204, "step": 654700 }, { "epoch": 0.006548, "grad_norm": 0.207069531083107, "learning_rate": 1e-05, "loss": 0.0201, "step": 654800 }, { "epoch": 0.006549, "grad_norm": 0.16732564568519592, "learning_rate": 1e-05, "loss": 0.0198, "step": 654900 }, { "epoch": 0.00655, "grad_norm": 0.1398012787103653, "learning_rate": 1e-05, "loss": 0.0206, "step": 655000 }, { "epoch": 0.006551, "grad_norm": 0.18908953666687012, "learning_rate": 1e-05, "loss": 0.0201, "step": 655100 }, { "epoch": 0.006552, "grad_norm": 0.176034078001976, "learning_rate": 1e-05, "loss": 0.0205, "step": 655200 }, { "epoch": 0.006553, "grad_norm": 0.14255258440971375, "learning_rate": 1e-05, "loss": 0.0202, "step": 655300 }, { "epoch": 0.006554, "grad_norm": 0.1492723971605301, "learning_rate": 1e-05, "loss": 0.0197, "step": 655400 }, { "epoch": 0.006555, "grad_norm": 0.20177561044692993, "learning_rate": 1e-05, "loss": 0.0205, "step": 655500 }, { "epoch": 0.006556, "grad_norm": 0.1414705216884613, "learning_rate": 1e-05, "loss": 0.0203, "step": 655600 }, { "epoch": 0.006557, "grad_norm": 0.15312445163726807, "learning_rate": 1e-05, "loss": 0.0201, "step": 655700 }, { "epoch": 0.006558, "grad_norm": 0.1598077267408371, "learning_rate": 1e-05, "loss": 0.0205, "step": 655800 }, { "epoch": 0.006559, "grad_norm": 0.20020052790641785, "learning_rate": 1e-05, "loss": 0.0204, "step": 655900 }, { "epoch": 0.00656, "grad_norm": 0.1633697897195816, "learning_rate": 1e-05, "loss": 0.0197, "step": 656000 }, { "epoch": 0.006561, "grad_norm": 0.14048472046852112, "learning_rate": 1e-05, "loss": 0.0205, "step": 656100 }, { "epoch": 0.006562, "grad_norm": 0.2522900104522705, "learning_rate": 1e-05, "loss": 0.0201, "step": 656200 }, { "epoch": 0.006563, "grad_norm": 0.21404823660850525, "learning_rate": 1e-05, "loss": 0.0207, "step": 656300 }, { "epoch": 0.006564, "grad_norm": 0.1919536292552948, "learning_rate": 1e-05, "loss": 0.0203, "step": 656400 }, { "epoch": 0.006565, "grad_norm": 0.12566766142845154, "learning_rate": 1e-05, "loss": 0.0208, "step": 656500 }, { "epoch": 0.006566, "grad_norm": 0.1648520529270172, "learning_rate": 1e-05, "loss": 0.02, "step": 656600 }, { "epoch": 0.006567, "grad_norm": 0.17544393241405487, "learning_rate": 1e-05, "loss": 0.02, "step": 656700 }, { "epoch": 0.006568, "grad_norm": 0.13357965648174286, "learning_rate": 1e-05, "loss": 0.0203, "step": 656800 }, { "epoch": 0.006569, "grad_norm": 0.34106773138046265, "learning_rate": 1e-05, "loss": 0.02, "step": 656900 }, { "epoch": 0.00657, "grad_norm": 0.17659194767475128, "learning_rate": 1e-05, "loss": 0.0199, "step": 657000 }, { "epoch": 0.006571, "grad_norm": 0.18654023110866547, "learning_rate": 1e-05, "loss": 0.02, "step": 657100 }, { "epoch": 0.006572, "grad_norm": 0.20495086908340454, "learning_rate": 1e-05, "loss": 0.0203, "step": 657200 }, { "epoch": 0.006573, "grad_norm": 0.24926085770130157, "learning_rate": 1e-05, "loss": 0.0205, "step": 657300 }, { "epoch": 0.006574, "grad_norm": 0.1735149770975113, "learning_rate": 1e-05, "loss": 0.0203, "step": 657400 }, { "epoch": 0.006575, "grad_norm": 0.11423244327306747, "learning_rate": 1e-05, "loss": 0.0208, "step": 657500 }, { "epoch": 0.006576, "grad_norm": 0.21853870153427124, "learning_rate": 1e-05, "loss": 0.0202, "step": 657600 }, { "epoch": 0.006577, "grad_norm": 0.13131579756736755, "learning_rate": 1e-05, "loss": 0.0205, "step": 657700 }, { "epoch": 0.006578, "grad_norm": 0.14778390526771545, "learning_rate": 1e-05, "loss": 0.0202, "step": 657800 }, { "epoch": 0.006579, "grad_norm": 0.17639094591140747, "learning_rate": 1e-05, "loss": 0.0201, "step": 657900 }, { "epoch": 0.00658, "grad_norm": 0.13444869220256805, "learning_rate": 1e-05, "loss": 0.0205, "step": 658000 }, { "epoch": 0.006581, "grad_norm": 0.3425089716911316, "learning_rate": 1e-05, "loss": 0.02, "step": 658100 }, { "epoch": 0.006582, "grad_norm": 0.14317145943641663, "learning_rate": 1e-05, "loss": 0.0203, "step": 658200 }, { "epoch": 0.006583, "grad_norm": 0.20663900673389435, "learning_rate": 1e-05, "loss": 0.02, "step": 658300 }, { "epoch": 0.006584, "grad_norm": 0.14128293097019196, "learning_rate": 1e-05, "loss": 0.0202, "step": 658400 }, { "epoch": 0.006585, "grad_norm": 0.16130098700523376, "learning_rate": 1e-05, "loss": 0.0203, "step": 658500 }, { "epoch": 0.006586, "grad_norm": 0.1786297857761383, "learning_rate": 1e-05, "loss": 0.0206, "step": 658600 }, { "epoch": 0.006587, "grad_norm": 0.1569976508617401, "learning_rate": 1e-05, "loss": 0.0201, "step": 658700 }, { "epoch": 0.006588, "grad_norm": 0.1617337018251419, "learning_rate": 1e-05, "loss": 0.0205, "step": 658800 }, { "epoch": 0.006589, "grad_norm": 0.15332622826099396, "learning_rate": 1e-05, "loss": 0.0202, "step": 658900 }, { "epoch": 0.00659, "grad_norm": 0.17310881614685059, "learning_rate": 1e-05, "loss": 0.0202, "step": 659000 }, { "epoch": 0.006591, "grad_norm": 0.15419475734233856, "learning_rate": 1e-05, "loss": 0.0203, "step": 659100 }, { "epoch": 0.006592, "grad_norm": 0.1566811203956604, "learning_rate": 1e-05, "loss": 0.0201, "step": 659200 }, { "epoch": 0.006593, "grad_norm": 0.18507570028305054, "learning_rate": 1e-05, "loss": 0.02, "step": 659300 }, { "epoch": 0.006594, "grad_norm": 0.1857679933309555, "learning_rate": 1e-05, "loss": 0.0204, "step": 659400 }, { "epoch": 0.006595, "grad_norm": 0.12878215312957764, "learning_rate": 1e-05, "loss": 0.0205, "step": 659500 }, { "epoch": 0.006596, "grad_norm": 0.23296770453453064, "learning_rate": 1e-05, "loss": 0.0201, "step": 659600 }, { "epoch": 0.006597, "grad_norm": 0.18017607927322388, "learning_rate": 1e-05, "loss": 0.0201, "step": 659700 }, { "epoch": 0.006598, "grad_norm": 0.12324637174606323, "learning_rate": 1e-05, "loss": 0.0205, "step": 659800 }, { "epoch": 0.006599, "grad_norm": 0.1537853479385376, "learning_rate": 1e-05, "loss": 0.0204, "step": 659900 }, { "epoch": 0.0066, "grad_norm": 0.15806764364242554, "learning_rate": 1e-05, "loss": 0.0203, "step": 660000 }, { "epoch": 0.0066, "eval_loss": 0.018065335229039192, "eval_runtime": 169.0584, "eval_samples_per_second": 295.756, "eval_steps_per_second": 18.485, "step": 660000 }, { "epoch": 0.006601, "grad_norm": 0.1685188114643097, "learning_rate": 1e-05, "loss": 0.0205, "step": 660100 }, { "epoch": 0.006602, "grad_norm": 0.20284287631511688, "learning_rate": 1e-05, "loss": 0.0205, "step": 660200 }, { "epoch": 0.006603, "grad_norm": 0.13748309016227722, "learning_rate": 1e-05, "loss": 0.02, "step": 660300 }, { "epoch": 0.006604, "grad_norm": 0.15432852506637573, "learning_rate": 1e-05, "loss": 0.02, "step": 660400 }, { "epoch": 0.006605, "grad_norm": 0.13139982521533966, "learning_rate": 1e-05, "loss": 0.0205, "step": 660500 }, { "epoch": 0.006606, "grad_norm": 0.1521771252155304, "learning_rate": 1e-05, "loss": 0.0201, "step": 660600 }, { "epoch": 0.006607, "grad_norm": 0.19163019955158234, "learning_rate": 1e-05, "loss": 0.0204, "step": 660700 }, { "epoch": 0.006608, "grad_norm": 0.19552449882030487, "learning_rate": 1e-05, "loss": 0.0202, "step": 660800 }, { "epoch": 0.006609, "grad_norm": 0.14210863411426544, "learning_rate": 1e-05, "loss": 0.02, "step": 660900 }, { "epoch": 0.00661, "grad_norm": 0.11765111982822418, "learning_rate": 1e-05, "loss": 0.0201, "step": 661000 }, { "epoch": 0.006611, "grad_norm": 0.16293568909168243, "learning_rate": 1e-05, "loss": 0.0205, "step": 661100 }, { "epoch": 0.006612, "grad_norm": 0.1780109703540802, "learning_rate": 1e-05, "loss": 0.0202, "step": 661200 }, { "epoch": 0.006613, "grad_norm": 0.14589357376098633, "learning_rate": 1e-05, "loss": 0.0207, "step": 661300 }, { "epoch": 0.006614, "grad_norm": 0.1467968076467514, "learning_rate": 1e-05, "loss": 0.0201, "step": 661400 }, { "epoch": 0.006615, "grad_norm": 0.1308368444442749, "learning_rate": 1e-05, "loss": 0.0203, "step": 661500 }, { "epoch": 0.006616, "grad_norm": 0.11129401624202728, "learning_rate": 1e-05, "loss": 0.0202, "step": 661600 }, { "epoch": 0.006617, "grad_norm": 0.12362271547317505, "learning_rate": 1e-05, "loss": 0.0204, "step": 661700 }, { "epoch": 0.006618, "grad_norm": 0.19282498955726624, "learning_rate": 1e-05, "loss": 0.0202, "step": 661800 }, { "epoch": 0.006619, "grad_norm": 0.23562999069690704, "learning_rate": 1e-05, "loss": 0.0199, "step": 661900 }, { "epoch": 0.00662, "grad_norm": 0.16727818548679352, "learning_rate": 1e-05, "loss": 0.0204, "step": 662000 }, { "epoch": 0.006621, "grad_norm": 0.17108270525932312, "learning_rate": 1e-05, "loss": 0.02, "step": 662100 }, { "epoch": 0.006622, "grad_norm": 0.1513115018606186, "learning_rate": 1e-05, "loss": 0.0208, "step": 662200 }, { "epoch": 0.006623, "grad_norm": 0.2174321860074997, "learning_rate": 1e-05, "loss": 0.0194, "step": 662300 }, { "epoch": 0.006624, "grad_norm": 0.20893703401088715, "learning_rate": 1e-05, "loss": 0.0194, "step": 662400 }, { "epoch": 0.006625, "grad_norm": 0.2177962064743042, "learning_rate": 1e-05, "loss": 0.0208, "step": 662500 }, { "epoch": 0.006626, "grad_norm": 0.21626216173171997, "learning_rate": 1e-05, "loss": 0.0204, "step": 662600 }, { "epoch": 0.006627, "grad_norm": 0.21712026000022888, "learning_rate": 1e-05, "loss": 0.0204, "step": 662700 }, { "epoch": 0.006628, "grad_norm": 0.16072022914886475, "learning_rate": 1e-05, "loss": 0.0204, "step": 662800 }, { "epoch": 0.006629, "grad_norm": 0.1293783187866211, "learning_rate": 1e-05, "loss": 0.0199, "step": 662900 }, { "epoch": 0.00663, "grad_norm": 0.13301992416381836, "learning_rate": 1e-05, "loss": 0.0201, "step": 663000 }, { "epoch": 0.006631, "grad_norm": 0.2498013824224472, "learning_rate": 1e-05, "loss": 0.0202, "step": 663100 }, { "epoch": 0.006632, "grad_norm": 0.22724445164203644, "learning_rate": 1e-05, "loss": 0.0201, "step": 663200 }, { "epoch": 0.006633, "grad_norm": 0.15099382400512695, "learning_rate": 1e-05, "loss": 0.02, "step": 663300 }, { "epoch": 0.006634, "grad_norm": 0.1310555636882782, "learning_rate": 1e-05, "loss": 0.0202, "step": 663400 }, { "epoch": 0.006635, "grad_norm": 0.2283080816268921, "learning_rate": 1e-05, "loss": 0.02, "step": 663500 }, { "epoch": 0.006636, "grad_norm": 0.19001807272434235, "learning_rate": 1e-05, "loss": 0.0204, "step": 663600 }, { "epoch": 0.006637, "grad_norm": 0.17126113176345825, "learning_rate": 1e-05, "loss": 0.02, "step": 663700 }, { "epoch": 0.006638, "grad_norm": 0.15017841756343842, "learning_rate": 1e-05, "loss": 0.0204, "step": 663800 }, { "epoch": 0.006639, "grad_norm": 0.18610727787017822, "learning_rate": 1e-05, "loss": 0.02, "step": 663900 }, { "epoch": 0.00664, "grad_norm": 0.18615812063217163, "learning_rate": 1e-05, "loss": 0.0204, "step": 664000 }, { "epoch": 0.006641, "grad_norm": 0.18168367445468903, "learning_rate": 1e-05, "loss": 0.0199, "step": 664100 }, { "epoch": 0.006642, "grad_norm": 0.16449566185474396, "learning_rate": 1e-05, "loss": 0.0197, "step": 664200 }, { "epoch": 0.006643, "grad_norm": 0.18041294813156128, "learning_rate": 1e-05, "loss": 0.0201, "step": 664300 }, { "epoch": 0.006644, "grad_norm": 0.14277030527591705, "learning_rate": 1e-05, "loss": 0.0201, "step": 664400 }, { "epoch": 0.006645, "grad_norm": 0.16397753357887268, "learning_rate": 1e-05, "loss": 0.0195, "step": 664500 }, { "epoch": 0.006646, "grad_norm": 0.1995062679052353, "learning_rate": 1e-05, "loss": 0.0202, "step": 664600 }, { "epoch": 0.006647, "grad_norm": 0.1738767772912979, "learning_rate": 1e-05, "loss": 0.0202, "step": 664700 }, { "epoch": 0.006648, "grad_norm": 0.15077786147594452, "learning_rate": 1e-05, "loss": 0.0198, "step": 664800 }, { "epoch": 0.006649, "grad_norm": 0.11903395503759384, "learning_rate": 1e-05, "loss": 0.0201, "step": 664900 }, { "epoch": 0.00665, "grad_norm": 0.18736159801483154, "learning_rate": 1e-05, "loss": 0.0203, "step": 665000 }, { "epoch": 0.006651, "grad_norm": 0.14263804256916046, "learning_rate": 1e-05, "loss": 0.0202, "step": 665100 }, { "epoch": 0.006652, "grad_norm": 0.16837500035762787, "learning_rate": 1e-05, "loss": 0.0202, "step": 665200 }, { "epoch": 0.006653, "grad_norm": 0.24975839257240295, "learning_rate": 1e-05, "loss": 0.0202, "step": 665300 }, { "epoch": 0.006654, "grad_norm": 0.21337443590164185, "learning_rate": 1e-05, "loss": 0.0199, "step": 665400 }, { "epoch": 0.006655, "grad_norm": 0.1382983773946762, "learning_rate": 1e-05, "loss": 0.0198, "step": 665500 }, { "epoch": 0.006656, "grad_norm": 0.20605657994747162, "learning_rate": 1e-05, "loss": 0.0203, "step": 665600 }, { "epoch": 0.006657, "grad_norm": 0.16358637809753418, "learning_rate": 1e-05, "loss": 0.0196, "step": 665700 }, { "epoch": 0.006658, "grad_norm": 0.15890085697174072, "learning_rate": 1e-05, "loss": 0.0198, "step": 665800 }, { "epoch": 0.006659, "grad_norm": 0.1520826667547226, "learning_rate": 1e-05, "loss": 0.0204, "step": 665900 }, { "epoch": 0.00666, "grad_norm": 0.172838032245636, "learning_rate": 1e-05, "loss": 0.0199, "step": 666000 }, { "epoch": 0.006661, "grad_norm": 0.21179205179214478, "learning_rate": 1e-05, "loss": 0.0202, "step": 666100 }, { "epoch": 0.006662, "grad_norm": 0.18424706161022186, "learning_rate": 1e-05, "loss": 0.0204, "step": 666200 }, { "epoch": 0.006663, "grad_norm": 0.1507119983434677, "learning_rate": 1e-05, "loss": 0.0203, "step": 666300 }, { "epoch": 0.006664, "grad_norm": 0.18394121527671814, "learning_rate": 1e-05, "loss": 0.0205, "step": 666400 }, { "epoch": 0.006665, "grad_norm": 0.25857827067375183, "learning_rate": 1e-05, "loss": 0.0203, "step": 666500 }, { "epoch": 0.006666, "grad_norm": 0.11875756084918976, "learning_rate": 1e-05, "loss": 0.0202, "step": 666600 }, { "epoch": 0.006667, "grad_norm": 0.1530715674161911, "learning_rate": 1e-05, "loss": 0.02, "step": 666700 }, { "epoch": 0.006668, "grad_norm": 0.16316597163677216, "learning_rate": 1e-05, "loss": 0.0204, "step": 666800 }, { "epoch": 0.006669, "grad_norm": 0.164708212018013, "learning_rate": 1e-05, "loss": 0.02, "step": 666900 }, { "epoch": 0.00667, "grad_norm": 0.15878616273403168, "learning_rate": 1e-05, "loss": 0.0207, "step": 667000 }, { "epoch": 0.006671, "grad_norm": 0.13918708264827728, "learning_rate": 1e-05, "loss": 0.0203, "step": 667100 }, { "epoch": 0.006672, "grad_norm": 0.15655791759490967, "learning_rate": 1e-05, "loss": 0.02, "step": 667200 }, { "epoch": 0.006673, "grad_norm": 0.1398797333240509, "learning_rate": 1e-05, "loss": 0.0201, "step": 667300 }, { "epoch": 0.006674, "grad_norm": 0.17276053130626678, "learning_rate": 1e-05, "loss": 0.0194, "step": 667400 }, { "epoch": 0.006675, "grad_norm": 0.15728874504566193, "learning_rate": 1e-05, "loss": 0.0197, "step": 667500 }, { "epoch": 0.006676, "grad_norm": 0.18961840867996216, "learning_rate": 1e-05, "loss": 0.0199, "step": 667600 }, { "epoch": 0.006677, "grad_norm": 0.20413389801979065, "learning_rate": 1e-05, "loss": 0.02, "step": 667700 }, { "epoch": 0.006678, "grad_norm": 0.1591649204492569, "learning_rate": 1e-05, "loss": 0.02, "step": 667800 }, { "epoch": 0.006679, "grad_norm": 0.12278532236814499, "learning_rate": 1e-05, "loss": 0.0205, "step": 667900 }, { "epoch": 0.00668, "grad_norm": 0.1476123332977295, "learning_rate": 1e-05, "loss": 0.0201, "step": 668000 }, { "epoch": 0.006681, "grad_norm": 0.20554737746715546, "learning_rate": 1e-05, "loss": 0.0198, "step": 668100 }, { "epoch": 0.006682, "grad_norm": 0.1267973780632019, "learning_rate": 1e-05, "loss": 0.0205, "step": 668200 }, { "epoch": 0.006683, "grad_norm": 0.165740504860878, "learning_rate": 1e-05, "loss": 0.02, "step": 668300 }, { "epoch": 0.006684, "grad_norm": 0.16048841178417206, "learning_rate": 1e-05, "loss": 0.0202, "step": 668400 }, { "epoch": 0.006685, "grad_norm": 0.1966540515422821, "learning_rate": 1e-05, "loss": 0.0201, "step": 668500 }, { "epoch": 0.006686, "grad_norm": 0.21526701748371124, "learning_rate": 1e-05, "loss": 0.0202, "step": 668600 }, { "epoch": 0.006687, "grad_norm": 0.13313326239585876, "learning_rate": 1e-05, "loss": 0.0203, "step": 668700 }, { "epoch": 0.006688, "grad_norm": 0.1759059578180313, "learning_rate": 1e-05, "loss": 0.0203, "step": 668800 }, { "epoch": 0.006689, "grad_norm": 0.19922222197055817, "learning_rate": 1e-05, "loss": 0.0199, "step": 668900 }, { "epoch": 0.00669, "grad_norm": 0.17820875346660614, "learning_rate": 1e-05, "loss": 0.0199, "step": 669000 }, { "epoch": 0.006691, "grad_norm": 0.15161626040935516, "learning_rate": 1e-05, "loss": 0.02, "step": 669100 }, { "epoch": 0.006692, "grad_norm": 0.12759795784950256, "learning_rate": 1e-05, "loss": 0.0198, "step": 669200 }, { "epoch": 0.006693, "grad_norm": 0.1796836405992508, "learning_rate": 1e-05, "loss": 0.0202, "step": 669300 }, { "epoch": 0.006694, "grad_norm": 0.19587582349777222, "learning_rate": 1e-05, "loss": 0.0201, "step": 669400 }, { "epoch": 0.006695, "grad_norm": 0.17890925705432892, "learning_rate": 1e-05, "loss": 0.0201, "step": 669500 }, { "epoch": 0.006696, "grad_norm": 0.1638539433479309, "learning_rate": 1e-05, "loss": 0.0202, "step": 669600 }, { "epoch": 0.006697, "grad_norm": 0.1897086799144745, "learning_rate": 1e-05, "loss": 0.0206, "step": 669700 }, { "epoch": 0.006698, "grad_norm": 0.17429326474666595, "learning_rate": 1e-05, "loss": 0.0201, "step": 669800 }, { "epoch": 0.006699, "grad_norm": 0.14986185729503632, "learning_rate": 1e-05, "loss": 0.0203, "step": 669900 }, { "epoch": 0.0067, "grad_norm": 0.18413834273815155, "learning_rate": 1e-05, "loss": 0.0206, "step": 670000 }, { "epoch": 0.006701, "grad_norm": 0.13743361830711365, "learning_rate": 1e-05, "loss": 0.0198, "step": 670100 }, { "epoch": 0.006702, "grad_norm": 0.13850833475589752, "learning_rate": 1e-05, "loss": 0.0201, "step": 670200 }, { "epoch": 0.006703, "grad_norm": 0.15366974472999573, "learning_rate": 1e-05, "loss": 0.0205, "step": 670300 }, { "epoch": 0.006704, "grad_norm": 0.22855767607688904, "learning_rate": 1e-05, "loss": 0.0204, "step": 670400 }, { "epoch": 0.006705, "grad_norm": 0.16353899240493774, "learning_rate": 1e-05, "loss": 0.0194, "step": 670500 }, { "epoch": 0.006706, "grad_norm": 0.17644701898097992, "learning_rate": 1e-05, "loss": 0.0206, "step": 670600 }, { "epoch": 0.006707, "grad_norm": 0.12759122252464294, "learning_rate": 1e-05, "loss": 0.02, "step": 670700 }, { "epoch": 0.006708, "grad_norm": 0.14656227827072144, "learning_rate": 1e-05, "loss": 0.0203, "step": 670800 }, { "epoch": 0.006709, "grad_norm": 0.16387279331684113, "learning_rate": 1e-05, "loss": 0.02, "step": 670900 }, { "epoch": 0.00671, "grad_norm": 0.19638237357139587, "learning_rate": 1e-05, "loss": 0.0205, "step": 671000 }, { "epoch": 0.006711, "grad_norm": 0.18617086112499237, "learning_rate": 1e-05, "loss": 0.0204, "step": 671100 }, { "epoch": 0.006712, "grad_norm": 0.14767727255821228, "learning_rate": 1e-05, "loss": 0.0202, "step": 671200 }, { "epoch": 0.006713, "grad_norm": 0.12382063269615173, "learning_rate": 1e-05, "loss": 0.0204, "step": 671300 }, { "epoch": 0.006714, "grad_norm": 0.1871551275253296, "learning_rate": 1e-05, "loss": 0.0201, "step": 671400 }, { "epoch": 0.006715, "grad_norm": 0.15697261691093445, "learning_rate": 1e-05, "loss": 0.0199, "step": 671500 }, { "epoch": 0.006716, "grad_norm": 0.15743373334407806, "learning_rate": 1e-05, "loss": 0.0202, "step": 671600 }, { "epoch": 0.006717, "grad_norm": 0.14203931391239166, "learning_rate": 1e-05, "loss": 0.0199, "step": 671700 }, { "epoch": 0.006718, "grad_norm": 0.22024375200271606, "learning_rate": 1e-05, "loss": 0.02, "step": 671800 }, { "epoch": 0.006719, "grad_norm": 0.17285870015621185, "learning_rate": 1e-05, "loss": 0.0197, "step": 671900 }, { "epoch": 0.00672, "grad_norm": 0.14079077541828156, "learning_rate": 1e-05, "loss": 0.0199, "step": 672000 }, { "epoch": 0.006721, "grad_norm": 0.15664608776569366, "learning_rate": 1e-05, "loss": 0.0206, "step": 672100 }, { "epoch": 0.006722, "grad_norm": 0.13246087729930878, "learning_rate": 1e-05, "loss": 0.02, "step": 672200 }, { "epoch": 0.006723, "grad_norm": 0.13810038566589355, "learning_rate": 1e-05, "loss": 0.0196, "step": 672300 }, { "epoch": 0.006724, "grad_norm": 0.19153283536434174, "learning_rate": 1e-05, "loss": 0.0202, "step": 672400 }, { "epoch": 0.006725, "grad_norm": 0.14055797457695007, "learning_rate": 1e-05, "loss": 0.02, "step": 672500 }, { "epoch": 0.006726, "grad_norm": 0.20073598623275757, "learning_rate": 1e-05, "loss": 0.0204, "step": 672600 }, { "epoch": 0.006727, "grad_norm": 0.18673254549503326, "learning_rate": 1e-05, "loss": 0.0197, "step": 672700 }, { "epoch": 0.006728, "grad_norm": 0.16903413832187653, "learning_rate": 1e-05, "loss": 0.0196, "step": 672800 }, { "epoch": 0.006729, "grad_norm": 0.15411794185638428, "learning_rate": 1e-05, "loss": 0.0204, "step": 672900 }, { "epoch": 0.00673, "grad_norm": 0.18545544147491455, "learning_rate": 1e-05, "loss": 0.0196, "step": 673000 }, { "epoch": 0.006731, "grad_norm": 0.14976796507835388, "learning_rate": 1e-05, "loss": 0.0202, "step": 673100 }, { "epoch": 0.006732, "grad_norm": 0.15572668612003326, "learning_rate": 1e-05, "loss": 0.0201, "step": 673200 }, { "epoch": 0.006733, "grad_norm": 0.22145171463489532, "learning_rate": 1e-05, "loss": 0.0198, "step": 673300 }, { "epoch": 0.006734, "grad_norm": 0.20904424786567688, "learning_rate": 1e-05, "loss": 0.0202, "step": 673400 }, { "epoch": 0.006735, "grad_norm": 0.1777569204568863, "learning_rate": 1e-05, "loss": 0.0202, "step": 673500 }, { "epoch": 0.006736, "grad_norm": 0.18887603282928467, "learning_rate": 1e-05, "loss": 0.0205, "step": 673600 }, { "epoch": 0.006737, "grad_norm": 0.17859691381454468, "learning_rate": 1e-05, "loss": 0.02, "step": 673700 }, { "epoch": 0.006738, "grad_norm": 0.2167612612247467, "learning_rate": 1e-05, "loss": 0.02, "step": 673800 }, { "epoch": 0.006739, "grad_norm": 0.18777401745319366, "learning_rate": 1e-05, "loss": 0.02, "step": 673900 }, { "epoch": 0.00674, "grad_norm": 0.14620791375637054, "learning_rate": 1e-05, "loss": 0.0201, "step": 674000 }, { "epoch": 0.006741, "grad_norm": 0.13165128231048584, "learning_rate": 1e-05, "loss": 0.0195, "step": 674100 }, { "epoch": 0.006742, "grad_norm": 0.14512252807617188, "learning_rate": 1e-05, "loss": 0.0205, "step": 674200 }, { "epoch": 0.006743, "grad_norm": 0.14688247442245483, "learning_rate": 1e-05, "loss": 0.0197, "step": 674300 }, { "epoch": 0.006744, "grad_norm": 0.13422133028507233, "learning_rate": 1e-05, "loss": 0.0202, "step": 674400 }, { "epoch": 0.006745, "grad_norm": 0.18117086589336395, "learning_rate": 1e-05, "loss": 0.0201, "step": 674500 }, { "epoch": 0.006746, "grad_norm": 0.17796590924263, "learning_rate": 1e-05, "loss": 0.0202, "step": 674600 }, { "epoch": 0.006747, "grad_norm": 0.20980772376060486, "learning_rate": 1e-05, "loss": 0.0206, "step": 674700 }, { "epoch": 0.006748, "grad_norm": 0.13397839665412903, "learning_rate": 1e-05, "loss": 0.0202, "step": 674800 }, { "epoch": 0.006749, "grad_norm": 0.15501612424850464, "learning_rate": 1e-05, "loss": 0.0203, "step": 674900 }, { "epoch": 0.00675, "grad_norm": 0.17488625645637512, "learning_rate": 1e-05, "loss": 0.0202, "step": 675000 }, { "epoch": 0.006751, "grad_norm": 0.2526601254940033, "learning_rate": 1e-05, "loss": 0.0202, "step": 675100 }, { "epoch": 0.006752, "grad_norm": 0.14359185099601746, "learning_rate": 1e-05, "loss": 0.0198, "step": 675200 }, { "epoch": 0.006753, "grad_norm": 0.1663753092288971, "learning_rate": 1e-05, "loss": 0.0197, "step": 675300 }, { "epoch": 0.006754, "grad_norm": 0.11094969511032104, "learning_rate": 1e-05, "loss": 0.0208, "step": 675400 }, { "epoch": 0.006755, "grad_norm": 0.2387596070766449, "learning_rate": 1e-05, "loss": 0.0198, "step": 675500 }, { "epoch": 0.006756, "grad_norm": 0.1409689486026764, "learning_rate": 1e-05, "loss": 0.0199, "step": 675600 }, { "epoch": 0.006757, "grad_norm": 0.17202328145503998, "learning_rate": 1e-05, "loss": 0.02, "step": 675700 }, { "epoch": 0.006758, "grad_norm": 0.25168487429618835, "learning_rate": 1e-05, "loss": 0.0199, "step": 675800 }, { "epoch": 0.006759, "grad_norm": 0.1844267100095749, "learning_rate": 1e-05, "loss": 0.0198, "step": 675900 }, { "epoch": 0.00676, "grad_norm": 0.20796677470207214, "learning_rate": 1e-05, "loss": 0.0197, "step": 676000 }, { "epoch": 0.006761, "grad_norm": 0.13332122564315796, "learning_rate": 1e-05, "loss": 0.0205, "step": 676100 }, { "epoch": 0.006762, "grad_norm": 0.1697768121957779, "learning_rate": 1e-05, "loss": 0.02, "step": 676200 }, { "epoch": 0.006763, "grad_norm": 0.1187852993607521, "learning_rate": 1e-05, "loss": 0.0197, "step": 676300 }, { "epoch": 0.006764, "grad_norm": 0.1611827313899994, "learning_rate": 1e-05, "loss": 0.0203, "step": 676400 }, { "epoch": 0.006765, "grad_norm": 0.15297454595565796, "learning_rate": 1e-05, "loss": 0.0203, "step": 676500 }, { "epoch": 0.006766, "grad_norm": 0.11534599959850311, "learning_rate": 1e-05, "loss": 0.0199, "step": 676600 }, { "epoch": 0.006767, "grad_norm": 0.18607285618782043, "learning_rate": 1e-05, "loss": 0.0204, "step": 676700 }, { "epoch": 0.006768, "grad_norm": 0.17166830599308014, "learning_rate": 1e-05, "loss": 0.0205, "step": 676800 }, { "epoch": 0.006769, "grad_norm": 0.12974220514297485, "learning_rate": 1e-05, "loss": 0.0206, "step": 676900 }, { "epoch": 0.00677, "grad_norm": 0.16516250371932983, "learning_rate": 1e-05, "loss": 0.02, "step": 677000 }, { "epoch": 0.006771, "grad_norm": 0.17065294086933136, "learning_rate": 1e-05, "loss": 0.0199, "step": 677100 }, { "epoch": 0.006772, "grad_norm": 0.15111202001571655, "learning_rate": 1e-05, "loss": 0.0202, "step": 677200 }, { "epoch": 0.006773, "grad_norm": 0.17312030494213104, "learning_rate": 1e-05, "loss": 0.0196, "step": 677300 }, { "epoch": 0.006774, "grad_norm": 0.2509268820285797, "learning_rate": 1e-05, "loss": 0.0201, "step": 677400 }, { "epoch": 0.006775, "grad_norm": 0.15134558081626892, "learning_rate": 1e-05, "loss": 0.0197, "step": 677500 }, { "epoch": 0.006776, "grad_norm": 0.14988639950752258, "learning_rate": 1e-05, "loss": 0.02, "step": 677600 }, { "epoch": 0.006777, "grad_norm": 0.20623798668384552, "learning_rate": 1e-05, "loss": 0.02, "step": 677700 }, { "epoch": 0.006778, "grad_norm": 0.1512375771999359, "learning_rate": 1e-05, "loss": 0.0202, "step": 677800 }, { "epoch": 0.006779, "grad_norm": 0.20820726454257965, "learning_rate": 1e-05, "loss": 0.0202, "step": 677900 }, { "epoch": 0.00678, "grad_norm": 0.25777581334114075, "learning_rate": 1e-05, "loss": 0.0198, "step": 678000 }, { "epoch": 0.006781, "grad_norm": 0.17670346796512604, "learning_rate": 1e-05, "loss": 0.0206, "step": 678100 }, { "epoch": 0.006782, "grad_norm": 0.1608748733997345, "learning_rate": 1e-05, "loss": 0.0197, "step": 678200 }, { "epoch": 0.006783, "grad_norm": 0.14385201036930084, "learning_rate": 1e-05, "loss": 0.0196, "step": 678300 }, { "epoch": 0.006784, "grad_norm": 0.18606460094451904, "learning_rate": 1e-05, "loss": 0.0202, "step": 678400 }, { "epoch": 0.006785, "grad_norm": 0.12123222649097443, "learning_rate": 1e-05, "loss": 0.0202, "step": 678500 }, { "epoch": 0.006786, "grad_norm": 0.1318989247083664, "learning_rate": 1e-05, "loss": 0.02, "step": 678600 }, { "epoch": 0.006787, "grad_norm": 0.17261970043182373, "learning_rate": 1e-05, "loss": 0.0208, "step": 678700 }, { "epoch": 0.006788, "grad_norm": 0.11563287675380707, "learning_rate": 1e-05, "loss": 0.0198, "step": 678800 }, { "epoch": 0.006789, "grad_norm": 0.15004216134548187, "learning_rate": 1e-05, "loss": 0.0198, "step": 678900 }, { "epoch": 0.00679, "grad_norm": 0.12859517335891724, "learning_rate": 1e-05, "loss": 0.0204, "step": 679000 }, { "epoch": 0.006791, "grad_norm": 0.17998598515987396, "learning_rate": 1e-05, "loss": 0.02, "step": 679100 }, { "epoch": 0.006792, "grad_norm": 0.16366615891456604, "learning_rate": 1e-05, "loss": 0.0201, "step": 679200 }, { "epoch": 0.006793, "grad_norm": 0.12029530107975006, "learning_rate": 1e-05, "loss": 0.0205, "step": 679300 }, { "epoch": 0.006794, "grad_norm": 0.18130284547805786, "learning_rate": 1e-05, "loss": 0.0201, "step": 679400 }, { "epoch": 0.006795, "grad_norm": 0.21762274205684662, "learning_rate": 1e-05, "loss": 0.0201, "step": 679500 }, { "epoch": 0.006796, "grad_norm": 0.13823309540748596, "learning_rate": 1e-05, "loss": 0.0196, "step": 679600 }, { "epoch": 0.006797, "grad_norm": 0.18029040098190308, "learning_rate": 1e-05, "loss": 0.02, "step": 679700 }, { "epoch": 0.006798, "grad_norm": 0.14194108545780182, "learning_rate": 1e-05, "loss": 0.0202, "step": 679800 }, { "epoch": 0.006799, "grad_norm": 0.17548969388008118, "learning_rate": 1e-05, "loss": 0.0197, "step": 679900 }, { "epoch": 0.0068, "grad_norm": 0.15614669024944305, "learning_rate": 1e-05, "loss": 0.0202, "step": 680000 }, { "epoch": 0.0068, "eval_loss": 0.019248666241765022, "eval_runtime": 171.3146, "eval_samples_per_second": 291.861, "eval_steps_per_second": 18.241, "step": 680000 }, { "epoch": 0.006801, "grad_norm": 0.21116790175437927, "learning_rate": 1e-05, "loss": 0.0198, "step": 680100 }, { "epoch": 0.006802, "grad_norm": 0.2088579535484314, "learning_rate": 1e-05, "loss": 0.0201, "step": 680200 }, { "epoch": 0.006803, "grad_norm": 0.15514160692691803, "learning_rate": 1e-05, "loss": 0.02, "step": 680300 }, { "epoch": 0.006804, "grad_norm": 0.14425097405910492, "learning_rate": 1e-05, "loss": 0.0202, "step": 680400 }, { "epoch": 0.006805, "grad_norm": 0.15971289575099945, "learning_rate": 1e-05, "loss": 0.0203, "step": 680500 }, { "epoch": 0.006806, "grad_norm": 0.20289725065231323, "learning_rate": 1e-05, "loss": 0.0198, "step": 680600 }, { "epoch": 0.006807, "grad_norm": 0.3303561508655548, "learning_rate": 1e-05, "loss": 0.02, "step": 680700 }, { "epoch": 0.006808, "grad_norm": 0.19245842099189758, "learning_rate": 1e-05, "loss": 0.02, "step": 680800 }, { "epoch": 0.006809, "grad_norm": 0.20238684117794037, "learning_rate": 1e-05, "loss": 0.0204, "step": 680900 }, { "epoch": 0.00681, "grad_norm": 0.17910122871398926, "learning_rate": 1e-05, "loss": 0.0203, "step": 681000 }, { "epoch": 0.006811, "grad_norm": 0.21480730175971985, "learning_rate": 1e-05, "loss": 0.0203, "step": 681100 }, { "epoch": 0.006812, "grad_norm": 0.19475263357162476, "learning_rate": 1e-05, "loss": 0.02, "step": 681200 }, { "epoch": 0.006813, "grad_norm": 0.17588825523853302, "learning_rate": 1e-05, "loss": 0.0201, "step": 681300 }, { "epoch": 0.006814, "grad_norm": 0.15158799290657043, "learning_rate": 1e-05, "loss": 0.0201, "step": 681400 }, { "epoch": 0.006815, "grad_norm": 0.2074403017759323, "learning_rate": 1e-05, "loss": 0.0202, "step": 681500 }, { "epoch": 0.006816, "grad_norm": 0.1557527482509613, "learning_rate": 1e-05, "loss": 0.0201, "step": 681600 }, { "epoch": 0.006817, "grad_norm": 0.17362158000469208, "learning_rate": 1e-05, "loss": 0.0201, "step": 681700 }, { "epoch": 0.006818, "grad_norm": 0.16492517292499542, "learning_rate": 1e-05, "loss": 0.0205, "step": 681800 }, { "epoch": 0.006819, "grad_norm": 0.19607782363891602, "learning_rate": 1e-05, "loss": 0.0204, "step": 681900 }, { "epoch": 0.00682, "grad_norm": 0.14339469373226166, "learning_rate": 1e-05, "loss": 0.0201, "step": 682000 }, { "epoch": 0.006821, "grad_norm": 0.15967752039432526, "learning_rate": 1e-05, "loss": 0.0195, "step": 682100 }, { "epoch": 0.006822, "grad_norm": 0.14061446487903595, "learning_rate": 1e-05, "loss": 0.0195, "step": 682200 }, { "epoch": 0.006823, "grad_norm": 0.16728782653808594, "learning_rate": 1e-05, "loss": 0.0202, "step": 682300 }, { "epoch": 0.006824, "grad_norm": 0.21214069426059723, "learning_rate": 1e-05, "loss": 0.0198, "step": 682400 }, { "epoch": 0.006825, "grad_norm": 0.09942296147346497, "learning_rate": 1e-05, "loss": 0.0199, "step": 682500 }, { "epoch": 0.006826, "grad_norm": 0.1507798731327057, "learning_rate": 1e-05, "loss": 0.02, "step": 682600 }, { "epoch": 0.006827, "grad_norm": 0.13526104390621185, "learning_rate": 1e-05, "loss": 0.02, "step": 682700 }, { "epoch": 0.006828, "grad_norm": 0.10570035874843597, "learning_rate": 1e-05, "loss": 0.0202, "step": 682800 }, { "epoch": 0.006829, "grad_norm": 0.17920289933681488, "learning_rate": 1e-05, "loss": 0.0201, "step": 682900 }, { "epoch": 0.00683, "grad_norm": 0.1889697015285492, "learning_rate": 1e-05, "loss": 0.0197, "step": 683000 }, { "epoch": 0.006831, "grad_norm": 0.1649981290102005, "learning_rate": 1e-05, "loss": 0.0198, "step": 683100 }, { "epoch": 0.006832, "grad_norm": 0.1803230196237564, "learning_rate": 1e-05, "loss": 0.0201, "step": 683200 }, { "epoch": 0.006833, "grad_norm": 0.2927742302417755, "learning_rate": 1e-05, "loss": 0.0204, "step": 683300 }, { "epoch": 0.006834, "grad_norm": 0.19460155069828033, "learning_rate": 1e-05, "loss": 0.0198, "step": 683400 }, { "epoch": 0.006835, "grad_norm": 0.1486339122056961, "learning_rate": 1e-05, "loss": 0.0197, "step": 683500 }, { "epoch": 0.006836, "grad_norm": 0.12979336082935333, "learning_rate": 1e-05, "loss": 0.0197, "step": 683600 }, { "epoch": 0.006837, "grad_norm": 0.22740302979946136, "learning_rate": 1e-05, "loss": 0.02, "step": 683700 }, { "epoch": 0.006838, "grad_norm": 0.18147403001785278, "learning_rate": 1e-05, "loss": 0.0202, "step": 683800 }, { "epoch": 0.006839, "grad_norm": 0.15749862790107727, "learning_rate": 1e-05, "loss": 0.0202, "step": 683900 }, { "epoch": 0.00684, "grad_norm": 0.14274843037128448, "learning_rate": 1e-05, "loss": 0.02, "step": 684000 }, { "epoch": 0.006841, "grad_norm": 0.1677263379096985, "learning_rate": 1e-05, "loss": 0.0198, "step": 684100 }, { "epoch": 0.006842, "grad_norm": 0.20037242770195007, "learning_rate": 1e-05, "loss": 0.0197, "step": 684200 }, { "epoch": 0.006843, "grad_norm": 0.1626407653093338, "learning_rate": 1e-05, "loss": 0.0201, "step": 684300 }, { "epoch": 0.006844, "grad_norm": 0.15450404584407806, "learning_rate": 1e-05, "loss": 0.02, "step": 684400 }, { "epoch": 0.006845, "grad_norm": 0.1660902500152588, "learning_rate": 1e-05, "loss": 0.02, "step": 684500 }, { "epoch": 0.006846, "grad_norm": 0.15793131291866302, "learning_rate": 1e-05, "loss": 0.0199, "step": 684600 }, { "epoch": 0.006847, "grad_norm": 0.15605705976486206, "learning_rate": 1e-05, "loss": 0.0203, "step": 684700 }, { "epoch": 0.006848, "grad_norm": 0.26765894889831543, "learning_rate": 1e-05, "loss": 0.0196, "step": 684800 }, { "epoch": 0.006849, "grad_norm": 0.14239057898521423, "learning_rate": 1e-05, "loss": 0.0198, "step": 684900 }, { "epoch": 0.00685, "grad_norm": 0.15587256848812103, "learning_rate": 1e-05, "loss": 0.0201, "step": 685000 }, { "epoch": 0.006851, "grad_norm": 0.1559217870235443, "learning_rate": 1e-05, "loss": 0.0203, "step": 685100 }, { "epoch": 0.006852, "grad_norm": 0.15588229894638062, "learning_rate": 1e-05, "loss": 0.0196, "step": 685200 }, { "epoch": 0.006853, "grad_norm": 0.20489118993282318, "learning_rate": 1e-05, "loss": 0.0202, "step": 685300 }, { "epoch": 0.006854, "grad_norm": 0.168620303273201, "learning_rate": 1e-05, "loss": 0.0199, "step": 685400 }, { "epoch": 0.006855, "grad_norm": 0.2329803854227066, "learning_rate": 1e-05, "loss": 0.02, "step": 685500 }, { "epoch": 0.006856, "grad_norm": 0.20637795329093933, "learning_rate": 1e-05, "loss": 0.02, "step": 685600 }, { "epoch": 0.006857, "grad_norm": 0.24972449243068695, "learning_rate": 1e-05, "loss": 0.02, "step": 685700 }, { "epoch": 0.006858, "grad_norm": 0.13250549137592316, "learning_rate": 1e-05, "loss": 0.0204, "step": 685800 }, { "epoch": 0.006859, "grad_norm": 0.16321004927158356, "learning_rate": 1e-05, "loss": 0.0205, "step": 685900 }, { "epoch": 0.00686, "grad_norm": 0.16803587973117828, "learning_rate": 1e-05, "loss": 0.0207, "step": 686000 }, { "epoch": 0.006861, "grad_norm": 0.15514960885047913, "learning_rate": 1e-05, "loss": 0.0205, "step": 686100 }, { "epoch": 0.006862, "grad_norm": 0.17385327816009521, "learning_rate": 1e-05, "loss": 0.0201, "step": 686200 }, { "epoch": 0.006863, "grad_norm": 0.20041543245315552, "learning_rate": 1e-05, "loss": 0.0199, "step": 686300 }, { "epoch": 0.006864, "grad_norm": 0.15998122096061707, "learning_rate": 1e-05, "loss": 0.0203, "step": 686400 }, { "epoch": 0.006865, "grad_norm": 0.17289654910564423, "learning_rate": 1e-05, "loss": 0.0201, "step": 686500 }, { "epoch": 0.006866, "grad_norm": 0.1509542167186737, "learning_rate": 1e-05, "loss": 0.0202, "step": 686600 }, { "epoch": 0.006867, "grad_norm": 0.1596463918685913, "learning_rate": 1e-05, "loss": 0.0198, "step": 686700 }, { "epoch": 0.006868, "grad_norm": 0.17610017955303192, "learning_rate": 1e-05, "loss": 0.0195, "step": 686800 }, { "epoch": 0.006869, "grad_norm": 0.15621919929981232, "learning_rate": 1e-05, "loss": 0.0199, "step": 686900 }, { "epoch": 0.00687, "grad_norm": 0.1821345090866089, "learning_rate": 1e-05, "loss": 0.0201, "step": 687000 }, { "epoch": 0.006871, "grad_norm": 0.15959641337394714, "learning_rate": 1e-05, "loss": 0.0199, "step": 687100 }, { "epoch": 0.006872, "grad_norm": 0.15915027260780334, "learning_rate": 1e-05, "loss": 0.0199, "step": 687200 }, { "epoch": 0.006873, "grad_norm": 0.16150465607643127, "learning_rate": 1e-05, "loss": 0.0193, "step": 687300 }, { "epoch": 0.006874, "grad_norm": 0.17955486476421356, "learning_rate": 1e-05, "loss": 0.0198, "step": 687400 }, { "epoch": 0.006875, "grad_norm": 0.18827420473098755, "learning_rate": 1e-05, "loss": 0.02, "step": 687500 }, { "epoch": 0.006876, "grad_norm": 0.17804954946041107, "learning_rate": 1e-05, "loss": 0.0205, "step": 687600 }, { "epoch": 0.006877, "grad_norm": 0.1644495576620102, "learning_rate": 1e-05, "loss": 0.0193, "step": 687700 }, { "epoch": 0.006878, "grad_norm": 0.13943827152252197, "learning_rate": 1e-05, "loss": 0.0198, "step": 687800 }, { "epoch": 0.006879, "grad_norm": 0.19547101855278015, "learning_rate": 1e-05, "loss": 0.0203, "step": 687900 }, { "epoch": 0.00688, "grad_norm": 0.17496855556964874, "learning_rate": 1e-05, "loss": 0.0201, "step": 688000 }, { "epoch": 0.006881, "grad_norm": 0.15065255761146545, "learning_rate": 1e-05, "loss": 0.0199, "step": 688100 }, { "epoch": 0.006882, "grad_norm": 0.2579807937145233, "learning_rate": 1e-05, "loss": 0.02, "step": 688200 }, { "epoch": 0.006883, "grad_norm": 0.22215527296066284, "learning_rate": 1e-05, "loss": 0.0202, "step": 688300 }, { "epoch": 0.006884, "grad_norm": 0.15026350319385529, "learning_rate": 1e-05, "loss": 0.0201, "step": 688400 }, { "epoch": 0.006885, "grad_norm": 0.16349633038043976, "learning_rate": 1e-05, "loss": 0.0195, "step": 688500 }, { "epoch": 0.006886, "grad_norm": 0.1544891595840454, "learning_rate": 1e-05, "loss": 0.0199, "step": 688600 }, { "epoch": 0.006887, "grad_norm": 0.1707206815481186, "learning_rate": 1e-05, "loss": 0.0197, "step": 688700 }, { "epoch": 0.006888, "grad_norm": 0.13174721598625183, "learning_rate": 1e-05, "loss": 0.0201, "step": 688800 }, { "epoch": 0.006889, "grad_norm": 0.1531132459640503, "learning_rate": 1e-05, "loss": 0.0193, "step": 688900 }, { "epoch": 0.00689, "grad_norm": 0.24689516425132751, "learning_rate": 1e-05, "loss": 0.0198, "step": 689000 }, { "epoch": 0.006891, "grad_norm": 0.1698918491601944, "learning_rate": 1e-05, "loss": 0.02, "step": 689100 }, { "epoch": 0.006892, "grad_norm": 0.12224109470844269, "learning_rate": 1e-05, "loss": 0.02, "step": 689200 }, { "epoch": 0.006893, "grad_norm": 0.18030472099781036, "learning_rate": 1e-05, "loss": 0.0197, "step": 689300 }, { "epoch": 0.006894, "grad_norm": 0.13385216891765594, "learning_rate": 1e-05, "loss": 0.0203, "step": 689400 }, { "epoch": 0.006895, "grad_norm": 0.15473796427249908, "learning_rate": 1e-05, "loss": 0.0206, "step": 689500 }, { "epoch": 0.006896, "grad_norm": 0.16308410465717316, "learning_rate": 1e-05, "loss": 0.0202, "step": 689600 }, { "epoch": 0.006897, "grad_norm": 0.1318325698375702, "learning_rate": 1e-05, "loss": 0.0198, "step": 689700 }, { "epoch": 0.006898, "grad_norm": 0.14128954708576202, "learning_rate": 1e-05, "loss": 0.0198, "step": 689800 }, { "epoch": 0.006899, "grad_norm": 0.14943766593933105, "learning_rate": 1e-05, "loss": 0.02, "step": 689900 }, { "epoch": 0.0069, "grad_norm": 0.22478780150413513, "learning_rate": 1e-05, "loss": 0.0197, "step": 690000 }, { "epoch": 0.006901, "grad_norm": 0.15965890884399414, "learning_rate": 1e-05, "loss": 0.0198, "step": 690100 }, { "epoch": 0.006902, "grad_norm": 0.13225211203098297, "learning_rate": 1e-05, "loss": 0.02, "step": 690200 }, { "epoch": 0.006903, "grad_norm": 0.16837802529335022, "learning_rate": 1e-05, "loss": 0.0197, "step": 690300 }, { "epoch": 0.006904, "grad_norm": 0.16942240297794342, "learning_rate": 1e-05, "loss": 0.0197, "step": 690400 }, { "epoch": 0.006905, "grad_norm": 0.14177776873111725, "learning_rate": 1e-05, "loss": 0.0197, "step": 690500 }, { "epoch": 0.006906, "grad_norm": 0.16812852025032043, "learning_rate": 1e-05, "loss": 0.0197, "step": 690600 }, { "epoch": 0.006907, "grad_norm": 0.16482052206993103, "learning_rate": 1e-05, "loss": 0.0198, "step": 690700 }, { "epoch": 0.006908, "grad_norm": 0.19012795388698578, "learning_rate": 1e-05, "loss": 0.0196, "step": 690800 }, { "epoch": 0.006909, "grad_norm": 0.15551303327083588, "learning_rate": 1e-05, "loss": 0.0202, "step": 690900 }, { "epoch": 0.00691, "grad_norm": 0.1525798887014389, "learning_rate": 1e-05, "loss": 0.0199, "step": 691000 }, { "epoch": 0.006911, "grad_norm": 0.1490287035703659, "learning_rate": 1e-05, "loss": 0.0201, "step": 691100 }, { "epoch": 0.006912, "grad_norm": 0.11800742149353027, "learning_rate": 1e-05, "loss": 0.0199, "step": 691200 }, { "epoch": 0.006913, "grad_norm": 0.19321009516716003, "learning_rate": 1e-05, "loss": 0.0202, "step": 691300 }, { "epoch": 0.006914, "grad_norm": 0.24570511281490326, "learning_rate": 1e-05, "loss": 0.0197, "step": 691400 }, { "epoch": 0.006915, "grad_norm": 0.1610879898071289, "learning_rate": 1e-05, "loss": 0.02, "step": 691500 }, { "epoch": 0.006916, "grad_norm": 0.12846027314662933, "learning_rate": 1e-05, "loss": 0.0193, "step": 691600 }, { "epoch": 0.006917, "grad_norm": 0.19356414675712585, "learning_rate": 1e-05, "loss": 0.0199, "step": 691700 }, { "epoch": 0.006918, "grad_norm": 0.19600313901901245, "learning_rate": 1e-05, "loss": 0.02, "step": 691800 }, { "epoch": 0.006919, "grad_norm": 0.11814963072538376, "learning_rate": 1e-05, "loss": 0.0197, "step": 691900 }, { "epoch": 0.00692, "grad_norm": 0.179888054728508, "learning_rate": 1e-05, "loss": 0.02, "step": 692000 }, { "epoch": 0.006921, "grad_norm": 0.18388760089874268, "learning_rate": 1e-05, "loss": 0.0198, "step": 692100 }, { "epoch": 0.006922, "grad_norm": 0.17229889333248138, "learning_rate": 1e-05, "loss": 0.0197, "step": 692200 }, { "epoch": 0.006923, "grad_norm": 0.15931527316570282, "learning_rate": 1e-05, "loss": 0.0198, "step": 692300 }, { "epoch": 0.006924, "grad_norm": 0.16564889252185822, "learning_rate": 1e-05, "loss": 0.0197, "step": 692400 }, { "epoch": 0.006925, "grad_norm": 0.23457032442092896, "learning_rate": 1e-05, "loss": 0.0198, "step": 692500 }, { "epoch": 0.006926, "grad_norm": 0.12747691571712494, "learning_rate": 1e-05, "loss": 0.0195, "step": 692600 }, { "epoch": 0.006927, "grad_norm": 0.12400733679533005, "learning_rate": 1e-05, "loss": 0.0193, "step": 692700 }, { "epoch": 0.006928, "grad_norm": 0.17902721464633942, "learning_rate": 1e-05, "loss": 0.0206, "step": 692800 }, { "epoch": 0.006929, "grad_norm": 0.12265167385339737, "learning_rate": 1e-05, "loss": 0.0204, "step": 692900 }, { "epoch": 0.00693, "grad_norm": 0.15775761008262634, "learning_rate": 1e-05, "loss": 0.0199, "step": 693000 }, { "epoch": 0.006931, "grad_norm": 0.20507919788360596, "learning_rate": 1e-05, "loss": 0.0195, "step": 693100 }, { "epoch": 0.006932, "grad_norm": 0.1393631249666214, "learning_rate": 1e-05, "loss": 0.0198, "step": 693200 }, { "epoch": 0.006933, "grad_norm": 0.15301944315433502, "learning_rate": 1e-05, "loss": 0.0195, "step": 693300 }, { "epoch": 0.006934, "grad_norm": 0.1806642860174179, "learning_rate": 1e-05, "loss": 0.0196, "step": 693400 }, { "epoch": 0.006935, "grad_norm": 0.12399540841579437, "learning_rate": 1e-05, "loss": 0.0193, "step": 693500 }, { "epoch": 0.006936, "grad_norm": 0.1641191840171814, "learning_rate": 1e-05, "loss": 0.02, "step": 693600 }, { "epoch": 0.006937, "grad_norm": 0.20512810349464417, "learning_rate": 1e-05, "loss": 0.0192, "step": 693700 }, { "epoch": 0.006938, "grad_norm": 0.17359256744384766, "learning_rate": 1e-05, "loss": 0.0198, "step": 693800 }, { "epoch": 0.006939, "grad_norm": 0.1560746133327484, "learning_rate": 1e-05, "loss": 0.0198, "step": 693900 }, { "epoch": 0.00694, "grad_norm": 0.13722927868366241, "learning_rate": 1e-05, "loss": 0.02, "step": 694000 }, { "epoch": 0.006941, "grad_norm": 0.15791857242584229, "learning_rate": 1e-05, "loss": 0.0199, "step": 694100 }, { "epoch": 0.006942, "grad_norm": 0.1392795592546463, "learning_rate": 1e-05, "loss": 0.0201, "step": 694200 }, { "epoch": 0.006943, "grad_norm": 0.17297928035259247, "learning_rate": 1e-05, "loss": 0.0195, "step": 694300 }, { "epoch": 0.006944, "grad_norm": 0.16802600026130676, "learning_rate": 1e-05, "loss": 0.0203, "step": 694400 }, { "epoch": 0.006945, "grad_norm": 0.15773673355579376, "learning_rate": 1e-05, "loss": 0.0203, "step": 694500 }, { "epoch": 0.006946, "grad_norm": 0.1545788198709488, "learning_rate": 1e-05, "loss": 0.0198, "step": 694600 }, { "epoch": 0.006947, "grad_norm": 0.2213374376296997, "learning_rate": 1e-05, "loss": 0.0201, "step": 694700 }, { "epoch": 0.006948, "grad_norm": 0.15855488181114197, "learning_rate": 1e-05, "loss": 0.0196, "step": 694800 }, { "epoch": 0.006949, "grad_norm": 0.16223783791065216, "learning_rate": 1e-05, "loss": 0.0195, "step": 694900 }, { "epoch": 0.00695, "grad_norm": 0.16101516783237457, "learning_rate": 1e-05, "loss": 0.0194, "step": 695000 }, { "epoch": 0.006951, "grad_norm": 0.11717020720243454, "learning_rate": 1e-05, "loss": 0.0198, "step": 695100 }, { "epoch": 0.006952, "grad_norm": 0.16216996312141418, "learning_rate": 1e-05, "loss": 0.0199, "step": 695200 }, { "epoch": 0.006953, "grad_norm": 0.16429856419563293, "learning_rate": 1e-05, "loss": 0.0202, "step": 695300 }, { "epoch": 0.006954, "grad_norm": 0.1671290546655655, "learning_rate": 1e-05, "loss": 0.0202, "step": 695400 }, { "epoch": 0.006955, "grad_norm": 0.16388362646102905, "learning_rate": 1e-05, "loss": 0.0199, "step": 695500 }, { "epoch": 0.006956, "grad_norm": 0.1316460371017456, "learning_rate": 1e-05, "loss": 0.0198, "step": 695600 }, { "epoch": 0.006957, "grad_norm": 0.1473531275987625, "learning_rate": 1e-05, "loss": 0.0195, "step": 695700 }, { "epoch": 0.006958, "grad_norm": 0.12424464523792267, "learning_rate": 1e-05, "loss": 0.02, "step": 695800 }, { "epoch": 0.006959, "grad_norm": 0.14465609192848206, "learning_rate": 1e-05, "loss": 0.0198, "step": 695900 }, { "epoch": 0.00696, "grad_norm": 0.2014867514371872, "learning_rate": 1e-05, "loss": 0.0198, "step": 696000 }, { "epoch": 0.006961, "grad_norm": 0.23234610259532928, "learning_rate": 1e-05, "loss": 0.0197, "step": 696100 }, { "epoch": 0.006962, "grad_norm": 0.17005938291549683, "learning_rate": 1e-05, "loss": 0.0192, "step": 696200 }, { "epoch": 0.006963, "grad_norm": 0.24713779985904694, "learning_rate": 1e-05, "loss": 0.0197, "step": 696300 }, { "epoch": 0.006964, "grad_norm": 0.11611694097518921, "learning_rate": 1e-05, "loss": 0.0194, "step": 696400 }, { "epoch": 0.006965, "grad_norm": 0.23265114426612854, "learning_rate": 1e-05, "loss": 0.0198, "step": 696500 }, { "epoch": 0.006966, "grad_norm": 0.17219948768615723, "learning_rate": 1e-05, "loss": 0.0201, "step": 696600 }, { "epoch": 0.006967, "grad_norm": 0.19164210557937622, "learning_rate": 1e-05, "loss": 0.0198, "step": 696700 }, { "epoch": 0.006968, "grad_norm": 0.16826742887496948, "learning_rate": 1e-05, "loss": 0.0201, "step": 696800 }, { "epoch": 0.006969, "grad_norm": 0.17299708724021912, "learning_rate": 1e-05, "loss": 0.02, "step": 696900 }, { "epoch": 0.00697, "grad_norm": 0.17678284645080566, "learning_rate": 1e-05, "loss": 0.0203, "step": 697000 }, { "epoch": 0.006971, "grad_norm": 0.12851886451244354, "learning_rate": 1e-05, "loss": 0.02, "step": 697100 }, { "epoch": 0.006972, "grad_norm": 0.18232524394989014, "learning_rate": 1e-05, "loss": 0.0203, "step": 697200 }, { "epoch": 0.006973, "grad_norm": 0.15677472949028015, "learning_rate": 1e-05, "loss": 0.0196, "step": 697300 }, { "epoch": 0.006974, "grad_norm": 0.22810715436935425, "learning_rate": 1e-05, "loss": 0.0197, "step": 697400 }, { "epoch": 0.006975, "grad_norm": 0.15828654170036316, "learning_rate": 1e-05, "loss": 0.0196, "step": 697500 }, { "epoch": 0.006976, "grad_norm": 0.19253958761692047, "learning_rate": 1e-05, "loss": 0.0197, "step": 697600 }, { "epoch": 0.006977, "grad_norm": 0.23307323455810547, "learning_rate": 1e-05, "loss": 0.0201, "step": 697700 }, { "epoch": 0.006978, "grad_norm": 0.11867852509021759, "learning_rate": 1e-05, "loss": 0.0194, "step": 697800 }, { "epoch": 0.006979, "grad_norm": 0.11970482766628265, "learning_rate": 1e-05, "loss": 0.0197, "step": 697900 }, { "epoch": 0.00698, "grad_norm": 0.21009154617786407, "learning_rate": 1e-05, "loss": 0.0193, "step": 698000 }, { "epoch": 0.006981, "grad_norm": 0.12912797927856445, "learning_rate": 1e-05, "loss": 0.0198, "step": 698100 }, { "epoch": 0.006982, "grad_norm": 0.17548838257789612, "learning_rate": 1e-05, "loss": 0.0198, "step": 698200 }, { "epoch": 0.006983, "grad_norm": 0.18506619334220886, "learning_rate": 1e-05, "loss": 0.0195, "step": 698300 }, { "epoch": 0.006984, "grad_norm": 0.15470483899116516, "learning_rate": 1e-05, "loss": 0.0199, "step": 698400 }, { "epoch": 0.006985, "grad_norm": 0.1344095915555954, "learning_rate": 1e-05, "loss": 0.0195, "step": 698500 }, { "epoch": 0.006986, "grad_norm": 0.160774365067482, "learning_rate": 1e-05, "loss": 0.0199, "step": 698600 }, { "epoch": 0.006987, "grad_norm": 0.17131365835666656, "learning_rate": 1e-05, "loss": 0.0195, "step": 698700 }, { "epoch": 0.006988, "grad_norm": 0.29133620858192444, "learning_rate": 1e-05, "loss": 0.0194, "step": 698800 }, { "epoch": 0.006989, "grad_norm": 0.199613556265831, "learning_rate": 1e-05, "loss": 0.0195, "step": 698900 }, { "epoch": 0.00699, "grad_norm": 0.15141423046588898, "learning_rate": 1e-05, "loss": 0.0197, "step": 699000 }, { "epoch": 0.006991, "grad_norm": 0.2135506123304367, "learning_rate": 1e-05, "loss": 0.0199, "step": 699100 }, { "epoch": 0.006992, "grad_norm": 0.1978229582309723, "learning_rate": 1e-05, "loss": 0.0199, "step": 699200 }, { "epoch": 0.006993, "grad_norm": 0.1648973822593689, "learning_rate": 1e-05, "loss": 0.0195, "step": 699300 }, { "epoch": 0.006994, "grad_norm": 0.19291983544826508, "learning_rate": 1e-05, "loss": 0.02, "step": 699400 }, { "epoch": 0.006995, "grad_norm": 0.17624177038669586, "learning_rate": 1e-05, "loss": 0.0197, "step": 699500 }, { "epoch": 0.006996, "grad_norm": 0.15246707201004028, "learning_rate": 1e-05, "loss": 0.0203, "step": 699600 }, { "epoch": 0.006997, "grad_norm": 0.1290099322795868, "learning_rate": 1e-05, "loss": 0.0201, "step": 699700 }, { "epoch": 0.006998, "grad_norm": 0.1858505755662918, "learning_rate": 1e-05, "loss": 0.0197, "step": 699800 }, { "epoch": 0.006999, "grad_norm": 0.18833166360855103, "learning_rate": 1e-05, "loss": 0.0194, "step": 699900 }, { "epoch": 0.007, "grad_norm": 0.1458377093076706, "learning_rate": 1e-05, "loss": 0.0196, "step": 700000 }, { "epoch": 0.007, "eval_loss": 0.017844874411821365, "eval_runtime": 170.0677, "eval_samples_per_second": 294.001, "eval_steps_per_second": 18.375, "step": 700000 }, { "epoch": 0.007001, "grad_norm": 0.1572650820016861, "learning_rate": 1e-05, "loss": 0.0199, "step": 700100 }, { "epoch": 0.007002, "grad_norm": 0.18875879049301147, "learning_rate": 1e-05, "loss": 0.0202, "step": 700200 }, { "epoch": 0.007003, "grad_norm": 0.1648273915052414, "learning_rate": 1e-05, "loss": 0.0196, "step": 700300 }, { "epoch": 0.007004, "grad_norm": 0.1813950091600418, "learning_rate": 1e-05, "loss": 0.0197, "step": 700400 }, { "epoch": 0.007005, "grad_norm": 0.18199270963668823, "learning_rate": 1e-05, "loss": 0.0201, "step": 700500 }, { "epoch": 0.007006, "grad_norm": 0.12191420048475266, "learning_rate": 1e-05, "loss": 0.0198, "step": 700600 }, { "epoch": 0.007007, "grad_norm": 0.173506960272789, "learning_rate": 1e-05, "loss": 0.0196, "step": 700700 }, { "epoch": 0.007008, "grad_norm": 0.15038013458251953, "learning_rate": 1e-05, "loss": 0.0197, "step": 700800 }, { "epoch": 0.007009, "grad_norm": 0.15974268317222595, "learning_rate": 1e-05, "loss": 0.0198, "step": 700900 }, { "epoch": 0.00701, "grad_norm": 0.1464388519525528, "learning_rate": 1e-05, "loss": 0.0199, "step": 701000 }, { "epoch": 0.007011, "grad_norm": 0.1569279134273529, "learning_rate": 1e-05, "loss": 0.02, "step": 701100 }, { "epoch": 0.007012, "grad_norm": 0.1902606189250946, "learning_rate": 1e-05, "loss": 0.02, "step": 701200 }, { "epoch": 0.007013, "grad_norm": 0.17805488407611847, "learning_rate": 1e-05, "loss": 0.0201, "step": 701300 }, { "epoch": 0.007014, "grad_norm": 0.14518879354000092, "learning_rate": 1e-05, "loss": 0.0196, "step": 701400 }, { "epoch": 0.007015, "grad_norm": 0.18548813462257385, "learning_rate": 1e-05, "loss": 0.0197, "step": 701500 }, { "epoch": 0.007016, "grad_norm": 0.15661801397800446, "learning_rate": 1e-05, "loss": 0.0196, "step": 701600 }, { "epoch": 0.007017, "grad_norm": 0.14613156020641327, "learning_rate": 1e-05, "loss": 0.0193, "step": 701700 }, { "epoch": 0.007018, "grad_norm": 0.15656273066997528, "learning_rate": 1e-05, "loss": 0.0199, "step": 701800 }, { "epoch": 0.007019, "grad_norm": 0.1350867599248886, "learning_rate": 1e-05, "loss": 0.0199, "step": 701900 }, { "epoch": 0.00702, "grad_norm": 0.17687714099884033, "learning_rate": 1e-05, "loss": 0.0199, "step": 702000 }, { "epoch": 0.007021, "grad_norm": 0.1963929533958435, "learning_rate": 1e-05, "loss": 0.0194, "step": 702100 }, { "epoch": 0.007022, "grad_norm": 0.12862597405910492, "learning_rate": 1e-05, "loss": 0.0199, "step": 702200 }, { "epoch": 0.007023, "grad_norm": 0.16661600768566132, "learning_rate": 1e-05, "loss": 0.0199, "step": 702300 }, { "epoch": 0.007024, "grad_norm": 0.19466805458068848, "learning_rate": 1e-05, "loss": 0.0202, "step": 702400 }, { "epoch": 0.007025, "grad_norm": 0.14479541778564453, "learning_rate": 1e-05, "loss": 0.0198, "step": 702500 }, { "epoch": 0.007026, "grad_norm": 0.18401971459388733, "learning_rate": 1e-05, "loss": 0.0193, "step": 702600 }, { "epoch": 0.007027, "grad_norm": 0.17684251070022583, "learning_rate": 1e-05, "loss": 0.02, "step": 702700 }, { "epoch": 0.007028, "grad_norm": 0.28187352418899536, "learning_rate": 1e-05, "loss": 0.02, "step": 702800 }, { "epoch": 0.007029, "grad_norm": 0.18377676606178284, "learning_rate": 1e-05, "loss": 0.0199, "step": 702900 }, { "epoch": 0.00703, "grad_norm": 0.21166053414344788, "learning_rate": 1e-05, "loss": 0.0195, "step": 703000 }, { "epoch": 0.007031, "grad_norm": 0.12583091855049133, "learning_rate": 1e-05, "loss": 0.0194, "step": 703100 }, { "epoch": 0.007032, "grad_norm": 0.1665705144405365, "learning_rate": 1e-05, "loss": 0.0199, "step": 703200 }, { "epoch": 0.007033, "grad_norm": 0.15326140820980072, "learning_rate": 1e-05, "loss": 0.0196, "step": 703300 }, { "epoch": 0.007034, "grad_norm": 0.2342665046453476, "learning_rate": 1e-05, "loss": 0.0197, "step": 703400 }, { "epoch": 0.007035, "grad_norm": 0.1616213470697403, "learning_rate": 1e-05, "loss": 0.0199, "step": 703500 }, { "epoch": 0.007036, "grad_norm": 0.18798571825027466, "learning_rate": 1e-05, "loss": 0.0195, "step": 703600 }, { "epoch": 0.007037, "grad_norm": 0.17172923684120178, "learning_rate": 1e-05, "loss": 0.0198, "step": 703700 }, { "epoch": 0.007038, "grad_norm": 0.19905498623847961, "learning_rate": 1e-05, "loss": 0.0195, "step": 703800 }, { "epoch": 0.007039, "grad_norm": 0.1913129836320877, "learning_rate": 1e-05, "loss": 0.02, "step": 703900 }, { "epoch": 0.00704, "grad_norm": 0.1987917721271515, "learning_rate": 1e-05, "loss": 0.0194, "step": 704000 }, { "epoch": 0.007041, "grad_norm": 0.1275213062763214, "learning_rate": 1e-05, "loss": 0.0202, "step": 704100 }, { "epoch": 0.007042, "grad_norm": 0.13933512568473816, "learning_rate": 1e-05, "loss": 0.0197, "step": 704200 }, { "epoch": 0.007043, "grad_norm": 0.15757021307945251, "learning_rate": 1e-05, "loss": 0.0195, "step": 704300 }, { "epoch": 0.007044, "grad_norm": 0.14518077671527863, "learning_rate": 1e-05, "loss": 0.0196, "step": 704400 }, { "epoch": 0.007045, "grad_norm": 0.14918334782123566, "learning_rate": 1e-05, "loss": 0.02, "step": 704500 }, { "epoch": 0.007046, "grad_norm": 0.18515121936798096, "learning_rate": 1e-05, "loss": 0.0203, "step": 704600 }, { "epoch": 0.007047, "grad_norm": 0.15746596455574036, "learning_rate": 1e-05, "loss": 0.0196, "step": 704700 }, { "epoch": 0.007048, "grad_norm": 0.1368628293275833, "learning_rate": 1e-05, "loss": 0.0197, "step": 704800 }, { "epoch": 0.007049, "grad_norm": 0.15264664590358734, "learning_rate": 1e-05, "loss": 0.0201, "step": 704900 }, { "epoch": 0.00705, "grad_norm": 0.14971862733364105, "learning_rate": 1e-05, "loss": 0.0204, "step": 705000 }, { "epoch": 0.007051, "grad_norm": 0.13172924518585205, "learning_rate": 1e-05, "loss": 0.0197, "step": 705100 }, { "epoch": 0.007052, "grad_norm": 0.22299325466156006, "learning_rate": 1e-05, "loss": 0.02, "step": 705200 }, { "epoch": 0.007053, "grad_norm": 0.12899145483970642, "learning_rate": 1e-05, "loss": 0.0204, "step": 705300 }, { "epoch": 0.007054, "grad_norm": 0.20611001551151276, "learning_rate": 1e-05, "loss": 0.0202, "step": 705400 }, { "epoch": 0.007055, "grad_norm": 0.12337581068277359, "learning_rate": 1e-05, "loss": 0.0196, "step": 705500 }, { "epoch": 0.007056, "grad_norm": 0.1610242873430252, "learning_rate": 1e-05, "loss": 0.0199, "step": 705600 }, { "epoch": 0.007057, "grad_norm": 0.15610161423683167, "learning_rate": 1e-05, "loss": 0.0199, "step": 705700 }, { "epoch": 0.007058, "grad_norm": 0.1753484606742859, "learning_rate": 1e-05, "loss": 0.0194, "step": 705800 }, { "epoch": 0.007059, "grad_norm": 0.17733287811279297, "learning_rate": 1e-05, "loss": 0.0201, "step": 705900 }, { "epoch": 0.00706, "grad_norm": 0.13397221267223358, "learning_rate": 1e-05, "loss": 0.0196, "step": 706000 }, { "epoch": 0.007061, "grad_norm": 0.19868676364421844, "learning_rate": 1e-05, "loss": 0.0196, "step": 706100 }, { "epoch": 0.007062, "grad_norm": 0.17303727567195892, "learning_rate": 1e-05, "loss": 0.0194, "step": 706200 }, { "epoch": 0.007063, "grad_norm": 0.1869206726551056, "learning_rate": 1e-05, "loss": 0.0198, "step": 706300 }, { "epoch": 0.007064, "grad_norm": 0.15712159872055054, "learning_rate": 1e-05, "loss": 0.0199, "step": 706400 }, { "epoch": 0.007065, "grad_norm": 0.22547456622123718, "learning_rate": 1e-05, "loss": 0.0199, "step": 706500 }, { "epoch": 0.007066, "grad_norm": 0.14327135682106018, "learning_rate": 1e-05, "loss": 0.02, "step": 706600 }, { "epoch": 0.007067, "grad_norm": 0.18697206676006317, "learning_rate": 1e-05, "loss": 0.0193, "step": 706700 }, { "epoch": 0.007068, "grad_norm": 0.16140888631343842, "learning_rate": 1e-05, "loss": 0.0198, "step": 706800 }, { "epoch": 0.007069, "grad_norm": 0.13262927532196045, "learning_rate": 1e-05, "loss": 0.0199, "step": 706900 }, { "epoch": 0.00707, "grad_norm": 0.19907590746879578, "learning_rate": 1e-05, "loss": 0.0202, "step": 707000 }, { "epoch": 0.007071, "grad_norm": 0.15462371706962585, "learning_rate": 1e-05, "loss": 0.0199, "step": 707100 }, { "epoch": 0.007072, "grad_norm": 0.1339094191789627, "learning_rate": 1e-05, "loss": 0.0195, "step": 707200 }, { "epoch": 0.007073, "grad_norm": 0.13736173510551453, "learning_rate": 1e-05, "loss": 0.0198, "step": 707300 }, { "epoch": 0.007074, "grad_norm": 0.19035093486309052, "learning_rate": 1e-05, "loss": 0.02, "step": 707400 }, { "epoch": 0.007075, "grad_norm": 0.16595281660556793, "learning_rate": 1e-05, "loss": 0.0198, "step": 707500 }, { "epoch": 0.007076, "grad_norm": 0.1555197536945343, "learning_rate": 1e-05, "loss": 0.0195, "step": 707600 }, { "epoch": 0.007077, "grad_norm": 0.14974373579025269, "learning_rate": 1e-05, "loss": 0.0199, "step": 707700 }, { "epoch": 0.007078, "grad_norm": 0.1138257309794426, "learning_rate": 1e-05, "loss": 0.0199, "step": 707800 }, { "epoch": 0.007079, "grad_norm": 0.15705494582653046, "learning_rate": 1e-05, "loss": 0.0199, "step": 707900 }, { "epoch": 0.00708, "grad_norm": 0.1546253263950348, "learning_rate": 1e-05, "loss": 0.0197, "step": 708000 }, { "epoch": 0.007081, "grad_norm": 0.1237204447388649, "learning_rate": 1e-05, "loss": 0.0199, "step": 708100 }, { "epoch": 0.007082, "grad_norm": 0.1482395976781845, "learning_rate": 1e-05, "loss": 0.0203, "step": 708200 }, { "epoch": 0.007083, "grad_norm": 0.20321963727474213, "learning_rate": 1e-05, "loss": 0.0197, "step": 708300 }, { "epoch": 0.007084, "grad_norm": 0.18238593637943268, "learning_rate": 1e-05, "loss": 0.0197, "step": 708400 }, { "epoch": 0.007085, "grad_norm": 0.17459321022033691, "learning_rate": 1e-05, "loss": 0.0198, "step": 708500 }, { "epoch": 0.007086, "grad_norm": 0.1413417011499405, "learning_rate": 1e-05, "loss": 0.0198, "step": 708600 }, { "epoch": 0.007087, "grad_norm": 0.23400752246379852, "learning_rate": 1e-05, "loss": 0.0201, "step": 708700 }, { "epoch": 0.007088, "grad_norm": 0.14536163210868835, "learning_rate": 1e-05, "loss": 0.02, "step": 708800 }, { "epoch": 0.007089, "grad_norm": 0.17308425903320312, "learning_rate": 1e-05, "loss": 0.0198, "step": 708900 }, { "epoch": 0.00709, "grad_norm": 0.22521451115608215, "learning_rate": 1e-05, "loss": 0.0199, "step": 709000 }, { "epoch": 0.007091, "grad_norm": 0.11805422604084015, "learning_rate": 1e-05, "loss": 0.0195, "step": 709100 }, { "epoch": 0.007092, "grad_norm": 0.16254396736621857, "learning_rate": 1e-05, "loss": 0.0196, "step": 709200 }, { "epoch": 0.007093, "grad_norm": 0.21282225847244263, "learning_rate": 1e-05, "loss": 0.0197, "step": 709300 }, { "epoch": 0.007094, "grad_norm": 0.22844535112380981, "learning_rate": 1e-05, "loss": 0.0196, "step": 709400 }, { "epoch": 0.007095, "grad_norm": 0.17190028727054596, "learning_rate": 1e-05, "loss": 0.02, "step": 709500 }, { "epoch": 0.007096, "grad_norm": 0.1998148411512375, "learning_rate": 1e-05, "loss": 0.0199, "step": 709600 }, { "epoch": 0.007097, "grad_norm": 0.1283416599035263, "learning_rate": 1e-05, "loss": 0.0196, "step": 709700 }, { "epoch": 0.007098, "grad_norm": 0.16680848598480225, "learning_rate": 1e-05, "loss": 0.0198, "step": 709800 }, { "epoch": 0.007099, "grad_norm": 0.12956981360912323, "learning_rate": 1e-05, "loss": 0.0198, "step": 709900 }, { "epoch": 0.0071, "grad_norm": 0.1242656484246254, "learning_rate": 1e-05, "loss": 0.0201, "step": 710000 }, { "epoch": 0.007101, "grad_norm": 0.18193913996219635, "learning_rate": 1e-05, "loss": 0.0202, "step": 710100 }, { "epoch": 0.007102, "grad_norm": 0.15591363608837128, "learning_rate": 1e-05, "loss": 0.0197, "step": 710200 }, { "epoch": 0.007103, "grad_norm": 0.18415340781211853, "learning_rate": 1e-05, "loss": 0.0196, "step": 710300 }, { "epoch": 0.007104, "grad_norm": 0.11297295242547989, "learning_rate": 1e-05, "loss": 0.0198, "step": 710400 }, { "epoch": 0.007105, "grad_norm": 0.16798485815525055, "learning_rate": 1e-05, "loss": 0.0195, "step": 710500 }, { "epoch": 0.007106, "grad_norm": 0.16503262519836426, "learning_rate": 1e-05, "loss": 0.0202, "step": 710600 }, { "epoch": 0.007107, "grad_norm": 0.12106503546237946, "learning_rate": 1e-05, "loss": 0.0194, "step": 710700 }, { "epoch": 0.007108, "grad_norm": 0.14307266473770142, "learning_rate": 1e-05, "loss": 0.0198, "step": 710800 }, { "epoch": 0.007109, "grad_norm": 0.1524532586336136, "learning_rate": 1e-05, "loss": 0.0197, "step": 710900 }, { "epoch": 0.00711, "grad_norm": 0.15160371363162994, "learning_rate": 1e-05, "loss": 0.0198, "step": 711000 }, { "epoch": 0.007111, "grad_norm": 0.16012263298034668, "learning_rate": 1e-05, "loss": 0.0197, "step": 711100 }, { "epoch": 0.007112, "grad_norm": 0.1949101835489273, "learning_rate": 1e-05, "loss": 0.0195, "step": 711200 }, { "epoch": 0.007113, "grad_norm": 0.14627701044082642, "learning_rate": 1e-05, "loss": 0.0199, "step": 711300 }, { "epoch": 0.007114, "grad_norm": 0.1669301986694336, "learning_rate": 1e-05, "loss": 0.0201, "step": 711400 }, { "epoch": 0.007115, "grad_norm": 0.12675081193447113, "learning_rate": 1e-05, "loss": 0.0199, "step": 711500 }, { "epoch": 0.007116, "grad_norm": 0.18695005774497986, "learning_rate": 1e-05, "loss": 0.0195, "step": 711600 }, { "epoch": 0.007117, "grad_norm": 0.1403084546327591, "learning_rate": 1e-05, "loss": 0.0197, "step": 711700 }, { "epoch": 0.007118, "grad_norm": 0.15348897874355316, "learning_rate": 1e-05, "loss": 0.0198, "step": 711800 }, { "epoch": 0.007119, "grad_norm": 0.16612787544727325, "learning_rate": 1e-05, "loss": 0.0192, "step": 711900 }, { "epoch": 0.00712, "grad_norm": 0.15785902738571167, "learning_rate": 1e-05, "loss": 0.0201, "step": 712000 }, { "epoch": 0.007121, "grad_norm": 0.14933757483959198, "learning_rate": 1e-05, "loss": 0.0199, "step": 712100 }, { "epoch": 0.007122, "grad_norm": 0.13639596104621887, "learning_rate": 1e-05, "loss": 0.0198, "step": 712200 }, { "epoch": 0.007123, "grad_norm": 0.17347002029418945, "learning_rate": 1e-05, "loss": 0.0198, "step": 712300 }, { "epoch": 0.007124, "grad_norm": 0.15813659131526947, "learning_rate": 1e-05, "loss": 0.0199, "step": 712400 }, { "epoch": 0.007125, "grad_norm": 0.1314045488834381, "learning_rate": 1e-05, "loss": 0.0199, "step": 712500 }, { "epoch": 0.007126, "grad_norm": 0.17542971670627594, "learning_rate": 1e-05, "loss": 0.0198, "step": 712600 }, { "epoch": 0.007127, "grad_norm": 0.13707827031612396, "learning_rate": 1e-05, "loss": 0.0195, "step": 712700 }, { "epoch": 0.007128, "grad_norm": 0.15272288024425507, "learning_rate": 1e-05, "loss": 0.02, "step": 712800 }, { "epoch": 0.007129, "grad_norm": 0.1342436522245407, "learning_rate": 1e-05, "loss": 0.0197, "step": 712900 }, { "epoch": 0.00713, "grad_norm": 0.16724793612957, "learning_rate": 1e-05, "loss": 0.0195, "step": 713000 }, { "epoch": 0.007131, "grad_norm": 0.18950989842414856, "learning_rate": 1e-05, "loss": 0.0194, "step": 713100 }, { "epoch": 0.007132, "grad_norm": 0.14357160031795502, "learning_rate": 1e-05, "loss": 0.0196, "step": 713200 }, { "epoch": 0.007133, "grad_norm": 0.14843763411045074, "learning_rate": 1e-05, "loss": 0.0194, "step": 713300 }, { "epoch": 0.007134, "grad_norm": 0.17781654000282288, "learning_rate": 1e-05, "loss": 0.02, "step": 713400 }, { "epoch": 0.007135, "grad_norm": 0.18398736417293549, "learning_rate": 1e-05, "loss": 0.0193, "step": 713500 }, { "epoch": 0.007136, "grad_norm": 0.11309274286031723, "learning_rate": 1e-05, "loss": 0.0197, "step": 713600 }, { "epoch": 0.007137, "grad_norm": 0.18835727870464325, "learning_rate": 1e-05, "loss": 0.0195, "step": 713700 }, { "epoch": 0.007138, "grad_norm": 0.2026374489068985, "learning_rate": 1e-05, "loss": 0.0199, "step": 713800 }, { "epoch": 0.007139, "grad_norm": 0.1908065378665924, "learning_rate": 1e-05, "loss": 0.0201, "step": 713900 }, { "epoch": 0.00714, "grad_norm": 0.14454206824302673, "learning_rate": 1e-05, "loss": 0.0197, "step": 714000 }, { "epoch": 0.007141, "grad_norm": 0.1394592821598053, "learning_rate": 1e-05, "loss": 0.0198, "step": 714100 }, { "epoch": 0.007142, "grad_norm": 0.17402639985084534, "learning_rate": 1e-05, "loss": 0.02, "step": 714200 }, { "epoch": 0.007143, "grad_norm": 0.20055897533893585, "learning_rate": 1e-05, "loss": 0.0197, "step": 714300 }, { "epoch": 0.007144, "grad_norm": 0.19996395707130432, "learning_rate": 1e-05, "loss": 0.0196, "step": 714400 }, { "epoch": 0.007145, "grad_norm": 0.2151063233613968, "learning_rate": 1e-05, "loss": 0.02, "step": 714500 }, { "epoch": 0.007146, "grad_norm": 0.15245375037193298, "learning_rate": 1e-05, "loss": 0.0199, "step": 714600 }, { "epoch": 0.007147, "grad_norm": 0.17226292192935944, "learning_rate": 1e-05, "loss": 0.0195, "step": 714700 }, { "epoch": 0.007148, "grad_norm": 0.12029430270195007, "learning_rate": 1e-05, "loss": 0.0197, "step": 714800 }, { "epoch": 0.007149, "grad_norm": 0.22407138347625732, "learning_rate": 1e-05, "loss": 0.02, "step": 714900 }, { "epoch": 0.00715, "grad_norm": 0.1388905942440033, "learning_rate": 1e-05, "loss": 0.0202, "step": 715000 }, { "epoch": 0.007151, "grad_norm": 0.12921938300132751, "learning_rate": 1e-05, "loss": 0.0201, "step": 715100 }, { "epoch": 0.007152, "grad_norm": 0.13810794055461884, "learning_rate": 1e-05, "loss": 0.0195, "step": 715200 }, { "epoch": 0.007153, "grad_norm": 0.18613073229789734, "learning_rate": 1e-05, "loss": 0.0197, "step": 715300 }, { "epoch": 0.007154, "grad_norm": 0.24312016367912292, "learning_rate": 1e-05, "loss": 0.0195, "step": 715400 }, { "epoch": 0.007155, "grad_norm": 0.15562599897384644, "learning_rate": 1e-05, "loss": 0.0195, "step": 715500 }, { "epoch": 0.007156, "grad_norm": 0.14524810016155243, "learning_rate": 1e-05, "loss": 0.02, "step": 715600 }, { "epoch": 0.007157, "grad_norm": 0.19888494908809662, "learning_rate": 1e-05, "loss": 0.0192, "step": 715700 }, { "epoch": 0.007158, "grad_norm": 0.16905224323272705, "learning_rate": 1e-05, "loss": 0.0196, "step": 715800 }, { "epoch": 0.007159, "grad_norm": 0.18540939688682556, "learning_rate": 1e-05, "loss": 0.0201, "step": 715900 }, { "epoch": 0.00716, "grad_norm": 0.13971927762031555, "learning_rate": 1e-05, "loss": 0.0197, "step": 716000 }, { "epoch": 0.007161, "grad_norm": 0.1491069346666336, "learning_rate": 1e-05, "loss": 0.0192, "step": 716100 }, { "epoch": 0.007162, "grad_norm": 0.15102288126945496, "learning_rate": 1e-05, "loss": 0.0201, "step": 716200 }, { "epoch": 0.007163, "grad_norm": 0.13266056776046753, "learning_rate": 1e-05, "loss": 0.0195, "step": 716300 }, { "epoch": 0.007164, "grad_norm": 0.13875271379947662, "learning_rate": 1e-05, "loss": 0.019, "step": 716400 }, { "epoch": 0.007165, "grad_norm": 0.18499433994293213, "learning_rate": 1e-05, "loss": 0.0198, "step": 716500 }, { "epoch": 0.007166, "grad_norm": 0.18074965476989746, "learning_rate": 1e-05, "loss": 0.0195, "step": 716600 }, { "epoch": 0.007167, "grad_norm": 0.13476796448230743, "learning_rate": 1e-05, "loss": 0.0193, "step": 716700 }, { "epoch": 0.007168, "grad_norm": 0.1681891232728958, "learning_rate": 1e-05, "loss": 0.0196, "step": 716800 }, { "epoch": 0.007169, "grad_norm": 0.16763390600681305, "learning_rate": 1e-05, "loss": 0.0199, "step": 716900 }, { "epoch": 0.00717, "grad_norm": 0.15890641510486603, "learning_rate": 1e-05, "loss": 0.0199, "step": 717000 }, { "epoch": 0.007171, "grad_norm": 0.17086397111415863, "learning_rate": 1e-05, "loss": 0.0194, "step": 717100 }, { "epoch": 0.007172, "grad_norm": 0.1926320493221283, "learning_rate": 1e-05, "loss": 0.0193, "step": 717200 }, { "epoch": 0.007173, "grad_norm": 0.18047352135181427, "learning_rate": 1e-05, "loss": 0.0198, "step": 717300 }, { "epoch": 0.007174, "grad_norm": 0.14709292352199554, "learning_rate": 1e-05, "loss": 0.0197, "step": 717400 }, { "epoch": 0.007175, "grad_norm": 0.14382341504096985, "learning_rate": 1e-05, "loss": 0.0194, "step": 717500 }, { "epoch": 0.007176, "grad_norm": 0.12502868473529816, "learning_rate": 1e-05, "loss": 0.02, "step": 717600 }, { "epoch": 0.007177, "grad_norm": 0.15080641210079193, "learning_rate": 1e-05, "loss": 0.0195, "step": 717700 }, { "epoch": 0.007178, "grad_norm": 0.15951479971408844, "learning_rate": 1e-05, "loss": 0.0201, "step": 717800 }, { "epoch": 0.007179, "grad_norm": 0.260864794254303, "learning_rate": 1e-05, "loss": 0.02, "step": 717900 }, { "epoch": 0.00718, "grad_norm": 0.1337732970714569, "learning_rate": 1e-05, "loss": 0.0193, "step": 718000 }, { "epoch": 0.007181, "grad_norm": 0.1194663941860199, "learning_rate": 1e-05, "loss": 0.02, "step": 718100 }, { "epoch": 0.007182, "grad_norm": 0.12492929399013519, "learning_rate": 1e-05, "loss": 0.0194, "step": 718200 }, { "epoch": 0.007183, "grad_norm": 0.18389467895030975, "learning_rate": 1e-05, "loss": 0.0193, "step": 718300 }, { "epoch": 0.007184, "grad_norm": 0.17202408611774445, "learning_rate": 1e-05, "loss": 0.0198, "step": 718400 }, { "epoch": 0.007185, "grad_norm": 0.17535118758678436, "learning_rate": 1e-05, "loss": 0.0199, "step": 718500 }, { "epoch": 0.007186, "grad_norm": 0.13365760445594788, "learning_rate": 1e-05, "loss": 0.0192, "step": 718600 }, { "epoch": 0.007187, "grad_norm": 0.2382543683052063, "learning_rate": 1e-05, "loss": 0.0193, "step": 718700 }, { "epoch": 0.007188, "grad_norm": 0.11501654237508774, "learning_rate": 1e-05, "loss": 0.0198, "step": 718800 }, { "epoch": 0.007189, "grad_norm": 0.19656458497047424, "learning_rate": 1e-05, "loss": 0.0198, "step": 718900 }, { "epoch": 0.00719, "grad_norm": 0.14675186574459076, "learning_rate": 1e-05, "loss": 0.0194, "step": 719000 }, { "epoch": 0.007191, "grad_norm": 0.14105482399463654, "learning_rate": 1e-05, "loss": 0.0195, "step": 719100 }, { "epoch": 0.007192, "grad_norm": 0.19269177317619324, "learning_rate": 1e-05, "loss": 0.0199, "step": 719200 }, { "epoch": 0.007193, "grad_norm": 0.14971280097961426, "learning_rate": 1e-05, "loss": 0.0197, "step": 719300 }, { "epoch": 0.007194, "grad_norm": 0.12770037353038788, "learning_rate": 1e-05, "loss": 0.0193, "step": 719400 }, { "epoch": 0.007195, "grad_norm": 0.21096763014793396, "learning_rate": 1e-05, "loss": 0.0199, "step": 719500 }, { "epoch": 0.007196, "grad_norm": 0.18536008894443512, "learning_rate": 1e-05, "loss": 0.0194, "step": 719600 }, { "epoch": 0.007197, "grad_norm": 0.16499218344688416, "learning_rate": 1e-05, "loss": 0.0196, "step": 719700 }, { "epoch": 0.007198, "grad_norm": 0.2292371243238449, "learning_rate": 1e-05, "loss": 0.0197, "step": 719800 }, { "epoch": 0.007199, "grad_norm": 0.12075572460889816, "learning_rate": 1e-05, "loss": 0.0196, "step": 719900 }, { "epoch": 0.0072, "grad_norm": 0.14874808490276337, "learning_rate": 1e-05, "loss": 0.0194, "step": 720000 }, { "epoch": 0.0072, "eval_loss": 0.017219042405486107, "eval_runtime": 191.7364, "eval_samples_per_second": 260.775, "eval_steps_per_second": 16.298, "step": 720000 }, { "epoch": 0.007201, "grad_norm": 0.18216058611869812, "learning_rate": 1e-05, "loss": 0.0199, "step": 720100 }, { "epoch": 0.007202, "grad_norm": 0.1645735204219818, "learning_rate": 1e-05, "loss": 0.0197, "step": 720200 }, { "epoch": 0.007203, "grad_norm": 0.13694877922534943, "learning_rate": 1e-05, "loss": 0.02, "step": 720300 }, { "epoch": 0.007204, "grad_norm": 0.1547728329896927, "learning_rate": 1e-05, "loss": 0.0196, "step": 720400 }, { "epoch": 0.007205, "grad_norm": 0.12933151423931122, "learning_rate": 1e-05, "loss": 0.0196, "step": 720500 }, { "epoch": 0.007206, "grad_norm": 0.12467887997627258, "learning_rate": 1e-05, "loss": 0.0201, "step": 720600 }, { "epoch": 0.007207, "grad_norm": 0.1704154759645462, "learning_rate": 1e-05, "loss": 0.0197, "step": 720700 }, { "epoch": 0.007208, "grad_norm": 0.191717267036438, "learning_rate": 1e-05, "loss": 0.0195, "step": 720800 }, { "epoch": 0.007209, "grad_norm": 0.20125795900821686, "learning_rate": 1e-05, "loss": 0.0196, "step": 720900 }, { "epoch": 0.00721, "grad_norm": 0.12113479524850845, "learning_rate": 1e-05, "loss": 0.02, "step": 721000 }, { "epoch": 0.007211, "grad_norm": 0.1468314528465271, "learning_rate": 1e-05, "loss": 0.0193, "step": 721100 }, { "epoch": 0.007212, "grad_norm": 0.1301354616880417, "learning_rate": 1e-05, "loss": 0.0195, "step": 721200 }, { "epoch": 0.007213, "grad_norm": 0.15042980015277863, "learning_rate": 1e-05, "loss": 0.0191, "step": 721300 }, { "epoch": 0.007214, "grad_norm": 0.13952556252479553, "learning_rate": 1e-05, "loss": 0.0195, "step": 721400 }, { "epoch": 0.007215, "grad_norm": 0.15178070962429047, "learning_rate": 1e-05, "loss": 0.0193, "step": 721500 }, { "epoch": 0.007216, "grad_norm": 0.12048232555389404, "learning_rate": 1e-05, "loss": 0.0193, "step": 721600 }, { "epoch": 0.007217, "grad_norm": 0.1266619712114334, "learning_rate": 1e-05, "loss": 0.0196, "step": 721700 }, { "epoch": 0.007218, "grad_norm": 0.11794070154428482, "learning_rate": 1e-05, "loss": 0.0195, "step": 721800 }, { "epoch": 0.007219, "grad_norm": 0.152640238404274, "learning_rate": 1e-05, "loss": 0.0197, "step": 721900 }, { "epoch": 0.00722, "grad_norm": 0.18779920041561127, "learning_rate": 1e-05, "loss": 0.0198, "step": 722000 }, { "epoch": 0.007221, "grad_norm": 0.15429171919822693, "learning_rate": 1e-05, "loss": 0.0196, "step": 722100 }, { "epoch": 0.007222, "grad_norm": 0.17116962373256683, "learning_rate": 1e-05, "loss": 0.0198, "step": 722200 }, { "epoch": 0.007223, "grad_norm": 0.17450670897960663, "learning_rate": 1e-05, "loss": 0.0192, "step": 722300 }, { "epoch": 0.007224, "grad_norm": 0.12561893463134766, "learning_rate": 1e-05, "loss": 0.0194, "step": 722400 }, { "epoch": 0.007225, "grad_norm": 0.13795685768127441, "learning_rate": 1e-05, "loss": 0.0196, "step": 722500 }, { "epoch": 0.007226, "grad_norm": 0.16039904952049255, "learning_rate": 1e-05, "loss": 0.0201, "step": 722600 }, { "epoch": 0.007227, "grad_norm": 0.15061457455158234, "learning_rate": 1e-05, "loss": 0.0195, "step": 722700 }, { "epoch": 0.007228, "grad_norm": 0.144836887717247, "learning_rate": 1e-05, "loss": 0.0197, "step": 722800 }, { "epoch": 0.007229, "grad_norm": 0.15150219202041626, "learning_rate": 1e-05, "loss": 0.0201, "step": 722900 }, { "epoch": 0.00723, "grad_norm": 0.14978182315826416, "learning_rate": 1e-05, "loss": 0.0197, "step": 723000 }, { "epoch": 0.007231, "grad_norm": 0.15566572546958923, "learning_rate": 1e-05, "loss": 0.0198, "step": 723100 }, { "epoch": 0.007232, "grad_norm": 0.1179388165473938, "learning_rate": 1e-05, "loss": 0.0197, "step": 723200 }, { "epoch": 0.007233, "grad_norm": 0.14788804948329926, "learning_rate": 1e-05, "loss": 0.0198, "step": 723300 }, { "epoch": 0.007234, "grad_norm": 0.13163410127162933, "learning_rate": 1e-05, "loss": 0.0193, "step": 723400 }, { "epoch": 0.007235, "grad_norm": 0.15033967792987823, "learning_rate": 1e-05, "loss": 0.0191, "step": 723500 }, { "epoch": 0.007236, "grad_norm": 0.1593116968870163, "learning_rate": 1e-05, "loss": 0.0195, "step": 723600 }, { "epoch": 0.007237, "grad_norm": 0.1799475997686386, "learning_rate": 1e-05, "loss": 0.0197, "step": 723700 }, { "epoch": 0.007238, "grad_norm": 0.1833961009979248, "learning_rate": 1e-05, "loss": 0.0196, "step": 723800 }, { "epoch": 0.007239, "grad_norm": 0.14203420281410217, "learning_rate": 1e-05, "loss": 0.0198, "step": 723900 }, { "epoch": 0.00724, "grad_norm": 0.2168150097131729, "learning_rate": 1e-05, "loss": 0.0193, "step": 724000 }, { "epoch": 0.007241, "grad_norm": 0.17631769180297852, "learning_rate": 1e-05, "loss": 0.0196, "step": 724100 }, { "epoch": 0.007242, "grad_norm": 0.22480660676956177, "learning_rate": 1e-05, "loss": 0.0196, "step": 724200 }, { "epoch": 0.007243, "grad_norm": 0.19203384220600128, "learning_rate": 1e-05, "loss": 0.0196, "step": 724300 }, { "epoch": 0.007244, "grad_norm": 0.1696033626794815, "learning_rate": 1e-05, "loss": 0.0194, "step": 724400 }, { "epoch": 0.007245, "grad_norm": 0.13845831155776978, "learning_rate": 1e-05, "loss": 0.0195, "step": 724500 }, { "epoch": 0.007246, "grad_norm": 0.21083185076713562, "learning_rate": 1e-05, "loss": 0.019, "step": 724600 }, { "epoch": 0.007247, "grad_norm": 0.17747631669044495, "learning_rate": 1e-05, "loss": 0.0198, "step": 724700 }, { "epoch": 0.007248, "grad_norm": 0.1545555591583252, "learning_rate": 1e-05, "loss": 0.0195, "step": 724800 }, { "epoch": 0.007249, "grad_norm": 0.1509186327457428, "learning_rate": 1e-05, "loss": 0.0195, "step": 724900 }, { "epoch": 0.00725, "grad_norm": 0.22809015214443207, "learning_rate": 1e-05, "loss": 0.0194, "step": 725000 }, { "epoch": 0.007251, "grad_norm": 0.15426073968410492, "learning_rate": 1e-05, "loss": 0.0194, "step": 725100 }, { "epoch": 0.007252, "grad_norm": 0.1802789270877838, "learning_rate": 1e-05, "loss": 0.0195, "step": 725200 }, { "epoch": 0.007253, "grad_norm": 0.16469398140907288, "learning_rate": 1e-05, "loss": 0.0193, "step": 725300 }, { "epoch": 0.007254, "grad_norm": 0.12701329588890076, "learning_rate": 1e-05, "loss": 0.0193, "step": 725400 }, { "epoch": 0.007255, "grad_norm": 0.13997097313404083, "learning_rate": 1e-05, "loss": 0.02, "step": 725500 }, { "epoch": 0.007256, "grad_norm": 0.20127803087234497, "learning_rate": 1e-05, "loss": 0.0195, "step": 725600 }, { "epoch": 0.007257, "grad_norm": 0.23894177377223969, "learning_rate": 1e-05, "loss": 0.0196, "step": 725700 }, { "epoch": 0.007258, "grad_norm": 0.16864684224128723, "learning_rate": 1e-05, "loss": 0.0196, "step": 725800 }, { "epoch": 0.007259, "grad_norm": 0.13347448408603668, "learning_rate": 1e-05, "loss": 0.0197, "step": 725900 }, { "epoch": 0.00726, "grad_norm": 0.1906319260597229, "learning_rate": 1e-05, "loss": 0.0199, "step": 726000 }, { "epoch": 0.007261, "grad_norm": 0.11888524889945984, "learning_rate": 1e-05, "loss": 0.0193, "step": 726100 }, { "epoch": 0.007262, "grad_norm": 0.21949300169944763, "learning_rate": 1e-05, "loss": 0.0192, "step": 726200 }, { "epoch": 0.007263, "grad_norm": 0.13197889924049377, "learning_rate": 1e-05, "loss": 0.0195, "step": 726300 }, { "epoch": 0.007264, "grad_norm": 0.4766717553138733, "learning_rate": 1e-05, "loss": 0.0196, "step": 726400 }, { "epoch": 0.007265, "grad_norm": 0.1626826673746109, "learning_rate": 1e-05, "loss": 0.0197, "step": 726500 }, { "epoch": 0.007266, "grad_norm": 0.16265235841274261, "learning_rate": 1e-05, "loss": 0.0189, "step": 726600 }, { "epoch": 0.007267, "grad_norm": 0.29317915439605713, "learning_rate": 1e-05, "loss": 0.0194, "step": 726700 }, { "epoch": 0.007268, "grad_norm": 0.2808690667152405, "learning_rate": 1e-05, "loss": 0.02, "step": 726800 }, { "epoch": 0.007269, "grad_norm": 0.16589008271694183, "learning_rate": 1e-05, "loss": 0.0197, "step": 726900 }, { "epoch": 0.00727, "grad_norm": 0.1292557567358017, "learning_rate": 1e-05, "loss": 0.0195, "step": 727000 }, { "epoch": 0.007271, "grad_norm": 0.13797658681869507, "learning_rate": 1e-05, "loss": 0.0195, "step": 727100 }, { "epoch": 0.007272, "grad_norm": 0.15921999514102936, "learning_rate": 1e-05, "loss": 0.0192, "step": 727200 }, { "epoch": 0.007273, "grad_norm": 0.15404276549816132, "learning_rate": 1e-05, "loss": 0.0196, "step": 727300 }, { "epoch": 0.007274, "grad_norm": 0.1784091293811798, "learning_rate": 1e-05, "loss": 0.0194, "step": 727400 }, { "epoch": 0.007275, "grad_norm": 0.16369010508060455, "learning_rate": 1e-05, "loss": 0.0193, "step": 727500 }, { "epoch": 0.007276, "grad_norm": 0.14191564917564392, "learning_rate": 1e-05, "loss": 0.0198, "step": 727600 }, { "epoch": 0.007277, "grad_norm": 0.2122742086648941, "learning_rate": 1e-05, "loss": 0.0198, "step": 727700 }, { "epoch": 0.007278, "grad_norm": 0.12051953375339508, "learning_rate": 1e-05, "loss": 0.02, "step": 727800 }, { "epoch": 0.007279, "grad_norm": 0.16339610517024994, "learning_rate": 1e-05, "loss": 0.0199, "step": 727900 }, { "epoch": 0.00728, "grad_norm": 0.18296155333518982, "learning_rate": 1e-05, "loss": 0.0194, "step": 728000 }, { "epoch": 0.007281, "grad_norm": 0.14637324213981628, "learning_rate": 1e-05, "loss": 0.0195, "step": 728100 }, { "epoch": 0.007282, "grad_norm": 0.14183925092220306, "learning_rate": 1e-05, "loss": 0.0192, "step": 728200 }, { "epoch": 0.007283, "grad_norm": 0.15390066802501678, "learning_rate": 1e-05, "loss": 0.0196, "step": 728300 }, { "epoch": 0.007284, "grad_norm": 0.19652831554412842, "learning_rate": 1e-05, "loss": 0.0198, "step": 728400 }, { "epoch": 0.007285, "grad_norm": 0.13645926117897034, "learning_rate": 1e-05, "loss": 0.02, "step": 728500 }, { "epoch": 0.007286, "grad_norm": 0.16016259789466858, "learning_rate": 1e-05, "loss": 0.0202, "step": 728600 }, { "epoch": 0.007287, "grad_norm": 0.2564946711063385, "learning_rate": 1e-05, "loss": 0.0195, "step": 728700 }, { "epoch": 0.007288, "grad_norm": 0.16385766863822937, "learning_rate": 1e-05, "loss": 0.0199, "step": 728800 }, { "epoch": 0.007289, "grad_norm": 0.17537379264831543, "learning_rate": 1e-05, "loss": 0.0196, "step": 728900 }, { "epoch": 0.00729, "grad_norm": 0.17377746105194092, "learning_rate": 1e-05, "loss": 0.0196, "step": 729000 }, { "epoch": 0.007291, "grad_norm": 0.18631449341773987, "learning_rate": 1e-05, "loss": 0.0193, "step": 729100 }, { "epoch": 0.007292, "grad_norm": 0.13149318099021912, "learning_rate": 1e-05, "loss": 0.0195, "step": 729200 }, { "epoch": 0.007293, "grad_norm": 0.16070473194122314, "learning_rate": 1e-05, "loss": 0.0194, "step": 729300 }, { "epoch": 0.007294, "grad_norm": 0.20850582420825958, "learning_rate": 1e-05, "loss": 0.0202, "step": 729400 }, { "epoch": 0.007295, "grad_norm": 0.2311098873615265, "learning_rate": 1e-05, "loss": 0.0195, "step": 729500 }, { "epoch": 0.007296, "grad_norm": 0.24130085110664368, "learning_rate": 1e-05, "loss": 0.0197, "step": 729600 }, { "epoch": 0.007297, "grad_norm": 0.14897939562797546, "learning_rate": 1e-05, "loss": 0.0196, "step": 729700 }, { "epoch": 0.007298, "grad_norm": 0.18836835026741028, "learning_rate": 1e-05, "loss": 0.0195, "step": 729800 }, { "epoch": 0.007299, "grad_norm": 0.1772562712430954, "learning_rate": 1e-05, "loss": 0.0194, "step": 729900 }, { "epoch": 0.0073, "grad_norm": 0.1847693771123886, "learning_rate": 1e-05, "loss": 0.0198, "step": 730000 }, { "epoch": 0.007301, "grad_norm": 0.19388653337955475, "learning_rate": 1e-05, "loss": 0.0194, "step": 730100 }, { "epoch": 0.007302, "grad_norm": 0.11181018501520157, "learning_rate": 1e-05, "loss": 0.0192, "step": 730200 }, { "epoch": 0.007303, "grad_norm": 0.15551511943340302, "learning_rate": 1e-05, "loss": 0.0193, "step": 730300 }, { "epoch": 0.007304, "grad_norm": 0.17669200897216797, "learning_rate": 1e-05, "loss": 0.0192, "step": 730400 }, { "epoch": 0.007305, "grad_norm": 0.1561633050441742, "learning_rate": 1e-05, "loss": 0.0197, "step": 730500 }, { "epoch": 0.007306, "grad_norm": 0.1648651361465454, "learning_rate": 1e-05, "loss": 0.0196, "step": 730600 }, { "epoch": 0.007307, "grad_norm": 0.17571623623371124, "learning_rate": 1e-05, "loss": 0.0193, "step": 730700 }, { "epoch": 0.007308, "grad_norm": 0.14354215562343597, "learning_rate": 1e-05, "loss": 0.0198, "step": 730800 }, { "epoch": 0.007309, "grad_norm": 0.12790974974632263, "learning_rate": 1e-05, "loss": 0.0195, "step": 730900 }, { "epoch": 0.00731, "grad_norm": 0.11638034880161285, "learning_rate": 1e-05, "loss": 0.0195, "step": 731000 }, { "epoch": 0.007311, "grad_norm": 0.1705775111913681, "learning_rate": 1e-05, "loss": 0.0195, "step": 731100 }, { "epoch": 0.007312, "grad_norm": 0.18528832495212555, "learning_rate": 1e-05, "loss": 0.0196, "step": 731200 }, { "epoch": 0.007313, "grad_norm": 0.17062434554100037, "learning_rate": 1e-05, "loss": 0.0192, "step": 731300 }, { "epoch": 0.007314, "grad_norm": 0.1641567498445511, "learning_rate": 1e-05, "loss": 0.0191, "step": 731400 }, { "epoch": 0.007315, "grad_norm": 0.15148717164993286, "learning_rate": 1e-05, "loss": 0.0191, "step": 731500 }, { "epoch": 0.007316, "grad_norm": 0.1614750623703003, "learning_rate": 1e-05, "loss": 0.0192, "step": 731600 }, { "epoch": 0.007317, "grad_norm": 0.17793065309524536, "learning_rate": 1e-05, "loss": 0.0196, "step": 731700 }, { "epoch": 0.007318, "grad_norm": 0.18082517385482788, "learning_rate": 1e-05, "loss": 0.0194, "step": 731800 }, { "epoch": 0.007319, "grad_norm": 0.24673478305339813, "learning_rate": 1e-05, "loss": 0.0191, "step": 731900 }, { "epoch": 0.00732, "grad_norm": 0.18002592027187347, "learning_rate": 1e-05, "loss": 0.0198, "step": 732000 }, { "epoch": 0.007321, "grad_norm": 0.1440049558877945, "learning_rate": 1e-05, "loss": 0.0198, "step": 732100 }, { "epoch": 0.007322, "grad_norm": 0.20420116186141968, "learning_rate": 1e-05, "loss": 0.02, "step": 732200 }, { "epoch": 0.007323, "grad_norm": 0.12231123447418213, "learning_rate": 1e-05, "loss": 0.0195, "step": 732300 }, { "epoch": 0.007324, "grad_norm": 0.18226486444473267, "learning_rate": 1e-05, "loss": 0.0197, "step": 732400 }, { "epoch": 0.007325, "grad_norm": 0.13368965685367584, "learning_rate": 1e-05, "loss": 0.0196, "step": 732500 }, { "epoch": 0.007326, "grad_norm": 0.1536140888929367, "learning_rate": 1e-05, "loss": 0.0196, "step": 732600 }, { "epoch": 0.007327, "grad_norm": 0.14973653852939606, "learning_rate": 1e-05, "loss": 0.0192, "step": 732700 }, { "epoch": 0.007328, "grad_norm": 0.15433943271636963, "learning_rate": 1e-05, "loss": 0.0196, "step": 732800 }, { "epoch": 0.007329, "grad_norm": 0.20030611753463745, "learning_rate": 1e-05, "loss": 0.0195, "step": 732900 }, { "epoch": 0.00733, "grad_norm": 0.14771996438503265, "learning_rate": 1e-05, "loss": 0.0195, "step": 733000 }, { "epoch": 0.007331, "grad_norm": 0.130289226770401, "learning_rate": 1e-05, "loss": 0.0194, "step": 733100 }, { "epoch": 0.007332, "grad_norm": 0.20033176243305206, "learning_rate": 1e-05, "loss": 0.0197, "step": 733200 }, { "epoch": 0.007333, "grad_norm": 0.1505407840013504, "learning_rate": 1e-05, "loss": 0.0194, "step": 733300 }, { "epoch": 0.007334, "grad_norm": 0.1281607300043106, "learning_rate": 1e-05, "loss": 0.0193, "step": 733400 }, { "epoch": 0.007335, "grad_norm": 0.1456248015165329, "learning_rate": 1e-05, "loss": 0.0196, "step": 733500 }, { "epoch": 0.007336, "grad_norm": 0.14545129239559174, "learning_rate": 1e-05, "loss": 0.0198, "step": 733600 }, { "epoch": 0.007337, "grad_norm": 0.18151560425758362, "learning_rate": 1e-05, "loss": 0.0195, "step": 733700 }, { "epoch": 0.007338, "grad_norm": 0.18254075944423676, "learning_rate": 1e-05, "loss": 0.0195, "step": 733800 }, { "epoch": 0.007339, "grad_norm": 0.13178513944149017, "learning_rate": 1e-05, "loss": 0.0194, "step": 733900 }, { "epoch": 0.00734, "grad_norm": 0.1458578258752823, "learning_rate": 1e-05, "loss": 0.0194, "step": 734000 }, { "epoch": 0.007341, "grad_norm": 0.16001282632350922, "learning_rate": 1e-05, "loss": 0.0193, "step": 734100 }, { "epoch": 0.007342, "grad_norm": 0.1864817589521408, "learning_rate": 1e-05, "loss": 0.0191, "step": 734200 }, { "epoch": 0.007343, "grad_norm": 0.15576964616775513, "learning_rate": 1e-05, "loss": 0.0197, "step": 734300 }, { "epoch": 0.007344, "grad_norm": 0.12551361322402954, "learning_rate": 1e-05, "loss": 0.0193, "step": 734400 }, { "epoch": 0.007345, "grad_norm": 0.14678406715393066, "learning_rate": 1e-05, "loss": 0.0193, "step": 734500 }, { "epoch": 0.007346, "grad_norm": 0.16710685193538666, "learning_rate": 1e-05, "loss": 0.0192, "step": 734600 }, { "epoch": 0.007347, "grad_norm": 0.15483345091342926, "learning_rate": 1e-05, "loss": 0.0197, "step": 734700 }, { "epoch": 0.007348, "grad_norm": 0.21039333939552307, "learning_rate": 1e-05, "loss": 0.0195, "step": 734800 }, { "epoch": 0.007349, "grad_norm": 0.16795842349529266, "learning_rate": 1e-05, "loss": 0.0195, "step": 734900 }, { "epoch": 0.00735, "grad_norm": 0.174830362200737, "learning_rate": 1e-05, "loss": 0.0195, "step": 735000 }, { "epoch": 0.007351, "grad_norm": 0.17746180295944214, "learning_rate": 1e-05, "loss": 0.0192, "step": 735100 }, { "epoch": 0.007352, "grad_norm": 0.1682301163673401, "learning_rate": 1e-05, "loss": 0.0198, "step": 735200 }, { "epoch": 0.007353, "grad_norm": 0.18866980075836182, "learning_rate": 1e-05, "loss": 0.0195, "step": 735300 }, { "epoch": 0.007354, "grad_norm": 0.1627092808485031, "learning_rate": 1e-05, "loss": 0.0195, "step": 735400 }, { "epoch": 0.007355, "grad_norm": 0.16259382665157318, "learning_rate": 1e-05, "loss": 0.0195, "step": 735500 }, { "epoch": 0.007356, "grad_norm": 0.18653219938278198, "learning_rate": 1e-05, "loss": 0.0196, "step": 735600 }, { "epoch": 0.007357, "grad_norm": 0.1351061910390854, "learning_rate": 1e-05, "loss": 0.0195, "step": 735700 }, { "epoch": 0.007358, "grad_norm": 0.2610721290111542, "learning_rate": 1e-05, "loss": 0.0192, "step": 735800 }, { "epoch": 0.007359, "grad_norm": 0.19239434599876404, "learning_rate": 1e-05, "loss": 0.0195, "step": 735900 }, { "epoch": 0.00736, "grad_norm": 0.14986200630664825, "learning_rate": 1e-05, "loss": 0.0196, "step": 736000 }, { "epoch": 0.007361, "grad_norm": 0.16044339537620544, "learning_rate": 1e-05, "loss": 0.0198, "step": 736100 }, { "epoch": 0.007362, "grad_norm": 0.17841224372386932, "learning_rate": 1e-05, "loss": 0.0199, "step": 736200 }, { "epoch": 0.007363, "grad_norm": 0.21765613555908203, "learning_rate": 1e-05, "loss": 0.0198, "step": 736300 }, { "epoch": 0.007364, "grad_norm": 0.12833945453166962, "learning_rate": 1e-05, "loss": 0.0192, "step": 736400 }, { "epoch": 0.007365, "grad_norm": 0.16560639441013336, "learning_rate": 1e-05, "loss": 0.0197, "step": 736500 }, { "epoch": 0.007366, "grad_norm": 0.15236085653305054, "learning_rate": 1e-05, "loss": 0.0193, "step": 736600 }, { "epoch": 0.007367, "grad_norm": 0.17794330418109894, "learning_rate": 1e-05, "loss": 0.0195, "step": 736700 }, { "epoch": 0.007368, "grad_norm": 0.1439027637243271, "learning_rate": 1e-05, "loss": 0.0194, "step": 736800 }, { "epoch": 0.007369, "grad_norm": 0.15831948816776276, "learning_rate": 1e-05, "loss": 0.0197, "step": 736900 }, { "epoch": 0.00737, "grad_norm": 0.2052062600851059, "learning_rate": 1e-05, "loss": 0.0195, "step": 737000 }, { "epoch": 0.007371, "grad_norm": 0.18181616067886353, "learning_rate": 1e-05, "loss": 0.0191, "step": 737100 }, { "epoch": 0.007372, "grad_norm": 0.1412908285856247, "learning_rate": 1e-05, "loss": 0.0193, "step": 737200 }, { "epoch": 0.007373, "grad_norm": 0.16684210300445557, "learning_rate": 1e-05, "loss": 0.0202, "step": 737300 }, { "epoch": 0.007374, "grad_norm": 0.12719297409057617, "learning_rate": 1e-05, "loss": 0.0189, "step": 737400 }, { "epoch": 0.007375, "grad_norm": 0.17629936337471008, "learning_rate": 1e-05, "loss": 0.0198, "step": 737500 }, { "epoch": 0.007376, "grad_norm": 0.2043241411447525, "learning_rate": 1e-05, "loss": 0.0196, "step": 737600 }, { "epoch": 0.007377, "grad_norm": 0.16930101811885834, "learning_rate": 1e-05, "loss": 0.0192, "step": 737700 }, { "epoch": 0.007378, "grad_norm": 0.12039948254823685, "learning_rate": 1e-05, "loss": 0.0199, "step": 737800 }, { "epoch": 0.007379, "grad_norm": 0.21054114401340485, "learning_rate": 1e-05, "loss": 0.019, "step": 737900 }, { "epoch": 0.00738, "grad_norm": 0.16070373356342316, "learning_rate": 1e-05, "loss": 0.0197, "step": 738000 }, { "epoch": 0.007381, "grad_norm": 0.12468338012695312, "learning_rate": 1e-05, "loss": 0.0195, "step": 738100 }, { "epoch": 0.007382, "grad_norm": 0.1660773903131485, "learning_rate": 1e-05, "loss": 0.0195, "step": 738200 }, { "epoch": 0.007383, "grad_norm": 0.20055978000164032, "learning_rate": 1e-05, "loss": 0.0198, "step": 738300 }, { "epoch": 0.007384, "grad_norm": 0.16113144159317017, "learning_rate": 1e-05, "loss": 0.0196, "step": 738400 }, { "epoch": 0.007385, "grad_norm": 0.11212427914142609, "learning_rate": 1e-05, "loss": 0.0197, "step": 738500 }, { "epoch": 0.007386, "grad_norm": 0.13315804302692413, "learning_rate": 1e-05, "loss": 0.0199, "step": 738600 }, { "epoch": 0.007387, "grad_norm": 0.14012299478054047, "learning_rate": 1e-05, "loss": 0.0194, "step": 738700 }, { "epoch": 0.007388, "grad_norm": 0.16775570809841156, "learning_rate": 1e-05, "loss": 0.0193, "step": 738800 }, { "epoch": 0.007389, "grad_norm": 0.16516493260860443, "learning_rate": 1e-05, "loss": 0.0195, "step": 738900 }, { "epoch": 0.00739, "grad_norm": 0.17556743323802948, "learning_rate": 1e-05, "loss": 0.0191, "step": 739000 }, { "epoch": 0.007391, "grad_norm": 0.18042264878749847, "learning_rate": 1e-05, "loss": 0.02, "step": 739100 }, { "epoch": 0.007392, "grad_norm": 0.24465270340442657, "learning_rate": 1e-05, "loss": 0.0199, "step": 739200 }, { "epoch": 0.007393, "grad_norm": 0.12167414277791977, "learning_rate": 1e-05, "loss": 0.0192, "step": 739300 }, { "epoch": 0.007394, "grad_norm": 0.1460653394460678, "learning_rate": 1e-05, "loss": 0.0195, "step": 739400 }, { "epoch": 0.007395, "grad_norm": 0.25892117619514465, "learning_rate": 1e-05, "loss": 0.0187, "step": 739500 }, { "epoch": 0.007396, "grad_norm": 0.15647707879543304, "learning_rate": 1e-05, "loss": 0.0198, "step": 739600 }, { "epoch": 0.007397, "grad_norm": 0.1676042377948761, "learning_rate": 1e-05, "loss": 0.0194, "step": 739700 }, { "epoch": 0.007398, "grad_norm": 0.13469232618808746, "learning_rate": 1e-05, "loss": 0.0197, "step": 739800 }, { "epoch": 0.007399, "grad_norm": 0.16637997329235077, "learning_rate": 1e-05, "loss": 0.0189, "step": 739900 }, { "epoch": 0.0074, "grad_norm": 0.14695845544338226, "learning_rate": 1e-05, "loss": 0.0193, "step": 740000 }, { "epoch": 0.0074, "eval_loss": 0.018127089366316795, "eval_runtime": 188.7323, "eval_samples_per_second": 264.925, "eval_steps_per_second": 16.558, "step": 740000 }, { "epoch": 0.007401, "grad_norm": 0.18185950815677643, "learning_rate": 1e-05, "loss": 0.0198, "step": 740100 }, { "epoch": 0.007402, "grad_norm": 0.13890604674816132, "learning_rate": 1e-05, "loss": 0.0195, "step": 740200 }, { "epoch": 0.007403, "grad_norm": 0.23317040503025055, "learning_rate": 1e-05, "loss": 0.0195, "step": 740300 }, { "epoch": 0.007404, "grad_norm": 0.216078981757164, "learning_rate": 1e-05, "loss": 0.0194, "step": 740400 }, { "epoch": 0.007405, "grad_norm": 0.19616171717643738, "learning_rate": 1e-05, "loss": 0.0196, "step": 740500 }, { "epoch": 0.007406, "grad_norm": 0.14146733283996582, "learning_rate": 1e-05, "loss": 0.0194, "step": 740600 }, { "epoch": 0.007407, "grad_norm": 0.14467006921768188, "learning_rate": 1e-05, "loss": 0.019, "step": 740700 }, { "epoch": 0.007408, "grad_norm": 0.1639222800731659, "learning_rate": 1e-05, "loss": 0.0197, "step": 740800 }, { "epoch": 0.007409, "grad_norm": 0.1608804315328598, "learning_rate": 1e-05, "loss": 0.0201, "step": 740900 }, { "epoch": 0.00741, "grad_norm": 0.12436668574810028, "learning_rate": 1e-05, "loss": 0.0191, "step": 741000 }, { "epoch": 0.007411, "grad_norm": 0.13844576478004456, "learning_rate": 1e-05, "loss": 0.0197, "step": 741100 }, { "epoch": 0.007412, "grad_norm": 0.14013443887233734, "learning_rate": 1e-05, "loss": 0.0196, "step": 741200 }, { "epoch": 0.007413, "grad_norm": 0.10414540022611618, "learning_rate": 1e-05, "loss": 0.0192, "step": 741300 }, { "epoch": 0.007414, "grad_norm": 0.18622027337551117, "learning_rate": 1e-05, "loss": 0.0195, "step": 741400 }, { "epoch": 0.007415, "grad_norm": 0.15386627614498138, "learning_rate": 1e-05, "loss": 0.0197, "step": 741500 }, { "epoch": 0.007416, "grad_norm": 0.13807417452335358, "learning_rate": 1e-05, "loss": 0.02, "step": 741600 }, { "epoch": 0.007417, "grad_norm": 0.15666142106056213, "learning_rate": 1e-05, "loss": 0.0187, "step": 741700 }, { "epoch": 0.007418, "grad_norm": 0.16078269481658936, "learning_rate": 1e-05, "loss": 0.0193, "step": 741800 }, { "epoch": 0.007419, "grad_norm": 0.20240680873394012, "learning_rate": 1e-05, "loss": 0.0197, "step": 741900 }, { "epoch": 0.00742, "grad_norm": 0.153652161359787, "learning_rate": 1e-05, "loss": 0.0197, "step": 742000 }, { "epoch": 0.007421, "grad_norm": 0.2287597805261612, "learning_rate": 1e-05, "loss": 0.0195, "step": 742100 }, { "epoch": 0.007422, "grad_norm": 0.1560909003019333, "learning_rate": 1e-05, "loss": 0.0195, "step": 742200 }, { "epoch": 0.007423, "grad_norm": 0.158006951212883, "learning_rate": 1e-05, "loss": 0.0197, "step": 742300 }, { "epoch": 0.007424, "grad_norm": 0.12040576338768005, "learning_rate": 1e-05, "loss": 0.019, "step": 742400 }, { "epoch": 0.007425, "grad_norm": 0.15668970346450806, "learning_rate": 1e-05, "loss": 0.0192, "step": 742500 }, { "epoch": 0.007426, "grad_norm": 0.15314221382141113, "learning_rate": 1e-05, "loss": 0.0191, "step": 742600 }, { "epoch": 0.007427, "grad_norm": 0.1768358200788498, "learning_rate": 1e-05, "loss": 0.0192, "step": 742700 }, { "epoch": 0.007428, "grad_norm": 0.1712649166584015, "learning_rate": 1e-05, "loss": 0.0195, "step": 742800 }, { "epoch": 0.007429, "grad_norm": 0.11440459638834, "learning_rate": 1e-05, "loss": 0.0194, "step": 742900 }, { "epoch": 0.00743, "grad_norm": 0.14490929245948792, "learning_rate": 1e-05, "loss": 0.0198, "step": 743000 }, { "epoch": 0.007431, "grad_norm": 0.17990516126155853, "learning_rate": 1e-05, "loss": 0.0198, "step": 743100 }, { "epoch": 0.007432, "grad_norm": 0.12248816341161728, "learning_rate": 1e-05, "loss": 0.0193, "step": 743200 }, { "epoch": 0.007433, "grad_norm": 0.10856699198484421, "learning_rate": 1e-05, "loss": 0.0194, "step": 743300 }, { "epoch": 0.007434, "grad_norm": 0.20691248774528503, "learning_rate": 1e-05, "loss": 0.0195, "step": 743400 }, { "epoch": 0.007435, "grad_norm": 0.12602199614048004, "learning_rate": 1e-05, "loss": 0.0197, "step": 743500 }, { "epoch": 0.007436, "grad_norm": 0.1576269119977951, "learning_rate": 1e-05, "loss": 0.0196, "step": 743600 }, { "epoch": 0.007437, "grad_norm": 0.17703483998775482, "learning_rate": 1e-05, "loss": 0.019, "step": 743700 }, { "epoch": 0.007438, "grad_norm": 0.1761191338300705, "learning_rate": 1e-05, "loss": 0.0194, "step": 743800 }, { "epoch": 0.007439, "grad_norm": 0.16995273530483246, "learning_rate": 1e-05, "loss": 0.0195, "step": 743900 }, { "epoch": 0.00744, "grad_norm": 0.10666856914758682, "learning_rate": 1e-05, "loss": 0.019, "step": 744000 }, { "epoch": 0.007441, "grad_norm": 0.13178059458732605, "learning_rate": 1e-05, "loss": 0.0192, "step": 744100 }, { "epoch": 0.007442, "grad_norm": 0.12919706106185913, "learning_rate": 1e-05, "loss": 0.0194, "step": 744200 }, { "epoch": 0.007443, "grad_norm": 0.2156895101070404, "learning_rate": 1e-05, "loss": 0.0193, "step": 744300 }, { "epoch": 0.007444, "grad_norm": 0.1448773443698883, "learning_rate": 1e-05, "loss": 0.0196, "step": 744400 }, { "epoch": 0.007445, "grad_norm": 0.23528218269348145, "learning_rate": 1e-05, "loss": 0.0191, "step": 744500 }, { "epoch": 0.007446, "grad_norm": 0.12669070065021515, "learning_rate": 1e-05, "loss": 0.0198, "step": 744600 }, { "epoch": 0.007447, "grad_norm": 0.1622793972492218, "learning_rate": 1e-05, "loss": 0.0201, "step": 744700 }, { "epoch": 0.007448, "grad_norm": 0.17350390553474426, "learning_rate": 1e-05, "loss": 0.0196, "step": 744800 }, { "epoch": 0.007449, "grad_norm": 0.17406271398067474, "learning_rate": 1e-05, "loss": 0.0196, "step": 744900 }, { "epoch": 0.00745, "grad_norm": 0.20600704848766327, "learning_rate": 1e-05, "loss": 0.0197, "step": 745000 }, { "epoch": 0.007451, "grad_norm": 0.156830295920372, "learning_rate": 1e-05, "loss": 0.0197, "step": 745100 }, { "epoch": 0.007452, "grad_norm": 0.14103464782238007, "learning_rate": 1e-05, "loss": 0.0196, "step": 745200 }, { "epoch": 0.007453, "grad_norm": 0.1545875072479248, "learning_rate": 1e-05, "loss": 0.0194, "step": 745300 }, { "epoch": 0.007454, "grad_norm": 0.31857529282569885, "learning_rate": 1e-05, "loss": 0.0193, "step": 745400 }, { "epoch": 0.007455, "grad_norm": 0.12731121480464935, "learning_rate": 1e-05, "loss": 0.0194, "step": 745500 }, { "epoch": 0.007456, "grad_norm": 0.2094549983739853, "learning_rate": 1e-05, "loss": 0.0192, "step": 745600 }, { "epoch": 0.007457, "grad_norm": 0.17660294473171234, "learning_rate": 1e-05, "loss": 0.0199, "step": 745700 }, { "epoch": 0.007458, "grad_norm": 0.13743652403354645, "learning_rate": 1e-05, "loss": 0.0194, "step": 745800 }, { "epoch": 0.007459, "grad_norm": 0.18437281250953674, "learning_rate": 1e-05, "loss": 0.0201, "step": 745900 }, { "epoch": 0.00746, "grad_norm": 0.15908440947532654, "learning_rate": 1e-05, "loss": 0.0189, "step": 746000 }, { "epoch": 0.007461, "grad_norm": 0.13356126844882965, "learning_rate": 1e-05, "loss": 0.0199, "step": 746100 }, { "epoch": 0.007462, "grad_norm": 0.19676624238491058, "learning_rate": 1e-05, "loss": 0.0192, "step": 746200 }, { "epoch": 0.007463, "grad_norm": 0.1717715561389923, "learning_rate": 1e-05, "loss": 0.0192, "step": 746300 }, { "epoch": 0.007464, "grad_norm": 0.10675173252820969, "learning_rate": 1e-05, "loss": 0.0194, "step": 746400 }, { "epoch": 0.007465, "grad_norm": 0.17491403222084045, "learning_rate": 1e-05, "loss": 0.0192, "step": 746500 }, { "epoch": 0.007466, "grad_norm": 0.1345733106136322, "learning_rate": 1e-05, "loss": 0.0194, "step": 746600 }, { "epoch": 0.007467, "grad_norm": 0.13579173386096954, "learning_rate": 1e-05, "loss": 0.0194, "step": 746700 }, { "epoch": 0.007468, "grad_norm": 0.11606121808290482, "learning_rate": 1e-05, "loss": 0.0195, "step": 746800 }, { "epoch": 0.007469, "grad_norm": 0.12217334657907486, "learning_rate": 1e-05, "loss": 0.0194, "step": 746900 }, { "epoch": 0.00747, "grad_norm": 0.153522327542305, "learning_rate": 1e-05, "loss": 0.0192, "step": 747000 }, { "epoch": 0.007471, "grad_norm": 0.1380367875099182, "learning_rate": 1e-05, "loss": 0.019, "step": 747100 }, { "epoch": 0.007472, "grad_norm": 0.19044822454452515, "learning_rate": 1e-05, "loss": 0.0191, "step": 747200 }, { "epoch": 0.007473, "grad_norm": 0.17028765380382538, "learning_rate": 1e-05, "loss": 0.0192, "step": 747300 }, { "epoch": 0.007474, "grad_norm": 0.15857276320457458, "learning_rate": 1e-05, "loss": 0.0196, "step": 747400 }, { "epoch": 0.007475, "grad_norm": 0.14097526669502258, "learning_rate": 1e-05, "loss": 0.0192, "step": 747500 }, { "epoch": 0.007476, "grad_norm": 0.20369915664196014, "learning_rate": 1e-05, "loss": 0.0195, "step": 747600 }, { "epoch": 0.007477, "grad_norm": 0.15437689423561096, "learning_rate": 1e-05, "loss": 0.0196, "step": 747700 }, { "epoch": 0.007478, "grad_norm": 0.13732486963272095, "learning_rate": 1e-05, "loss": 0.0197, "step": 747800 }, { "epoch": 0.007479, "grad_norm": 0.10423186421394348, "learning_rate": 1e-05, "loss": 0.0192, "step": 747900 }, { "epoch": 0.00748, "grad_norm": 0.16861768066883087, "learning_rate": 1e-05, "loss": 0.0194, "step": 748000 }, { "epoch": 0.007481, "grad_norm": 0.17006061971187592, "learning_rate": 1e-05, "loss": 0.0193, "step": 748100 }, { "epoch": 0.007482, "grad_norm": 0.14117994904518127, "learning_rate": 1e-05, "loss": 0.0195, "step": 748200 }, { "epoch": 0.007483, "grad_norm": 0.1392923891544342, "learning_rate": 1e-05, "loss": 0.0195, "step": 748300 }, { "epoch": 0.007484, "grad_norm": 0.14861442148685455, "learning_rate": 1e-05, "loss": 0.0197, "step": 748400 }, { "epoch": 0.007485, "grad_norm": 0.1791553944349289, "learning_rate": 1e-05, "loss": 0.019, "step": 748500 }, { "epoch": 0.007486, "grad_norm": 0.1717151701450348, "learning_rate": 1e-05, "loss": 0.0194, "step": 748600 }, { "epoch": 0.007487, "grad_norm": 0.1708107590675354, "learning_rate": 1e-05, "loss": 0.0193, "step": 748700 }, { "epoch": 0.007488, "grad_norm": 0.14919300377368927, "learning_rate": 1e-05, "loss": 0.0195, "step": 748800 }, { "epoch": 0.007489, "grad_norm": 0.15805619955062866, "learning_rate": 1e-05, "loss": 0.0192, "step": 748900 }, { "epoch": 0.00749, "grad_norm": 0.19059143960475922, "learning_rate": 1e-05, "loss": 0.0194, "step": 749000 }, { "epoch": 0.007491, "grad_norm": 0.1530914604663849, "learning_rate": 1e-05, "loss": 0.019, "step": 749100 }, { "epoch": 0.007492, "grad_norm": 0.1821098029613495, "learning_rate": 1e-05, "loss": 0.0195, "step": 749200 }, { "epoch": 0.007493, "grad_norm": 0.141972616314888, "learning_rate": 1e-05, "loss": 0.0193, "step": 749300 }, { "epoch": 0.007494, "grad_norm": 0.14280514419078827, "learning_rate": 1e-05, "loss": 0.0194, "step": 749400 }, { "epoch": 0.007495, "grad_norm": 0.12443874776363373, "learning_rate": 1e-05, "loss": 0.0195, "step": 749500 }, { "epoch": 0.007496, "grad_norm": 0.17060966789722443, "learning_rate": 1e-05, "loss": 0.0193, "step": 749600 }, { "epoch": 0.007497, "grad_norm": 0.1515396237373352, "learning_rate": 1e-05, "loss": 0.019, "step": 749700 }, { "epoch": 0.007498, "grad_norm": 0.15413235127925873, "learning_rate": 1e-05, "loss": 0.0193, "step": 749800 }, { "epoch": 0.007499, "grad_norm": 0.1230238601565361, "learning_rate": 1e-05, "loss": 0.0194, "step": 749900 }, { "epoch": 0.0075, "grad_norm": 0.15995186567306519, "learning_rate": 1e-05, "loss": 0.0189, "step": 750000 }, { "epoch": 0.007501, "grad_norm": 0.13587406277656555, "learning_rate": 1e-05, "loss": 0.019, "step": 750100 }, { "epoch": 0.007502, "grad_norm": 0.12433928996324539, "learning_rate": 1e-05, "loss": 0.0192, "step": 750200 }, { "epoch": 0.007503, "grad_norm": 0.18604567646980286, "learning_rate": 1e-05, "loss": 0.0195, "step": 750300 }, { "epoch": 0.007504, "grad_norm": 0.2545609176158905, "learning_rate": 1e-05, "loss": 0.0194, "step": 750400 }, { "epoch": 0.007505, "grad_norm": 0.17171882092952728, "learning_rate": 1e-05, "loss": 0.0191, "step": 750500 }, { "epoch": 0.007506, "grad_norm": 0.21733297407627106, "learning_rate": 1e-05, "loss": 0.0191, "step": 750600 }, { "epoch": 0.007507, "grad_norm": 0.14787207543849945, "learning_rate": 1e-05, "loss": 0.0198, "step": 750700 }, { "epoch": 0.007508, "grad_norm": 0.13667935132980347, "learning_rate": 1e-05, "loss": 0.0199, "step": 750800 }, { "epoch": 0.007509, "grad_norm": 0.1666025072336197, "learning_rate": 1e-05, "loss": 0.0192, "step": 750900 }, { "epoch": 0.00751, "grad_norm": 0.14174114167690277, "learning_rate": 1e-05, "loss": 0.02, "step": 751000 }, { "epoch": 0.007511, "grad_norm": 0.15318810939788818, "learning_rate": 1e-05, "loss": 0.0195, "step": 751100 }, { "epoch": 0.007512, "grad_norm": 0.1679019331932068, "learning_rate": 1e-05, "loss": 0.0197, "step": 751200 }, { "epoch": 0.007513, "grad_norm": 0.12995123863220215, "learning_rate": 1e-05, "loss": 0.0195, "step": 751300 }, { "epoch": 0.007514, "grad_norm": 0.1443997025489807, "learning_rate": 1e-05, "loss": 0.0194, "step": 751400 }, { "epoch": 0.007515, "grad_norm": 0.14645418524742126, "learning_rate": 1e-05, "loss": 0.0192, "step": 751500 }, { "epoch": 0.007516, "grad_norm": 0.1731473058462143, "learning_rate": 1e-05, "loss": 0.0193, "step": 751600 }, { "epoch": 0.007517, "grad_norm": 0.14334623515605927, "learning_rate": 1e-05, "loss": 0.0195, "step": 751700 }, { "epoch": 0.007518, "grad_norm": 0.14205427467823029, "learning_rate": 1e-05, "loss": 0.0197, "step": 751800 }, { "epoch": 0.007519, "grad_norm": 0.21125231683254242, "learning_rate": 1e-05, "loss": 0.0194, "step": 751900 }, { "epoch": 0.00752, "grad_norm": 0.1476544886827469, "learning_rate": 1e-05, "loss": 0.0192, "step": 752000 }, { "epoch": 0.007521, "grad_norm": 0.20307081937789917, "learning_rate": 1e-05, "loss": 0.019, "step": 752100 }, { "epoch": 0.007522, "grad_norm": 0.25941845774650574, "learning_rate": 1e-05, "loss": 0.019, "step": 752200 }, { "epoch": 0.007523, "grad_norm": 0.19638186693191528, "learning_rate": 1e-05, "loss": 0.0196, "step": 752300 }, { "epoch": 0.007524, "grad_norm": 0.14620858430862427, "learning_rate": 1e-05, "loss": 0.0198, "step": 752400 }, { "epoch": 0.007525, "grad_norm": 0.17700845003128052, "learning_rate": 1e-05, "loss": 0.0194, "step": 752500 }, { "epoch": 0.007526, "grad_norm": 0.23717838525772095, "learning_rate": 1e-05, "loss": 0.019, "step": 752600 }, { "epoch": 0.007527, "grad_norm": 0.14923955500125885, "learning_rate": 1e-05, "loss": 0.019, "step": 752700 }, { "epoch": 0.007528, "grad_norm": 0.15682148933410645, "learning_rate": 1e-05, "loss": 0.0191, "step": 752800 }, { "epoch": 0.007529, "grad_norm": 0.18169204890727997, "learning_rate": 1e-05, "loss": 0.0192, "step": 752900 }, { "epoch": 0.00753, "grad_norm": 0.19121703505516052, "learning_rate": 1e-05, "loss": 0.0192, "step": 753000 }, { "epoch": 0.007531, "grad_norm": 0.18400295078754425, "learning_rate": 1e-05, "loss": 0.0195, "step": 753100 }, { "epoch": 0.007532, "grad_norm": 0.15509457886219025, "learning_rate": 1e-05, "loss": 0.0203, "step": 753200 }, { "epoch": 0.007533, "grad_norm": 0.17749814689159393, "learning_rate": 1e-05, "loss": 0.0193, "step": 753300 }, { "epoch": 0.007534, "grad_norm": 0.13332189619541168, "learning_rate": 1e-05, "loss": 0.0189, "step": 753400 }, { "epoch": 0.007535, "grad_norm": 0.1779443770647049, "learning_rate": 1e-05, "loss": 0.0192, "step": 753500 }, { "epoch": 0.007536, "grad_norm": 0.19419464468955994, "learning_rate": 1e-05, "loss": 0.0192, "step": 753600 }, { "epoch": 0.007537, "grad_norm": 0.1737142950296402, "learning_rate": 1e-05, "loss": 0.0196, "step": 753700 }, { "epoch": 0.007538, "grad_norm": 0.14673832058906555, "learning_rate": 1e-05, "loss": 0.0196, "step": 753800 }, { "epoch": 0.007539, "grad_norm": 0.14019957184791565, "learning_rate": 1e-05, "loss": 0.0193, "step": 753900 }, { "epoch": 0.00754, "grad_norm": 0.22913412749767303, "learning_rate": 1e-05, "loss": 0.0193, "step": 754000 }, { "epoch": 0.007541, "grad_norm": 0.150520458817482, "learning_rate": 1e-05, "loss": 0.0193, "step": 754100 }, { "epoch": 0.007542, "grad_norm": 0.1533837467432022, "learning_rate": 1e-05, "loss": 0.0195, "step": 754200 }, { "epoch": 0.007543, "grad_norm": 0.10446280986070633, "learning_rate": 1e-05, "loss": 0.0192, "step": 754300 }, { "epoch": 0.007544, "grad_norm": 0.1546838879585266, "learning_rate": 1e-05, "loss": 0.0191, "step": 754400 }, { "epoch": 0.007545, "grad_norm": 0.23828274011611938, "learning_rate": 1e-05, "loss": 0.019, "step": 754500 }, { "epoch": 0.007546, "grad_norm": 0.35420525074005127, "learning_rate": 1e-05, "loss": 0.0197, "step": 754600 }, { "epoch": 0.007547, "grad_norm": 0.1593271940946579, "learning_rate": 1e-05, "loss": 0.0196, "step": 754700 }, { "epoch": 0.007548, "grad_norm": 0.11802905797958374, "learning_rate": 1e-05, "loss": 0.0195, "step": 754800 }, { "epoch": 0.007549, "grad_norm": 0.16840589046478271, "learning_rate": 1e-05, "loss": 0.0191, "step": 754900 }, { "epoch": 0.00755, "grad_norm": 0.10954350978136063, "learning_rate": 1e-05, "loss": 0.0192, "step": 755000 }, { "epoch": 0.007551, "grad_norm": 0.16294695436954498, "learning_rate": 1e-05, "loss": 0.0192, "step": 755100 }, { "epoch": 0.007552, "grad_norm": 0.17448793351650238, "learning_rate": 1e-05, "loss": 0.0195, "step": 755200 }, { "epoch": 0.007553, "grad_norm": 0.14267092943191528, "learning_rate": 1e-05, "loss": 0.0195, "step": 755300 }, { "epoch": 0.007554, "grad_norm": 0.16398335993289948, "learning_rate": 1e-05, "loss": 0.0194, "step": 755400 }, { "epoch": 0.007555, "grad_norm": 0.16520850360393524, "learning_rate": 1e-05, "loss": 0.0194, "step": 755500 }, { "epoch": 0.007556, "grad_norm": 0.16482631862163544, "learning_rate": 1e-05, "loss": 0.0195, "step": 755600 }, { "epoch": 0.007557, "grad_norm": 0.14905156195163727, "learning_rate": 1e-05, "loss": 0.0187, "step": 755700 }, { "epoch": 0.007558, "grad_norm": 0.12477238476276398, "learning_rate": 1e-05, "loss": 0.019, "step": 755800 }, { "epoch": 0.007559, "grad_norm": 0.127049520611763, "learning_rate": 1e-05, "loss": 0.0193, "step": 755900 }, { "epoch": 0.00756, "grad_norm": 0.13834649324417114, "learning_rate": 1e-05, "loss": 0.0196, "step": 756000 }, { "epoch": 0.007561, "grad_norm": 0.14084284007549286, "learning_rate": 1e-05, "loss": 0.0191, "step": 756100 }, { "epoch": 0.007562, "grad_norm": 0.157247394323349, "learning_rate": 1e-05, "loss": 0.0196, "step": 756200 }, { "epoch": 0.007563, "grad_norm": 0.14573821425437927, "learning_rate": 1e-05, "loss": 0.0195, "step": 756300 }, { "epoch": 0.007564, "grad_norm": 0.1259627491235733, "learning_rate": 1e-05, "loss": 0.02, "step": 756400 }, { "epoch": 0.007565, "grad_norm": 0.1886405497789383, "learning_rate": 1e-05, "loss": 0.0195, "step": 756500 }, { "epoch": 0.007566, "grad_norm": 0.15340501070022583, "learning_rate": 1e-05, "loss": 0.0196, "step": 756600 }, { "epoch": 0.007567, "grad_norm": 0.14791767299175262, "learning_rate": 1e-05, "loss": 0.0194, "step": 756700 }, { "epoch": 0.007568, "grad_norm": 0.20108656585216522, "learning_rate": 1e-05, "loss": 0.0195, "step": 756800 }, { "epoch": 0.007569, "grad_norm": 0.16300426423549652, "learning_rate": 1e-05, "loss": 0.019, "step": 756900 }, { "epoch": 0.00757, "grad_norm": 0.12524008750915527, "learning_rate": 1e-05, "loss": 0.0197, "step": 757000 }, { "epoch": 0.007571, "grad_norm": 0.22086365520954132, "learning_rate": 1e-05, "loss": 0.0199, "step": 757100 }, { "epoch": 0.007572, "grad_norm": 0.13023921847343445, "learning_rate": 1e-05, "loss": 0.0191, "step": 757200 }, { "epoch": 0.007573, "grad_norm": 0.14690625667572021, "learning_rate": 1e-05, "loss": 0.0191, "step": 757300 }, { "epoch": 0.007574, "grad_norm": 0.2204228937625885, "learning_rate": 1e-05, "loss": 0.0194, "step": 757400 }, { "epoch": 0.007575, "grad_norm": 0.18786142766475677, "learning_rate": 1e-05, "loss": 0.0192, "step": 757500 }, { "epoch": 0.007576, "grad_norm": 0.13083043694496155, "learning_rate": 1e-05, "loss": 0.0193, "step": 757600 }, { "epoch": 0.007577, "grad_norm": 0.18311242759227753, "learning_rate": 1e-05, "loss": 0.0193, "step": 757700 }, { "epoch": 0.007578, "grad_norm": 0.12519146502017975, "learning_rate": 1e-05, "loss": 0.0191, "step": 757800 }, { "epoch": 0.007579, "grad_norm": 0.12091723084449768, "learning_rate": 1e-05, "loss": 0.0192, "step": 757900 }, { "epoch": 0.00758, "grad_norm": 0.17383885383605957, "learning_rate": 1e-05, "loss": 0.0191, "step": 758000 }, { "epoch": 0.007581, "grad_norm": 0.19999396800994873, "learning_rate": 1e-05, "loss": 0.0193, "step": 758100 }, { "epoch": 0.007582, "grad_norm": 0.12202490866184235, "learning_rate": 1e-05, "loss": 0.0191, "step": 758200 }, { "epoch": 0.007583, "grad_norm": 0.16801321506500244, "learning_rate": 1e-05, "loss": 0.02, "step": 758300 }, { "epoch": 0.007584, "grad_norm": 0.13257081806659698, "learning_rate": 1e-05, "loss": 0.0193, "step": 758400 }, { "epoch": 0.007585, "grad_norm": 0.15225449204444885, "learning_rate": 1e-05, "loss": 0.0189, "step": 758500 }, { "epoch": 0.007586, "grad_norm": 0.14171074330806732, "learning_rate": 1e-05, "loss": 0.0191, "step": 758600 }, { "epoch": 0.007587, "grad_norm": 0.15023526549339294, "learning_rate": 1e-05, "loss": 0.0189, "step": 758700 }, { "epoch": 0.007588, "grad_norm": 0.15241454541683197, "learning_rate": 1e-05, "loss": 0.0195, "step": 758800 }, { "epoch": 0.007589, "grad_norm": 0.15629832446575165, "learning_rate": 1e-05, "loss": 0.0188, "step": 758900 }, { "epoch": 0.00759, "grad_norm": 0.19027535617351532, "learning_rate": 1e-05, "loss": 0.0192, "step": 759000 }, { "epoch": 0.007591, "grad_norm": 0.1497412919998169, "learning_rate": 1e-05, "loss": 0.0189, "step": 759100 }, { "epoch": 0.007592, "grad_norm": 0.2274346947669983, "learning_rate": 1e-05, "loss": 0.0196, "step": 759200 }, { "epoch": 0.007593, "grad_norm": 0.13535794615745544, "learning_rate": 1e-05, "loss": 0.0196, "step": 759300 }, { "epoch": 0.007594, "grad_norm": 0.20375370979309082, "learning_rate": 1e-05, "loss": 0.0192, "step": 759400 }, { "epoch": 0.007595, "grad_norm": 0.2294812649488449, "learning_rate": 1e-05, "loss": 0.0189, "step": 759500 }, { "epoch": 0.007596, "grad_norm": 0.14942452311515808, "learning_rate": 1e-05, "loss": 0.0199, "step": 759600 }, { "epoch": 0.007597, "grad_norm": 0.16094975173473358, "learning_rate": 1e-05, "loss": 0.0192, "step": 759700 }, { "epoch": 0.007598, "grad_norm": 0.13225747644901276, "learning_rate": 1e-05, "loss": 0.0191, "step": 759800 }, { "epoch": 0.007599, "grad_norm": 0.15132230520248413, "learning_rate": 1e-05, "loss": 0.0195, "step": 759900 }, { "epoch": 0.0076, "grad_norm": 0.19544145464897156, "learning_rate": 1e-05, "loss": 0.019, "step": 760000 }, { "epoch": 0.0076, "eval_loss": 0.01687508635222912, "eval_runtime": 169.7863, "eval_samples_per_second": 294.488, "eval_steps_per_second": 18.405, "step": 760000 }, { "epoch": 0.007601, "grad_norm": 0.20416928827762604, "learning_rate": 1e-05, "loss": 0.0196, "step": 760100 }, { "epoch": 0.007602, "grad_norm": 0.18158312141895294, "learning_rate": 1e-05, "loss": 0.0194, "step": 760200 }, { "epoch": 0.007603, "grad_norm": 0.15977609157562256, "learning_rate": 1e-05, "loss": 0.0188, "step": 760300 }, { "epoch": 0.007604, "grad_norm": 0.179363414645195, "learning_rate": 1e-05, "loss": 0.0196, "step": 760400 }, { "epoch": 0.007605, "grad_norm": 0.2004944533109665, "learning_rate": 1e-05, "loss": 0.0191, "step": 760500 }, { "epoch": 0.007606, "grad_norm": 0.26188185811042786, "learning_rate": 1e-05, "loss": 0.0195, "step": 760600 }, { "epoch": 0.007607, "grad_norm": 0.17463430762290955, "learning_rate": 1e-05, "loss": 0.0193, "step": 760700 }, { "epoch": 0.007608, "grad_norm": 0.1375441998243332, "learning_rate": 1e-05, "loss": 0.0192, "step": 760800 }, { "epoch": 0.007609, "grad_norm": 0.12228087335824966, "learning_rate": 1e-05, "loss": 0.0191, "step": 760900 }, { "epoch": 0.00761, "grad_norm": 0.1483268141746521, "learning_rate": 1e-05, "loss": 0.0197, "step": 761000 }, { "epoch": 0.007611, "grad_norm": 0.1397564858198166, "learning_rate": 1e-05, "loss": 0.019, "step": 761100 }, { "epoch": 0.007612, "grad_norm": 0.16720382869243622, "learning_rate": 1e-05, "loss": 0.0192, "step": 761200 }, { "epoch": 0.007613, "grad_norm": 0.21778661012649536, "learning_rate": 1e-05, "loss": 0.0192, "step": 761300 }, { "epoch": 0.007614, "grad_norm": 0.16060911118984222, "learning_rate": 1e-05, "loss": 0.0196, "step": 761400 }, { "epoch": 0.007615, "grad_norm": 0.14518219232559204, "learning_rate": 1e-05, "loss": 0.0192, "step": 761500 }, { "epoch": 0.007616, "grad_norm": 0.18174390494823456, "learning_rate": 1e-05, "loss": 0.0195, "step": 761600 }, { "epoch": 0.007617, "grad_norm": 0.14272163808345795, "learning_rate": 1e-05, "loss": 0.0186, "step": 761700 }, { "epoch": 0.007618, "grad_norm": 0.14442454278469086, "learning_rate": 1e-05, "loss": 0.0191, "step": 761800 }, { "epoch": 0.007619, "grad_norm": 0.2247307002544403, "learning_rate": 1e-05, "loss": 0.0193, "step": 761900 }, { "epoch": 0.00762, "grad_norm": 0.1384669542312622, "learning_rate": 1e-05, "loss": 0.0193, "step": 762000 }, { "epoch": 0.007621, "grad_norm": 0.17374339699745178, "learning_rate": 1e-05, "loss": 0.0192, "step": 762100 }, { "epoch": 0.007622, "grad_norm": 0.1539120227098465, "learning_rate": 1e-05, "loss": 0.0188, "step": 762200 }, { "epoch": 0.007623, "grad_norm": 0.20428675413131714, "learning_rate": 1e-05, "loss": 0.0194, "step": 762300 }, { "epoch": 0.007624, "grad_norm": 0.13455288112163544, "learning_rate": 1e-05, "loss": 0.0195, "step": 762400 }, { "epoch": 0.007625, "grad_norm": 0.16280029714107513, "learning_rate": 1e-05, "loss": 0.0197, "step": 762500 }, { "epoch": 0.007626, "grad_norm": 0.1330794245004654, "learning_rate": 1e-05, "loss": 0.0191, "step": 762600 }, { "epoch": 0.007627, "grad_norm": 0.18239878118038177, "learning_rate": 1e-05, "loss": 0.0195, "step": 762700 }, { "epoch": 0.007628, "grad_norm": 0.14082659780979156, "learning_rate": 1e-05, "loss": 0.0194, "step": 762800 }, { "epoch": 0.007629, "grad_norm": 0.17850400507450104, "learning_rate": 1e-05, "loss": 0.0191, "step": 762900 }, { "epoch": 0.00763, "grad_norm": 0.12613750994205475, "learning_rate": 1e-05, "loss": 0.0193, "step": 763000 }, { "epoch": 0.007631, "grad_norm": 0.16639302670955658, "learning_rate": 1e-05, "loss": 0.0188, "step": 763100 }, { "epoch": 0.007632, "grad_norm": 0.15618892014026642, "learning_rate": 1e-05, "loss": 0.0188, "step": 763200 }, { "epoch": 0.007633, "grad_norm": 0.14400449395179749, "learning_rate": 1e-05, "loss": 0.0185, "step": 763300 }, { "epoch": 0.007634, "grad_norm": 0.135957270860672, "learning_rate": 1e-05, "loss": 0.0191, "step": 763400 }, { "epoch": 0.007635, "grad_norm": 0.19081725180149078, "learning_rate": 1e-05, "loss": 0.0193, "step": 763500 }, { "epoch": 0.007636, "grad_norm": 0.1863856017589569, "learning_rate": 1e-05, "loss": 0.0192, "step": 763600 }, { "epoch": 0.007637, "grad_norm": 0.14803500473499298, "learning_rate": 1e-05, "loss": 0.0195, "step": 763700 }, { "epoch": 0.007638, "grad_norm": 0.1454806625843048, "learning_rate": 1e-05, "loss": 0.0195, "step": 763800 }, { "epoch": 0.007639, "grad_norm": 0.196010559797287, "learning_rate": 1e-05, "loss": 0.0198, "step": 763900 }, { "epoch": 0.00764, "grad_norm": 0.14763329923152924, "learning_rate": 1e-05, "loss": 0.0193, "step": 764000 }, { "epoch": 0.007641, "grad_norm": 0.2513766884803772, "learning_rate": 1e-05, "loss": 0.019, "step": 764100 }, { "epoch": 0.007642, "grad_norm": 0.12726448476314545, "learning_rate": 1e-05, "loss": 0.0187, "step": 764200 }, { "epoch": 0.007643, "grad_norm": 0.12288402020931244, "learning_rate": 1e-05, "loss": 0.019, "step": 764300 }, { "epoch": 0.007644, "grad_norm": 0.191612109541893, "learning_rate": 1e-05, "loss": 0.0191, "step": 764400 }, { "epoch": 0.007645, "grad_norm": 0.16951929032802582, "learning_rate": 1e-05, "loss": 0.0191, "step": 764500 }, { "epoch": 0.007646, "grad_norm": 0.1568640023469925, "learning_rate": 1e-05, "loss": 0.0191, "step": 764600 }, { "epoch": 0.007647, "grad_norm": 0.12636613845825195, "learning_rate": 1e-05, "loss": 0.019, "step": 764700 }, { "epoch": 0.007648, "grad_norm": 0.1316346973180771, "learning_rate": 1e-05, "loss": 0.0194, "step": 764800 }, { "epoch": 0.007649, "grad_norm": 0.16837775707244873, "learning_rate": 1e-05, "loss": 0.0194, "step": 764900 }, { "epoch": 0.00765, "grad_norm": 0.20386070013046265, "learning_rate": 1e-05, "loss": 0.0186, "step": 765000 }, { "epoch": 0.007651, "grad_norm": 0.23314298689365387, "learning_rate": 1e-05, "loss": 0.0193, "step": 765100 }, { "epoch": 0.007652, "grad_norm": 0.12028111517429352, "learning_rate": 1e-05, "loss": 0.0197, "step": 765200 }, { "epoch": 0.007653, "grad_norm": 0.1638604998588562, "learning_rate": 1e-05, "loss": 0.0193, "step": 765300 }, { "epoch": 0.007654, "grad_norm": 0.19719889760017395, "learning_rate": 1e-05, "loss": 0.0192, "step": 765400 }, { "epoch": 0.007655, "grad_norm": 0.18822716176509857, "learning_rate": 1e-05, "loss": 0.0198, "step": 765500 }, { "epoch": 0.007656, "grad_norm": 0.16738039255142212, "learning_rate": 1e-05, "loss": 0.0196, "step": 765600 }, { "epoch": 0.007657, "grad_norm": 0.1662168800830841, "learning_rate": 1e-05, "loss": 0.019, "step": 765700 }, { "epoch": 0.007658, "grad_norm": 0.1857258528470993, "learning_rate": 1e-05, "loss": 0.0192, "step": 765800 }, { "epoch": 0.007659, "grad_norm": 0.1751387119293213, "learning_rate": 1e-05, "loss": 0.0193, "step": 765900 }, { "epoch": 0.00766, "grad_norm": 0.1579889953136444, "learning_rate": 1e-05, "loss": 0.0189, "step": 766000 }, { "epoch": 0.007661, "grad_norm": 0.17023180425167084, "learning_rate": 1e-05, "loss": 0.0195, "step": 766100 }, { "epoch": 0.007662, "grad_norm": 0.1623077541589737, "learning_rate": 1e-05, "loss": 0.0191, "step": 766200 }, { "epoch": 0.007663, "grad_norm": 0.1969958394765854, "learning_rate": 1e-05, "loss": 0.0188, "step": 766300 }, { "epoch": 0.007664, "grad_norm": 0.1487826406955719, "learning_rate": 1e-05, "loss": 0.0194, "step": 766400 }, { "epoch": 0.007665, "grad_norm": 0.13180209696292877, "learning_rate": 1e-05, "loss": 0.0191, "step": 766500 }, { "epoch": 0.007666, "grad_norm": 0.1861986219882965, "learning_rate": 1e-05, "loss": 0.0194, "step": 766600 }, { "epoch": 0.007667, "grad_norm": 0.1510457843542099, "learning_rate": 1e-05, "loss": 0.0192, "step": 766700 }, { "epoch": 0.007668, "grad_norm": 0.21203969419002533, "learning_rate": 1e-05, "loss": 0.0193, "step": 766800 }, { "epoch": 0.007669, "grad_norm": 0.13817881047725677, "learning_rate": 1e-05, "loss": 0.019, "step": 766900 }, { "epoch": 0.00767, "grad_norm": 0.19206270575523376, "learning_rate": 1e-05, "loss": 0.019, "step": 767000 }, { "epoch": 0.007671, "grad_norm": 0.19194109737873077, "learning_rate": 1e-05, "loss": 0.0192, "step": 767100 }, { "epoch": 0.007672, "grad_norm": 0.2358757108449936, "learning_rate": 1e-05, "loss": 0.0196, "step": 767200 }, { "epoch": 0.007673, "grad_norm": 0.3321593999862671, "learning_rate": 1e-05, "loss": 0.0194, "step": 767300 }, { "epoch": 0.007674, "grad_norm": 0.1187981516122818, "learning_rate": 1e-05, "loss": 0.0194, "step": 767400 }, { "epoch": 0.007675, "grad_norm": 0.1927706003189087, "learning_rate": 1e-05, "loss": 0.0192, "step": 767500 }, { "epoch": 0.007676, "grad_norm": 0.1759009212255478, "learning_rate": 1e-05, "loss": 0.0189, "step": 767600 }, { "epoch": 0.007677, "grad_norm": 0.1479274332523346, "learning_rate": 1e-05, "loss": 0.0192, "step": 767700 }, { "epoch": 0.007678, "grad_norm": 0.16502007842063904, "learning_rate": 1e-05, "loss": 0.0194, "step": 767800 }, { "epoch": 0.007679, "grad_norm": 0.20017340779304504, "learning_rate": 1e-05, "loss": 0.019, "step": 767900 }, { "epoch": 0.00768, "grad_norm": 0.17978838086128235, "learning_rate": 1e-05, "loss": 0.0192, "step": 768000 }, { "epoch": 0.007681, "grad_norm": 0.20914611220359802, "learning_rate": 1e-05, "loss": 0.019, "step": 768100 }, { "epoch": 0.007682, "grad_norm": 0.14184428751468658, "learning_rate": 1e-05, "loss": 0.0189, "step": 768200 }, { "epoch": 0.007683, "grad_norm": 0.18127766251564026, "learning_rate": 1e-05, "loss": 0.0194, "step": 768300 }, { "epoch": 0.007684, "grad_norm": 0.1671641767024994, "learning_rate": 1e-05, "loss": 0.0187, "step": 768400 }, { "epoch": 0.007685, "grad_norm": 0.20379842817783356, "learning_rate": 1e-05, "loss": 0.019, "step": 768500 }, { "epoch": 0.007686, "grad_norm": 0.1682528257369995, "learning_rate": 1e-05, "loss": 0.019, "step": 768600 }, { "epoch": 0.007687, "grad_norm": 0.15215793251991272, "learning_rate": 1e-05, "loss": 0.0195, "step": 768700 }, { "epoch": 0.007688, "grad_norm": 0.15825822949409485, "learning_rate": 1e-05, "loss": 0.0192, "step": 768800 }, { "epoch": 0.007689, "grad_norm": 0.17986734211444855, "learning_rate": 1e-05, "loss": 0.0194, "step": 768900 }, { "epoch": 0.00769, "grad_norm": 0.16894644498825073, "learning_rate": 1e-05, "loss": 0.0191, "step": 769000 }, { "epoch": 0.007691, "grad_norm": 0.19438080489635468, "learning_rate": 1e-05, "loss": 0.0192, "step": 769100 }, { "epoch": 0.007692, "grad_norm": 0.1799708753824234, "learning_rate": 1e-05, "loss": 0.0196, "step": 769200 }, { "epoch": 0.007693, "grad_norm": 0.12698949873447418, "learning_rate": 1e-05, "loss": 0.0191, "step": 769300 }, { "epoch": 0.007694, "grad_norm": 0.12478208541870117, "learning_rate": 1e-05, "loss": 0.019, "step": 769400 }, { "epoch": 0.007695, "grad_norm": 0.18011878430843353, "learning_rate": 1e-05, "loss": 0.0192, "step": 769500 }, { "epoch": 0.007696, "grad_norm": 0.20886404812335968, "learning_rate": 1e-05, "loss": 0.0191, "step": 769600 }, { "epoch": 0.007697, "grad_norm": 0.18504011631011963, "learning_rate": 1e-05, "loss": 0.0191, "step": 769700 }, { "epoch": 0.007698, "grad_norm": 0.12865211069583893, "learning_rate": 1e-05, "loss": 0.0191, "step": 769800 }, { "epoch": 0.007699, "grad_norm": 0.13659629225730896, "learning_rate": 1e-05, "loss": 0.0188, "step": 769900 }, { "epoch": 0.0077, "grad_norm": 0.14464502036571503, "learning_rate": 1e-05, "loss": 0.0188, "step": 770000 }, { "epoch": 0.007701, "grad_norm": 0.13151928782463074, "learning_rate": 1e-05, "loss": 0.0189, "step": 770100 }, { "epoch": 0.007702, "grad_norm": 0.15464268624782562, "learning_rate": 1e-05, "loss": 0.0189, "step": 770200 }, { "epoch": 0.007703, "grad_norm": 0.17920705676078796, "learning_rate": 1e-05, "loss": 0.0195, "step": 770300 }, { "epoch": 0.007704, "grad_norm": 0.13809572160243988, "learning_rate": 1e-05, "loss": 0.0191, "step": 770400 }, { "epoch": 0.007705, "grad_norm": 0.13465730845928192, "learning_rate": 1e-05, "loss": 0.0189, "step": 770500 }, { "epoch": 0.007706, "grad_norm": 0.14541056752204895, "learning_rate": 1e-05, "loss": 0.0192, "step": 770600 }, { "epoch": 0.007707, "grad_norm": 0.17808130383491516, "learning_rate": 1e-05, "loss": 0.0192, "step": 770700 }, { "epoch": 0.007708, "grad_norm": 0.13139736652374268, "learning_rate": 1e-05, "loss": 0.0193, "step": 770800 }, { "epoch": 0.007709, "grad_norm": 0.14232398569583893, "learning_rate": 1e-05, "loss": 0.0198, "step": 770900 }, { "epoch": 0.00771, "grad_norm": 0.10501966625452042, "learning_rate": 1e-05, "loss": 0.0191, "step": 771000 }, { "epoch": 0.007711, "grad_norm": 0.24385398626327515, "learning_rate": 1e-05, "loss": 0.0195, "step": 771100 }, { "epoch": 0.007712, "grad_norm": 0.1781701147556305, "learning_rate": 1e-05, "loss": 0.0192, "step": 771200 }, { "epoch": 0.007713, "grad_norm": 0.19481968879699707, "learning_rate": 1e-05, "loss": 0.0194, "step": 771300 }, { "epoch": 0.007714, "grad_norm": 0.15036515891551971, "learning_rate": 1e-05, "loss": 0.0198, "step": 771400 }, { "epoch": 0.007715, "grad_norm": 0.131203755736351, "learning_rate": 1e-05, "loss": 0.0191, "step": 771500 }, { "epoch": 0.007716, "grad_norm": 0.16685043275356293, "learning_rate": 1e-05, "loss": 0.0192, "step": 771600 }, { "epoch": 0.007717, "grad_norm": 0.12114625424146652, "learning_rate": 1e-05, "loss": 0.0191, "step": 771700 }, { "epoch": 0.007718, "grad_norm": 0.17540471255779266, "learning_rate": 1e-05, "loss": 0.0198, "step": 771800 }, { "epoch": 0.007719, "grad_norm": 0.1836840659379959, "learning_rate": 1e-05, "loss": 0.019, "step": 771900 }, { "epoch": 0.00772, "grad_norm": 0.1707053929567337, "learning_rate": 1e-05, "loss": 0.0195, "step": 772000 }, { "epoch": 0.007721, "grad_norm": 0.22354356944561005, "learning_rate": 1e-05, "loss": 0.0195, "step": 772100 }, { "epoch": 0.007722, "grad_norm": 0.12224312126636505, "learning_rate": 1e-05, "loss": 0.0192, "step": 772200 }, { "epoch": 0.007723, "grad_norm": 0.18118269741535187, "learning_rate": 1e-05, "loss": 0.0188, "step": 772300 }, { "epoch": 0.007724, "grad_norm": 0.16255128383636475, "learning_rate": 1e-05, "loss": 0.0191, "step": 772400 }, { "epoch": 0.007725, "grad_norm": 0.1407899260520935, "learning_rate": 1e-05, "loss": 0.0188, "step": 772500 }, { "epoch": 0.007726, "grad_norm": 0.09548201411962509, "learning_rate": 1e-05, "loss": 0.0196, "step": 772600 }, { "epoch": 0.007727, "grad_norm": 0.15033653378486633, "learning_rate": 1e-05, "loss": 0.0191, "step": 772700 }, { "epoch": 0.007728, "grad_norm": 0.17861410975456238, "learning_rate": 1e-05, "loss": 0.0195, "step": 772800 }, { "epoch": 0.007729, "grad_norm": 0.17420199513435364, "learning_rate": 1e-05, "loss": 0.0191, "step": 772900 }, { "epoch": 0.00773, "grad_norm": 0.13140970468521118, "learning_rate": 1e-05, "loss": 0.0191, "step": 773000 }, { "epoch": 0.007731, "grad_norm": 0.182146355509758, "learning_rate": 1e-05, "loss": 0.0191, "step": 773100 }, { "epoch": 0.007732, "grad_norm": 0.15108874440193176, "learning_rate": 1e-05, "loss": 0.0189, "step": 773200 }, { "epoch": 0.007733, "grad_norm": 0.15893672406673431, "learning_rate": 1e-05, "loss": 0.019, "step": 773300 }, { "epoch": 0.007734, "grad_norm": 0.20988349616527557, "learning_rate": 1e-05, "loss": 0.0192, "step": 773400 }, { "epoch": 0.007735, "grad_norm": 0.16981709003448486, "learning_rate": 1e-05, "loss": 0.019, "step": 773500 }, { "epoch": 0.007736, "grad_norm": 0.11821989715099335, "learning_rate": 1e-05, "loss": 0.0194, "step": 773600 }, { "epoch": 0.007737, "grad_norm": 0.15401580929756165, "learning_rate": 1e-05, "loss": 0.0195, "step": 773700 }, { "epoch": 0.007738, "grad_norm": 0.11319807916879654, "learning_rate": 1e-05, "loss": 0.019, "step": 773800 }, { "epoch": 0.007739, "grad_norm": 0.15672136843204498, "learning_rate": 1e-05, "loss": 0.0187, "step": 773900 }, { "epoch": 0.00774, "grad_norm": 0.14892567694187164, "learning_rate": 1e-05, "loss": 0.0193, "step": 774000 }, { "epoch": 0.007741, "grad_norm": 0.17214448750019073, "learning_rate": 1e-05, "loss": 0.0189, "step": 774100 }, { "epoch": 0.007742, "grad_norm": 0.14465153217315674, "learning_rate": 1e-05, "loss": 0.0193, "step": 774200 }, { "epoch": 0.007743, "grad_norm": 0.13860803842544556, "learning_rate": 1e-05, "loss": 0.0194, "step": 774300 }, { "epoch": 0.007744, "grad_norm": 0.23131565749645233, "learning_rate": 1e-05, "loss": 0.0192, "step": 774400 }, { "epoch": 0.007745, "grad_norm": 0.13765977323055267, "learning_rate": 1e-05, "loss": 0.0191, "step": 774500 }, { "epoch": 0.007746, "grad_norm": 0.19227512180805206, "learning_rate": 1e-05, "loss": 0.0196, "step": 774600 }, { "epoch": 0.007747, "grad_norm": 0.21498988568782806, "learning_rate": 1e-05, "loss": 0.0191, "step": 774700 }, { "epoch": 0.007748, "grad_norm": 0.15117239952087402, "learning_rate": 1e-05, "loss": 0.0195, "step": 774800 }, { "epoch": 0.007749, "grad_norm": 0.14472264051437378, "learning_rate": 1e-05, "loss": 0.0195, "step": 774900 }, { "epoch": 0.00775, "grad_norm": 0.16918466985225677, "learning_rate": 1e-05, "loss": 0.0191, "step": 775000 }, { "epoch": 0.007751, "grad_norm": 0.14108990132808685, "learning_rate": 1e-05, "loss": 0.0194, "step": 775100 }, { "epoch": 0.007752, "grad_norm": 0.1720188558101654, "learning_rate": 1e-05, "loss": 0.0193, "step": 775200 }, { "epoch": 0.007753, "grad_norm": 0.162660613656044, "learning_rate": 1e-05, "loss": 0.0192, "step": 775300 }, { "epoch": 0.007754, "grad_norm": 0.1842542290687561, "learning_rate": 1e-05, "loss": 0.0192, "step": 775400 }, { "epoch": 0.007755, "grad_norm": 0.12252360582351685, "learning_rate": 1e-05, "loss": 0.0193, "step": 775500 }, { "epoch": 0.007756, "grad_norm": 0.13150958716869354, "learning_rate": 1e-05, "loss": 0.0193, "step": 775600 }, { "epoch": 0.007757, "grad_norm": 0.14490200579166412, "learning_rate": 1e-05, "loss": 0.0191, "step": 775700 }, { "epoch": 0.007758, "grad_norm": 0.19483418762683868, "learning_rate": 1e-05, "loss": 0.0198, "step": 775800 }, { "epoch": 0.007759, "grad_norm": 0.17516465485095978, "learning_rate": 1e-05, "loss": 0.0188, "step": 775900 }, { "epoch": 0.00776, "grad_norm": 0.14340746402740479, "learning_rate": 1e-05, "loss": 0.0189, "step": 776000 }, { "epoch": 0.007761, "grad_norm": 0.1525442749261856, "learning_rate": 1e-05, "loss": 0.0195, "step": 776100 }, { "epoch": 0.007762, "grad_norm": 0.15910929441452026, "learning_rate": 1e-05, "loss": 0.019, "step": 776200 }, { "epoch": 0.007763, "grad_norm": 0.1156257912516594, "learning_rate": 1e-05, "loss": 0.0192, "step": 776300 }, { "epoch": 0.007764, "grad_norm": 0.12039706856012344, "learning_rate": 1e-05, "loss": 0.0192, "step": 776400 }, { "epoch": 0.007765, "grad_norm": 0.11445718258619308, "learning_rate": 1e-05, "loss": 0.0195, "step": 776500 }, { "epoch": 0.007766, "grad_norm": 0.24225878715515137, "learning_rate": 1e-05, "loss": 0.0193, "step": 776600 }, { "epoch": 0.007767, "grad_norm": 0.15026919543743134, "learning_rate": 1e-05, "loss": 0.0186, "step": 776700 }, { "epoch": 0.007768, "grad_norm": 0.20185576379299164, "learning_rate": 1e-05, "loss": 0.0197, "step": 776800 }, { "epoch": 0.007769, "grad_norm": 0.1543840616941452, "learning_rate": 1e-05, "loss": 0.0188, "step": 776900 }, { "epoch": 0.00777, "grad_norm": 0.14705035090446472, "learning_rate": 1e-05, "loss": 0.0186, "step": 777000 }, { "epoch": 0.007771, "grad_norm": 0.1497599184513092, "learning_rate": 1e-05, "loss": 0.0191, "step": 777100 }, { "epoch": 0.007772, "grad_norm": 0.1933748573064804, "learning_rate": 1e-05, "loss": 0.0184, "step": 777200 }, { "epoch": 0.007773, "grad_norm": 0.18017393350601196, "learning_rate": 1e-05, "loss": 0.0193, "step": 777300 }, { "epoch": 0.007774, "grad_norm": 0.15358735620975494, "learning_rate": 1e-05, "loss": 0.019, "step": 777400 }, { "epoch": 0.007775, "grad_norm": 0.14888179302215576, "learning_rate": 1e-05, "loss": 0.0187, "step": 777500 }, { "epoch": 0.007776, "grad_norm": 0.21946604549884796, "learning_rate": 1e-05, "loss": 0.0191, "step": 777600 }, { "epoch": 0.007777, "grad_norm": 0.12975665926933289, "learning_rate": 1e-05, "loss": 0.0187, "step": 777700 }, { "epoch": 0.007778, "grad_norm": 0.2429368942975998, "learning_rate": 1e-05, "loss": 0.0192, "step": 777800 }, { "epoch": 0.007779, "grad_norm": 0.15013892948627472, "learning_rate": 1e-05, "loss": 0.0192, "step": 777900 }, { "epoch": 0.00778, "grad_norm": 0.13916032016277313, "learning_rate": 1e-05, "loss": 0.0187, "step": 778000 }, { "epoch": 0.007781, "grad_norm": 0.18052484095096588, "learning_rate": 1e-05, "loss": 0.019, "step": 778100 }, { "epoch": 0.007782, "grad_norm": 0.1286340206861496, "learning_rate": 1e-05, "loss": 0.0198, "step": 778200 }, { "epoch": 0.007783, "grad_norm": 0.1698673963546753, "learning_rate": 1e-05, "loss": 0.0198, "step": 778300 }, { "epoch": 0.007784, "grad_norm": 0.1680203527212143, "learning_rate": 1e-05, "loss": 0.019, "step": 778400 }, { "epoch": 0.007785, "grad_norm": 0.20458181202411652, "learning_rate": 1e-05, "loss": 0.0194, "step": 778500 }, { "epoch": 0.007786, "grad_norm": 0.17059606313705444, "learning_rate": 1e-05, "loss": 0.0192, "step": 778600 }, { "epoch": 0.007787, "grad_norm": 0.13362814486026764, "learning_rate": 1e-05, "loss": 0.0194, "step": 778700 }, { "epoch": 0.007788, "grad_norm": 0.1394757777452469, "learning_rate": 1e-05, "loss": 0.0187, "step": 778800 }, { "epoch": 0.007789, "grad_norm": 0.1334112584590912, "learning_rate": 1e-05, "loss": 0.019, "step": 778900 }, { "epoch": 0.00779, "grad_norm": 0.14684990048408508, "learning_rate": 1e-05, "loss": 0.0196, "step": 779000 }, { "epoch": 0.007791, "grad_norm": 0.18989701569080353, "learning_rate": 1e-05, "loss": 0.0192, "step": 779100 }, { "epoch": 0.007792, "grad_norm": 0.16801637411117554, "learning_rate": 1e-05, "loss": 0.0189, "step": 779200 }, { "epoch": 0.007793, "grad_norm": 0.16262203454971313, "learning_rate": 1e-05, "loss": 0.0195, "step": 779300 }, { "epoch": 0.007794, "grad_norm": 0.15845516324043274, "learning_rate": 1e-05, "loss": 0.019, "step": 779400 }, { "epoch": 0.007795, "grad_norm": 0.15233339369297028, "learning_rate": 1e-05, "loss": 0.0191, "step": 779500 }, { "epoch": 0.007796, "grad_norm": 0.134091317653656, "learning_rate": 1e-05, "loss": 0.0189, "step": 779600 }, { "epoch": 0.007797, "grad_norm": 0.13219298422336578, "learning_rate": 1e-05, "loss": 0.0194, "step": 779700 }, { "epoch": 0.007798, "grad_norm": 0.14747296273708344, "learning_rate": 1e-05, "loss": 0.0191, "step": 779800 }, { "epoch": 0.007799, "grad_norm": 0.14312651753425598, "learning_rate": 1e-05, "loss": 0.0193, "step": 779900 }, { "epoch": 0.0078, "grad_norm": 0.14848719537258148, "learning_rate": 1e-05, "loss": 0.0191, "step": 780000 }, { "epoch": 0.0078, "eval_loss": 0.016961172223091125, "eval_runtime": 191.5947, "eval_samples_per_second": 260.968, "eval_steps_per_second": 16.31, "step": 780000 }, { "epoch": 0.007801, "grad_norm": 0.13870656490325928, "learning_rate": 1e-05, "loss": 0.0189, "step": 780100 }, { "epoch": 0.007802, "grad_norm": 0.17286108434200287, "learning_rate": 1e-05, "loss": 0.0188, "step": 780200 }, { "epoch": 0.007803, "grad_norm": 0.11970204859972, "learning_rate": 1e-05, "loss": 0.019, "step": 780300 }, { "epoch": 0.007804, "grad_norm": 0.13199782371520996, "learning_rate": 1e-05, "loss": 0.0186, "step": 780400 }, { "epoch": 0.007805, "grad_norm": 0.18632467091083527, "learning_rate": 1e-05, "loss": 0.0197, "step": 780500 }, { "epoch": 0.007806, "grad_norm": 0.14548036456108093, "learning_rate": 1e-05, "loss": 0.0193, "step": 780600 }, { "epoch": 0.007807, "grad_norm": 0.17405878007411957, "learning_rate": 1e-05, "loss": 0.0191, "step": 780700 }, { "epoch": 0.007808, "grad_norm": 0.19417768716812134, "learning_rate": 1e-05, "loss": 0.0187, "step": 780800 }, { "epoch": 0.007809, "grad_norm": 0.22058361768722534, "learning_rate": 1e-05, "loss": 0.0187, "step": 780900 }, { "epoch": 0.00781, "grad_norm": 0.1778315305709839, "learning_rate": 1e-05, "loss": 0.0195, "step": 781000 }, { "epoch": 0.007811, "grad_norm": 0.15285883843898773, "learning_rate": 1e-05, "loss": 0.0195, "step": 781100 }, { "epoch": 0.007812, "grad_norm": 0.14776761829853058, "learning_rate": 1e-05, "loss": 0.0189, "step": 781200 }, { "epoch": 0.007813, "grad_norm": 0.1866835206747055, "learning_rate": 1e-05, "loss": 0.0189, "step": 781300 }, { "epoch": 0.007814, "grad_norm": 0.1458323448896408, "learning_rate": 1e-05, "loss": 0.019, "step": 781400 }, { "epoch": 0.007815, "grad_norm": 0.21960169076919556, "learning_rate": 1e-05, "loss": 0.019, "step": 781500 }, { "epoch": 0.007816, "grad_norm": 0.13681435585021973, "learning_rate": 1e-05, "loss": 0.0189, "step": 781600 }, { "epoch": 0.007817, "grad_norm": 0.16523325443267822, "learning_rate": 1e-05, "loss": 0.0194, "step": 781700 }, { "epoch": 0.007818, "grad_norm": 0.17462114989757538, "learning_rate": 1e-05, "loss": 0.0189, "step": 781800 }, { "epoch": 0.007819, "grad_norm": 0.15156680345535278, "learning_rate": 1e-05, "loss": 0.0191, "step": 781900 }, { "epoch": 0.00782, "grad_norm": 0.1442713588476181, "learning_rate": 1e-05, "loss": 0.0194, "step": 782000 }, { "epoch": 0.007821, "grad_norm": 0.155539408326149, "learning_rate": 1e-05, "loss": 0.0189, "step": 782100 }, { "epoch": 0.007822, "grad_norm": 0.13141383230686188, "learning_rate": 1e-05, "loss": 0.0195, "step": 782200 }, { "epoch": 0.007823, "grad_norm": 0.1807076632976532, "learning_rate": 1e-05, "loss": 0.0195, "step": 782300 }, { "epoch": 0.007824, "grad_norm": 0.12485107779502869, "learning_rate": 1e-05, "loss": 0.0189, "step": 782400 }, { "epoch": 0.007825, "grad_norm": 0.1729000061750412, "learning_rate": 1e-05, "loss": 0.0189, "step": 782500 }, { "epoch": 0.007826, "grad_norm": 0.1581580489873886, "learning_rate": 1e-05, "loss": 0.0187, "step": 782600 }, { "epoch": 0.007827, "grad_norm": 0.142274409532547, "learning_rate": 1e-05, "loss": 0.0193, "step": 782700 }, { "epoch": 0.007828, "grad_norm": 0.1505553275346756, "learning_rate": 1e-05, "loss": 0.0191, "step": 782800 }, { "epoch": 0.007829, "grad_norm": 0.18316178023815155, "learning_rate": 1e-05, "loss": 0.0187, "step": 782900 }, { "epoch": 0.00783, "grad_norm": 0.13025732338428497, "learning_rate": 1e-05, "loss": 0.019, "step": 783000 }, { "epoch": 0.007831, "grad_norm": 0.2030618041753769, "learning_rate": 1e-05, "loss": 0.0191, "step": 783100 }, { "epoch": 0.007832, "grad_norm": 0.20868250727653503, "learning_rate": 1e-05, "loss": 0.0189, "step": 783200 }, { "epoch": 0.007833, "grad_norm": 0.16962936520576477, "learning_rate": 1e-05, "loss": 0.0192, "step": 783300 }, { "epoch": 0.007834, "grad_norm": 0.12752941250801086, "learning_rate": 1e-05, "loss": 0.0193, "step": 783400 }, { "epoch": 0.007835, "grad_norm": 0.13670389354228973, "learning_rate": 1e-05, "loss": 0.0189, "step": 783500 }, { "epoch": 0.007836, "grad_norm": 0.15038679540157318, "learning_rate": 1e-05, "loss": 0.0196, "step": 783600 }, { "epoch": 0.007837, "grad_norm": 0.13739995658397675, "learning_rate": 1e-05, "loss": 0.019, "step": 783700 }, { "epoch": 0.007838, "grad_norm": 0.1511637270450592, "learning_rate": 1e-05, "loss": 0.0191, "step": 783800 }, { "epoch": 0.007839, "grad_norm": 0.18843835592269897, "learning_rate": 1e-05, "loss": 0.0191, "step": 783900 }, { "epoch": 0.00784, "grad_norm": 0.19241565465927124, "learning_rate": 1e-05, "loss": 0.0194, "step": 784000 }, { "epoch": 0.007841, "grad_norm": 0.18803279101848602, "learning_rate": 1e-05, "loss": 0.019, "step": 784100 }, { "epoch": 0.007842, "grad_norm": 0.1300446093082428, "learning_rate": 1e-05, "loss": 0.0187, "step": 784200 }, { "epoch": 0.007843, "grad_norm": 0.15666182339191437, "learning_rate": 1e-05, "loss": 0.0189, "step": 784300 }, { "epoch": 0.007844, "grad_norm": 0.14301922917366028, "learning_rate": 1e-05, "loss": 0.0192, "step": 784400 }, { "epoch": 0.007845, "grad_norm": 0.12285074591636658, "learning_rate": 1e-05, "loss": 0.0192, "step": 784500 }, { "epoch": 0.007846, "grad_norm": 0.12937834858894348, "learning_rate": 1e-05, "loss": 0.0193, "step": 784600 }, { "epoch": 0.007847, "grad_norm": 0.1500912308692932, "learning_rate": 1e-05, "loss": 0.0192, "step": 784700 }, { "epoch": 0.007848, "grad_norm": 0.12577147781848907, "learning_rate": 1e-05, "loss": 0.0192, "step": 784800 }, { "epoch": 0.007849, "grad_norm": 0.15595398843288422, "learning_rate": 1e-05, "loss": 0.0186, "step": 784900 }, { "epoch": 0.00785, "grad_norm": 0.11154453456401825, "learning_rate": 1e-05, "loss": 0.0188, "step": 785000 }, { "epoch": 0.007851, "grad_norm": 0.1636892408132553, "learning_rate": 1e-05, "loss": 0.0192, "step": 785100 }, { "epoch": 0.007852, "grad_norm": 0.16422238945960999, "learning_rate": 1e-05, "loss": 0.0187, "step": 785200 }, { "epoch": 0.007853, "grad_norm": 0.1514994353055954, "learning_rate": 1e-05, "loss": 0.0193, "step": 785300 }, { "epoch": 0.007854, "grad_norm": 0.180953711271286, "learning_rate": 1e-05, "loss": 0.0189, "step": 785400 }, { "epoch": 0.007855, "grad_norm": 0.16475751996040344, "learning_rate": 1e-05, "loss": 0.0189, "step": 785500 }, { "epoch": 0.007856, "grad_norm": 0.11637082695960999, "learning_rate": 1e-05, "loss": 0.0191, "step": 785600 }, { "epoch": 0.007857, "grad_norm": 0.15459562838077545, "learning_rate": 1e-05, "loss": 0.0193, "step": 785700 }, { "epoch": 0.007858, "grad_norm": 0.1314704567193985, "learning_rate": 1e-05, "loss": 0.019, "step": 785800 }, { "epoch": 0.007859, "grad_norm": 0.1955498307943344, "learning_rate": 1e-05, "loss": 0.0191, "step": 785900 }, { "epoch": 0.00786, "grad_norm": 0.21593359112739563, "learning_rate": 1e-05, "loss": 0.0191, "step": 786000 }, { "epoch": 0.007861, "grad_norm": 0.12837019562721252, "learning_rate": 1e-05, "loss": 0.0192, "step": 786100 }, { "epoch": 0.007862, "grad_norm": 0.1209622398018837, "learning_rate": 1e-05, "loss": 0.0191, "step": 786200 }, { "epoch": 0.007863, "grad_norm": 0.2766092121601105, "learning_rate": 1e-05, "loss": 0.0192, "step": 786300 }, { "epoch": 0.007864, "grad_norm": 0.1617877036333084, "learning_rate": 1e-05, "loss": 0.019, "step": 786400 }, { "epoch": 0.007865, "grad_norm": 0.16178768873214722, "learning_rate": 1e-05, "loss": 0.0191, "step": 786500 }, { "epoch": 0.007866, "grad_norm": 0.14908047020435333, "learning_rate": 1e-05, "loss": 0.0191, "step": 786600 }, { "epoch": 0.007867, "grad_norm": 0.14643582701683044, "learning_rate": 1e-05, "loss": 0.0187, "step": 786700 }, { "epoch": 0.007868, "grad_norm": 0.09520955383777618, "learning_rate": 1e-05, "loss": 0.0192, "step": 786800 }, { "epoch": 0.007869, "grad_norm": 0.1974208652973175, "learning_rate": 1e-05, "loss": 0.0192, "step": 786900 }, { "epoch": 0.00787, "grad_norm": 0.1971866488456726, "learning_rate": 1e-05, "loss": 0.0194, "step": 787000 }, { "epoch": 0.007871, "grad_norm": 0.12650901079177856, "learning_rate": 1e-05, "loss": 0.0191, "step": 787100 }, { "epoch": 0.007872, "grad_norm": 0.13402599096298218, "learning_rate": 1e-05, "loss": 0.0193, "step": 787200 }, { "epoch": 0.007873, "grad_norm": 0.1609029322862625, "learning_rate": 1e-05, "loss": 0.0187, "step": 787300 }, { "epoch": 0.007874, "grad_norm": 0.16024994850158691, "learning_rate": 1e-05, "loss": 0.0188, "step": 787400 }, { "epoch": 0.007875, "grad_norm": 0.14369820058345795, "learning_rate": 1e-05, "loss": 0.0193, "step": 787500 }, { "epoch": 0.007876, "grad_norm": 0.19654294848442078, "learning_rate": 1e-05, "loss": 0.0191, "step": 787600 }, { "epoch": 0.007877, "grad_norm": 0.11509920656681061, "learning_rate": 1e-05, "loss": 0.0191, "step": 787700 }, { "epoch": 0.007878, "grad_norm": 0.15145647525787354, "learning_rate": 1e-05, "loss": 0.019, "step": 787800 }, { "epoch": 0.007879, "grad_norm": 0.17468975484371185, "learning_rate": 1e-05, "loss": 0.0195, "step": 787900 }, { "epoch": 0.00788, "grad_norm": 0.19280566275119781, "learning_rate": 1e-05, "loss": 0.0189, "step": 788000 }, { "epoch": 0.007881, "grad_norm": 0.16493134200572968, "learning_rate": 1e-05, "loss": 0.0191, "step": 788100 }, { "epoch": 0.007882, "grad_norm": 0.15640677511692047, "learning_rate": 1e-05, "loss": 0.0197, "step": 788200 }, { "epoch": 0.007883, "grad_norm": 0.21278375387191772, "learning_rate": 1e-05, "loss": 0.019, "step": 788300 }, { "epoch": 0.007884, "grad_norm": 0.1189623698592186, "learning_rate": 1e-05, "loss": 0.0192, "step": 788400 }, { "epoch": 0.007885, "grad_norm": 0.12643150985240936, "learning_rate": 1e-05, "loss": 0.0189, "step": 788500 }, { "epoch": 0.007886, "grad_norm": 0.20459546148777008, "learning_rate": 1e-05, "loss": 0.019, "step": 788600 }, { "epoch": 0.007887, "grad_norm": 0.2004060298204422, "learning_rate": 1e-05, "loss": 0.0193, "step": 788700 }, { "epoch": 0.007888, "grad_norm": 0.1919257938861847, "learning_rate": 1e-05, "loss": 0.0189, "step": 788800 }, { "epoch": 0.007889, "grad_norm": 0.14687366783618927, "learning_rate": 1e-05, "loss": 0.0193, "step": 788900 }, { "epoch": 0.00789, "grad_norm": 0.17652662098407745, "learning_rate": 1e-05, "loss": 0.019, "step": 789000 }, { "epoch": 0.007891, "grad_norm": 0.1686694473028183, "learning_rate": 1e-05, "loss": 0.0193, "step": 789100 }, { "epoch": 0.007892, "grad_norm": 0.16850323975086212, "learning_rate": 1e-05, "loss": 0.0192, "step": 789200 }, { "epoch": 0.007893, "grad_norm": 0.15244583785533905, "learning_rate": 1e-05, "loss": 0.0194, "step": 789300 }, { "epoch": 0.007894, "grad_norm": 0.14465788006782532, "learning_rate": 1e-05, "loss": 0.0195, "step": 789400 }, { "epoch": 0.007895, "grad_norm": 0.16371707618236542, "learning_rate": 1e-05, "loss": 0.0197, "step": 789500 }, { "epoch": 0.007896, "grad_norm": 0.13529618084430695, "learning_rate": 1e-05, "loss": 0.0191, "step": 789600 }, { "epoch": 0.007897, "grad_norm": 0.17634594440460205, "learning_rate": 1e-05, "loss": 0.0195, "step": 789700 }, { "epoch": 0.007898, "grad_norm": 0.17581583559513092, "learning_rate": 1e-05, "loss": 0.0193, "step": 789800 }, { "epoch": 0.007899, "grad_norm": 0.12567508220672607, "learning_rate": 1e-05, "loss": 0.0192, "step": 789900 }, { "epoch": 0.0079, "grad_norm": 0.19161003828048706, "learning_rate": 1e-05, "loss": 0.019, "step": 790000 }, { "epoch": 0.007901, "grad_norm": 0.1348656862974167, "learning_rate": 1e-05, "loss": 0.019, "step": 790100 }, { "epoch": 0.007902, "grad_norm": 0.16397812962532043, "learning_rate": 1e-05, "loss": 0.0195, "step": 790200 }, { "epoch": 0.007903, "grad_norm": 0.14411215484142303, "learning_rate": 1e-05, "loss": 0.0189, "step": 790300 }, { "epoch": 0.007904, "grad_norm": 0.16465012729167938, "learning_rate": 1e-05, "loss": 0.0188, "step": 790400 }, { "epoch": 0.007905, "grad_norm": 0.14758546650409698, "learning_rate": 1e-05, "loss": 0.019, "step": 790500 }, { "epoch": 0.007906, "grad_norm": 0.1347510665655136, "learning_rate": 1e-05, "loss": 0.0189, "step": 790600 }, { "epoch": 0.007907, "grad_norm": 0.21390113234519958, "learning_rate": 1e-05, "loss": 0.0189, "step": 790700 }, { "epoch": 0.007908, "grad_norm": 0.19780543446540833, "learning_rate": 1e-05, "loss": 0.0186, "step": 790800 }, { "epoch": 0.007909, "grad_norm": 0.1388365924358368, "learning_rate": 1e-05, "loss": 0.019, "step": 790900 }, { "epoch": 0.00791, "grad_norm": 0.1476580798625946, "learning_rate": 1e-05, "loss": 0.019, "step": 791000 }, { "epoch": 0.007911, "grad_norm": 0.1504552662372589, "learning_rate": 1e-05, "loss": 0.019, "step": 791100 }, { "epoch": 0.007912, "grad_norm": 0.13933618366718292, "learning_rate": 1e-05, "loss": 0.0189, "step": 791200 }, { "epoch": 0.007913, "grad_norm": 0.13164182007312775, "learning_rate": 1e-05, "loss": 0.0194, "step": 791300 }, { "epoch": 0.007914, "grad_norm": 0.12611441314220428, "learning_rate": 1e-05, "loss": 0.0187, "step": 791400 }, { "epoch": 0.007915, "grad_norm": 0.17375929653644562, "learning_rate": 1e-05, "loss": 0.0195, "step": 791500 }, { "epoch": 0.007916, "grad_norm": 0.1332891285419464, "learning_rate": 1e-05, "loss": 0.0193, "step": 791600 }, { "epoch": 0.007917, "grad_norm": 0.17979171872138977, "learning_rate": 1e-05, "loss": 0.0194, "step": 791700 }, { "epoch": 0.007918, "grad_norm": 0.12532204389572144, "learning_rate": 1e-05, "loss": 0.0187, "step": 791800 }, { "epoch": 0.007919, "grad_norm": 0.15350943803787231, "learning_rate": 1e-05, "loss": 0.0189, "step": 791900 }, { "epoch": 0.00792, "grad_norm": 0.14408962428569794, "learning_rate": 1e-05, "loss": 0.0185, "step": 792000 }, { "epoch": 0.007921, "grad_norm": 0.12195149809122086, "learning_rate": 1e-05, "loss": 0.0187, "step": 792100 }, { "epoch": 0.007922, "grad_norm": 0.14996199309825897, "learning_rate": 1e-05, "loss": 0.019, "step": 792200 }, { "epoch": 0.007923, "grad_norm": 0.26222720742225647, "learning_rate": 1e-05, "loss": 0.0189, "step": 792300 }, { "epoch": 0.007924, "grad_norm": 0.17007264494895935, "learning_rate": 1e-05, "loss": 0.0189, "step": 792400 }, { "epoch": 0.007925, "grad_norm": 0.2419803887605667, "learning_rate": 1e-05, "loss": 0.0195, "step": 792500 }, { "epoch": 0.007926, "grad_norm": 0.15008123219013214, "learning_rate": 1e-05, "loss": 0.0188, "step": 792600 }, { "epoch": 0.007927, "grad_norm": 0.179548978805542, "learning_rate": 1e-05, "loss": 0.0195, "step": 792700 }, { "epoch": 0.007928, "grad_norm": 0.1299038827419281, "learning_rate": 1e-05, "loss": 0.0193, "step": 792800 }, { "epoch": 0.007929, "grad_norm": 0.18084610998630524, "learning_rate": 1e-05, "loss": 0.0188, "step": 792900 }, { "epoch": 0.00793, "grad_norm": 0.13461047410964966, "learning_rate": 1e-05, "loss": 0.0193, "step": 793000 }, { "epoch": 0.007931, "grad_norm": 0.12033967673778534, "learning_rate": 1e-05, "loss": 0.0192, "step": 793100 }, { "epoch": 0.007932, "grad_norm": 0.17694814503192902, "learning_rate": 1e-05, "loss": 0.0187, "step": 793200 }, { "epoch": 0.007933, "grad_norm": 0.1396075338125229, "learning_rate": 1e-05, "loss": 0.0188, "step": 793300 }, { "epoch": 0.007934, "grad_norm": 0.156344473361969, "learning_rate": 1e-05, "loss": 0.0189, "step": 793400 }, { "epoch": 0.007935, "grad_norm": 0.1857299655675888, "learning_rate": 1e-05, "loss": 0.0188, "step": 793500 }, { "epoch": 0.007936, "grad_norm": 0.1728232204914093, "learning_rate": 1e-05, "loss": 0.0185, "step": 793600 }, { "epoch": 0.007937, "grad_norm": 0.13686677813529968, "learning_rate": 1e-05, "loss": 0.0189, "step": 793700 }, { "epoch": 0.007938, "grad_norm": 0.14096054434776306, "learning_rate": 1e-05, "loss": 0.019, "step": 793800 }, { "epoch": 0.007939, "grad_norm": 0.17582984268665314, "learning_rate": 1e-05, "loss": 0.019, "step": 793900 }, { "epoch": 0.00794, "grad_norm": 0.15372155606746674, "learning_rate": 1e-05, "loss": 0.0192, "step": 794000 }, { "epoch": 0.007941, "grad_norm": 0.11844019591808319, "learning_rate": 1e-05, "loss": 0.0189, "step": 794100 }, { "epoch": 0.007942, "grad_norm": 0.14624455571174622, "learning_rate": 1e-05, "loss": 0.0189, "step": 794200 }, { "epoch": 0.007943, "grad_norm": 0.1433866322040558, "learning_rate": 1e-05, "loss": 0.0188, "step": 794300 }, { "epoch": 0.007944, "grad_norm": 0.16007980704307556, "learning_rate": 1e-05, "loss": 0.0186, "step": 794400 }, { "epoch": 0.007945, "grad_norm": 0.1731543093919754, "learning_rate": 1e-05, "loss": 0.0194, "step": 794500 }, { "epoch": 0.007946, "grad_norm": 0.11150135099887848, "learning_rate": 1e-05, "loss": 0.0191, "step": 794600 }, { "epoch": 0.007947, "grad_norm": 0.15255261957645416, "learning_rate": 1e-05, "loss": 0.0187, "step": 794700 }, { "epoch": 0.007948, "grad_norm": 0.13685300946235657, "learning_rate": 1e-05, "loss": 0.0192, "step": 794800 }, { "epoch": 0.007949, "grad_norm": 0.112589530646801, "learning_rate": 1e-05, "loss": 0.0186, "step": 794900 }, { "epoch": 0.00795, "grad_norm": 0.17732475697994232, "learning_rate": 1e-05, "loss": 0.0191, "step": 795000 }, { "epoch": 0.007951, "grad_norm": 0.21802911162376404, "learning_rate": 1e-05, "loss": 0.019, "step": 795100 }, { "epoch": 0.007952, "grad_norm": 0.1547027975320816, "learning_rate": 1e-05, "loss": 0.0189, "step": 795200 }, { "epoch": 0.007953, "grad_norm": 0.20404963195323944, "learning_rate": 1e-05, "loss": 0.019, "step": 795300 }, { "epoch": 0.007954, "grad_norm": 0.16553185880184174, "learning_rate": 1e-05, "loss": 0.019, "step": 795400 }, { "epoch": 0.007955, "grad_norm": 0.1135692372918129, "learning_rate": 1e-05, "loss": 0.0191, "step": 795500 }, { "epoch": 0.007956, "grad_norm": 0.15229980647563934, "learning_rate": 1e-05, "loss": 0.019, "step": 795600 }, { "epoch": 0.007957, "grad_norm": 0.12132416665554047, "learning_rate": 1e-05, "loss": 0.0191, "step": 795700 }, { "epoch": 0.007958, "grad_norm": 0.1475445032119751, "learning_rate": 1e-05, "loss": 0.0188, "step": 795800 }, { "epoch": 0.007959, "grad_norm": 0.20538251101970673, "learning_rate": 1e-05, "loss": 0.0191, "step": 795900 }, { "epoch": 0.00796, "grad_norm": 0.20153164863586426, "learning_rate": 1e-05, "loss": 0.019, "step": 796000 }, { "epoch": 0.007961, "grad_norm": 0.17223747074604034, "learning_rate": 1e-05, "loss": 0.0195, "step": 796100 }, { "epoch": 0.007962, "grad_norm": 0.15243108570575714, "learning_rate": 1e-05, "loss": 0.0188, "step": 796200 }, { "epoch": 0.007963, "grad_norm": 0.17482665181159973, "learning_rate": 1e-05, "loss": 0.0186, "step": 796300 }, { "epoch": 0.007964, "grad_norm": 0.15889477729797363, "learning_rate": 1e-05, "loss": 0.0194, "step": 796400 }, { "epoch": 0.007965, "grad_norm": 0.1927558034658432, "learning_rate": 1e-05, "loss": 0.0192, "step": 796500 }, { "epoch": 0.007966, "grad_norm": 0.1383131593465805, "learning_rate": 1e-05, "loss": 0.0187, "step": 796600 }, { "epoch": 0.007967, "grad_norm": 0.16343601047992706, "learning_rate": 1e-05, "loss": 0.0196, "step": 796700 }, { "epoch": 0.007968, "grad_norm": 0.16049370169639587, "learning_rate": 1e-05, "loss": 0.0188, "step": 796800 }, { "epoch": 0.007969, "grad_norm": 0.1897665113210678, "learning_rate": 1e-05, "loss": 0.0185, "step": 796900 }, { "epoch": 0.00797, "grad_norm": 0.18184949457645416, "learning_rate": 1e-05, "loss": 0.0191, "step": 797000 }, { "epoch": 0.007971, "grad_norm": 0.19606700539588928, "learning_rate": 1e-05, "loss": 0.0193, "step": 797100 }, { "epoch": 0.007972, "grad_norm": 0.16576626896858215, "learning_rate": 1e-05, "loss": 0.0189, "step": 797200 }, { "epoch": 0.007973, "grad_norm": 0.1797633320093155, "learning_rate": 1e-05, "loss": 0.019, "step": 797300 }, { "epoch": 0.007974, "grad_norm": 0.2139395922422409, "learning_rate": 1e-05, "loss": 0.019, "step": 797400 }, { "epoch": 0.007975, "grad_norm": 0.1461814045906067, "learning_rate": 1e-05, "loss": 0.0194, "step": 797500 }, { "epoch": 0.007976, "grad_norm": 0.15368422865867615, "learning_rate": 1e-05, "loss": 0.0192, "step": 797600 }, { "epoch": 0.007977, "grad_norm": 0.10884765535593033, "learning_rate": 1e-05, "loss": 0.019, "step": 797700 }, { "epoch": 0.007978, "grad_norm": 0.1550731509923935, "learning_rate": 1e-05, "loss": 0.0188, "step": 797800 }, { "epoch": 0.007979, "grad_norm": 0.17400318384170532, "learning_rate": 1e-05, "loss": 0.019, "step": 797900 }, { "epoch": 0.00798, "grad_norm": 0.2553693354129791, "learning_rate": 1e-05, "loss": 0.0191, "step": 798000 }, { "epoch": 0.007981, "grad_norm": 0.2134876400232315, "learning_rate": 1e-05, "loss": 0.019, "step": 798100 }, { "epoch": 0.007982, "grad_norm": 0.14764373004436493, "learning_rate": 1e-05, "loss": 0.0187, "step": 798200 }, { "epoch": 0.007983, "grad_norm": 0.161677747964859, "learning_rate": 1e-05, "loss": 0.019, "step": 798300 }, { "epoch": 0.007984, "grad_norm": 0.13663513958454132, "learning_rate": 1e-05, "loss": 0.0192, "step": 798400 }, { "epoch": 0.007985, "grad_norm": 0.16835494339466095, "learning_rate": 1e-05, "loss": 0.0188, "step": 798500 }, { "epoch": 0.007986, "grad_norm": 0.12216142565011978, "learning_rate": 1e-05, "loss": 0.019, "step": 798600 }, { "epoch": 0.007987, "grad_norm": 0.17246265709400177, "learning_rate": 1e-05, "loss": 0.0195, "step": 798700 }, { "epoch": 0.007988, "grad_norm": 0.16901002824306488, "learning_rate": 1e-05, "loss": 0.0188, "step": 798800 }, { "epoch": 0.007989, "grad_norm": 0.14592525362968445, "learning_rate": 1e-05, "loss": 0.0193, "step": 798900 }, { "epoch": 0.00799, "grad_norm": 0.2041911482810974, "learning_rate": 1e-05, "loss": 0.019, "step": 799000 }, { "epoch": 0.007991, "grad_norm": 0.1463153213262558, "learning_rate": 1e-05, "loss": 0.019, "step": 799100 }, { "epoch": 0.007992, "grad_norm": 0.11204078793525696, "learning_rate": 1e-05, "loss": 0.019, "step": 799200 }, { "epoch": 0.007993, "grad_norm": 0.1428191363811493, "learning_rate": 1e-05, "loss": 0.0189, "step": 799300 }, { "epoch": 0.007994, "grad_norm": 0.14009995758533478, "learning_rate": 1e-05, "loss": 0.019, "step": 799400 }, { "epoch": 0.007995, "grad_norm": 0.171320840716362, "learning_rate": 1e-05, "loss": 0.019, "step": 799500 }, { "epoch": 0.007996, "grad_norm": 0.1880444586277008, "learning_rate": 1e-05, "loss": 0.0188, "step": 799600 }, { "epoch": 0.007997, "grad_norm": 0.243787944316864, "learning_rate": 1e-05, "loss": 0.0191, "step": 799700 }, { "epoch": 0.007998, "grad_norm": 0.1435822993516922, "learning_rate": 1e-05, "loss": 0.019, "step": 799800 }, { "epoch": 0.007999, "grad_norm": 0.15515878796577454, "learning_rate": 1e-05, "loss": 0.0185, "step": 799900 }, { "epoch": 0.008, "grad_norm": 0.1910889744758606, "learning_rate": 1e-05, "loss": 0.0188, "step": 800000 }, { "epoch": 0.008, "eval_loss": 0.016870884224772453, "eval_runtime": 191.815, "eval_samples_per_second": 260.668, "eval_steps_per_second": 16.292, "step": 800000 }, { "epoch": 0.008001, "grad_norm": 0.166649729013443, "learning_rate": 1e-05, "loss": 0.0183, "step": 800100 }, { "epoch": 0.008002, "grad_norm": 0.15723535418510437, "learning_rate": 1e-05, "loss": 0.0192, "step": 800200 }, { "epoch": 0.008003, "grad_norm": 0.185715913772583, "learning_rate": 1e-05, "loss": 0.0188, "step": 800300 }, { "epoch": 0.008004, "grad_norm": 0.173756405711174, "learning_rate": 1e-05, "loss": 0.0189, "step": 800400 }, { "epoch": 0.008005, "grad_norm": 0.14555978775024414, "learning_rate": 1e-05, "loss": 0.0188, "step": 800500 }, { "epoch": 0.008006, "grad_norm": 0.1507168859243393, "learning_rate": 1e-05, "loss": 0.0191, "step": 800600 }, { "epoch": 0.008007, "grad_norm": 0.18577606976032257, "learning_rate": 1e-05, "loss": 0.0193, "step": 800700 }, { "epoch": 0.008008, "grad_norm": 0.16483266651630402, "learning_rate": 1e-05, "loss": 0.0195, "step": 800800 }, { "epoch": 0.008009, "grad_norm": 0.14782802760601044, "learning_rate": 1e-05, "loss": 0.0192, "step": 800900 }, { "epoch": 0.00801, "grad_norm": 0.13473060727119446, "learning_rate": 1e-05, "loss": 0.019, "step": 801000 }, { "epoch": 0.008011, "grad_norm": 0.1568913608789444, "learning_rate": 1e-05, "loss": 0.0191, "step": 801100 }, { "epoch": 0.008012, "grad_norm": 0.12539027631282806, "learning_rate": 1e-05, "loss": 0.0189, "step": 801200 }, { "epoch": 0.008013, "grad_norm": 0.1278301328420639, "learning_rate": 1e-05, "loss": 0.0186, "step": 801300 }, { "epoch": 0.008014, "grad_norm": 0.21419784426689148, "learning_rate": 1e-05, "loss": 0.019, "step": 801400 }, { "epoch": 0.008015, "grad_norm": 0.14353904128074646, "learning_rate": 1e-05, "loss": 0.0191, "step": 801500 }, { "epoch": 0.008016, "grad_norm": 0.17399436235427856, "learning_rate": 1e-05, "loss": 0.019, "step": 801600 }, { "epoch": 0.008017, "grad_norm": 0.15862420201301575, "learning_rate": 1e-05, "loss": 0.0191, "step": 801700 }, { "epoch": 0.008018, "grad_norm": 0.125766783952713, "learning_rate": 1e-05, "loss": 0.019, "step": 801800 }, { "epoch": 0.008019, "grad_norm": 0.1654219925403595, "learning_rate": 1e-05, "loss": 0.0185, "step": 801900 }, { "epoch": 0.00802, "grad_norm": 0.18725058436393738, "learning_rate": 1e-05, "loss": 0.019, "step": 802000 }, { "epoch": 0.008021, "grad_norm": 0.16987964510917664, "learning_rate": 1e-05, "loss": 0.0196, "step": 802100 }, { "epoch": 0.008022, "grad_norm": 0.141996830701828, "learning_rate": 1e-05, "loss": 0.019, "step": 802200 }, { "epoch": 0.008023, "grad_norm": 0.13804461061954498, "learning_rate": 1e-05, "loss": 0.0193, "step": 802300 }, { "epoch": 0.008024, "grad_norm": 0.17991411685943604, "learning_rate": 1e-05, "loss": 0.0189, "step": 802400 }, { "epoch": 0.008025, "grad_norm": 0.18338808417320251, "learning_rate": 1e-05, "loss": 0.0187, "step": 802500 }, { "epoch": 0.008026, "grad_norm": 0.17473718523979187, "learning_rate": 1e-05, "loss": 0.0189, "step": 802600 }, { "epoch": 0.008027, "grad_norm": 0.1912785768508911, "learning_rate": 1e-05, "loss": 0.0194, "step": 802700 }, { "epoch": 0.008028, "grad_norm": 0.17462711036205292, "learning_rate": 1e-05, "loss": 0.0188, "step": 802800 }, { "epoch": 0.008029, "grad_norm": 0.17501147091388702, "learning_rate": 1e-05, "loss": 0.0191, "step": 802900 }, { "epoch": 0.00803, "grad_norm": 0.18109281361103058, "learning_rate": 1e-05, "loss": 0.0194, "step": 803000 }, { "epoch": 0.008031, "grad_norm": 0.16490377485752106, "learning_rate": 1e-05, "loss": 0.0186, "step": 803100 }, { "epoch": 0.008032, "grad_norm": 0.13522236049175262, "learning_rate": 1e-05, "loss": 0.0194, "step": 803200 }, { "epoch": 0.008033, "grad_norm": 0.1366279572248459, "learning_rate": 1e-05, "loss": 0.0189, "step": 803300 }, { "epoch": 0.008034, "grad_norm": 0.1406523883342743, "learning_rate": 1e-05, "loss": 0.0195, "step": 803400 }, { "epoch": 0.008035, "grad_norm": 0.13559085130691528, "learning_rate": 1e-05, "loss": 0.0188, "step": 803500 }, { "epoch": 0.008036, "grad_norm": 0.14297015964984894, "learning_rate": 1e-05, "loss": 0.0187, "step": 803600 }, { "epoch": 0.008037, "grad_norm": 0.17465466260910034, "learning_rate": 1e-05, "loss": 0.0191, "step": 803700 }, { "epoch": 0.008038, "grad_norm": 0.10313781350851059, "learning_rate": 1e-05, "loss": 0.0191, "step": 803800 }, { "epoch": 0.008039, "grad_norm": 0.1513640284538269, "learning_rate": 1e-05, "loss": 0.0194, "step": 803900 }, { "epoch": 0.00804, "grad_norm": 0.11065026372671127, "learning_rate": 1e-05, "loss": 0.019, "step": 804000 }, { "epoch": 0.008041, "grad_norm": 0.17840419709682465, "learning_rate": 1e-05, "loss": 0.0189, "step": 804100 }, { "epoch": 0.008042, "grad_norm": 0.19647783041000366, "learning_rate": 1e-05, "loss": 0.0191, "step": 804200 }, { "epoch": 0.008043, "grad_norm": 0.18565933406352997, "learning_rate": 1e-05, "loss": 0.0189, "step": 804300 }, { "epoch": 0.008044, "grad_norm": 0.11717821657657623, "learning_rate": 1e-05, "loss": 0.0192, "step": 804400 }, { "epoch": 0.008045, "grad_norm": 0.17626960575580597, "learning_rate": 1e-05, "loss": 0.0187, "step": 804500 }, { "epoch": 0.008046, "grad_norm": 0.1757706105709076, "learning_rate": 1e-05, "loss": 0.0192, "step": 804600 }, { "epoch": 0.008047, "grad_norm": 0.15020988881587982, "learning_rate": 1e-05, "loss": 0.0188, "step": 804700 }, { "epoch": 0.008048, "grad_norm": 0.14481377601623535, "learning_rate": 1e-05, "loss": 0.0188, "step": 804800 }, { "epoch": 0.008049, "grad_norm": 0.17468398809432983, "learning_rate": 1e-05, "loss": 0.0188, "step": 804900 }, { "epoch": 0.00805, "grad_norm": 0.15793798863887787, "learning_rate": 1e-05, "loss": 0.019, "step": 805000 }, { "epoch": 0.008051, "grad_norm": 0.12020246684551239, "learning_rate": 1e-05, "loss": 0.0191, "step": 805100 }, { "epoch": 0.008052, "grad_norm": 0.17434297502040863, "learning_rate": 1e-05, "loss": 0.0188, "step": 805200 }, { "epoch": 0.008053, "grad_norm": 0.16407015919685364, "learning_rate": 1e-05, "loss": 0.0187, "step": 805300 }, { "epoch": 0.008054, "grad_norm": 0.14911803603172302, "learning_rate": 1e-05, "loss": 0.0191, "step": 805400 }, { "epoch": 0.008055, "grad_norm": 0.1893591433763504, "learning_rate": 1e-05, "loss": 0.0187, "step": 805500 }, { "epoch": 0.008056, "grad_norm": 0.11468628793954849, "learning_rate": 1e-05, "loss": 0.0188, "step": 805600 }, { "epoch": 0.008057, "grad_norm": 0.25928497314453125, "learning_rate": 1e-05, "loss": 0.0187, "step": 805700 }, { "epoch": 0.008058, "grad_norm": 0.2366941273212433, "learning_rate": 1e-05, "loss": 0.0184, "step": 805800 }, { "epoch": 0.008059, "grad_norm": 0.19853360950946808, "learning_rate": 1e-05, "loss": 0.0195, "step": 805900 }, { "epoch": 0.00806, "grad_norm": 0.14313505589962006, "learning_rate": 1e-05, "loss": 0.0189, "step": 806000 }, { "epoch": 0.008061, "grad_norm": 0.18395261466503143, "learning_rate": 1e-05, "loss": 0.0193, "step": 806100 }, { "epoch": 0.008062, "grad_norm": 0.1430797427892685, "learning_rate": 1e-05, "loss": 0.0192, "step": 806200 }, { "epoch": 0.008063, "grad_norm": 0.15583771467208862, "learning_rate": 1e-05, "loss": 0.0187, "step": 806300 }, { "epoch": 0.008064, "grad_norm": 0.2166193276643753, "learning_rate": 1e-05, "loss": 0.0193, "step": 806400 }, { "epoch": 0.008065, "grad_norm": 0.14299613237380981, "learning_rate": 1e-05, "loss": 0.019, "step": 806500 }, { "epoch": 0.008066, "grad_norm": 0.13489770889282227, "learning_rate": 1e-05, "loss": 0.0187, "step": 806600 }, { "epoch": 0.008067, "grad_norm": 0.13713708519935608, "learning_rate": 1e-05, "loss": 0.0187, "step": 806700 }, { "epoch": 0.008068, "grad_norm": 0.188313826918602, "learning_rate": 1e-05, "loss": 0.0196, "step": 806800 }, { "epoch": 0.008069, "grad_norm": 0.20709583163261414, "learning_rate": 1e-05, "loss": 0.0191, "step": 806900 }, { "epoch": 0.00807, "grad_norm": 0.15344415605068207, "learning_rate": 1e-05, "loss": 0.0188, "step": 807000 }, { "epoch": 0.008071, "grad_norm": 0.11464856564998627, "learning_rate": 1e-05, "loss": 0.0191, "step": 807100 }, { "epoch": 0.008072, "grad_norm": 0.12867361307144165, "learning_rate": 1e-05, "loss": 0.019, "step": 807200 }, { "epoch": 0.008073, "grad_norm": 0.14141158759593964, "learning_rate": 1e-05, "loss": 0.0184, "step": 807300 }, { "epoch": 0.008074, "grad_norm": 0.14822161197662354, "learning_rate": 1e-05, "loss": 0.0189, "step": 807400 }, { "epoch": 0.008075, "grad_norm": 0.16350199282169342, "learning_rate": 1e-05, "loss": 0.0192, "step": 807500 }, { "epoch": 0.008076, "grad_norm": 0.2075500190258026, "learning_rate": 1e-05, "loss": 0.0189, "step": 807600 }, { "epoch": 0.008077, "grad_norm": 0.14998656511306763, "learning_rate": 1e-05, "loss": 0.0189, "step": 807700 }, { "epoch": 0.008078, "grad_norm": 0.17255273461341858, "learning_rate": 1e-05, "loss": 0.0186, "step": 807800 }, { "epoch": 0.008079, "grad_norm": 0.136601984500885, "learning_rate": 1e-05, "loss": 0.0193, "step": 807900 }, { "epoch": 0.00808, "grad_norm": 0.1486748903989792, "learning_rate": 1e-05, "loss": 0.019, "step": 808000 }, { "epoch": 0.008081, "grad_norm": 0.11780425161123276, "learning_rate": 1e-05, "loss": 0.0193, "step": 808100 }, { "epoch": 0.008082, "grad_norm": 0.14016684889793396, "learning_rate": 1e-05, "loss": 0.019, "step": 808200 }, { "epoch": 0.008083, "grad_norm": 0.22153887152671814, "learning_rate": 1e-05, "loss": 0.0186, "step": 808300 }, { "epoch": 0.008084, "grad_norm": 0.15512695908546448, "learning_rate": 1e-05, "loss": 0.0193, "step": 808400 }, { "epoch": 0.008085, "grad_norm": 0.23694270849227905, "learning_rate": 1e-05, "loss": 0.0192, "step": 808500 }, { "epoch": 0.008086, "grad_norm": 0.15340270102024078, "learning_rate": 1e-05, "loss": 0.0188, "step": 808600 }, { "epoch": 0.008087, "grad_norm": 0.19888682663440704, "learning_rate": 1e-05, "loss": 0.0189, "step": 808700 }, { "epoch": 0.008088, "grad_norm": 0.148932546377182, "learning_rate": 1e-05, "loss": 0.019, "step": 808800 }, { "epoch": 0.008089, "grad_norm": 0.22244815528392792, "learning_rate": 1e-05, "loss": 0.0186, "step": 808900 }, { "epoch": 0.00809, "grad_norm": 0.1406697928905487, "learning_rate": 1e-05, "loss": 0.0189, "step": 809000 }, { "epoch": 0.008091, "grad_norm": 0.1582546830177307, "learning_rate": 1e-05, "loss": 0.0187, "step": 809100 }, { "epoch": 0.008092, "grad_norm": 0.1730857938528061, "learning_rate": 1e-05, "loss": 0.0188, "step": 809200 }, { "epoch": 0.008093, "grad_norm": 0.12735554575920105, "learning_rate": 1e-05, "loss": 0.0186, "step": 809300 }, { "epoch": 0.008094, "grad_norm": 0.155547097325325, "learning_rate": 1e-05, "loss": 0.019, "step": 809400 }, { "epoch": 0.008095, "grad_norm": 0.12213418632745743, "learning_rate": 1e-05, "loss": 0.0192, "step": 809500 }, { "epoch": 0.008096, "grad_norm": 0.17221176624298096, "learning_rate": 1e-05, "loss": 0.0191, "step": 809600 }, { "epoch": 0.008097, "grad_norm": 0.20602627098560333, "learning_rate": 1e-05, "loss": 0.0189, "step": 809700 }, { "epoch": 0.008098, "grad_norm": 0.13218869268894196, "learning_rate": 1e-05, "loss": 0.0187, "step": 809800 }, { "epoch": 0.008099, "grad_norm": 0.15529753267765045, "learning_rate": 1e-05, "loss": 0.0188, "step": 809900 }, { "epoch": 0.0081, "grad_norm": 0.13325783610343933, "learning_rate": 1e-05, "loss": 0.0191, "step": 810000 }, { "epoch": 0.008101, "grad_norm": 0.16968071460723877, "learning_rate": 1e-05, "loss": 0.0192, "step": 810100 }, { "epoch": 0.008102, "grad_norm": 0.2093035727739334, "learning_rate": 1e-05, "loss": 0.0187, "step": 810200 }, { "epoch": 0.008103, "grad_norm": 0.19390128552913666, "learning_rate": 1e-05, "loss": 0.019, "step": 810300 }, { "epoch": 0.008104, "grad_norm": 0.1431635171175003, "learning_rate": 1e-05, "loss": 0.019, "step": 810400 }, { "epoch": 0.008105, "grad_norm": 0.1708240807056427, "learning_rate": 1e-05, "loss": 0.0186, "step": 810500 }, { "epoch": 0.008106, "grad_norm": 0.14884459972381592, "learning_rate": 1e-05, "loss": 0.0188, "step": 810600 }, { "epoch": 0.008107, "grad_norm": 0.12844137847423553, "learning_rate": 1e-05, "loss": 0.019, "step": 810700 }, { "epoch": 0.008108, "grad_norm": 0.2247413992881775, "learning_rate": 1e-05, "loss": 0.0186, "step": 810800 }, { "epoch": 0.008109, "grad_norm": 0.1578207165002823, "learning_rate": 1e-05, "loss": 0.0186, "step": 810900 }, { "epoch": 0.00811, "grad_norm": 0.13466419279575348, "learning_rate": 1e-05, "loss": 0.0195, "step": 811000 }, { "epoch": 0.008111, "grad_norm": 0.12769602239131927, "learning_rate": 1e-05, "loss": 0.0185, "step": 811100 }, { "epoch": 0.008112, "grad_norm": 0.14126582443714142, "learning_rate": 1e-05, "loss": 0.0188, "step": 811200 }, { "epoch": 0.008113, "grad_norm": 0.14771687984466553, "learning_rate": 1e-05, "loss": 0.0183, "step": 811300 }, { "epoch": 0.008114, "grad_norm": 0.16631753742694855, "learning_rate": 1e-05, "loss": 0.0189, "step": 811400 }, { "epoch": 0.008115, "grad_norm": 0.14412035048007965, "learning_rate": 1e-05, "loss": 0.0188, "step": 811500 }, { "epoch": 0.008116, "grad_norm": 0.13190840184688568, "learning_rate": 1e-05, "loss": 0.0191, "step": 811600 }, { "epoch": 0.008117, "grad_norm": 0.13549605011940002, "learning_rate": 1e-05, "loss": 0.0188, "step": 811700 }, { "epoch": 0.008118, "grad_norm": 0.13419833779335022, "learning_rate": 1e-05, "loss": 0.0191, "step": 811800 }, { "epoch": 0.008119, "grad_norm": 0.13320980966091156, "learning_rate": 1e-05, "loss": 0.0189, "step": 811900 }, { "epoch": 0.00812, "grad_norm": 0.21347931027412415, "learning_rate": 1e-05, "loss": 0.019, "step": 812000 }, { "epoch": 0.008121, "grad_norm": 0.17716509103775024, "learning_rate": 1e-05, "loss": 0.0188, "step": 812100 }, { "epoch": 0.008122, "grad_norm": 0.15826262533664703, "learning_rate": 1e-05, "loss": 0.0188, "step": 812200 }, { "epoch": 0.008123, "grad_norm": 0.1587977111339569, "learning_rate": 1e-05, "loss": 0.0185, "step": 812300 }, { "epoch": 0.008124, "grad_norm": 0.16595517098903656, "learning_rate": 1e-05, "loss": 0.0188, "step": 812400 }, { "epoch": 0.008125, "grad_norm": 0.1677248477935791, "learning_rate": 1e-05, "loss": 0.0185, "step": 812500 }, { "epoch": 0.008126, "grad_norm": 0.1950235366821289, "learning_rate": 1e-05, "loss": 0.0187, "step": 812600 }, { "epoch": 0.008127, "grad_norm": 0.18119138479232788, "learning_rate": 1e-05, "loss": 0.0192, "step": 812700 }, { "epoch": 0.008128, "grad_norm": 0.2032155841588974, "learning_rate": 1e-05, "loss": 0.0185, "step": 812800 }, { "epoch": 0.008129, "grad_norm": 0.13209347426891327, "learning_rate": 1e-05, "loss": 0.0189, "step": 812900 }, { "epoch": 0.00813, "grad_norm": 0.20818959176540375, "learning_rate": 1e-05, "loss": 0.0191, "step": 813000 }, { "epoch": 0.008131, "grad_norm": 0.2558955252170563, "learning_rate": 1e-05, "loss": 0.0189, "step": 813100 }, { "epoch": 0.008132, "grad_norm": 0.17180080711841583, "learning_rate": 1e-05, "loss": 0.0188, "step": 813200 }, { "epoch": 0.008133, "grad_norm": 0.12622122466564178, "learning_rate": 1e-05, "loss": 0.019, "step": 813300 }, { "epoch": 0.008134, "grad_norm": 0.14313682913780212, "learning_rate": 1e-05, "loss": 0.0191, "step": 813400 }, { "epoch": 0.008135, "grad_norm": 0.11664015799760818, "learning_rate": 1e-05, "loss": 0.0186, "step": 813500 }, { "epoch": 0.008136, "grad_norm": 0.1777046173810959, "learning_rate": 1e-05, "loss": 0.019, "step": 813600 }, { "epoch": 0.008137, "grad_norm": 0.1404317319393158, "learning_rate": 1e-05, "loss": 0.0191, "step": 813700 }, { "epoch": 0.008138, "grad_norm": 0.134482741355896, "learning_rate": 1e-05, "loss": 0.019, "step": 813800 }, { "epoch": 0.008139, "grad_norm": 0.1325339525938034, "learning_rate": 1e-05, "loss": 0.0187, "step": 813900 }, { "epoch": 0.00814, "grad_norm": 0.17309391498565674, "learning_rate": 1e-05, "loss": 0.0188, "step": 814000 }, { "epoch": 0.008141, "grad_norm": 0.2200092375278473, "learning_rate": 1e-05, "loss": 0.0189, "step": 814100 }, { "epoch": 0.008142, "grad_norm": 0.13401953876018524, "learning_rate": 1e-05, "loss": 0.0186, "step": 814200 }, { "epoch": 0.008143, "grad_norm": 0.16334488987922668, "learning_rate": 1e-05, "loss": 0.0186, "step": 814300 }, { "epoch": 0.008144, "grad_norm": 0.19414757192134857, "learning_rate": 1e-05, "loss": 0.019, "step": 814400 }, { "epoch": 0.008145, "grad_norm": 0.12011253833770752, "learning_rate": 1e-05, "loss": 0.0187, "step": 814500 }, { "epoch": 0.008146, "grad_norm": 0.15319567918777466, "learning_rate": 1e-05, "loss": 0.0191, "step": 814600 }, { "epoch": 0.008147, "grad_norm": 0.1422000229358673, "learning_rate": 1e-05, "loss": 0.0191, "step": 814700 }, { "epoch": 0.008148, "grad_norm": 0.21929800510406494, "learning_rate": 1e-05, "loss": 0.0189, "step": 814800 }, { "epoch": 0.008149, "grad_norm": 0.22845953702926636, "learning_rate": 1e-05, "loss": 0.0186, "step": 814900 }, { "epoch": 0.00815, "grad_norm": 0.1477116495370865, "learning_rate": 1e-05, "loss": 0.0193, "step": 815000 }, { "epoch": 0.008151, "grad_norm": 0.17466679215431213, "learning_rate": 1e-05, "loss": 0.0182, "step": 815100 }, { "epoch": 0.008152, "grad_norm": 0.13259144127368927, "learning_rate": 1e-05, "loss": 0.0186, "step": 815200 }, { "epoch": 0.008153, "grad_norm": 0.25539496541023254, "learning_rate": 1e-05, "loss": 0.019, "step": 815300 }, { "epoch": 0.008154, "grad_norm": 0.17711982131004333, "learning_rate": 1e-05, "loss": 0.0191, "step": 815400 }, { "epoch": 0.008155, "grad_norm": 0.12822818756103516, "learning_rate": 1e-05, "loss": 0.019, "step": 815500 }, { "epoch": 0.008156, "grad_norm": 0.15872366726398468, "learning_rate": 1e-05, "loss": 0.0187, "step": 815600 }, { "epoch": 0.008157, "grad_norm": 0.139593705534935, "learning_rate": 1e-05, "loss": 0.0187, "step": 815700 }, { "epoch": 0.008158, "grad_norm": 0.0993078202009201, "learning_rate": 1e-05, "loss": 0.0187, "step": 815800 }, { "epoch": 0.008159, "grad_norm": 0.15995965898036957, "learning_rate": 1e-05, "loss": 0.0192, "step": 815900 }, { "epoch": 0.00816, "grad_norm": 0.12610113620758057, "learning_rate": 1e-05, "loss": 0.0191, "step": 816000 }, { "epoch": 0.008161, "grad_norm": 0.126889169216156, "learning_rate": 1e-05, "loss": 0.0192, "step": 816100 }, { "epoch": 0.008162, "grad_norm": 0.19867470860481262, "learning_rate": 1e-05, "loss": 0.0185, "step": 816200 }, { "epoch": 0.008163, "grad_norm": 0.14401116967201233, "learning_rate": 1e-05, "loss": 0.0188, "step": 816300 }, { "epoch": 0.008164, "grad_norm": 0.15513625741004944, "learning_rate": 1e-05, "loss": 0.0188, "step": 816400 }, { "epoch": 0.008165, "grad_norm": 0.13005255162715912, "learning_rate": 1e-05, "loss": 0.0191, "step": 816500 }, { "epoch": 0.008166, "grad_norm": 0.11921598762273788, "learning_rate": 1e-05, "loss": 0.0192, "step": 816600 }, { "epoch": 0.008167, "grad_norm": 0.15483610332012177, "learning_rate": 1e-05, "loss": 0.0188, "step": 816700 }, { "epoch": 0.008168, "grad_norm": 0.155282124876976, "learning_rate": 1e-05, "loss": 0.0184, "step": 816800 }, { "epoch": 0.008169, "grad_norm": 0.1769125908613205, "learning_rate": 1e-05, "loss": 0.0189, "step": 816900 }, { "epoch": 0.00817, "grad_norm": 0.14615939557552338, "learning_rate": 1e-05, "loss": 0.0193, "step": 817000 }, { "epoch": 0.008171, "grad_norm": 0.1838274896144867, "learning_rate": 1e-05, "loss": 0.019, "step": 817100 }, { "epoch": 0.008172, "grad_norm": 0.13035179674625397, "learning_rate": 1e-05, "loss": 0.0189, "step": 817200 }, { "epoch": 0.008173, "grad_norm": 0.13187739253044128, "learning_rate": 1e-05, "loss": 0.0189, "step": 817300 }, { "epoch": 0.008174, "grad_norm": 0.12284134328365326, "learning_rate": 1e-05, "loss": 0.019, "step": 817400 }, { "epoch": 0.008175, "grad_norm": 0.14839418232440948, "learning_rate": 1e-05, "loss": 0.0189, "step": 817500 }, { "epoch": 0.008176, "grad_norm": 0.22468788921833038, "learning_rate": 1e-05, "loss": 0.0192, "step": 817600 }, { "epoch": 0.008177, "grad_norm": 0.12902693450450897, "learning_rate": 1e-05, "loss": 0.0189, "step": 817700 }, { "epoch": 0.008178, "grad_norm": 0.11599735915660858, "learning_rate": 1e-05, "loss": 0.0187, "step": 817800 }, { "epoch": 0.008179, "grad_norm": 0.16649819910526276, "learning_rate": 1e-05, "loss": 0.0186, "step": 817900 }, { "epoch": 0.00818, "grad_norm": 0.17167241871356964, "learning_rate": 1e-05, "loss": 0.0186, "step": 818000 }, { "epoch": 0.008181, "grad_norm": 0.14128102362155914, "learning_rate": 1e-05, "loss": 0.0186, "step": 818100 }, { "epoch": 0.008182, "grad_norm": 0.13695640861988068, "learning_rate": 1e-05, "loss": 0.0191, "step": 818200 }, { "epoch": 0.008183, "grad_norm": 0.12654507160186768, "learning_rate": 1e-05, "loss": 0.0189, "step": 818300 }, { "epoch": 0.008184, "grad_norm": 0.15879735350608826, "learning_rate": 1e-05, "loss": 0.0186, "step": 818400 }, { "epoch": 0.008185, "grad_norm": 0.21413612365722656, "learning_rate": 1e-05, "loss": 0.0187, "step": 818500 }, { "epoch": 0.008186, "grad_norm": 0.1363525688648224, "learning_rate": 1e-05, "loss": 0.0191, "step": 818600 }, { "epoch": 0.008187, "grad_norm": 0.1541542261838913, "learning_rate": 1e-05, "loss": 0.0187, "step": 818700 }, { "epoch": 0.008188, "grad_norm": 0.13069643080234528, "learning_rate": 1e-05, "loss": 0.0187, "step": 818800 }, { "epoch": 0.008189, "grad_norm": 0.13579946756362915, "learning_rate": 1e-05, "loss": 0.0187, "step": 818900 }, { "epoch": 0.00819, "grad_norm": 0.11518040299415588, "learning_rate": 1e-05, "loss": 0.0191, "step": 819000 }, { "epoch": 0.008191, "grad_norm": 0.163763165473938, "learning_rate": 1e-05, "loss": 0.0189, "step": 819100 }, { "epoch": 0.008192, "grad_norm": 0.1597258746623993, "learning_rate": 1e-05, "loss": 0.0192, "step": 819200 }, { "epoch": 0.008193, "grad_norm": 0.13204698264598846, "learning_rate": 1e-05, "loss": 0.0189, "step": 819300 }, { "epoch": 0.008194, "grad_norm": 0.19410623610019684, "learning_rate": 1e-05, "loss": 0.0189, "step": 819400 }, { "epoch": 0.008195, "grad_norm": 0.1296689510345459, "learning_rate": 1e-05, "loss": 0.0193, "step": 819500 }, { "epoch": 0.008196, "grad_norm": 0.19016918540000916, "learning_rate": 1e-05, "loss": 0.0188, "step": 819600 }, { "epoch": 0.008197, "grad_norm": 0.13668487966060638, "learning_rate": 1e-05, "loss": 0.0192, "step": 819700 }, { "epoch": 0.008198, "grad_norm": 0.1154247522354126, "learning_rate": 1e-05, "loss": 0.0187, "step": 819800 }, { "epoch": 0.008199, "grad_norm": 0.13306815922260284, "learning_rate": 1e-05, "loss": 0.0188, "step": 819900 }, { "epoch": 0.0082, "grad_norm": 0.10873663425445557, "learning_rate": 1e-05, "loss": 0.0183, "step": 820000 }, { "epoch": 0.0082, "eval_loss": 0.01665344275534153, "eval_runtime": 193.3548, "eval_samples_per_second": 258.592, "eval_steps_per_second": 16.162, "step": 820000 }, { "epoch": 0.008201, "grad_norm": 0.14266209304332733, "learning_rate": 1e-05, "loss": 0.0185, "step": 820100 }, { "epoch": 0.008202, "grad_norm": 0.1451946645975113, "learning_rate": 1e-05, "loss": 0.0192, "step": 820200 }, { "epoch": 0.008203, "grad_norm": 0.22686094045639038, "learning_rate": 1e-05, "loss": 0.0186, "step": 820300 }, { "epoch": 0.008204, "grad_norm": 0.13039565086364746, "learning_rate": 1e-05, "loss": 0.0186, "step": 820400 }, { "epoch": 0.008205, "grad_norm": 0.18803638219833374, "learning_rate": 1e-05, "loss": 0.0189, "step": 820500 }, { "epoch": 0.008206, "grad_norm": 0.21732376515865326, "learning_rate": 1e-05, "loss": 0.0185, "step": 820600 }, { "epoch": 0.008207, "grad_norm": 0.13051149249076843, "learning_rate": 1e-05, "loss": 0.0184, "step": 820700 }, { "epoch": 0.008208, "grad_norm": 0.1780024766921997, "learning_rate": 1e-05, "loss": 0.0186, "step": 820800 }, { "epoch": 0.008209, "grad_norm": 0.1966998279094696, "learning_rate": 1e-05, "loss": 0.0192, "step": 820900 }, { "epoch": 0.00821, "grad_norm": 0.13931238651275635, "learning_rate": 1e-05, "loss": 0.0192, "step": 821000 }, { "epoch": 0.008211, "grad_norm": 0.23690253496170044, "learning_rate": 1e-05, "loss": 0.0184, "step": 821100 }, { "epoch": 0.008212, "grad_norm": 0.11448873579502106, "learning_rate": 1e-05, "loss": 0.0189, "step": 821200 }, { "epoch": 0.008213, "grad_norm": 0.1691114455461502, "learning_rate": 1e-05, "loss": 0.0183, "step": 821300 }, { "epoch": 0.008214, "grad_norm": 0.1652698665857315, "learning_rate": 1e-05, "loss": 0.0189, "step": 821400 }, { "epoch": 0.008215, "grad_norm": 0.18293440341949463, "learning_rate": 1e-05, "loss": 0.0188, "step": 821500 }, { "epoch": 0.008216, "grad_norm": 0.1431853473186493, "learning_rate": 1e-05, "loss": 0.0188, "step": 821600 }, { "epoch": 0.008217, "grad_norm": 0.12381626665592194, "learning_rate": 1e-05, "loss": 0.0183, "step": 821700 }, { "epoch": 0.008218, "grad_norm": 0.1781560480594635, "learning_rate": 1e-05, "loss": 0.0185, "step": 821800 }, { "epoch": 0.008219, "grad_norm": 0.11558887362480164, "learning_rate": 1e-05, "loss": 0.0191, "step": 821900 }, { "epoch": 0.00822, "grad_norm": 0.13685543835163116, "learning_rate": 1e-05, "loss": 0.0181, "step": 822000 }, { "epoch": 0.008221, "grad_norm": 0.1291433721780777, "learning_rate": 1e-05, "loss": 0.0189, "step": 822100 }, { "epoch": 0.008222, "grad_norm": 0.17257541418075562, "learning_rate": 1e-05, "loss": 0.019, "step": 822200 }, { "epoch": 0.008223, "grad_norm": 0.15330171585083008, "learning_rate": 1e-05, "loss": 0.0183, "step": 822300 }, { "epoch": 0.008224, "grad_norm": 0.22303031384944916, "learning_rate": 1e-05, "loss": 0.0186, "step": 822400 }, { "epoch": 0.008225, "grad_norm": 0.14452023804187775, "learning_rate": 1e-05, "loss": 0.0187, "step": 822500 }, { "epoch": 0.008226, "grad_norm": 0.15301157534122467, "learning_rate": 1e-05, "loss": 0.0184, "step": 822600 }, { "epoch": 0.008227, "grad_norm": 0.17850418388843536, "learning_rate": 1e-05, "loss": 0.0186, "step": 822700 }, { "epoch": 0.008228, "grad_norm": 0.1897379755973816, "learning_rate": 1e-05, "loss": 0.0192, "step": 822800 }, { "epoch": 0.008229, "grad_norm": 0.14521723985671997, "learning_rate": 1e-05, "loss": 0.0188, "step": 822900 }, { "epoch": 0.00823, "grad_norm": 0.17404614388942719, "learning_rate": 1e-05, "loss": 0.0188, "step": 823000 }, { "epoch": 0.008231, "grad_norm": 0.12041222304105759, "learning_rate": 1e-05, "loss": 0.0194, "step": 823100 }, { "epoch": 0.008232, "grad_norm": 0.19359809160232544, "learning_rate": 1e-05, "loss": 0.0191, "step": 823200 }, { "epoch": 0.008233, "grad_norm": 0.15295608341693878, "learning_rate": 1e-05, "loss": 0.0189, "step": 823300 }, { "epoch": 0.008234, "grad_norm": 0.17675061523914337, "learning_rate": 1e-05, "loss": 0.0188, "step": 823400 }, { "epoch": 0.008235, "grad_norm": 0.23338259756565094, "learning_rate": 1e-05, "loss": 0.0184, "step": 823500 }, { "epoch": 0.008236, "grad_norm": 0.16617274284362793, "learning_rate": 1e-05, "loss": 0.0188, "step": 823600 }, { "epoch": 0.008237, "grad_norm": 0.17224206030368805, "learning_rate": 1e-05, "loss": 0.0185, "step": 823700 }, { "epoch": 0.008238, "grad_norm": 0.18567799031734467, "learning_rate": 1e-05, "loss": 0.0193, "step": 823800 }, { "epoch": 0.008239, "grad_norm": 0.15318427979946136, "learning_rate": 1e-05, "loss": 0.0189, "step": 823900 }, { "epoch": 0.00824, "grad_norm": 0.21982809901237488, "learning_rate": 1e-05, "loss": 0.0187, "step": 824000 }, { "epoch": 0.008241, "grad_norm": 0.11163543164730072, "learning_rate": 1e-05, "loss": 0.0188, "step": 824100 }, { "epoch": 0.008242, "grad_norm": 0.1384405791759491, "learning_rate": 1e-05, "loss": 0.0192, "step": 824200 }, { "epoch": 0.008243, "grad_norm": 0.15399228036403656, "learning_rate": 1e-05, "loss": 0.0193, "step": 824300 }, { "epoch": 0.008244, "grad_norm": 0.1579444408416748, "learning_rate": 1e-05, "loss": 0.0189, "step": 824400 }, { "epoch": 0.008245, "grad_norm": 0.1426205188035965, "learning_rate": 1e-05, "loss": 0.0187, "step": 824500 }, { "epoch": 0.008246, "grad_norm": 0.13467486202716827, "learning_rate": 1e-05, "loss": 0.0189, "step": 824600 }, { "epoch": 0.008247, "grad_norm": 0.1673516482114792, "learning_rate": 1e-05, "loss": 0.0185, "step": 824700 }, { "epoch": 0.008248, "grad_norm": 0.12181340903043747, "learning_rate": 1e-05, "loss": 0.0186, "step": 824800 }, { "epoch": 0.008249, "grad_norm": 0.16626200079917908, "learning_rate": 1e-05, "loss": 0.0189, "step": 824900 }, { "epoch": 0.00825, "grad_norm": 0.15843692421913147, "learning_rate": 1e-05, "loss": 0.0186, "step": 825000 }, { "epoch": 0.008251, "grad_norm": 0.13431857526302338, "learning_rate": 1e-05, "loss": 0.0189, "step": 825100 }, { "epoch": 0.008252, "grad_norm": 0.13016115128993988, "learning_rate": 1e-05, "loss": 0.019, "step": 825200 }, { "epoch": 0.008253, "grad_norm": 0.20994818210601807, "learning_rate": 1e-05, "loss": 0.0186, "step": 825300 }, { "epoch": 0.008254, "grad_norm": 0.13749665021896362, "learning_rate": 1e-05, "loss": 0.019, "step": 825400 }, { "epoch": 0.008255, "grad_norm": 0.13935527205467224, "learning_rate": 1e-05, "loss": 0.019, "step": 825500 }, { "epoch": 0.008256, "grad_norm": 0.19138315320014954, "learning_rate": 1e-05, "loss": 0.0192, "step": 825600 }, { "epoch": 0.008257, "grad_norm": 0.15562404692173004, "learning_rate": 1e-05, "loss": 0.0186, "step": 825700 }, { "epoch": 0.008258, "grad_norm": 0.1072089895606041, "learning_rate": 1e-05, "loss": 0.0189, "step": 825800 }, { "epoch": 0.008259, "grad_norm": 0.17657805979251862, "learning_rate": 1e-05, "loss": 0.0192, "step": 825900 }, { "epoch": 0.00826, "grad_norm": 0.18521425127983093, "learning_rate": 1e-05, "loss": 0.0191, "step": 826000 }, { "epoch": 0.008261, "grad_norm": 0.1211959570646286, "learning_rate": 1e-05, "loss": 0.019, "step": 826100 }, { "epoch": 0.008262, "grad_norm": 0.13282176852226257, "learning_rate": 1e-05, "loss": 0.0186, "step": 826200 }, { "epoch": 0.008263, "grad_norm": 0.11719886213541031, "learning_rate": 1e-05, "loss": 0.019, "step": 826300 }, { "epoch": 0.008264, "grad_norm": 0.1619863063097, "learning_rate": 1e-05, "loss": 0.0185, "step": 826400 }, { "epoch": 0.008265, "grad_norm": 0.13958938419818878, "learning_rate": 1e-05, "loss": 0.0184, "step": 826500 }, { "epoch": 0.008266, "grad_norm": 0.16102254390716553, "learning_rate": 1e-05, "loss": 0.0185, "step": 826600 }, { "epoch": 0.008267, "grad_norm": 0.22852812707424164, "learning_rate": 1e-05, "loss": 0.0189, "step": 826700 }, { "epoch": 0.008268, "grad_norm": 0.18509700894355774, "learning_rate": 1e-05, "loss": 0.0189, "step": 826800 }, { "epoch": 0.008269, "grad_norm": 0.2256520688533783, "learning_rate": 1e-05, "loss": 0.0188, "step": 826900 }, { "epoch": 0.00827, "grad_norm": 0.19171741604804993, "learning_rate": 1e-05, "loss": 0.0189, "step": 827000 }, { "epoch": 0.008271, "grad_norm": 0.1588001698255539, "learning_rate": 1e-05, "loss": 0.0187, "step": 827100 }, { "epoch": 0.008272, "grad_norm": 0.1908348947763443, "learning_rate": 1e-05, "loss": 0.0186, "step": 827200 }, { "epoch": 0.008273, "grad_norm": 0.1961078941822052, "learning_rate": 1e-05, "loss": 0.0191, "step": 827300 }, { "epoch": 0.008274, "grad_norm": 0.2516116499900818, "learning_rate": 1e-05, "loss": 0.0192, "step": 827400 }, { "epoch": 0.008275, "grad_norm": 0.15360330045223236, "learning_rate": 1e-05, "loss": 0.0191, "step": 827500 }, { "epoch": 0.008276, "grad_norm": 0.1638190597295761, "learning_rate": 1e-05, "loss": 0.0187, "step": 827600 }, { "epoch": 0.008277, "grad_norm": 0.177389457821846, "learning_rate": 1e-05, "loss": 0.019, "step": 827700 }, { "epoch": 0.008278, "grad_norm": 0.14115335047245026, "learning_rate": 1e-05, "loss": 0.019, "step": 827800 }, { "epoch": 0.008279, "grad_norm": 0.13176089525222778, "learning_rate": 1e-05, "loss": 0.0191, "step": 827900 }, { "epoch": 0.00828, "grad_norm": 0.15109939873218536, "learning_rate": 1e-05, "loss": 0.0184, "step": 828000 }, { "epoch": 0.008281, "grad_norm": 0.13595597445964813, "learning_rate": 1e-05, "loss": 0.0186, "step": 828100 }, { "epoch": 0.008282, "grad_norm": 0.14415772259235382, "learning_rate": 1e-05, "loss": 0.0188, "step": 828200 }, { "epoch": 0.008283, "grad_norm": 0.16819050908088684, "learning_rate": 1e-05, "loss": 0.0189, "step": 828300 }, { "epoch": 0.008284, "grad_norm": 0.18806616961956024, "learning_rate": 1e-05, "loss": 0.0189, "step": 828400 }, { "epoch": 0.008285, "grad_norm": 0.1938045620918274, "learning_rate": 1e-05, "loss": 0.0186, "step": 828500 }, { "epoch": 0.008286, "grad_norm": 0.14455178380012512, "learning_rate": 1e-05, "loss": 0.0184, "step": 828600 }, { "epoch": 0.008287, "grad_norm": 0.1730591505765915, "learning_rate": 1e-05, "loss": 0.019, "step": 828700 }, { "epoch": 0.008288, "grad_norm": 0.1508539468050003, "learning_rate": 1e-05, "loss": 0.0192, "step": 828800 }, { "epoch": 0.008289, "grad_norm": 0.12134287506341934, "learning_rate": 1e-05, "loss": 0.0181, "step": 828900 }, { "epoch": 0.00829, "grad_norm": 0.16486003994941711, "learning_rate": 1e-05, "loss": 0.0192, "step": 829000 }, { "epoch": 0.008291, "grad_norm": 0.14834357798099518, "learning_rate": 1e-05, "loss": 0.0184, "step": 829100 }, { "epoch": 0.008292, "grad_norm": 0.1421622484922409, "learning_rate": 1e-05, "loss": 0.0188, "step": 829200 }, { "epoch": 0.008293, "grad_norm": 0.2270756959915161, "learning_rate": 1e-05, "loss": 0.0189, "step": 829300 }, { "epoch": 0.008294, "grad_norm": 0.1896379292011261, "learning_rate": 1e-05, "loss": 0.0187, "step": 829400 }, { "epoch": 0.008295, "grad_norm": 0.24607129395008087, "learning_rate": 1e-05, "loss": 0.0192, "step": 829500 }, { "epoch": 0.008296, "grad_norm": 0.1266806423664093, "learning_rate": 1e-05, "loss": 0.0189, "step": 829600 }, { "epoch": 0.008297, "grad_norm": 0.1869385987520218, "learning_rate": 1e-05, "loss": 0.019, "step": 829700 }, { "epoch": 0.008298, "grad_norm": 0.14291489124298096, "learning_rate": 1e-05, "loss": 0.0186, "step": 829800 }, { "epoch": 0.008299, "grad_norm": 0.1763085126876831, "learning_rate": 1e-05, "loss": 0.0189, "step": 829900 }, { "epoch": 0.0083, "grad_norm": 0.11878268420696259, "learning_rate": 1e-05, "loss": 0.0189, "step": 830000 }, { "epoch": 0.008301, "grad_norm": 0.13175922632217407, "learning_rate": 1e-05, "loss": 0.0193, "step": 830100 }, { "epoch": 0.008302, "grad_norm": 0.16079983115196228, "learning_rate": 1e-05, "loss": 0.0191, "step": 830200 }, { "epoch": 0.008303, "grad_norm": 0.1920696198940277, "learning_rate": 1e-05, "loss": 0.0185, "step": 830300 }, { "epoch": 0.008304, "grad_norm": 0.1260228455066681, "learning_rate": 1e-05, "loss": 0.019, "step": 830400 }, { "epoch": 0.008305, "grad_norm": 0.17400886118412018, "learning_rate": 1e-05, "loss": 0.0189, "step": 830500 }, { "epoch": 0.008306, "grad_norm": 0.2350611686706543, "learning_rate": 1e-05, "loss": 0.0186, "step": 830600 }, { "epoch": 0.008307, "grad_norm": 0.13892237842082977, "learning_rate": 1e-05, "loss": 0.0193, "step": 830700 }, { "epoch": 0.008308, "grad_norm": 0.1789434254169464, "learning_rate": 1e-05, "loss": 0.0186, "step": 830800 }, { "epoch": 0.008309, "grad_norm": 0.14153608679771423, "learning_rate": 1e-05, "loss": 0.0185, "step": 830900 }, { "epoch": 0.00831, "grad_norm": 0.15251684188842773, "learning_rate": 1e-05, "loss": 0.0186, "step": 831000 }, { "epoch": 0.008311, "grad_norm": 0.12433870136737823, "learning_rate": 1e-05, "loss": 0.0185, "step": 831100 }, { "epoch": 0.008312, "grad_norm": 0.16583102941513062, "learning_rate": 1e-05, "loss": 0.0187, "step": 831200 }, { "epoch": 0.008313, "grad_norm": 0.171479269862175, "learning_rate": 1e-05, "loss": 0.0186, "step": 831300 }, { "epoch": 0.008314, "grad_norm": 0.14667122066020966, "learning_rate": 1e-05, "loss": 0.0189, "step": 831400 }, { "epoch": 0.008315, "grad_norm": 0.15832096338272095, "learning_rate": 1e-05, "loss": 0.0182, "step": 831500 }, { "epoch": 0.008316, "grad_norm": 0.14326190948486328, "learning_rate": 1e-05, "loss": 0.0189, "step": 831600 }, { "epoch": 0.008317, "grad_norm": 0.25778716802597046, "learning_rate": 1e-05, "loss": 0.0186, "step": 831700 }, { "epoch": 0.008318, "grad_norm": 0.11209197342395782, "learning_rate": 1e-05, "loss": 0.0192, "step": 831800 }, { "epoch": 0.008319, "grad_norm": 0.21625548601150513, "learning_rate": 1e-05, "loss": 0.0187, "step": 831900 }, { "epoch": 0.00832, "grad_norm": 0.1638410985469818, "learning_rate": 1e-05, "loss": 0.0195, "step": 832000 }, { "epoch": 0.008321, "grad_norm": 0.12833793461322784, "learning_rate": 1e-05, "loss": 0.0189, "step": 832100 }, { "epoch": 0.008322, "grad_norm": 0.12371176481246948, "learning_rate": 1e-05, "loss": 0.0186, "step": 832200 }, { "epoch": 0.008323, "grad_norm": 0.1992127150297165, "learning_rate": 1e-05, "loss": 0.0185, "step": 832300 }, { "epoch": 0.008324, "grad_norm": 0.1413569152355194, "learning_rate": 1e-05, "loss": 0.0188, "step": 832400 }, { "epoch": 0.008325, "grad_norm": 0.18728961050510406, "learning_rate": 1e-05, "loss": 0.0188, "step": 832500 }, { "epoch": 0.008326, "grad_norm": 0.1508142203092575, "learning_rate": 1e-05, "loss": 0.0188, "step": 832600 }, { "epoch": 0.008327, "grad_norm": 0.17737290263175964, "learning_rate": 1e-05, "loss": 0.0184, "step": 832700 }, { "epoch": 0.008328, "grad_norm": 0.20883110165596008, "learning_rate": 1e-05, "loss": 0.0183, "step": 832800 }, { "epoch": 0.008329, "grad_norm": 0.13780130445957184, "learning_rate": 1e-05, "loss": 0.0192, "step": 832900 }, { "epoch": 0.00833, "grad_norm": 0.11545750498771667, "learning_rate": 1e-05, "loss": 0.0188, "step": 833000 }, { "epoch": 0.008331, "grad_norm": 0.13980254530906677, "learning_rate": 1e-05, "loss": 0.0183, "step": 833100 }, { "epoch": 0.008332, "grad_norm": 0.16560125350952148, "learning_rate": 1e-05, "loss": 0.0185, "step": 833200 }, { "epoch": 0.008333, "grad_norm": 0.15758617222309113, "learning_rate": 1e-05, "loss": 0.0188, "step": 833300 }, { "epoch": 0.008334, "grad_norm": 0.14542293548583984, "learning_rate": 1e-05, "loss": 0.0186, "step": 833400 }, { "epoch": 0.008335, "grad_norm": 0.11126602441072464, "learning_rate": 1e-05, "loss": 0.0187, "step": 833500 }, { "epoch": 0.008336, "grad_norm": 0.13622745871543884, "learning_rate": 1e-05, "loss": 0.0187, "step": 833600 }, { "epoch": 0.008337, "grad_norm": 0.20508994162082672, "learning_rate": 1e-05, "loss": 0.0188, "step": 833700 }, { "epoch": 0.008338, "grad_norm": 0.11186863481998444, "learning_rate": 1e-05, "loss": 0.0187, "step": 833800 }, { "epoch": 0.008339, "grad_norm": 0.11731823533773422, "learning_rate": 1e-05, "loss": 0.019, "step": 833900 }, { "epoch": 0.00834, "grad_norm": 0.1798611432313919, "learning_rate": 1e-05, "loss": 0.0184, "step": 834000 }, { "epoch": 0.008341, "grad_norm": 0.18037788569927216, "learning_rate": 1e-05, "loss": 0.019, "step": 834100 }, { "epoch": 0.008342, "grad_norm": 0.17767514288425446, "learning_rate": 1e-05, "loss": 0.0189, "step": 834200 }, { "epoch": 0.008343, "grad_norm": 0.18693992495536804, "learning_rate": 1e-05, "loss": 0.0186, "step": 834300 }, { "epoch": 0.008344, "grad_norm": 0.1519898772239685, "learning_rate": 1e-05, "loss": 0.0191, "step": 834400 }, { "epoch": 0.008345, "grad_norm": 0.14548906683921814, "learning_rate": 1e-05, "loss": 0.0193, "step": 834500 }, { "epoch": 0.008346, "grad_norm": 0.12887564301490784, "learning_rate": 1e-05, "loss": 0.0188, "step": 834600 }, { "epoch": 0.008347, "grad_norm": 0.13114714622497559, "learning_rate": 1e-05, "loss": 0.019, "step": 834700 }, { "epoch": 0.008348, "grad_norm": 0.17315888404846191, "learning_rate": 1e-05, "loss": 0.0185, "step": 834800 }, { "epoch": 0.008349, "grad_norm": 0.1427219808101654, "learning_rate": 1e-05, "loss": 0.0187, "step": 834900 }, { "epoch": 0.00835, "grad_norm": 0.1825488805770874, "learning_rate": 1e-05, "loss": 0.0187, "step": 835000 }, { "epoch": 0.008351, "grad_norm": 0.17244912683963776, "learning_rate": 1e-05, "loss": 0.0185, "step": 835100 }, { "epoch": 0.008352, "grad_norm": 0.16460560262203217, "learning_rate": 1e-05, "loss": 0.0186, "step": 835200 }, { "epoch": 0.008353, "grad_norm": 0.11863704770803452, "learning_rate": 1e-05, "loss": 0.0188, "step": 835300 }, { "epoch": 0.008354, "grad_norm": 0.25811100006103516, "learning_rate": 1e-05, "loss": 0.0187, "step": 835400 }, { "epoch": 0.008355, "grad_norm": 0.11072838306427002, "learning_rate": 1e-05, "loss": 0.0187, "step": 835500 }, { "epoch": 0.008356, "grad_norm": 0.13893269002437592, "learning_rate": 1e-05, "loss": 0.0185, "step": 835600 }, { "epoch": 0.008357, "grad_norm": 0.18971803784370422, "learning_rate": 1e-05, "loss": 0.0188, "step": 835700 }, { "epoch": 0.008358, "grad_norm": 0.1957428902387619, "learning_rate": 1e-05, "loss": 0.0191, "step": 835800 }, { "epoch": 0.008359, "grad_norm": 0.16659219563007355, "learning_rate": 1e-05, "loss": 0.0189, "step": 835900 }, { "epoch": 0.00836, "grad_norm": 0.1414104402065277, "learning_rate": 1e-05, "loss": 0.019, "step": 836000 }, { "epoch": 0.008361, "grad_norm": 0.1318855583667755, "learning_rate": 1e-05, "loss": 0.0189, "step": 836100 }, { "epoch": 0.008362, "grad_norm": 0.13983570039272308, "learning_rate": 1e-05, "loss": 0.0187, "step": 836200 }, { "epoch": 0.008363, "grad_norm": 0.13539183139801025, "learning_rate": 1e-05, "loss": 0.0185, "step": 836300 }, { "epoch": 0.008364, "grad_norm": 0.17105978727340698, "learning_rate": 1e-05, "loss": 0.0186, "step": 836400 }, { "epoch": 0.008365, "grad_norm": 0.1522587090730667, "learning_rate": 1e-05, "loss": 0.0186, "step": 836500 }, { "epoch": 0.008366, "grad_norm": 0.13427212834358215, "learning_rate": 1e-05, "loss": 0.0188, "step": 836600 }, { "epoch": 0.008367, "grad_norm": 0.10713755339384079, "learning_rate": 1e-05, "loss": 0.0178, "step": 836700 }, { "epoch": 0.008368, "grad_norm": 0.13627566397190094, "learning_rate": 1e-05, "loss": 0.0186, "step": 836800 }, { "epoch": 0.008369, "grad_norm": 0.15633469820022583, "learning_rate": 1e-05, "loss": 0.0186, "step": 836900 }, { "epoch": 0.00837, "grad_norm": 0.14684534072875977, "learning_rate": 1e-05, "loss": 0.0188, "step": 837000 }, { "epoch": 0.008371, "grad_norm": 0.15060535073280334, "learning_rate": 1e-05, "loss": 0.0189, "step": 837100 }, { "epoch": 0.008372, "grad_norm": 0.16829492151737213, "learning_rate": 1e-05, "loss": 0.0191, "step": 837200 }, { "epoch": 0.008373, "grad_norm": 0.1477278172969818, "learning_rate": 1e-05, "loss": 0.0187, "step": 837300 }, { "epoch": 0.008374, "grad_norm": 0.11352024227380753, "learning_rate": 1e-05, "loss": 0.0191, "step": 837400 }, { "epoch": 0.008375, "grad_norm": 0.09866739064455032, "learning_rate": 1e-05, "loss": 0.0191, "step": 837500 }, { "epoch": 0.008376, "grad_norm": 0.15197709202766418, "learning_rate": 1e-05, "loss": 0.0188, "step": 837600 }, { "epoch": 0.008377, "grad_norm": 0.18941281735897064, "learning_rate": 1e-05, "loss": 0.0188, "step": 837700 }, { "epoch": 0.008378, "grad_norm": 0.14206814765930176, "learning_rate": 1e-05, "loss": 0.0188, "step": 837800 }, { "epoch": 0.008379, "grad_norm": 0.19268350303173065, "learning_rate": 1e-05, "loss": 0.0184, "step": 837900 }, { "epoch": 0.00838, "grad_norm": 0.20057439804077148, "learning_rate": 1e-05, "loss": 0.0186, "step": 838000 }, { "epoch": 0.008381, "grad_norm": 0.16077546775341034, "learning_rate": 1e-05, "loss": 0.019, "step": 838100 }, { "epoch": 0.008382, "grad_norm": 0.13469375669956207, "learning_rate": 1e-05, "loss": 0.0189, "step": 838200 }, { "epoch": 0.008383, "grad_norm": 0.16246597468852997, "learning_rate": 1e-05, "loss": 0.0179, "step": 838300 }, { "epoch": 0.008384, "grad_norm": 0.13728296756744385, "learning_rate": 1e-05, "loss": 0.0183, "step": 838400 }, { "epoch": 0.008385, "grad_norm": 0.14441893994808197, "learning_rate": 1e-05, "loss": 0.0191, "step": 838500 }, { "epoch": 0.008386, "grad_norm": 0.18417176604270935, "learning_rate": 1e-05, "loss": 0.0191, "step": 838600 }, { "epoch": 0.008387, "grad_norm": 0.09554116427898407, "learning_rate": 1e-05, "loss": 0.0193, "step": 838700 }, { "epoch": 0.008388, "grad_norm": 0.1584380865097046, "learning_rate": 1e-05, "loss": 0.0183, "step": 838800 }, { "epoch": 0.008389, "grad_norm": 0.1796867400407791, "learning_rate": 1e-05, "loss": 0.0189, "step": 838900 }, { "epoch": 0.00839, "grad_norm": 0.12534625828266144, "learning_rate": 1e-05, "loss": 0.0188, "step": 839000 }, { "epoch": 0.008391, "grad_norm": 0.19026564061641693, "learning_rate": 1e-05, "loss": 0.0184, "step": 839100 }, { "epoch": 0.008392, "grad_norm": 0.14815223217010498, "learning_rate": 1e-05, "loss": 0.0184, "step": 839200 }, { "epoch": 0.008393, "grad_norm": 0.18742702901363373, "learning_rate": 1e-05, "loss": 0.0187, "step": 839300 }, { "epoch": 0.008394, "grad_norm": 0.13280540704727173, "learning_rate": 1e-05, "loss": 0.0186, "step": 839400 }, { "epoch": 0.008395, "grad_norm": 0.11785636842250824, "learning_rate": 1e-05, "loss": 0.019, "step": 839500 }, { "epoch": 0.008396, "grad_norm": 0.1876227706670761, "learning_rate": 1e-05, "loss": 0.0188, "step": 839600 }, { "epoch": 0.008397, "grad_norm": 0.12986145913600922, "learning_rate": 1e-05, "loss": 0.019, "step": 839700 }, { "epoch": 0.008398, "grad_norm": 0.10436882078647614, "learning_rate": 1e-05, "loss": 0.0191, "step": 839800 }, { "epoch": 0.008399, "grad_norm": 0.14806991815567017, "learning_rate": 1e-05, "loss": 0.0187, "step": 839900 }, { "epoch": 0.0084, "grad_norm": 0.15344581007957458, "learning_rate": 1e-05, "loss": 0.0189, "step": 840000 }, { "epoch": 0.0084, "eval_loss": 0.016549859195947647, "eval_runtime": 191.6539, "eval_samples_per_second": 260.887, "eval_steps_per_second": 16.305, "step": 840000 }, { "epoch": 0.008401, "grad_norm": 0.14481200277805328, "learning_rate": 1e-05, "loss": 0.0187, "step": 840100 }, { "epoch": 0.008402, "grad_norm": 0.1467781662940979, "learning_rate": 1e-05, "loss": 0.0184, "step": 840200 }, { "epoch": 0.008403, "grad_norm": 0.13861337304115295, "learning_rate": 1e-05, "loss": 0.0189, "step": 840300 }, { "epoch": 0.008404, "grad_norm": 0.1481960266828537, "learning_rate": 1e-05, "loss": 0.0181, "step": 840400 }, { "epoch": 0.008405, "grad_norm": 0.10900911688804626, "learning_rate": 1e-05, "loss": 0.0187, "step": 840500 }, { "epoch": 0.008406, "grad_norm": 0.17633455991744995, "learning_rate": 1e-05, "loss": 0.0188, "step": 840600 }, { "epoch": 0.008407, "grad_norm": 0.175925150513649, "learning_rate": 1e-05, "loss": 0.0191, "step": 840700 }, { "epoch": 0.008408, "grad_norm": 0.1675090342760086, "learning_rate": 1e-05, "loss": 0.0185, "step": 840800 }, { "epoch": 0.008409, "grad_norm": 0.19967687129974365, "learning_rate": 1e-05, "loss": 0.0188, "step": 840900 }, { "epoch": 0.00841, "grad_norm": 0.11850710958242416, "learning_rate": 1e-05, "loss": 0.0184, "step": 841000 }, { "epoch": 0.008411, "grad_norm": 0.16034065186977386, "learning_rate": 1e-05, "loss": 0.0189, "step": 841100 }, { "epoch": 0.008412, "grad_norm": 0.16192980110645294, "learning_rate": 1e-05, "loss": 0.0183, "step": 841200 }, { "epoch": 0.008413, "grad_norm": 0.1539708375930786, "learning_rate": 1e-05, "loss": 0.019, "step": 841300 }, { "epoch": 0.008414, "grad_norm": 0.17291918396949768, "learning_rate": 1e-05, "loss": 0.0193, "step": 841400 }, { "epoch": 0.008415, "grad_norm": 0.19277364015579224, "learning_rate": 1e-05, "loss": 0.019, "step": 841500 }, { "epoch": 0.008416, "grad_norm": 0.13743920624256134, "learning_rate": 1e-05, "loss": 0.0192, "step": 841600 }, { "epoch": 0.008417, "grad_norm": 0.16099588572978973, "learning_rate": 1e-05, "loss": 0.0188, "step": 841700 }, { "epoch": 0.008418, "grad_norm": 0.1460626721382141, "learning_rate": 1e-05, "loss": 0.0186, "step": 841800 }, { "epoch": 0.008419, "grad_norm": 0.14140254259109497, "learning_rate": 1e-05, "loss": 0.0184, "step": 841900 }, { "epoch": 0.00842, "grad_norm": 0.14765582978725433, "learning_rate": 1e-05, "loss": 0.0188, "step": 842000 }, { "epoch": 0.008421, "grad_norm": 0.14957788586616516, "learning_rate": 1e-05, "loss": 0.0187, "step": 842100 }, { "epoch": 0.008422, "grad_norm": 0.197893425822258, "learning_rate": 1e-05, "loss": 0.0184, "step": 842200 }, { "epoch": 0.008423, "grad_norm": 0.2183571308851242, "learning_rate": 1e-05, "loss": 0.0189, "step": 842300 }, { "epoch": 0.008424, "grad_norm": 0.15064814686775208, "learning_rate": 1e-05, "loss": 0.0189, "step": 842400 }, { "epoch": 0.008425, "grad_norm": 0.13953399658203125, "learning_rate": 1e-05, "loss": 0.0189, "step": 842500 }, { "epoch": 0.008426, "grad_norm": 0.26414355635643005, "learning_rate": 1e-05, "loss": 0.0183, "step": 842600 }, { "epoch": 0.008427, "grad_norm": 0.22091692686080933, "learning_rate": 1e-05, "loss": 0.0186, "step": 842700 }, { "epoch": 0.008428, "grad_norm": 0.20392300188541412, "learning_rate": 1e-05, "loss": 0.0192, "step": 842800 }, { "epoch": 0.008429, "grad_norm": 0.24661654233932495, "learning_rate": 1e-05, "loss": 0.0187, "step": 842900 }, { "epoch": 0.00843, "grad_norm": 0.13196779787540436, "learning_rate": 1e-05, "loss": 0.0189, "step": 843000 }, { "epoch": 0.008431, "grad_norm": 0.12784768640995026, "learning_rate": 1e-05, "loss": 0.0187, "step": 843100 }, { "epoch": 0.008432, "grad_norm": 0.15287289023399353, "learning_rate": 1e-05, "loss": 0.0185, "step": 843200 }, { "epoch": 0.008433, "grad_norm": 0.17283833026885986, "learning_rate": 1e-05, "loss": 0.019, "step": 843300 }, { "epoch": 0.008434, "grad_norm": 0.20093533396720886, "learning_rate": 1e-05, "loss": 0.0185, "step": 843400 }, { "epoch": 0.008435, "grad_norm": 0.16012521088123322, "learning_rate": 1e-05, "loss": 0.0188, "step": 843500 }, { "epoch": 0.008436, "grad_norm": 0.14092840254306793, "learning_rate": 1e-05, "loss": 0.0187, "step": 843600 }, { "epoch": 0.008437, "grad_norm": 0.21164846420288086, "learning_rate": 1e-05, "loss": 0.0186, "step": 843700 }, { "epoch": 0.008438, "grad_norm": 0.11277782917022705, "learning_rate": 1e-05, "loss": 0.019, "step": 843800 }, { "epoch": 0.008439, "grad_norm": 0.1580587476491928, "learning_rate": 1e-05, "loss": 0.0187, "step": 843900 }, { "epoch": 0.00844, "grad_norm": 0.17019204795360565, "learning_rate": 1e-05, "loss": 0.0184, "step": 844000 }, { "epoch": 0.008441, "grad_norm": 0.16870978474617004, "learning_rate": 1e-05, "loss": 0.0184, "step": 844100 }, { "epoch": 0.008442, "grad_norm": 0.169677272439003, "learning_rate": 1e-05, "loss": 0.0188, "step": 844200 }, { "epoch": 0.008443, "grad_norm": 0.1254122406244278, "learning_rate": 1e-05, "loss": 0.0189, "step": 844300 }, { "epoch": 0.008444, "grad_norm": 0.15964435040950775, "learning_rate": 1e-05, "loss": 0.0188, "step": 844400 }, { "epoch": 0.008445, "grad_norm": 0.14077802002429962, "learning_rate": 1e-05, "loss": 0.0189, "step": 844500 }, { "epoch": 0.008446, "grad_norm": 0.20577014982700348, "learning_rate": 1e-05, "loss": 0.0186, "step": 844600 }, { "epoch": 0.008447, "grad_norm": 0.1301194429397583, "learning_rate": 1e-05, "loss": 0.0181, "step": 844700 }, { "epoch": 0.008448, "grad_norm": 0.12223310768604279, "learning_rate": 1e-05, "loss": 0.0184, "step": 844800 }, { "epoch": 0.008449, "grad_norm": 0.18532584607601166, "learning_rate": 1e-05, "loss": 0.0191, "step": 844900 }, { "epoch": 0.00845, "grad_norm": 0.1283584088087082, "learning_rate": 1e-05, "loss": 0.0186, "step": 845000 }, { "epoch": 0.008451, "grad_norm": 0.12504063546657562, "learning_rate": 1e-05, "loss": 0.0188, "step": 845100 }, { "epoch": 0.008452, "grad_norm": 0.1605972945690155, "learning_rate": 1e-05, "loss": 0.0189, "step": 845200 }, { "epoch": 0.008453, "grad_norm": 0.16281665861606598, "learning_rate": 1e-05, "loss": 0.0188, "step": 845300 }, { "epoch": 0.008454, "grad_norm": 0.13006658852100372, "learning_rate": 1e-05, "loss": 0.0186, "step": 845400 }, { "epoch": 0.008455, "grad_norm": 0.14864562451839447, "learning_rate": 1e-05, "loss": 0.0188, "step": 845500 }, { "epoch": 0.008456, "grad_norm": 0.12502680718898773, "learning_rate": 1e-05, "loss": 0.0185, "step": 845600 }, { "epoch": 0.008457, "grad_norm": 0.1727662980556488, "learning_rate": 1e-05, "loss": 0.0188, "step": 845700 }, { "epoch": 0.008458, "grad_norm": 0.1828327775001526, "learning_rate": 1e-05, "loss": 0.0182, "step": 845800 }, { "epoch": 0.008459, "grad_norm": 0.17733755707740784, "learning_rate": 1e-05, "loss": 0.0184, "step": 845900 }, { "epoch": 0.00846, "grad_norm": 0.15225182473659515, "learning_rate": 1e-05, "loss": 0.0186, "step": 846000 }, { "epoch": 0.008461, "grad_norm": 0.13388831913471222, "learning_rate": 1e-05, "loss": 0.0189, "step": 846100 }, { "epoch": 0.008462, "grad_norm": 0.16641104221343994, "learning_rate": 1e-05, "loss": 0.0188, "step": 846200 }, { "epoch": 0.008463, "grad_norm": 0.1608770787715912, "learning_rate": 1e-05, "loss": 0.0184, "step": 846300 }, { "epoch": 0.008464, "grad_norm": 0.13361066579818726, "learning_rate": 1e-05, "loss": 0.0187, "step": 846400 }, { "epoch": 0.008465, "grad_norm": 0.19351041316986084, "learning_rate": 1e-05, "loss": 0.019, "step": 846500 }, { "epoch": 0.008466, "grad_norm": 0.12012863904237747, "learning_rate": 1e-05, "loss": 0.0184, "step": 846600 }, { "epoch": 0.008467, "grad_norm": 0.18704888224601746, "learning_rate": 1e-05, "loss": 0.0185, "step": 846700 }, { "epoch": 0.008468, "grad_norm": 0.23862144351005554, "learning_rate": 1e-05, "loss": 0.0183, "step": 846800 }, { "epoch": 0.008469, "grad_norm": 0.13034909963607788, "learning_rate": 1e-05, "loss": 0.019, "step": 846900 }, { "epoch": 0.00847, "grad_norm": 0.12152991443872452, "learning_rate": 1e-05, "loss": 0.0185, "step": 847000 }, { "epoch": 0.008471, "grad_norm": 0.14663462340831757, "learning_rate": 1e-05, "loss": 0.0186, "step": 847100 }, { "epoch": 0.008472, "grad_norm": 0.15708467364311218, "learning_rate": 1e-05, "loss": 0.0189, "step": 847200 }, { "epoch": 0.008473, "grad_norm": 0.1490301638841629, "learning_rate": 1e-05, "loss": 0.0183, "step": 847300 }, { "epoch": 0.008474, "grad_norm": 0.139240100979805, "learning_rate": 1e-05, "loss": 0.019, "step": 847400 }, { "epoch": 0.008475, "grad_norm": 0.15065434575080872, "learning_rate": 1e-05, "loss": 0.0187, "step": 847500 }, { "epoch": 0.008476, "grad_norm": 0.1480453908443451, "learning_rate": 1e-05, "loss": 0.0184, "step": 847600 }, { "epoch": 0.008477, "grad_norm": 0.13116657733917236, "learning_rate": 1e-05, "loss": 0.0192, "step": 847700 }, { "epoch": 0.008478, "grad_norm": 0.17576773464679718, "learning_rate": 1e-05, "loss": 0.0185, "step": 847800 }, { "epoch": 0.008479, "grad_norm": 0.17132392525672913, "learning_rate": 1e-05, "loss": 0.0187, "step": 847900 }, { "epoch": 0.00848, "grad_norm": 0.1567673236131668, "learning_rate": 1e-05, "loss": 0.0187, "step": 848000 }, { "epoch": 0.008481, "grad_norm": 0.149844691157341, "learning_rate": 1e-05, "loss": 0.0186, "step": 848100 }, { "epoch": 0.008482, "grad_norm": 0.13491038978099823, "learning_rate": 1e-05, "loss": 0.0185, "step": 848200 }, { "epoch": 0.008483, "grad_norm": 0.15645739436149597, "learning_rate": 1e-05, "loss": 0.0186, "step": 848300 }, { "epoch": 0.008484, "grad_norm": 0.16121074557304382, "learning_rate": 1e-05, "loss": 0.0185, "step": 848400 }, { "epoch": 0.008485, "grad_norm": 0.1420610398054123, "learning_rate": 1e-05, "loss": 0.0189, "step": 848500 }, { "epoch": 0.008486, "grad_norm": 0.14480777084827423, "learning_rate": 1e-05, "loss": 0.0184, "step": 848600 }, { "epoch": 0.008487, "grad_norm": 0.10785872489213943, "learning_rate": 1e-05, "loss": 0.0188, "step": 848700 }, { "epoch": 0.008488, "grad_norm": 0.13434754312038422, "learning_rate": 1e-05, "loss": 0.0188, "step": 848800 }, { "epoch": 0.008489, "grad_norm": 0.35825616121292114, "learning_rate": 1e-05, "loss": 0.0193, "step": 848900 }, { "epoch": 0.00849, "grad_norm": 0.1887466162443161, "learning_rate": 1e-05, "loss": 0.0189, "step": 849000 }, { "epoch": 0.008491, "grad_norm": 0.14610616862773895, "learning_rate": 1e-05, "loss": 0.0183, "step": 849100 }, { "epoch": 0.008492, "grad_norm": 0.13487742841243744, "learning_rate": 1e-05, "loss": 0.0189, "step": 849200 }, { "epoch": 0.008493, "grad_norm": 0.16833004355430603, "learning_rate": 1e-05, "loss": 0.0182, "step": 849300 }, { "epoch": 0.008494, "grad_norm": 0.17480620741844177, "learning_rate": 1e-05, "loss": 0.0181, "step": 849400 }, { "epoch": 0.008495, "grad_norm": 0.1243591159582138, "learning_rate": 1e-05, "loss": 0.019, "step": 849500 }, { "epoch": 0.008496, "grad_norm": 0.13901041448116302, "learning_rate": 1e-05, "loss": 0.0186, "step": 849600 }, { "epoch": 0.008497, "grad_norm": 0.13917115330696106, "learning_rate": 1e-05, "loss": 0.0187, "step": 849700 }, { "epoch": 0.008498, "grad_norm": 0.12255901843309402, "learning_rate": 1e-05, "loss": 0.0188, "step": 849800 }, { "epoch": 0.008499, "grad_norm": 0.1366538256406784, "learning_rate": 1e-05, "loss": 0.0185, "step": 849900 }, { "epoch": 0.0085, "grad_norm": 0.16293175518512726, "learning_rate": 1e-05, "loss": 0.0184, "step": 850000 }, { "epoch": 0.008501, "grad_norm": 0.15071174502372742, "learning_rate": 1e-05, "loss": 0.0188, "step": 850100 }, { "epoch": 0.008502, "grad_norm": 0.1637212485074997, "learning_rate": 1e-05, "loss": 0.018, "step": 850200 }, { "epoch": 0.008503, "grad_norm": 0.14920653402805328, "learning_rate": 1e-05, "loss": 0.0184, "step": 850300 }, { "epoch": 0.008504, "grad_norm": 0.16265307366847992, "learning_rate": 1e-05, "loss": 0.0185, "step": 850400 }, { "epoch": 0.008505, "grad_norm": 0.11952477693557739, "learning_rate": 1e-05, "loss": 0.0186, "step": 850500 }, { "epoch": 0.008506, "grad_norm": 0.12440568208694458, "learning_rate": 1e-05, "loss": 0.0185, "step": 850600 }, { "epoch": 0.008507, "grad_norm": 0.1218852773308754, "learning_rate": 1e-05, "loss": 0.0187, "step": 850700 }, { "epoch": 0.008508, "grad_norm": 0.11979790031909943, "learning_rate": 1e-05, "loss": 0.0184, "step": 850800 }, { "epoch": 0.008509, "grad_norm": 0.16148938238620758, "learning_rate": 1e-05, "loss": 0.0183, "step": 850900 }, { "epoch": 0.00851, "grad_norm": 0.17792396247386932, "learning_rate": 1e-05, "loss": 0.0187, "step": 851000 }, { "epoch": 0.008511, "grad_norm": 0.20462051033973694, "learning_rate": 1e-05, "loss": 0.0186, "step": 851100 }, { "epoch": 0.008512, "grad_norm": 0.12730005383491516, "learning_rate": 1e-05, "loss": 0.0184, "step": 851200 }, { "epoch": 0.008513, "grad_norm": 0.17379936575889587, "learning_rate": 1e-05, "loss": 0.0179, "step": 851300 }, { "epoch": 0.008514, "grad_norm": 0.14783434569835663, "learning_rate": 1e-05, "loss": 0.0182, "step": 851400 }, { "epoch": 0.008515, "grad_norm": 0.2900052070617676, "learning_rate": 1e-05, "loss": 0.019, "step": 851500 }, { "epoch": 0.008516, "grad_norm": 0.15381275117397308, "learning_rate": 1e-05, "loss": 0.0188, "step": 851600 }, { "epoch": 0.008517, "grad_norm": 0.17281223833560944, "learning_rate": 1e-05, "loss": 0.0187, "step": 851700 }, { "epoch": 0.008518, "grad_norm": 0.18663008511066437, "learning_rate": 1e-05, "loss": 0.0185, "step": 851800 }, { "epoch": 0.008519, "grad_norm": 0.17510399222373962, "learning_rate": 1e-05, "loss": 0.0184, "step": 851900 }, { "epoch": 0.00852, "grad_norm": 0.15412914752960205, "learning_rate": 1e-05, "loss": 0.0185, "step": 852000 }, { "epoch": 0.008521, "grad_norm": 0.1372521072626114, "learning_rate": 1e-05, "loss": 0.0185, "step": 852100 }, { "epoch": 0.008522, "grad_norm": 0.13510069251060486, "learning_rate": 1e-05, "loss": 0.0191, "step": 852200 }, { "epoch": 0.008523, "grad_norm": 0.1315656155347824, "learning_rate": 1e-05, "loss": 0.0188, "step": 852300 }, { "epoch": 0.008524, "grad_norm": 0.17502069473266602, "learning_rate": 1e-05, "loss": 0.0187, "step": 852400 }, { "epoch": 0.008525, "grad_norm": 0.15513116121292114, "learning_rate": 1e-05, "loss": 0.0186, "step": 852500 }, { "epoch": 0.008526, "grad_norm": 0.1793607771396637, "learning_rate": 1e-05, "loss": 0.0181, "step": 852600 }, { "epoch": 0.008527, "grad_norm": 0.17860177159309387, "learning_rate": 1e-05, "loss": 0.0182, "step": 852700 }, { "epoch": 0.008528, "grad_norm": 0.195967435836792, "learning_rate": 1e-05, "loss": 0.0182, "step": 852800 }, { "epoch": 0.008529, "grad_norm": 0.12867391109466553, "learning_rate": 1e-05, "loss": 0.0186, "step": 852900 }, { "epoch": 0.00853, "grad_norm": 0.14145198464393616, "learning_rate": 1e-05, "loss": 0.0188, "step": 853000 }, { "epoch": 0.008531, "grad_norm": 0.1659252941608429, "learning_rate": 1e-05, "loss": 0.0187, "step": 853100 }, { "epoch": 0.008532, "grad_norm": 0.1608964055776596, "learning_rate": 1e-05, "loss": 0.0185, "step": 853200 }, { "epoch": 0.008533, "grad_norm": 0.16281086206436157, "learning_rate": 1e-05, "loss": 0.0187, "step": 853300 }, { "epoch": 0.008534, "grad_norm": 0.17632149159908295, "learning_rate": 1e-05, "loss": 0.019, "step": 853400 }, { "epoch": 0.008535, "grad_norm": 0.1751045286655426, "learning_rate": 1e-05, "loss": 0.019, "step": 853500 }, { "epoch": 0.008536, "grad_norm": 0.12642407417297363, "learning_rate": 1e-05, "loss": 0.0182, "step": 853600 }, { "epoch": 0.008537, "grad_norm": 0.1663094460964203, "learning_rate": 1e-05, "loss": 0.0189, "step": 853700 }, { "epoch": 0.008538, "grad_norm": 0.13372911512851715, "learning_rate": 1e-05, "loss": 0.0188, "step": 853800 }, { "epoch": 0.008539, "grad_norm": 0.10659553855657578, "learning_rate": 1e-05, "loss": 0.0181, "step": 853900 }, { "epoch": 0.00854, "grad_norm": 0.18370027840137482, "learning_rate": 1e-05, "loss": 0.0189, "step": 854000 }, { "epoch": 0.008541, "grad_norm": 0.16519978642463684, "learning_rate": 1e-05, "loss": 0.0185, "step": 854100 }, { "epoch": 0.008542, "grad_norm": 0.18896180391311646, "learning_rate": 1e-05, "loss": 0.0185, "step": 854200 }, { "epoch": 0.008543, "grad_norm": 0.1274733543395996, "learning_rate": 1e-05, "loss": 0.0185, "step": 854300 }, { "epoch": 0.008544, "grad_norm": 0.15758013725280762, "learning_rate": 1e-05, "loss": 0.0187, "step": 854400 }, { "epoch": 0.008545, "grad_norm": 0.08828910440206528, "learning_rate": 1e-05, "loss": 0.0185, "step": 854500 }, { "epoch": 0.008546, "grad_norm": 0.16567537188529968, "learning_rate": 1e-05, "loss": 0.0182, "step": 854600 }, { "epoch": 0.008547, "grad_norm": 0.15643800795078278, "learning_rate": 1e-05, "loss": 0.0184, "step": 854700 }, { "epoch": 0.008548, "grad_norm": 0.12173013389110565, "learning_rate": 1e-05, "loss": 0.0185, "step": 854800 }, { "epoch": 0.008549, "grad_norm": 0.20929479598999023, "learning_rate": 1e-05, "loss": 0.0186, "step": 854900 }, { "epoch": 0.00855, "grad_norm": 0.1000482365489006, "learning_rate": 1e-05, "loss": 0.0186, "step": 855000 }, { "epoch": 0.008551, "grad_norm": 0.12348522245883942, "learning_rate": 1e-05, "loss": 0.0183, "step": 855100 }, { "epoch": 0.008552, "grad_norm": 0.13704268634319305, "learning_rate": 1e-05, "loss": 0.0184, "step": 855200 }, { "epoch": 0.008553, "grad_norm": 0.1380419135093689, "learning_rate": 1e-05, "loss": 0.0179, "step": 855300 }, { "epoch": 0.008554, "grad_norm": 0.19596624374389648, "learning_rate": 1e-05, "loss": 0.0186, "step": 855400 }, { "epoch": 0.008555, "grad_norm": 0.1264543980360031, "learning_rate": 1e-05, "loss": 0.0186, "step": 855500 }, { "epoch": 0.008556, "grad_norm": 0.19547446072101593, "learning_rate": 1e-05, "loss": 0.019, "step": 855600 }, { "epoch": 0.008557, "grad_norm": 0.19333820044994354, "learning_rate": 1e-05, "loss": 0.0182, "step": 855700 }, { "epoch": 0.008558, "grad_norm": 0.14444711804389954, "learning_rate": 1e-05, "loss": 0.0187, "step": 855800 }, { "epoch": 0.008559, "grad_norm": 0.1193908154964447, "learning_rate": 1e-05, "loss": 0.0184, "step": 855900 }, { "epoch": 0.00856, "grad_norm": 0.12708650529384613, "learning_rate": 1e-05, "loss": 0.0188, "step": 856000 }, { "epoch": 0.008561, "grad_norm": 0.11507633328437805, "learning_rate": 1e-05, "loss": 0.0184, "step": 856100 }, { "epoch": 0.008562, "grad_norm": 0.14782090485095978, "learning_rate": 1e-05, "loss": 0.0183, "step": 856200 }, { "epoch": 0.008563, "grad_norm": 0.17326800525188446, "learning_rate": 1e-05, "loss": 0.0186, "step": 856300 }, { "epoch": 0.008564, "grad_norm": 0.15683072805404663, "learning_rate": 1e-05, "loss": 0.0188, "step": 856400 }, { "epoch": 0.008565, "grad_norm": 0.13918420672416687, "learning_rate": 1e-05, "loss": 0.0189, "step": 856500 }, { "epoch": 0.008566, "grad_norm": 0.18805131316184998, "learning_rate": 1e-05, "loss": 0.0188, "step": 856600 }, { "epoch": 0.008567, "grad_norm": 0.2263769954442978, "learning_rate": 1e-05, "loss": 0.0186, "step": 856700 }, { "epoch": 0.008568, "grad_norm": 0.12000883370637894, "learning_rate": 1e-05, "loss": 0.0187, "step": 856800 }, { "epoch": 0.008569, "grad_norm": 0.21407368779182434, "learning_rate": 1e-05, "loss": 0.0188, "step": 856900 }, { "epoch": 0.00857, "grad_norm": 0.1578390747308731, "learning_rate": 1e-05, "loss": 0.0183, "step": 857000 }, { "epoch": 0.008571, "grad_norm": 0.1745588332414627, "learning_rate": 1e-05, "loss": 0.0189, "step": 857100 }, { "epoch": 0.008572, "grad_norm": 0.15748558938503265, "learning_rate": 1e-05, "loss": 0.0188, "step": 857200 }, { "epoch": 0.008573, "grad_norm": 0.1312299370765686, "learning_rate": 1e-05, "loss": 0.0185, "step": 857300 }, { "epoch": 0.008574, "grad_norm": 0.1563064306974411, "learning_rate": 1e-05, "loss": 0.0181, "step": 857400 }, { "epoch": 0.008575, "grad_norm": 0.20776718854904175, "learning_rate": 1e-05, "loss": 0.0185, "step": 857500 }, { "epoch": 0.008576, "grad_norm": 0.155299574136734, "learning_rate": 1e-05, "loss": 0.0189, "step": 857600 }, { "epoch": 0.008577, "grad_norm": 0.11810062825679779, "learning_rate": 1e-05, "loss": 0.0187, "step": 857700 }, { "epoch": 0.008578, "grad_norm": 0.1368316113948822, "learning_rate": 1e-05, "loss": 0.0184, "step": 857800 }, { "epoch": 0.008579, "grad_norm": 0.17015062272548676, "learning_rate": 1e-05, "loss": 0.0181, "step": 857900 }, { "epoch": 0.00858, "grad_norm": 0.16615846753120422, "learning_rate": 1e-05, "loss": 0.0185, "step": 858000 }, { "epoch": 0.008581, "grad_norm": 0.15277418494224548, "learning_rate": 1e-05, "loss": 0.0186, "step": 858100 }, { "epoch": 0.008582, "grad_norm": 0.1506882756948471, "learning_rate": 1e-05, "loss": 0.0188, "step": 858200 }, { "epoch": 0.008583, "grad_norm": 0.15450212359428406, "learning_rate": 1e-05, "loss": 0.0189, "step": 858300 }, { "epoch": 0.008584, "grad_norm": 0.1548556238412857, "learning_rate": 1e-05, "loss": 0.019, "step": 858400 }, { "epoch": 0.008585, "grad_norm": 0.19533976912498474, "learning_rate": 1e-05, "loss": 0.0188, "step": 858500 }, { "epoch": 0.008586, "grad_norm": 0.1553467959165573, "learning_rate": 1e-05, "loss": 0.0184, "step": 858600 }, { "epoch": 0.008587, "grad_norm": 0.14134523272514343, "learning_rate": 1e-05, "loss": 0.0184, "step": 858700 }, { "epoch": 0.008588, "grad_norm": 0.13621000945568085, "learning_rate": 1e-05, "loss": 0.019, "step": 858800 }, { "epoch": 0.008589, "grad_norm": 0.16073787212371826, "learning_rate": 1e-05, "loss": 0.0183, "step": 858900 }, { "epoch": 0.00859, "grad_norm": 0.1272251009941101, "learning_rate": 1e-05, "loss": 0.0186, "step": 859000 }, { "epoch": 0.008591, "grad_norm": 0.12999433279037476, "learning_rate": 1e-05, "loss": 0.0188, "step": 859100 }, { "epoch": 0.008592, "grad_norm": 0.15561732649803162, "learning_rate": 1e-05, "loss": 0.019, "step": 859200 }, { "epoch": 0.008593, "grad_norm": 0.20546385645866394, "learning_rate": 1e-05, "loss": 0.0186, "step": 859300 }, { "epoch": 0.008594, "grad_norm": 0.12644308805465698, "learning_rate": 1e-05, "loss": 0.019, "step": 859400 }, { "epoch": 0.008595, "grad_norm": 0.14834792912006378, "learning_rate": 1e-05, "loss": 0.0189, "step": 859500 }, { "epoch": 0.008596, "grad_norm": 0.13942191004753113, "learning_rate": 1e-05, "loss": 0.0184, "step": 859600 }, { "epoch": 0.008597, "grad_norm": 0.19134604930877686, "learning_rate": 1e-05, "loss": 0.0185, "step": 859700 }, { "epoch": 0.008598, "grad_norm": 0.18476970493793488, "learning_rate": 1e-05, "loss": 0.0188, "step": 859800 }, { "epoch": 0.008599, "grad_norm": 0.15614810585975647, "learning_rate": 1e-05, "loss": 0.0183, "step": 859900 }, { "epoch": 0.0086, "grad_norm": 0.1129843145608902, "learning_rate": 1e-05, "loss": 0.0184, "step": 860000 }, { "epoch": 0.0086, "eval_loss": 0.016237856820225716, "eval_runtime": 193.1371, "eval_samples_per_second": 258.883, "eval_steps_per_second": 16.18, "step": 860000 }, { "epoch": 0.008601, "grad_norm": 0.1583297997713089, "learning_rate": 1e-05, "loss": 0.0191, "step": 860100 }, { "epoch": 0.008602, "grad_norm": 0.20651449263095856, "learning_rate": 1e-05, "loss": 0.0184, "step": 860200 }, { "epoch": 0.008603, "grad_norm": 0.19557519257068634, "learning_rate": 1e-05, "loss": 0.0184, "step": 860300 }, { "epoch": 0.008604, "grad_norm": 0.17164331674575806, "learning_rate": 1e-05, "loss": 0.019, "step": 860400 }, { "epoch": 0.008605, "grad_norm": 0.1875590682029724, "learning_rate": 1e-05, "loss": 0.0182, "step": 860500 }, { "epoch": 0.008606, "grad_norm": 0.1874011904001236, "learning_rate": 1e-05, "loss": 0.0183, "step": 860600 }, { "epoch": 0.008607, "grad_norm": 0.179922953248024, "learning_rate": 1e-05, "loss": 0.0183, "step": 860700 }, { "epoch": 0.008608, "grad_norm": 0.12793782353401184, "learning_rate": 1e-05, "loss": 0.0181, "step": 860800 }, { "epoch": 0.008609, "grad_norm": 0.13102753460407257, "learning_rate": 1e-05, "loss": 0.0188, "step": 860900 }, { "epoch": 0.00861, "grad_norm": 0.184385746717453, "learning_rate": 1e-05, "loss": 0.0191, "step": 861000 }, { "epoch": 0.008611, "grad_norm": 0.12180142849683762, "learning_rate": 1e-05, "loss": 0.0185, "step": 861100 }, { "epoch": 0.008612, "grad_norm": 0.14723557233810425, "learning_rate": 1e-05, "loss": 0.0184, "step": 861200 }, { "epoch": 0.008613, "grad_norm": 0.18393783271312714, "learning_rate": 1e-05, "loss": 0.0192, "step": 861300 }, { "epoch": 0.008614, "grad_norm": 0.21284325420856476, "learning_rate": 1e-05, "loss": 0.0185, "step": 861400 }, { "epoch": 0.008615, "grad_norm": 0.13459736108779907, "learning_rate": 1e-05, "loss": 0.0182, "step": 861500 }, { "epoch": 0.008616, "grad_norm": 0.15448977053165436, "learning_rate": 1e-05, "loss": 0.0184, "step": 861600 }, { "epoch": 0.008617, "grad_norm": 0.14555704593658447, "learning_rate": 1e-05, "loss": 0.0183, "step": 861700 }, { "epoch": 0.008618, "grad_norm": 0.23051634430885315, "learning_rate": 1e-05, "loss": 0.0186, "step": 861800 }, { "epoch": 0.008619, "grad_norm": 0.14128002524375916, "learning_rate": 1e-05, "loss": 0.0184, "step": 861900 }, { "epoch": 0.00862, "grad_norm": 0.21713712811470032, "learning_rate": 1e-05, "loss": 0.0184, "step": 862000 }, { "epoch": 0.008621, "grad_norm": 0.14954572916030884, "learning_rate": 1e-05, "loss": 0.0185, "step": 862100 }, { "epoch": 0.008622, "grad_norm": 0.16100461781024933, "learning_rate": 1e-05, "loss": 0.0189, "step": 862200 }, { "epoch": 0.008623, "grad_norm": 0.16636161506175995, "learning_rate": 1e-05, "loss": 0.0185, "step": 862300 }, { "epoch": 0.008624, "grad_norm": 0.2048698216676712, "learning_rate": 1e-05, "loss": 0.0184, "step": 862400 }, { "epoch": 0.008625, "grad_norm": 0.1501368284225464, "learning_rate": 1e-05, "loss": 0.0187, "step": 862500 }, { "epoch": 0.008626, "grad_norm": 0.12604841589927673, "learning_rate": 1e-05, "loss": 0.0187, "step": 862600 }, { "epoch": 0.008627, "grad_norm": 0.17997145652770996, "learning_rate": 1e-05, "loss": 0.0185, "step": 862700 }, { "epoch": 0.008628, "grad_norm": 0.1334325224161148, "learning_rate": 1e-05, "loss": 0.0183, "step": 862800 }, { "epoch": 0.008629, "grad_norm": 0.14956121146678925, "learning_rate": 1e-05, "loss": 0.0183, "step": 862900 }, { "epoch": 0.00863, "grad_norm": 0.15890425443649292, "learning_rate": 1e-05, "loss": 0.0183, "step": 863000 }, { "epoch": 0.008631, "grad_norm": 0.14226113259792328, "learning_rate": 1e-05, "loss": 0.0181, "step": 863100 }, { "epoch": 0.008632, "grad_norm": 0.15432757139205933, "learning_rate": 1e-05, "loss": 0.0186, "step": 863200 }, { "epoch": 0.008633, "grad_norm": 0.1500442922115326, "learning_rate": 1e-05, "loss": 0.0185, "step": 863300 }, { "epoch": 0.008634, "grad_norm": 0.1424553096294403, "learning_rate": 1e-05, "loss": 0.0183, "step": 863400 }, { "epoch": 0.008635, "grad_norm": 0.2354820817708969, "learning_rate": 1e-05, "loss": 0.0184, "step": 863500 }, { "epoch": 0.008636, "grad_norm": 0.14333809912204742, "learning_rate": 1e-05, "loss": 0.0188, "step": 863600 }, { "epoch": 0.008637, "grad_norm": 0.16527903079986572, "learning_rate": 1e-05, "loss": 0.0187, "step": 863700 }, { "epoch": 0.008638, "grad_norm": 0.12386555969715118, "learning_rate": 1e-05, "loss": 0.0189, "step": 863800 }, { "epoch": 0.008639, "grad_norm": 0.1771816611289978, "learning_rate": 1e-05, "loss": 0.0184, "step": 863900 }, { "epoch": 0.00864, "grad_norm": 0.1813625991344452, "learning_rate": 1e-05, "loss": 0.0185, "step": 864000 }, { "epoch": 0.008641, "grad_norm": 0.14576959609985352, "learning_rate": 1e-05, "loss": 0.0185, "step": 864100 }, { "epoch": 0.008642, "grad_norm": 0.23645524680614471, "learning_rate": 1e-05, "loss": 0.018, "step": 864200 }, { "epoch": 0.008643, "grad_norm": 0.1863594502210617, "learning_rate": 1e-05, "loss": 0.0184, "step": 864300 }, { "epoch": 0.008644, "grad_norm": 0.13713227212429047, "learning_rate": 1e-05, "loss": 0.0183, "step": 864400 }, { "epoch": 0.008645, "grad_norm": 0.10894183814525604, "learning_rate": 1e-05, "loss": 0.0184, "step": 864500 }, { "epoch": 0.008646, "grad_norm": 0.2037230283021927, "learning_rate": 1e-05, "loss": 0.0186, "step": 864600 }, { "epoch": 0.008647, "grad_norm": 0.12614932656288147, "learning_rate": 1e-05, "loss": 0.0186, "step": 864700 }, { "epoch": 0.008648, "grad_norm": 0.15267989039421082, "learning_rate": 1e-05, "loss": 0.0186, "step": 864800 }, { "epoch": 0.008649, "grad_norm": 0.14615829288959503, "learning_rate": 1e-05, "loss": 0.0186, "step": 864900 }, { "epoch": 0.00865, "grad_norm": 0.12047168612480164, "learning_rate": 1e-05, "loss": 0.018, "step": 865000 }, { "epoch": 0.008651, "grad_norm": 0.27930453419685364, "learning_rate": 1e-05, "loss": 0.0184, "step": 865100 }, { "epoch": 0.008652, "grad_norm": 0.15010792016983032, "learning_rate": 1e-05, "loss": 0.0181, "step": 865200 }, { "epoch": 0.008653, "grad_norm": 0.19017308950424194, "learning_rate": 1e-05, "loss": 0.0186, "step": 865300 }, { "epoch": 0.008654, "grad_norm": 0.2176983505487442, "learning_rate": 1e-05, "loss": 0.0181, "step": 865400 }, { "epoch": 0.008655, "grad_norm": 0.1533956080675125, "learning_rate": 1e-05, "loss": 0.0187, "step": 865500 }, { "epoch": 0.008656, "grad_norm": 0.1986723244190216, "learning_rate": 1e-05, "loss": 0.0182, "step": 865600 }, { "epoch": 0.008657, "grad_norm": 0.13980521261692047, "learning_rate": 1e-05, "loss": 0.0183, "step": 865700 }, { "epoch": 0.008658, "grad_norm": 0.23486937582492828, "learning_rate": 1e-05, "loss": 0.0188, "step": 865800 }, { "epoch": 0.008659, "grad_norm": 0.13180288672447205, "learning_rate": 1e-05, "loss": 0.0187, "step": 865900 }, { "epoch": 0.00866, "grad_norm": 0.11950536072254181, "learning_rate": 1e-05, "loss": 0.018, "step": 866000 }, { "epoch": 0.008661, "grad_norm": 0.15569955110549927, "learning_rate": 1e-05, "loss": 0.0188, "step": 866100 }, { "epoch": 0.008662, "grad_norm": 0.12320094555616379, "learning_rate": 1e-05, "loss": 0.0189, "step": 866200 }, { "epoch": 0.008663, "grad_norm": 0.17038144171237946, "learning_rate": 1e-05, "loss": 0.0184, "step": 866300 }, { "epoch": 0.008664, "grad_norm": 0.15978620946407318, "learning_rate": 1e-05, "loss": 0.0185, "step": 866400 }, { "epoch": 0.008665, "grad_norm": 0.15020595490932465, "learning_rate": 1e-05, "loss": 0.0184, "step": 866500 }, { "epoch": 0.008666, "grad_norm": 0.12315741926431656, "learning_rate": 1e-05, "loss": 0.0188, "step": 866600 }, { "epoch": 0.008667, "grad_norm": 0.16382789611816406, "learning_rate": 1e-05, "loss": 0.0186, "step": 866700 }, { "epoch": 0.008668, "grad_norm": 0.1987118422985077, "learning_rate": 1e-05, "loss": 0.0186, "step": 866800 }, { "epoch": 0.008669, "grad_norm": 0.1600421816110611, "learning_rate": 1e-05, "loss": 0.0187, "step": 866900 }, { "epoch": 0.00867, "grad_norm": 0.13513420522212982, "learning_rate": 1e-05, "loss": 0.0187, "step": 867000 }, { "epoch": 0.008671, "grad_norm": 0.157680943608284, "learning_rate": 1e-05, "loss": 0.0184, "step": 867100 }, { "epoch": 0.008672, "grad_norm": 0.1576307713985443, "learning_rate": 1e-05, "loss": 0.0181, "step": 867200 }, { "epoch": 0.008673, "grad_norm": 0.12113591283559799, "learning_rate": 1e-05, "loss": 0.0183, "step": 867300 }, { "epoch": 0.008674, "grad_norm": 0.1901729553937912, "learning_rate": 1e-05, "loss": 0.0186, "step": 867400 }, { "epoch": 0.008675, "grad_norm": 0.14101041853427887, "learning_rate": 1e-05, "loss": 0.0184, "step": 867500 }, { "epoch": 0.008676, "grad_norm": 0.10454657673835754, "learning_rate": 1e-05, "loss": 0.0188, "step": 867600 }, { "epoch": 0.008677, "grad_norm": 0.15678225457668304, "learning_rate": 1e-05, "loss": 0.0184, "step": 867700 }, { "epoch": 0.008678, "grad_norm": 0.14052525162696838, "learning_rate": 1e-05, "loss": 0.0178, "step": 867800 }, { "epoch": 0.008679, "grad_norm": 0.13929831981658936, "learning_rate": 1e-05, "loss": 0.0186, "step": 867900 }, { "epoch": 0.00868, "grad_norm": 0.13414494693279266, "learning_rate": 1e-05, "loss": 0.0183, "step": 868000 }, { "epoch": 0.008681, "grad_norm": 0.24176187813282013, "learning_rate": 1e-05, "loss": 0.0182, "step": 868100 }, { "epoch": 0.008682, "grad_norm": 0.18224535882472992, "learning_rate": 1e-05, "loss": 0.0186, "step": 868200 }, { "epoch": 0.008683, "grad_norm": 0.18480432033538818, "learning_rate": 1e-05, "loss": 0.0185, "step": 868300 }, { "epoch": 0.008684, "grad_norm": 0.1571822315454483, "learning_rate": 1e-05, "loss": 0.0185, "step": 868400 }, { "epoch": 0.008685, "grad_norm": 0.11620522290468216, "learning_rate": 1e-05, "loss": 0.0184, "step": 868500 }, { "epoch": 0.008686, "grad_norm": 0.12258745729923248, "learning_rate": 1e-05, "loss": 0.0184, "step": 868600 }, { "epoch": 0.008687, "grad_norm": 0.15281988680362701, "learning_rate": 1e-05, "loss": 0.0186, "step": 868700 }, { "epoch": 0.008688, "grad_norm": 0.12091915309429169, "learning_rate": 1e-05, "loss": 0.0183, "step": 868800 }, { "epoch": 0.008689, "grad_norm": 0.15248152613639832, "learning_rate": 1e-05, "loss": 0.0185, "step": 868900 }, { "epoch": 0.00869, "grad_norm": 0.2005286067724228, "learning_rate": 1e-05, "loss": 0.0184, "step": 869000 }, { "epoch": 0.008691, "grad_norm": 0.18565693497657776, "learning_rate": 1e-05, "loss": 0.0184, "step": 869100 }, { "epoch": 0.008692, "grad_norm": 0.14085853099822998, "learning_rate": 1e-05, "loss": 0.0183, "step": 869200 }, { "epoch": 0.008693, "grad_norm": 0.2384158819913864, "learning_rate": 1e-05, "loss": 0.0187, "step": 869300 }, { "epoch": 0.008694, "grad_norm": 0.1306806355714798, "learning_rate": 1e-05, "loss": 0.0185, "step": 869400 }, { "epoch": 0.008695, "grad_norm": 0.19374512135982513, "learning_rate": 1e-05, "loss": 0.0184, "step": 869500 }, { "epoch": 0.008696, "grad_norm": 0.1787797510623932, "learning_rate": 1e-05, "loss": 0.0184, "step": 869600 }, { "epoch": 0.008697, "grad_norm": 0.12028446048498154, "learning_rate": 1e-05, "loss": 0.0186, "step": 869700 }, { "epoch": 0.008698, "grad_norm": 0.15014944970607758, "learning_rate": 1e-05, "loss": 0.0186, "step": 869800 }, { "epoch": 0.008699, "grad_norm": 0.1280466467142105, "learning_rate": 1e-05, "loss": 0.0182, "step": 869900 }, { "epoch": 0.0087, "grad_norm": 0.17187736928462982, "learning_rate": 1e-05, "loss": 0.0189, "step": 870000 }, { "epoch": 0.008701, "grad_norm": 0.1486150026321411, "learning_rate": 1e-05, "loss": 0.0185, "step": 870100 }, { "epoch": 0.008702, "grad_norm": 0.1770174652338028, "learning_rate": 1e-05, "loss": 0.0183, "step": 870200 }, { "epoch": 0.008703, "grad_norm": 0.20413470268249512, "learning_rate": 1e-05, "loss": 0.0186, "step": 870300 }, { "epoch": 0.008704, "grad_norm": 0.13263431191444397, "learning_rate": 1e-05, "loss": 0.0185, "step": 870400 }, { "epoch": 0.008705, "grad_norm": 0.16442815959453583, "learning_rate": 1e-05, "loss": 0.0188, "step": 870500 }, { "epoch": 0.008706, "grad_norm": 0.15862475335597992, "learning_rate": 1e-05, "loss": 0.0181, "step": 870600 }, { "epoch": 0.008707, "grad_norm": 0.1705312728881836, "learning_rate": 1e-05, "loss": 0.0186, "step": 870700 }, { "epoch": 0.008708, "grad_norm": 0.17873601615428925, "learning_rate": 1e-05, "loss": 0.0182, "step": 870800 }, { "epoch": 0.008709, "grad_norm": 0.1645902693271637, "learning_rate": 1e-05, "loss": 0.0186, "step": 870900 }, { "epoch": 0.00871, "grad_norm": 0.12625685334205627, "learning_rate": 1e-05, "loss": 0.0184, "step": 871000 }, { "epoch": 0.008711, "grad_norm": 0.15443825721740723, "learning_rate": 1e-05, "loss": 0.0187, "step": 871100 }, { "epoch": 0.008712, "grad_norm": 0.10927312076091766, "learning_rate": 1e-05, "loss": 0.0184, "step": 871200 }, { "epoch": 0.008713, "grad_norm": 0.13370484113693237, "learning_rate": 1e-05, "loss": 0.0187, "step": 871300 }, { "epoch": 0.008714, "grad_norm": 0.12467143684625626, "learning_rate": 1e-05, "loss": 0.0182, "step": 871400 }, { "epoch": 0.008715, "grad_norm": 0.16119369864463806, "learning_rate": 1e-05, "loss": 0.0182, "step": 871500 }, { "epoch": 0.008716, "grad_norm": 0.12081447243690491, "learning_rate": 1e-05, "loss": 0.0186, "step": 871600 }, { "epoch": 0.008717, "grad_norm": 0.1531410813331604, "learning_rate": 1e-05, "loss": 0.0182, "step": 871700 }, { "epoch": 0.008718, "grad_norm": 0.16026076674461365, "learning_rate": 1e-05, "loss": 0.0184, "step": 871800 }, { "epoch": 0.008719, "grad_norm": 0.23037678003311157, "learning_rate": 1e-05, "loss": 0.0184, "step": 871900 }, { "epoch": 0.00872, "grad_norm": 0.15930558741092682, "learning_rate": 1e-05, "loss": 0.0185, "step": 872000 }, { "epoch": 0.008721, "grad_norm": 0.13841521739959717, "learning_rate": 1e-05, "loss": 0.0186, "step": 872100 }, { "epoch": 0.008722, "grad_norm": 0.1856965273618698, "learning_rate": 1e-05, "loss": 0.0183, "step": 872200 }, { "epoch": 0.008723, "grad_norm": 0.15583489835262299, "learning_rate": 1e-05, "loss": 0.0185, "step": 872300 }, { "epoch": 0.008724, "grad_norm": 0.1536627560853958, "learning_rate": 1e-05, "loss": 0.0183, "step": 872400 }, { "epoch": 0.008725, "grad_norm": 0.1345738023519516, "learning_rate": 1e-05, "loss": 0.0185, "step": 872500 }, { "epoch": 0.008726, "grad_norm": 0.14933055639266968, "learning_rate": 1e-05, "loss": 0.0181, "step": 872600 }, { "epoch": 0.008727, "grad_norm": 0.155944362282753, "learning_rate": 1e-05, "loss": 0.0185, "step": 872700 }, { "epoch": 0.008728, "grad_norm": 0.1265515238046646, "learning_rate": 1e-05, "loss": 0.0185, "step": 872800 }, { "epoch": 0.008729, "grad_norm": 0.12762734293937683, "learning_rate": 1e-05, "loss": 0.0182, "step": 872900 }, { "epoch": 0.00873, "grad_norm": 0.1428944319486618, "learning_rate": 1e-05, "loss": 0.0184, "step": 873000 }, { "epoch": 0.008731, "grad_norm": 0.16642452776432037, "learning_rate": 1e-05, "loss": 0.0185, "step": 873100 }, { "epoch": 0.008732, "grad_norm": 0.1398540586233139, "learning_rate": 1e-05, "loss": 0.0182, "step": 873200 }, { "epoch": 0.008733, "grad_norm": 0.1512795239686966, "learning_rate": 1e-05, "loss": 0.0187, "step": 873300 }, { "epoch": 0.008734, "grad_norm": 0.11704552173614502, "learning_rate": 1e-05, "loss": 0.0186, "step": 873400 }, { "epoch": 0.008735, "grad_norm": 0.14904972910881042, "learning_rate": 1e-05, "loss": 0.0188, "step": 873500 }, { "epoch": 0.008736, "grad_norm": 0.12108015269041061, "learning_rate": 1e-05, "loss": 0.018, "step": 873600 }, { "epoch": 0.008737, "grad_norm": 0.19360658526420593, "learning_rate": 1e-05, "loss": 0.0181, "step": 873700 }, { "epoch": 0.008738, "grad_norm": 0.12054184079170227, "learning_rate": 1e-05, "loss": 0.0183, "step": 873800 }, { "epoch": 0.008739, "grad_norm": 0.10273429751396179, "learning_rate": 1e-05, "loss": 0.0186, "step": 873900 }, { "epoch": 0.00874, "grad_norm": 0.21025708317756653, "learning_rate": 1e-05, "loss": 0.0183, "step": 874000 }, { "epoch": 0.008741, "grad_norm": 0.17953529953956604, "learning_rate": 1e-05, "loss": 0.0187, "step": 874100 }, { "epoch": 0.008742, "grad_norm": 0.1350858360528946, "learning_rate": 1e-05, "loss": 0.019, "step": 874200 }, { "epoch": 0.008743, "grad_norm": 0.17580540478229523, "learning_rate": 1e-05, "loss": 0.0185, "step": 874300 }, { "epoch": 0.008744, "grad_norm": 0.17329013347625732, "learning_rate": 1e-05, "loss": 0.0183, "step": 874400 }, { "epoch": 0.008745, "grad_norm": 0.19242072105407715, "learning_rate": 1e-05, "loss": 0.0186, "step": 874500 }, { "epoch": 0.008746, "grad_norm": 0.11703689396381378, "learning_rate": 1e-05, "loss": 0.0185, "step": 874600 }, { "epoch": 0.008747, "grad_norm": 0.1590590626001358, "learning_rate": 1e-05, "loss": 0.0182, "step": 874700 }, { "epoch": 0.008748, "grad_norm": 0.14908738434314728, "learning_rate": 1e-05, "loss": 0.0186, "step": 874800 }, { "epoch": 0.008749, "grad_norm": 0.16394561529159546, "learning_rate": 1e-05, "loss": 0.0188, "step": 874900 }, { "epoch": 0.00875, "grad_norm": 0.12415405362844467, "learning_rate": 1e-05, "loss": 0.019, "step": 875000 }, { "epoch": 0.008751, "grad_norm": 0.15426790714263916, "learning_rate": 1e-05, "loss": 0.0183, "step": 875100 }, { "epoch": 0.008752, "grad_norm": 0.19661279022693634, "learning_rate": 1e-05, "loss": 0.0185, "step": 875200 }, { "epoch": 0.008753, "grad_norm": 0.15779154002666473, "learning_rate": 1e-05, "loss": 0.0187, "step": 875300 }, { "epoch": 0.008754, "grad_norm": 0.2129068225622177, "learning_rate": 1e-05, "loss": 0.0185, "step": 875400 }, { "epoch": 0.008755, "grad_norm": 0.10726603120565414, "learning_rate": 1e-05, "loss": 0.018, "step": 875500 }, { "epoch": 0.008756, "grad_norm": 0.16807173192501068, "learning_rate": 1e-05, "loss": 0.0187, "step": 875600 }, { "epoch": 0.008757, "grad_norm": 0.1628911942243576, "learning_rate": 1e-05, "loss": 0.0183, "step": 875700 }, { "epoch": 0.008758, "grad_norm": 0.17062713205814362, "learning_rate": 1e-05, "loss": 0.0182, "step": 875800 }, { "epoch": 0.008759, "grad_norm": 0.13081581890583038, "learning_rate": 1e-05, "loss": 0.0185, "step": 875900 }, { "epoch": 0.00876, "grad_norm": 0.15656022727489471, "learning_rate": 1e-05, "loss": 0.0185, "step": 876000 }, { "epoch": 0.008761, "grad_norm": 0.14621686935424805, "learning_rate": 1e-05, "loss": 0.0184, "step": 876100 }, { "epoch": 0.008762, "grad_norm": 0.15905456244945526, "learning_rate": 1e-05, "loss": 0.0184, "step": 876200 }, { "epoch": 0.008763, "grad_norm": 0.19403229653835297, "learning_rate": 1e-05, "loss": 0.0185, "step": 876300 }, { "epoch": 0.008764, "grad_norm": 0.11406006664037704, "learning_rate": 1e-05, "loss": 0.0183, "step": 876400 }, { "epoch": 0.008765, "grad_norm": 0.17895850539207458, "learning_rate": 1e-05, "loss": 0.0184, "step": 876500 }, { "epoch": 0.008766, "grad_norm": 0.1633761078119278, "learning_rate": 1e-05, "loss": 0.0185, "step": 876600 }, { "epoch": 0.008767, "grad_norm": 0.167178213596344, "learning_rate": 1e-05, "loss": 0.0183, "step": 876700 }, { "epoch": 0.008768, "grad_norm": 0.1719236671924591, "learning_rate": 1e-05, "loss": 0.0187, "step": 876800 }, { "epoch": 0.008769, "grad_norm": 0.14956308901309967, "learning_rate": 1e-05, "loss": 0.0184, "step": 876900 }, { "epoch": 0.00877, "grad_norm": 0.14457207918167114, "learning_rate": 1e-05, "loss": 0.0183, "step": 877000 }, { "epoch": 0.008771, "grad_norm": 0.11838071793317795, "learning_rate": 1e-05, "loss": 0.0186, "step": 877100 }, { "epoch": 0.008772, "grad_norm": 0.15192940831184387, "learning_rate": 1e-05, "loss": 0.0184, "step": 877200 }, { "epoch": 0.008773, "grad_norm": 0.1836523711681366, "learning_rate": 1e-05, "loss": 0.0183, "step": 877300 }, { "epoch": 0.008774, "grad_norm": 0.14000414311885834, "learning_rate": 1e-05, "loss": 0.018, "step": 877400 }, { "epoch": 0.008775, "grad_norm": 0.12766854465007782, "learning_rate": 1e-05, "loss": 0.0185, "step": 877500 }, { "epoch": 0.008776, "grad_norm": 0.21863588690757751, "learning_rate": 1e-05, "loss": 0.0187, "step": 877600 }, { "epoch": 0.008777, "grad_norm": 0.16444934904575348, "learning_rate": 1e-05, "loss": 0.018, "step": 877700 }, { "epoch": 0.008778, "grad_norm": 0.10352537035942078, "learning_rate": 1e-05, "loss": 0.0182, "step": 877800 }, { "epoch": 0.008779, "grad_norm": 0.13244041800498962, "learning_rate": 1e-05, "loss": 0.0183, "step": 877900 }, { "epoch": 0.00878, "grad_norm": 0.15563419461250305, "learning_rate": 1e-05, "loss": 0.0186, "step": 878000 }, { "epoch": 0.008781, "grad_norm": 0.1264432817697525, "learning_rate": 1e-05, "loss": 0.0183, "step": 878100 }, { "epoch": 0.008782, "grad_norm": 0.11195482313632965, "learning_rate": 1e-05, "loss": 0.0188, "step": 878200 }, { "epoch": 0.008783, "grad_norm": 0.21770939230918884, "learning_rate": 1e-05, "loss": 0.0191, "step": 878300 }, { "epoch": 0.008784, "grad_norm": 0.16072431206703186, "learning_rate": 1e-05, "loss": 0.0188, "step": 878400 }, { "epoch": 0.008785, "grad_norm": 0.1513386368751526, "learning_rate": 1e-05, "loss": 0.0188, "step": 878500 }, { "epoch": 0.008786, "grad_norm": 0.14829011261463165, "learning_rate": 1e-05, "loss": 0.0186, "step": 878600 }, { "epoch": 0.008787, "grad_norm": 0.13129781186580658, "learning_rate": 1e-05, "loss": 0.0186, "step": 878700 }, { "epoch": 0.008788, "grad_norm": 0.15758684277534485, "learning_rate": 1e-05, "loss": 0.0184, "step": 878800 }, { "epoch": 0.008789, "grad_norm": 0.16325940191745758, "learning_rate": 1e-05, "loss": 0.0182, "step": 878900 }, { "epoch": 0.00879, "grad_norm": 0.185978963971138, "learning_rate": 1e-05, "loss": 0.0186, "step": 879000 }, { "epoch": 0.008791, "grad_norm": 0.12941130995750427, "learning_rate": 1e-05, "loss": 0.0182, "step": 879100 }, { "epoch": 0.008792, "grad_norm": 0.19413012266159058, "learning_rate": 1e-05, "loss": 0.0182, "step": 879200 }, { "epoch": 0.008793, "grad_norm": 0.11880682408809662, "learning_rate": 1e-05, "loss": 0.0181, "step": 879300 }, { "epoch": 0.008794, "grad_norm": 0.1618969887495041, "learning_rate": 1e-05, "loss": 0.0184, "step": 879400 }, { "epoch": 0.008795, "grad_norm": 0.14989496767520905, "learning_rate": 1e-05, "loss": 0.019, "step": 879500 }, { "epoch": 0.008796, "grad_norm": 0.12696397304534912, "learning_rate": 1e-05, "loss": 0.0184, "step": 879600 }, { "epoch": 0.008797, "grad_norm": 0.1398801952600479, "learning_rate": 1e-05, "loss": 0.0187, "step": 879700 }, { "epoch": 0.008798, "grad_norm": 0.17320065200328827, "learning_rate": 1e-05, "loss": 0.018, "step": 879800 }, { "epoch": 0.008799, "grad_norm": 0.1468813121318817, "learning_rate": 1e-05, "loss": 0.0183, "step": 879900 }, { "epoch": 0.0088, "grad_norm": 0.1519888937473297, "learning_rate": 1e-05, "loss": 0.0185, "step": 880000 }, { "epoch": 0.0088, "eval_loss": 0.01641898602247238, "eval_runtime": 193.4883, "eval_samples_per_second": 258.414, "eval_steps_per_second": 16.151, "step": 880000 }, { "epoch": 0.008801, "grad_norm": 0.13218548893928528, "learning_rate": 1e-05, "loss": 0.0183, "step": 880100 }, { "epoch": 0.008802, "grad_norm": 0.17569676041603088, "learning_rate": 1e-05, "loss": 0.0183, "step": 880200 }, { "epoch": 0.008803, "grad_norm": 0.1743326187133789, "learning_rate": 1e-05, "loss": 0.0183, "step": 880300 }, { "epoch": 0.008804, "grad_norm": 0.11857958137989044, "learning_rate": 1e-05, "loss": 0.0187, "step": 880400 }, { "epoch": 0.008805, "grad_norm": 0.15016457438468933, "learning_rate": 1e-05, "loss": 0.0184, "step": 880500 }, { "epoch": 0.008806, "grad_norm": 0.16006501019001007, "learning_rate": 1e-05, "loss": 0.0181, "step": 880600 }, { "epoch": 0.008807, "grad_norm": 0.1674988865852356, "learning_rate": 1e-05, "loss": 0.0186, "step": 880700 }, { "epoch": 0.008808, "grad_norm": 0.21977591514587402, "learning_rate": 1e-05, "loss": 0.0187, "step": 880800 }, { "epoch": 0.008809, "grad_norm": 0.16367803514003754, "learning_rate": 1e-05, "loss": 0.0188, "step": 880900 }, { "epoch": 0.00881, "grad_norm": 0.13708262145519257, "learning_rate": 1e-05, "loss": 0.0187, "step": 881000 }, { "epoch": 0.008811, "grad_norm": 0.15987981855869293, "learning_rate": 1e-05, "loss": 0.0181, "step": 881100 }, { "epoch": 0.008812, "grad_norm": 0.12909099459648132, "learning_rate": 1e-05, "loss": 0.0185, "step": 881200 }, { "epoch": 0.008813, "grad_norm": 0.16755270957946777, "learning_rate": 1e-05, "loss": 0.0184, "step": 881300 }, { "epoch": 0.008814, "grad_norm": 0.12443696707487106, "learning_rate": 1e-05, "loss": 0.0182, "step": 881400 }, { "epoch": 0.008815, "grad_norm": 0.12546463310718536, "learning_rate": 1e-05, "loss": 0.0185, "step": 881500 }, { "epoch": 0.008816, "grad_norm": 0.1226990595459938, "learning_rate": 1e-05, "loss": 0.0183, "step": 881600 }, { "epoch": 0.008817, "grad_norm": 0.1546824872493744, "learning_rate": 1e-05, "loss": 0.0183, "step": 881700 }, { "epoch": 0.008818, "grad_norm": 0.16216115653514862, "learning_rate": 1e-05, "loss": 0.0186, "step": 881800 }, { "epoch": 0.008819, "grad_norm": 0.14111272990703583, "learning_rate": 1e-05, "loss": 0.018, "step": 881900 }, { "epoch": 0.00882, "grad_norm": 0.1547940969467163, "learning_rate": 1e-05, "loss": 0.0185, "step": 882000 }, { "epoch": 0.008821, "grad_norm": 0.14312316477298737, "learning_rate": 1e-05, "loss": 0.0183, "step": 882100 }, { "epoch": 0.008822, "grad_norm": 0.1578245311975479, "learning_rate": 1e-05, "loss": 0.0188, "step": 882200 }, { "epoch": 0.008823, "grad_norm": 0.12020063400268555, "learning_rate": 1e-05, "loss": 0.0187, "step": 882300 }, { "epoch": 0.008824, "grad_norm": 0.16226626932621002, "learning_rate": 1e-05, "loss": 0.0182, "step": 882400 }, { "epoch": 0.008825, "grad_norm": 0.1465127021074295, "learning_rate": 1e-05, "loss": 0.0181, "step": 882500 }, { "epoch": 0.008826, "grad_norm": 0.11880172789096832, "learning_rate": 1e-05, "loss": 0.0185, "step": 882600 }, { "epoch": 0.008827, "grad_norm": 0.14200527966022491, "learning_rate": 1e-05, "loss": 0.0185, "step": 882700 }, { "epoch": 0.008828, "grad_norm": 0.1847248375415802, "learning_rate": 1e-05, "loss": 0.0182, "step": 882800 }, { "epoch": 0.008829, "grad_norm": 0.30286112427711487, "learning_rate": 1e-05, "loss": 0.0188, "step": 882900 }, { "epoch": 0.00883, "grad_norm": 0.1720632016658783, "learning_rate": 1e-05, "loss": 0.0184, "step": 883000 }, { "epoch": 0.008831, "grad_norm": 0.14803841710090637, "learning_rate": 1e-05, "loss": 0.0188, "step": 883100 }, { "epoch": 0.008832, "grad_norm": 0.11972980201244354, "learning_rate": 1e-05, "loss": 0.0181, "step": 883200 }, { "epoch": 0.008833, "grad_norm": 0.2532072365283966, "learning_rate": 1e-05, "loss": 0.0181, "step": 883300 }, { "epoch": 0.008834, "grad_norm": 0.11033746600151062, "learning_rate": 1e-05, "loss": 0.0185, "step": 883400 }, { "epoch": 0.008835, "grad_norm": 0.1545845866203308, "learning_rate": 1e-05, "loss": 0.0184, "step": 883500 }, { "epoch": 0.008836, "grad_norm": 0.19224192202091217, "learning_rate": 1e-05, "loss": 0.0184, "step": 883600 }, { "epoch": 0.008837, "grad_norm": 0.15350870788097382, "learning_rate": 1e-05, "loss": 0.0182, "step": 883700 }, { "epoch": 0.008838, "grad_norm": 0.1386798769235611, "learning_rate": 1e-05, "loss": 0.0184, "step": 883800 }, { "epoch": 0.008839, "grad_norm": 0.13774198293685913, "learning_rate": 1e-05, "loss": 0.0186, "step": 883900 }, { "epoch": 0.00884, "grad_norm": 0.17120882868766785, "learning_rate": 1e-05, "loss": 0.0181, "step": 884000 }, { "epoch": 0.008841, "grad_norm": 0.11334135383367538, "learning_rate": 1e-05, "loss": 0.0187, "step": 884100 }, { "epoch": 0.008842, "grad_norm": 0.11007793992757797, "learning_rate": 1e-05, "loss": 0.0183, "step": 884200 }, { "epoch": 0.008843, "grad_norm": 0.14736858010292053, "learning_rate": 1e-05, "loss": 0.0184, "step": 884300 }, { "epoch": 0.008844, "grad_norm": 0.12676750123500824, "learning_rate": 1e-05, "loss": 0.0181, "step": 884400 }, { "epoch": 0.008845, "grad_norm": 0.1577613353729248, "learning_rate": 1e-05, "loss": 0.0181, "step": 884500 }, { "epoch": 0.008846, "grad_norm": 0.17493946850299835, "learning_rate": 1e-05, "loss": 0.0185, "step": 884600 }, { "epoch": 0.008847, "grad_norm": 0.11704888939857483, "learning_rate": 1e-05, "loss": 0.0183, "step": 884700 }, { "epoch": 0.008848, "grad_norm": 0.18892425298690796, "learning_rate": 1e-05, "loss": 0.0185, "step": 884800 }, { "epoch": 0.008849, "grad_norm": 0.20540696382522583, "learning_rate": 1e-05, "loss": 0.0183, "step": 884900 }, { "epoch": 0.00885, "grad_norm": 0.14976677298545837, "learning_rate": 1e-05, "loss": 0.018, "step": 885000 }, { "epoch": 0.008851, "grad_norm": 0.1572694033384323, "learning_rate": 1e-05, "loss": 0.0179, "step": 885100 }, { "epoch": 0.008852, "grad_norm": 0.11416725069284439, "learning_rate": 1e-05, "loss": 0.0188, "step": 885200 }, { "epoch": 0.008853, "grad_norm": 0.1498834639787674, "learning_rate": 1e-05, "loss": 0.0183, "step": 885300 }, { "epoch": 0.008854, "grad_norm": 0.12894538044929504, "learning_rate": 1e-05, "loss": 0.0182, "step": 885400 }, { "epoch": 0.008855, "grad_norm": 0.124419666826725, "learning_rate": 1e-05, "loss": 0.0181, "step": 885500 }, { "epoch": 0.008856, "grad_norm": 0.12909086048603058, "learning_rate": 1e-05, "loss": 0.0186, "step": 885600 }, { "epoch": 0.008857, "grad_norm": 0.17190445959568024, "learning_rate": 1e-05, "loss": 0.0184, "step": 885700 }, { "epoch": 0.008858, "grad_norm": 0.18886810541152954, "learning_rate": 1e-05, "loss": 0.0184, "step": 885800 }, { "epoch": 0.008859, "grad_norm": 0.12172508239746094, "learning_rate": 1e-05, "loss": 0.0183, "step": 885900 }, { "epoch": 0.00886, "grad_norm": 0.14471036195755005, "learning_rate": 1e-05, "loss": 0.018, "step": 886000 }, { "epoch": 0.008861, "grad_norm": 0.12085343897342682, "learning_rate": 1e-05, "loss": 0.0182, "step": 886100 }, { "epoch": 0.008862, "grad_norm": 0.12224338203668594, "learning_rate": 1e-05, "loss": 0.0183, "step": 886200 }, { "epoch": 0.008863, "grad_norm": 0.16575980186462402, "learning_rate": 1e-05, "loss": 0.0187, "step": 886300 }, { "epoch": 0.008864, "grad_norm": 0.138269305229187, "learning_rate": 1e-05, "loss": 0.0182, "step": 886400 }, { "epoch": 0.008865, "grad_norm": 0.1397436559200287, "learning_rate": 1e-05, "loss": 0.019, "step": 886500 }, { "epoch": 0.008866, "grad_norm": 0.11881553381681442, "learning_rate": 1e-05, "loss": 0.0181, "step": 886600 }, { "epoch": 0.008867, "grad_norm": 0.13319750130176544, "learning_rate": 1e-05, "loss": 0.0183, "step": 886700 }, { "epoch": 0.008868, "grad_norm": 0.22268471121788025, "learning_rate": 1e-05, "loss": 0.0187, "step": 886800 }, { "epoch": 0.008869, "grad_norm": 0.12764257192611694, "learning_rate": 1e-05, "loss": 0.0182, "step": 886900 }, { "epoch": 0.00887, "grad_norm": 0.17432604730129242, "learning_rate": 1e-05, "loss": 0.018, "step": 887000 }, { "epoch": 0.008871, "grad_norm": 0.12416495382785797, "learning_rate": 1e-05, "loss": 0.018, "step": 887100 }, { "epoch": 0.008872, "grad_norm": 0.1548972725868225, "learning_rate": 1e-05, "loss": 0.0186, "step": 887200 }, { "epoch": 0.008873, "grad_norm": 0.2871725261211395, "learning_rate": 1e-05, "loss": 0.0187, "step": 887300 }, { "epoch": 0.008874, "grad_norm": 0.12544582784175873, "learning_rate": 1e-05, "loss": 0.0182, "step": 887400 }, { "epoch": 0.008875, "grad_norm": 0.11191093176603317, "learning_rate": 1e-05, "loss": 0.0188, "step": 887500 }, { "epoch": 0.008876, "grad_norm": 0.18067890405654907, "learning_rate": 1e-05, "loss": 0.0182, "step": 887600 }, { "epoch": 0.008877, "grad_norm": 0.15382122993469238, "learning_rate": 1e-05, "loss": 0.0185, "step": 887700 }, { "epoch": 0.008878, "grad_norm": 0.17073437571525574, "learning_rate": 1e-05, "loss": 0.0187, "step": 887800 }, { "epoch": 0.008879, "grad_norm": 0.1525074690580368, "learning_rate": 1e-05, "loss": 0.0186, "step": 887900 }, { "epoch": 0.00888, "grad_norm": 0.113156758248806, "learning_rate": 1e-05, "loss": 0.0181, "step": 888000 }, { "epoch": 0.008881, "grad_norm": 0.15739959478378296, "learning_rate": 1e-05, "loss": 0.0181, "step": 888100 }, { "epoch": 0.008882, "grad_norm": 0.13989394903182983, "learning_rate": 1e-05, "loss": 0.0184, "step": 888200 }, { "epoch": 0.008883, "grad_norm": 0.12536217272281647, "learning_rate": 1e-05, "loss": 0.0181, "step": 888300 }, { "epoch": 0.008884, "grad_norm": 0.15557079017162323, "learning_rate": 1e-05, "loss": 0.0183, "step": 888400 }, { "epoch": 0.008885, "grad_norm": 0.2431812286376953, "learning_rate": 1e-05, "loss": 0.0185, "step": 888500 }, { "epoch": 0.008886, "grad_norm": 0.2247869372367859, "learning_rate": 1e-05, "loss": 0.018, "step": 888600 }, { "epoch": 0.008887, "grad_norm": 0.1694740653038025, "learning_rate": 1e-05, "loss": 0.0185, "step": 888700 }, { "epoch": 0.008888, "grad_norm": 0.19485482573509216, "learning_rate": 1e-05, "loss": 0.0181, "step": 888800 }, { "epoch": 0.008889, "grad_norm": 0.12617766857147217, "learning_rate": 1e-05, "loss": 0.0182, "step": 888900 }, { "epoch": 0.00889, "grad_norm": 0.18240231275558472, "learning_rate": 1e-05, "loss": 0.0182, "step": 889000 }, { "epoch": 0.008891, "grad_norm": 0.2514484226703644, "learning_rate": 1e-05, "loss": 0.0186, "step": 889100 }, { "epoch": 0.008892, "grad_norm": 0.15203696489334106, "learning_rate": 1e-05, "loss": 0.0177, "step": 889200 }, { "epoch": 0.008893, "grad_norm": 0.17165015637874603, "learning_rate": 1e-05, "loss": 0.0185, "step": 889300 }, { "epoch": 0.008894, "grad_norm": 0.12678508460521698, "learning_rate": 1e-05, "loss": 0.0187, "step": 889400 }, { "epoch": 0.008895, "grad_norm": 0.1252703070640564, "learning_rate": 1e-05, "loss": 0.0179, "step": 889500 }, { "epoch": 0.008896, "grad_norm": 0.1284625083208084, "learning_rate": 1e-05, "loss": 0.0182, "step": 889600 }, { "epoch": 0.008897, "grad_norm": 0.19089512526988983, "learning_rate": 1e-05, "loss": 0.0179, "step": 889700 }, { "epoch": 0.008898, "grad_norm": 0.17799112200737, "learning_rate": 1e-05, "loss": 0.0188, "step": 889800 }, { "epoch": 0.008899, "grad_norm": 0.1637365221977234, "learning_rate": 1e-05, "loss": 0.0183, "step": 889900 }, { "epoch": 0.0089, "grad_norm": 0.1481604427099228, "learning_rate": 1e-05, "loss": 0.0184, "step": 890000 }, { "epoch": 0.008901, "grad_norm": 0.21722514927387238, "learning_rate": 1e-05, "loss": 0.0184, "step": 890100 }, { "epoch": 0.008902, "grad_norm": 0.204048752784729, "learning_rate": 1e-05, "loss": 0.0185, "step": 890200 }, { "epoch": 0.008903, "grad_norm": 0.1114475354552269, "learning_rate": 1e-05, "loss": 0.0184, "step": 890300 }, { "epoch": 0.008904, "grad_norm": 0.12076828628778458, "learning_rate": 1e-05, "loss": 0.0184, "step": 890400 }, { "epoch": 0.008905, "grad_norm": 0.1468270868062973, "learning_rate": 1e-05, "loss": 0.0182, "step": 890500 }, { "epoch": 0.008906, "grad_norm": 0.08753448724746704, "learning_rate": 1e-05, "loss": 0.0182, "step": 890600 }, { "epoch": 0.008907, "grad_norm": 0.14071469008922577, "learning_rate": 1e-05, "loss": 0.0185, "step": 890700 }, { "epoch": 0.008908, "grad_norm": 0.1425124555826187, "learning_rate": 1e-05, "loss": 0.018, "step": 890800 }, { "epoch": 0.008909, "grad_norm": 0.1256362348794937, "learning_rate": 1e-05, "loss": 0.0184, "step": 890900 }, { "epoch": 0.00891, "grad_norm": 0.17781147360801697, "learning_rate": 1e-05, "loss": 0.0187, "step": 891000 }, { "epoch": 0.008911, "grad_norm": 0.1440223902463913, "learning_rate": 1e-05, "loss": 0.018, "step": 891100 }, { "epoch": 0.008912, "grad_norm": 0.09459402412176132, "learning_rate": 1e-05, "loss": 0.0181, "step": 891200 }, { "epoch": 0.008913, "grad_norm": 0.16252535581588745, "learning_rate": 1e-05, "loss": 0.0184, "step": 891300 }, { "epoch": 0.008914, "grad_norm": 0.1684494912624359, "learning_rate": 1e-05, "loss": 0.0185, "step": 891400 }, { "epoch": 0.008915, "grad_norm": 0.1675552874803543, "learning_rate": 1e-05, "loss": 0.0182, "step": 891500 }, { "epoch": 0.008916, "grad_norm": 0.1640411764383316, "learning_rate": 1e-05, "loss": 0.0187, "step": 891600 }, { "epoch": 0.008917, "grad_norm": 0.14788655936717987, "learning_rate": 1e-05, "loss": 0.0179, "step": 891700 }, { "epoch": 0.008918, "grad_norm": 0.19635003805160522, "learning_rate": 1e-05, "loss": 0.0189, "step": 891800 }, { "epoch": 0.008919, "grad_norm": 0.12005744874477386, "learning_rate": 1e-05, "loss": 0.0184, "step": 891900 }, { "epoch": 0.00892, "grad_norm": 0.17441624402999878, "learning_rate": 1e-05, "loss": 0.0184, "step": 892000 }, { "epoch": 0.008921, "grad_norm": 0.16168160736560822, "learning_rate": 1e-05, "loss": 0.0183, "step": 892100 }, { "epoch": 0.008922, "grad_norm": 0.14362284541130066, "learning_rate": 1e-05, "loss": 0.0181, "step": 892200 }, { "epoch": 0.008923, "grad_norm": 0.1770874261856079, "learning_rate": 1e-05, "loss": 0.0182, "step": 892300 }, { "epoch": 0.008924, "grad_norm": 0.1183050200343132, "learning_rate": 1e-05, "loss": 0.0181, "step": 892400 }, { "epoch": 0.008925, "grad_norm": 0.13709016144275665, "learning_rate": 1e-05, "loss": 0.0185, "step": 892500 }, { "epoch": 0.008926, "grad_norm": 0.15055549144744873, "learning_rate": 1e-05, "loss": 0.0182, "step": 892600 }, { "epoch": 0.008927, "grad_norm": 0.16314135491847992, "learning_rate": 1e-05, "loss": 0.0184, "step": 892700 }, { "epoch": 0.008928, "grad_norm": 0.1657627820968628, "learning_rate": 1e-05, "loss": 0.0183, "step": 892800 }, { "epoch": 0.008929, "grad_norm": 0.11273057758808136, "learning_rate": 1e-05, "loss": 0.0188, "step": 892900 }, { "epoch": 0.00893, "grad_norm": 0.1194474920630455, "learning_rate": 1e-05, "loss": 0.0184, "step": 893000 }, { "epoch": 0.008931, "grad_norm": 0.1647646427154541, "learning_rate": 1e-05, "loss": 0.0183, "step": 893100 }, { "epoch": 0.008932, "grad_norm": 0.14943626523017883, "learning_rate": 1e-05, "loss": 0.0184, "step": 893200 }, { "epoch": 0.008933, "grad_norm": 0.1465156525373459, "learning_rate": 1e-05, "loss": 0.0182, "step": 893300 }, { "epoch": 0.008934, "grad_norm": 0.14284905791282654, "learning_rate": 1e-05, "loss": 0.0184, "step": 893400 }, { "epoch": 0.008935, "grad_norm": 0.17235097289085388, "learning_rate": 1e-05, "loss": 0.0186, "step": 893500 }, { "epoch": 0.008936, "grad_norm": 0.130260169506073, "learning_rate": 1e-05, "loss": 0.0182, "step": 893600 }, { "epoch": 0.008937, "grad_norm": 0.21399646997451782, "learning_rate": 1e-05, "loss": 0.0183, "step": 893700 }, { "epoch": 0.008938, "grad_norm": 0.18235015869140625, "learning_rate": 1e-05, "loss": 0.0177, "step": 893800 }, { "epoch": 0.008939, "grad_norm": 0.1596331149339676, "learning_rate": 1e-05, "loss": 0.0185, "step": 893900 }, { "epoch": 0.00894, "grad_norm": 0.1557033807039261, "learning_rate": 1e-05, "loss": 0.0186, "step": 894000 }, { "epoch": 0.008941, "grad_norm": 0.13060544431209564, "learning_rate": 1e-05, "loss": 0.0188, "step": 894100 }, { "epoch": 0.008942, "grad_norm": 0.12413998693227768, "learning_rate": 1e-05, "loss": 0.0182, "step": 894200 }, { "epoch": 0.008943, "grad_norm": 0.1436523050069809, "learning_rate": 1e-05, "loss": 0.0184, "step": 894300 }, { "epoch": 0.008944, "grad_norm": 0.19819749891757965, "learning_rate": 1e-05, "loss": 0.0181, "step": 894400 }, { "epoch": 0.008945, "grad_norm": 0.15600457787513733, "learning_rate": 1e-05, "loss": 0.018, "step": 894500 }, { "epoch": 0.008946, "grad_norm": 0.15322893857955933, "learning_rate": 1e-05, "loss": 0.0184, "step": 894600 }, { "epoch": 0.008947, "grad_norm": 0.12137267738580704, "learning_rate": 1e-05, "loss": 0.0185, "step": 894700 }, { "epoch": 0.008948, "grad_norm": 0.13376349210739136, "learning_rate": 1e-05, "loss": 0.0186, "step": 894800 }, { "epoch": 0.008949, "grad_norm": 0.1641359031200409, "learning_rate": 1e-05, "loss": 0.0181, "step": 894900 }, { "epoch": 0.00895, "grad_norm": 0.13603949546813965, "learning_rate": 1e-05, "loss": 0.0181, "step": 895000 }, { "epoch": 0.008951, "grad_norm": 0.10615458339452744, "learning_rate": 1e-05, "loss": 0.0182, "step": 895100 }, { "epoch": 0.008952, "grad_norm": 0.14984838664531708, "learning_rate": 1e-05, "loss": 0.0185, "step": 895200 }, { "epoch": 0.008953, "grad_norm": 0.1276775598526001, "learning_rate": 1e-05, "loss": 0.018, "step": 895300 }, { "epoch": 0.008954, "grad_norm": 0.15329837799072266, "learning_rate": 1e-05, "loss": 0.0182, "step": 895400 }, { "epoch": 0.008955, "grad_norm": 0.1516328901052475, "learning_rate": 1e-05, "loss": 0.0185, "step": 895500 }, { "epoch": 0.008956, "grad_norm": 0.10916660726070404, "learning_rate": 1e-05, "loss": 0.0184, "step": 895600 }, { "epoch": 0.008957, "grad_norm": 0.16709795594215393, "learning_rate": 1e-05, "loss": 0.0179, "step": 895700 }, { "epoch": 0.008958, "grad_norm": 0.16508051753044128, "learning_rate": 1e-05, "loss": 0.0185, "step": 895800 }, { "epoch": 0.008959, "grad_norm": 0.15746326744556427, "learning_rate": 1e-05, "loss": 0.0177, "step": 895900 }, { "epoch": 0.00896, "grad_norm": 0.13662554323673248, "learning_rate": 1e-05, "loss": 0.0182, "step": 896000 }, { "epoch": 0.008961, "grad_norm": 0.21864333748817444, "learning_rate": 1e-05, "loss": 0.0184, "step": 896100 }, { "epoch": 0.008962, "grad_norm": 0.18055827915668488, "learning_rate": 1e-05, "loss": 0.0179, "step": 896200 }, { "epoch": 0.008963, "grad_norm": 0.185174360871315, "learning_rate": 1e-05, "loss": 0.0182, "step": 896300 }, { "epoch": 0.008964, "grad_norm": 0.1455935835838318, "learning_rate": 1e-05, "loss": 0.0185, "step": 896400 }, { "epoch": 0.008965, "grad_norm": 0.11540350317955017, "learning_rate": 1e-05, "loss": 0.0184, "step": 896500 }, { "epoch": 0.008966, "grad_norm": 0.1783336102962494, "learning_rate": 1e-05, "loss": 0.0183, "step": 896600 }, { "epoch": 0.008967, "grad_norm": 0.14569242298603058, "learning_rate": 1e-05, "loss": 0.0183, "step": 896700 }, { "epoch": 0.008968, "grad_norm": 0.28414928913116455, "learning_rate": 1e-05, "loss": 0.018, "step": 896800 }, { "epoch": 0.008969, "grad_norm": 0.13377220928668976, "learning_rate": 1e-05, "loss": 0.0185, "step": 896900 }, { "epoch": 0.00897, "grad_norm": 0.1544981747865677, "learning_rate": 1e-05, "loss": 0.0183, "step": 897000 }, { "epoch": 0.008971, "grad_norm": 0.14483313262462616, "learning_rate": 1e-05, "loss": 0.0184, "step": 897100 }, { "epoch": 0.008972, "grad_norm": 0.12448232620954514, "learning_rate": 1e-05, "loss": 0.0184, "step": 897200 }, { "epoch": 0.008973, "grad_norm": 0.16498441994190216, "learning_rate": 1e-05, "loss": 0.0183, "step": 897300 }, { "epoch": 0.008974, "grad_norm": 0.10547696799039841, "learning_rate": 1e-05, "loss": 0.0183, "step": 897400 }, { "epoch": 0.008975, "grad_norm": 0.17409470677375793, "learning_rate": 1e-05, "loss": 0.0184, "step": 897500 }, { "epoch": 0.008976, "grad_norm": 0.13213294744491577, "learning_rate": 1e-05, "loss": 0.0183, "step": 897600 }, { "epoch": 0.008977, "grad_norm": 0.13813254237174988, "learning_rate": 1e-05, "loss": 0.0182, "step": 897700 }, { "epoch": 0.008978, "grad_norm": 0.12715953588485718, "learning_rate": 1e-05, "loss": 0.0176, "step": 897800 }, { "epoch": 0.008979, "grad_norm": 0.15553529560565948, "learning_rate": 1e-05, "loss": 0.0185, "step": 897900 }, { "epoch": 0.00898, "grad_norm": 0.1901012361049652, "learning_rate": 1e-05, "loss": 0.0188, "step": 898000 }, { "epoch": 0.008981, "grad_norm": 0.14978104829788208, "learning_rate": 1e-05, "loss": 0.018, "step": 898100 }, { "epoch": 0.008982, "grad_norm": 0.2050703465938568, "learning_rate": 1e-05, "loss": 0.0185, "step": 898200 }, { "epoch": 0.008983, "grad_norm": 0.19320955872535706, "learning_rate": 1e-05, "loss": 0.0185, "step": 898300 }, { "epoch": 0.008984, "grad_norm": 0.2142619639635086, "learning_rate": 1e-05, "loss": 0.0182, "step": 898400 }, { "epoch": 0.008985, "grad_norm": 0.22499720752239227, "learning_rate": 1e-05, "loss": 0.0183, "step": 898500 }, { "epoch": 0.008986, "grad_norm": 0.14002572000026703, "learning_rate": 1e-05, "loss": 0.0183, "step": 898600 }, { "epoch": 0.008987, "grad_norm": 0.15564993023872375, "learning_rate": 1e-05, "loss": 0.0186, "step": 898700 }, { "epoch": 0.008988, "grad_norm": 0.15369071066379547, "learning_rate": 1e-05, "loss": 0.0184, "step": 898800 }, { "epoch": 0.008989, "grad_norm": 0.16441072523593903, "learning_rate": 1e-05, "loss": 0.0182, "step": 898900 }, { "epoch": 0.00899, "grad_norm": 0.13086707890033722, "learning_rate": 1e-05, "loss": 0.0184, "step": 899000 }, { "epoch": 0.008991, "grad_norm": 0.15019665658473969, "learning_rate": 1e-05, "loss": 0.018, "step": 899100 }, { "epoch": 0.008992, "grad_norm": 0.15562453866004944, "learning_rate": 1e-05, "loss": 0.0187, "step": 899200 }, { "epoch": 0.008993, "grad_norm": 0.16081053018569946, "learning_rate": 1e-05, "loss": 0.0186, "step": 899300 }, { "epoch": 0.008994, "grad_norm": 0.13084328174591064, "learning_rate": 1e-05, "loss": 0.0179, "step": 899400 }, { "epoch": 0.008995, "grad_norm": 0.154328390955925, "learning_rate": 1e-05, "loss": 0.0183, "step": 899500 }, { "epoch": 0.008996, "grad_norm": 0.19648942351341248, "learning_rate": 1e-05, "loss": 0.0183, "step": 899600 }, { "epoch": 0.008997, "grad_norm": 0.11091084778308868, "learning_rate": 1e-05, "loss": 0.0184, "step": 899700 }, { "epoch": 0.008998, "grad_norm": 0.1377362459897995, "learning_rate": 1e-05, "loss": 0.0179, "step": 899800 }, { "epoch": 0.008999, "grad_norm": 0.14875337481498718, "learning_rate": 1e-05, "loss": 0.0181, "step": 899900 }, { "epoch": 0.009, "grad_norm": 0.13930389285087585, "learning_rate": 1e-05, "loss": 0.0185, "step": 900000 }, { "epoch": 0.009, "eval_loss": 0.01620975136756897, "eval_runtime": 194.3975, "eval_samples_per_second": 257.205, "eval_steps_per_second": 16.075, "step": 900000 }, { "epoch": 0.009001, "grad_norm": 0.1339753419160843, "learning_rate": 1e-05, "loss": 0.0184, "step": 900100 }, { "epoch": 0.009002, "grad_norm": 0.1191481426358223, "learning_rate": 1e-05, "loss": 0.0183, "step": 900200 }, { "epoch": 0.009003, "grad_norm": 0.13086791336536407, "learning_rate": 1e-05, "loss": 0.0188, "step": 900300 }, { "epoch": 0.009004, "grad_norm": 0.13443750143051147, "learning_rate": 1e-05, "loss": 0.0186, "step": 900400 }, { "epoch": 0.009005, "grad_norm": 0.16258789598941803, "learning_rate": 1e-05, "loss": 0.0185, "step": 900500 }, { "epoch": 0.009006, "grad_norm": 0.1365686058998108, "learning_rate": 1e-05, "loss": 0.0184, "step": 900600 }, { "epoch": 0.009007, "grad_norm": 0.1722697913646698, "learning_rate": 1e-05, "loss": 0.0183, "step": 900700 }, { "epoch": 0.009008, "grad_norm": 0.16477273404598236, "learning_rate": 1e-05, "loss": 0.0177, "step": 900800 }, { "epoch": 0.009009, "grad_norm": 0.1287097930908203, "learning_rate": 1e-05, "loss": 0.0186, "step": 900900 }, { "epoch": 0.00901, "grad_norm": 0.17200882732868195, "learning_rate": 1e-05, "loss": 0.0186, "step": 901000 }, { "epoch": 0.009011, "grad_norm": 0.13215701282024384, "learning_rate": 1e-05, "loss": 0.018, "step": 901100 }, { "epoch": 0.009012, "grad_norm": 0.18580371141433716, "learning_rate": 1e-05, "loss": 0.0178, "step": 901200 }, { "epoch": 0.009013, "grad_norm": 0.1957247108221054, "learning_rate": 1e-05, "loss": 0.0179, "step": 901300 }, { "epoch": 0.009014, "grad_norm": 0.149771049618721, "learning_rate": 1e-05, "loss": 0.0182, "step": 901400 }, { "epoch": 0.009015, "grad_norm": 0.18833106756210327, "learning_rate": 1e-05, "loss": 0.0186, "step": 901500 }, { "epoch": 0.009016, "grad_norm": 0.15521962940692902, "learning_rate": 1e-05, "loss": 0.0182, "step": 901600 }, { "epoch": 0.009017, "grad_norm": 0.1745748519897461, "learning_rate": 1e-05, "loss": 0.0182, "step": 901700 }, { "epoch": 0.009018, "grad_norm": 0.14503264427185059, "learning_rate": 1e-05, "loss": 0.0182, "step": 901800 }, { "epoch": 0.009019, "grad_norm": 0.15710879862308502, "learning_rate": 1e-05, "loss": 0.0186, "step": 901900 }, { "epoch": 0.00902, "grad_norm": 0.1786089390516281, "learning_rate": 1e-05, "loss": 0.0185, "step": 902000 }, { "epoch": 0.009021, "grad_norm": 0.11969982832670212, "learning_rate": 1e-05, "loss": 0.0182, "step": 902100 }, { "epoch": 0.009022, "grad_norm": 0.11116117984056473, "learning_rate": 1e-05, "loss": 0.0179, "step": 902200 }, { "epoch": 0.009023, "grad_norm": 0.10238993912935257, "learning_rate": 1e-05, "loss": 0.0181, "step": 902300 }, { "epoch": 0.009024, "grad_norm": 0.1601875275373459, "learning_rate": 1e-05, "loss": 0.0183, "step": 902400 }, { "epoch": 0.009025, "grad_norm": 0.1750229299068451, "learning_rate": 1e-05, "loss": 0.0181, "step": 902500 }, { "epoch": 0.009026, "grad_norm": 0.20710398256778717, "learning_rate": 1e-05, "loss": 0.0181, "step": 902600 }, { "epoch": 0.009027, "grad_norm": 0.2569558322429657, "learning_rate": 1e-05, "loss": 0.0184, "step": 902700 }, { "epoch": 0.009028, "grad_norm": 0.14114144444465637, "learning_rate": 1e-05, "loss": 0.0188, "step": 902800 }, { "epoch": 0.009029, "grad_norm": 0.12740157544612885, "learning_rate": 1e-05, "loss": 0.0184, "step": 902900 }, { "epoch": 0.00903, "grad_norm": 0.13876007497310638, "learning_rate": 1e-05, "loss": 0.0183, "step": 903000 }, { "epoch": 0.009031, "grad_norm": 0.13861452043056488, "learning_rate": 1e-05, "loss": 0.0182, "step": 903100 }, { "epoch": 0.009032, "grad_norm": 0.20795853435993195, "learning_rate": 1e-05, "loss": 0.0181, "step": 903200 }, { "epoch": 0.009033, "grad_norm": 0.14702165126800537, "learning_rate": 1e-05, "loss": 0.0183, "step": 903300 }, { "epoch": 0.009034, "grad_norm": 0.17248563468456268, "learning_rate": 1e-05, "loss": 0.0183, "step": 903400 }, { "epoch": 0.009035, "grad_norm": 0.18107415735721588, "learning_rate": 1e-05, "loss": 0.0185, "step": 903500 }, { "epoch": 0.009036, "grad_norm": 0.15514513850212097, "learning_rate": 1e-05, "loss": 0.0184, "step": 903600 }, { "epoch": 0.009037, "grad_norm": 0.10000342130661011, "learning_rate": 1e-05, "loss": 0.018, "step": 903700 }, { "epoch": 0.009038, "grad_norm": 0.11735961586236954, "learning_rate": 1e-05, "loss": 0.0182, "step": 903800 }, { "epoch": 0.009039, "grad_norm": 0.23049503564834595, "learning_rate": 1e-05, "loss": 0.0185, "step": 903900 }, { "epoch": 0.00904, "grad_norm": 0.16551798582077026, "learning_rate": 1e-05, "loss": 0.0181, "step": 904000 }, { "epoch": 0.009041, "grad_norm": 0.14486253261566162, "learning_rate": 1e-05, "loss": 0.018, "step": 904100 }, { "epoch": 0.009042, "grad_norm": 0.17602477967739105, "learning_rate": 1e-05, "loss": 0.0187, "step": 904200 }, { "epoch": 0.009043, "grad_norm": 0.15205474197864532, "learning_rate": 1e-05, "loss": 0.0179, "step": 904300 }, { "epoch": 0.009044, "grad_norm": 0.16135556995868683, "learning_rate": 1e-05, "loss": 0.0185, "step": 904400 }, { "epoch": 0.009045, "grad_norm": 0.14183300733566284, "learning_rate": 1e-05, "loss": 0.0187, "step": 904500 }, { "epoch": 0.009046, "grad_norm": 0.1296810805797577, "learning_rate": 1e-05, "loss": 0.0183, "step": 904600 }, { "epoch": 0.009047, "grad_norm": 0.10929430276155472, "learning_rate": 1e-05, "loss": 0.018, "step": 904700 }, { "epoch": 0.009048, "grad_norm": 0.21671271324157715, "learning_rate": 1e-05, "loss": 0.0178, "step": 904800 }, { "epoch": 0.009049, "grad_norm": 0.23055292665958405, "learning_rate": 1e-05, "loss": 0.0182, "step": 904900 }, { "epoch": 0.00905, "grad_norm": 0.16244637966156006, "learning_rate": 1e-05, "loss": 0.018, "step": 905000 }, { "epoch": 0.009051, "grad_norm": 0.19939225912094116, "learning_rate": 1e-05, "loss": 0.0186, "step": 905100 }, { "epoch": 0.009052, "grad_norm": 0.1269141137599945, "learning_rate": 1e-05, "loss": 0.0187, "step": 905200 }, { "epoch": 0.009053, "grad_norm": 0.11314720660448074, "learning_rate": 1e-05, "loss": 0.0177, "step": 905300 }, { "epoch": 0.009054, "grad_norm": 0.13043023645877838, "learning_rate": 1e-05, "loss": 0.0184, "step": 905400 }, { "epoch": 0.009055, "grad_norm": 0.11729022115468979, "learning_rate": 1e-05, "loss": 0.018, "step": 905500 }, { "epoch": 0.009056, "grad_norm": 0.11440848559141159, "learning_rate": 1e-05, "loss": 0.0184, "step": 905600 }, { "epoch": 0.009057, "grad_norm": 0.13289757072925568, "learning_rate": 1e-05, "loss": 0.0182, "step": 905700 }, { "epoch": 0.009058, "grad_norm": 0.1679769903421402, "learning_rate": 1e-05, "loss": 0.0181, "step": 905800 }, { "epoch": 0.009059, "grad_norm": 0.10538488626480103, "learning_rate": 1e-05, "loss": 0.0185, "step": 905900 }, { "epoch": 0.00906, "grad_norm": 0.11753145605325699, "learning_rate": 1e-05, "loss": 0.0181, "step": 906000 }, { "epoch": 0.009061, "grad_norm": 0.1308729201555252, "learning_rate": 1e-05, "loss": 0.0182, "step": 906100 }, { "epoch": 0.009062, "grad_norm": 0.1755107343196869, "learning_rate": 1e-05, "loss": 0.0187, "step": 906200 }, { "epoch": 0.009063, "grad_norm": 0.14389504492282867, "learning_rate": 1e-05, "loss": 0.0177, "step": 906300 }, { "epoch": 0.009064, "grad_norm": 0.1589806079864502, "learning_rate": 1e-05, "loss": 0.0188, "step": 906400 }, { "epoch": 0.009065, "grad_norm": 0.16026440262794495, "learning_rate": 1e-05, "loss": 0.0183, "step": 906500 }, { "epoch": 0.009066, "grad_norm": 0.1487005054950714, "learning_rate": 1e-05, "loss": 0.0185, "step": 906600 }, { "epoch": 0.009067, "grad_norm": 0.13854189217090607, "learning_rate": 1e-05, "loss": 0.0181, "step": 906700 }, { "epoch": 0.009068, "grad_norm": 0.15581367909908295, "learning_rate": 1e-05, "loss": 0.0183, "step": 906800 }, { "epoch": 0.009069, "grad_norm": 0.16160351037979126, "learning_rate": 1e-05, "loss": 0.0183, "step": 906900 }, { "epoch": 0.00907, "grad_norm": 0.13625609874725342, "learning_rate": 1e-05, "loss": 0.0186, "step": 907000 }, { "epoch": 0.009071, "grad_norm": 0.1551782786846161, "learning_rate": 1e-05, "loss": 0.0181, "step": 907100 }, { "epoch": 0.009072, "grad_norm": 0.19150665402412415, "learning_rate": 1e-05, "loss": 0.018, "step": 907200 }, { "epoch": 0.009073, "grad_norm": 0.13136209547519684, "learning_rate": 1e-05, "loss": 0.0181, "step": 907300 }, { "epoch": 0.009074, "grad_norm": 0.2252502143383026, "learning_rate": 1e-05, "loss": 0.0181, "step": 907400 }, { "epoch": 0.009075, "grad_norm": 0.12503722310066223, "learning_rate": 1e-05, "loss": 0.0181, "step": 907500 }, { "epoch": 0.009076, "grad_norm": 0.14299078285694122, "learning_rate": 1e-05, "loss": 0.018, "step": 907600 }, { "epoch": 0.009077, "grad_norm": 0.13211672008037567, "learning_rate": 1e-05, "loss": 0.0184, "step": 907700 }, { "epoch": 0.009078, "grad_norm": 0.12957976758480072, "learning_rate": 1e-05, "loss": 0.0183, "step": 907800 }, { "epoch": 0.009079, "grad_norm": 0.17991727590560913, "learning_rate": 1e-05, "loss": 0.0183, "step": 907900 }, { "epoch": 0.00908, "grad_norm": 0.13151182234287262, "learning_rate": 1e-05, "loss": 0.0184, "step": 908000 }, { "epoch": 0.009081, "grad_norm": 0.13625861704349518, "learning_rate": 1e-05, "loss": 0.018, "step": 908100 }, { "epoch": 0.009082, "grad_norm": 0.17613469064235687, "learning_rate": 1e-05, "loss": 0.0181, "step": 908200 }, { "epoch": 0.009083, "grad_norm": 0.257829874753952, "learning_rate": 1e-05, "loss": 0.0182, "step": 908300 }, { "epoch": 0.009084, "grad_norm": 0.17116406559944153, "learning_rate": 1e-05, "loss": 0.018, "step": 908400 }, { "epoch": 0.009085, "grad_norm": 0.14171284437179565, "learning_rate": 1e-05, "loss": 0.0182, "step": 908500 }, { "epoch": 0.009086, "grad_norm": 0.13752669095993042, "learning_rate": 1e-05, "loss": 0.018, "step": 908600 }, { "epoch": 0.009087, "grad_norm": 0.15878327190876007, "learning_rate": 1e-05, "loss": 0.0184, "step": 908700 }, { "epoch": 0.009088, "grad_norm": 0.13227443397045135, "learning_rate": 1e-05, "loss": 0.018, "step": 908800 }, { "epoch": 0.009089, "grad_norm": 0.1354200541973114, "learning_rate": 1e-05, "loss": 0.0186, "step": 908900 }, { "epoch": 0.00909, "grad_norm": 0.1905861645936966, "learning_rate": 1e-05, "loss": 0.0183, "step": 909000 }, { "epoch": 0.009091, "grad_norm": 0.12319248914718628, "learning_rate": 1e-05, "loss": 0.0179, "step": 909100 }, { "epoch": 0.009092, "grad_norm": 0.12243539839982986, "learning_rate": 1e-05, "loss": 0.0185, "step": 909200 }, { "epoch": 0.009093, "grad_norm": 0.10801847279071808, "learning_rate": 1e-05, "loss": 0.0182, "step": 909300 }, { "epoch": 0.009094, "grad_norm": 0.12347684055566788, "learning_rate": 1e-05, "loss": 0.0179, "step": 909400 }, { "epoch": 0.009095, "grad_norm": 0.16766808927059174, "learning_rate": 1e-05, "loss": 0.0184, "step": 909500 }, { "epoch": 0.009096, "grad_norm": 0.11338943243026733, "learning_rate": 1e-05, "loss": 0.0178, "step": 909600 }, { "epoch": 0.009097, "grad_norm": 0.1437970995903015, "learning_rate": 1e-05, "loss": 0.0179, "step": 909700 }, { "epoch": 0.009098, "grad_norm": 0.16513782739639282, "learning_rate": 1e-05, "loss": 0.0182, "step": 909800 }, { "epoch": 0.009099, "grad_norm": 0.13089655339717865, "learning_rate": 1e-05, "loss": 0.0182, "step": 909900 }, { "epoch": 0.0091, "grad_norm": 0.15495318174362183, "learning_rate": 1e-05, "loss": 0.0181, "step": 910000 }, { "epoch": 0.009101, "grad_norm": 0.1654045134782791, "learning_rate": 1e-05, "loss": 0.0184, "step": 910100 }, { "epoch": 0.009102, "grad_norm": 0.1616503745317459, "learning_rate": 1e-05, "loss": 0.0182, "step": 910200 }, { "epoch": 0.009103, "grad_norm": 0.11588028073310852, "learning_rate": 1e-05, "loss": 0.0181, "step": 910300 }, { "epoch": 0.009104, "grad_norm": 0.12710782885551453, "learning_rate": 1e-05, "loss": 0.0184, "step": 910400 }, { "epoch": 0.009105, "grad_norm": 0.10313770920038223, "learning_rate": 1e-05, "loss": 0.0181, "step": 910500 }, { "epoch": 0.009106, "grad_norm": 0.15451881289482117, "learning_rate": 1e-05, "loss": 0.0179, "step": 910600 }, { "epoch": 0.009107, "grad_norm": 0.1245746910572052, "learning_rate": 1e-05, "loss": 0.0182, "step": 910700 }, { "epoch": 0.009108, "grad_norm": 0.10585563629865646, "learning_rate": 1e-05, "loss": 0.0183, "step": 910800 }, { "epoch": 0.009109, "grad_norm": 0.23502252995967865, "learning_rate": 1e-05, "loss": 0.0181, "step": 910900 }, { "epoch": 0.00911, "grad_norm": 0.17142805457115173, "learning_rate": 1e-05, "loss": 0.0183, "step": 911000 }, { "epoch": 0.009111, "grad_norm": 0.1257765293121338, "learning_rate": 1e-05, "loss": 0.018, "step": 911100 }, { "epoch": 0.009112, "grad_norm": 0.16458573937416077, "learning_rate": 1e-05, "loss": 0.0178, "step": 911200 }, { "epoch": 0.009113, "grad_norm": 0.19762933254241943, "learning_rate": 1e-05, "loss": 0.0183, "step": 911300 }, { "epoch": 0.009114, "grad_norm": 0.22020339965820312, "learning_rate": 1e-05, "loss": 0.0185, "step": 911400 }, { "epoch": 0.009115, "grad_norm": 0.20346006751060486, "learning_rate": 1e-05, "loss": 0.0184, "step": 911500 }, { "epoch": 0.009116, "grad_norm": 0.08824114501476288, "learning_rate": 1e-05, "loss": 0.0182, "step": 911600 }, { "epoch": 0.009117, "grad_norm": 0.1487826704978943, "learning_rate": 1e-05, "loss": 0.0178, "step": 911700 }, { "epoch": 0.009118, "grad_norm": 0.17165181040763855, "learning_rate": 1e-05, "loss": 0.0181, "step": 911800 }, { "epoch": 0.009119, "grad_norm": 0.11294116824865341, "learning_rate": 1e-05, "loss": 0.0183, "step": 911900 }, { "epoch": 0.00912, "grad_norm": 0.11257986724376678, "learning_rate": 1e-05, "loss": 0.0183, "step": 912000 }, { "epoch": 0.009121, "grad_norm": 0.1371377408504486, "learning_rate": 1e-05, "loss": 0.0181, "step": 912100 }, { "epoch": 0.009122, "grad_norm": 0.15270961821079254, "learning_rate": 1e-05, "loss": 0.0183, "step": 912200 }, { "epoch": 0.009123, "grad_norm": 0.18021024763584137, "learning_rate": 1e-05, "loss": 0.0185, "step": 912300 }, { "epoch": 0.009124, "grad_norm": 0.16457132995128632, "learning_rate": 1e-05, "loss": 0.0175, "step": 912400 }, { "epoch": 0.009125, "grad_norm": 0.13543152809143066, "learning_rate": 1e-05, "loss": 0.0182, "step": 912500 }, { "epoch": 0.009126, "grad_norm": 0.17155857384204865, "learning_rate": 1e-05, "loss": 0.0183, "step": 912600 }, { "epoch": 0.009127, "grad_norm": 0.15389300882816315, "learning_rate": 1e-05, "loss": 0.0184, "step": 912700 }, { "epoch": 0.009128, "grad_norm": 0.15231451392173767, "learning_rate": 1e-05, "loss": 0.0182, "step": 912800 }, { "epoch": 0.009129, "grad_norm": 0.12564276158809662, "learning_rate": 1e-05, "loss": 0.0183, "step": 912900 }, { "epoch": 0.00913, "grad_norm": 0.17026889324188232, "learning_rate": 1e-05, "loss": 0.0183, "step": 913000 }, { "epoch": 0.009131, "grad_norm": 0.19200018048286438, "learning_rate": 1e-05, "loss": 0.0186, "step": 913100 }, { "epoch": 0.009132, "grad_norm": 0.12115304172039032, "learning_rate": 1e-05, "loss": 0.0181, "step": 913200 }, { "epoch": 0.009133, "grad_norm": 0.11813952773809433, "learning_rate": 1e-05, "loss": 0.0184, "step": 913300 }, { "epoch": 0.009134, "grad_norm": 0.11677964776754379, "learning_rate": 1e-05, "loss": 0.0179, "step": 913400 }, { "epoch": 0.009135, "grad_norm": 0.1435810625553131, "learning_rate": 1e-05, "loss": 0.0177, "step": 913500 }, { "epoch": 0.009136, "grad_norm": 0.17200782895088196, "learning_rate": 1e-05, "loss": 0.0186, "step": 913600 }, { "epoch": 0.009137, "grad_norm": 0.21480798721313477, "learning_rate": 1e-05, "loss": 0.0179, "step": 913700 }, { "epoch": 0.009138, "grad_norm": 0.153737872838974, "learning_rate": 1e-05, "loss": 0.0183, "step": 913800 }, { "epoch": 0.009139, "grad_norm": 0.1261618584394455, "learning_rate": 1e-05, "loss": 0.0179, "step": 913900 }, { "epoch": 0.00914, "grad_norm": 0.18265216052532196, "learning_rate": 1e-05, "loss": 0.0183, "step": 914000 }, { "epoch": 0.009141, "grad_norm": 0.12317760288715363, "learning_rate": 1e-05, "loss": 0.0178, "step": 914100 }, { "epoch": 0.009142, "grad_norm": 0.1227583959698677, "learning_rate": 1e-05, "loss": 0.0183, "step": 914200 }, { "epoch": 0.009143, "grad_norm": 0.1324705183506012, "learning_rate": 1e-05, "loss": 0.0183, "step": 914300 }, { "epoch": 0.009144, "grad_norm": 0.12058267742395401, "learning_rate": 1e-05, "loss": 0.0185, "step": 914400 }, { "epoch": 0.009145, "grad_norm": 0.11869719624519348, "learning_rate": 1e-05, "loss": 0.0179, "step": 914500 }, { "epoch": 0.009146, "grad_norm": 0.10204515606164932, "learning_rate": 1e-05, "loss": 0.0178, "step": 914600 }, { "epoch": 0.009147, "grad_norm": 0.1508868783712387, "learning_rate": 1e-05, "loss": 0.0182, "step": 914700 }, { "epoch": 0.009148, "grad_norm": 0.13255491852760315, "learning_rate": 1e-05, "loss": 0.0183, "step": 914800 }, { "epoch": 0.009149, "grad_norm": 0.12010622024536133, "learning_rate": 1e-05, "loss": 0.0178, "step": 914900 }, { "epoch": 0.00915, "grad_norm": 0.14142505824565887, "learning_rate": 1e-05, "loss": 0.0183, "step": 915000 }, { "epoch": 0.009151, "grad_norm": 0.13176901638507843, "learning_rate": 1e-05, "loss": 0.0186, "step": 915100 }, { "epoch": 0.009152, "grad_norm": 0.13382944464683533, "learning_rate": 1e-05, "loss": 0.0182, "step": 915200 }, { "epoch": 0.009153, "grad_norm": 0.1303553432226181, "learning_rate": 1e-05, "loss": 0.018, "step": 915300 }, { "epoch": 0.009154, "grad_norm": 0.16885201632976532, "learning_rate": 1e-05, "loss": 0.018, "step": 915400 }, { "epoch": 0.009155, "grad_norm": 0.138356551527977, "learning_rate": 1e-05, "loss": 0.0185, "step": 915500 }, { "epoch": 0.009156, "grad_norm": 0.15435273945331573, "learning_rate": 1e-05, "loss": 0.0183, "step": 915600 }, { "epoch": 0.009157, "grad_norm": 0.14538148045539856, "learning_rate": 1e-05, "loss": 0.018, "step": 915700 }, { "epoch": 0.009158, "grad_norm": 0.14903239905834198, "learning_rate": 1e-05, "loss": 0.0183, "step": 915800 }, { "epoch": 0.009159, "grad_norm": 0.1631360799074173, "learning_rate": 1e-05, "loss": 0.0188, "step": 915900 }, { "epoch": 0.00916, "grad_norm": 0.25752729177474976, "learning_rate": 1e-05, "loss": 0.018, "step": 916000 }, { "epoch": 0.009161, "grad_norm": 0.13881543278694153, "learning_rate": 1e-05, "loss": 0.0178, "step": 916100 }, { "epoch": 0.009162, "grad_norm": 0.15876643359661102, "learning_rate": 1e-05, "loss": 0.0182, "step": 916200 }, { "epoch": 0.009163, "grad_norm": 0.1446659117937088, "learning_rate": 1e-05, "loss": 0.0182, "step": 916300 }, { "epoch": 0.009164, "grad_norm": 0.15990681946277618, "learning_rate": 1e-05, "loss": 0.0177, "step": 916400 }, { "epoch": 0.009165, "grad_norm": 0.14670731127262115, "learning_rate": 1e-05, "loss": 0.0185, "step": 916500 }, { "epoch": 0.009166, "grad_norm": 0.1718355119228363, "learning_rate": 1e-05, "loss": 0.0178, "step": 916600 }, { "epoch": 0.009167, "grad_norm": 0.12531401216983795, "learning_rate": 1e-05, "loss": 0.0187, "step": 916700 }, { "epoch": 0.009168, "grad_norm": 0.1639869213104248, "learning_rate": 1e-05, "loss": 0.0181, "step": 916800 }, { "epoch": 0.009169, "grad_norm": 0.11484086513519287, "learning_rate": 1e-05, "loss": 0.0181, "step": 916900 }, { "epoch": 0.00917, "grad_norm": 0.12308012694120407, "learning_rate": 1e-05, "loss": 0.0178, "step": 917000 }, { "epoch": 0.009171, "grad_norm": 0.19148878753185272, "learning_rate": 1e-05, "loss": 0.0182, "step": 917100 }, { "epoch": 0.009172, "grad_norm": 0.14416202902793884, "learning_rate": 1e-05, "loss": 0.0184, "step": 917200 }, { "epoch": 0.009173, "grad_norm": 0.14544907212257385, "learning_rate": 1e-05, "loss": 0.0184, "step": 917300 }, { "epoch": 0.009174, "grad_norm": 0.14580686390399933, "learning_rate": 1e-05, "loss": 0.018, "step": 917400 }, { "epoch": 0.009175, "grad_norm": 0.14949887990951538, "learning_rate": 1e-05, "loss": 0.0182, "step": 917500 }, { "epoch": 0.009176, "grad_norm": 0.12647153437137604, "learning_rate": 1e-05, "loss": 0.018, "step": 917600 }, { "epoch": 0.009177, "grad_norm": 0.22521881759166718, "learning_rate": 1e-05, "loss": 0.0182, "step": 917700 }, { "epoch": 0.009178, "grad_norm": 0.13189885020256042, "learning_rate": 1e-05, "loss": 0.0177, "step": 917800 }, { "epoch": 0.009179, "grad_norm": 0.19122810661792755, "learning_rate": 1e-05, "loss": 0.0178, "step": 917900 }, { "epoch": 0.00918, "grad_norm": 0.13471432030200958, "learning_rate": 1e-05, "loss": 0.0183, "step": 918000 }, { "epoch": 0.009181, "grad_norm": 0.12895028293132782, "learning_rate": 1e-05, "loss": 0.0178, "step": 918100 }, { "epoch": 0.009182, "grad_norm": 0.24000082910060883, "learning_rate": 1e-05, "loss": 0.0182, "step": 918200 }, { "epoch": 0.009183, "grad_norm": 0.10378672182559967, "learning_rate": 1e-05, "loss": 0.0178, "step": 918300 }, { "epoch": 0.009184, "grad_norm": 0.1483374983072281, "learning_rate": 1e-05, "loss": 0.0184, "step": 918400 }, { "epoch": 0.009185, "grad_norm": 0.19142255187034607, "learning_rate": 1e-05, "loss": 0.0181, "step": 918500 }, { "epoch": 0.009186, "grad_norm": 0.17066600918769836, "learning_rate": 1e-05, "loss": 0.0182, "step": 918600 }, { "epoch": 0.009187, "grad_norm": 0.15130764245986938, "learning_rate": 1e-05, "loss": 0.0181, "step": 918700 }, { "epoch": 0.009188, "grad_norm": 0.10992055386304855, "learning_rate": 1e-05, "loss": 0.0184, "step": 918800 }, { "epoch": 0.009189, "grad_norm": 0.20743194222450256, "learning_rate": 1e-05, "loss": 0.0185, "step": 918900 }, { "epoch": 0.00919, "grad_norm": 0.13424213230609894, "learning_rate": 1e-05, "loss": 0.018, "step": 919000 }, { "epoch": 0.009191, "grad_norm": 0.1545516699552536, "learning_rate": 1e-05, "loss": 0.0185, "step": 919100 }, { "epoch": 0.009192, "grad_norm": 0.23465336859226227, "learning_rate": 1e-05, "loss": 0.018, "step": 919200 }, { "epoch": 0.009193, "grad_norm": 0.1427360326051712, "learning_rate": 1e-05, "loss": 0.018, "step": 919300 }, { "epoch": 0.009194, "grad_norm": 0.15774103999137878, "learning_rate": 1e-05, "loss": 0.018, "step": 919400 }, { "epoch": 0.009195, "grad_norm": 0.15877892076969147, "learning_rate": 1e-05, "loss": 0.018, "step": 919500 }, { "epoch": 0.009196, "grad_norm": 0.16741816699504852, "learning_rate": 1e-05, "loss": 0.0185, "step": 919600 }, { "epoch": 0.009197, "grad_norm": 0.1475534588098526, "learning_rate": 1e-05, "loss": 0.0183, "step": 919700 }, { "epoch": 0.009198, "grad_norm": 0.1541590690612793, "learning_rate": 1e-05, "loss": 0.0183, "step": 919800 }, { "epoch": 0.009199, "grad_norm": 0.24268178641796112, "learning_rate": 1e-05, "loss": 0.0184, "step": 919900 }, { "epoch": 0.0092, "grad_norm": 0.11619991064071655, "learning_rate": 1e-05, "loss": 0.0185, "step": 920000 }, { "epoch": 0.0092, "eval_loss": 0.01663963869214058, "eval_runtime": 190.2852, "eval_samples_per_second": 262.763, "eval_steps_per_second": 16.423, "step": 920000 }, { "epoch": 0.009201, "grad_norm": 0.1437567174434662, "learning_rate": 1e-05, "loss": 0.0182, "step": 920100 }, { "epoch": 0.009202, "grad_norm": 0.18913358449935913, "learning_rate": 1e-05, "loss": 0.0182, "step": 920200 }, { "epoch": 0.009203, "grad_norm": 0.12279846519231796, "learning_rate": 1e-05, "loss": 0.018, "step": 920300 }, { "epoch": 0.009204, "grad_norm": 0.1867247223854065, "learning_rate": 1e-05, "loss": 0.0179, "step": 920400 }, { "epoch": 0.009205, "grad_norm": 0.11455371230840683, "learning_rate": 1e-05, "loss": 0.0186, "step": 920500 }, { "epoch": 0.009206, "grad_norm": 0.19211715459823608, "learning_rate": 1e-05, "loss": 0.0176, "step": 920600 }, { "epoch": 0.009207, "grad_norm": 0.14727230370044708, "learning_rate": 1e-05, "loss": 0.0181, "step": 920700 }, { "epoch": 0.009208, "grad_norm": 0.16186140477657318, "learning_rate": 1e-05, "loss": 0.0186, "step": 920800 }, { "epoch": 0.009209, "grad_norm": 0.1823103278875351, "learning_rate": 1e-05, "loss": 0.018, "step": 920900 }, { "epoch": 0.00921, "grad_norm": 0.10577844083309174, "learning_rate": 1e-05, "loss": 0.0187, "step": 921000 }, { "epoch": 0.009211, "grad_norm": 0.1676269918680191, "learning_rate": 1e-05, "loss": 0.0183, "step": 921100 }, { "epoch": 0.009212, "grad_norm": 0.2148158699274063, "learning_rate": 1e-05, "loss": 0.0182, "step": 921200 }, { "epoch": 0.009213, "grad_norm": 0.21941863000392914, "learning_rate": 1e-05, "loss": 0.0185, "step": 921300 }, { "epoch": 0.009214, "grad_norm": 0.1593269258737564, "learning_rate": 1e-05, "loss": 0.0185, "step": 921400 }, { "epoch": 0.009215, "grad_norm": 0.1870548576116562, "learning_rate": 1e-05, "loss": 0.0184, "step": 921500 }, { "epoch": 0.009216, "grad_norm": 0.15370874106884003, "learning_rate": 1e-05, "loss": 0.0179, "step": 921600 }, { "epoch": 0.009217, "grad_norm": 0.13133105635643005, "learning_rate": 1e-05, "loss": 0.0179, "step": 921700 }, { "epoch": 0.009218, "grad_norm": 0.159848153591156, "learning_rate": 1e-05, "loss": 0.0185, "step": 921800 }, { "epoch": 0.009219, "grad_norm": 0.15951529145240784, "learning_rate": 1e-05, "loss": 0.0178, "step": 921900 }, { "epoch": 0.00922, "grad_norm": 0.13572533428668976, "learning_rate": 1e-05, "loss": 0.0181, "step": 922000 }, { "epoch": 0.009221, "grad_norm": 0.1347595453262329, "learning_rate": 1e-05, "loss": 0.0187, "step": 922100 }, { "epoch": 0.009222, "grad_norm": 0.1461350917816162, "learning_rate": 1e-05, "loss": 0.0182, "step": 922200 }, { "epoch": 0.009223, "grad_norm": 0.1714351773262024, "learning_rate": 1e-05, "loss": 0.0184, "step": 922300 }, { "epoch": 0.009224, "grad_norm": 0.1490807682275772, "learning_rate": 1e-05, "loss": 0.0183, "step": 922400 }, { "epoch": 0.009225, "grad_norm": 0.16297945380210876, "learning_rate": 1e-05, "loss": 0.0185, "step": 922500 }, { "epoch": 0.009226, "grad_norm": 0.11821591109037399, "learning_rate": 1e-05, "loss": 0.0182, "step": 922600 }, { "epoch": 0.009227, "grad_norm": 0.17694656550884247, "learning_rate": 1e-05, "loss": 0.0181, "step": 922700 }, { "epoch": 0.009228, "grad_norm": 0.15582583844661713, "learning_rate": 1e-05, "loss": 0.0181, "step": 922800 }, { "epoch": 0.009229, "grad_norm": 0.17057769000530243, "learning_rate": 1e-05, "loss": 0.0185, "step": 922900 }, { "epoch": 0.00923, "grad_norm": 0.13865184783935547, "learning_rate": 1e-05, "loss": 0.0181, "step": 923000 }, { "epoch": 0.009231, "grad_norm": 0.1576641947031021, "learning_rate": 1e-05, "loss": 0.0181, "step": 923100 }, { "epoch": 0.009232, "grad_norm": 0.13237595558166504, "learning_rate": 1e-05, "loss": 0.0183, "step": 923200 }, { "epoch": 0.009233, "grad_norm": 0.10256530344486237, "learning_rate": 1e-05, "loss": 0.0181, "step": 923300 }, { "epoch": 0.009234, "grad_norm": 0.16535161435604095, "learning_rate": 1e-05, "loss": 0.0181, "step": 923400 }, { "epoch": 0.009235, "grad_norm": 0.13914932310581207, "learning_rate": 1e-05, "loss": 0.0184, "step": 923500 }, { "epoch": 0.009236, "grad_norm": 0.19743527472019196, "learning_rate": 1e-05, "loss": 0.0181, "step": 923600 }, { "epoch": 0.009237, "grad_norm": 0.11677033454179764, "learning_rate": 1e-05, "loss": 0.018, "step": 923700 }, { "epoch": 0.009238, "grad_norm": 0.12567251920700073, "learning_rate": 1e-05, "loss": 0.0182, "step": 923800 }, { "epoch": 0.009239, "grad_norm": 0.19981546700000763, "learning_rate": 1e-05, "loss": 0.0182, "step": 923900 }, { "epoch": 0.00924, "grad_norm": 0.16399268805980682, "learning_rate": 1e-05, "loss": 0.0177, "step": 924000 }, { "epoch": 0.009241, "grad_norm": 0.11965800821781158, "learning_rate": 1e-05, "loss": 0.0179, "step": 924100 }, { "epoch": 0.009242, "grad_norm": 0.25004297494888306, "learning_rate": 1e-05, "loss": 0.0181, "step": 924200 }, { "epoch": 0.009243, "grad_norm": 0.15332059562206268, "learning_rate": 1e-05, "loss": 0.0178, "step": 924300 }, { "epoch": 0.009244, "grad_norm": 0.11565189808607101, "learning_rate": 1e-05, "loss": 0.018, "step": 924400 }, { "epoch": 0.009245, "grad_norm": 0.12849970161914825, "learning_rate": 1e-05, "loss": 0.018, "step": 924500 }, { "epoch": 0.009246, "grad_norm": 0.13419952988624573, "learning_rate": 1e-05, "loss": 0.0183, "step": 924600 }, { "epoch": 0.009247, "grad_norm": 0.11896896362304688, "learning_rate": 1e-05, "loss": 0.018, "step": 924700 }, { "epoch": 0.009248, "grad_norm": 0.14726105332374573, "learning_rate": 1e-05, "loss": 0.0181, "step": 924800 }, { "epoch": 0.009249, "grad_norm": 0.11745347827672958, "learning_rate": 1e-05, "loss": 0.0184, "step": 924900 }, { "epoch": 0.00925, "grad_norm": 0.17339281737804413, "learning_rate": 1e-05, "loss": 0.0181, "step": 925000 }, { "epoch": 0.009251, "grad_norm": 0.10399915277957916, "learning_rate": 1e-05, "loss": 0.0182, "step": 925100 }, { "epoch": 0.009252, "grad_norm": 0.21986430883407593, "learning_rate": 1e-05, "loss": 0.0178, "step": 925200 }, { "epoch": 0.009253, "grad_norm": 0.11848963052034378, "learning_rate": 1e-05, "loss": 0.018, "step": 925300 }, { "epoch": 0.009254, "grad_norm": 0.1332116574048996, "learning_rate": 1e-05, "loss": 0.0185, "step": 925400 }, { "epoch": 0.009255, "grad_norm": 0.170301616191864, "learning_rate": 1e-05, "loss": 0.0179, "step": 925500 }, { "epoch": 0.009256, "grad_norm": 0.14232264459133148, "learning_rate": 1e-05, "loss": 0.0181, "step": 925600 }, { "epoch": 0.009257, "grad_norm": 0.14332887530326843, "learning_rate": 1e-05, "loss": 0.0179, "step": 925700 }, { "epoch": 0.009258, "grad_norm": 0.11469817906618118, "learning_rate": 1e-05, "loss": 0.018, "step": 925800 }, { "epoch": 0.009259, "grad_norm": 0.15189829468727112, "learning_rate": 1e-05, "loss": 0.0181, "step": 925900 }, { "epoch": 0.00926, "grad_norm": 0.12052912265062332, "learning_rate": 1e-05, "loss": 0.0184, "step": 926000 }, { "epoch": 0.009261, "grad_norm": 0.15966296195983887, "learning_rate": 1e-05, "loss": 0.0182, "step": 926100 }, { "epoch": 0.009262, "grad_norm": 0.1697082370519638, "learning_rate": 1e-05, "loss": 0.0182, "step": 926200 }, { "epoch": 0.009263, "grad_norm": 0.13757948577404022, "learning_rate": 1e-05, "loss": 0.018, "step": 926300 }, { "epoch": 0.009264, "grad_norm": 0.11993888765573502, "learning_rate": 1e-05, "loss": 0.0182, "step": 926400 }, { "epoch": 0.009265, "grad_norm": 0.17137135565280914, "learning_rate": 1e-05, "loss": 0.0186, "step": 926500 }, { "epoch": 0.009266, "grad_norm": 0.1880064755678177, "learning_rate": 1e-05, "loss": 0.0181, "step": 926600 }, { "epoch": 0.009267, "grad_norm": 0.1702708750963211, "learning_rate": 1e-05, "loss": 0.0179, "step": 926700 }, { "epoch": 0.009268, "grad_norm": 0.1944560557603836, "learning_rate": 1e-05, "loss": 0.0183, "step": 926800 }, { "epoch": 0.009269, "grad_norm": 0.19806364178657532, "learning_rate": 1e-05, "loss": 0.0181, "step": 926900 }, { "epoch": 0.00927, "grad_norm": 0.1699836105108261, "learning_rate": 1e-05, "loss": 0.0182, "step": 927000 }, { "epoch": 0.009271, "grad_norm": 0.15245677530765533, "learning_rate": 1e-05, "loss": 0.0181, "step": 927100 }, { "epoch": 0.009272, "grad_norm": 0.13392511010169983, "learning_rate": 1e-05, "loss": 0.0181, "step": 927200 }, { "epoch": 0.009273, "grad_norm": 0.1890188753604889, "learning_rate": 1e-05, "loss": 0.0181, "step": 927300 }, { "epoch": 0.009274, "grad_norm": 0.11862649023532867, "learning_rate": 1e-05, "loss": 0.0185, "step": 927400 }, { "epoch": 0.009275, "grad_norm": 0.25446391105651855, "learning_rate": 1e-05, "loss": 0.0177, "step": 927500 }, { "epoch": 0.009276, "grad_norm": 0.1564226597547531, "learning_rate": 1e-05, "loss": 0.018, "step": 927600 }, { "epoch": 0.009277, "grad_norm": 0.1381351500749588, "learning_rate": 1e-05, "loss": 0.0179, "step": 927700 }, { "epoch": 0.009278, "grad_norm": 0.1524568796157837, "learning_rate": 1e-05, "loss": 0.018, "step": 927800 }, { "epoch": 0.009279, "grad_norm": 0.15851794183254242, "learning_rate": 1e-05, "loss": 0.0177, "step": 927900 }, { "epoch": 0.00928, "grad_norm": 0.16218140721321106, "learning_rate": 1e-05, "loss": 0.0182, "step": 928000 }, { "epoch": 0.009281, "grad_norm": 0.1230316087603569, "learning_rate": 1e-05, "loss": 0.0181, "step": 928100 }, { "epoch": 0.009282, "grad_norm": 0.14685802161693573, "learning_rate": 1e-05, "loss": 0.0179, "step": 928200 }, { "epoch": 0.009283, "grad_norm": 0.11216016858816147, "learning_rate": 1e-05, "loss": 0.0175, "step": 928300 }, { "epoch": 0.009284, "grad_norm": 0.24725329875946045, "learning_rate": 1e-05, "loss": 0.0183, "step": 928400 }, { "epoch": 0.009285, "grad_norm": 0.13705870509147644, "learning_rate": 1e-05, "loss": 0.018, "step": 928500 }, { "epoch": 0.009286, "grad_norm": 0.14583919942378998, "learning_rate": 1e-05, "loss": 0.0181, "step": 928600 }, { "epoch": 0.009287, "grad_norm": 0.10384032875299454, "learning_rate": 1e-05, "loss": 0.0182, "step": 928700 }, { "epoch": 0.009288, "grad_norm": 0.11147630959749222, "learning_rate": 1e-05, "loss": 0.018, "step": 928800 }, { "epoch": 0.009289, "grad_norm": 0.1739988923072815, "learning_rate": 1e-05, "loss": 0.0177, "step": 928900 }, { "epoch": 0.00929, "grad_norm": 0.142597034573555, "learning_rate": 1e-05, "loss": 0.018, "step": 929000 }, { "epoch": 0.009291, "grad_norm": 0.12384416162967682, "learning_rate": 1e-05, "loss": 0.018, "step": 929100 }, { "epoch": 0.009292, "grad_norm": 0.17769193649291992, "learning_rate": 1e-05, "loss": 0.0177, "step": 929200 }, { "epoch": 0.009293, "grad_norm": 0.13419194519519806, "learning_rate": 1e-05, "loss": 0.0178, "step": 929300 }, { "epoch": 0.009294, "grad_norm": 0.15695710480213165, "learning_rate": 1e-05, "loss": 0.018, "step": 929400 }, { "epoch": 0.009295, "grad_norm": 0.1778259575366974, "learning_rate": 1e-05, "loss": 0.0179, "step": 929500 }, { "epoch": 0.009296, "grad_norm": 0.15090411901474, "learning_rate": 1e-05, "loss": 0.0175, "step": 929600 }, { "epoch": 0.009297, "grad_norm": 0.11081527173519135, "learning_rate": 1e-05, "loss": 0.0181, "step": 929700 }, { "epoch": 0.009298, "grad_norm": 0.12441938370466232, "learning_rate": 1e-05, "loss": 0.0181, "step": 929800 }, { "epoch": 0.009299, "grad_norm": 0.10844814032316208, "learning_rate": 1e-05, "loss": 0.018, "step": 929900 }, { "epoch": 0.0093, "grad_norm": 0.1573033332824707, "learning_rate": 1e-05, "loss": 0.0185, "step": 930000 }, { "epoch": 0.009301, "grad_norm": 0.14442722499370575, "learning_rate": 1e-05, "loss": 0.0181, "step": 930100 }, { "epoch": 0.009302, "grad_norm": 0.19623424112796783, "learning_rate": 1e-05, "loss": 0.0177, "step": 930200 }, { "epoch": 0.009303, "grad_norm": 0.17261752486228943, "learning_rate": 1e-05, "loss": 0.0182, "step": 930300 }, { "epoch": 0.009304, "grad_norm": 0.12873288989067078, "learning_rate": 1e-05, "loss": 0.0179, "step": 930400 }, { "epoch": 0.009305, "grad_norm": 0.1451409012079239, "learning_rate": 1e-05, "loss": 0.0178, "step": 930500 }, { "epoch": 0.009306, "grad_norm": 0.1250436156988144, "learning_rate": 1e-05, "loss": 0.0181, "step": 930600 }, { "epoch": 0.009307, "grad_norm": 0.11203733086585999, "learning_rate": 1e-05, "loss": 0.0181, "step": 930700 }, { "epoch": 0.009308, "grad_norm": 0.1543307602405548, "learning_rate": 1e-05, "loss": 0.018, "step": 930800 }, { "epoch": 0.009309, "grad_norm": 0.13746653497219086, "learning_rate": 1e-05, "loss": 0.0183, "step": 930900 }, { "epoch": 0.00931, "grad_norm": 0.12792786955833435, "learning_rate": 1e-05, "loss": 0.018, "step": 931000 }, { "epoch": 0.009311, "grad_norm": 0.17920421063899994, "learning_rate": 1e-05, "loss": 0.018, "step": 931100 }, { "epoch": 0.009312, "grad_norm": 0.18692167103290558, "learning_rate": 1e-05, "loss": 0.0183, "step": 931200 }, { "epoch": 0.009313, "grad_norm": 0.16546735167503357, "learning_rate": 1e-05, "loss": 0.0179, "step": 931300 }, { "epoch": 0.009314, "grad_norm": 0.13890863955020905, "learning_rate": 1e-05, "loss": 0.0179, "step": 931400 }, { "epoch": 0.009315, "grad_norm": 0.13929997384548187, "learning_rate": 1e-05, "loss": 0.018, "step": 931500 }, { "epoch": 0.009316, "grad_norm": 0.1343056708574295, "learning_rate": 1e-05, "loss": 0.0183, "step": 931600 }, { "epoch": 0.009317, "grad_norm": 0.14534078538417816, "learning_rate": 1e-05, "loss": 0.0186, "step": 931700 }, { "epoch": 0.009318, "grad_norm": 0.16930507123470306, "learning_rate": 1e-05, "loss": 0.0176, "step": 931800 }, { "epoch": 0.009319, "grad_norm": 0.1899981051683426, "learning_rate": 1e-05, "loss": 0.0181, "step": 931900 }, { "epoch": 0.00932, "grad_norm": 0.1715501993894577, "learning_rate": 1e-05, "loss": 0.018, "step": 932000 }, { "epoch": 0.009321, "grad_norm": 0.16633954644203186, "learning_rate": 1e-05, "loss": 0.0184, "step": 932100 }, { "epoch": 0.009322, "grad_norm": 0.16599927842617035, "learning_rate": 1e-05, "loss": 0.0179, "step": 932200 }, { "epoch": 0.009323, "grad_norm": 0.15973451733589172, "learning_rate": 1e-05, "loss": 0.018, "step": 932300 }, { "epoch": 0.009324, "grad_norm": 0.15406367182731628, "learning_rate": 1e-05, "loss": 0.0176, "step": 932400 }, { "epoch": 0.009325, "grad_norm": 0.19366222620010376, "learning_rate": 1e-05, "loss": 0.0176, "step": 932500 }, { "epoch": 0.009326, "grad_norm": 0.15970675647258759, "learning_rate": 1e-05, "loss": 0.018, "step": 932600 }, { "epoch": 0.009327, "grad_norm": 0.14274531602859497, "learning_rate": 1e-05, "loss": 0.0179, "step": 932700 }, { "epoch": 0.009328, "grad_norm": 0.14493858814239502, "learning_rate": 1e-05, "loss": 0.0179, "step": 932800 }, { "epoch": 0.009329, "grad_norm": 0.15462525188922882, "learning_rate": 1e-05, "loss": 0.0183, "step": 932900 }, { "epoch": 0.00933, "grad_norm": 0.15276506543159485, "learning_rate": 1e-05, "loss": 0.0179, "step": 933000 }, { "epoch": 0.009331, "grad_norm": 0.2587019205093384, "learning_rate": 1e-05, "loss": 0.0183, "step": 933100 }, { "epoch": 0.009332, "grad_norm": 0.15870976448059082, "learning_rate": 1e-05, "loss": 0.0177, "step": 933200 }, { "epoch": 0.009333, "grad_norm": 0.10374358296394348, "learning_rate": 1e-05, "loss": 0.0182, "step": 933300 }, { "epoch": 0.009334, "grad_norm": 0.17694774270057678, "learning_rate": 1e-05, "loss": 0.0179, "step": 933400 }, { "epoch": 0.009335, "grad_norm": 0.16231349110603333, "learning_rate": 1e-05, "loss": 0.0179, "step": 933500 }, { "epoch": 0.009336, "grad_norm": 0.21514008939266205, "learning_rate": 1e-05, "loss": 0.0177, "step": 933600 }, { "epoch": 0.009337, "grad_norm": 0.18637490272521973, "learning_rate": 1e-05, "loss": 0.0179, "step": 933700 }, { "epoch": 0.009338, "grad_norm": 0.1684928983449936, "learning_rate": 1e-05, "loss": 0.0182, "step": 933800 }, { "epoch": 0.009339, "grad_norm": 0.13512414693832397, "learning_rate": 1e-05, "loss": 0.018, "step": 933900 }, { "epoch": 0.00934, "grad_norm": 0.1331205815076828, "learning_rate": 1e-05, "loss": 0.0182, "step": 934000 }, { "epoch": 0.009341, "grad_norm": 0.1962091475725174, "learning_rate": 1e-05, "loss": 0.0183, "step": 934100 }, { "epoch": 0.009342, "grad_norm": 0.15741625428199768, "learning_rate": 1e-05, "loss": 0.0182, "step": 934200 }, { "epoch": 0.009343, "grad_norm": 0.15543271601200104, "learning_rate": 1e-05, "loss": 0.0177, "step": 934300 }, { "epoch": 0.009344, "grad_norm": 0.13104425370693207, "learning_rate": 1e-05, "loss": 0.0182, "step": 934400 }, { "epoch": 0.009345, "grad_norm": 0.14001299440860748, "learning_rate": 1e-05, "loss": 0.0184, "step": 934500 }, { "epoch": 0.009346, "grad_norm": 0.1829235553741455, "learning_rate": 1e-05, "loss": 0.0181, "step": 934600 }, { "epoch": 0.009347, "grad_norm": 0.13670490682125092, "learning_rate": 1e-05, "loss": 0.018, "step": 934700 }, { "epoch": 0.009348, "grad_norm": 0.15246827900409698, "learning_rate": 1e-05, "loss": 0.0182, "step": 934800 }, { "epoch": 0.009349, "grad_norm": 0.1166670024394989, "learning_rate": 1e-05, "loss": 0.018, "step": 934900 }, { "epoch": 0.00935, "grad_norm": 0.15546110272407532, "learning_rate": 1e-05, "loss": 0.0178, "step": 935000 }, { "epoch": 0.009351, "grad_norm": 0.12334131449460983, "learning_rate": 1e-05, "loss": 0.0183, "step": 935100 }, { "epoch": 0.009352, "grad_norm": 0.12835505604743958, "learning_rate": 1e-05, "loss": 0.0184, "step": 935200 }, { "epoch": 0.009353, "grad_norm": 0.1311442106962204, "learning_rate": 1e-05, "loss": 0.0182, "step": 935300 }, { "epoch": 0.009354, "grad_norm": 0.13854368031024933, "learning_rate": 1e-05, "loss": 0.0185, "step": 935400 }, { "epoch": 0.009355, "grad_norm": 0.1308385282754898, "learning_rate": 1e-05, "loss": 0.018, "step": 935500 }, { "epoch": 0.009356, "grad_norm": 0.12530401349067688, "learning_rate": 1e-05, "loss": 0.0183, "step": 935600 }, { "epoch": 0.009357, "grad_norm": 0.11094356328248978, "learning_rate": 1e-05, "loss": 0.0179, "step": 935700 }, { "epoch": 0.009358, "grad_norm": 0.19470266997814178, "learning_rate": 1e-05, "loss": 0.0181, "step": 935800 }, { "epoch": 0.009359, "grad_norm": 0.1336561143398285, "learning_rate": 1e-05, "loss": 0.018, "step": 935900 }, { "epoch": 0.00936, "grad_norm": 0.24172258377075195, "learning_rate": 1e-05, "loss": 0.0177, "step": 936000 }, { "epoch": 0.009361, "grad_norm": 0.13881339132785797, "learning_rate": 1e-05, "loss": 0.0178, "step": 936100 }, { "epoch": 0.009362, "grad_norm": 0.14799122512340546, "learning_rate": 1e-05, "loss": 0.0178, "step": 936200 }, { "epoch": 0.009363, "grad_norm": 0.12750327587127686, "learning_rate": 1e-05, "loss": 0.018, "step": 936300 }, { "epoch": 0.009364, "grad_norm": 0.13429421186447144, "learning_rate": 1e-05, "loss": 0.0178, "step": 936400 }, { "epoch": 0.009365, "grad_norm": 0.19858290255069733, "learning_rate": 1e-05, "loss": 0.018, "step": 936500 }, { "epoch": 0.009366, "grad_norm": 0.13402289152145386, "learning_rate": 1e-05, "loss": 0.0179, "step": 936600 }, { "epoch": 0.009367, "grad_norm": 0.10637903213500977, "learning_rate": 1e-05, "loss": 0.0181, "step": 936700 }, { "epoch": 0.009368, "grad_norm": 0.15497159957885742, "learning_rate": 1e-05, "loss": 0.0179, "step": 936800 }, { "epoch": 0.009369, "grad_norm": 0.21724186837673187, "learning_rate": 1e-05, "loss": 0.0181, "step": 936900 }, { "epoch": 0.00937, "grad_norm": 0.1582552045583725, "learning_rate": 1e-05, "loss": 0.0179, "step": 937000 }, { "epoch": 0.009371, "grad_norm": 0.11902865022420883, "learning_rate": 1e-05, "loss": 0.018, "step": 937100 }, { "epoch": 0.009372, "grad_norm": 0.12663181126117706, "learning_rate": 1e-05, "loss": 0.0178, "step": 937200 }, { "epoch": 0.009373, "grad_norm": 0.12109782546758652, "learning_rate": 1e-05, "loss": 0.018, "step": 937300 }, { "epoch": 0.009374, "grad_norm": 0.18306905031204224, "learning_rate": 1e-05, "loss": 0.0183, "step": 937400 }, { "epoch": 0.009375, "grad_norm": 0.11037253588438034, "learning_rate": 1e-05, "loss": 0.0179, "step": 937500 }, { "epoch": 0.009376, "grad_norm": 0.15797671675682068, "learning_rate": 1e-05, "loss": 0.0185, "step": 937600 }, { "epoch": 0.009377, "grad_norm": 0.11785414069890976, "learning_rate": 1e-05, "loss": 0.0178, "step": 937700 }, { "epoch": 0.009378, "grad_norm": 0.12331636250019073, "learning_rate": 1e-05, "loss": 0.0176, "step": 937800 }, { "epoch": 0.009379, "grad_norm": 0.14648693799972534, "learning_rate": 1e-05, "loss": 0.0178, "step": 937900 }, { "epoch": 0.00938, "grad_norm": 0.14254789054393768, "learning_rate": 1e-05, "loss": 0.0178, "step": 938000 }, { "epoch": 0.009381, "grad_norm": 0.1634385883808136, "learning_rate": 1e-05, "loss": 0.0181, "step": 938100 }, { "epoch": 0.009382, "grad_norm": 0.1175227016210556, "learning_rate": 1e-05, "loss": 0.0179, "step": 938200 }, { "epoch": 0.009383, "grad_norm": 0.16071324050426483, "learning_rate": 1e-05, "loss": 0.0182, "step": 938300 }, { "epoch": 0.009384, "grad_norm": 0.11998133361339569, "learning_rate": 1e-05, "loss": 0.0185, "step": 938400 }, { "epoch": 0.009385, "grad_norm": 0.1453508734703064, "learning_rate": 1e-05, "loss": 0.0178, "step": 938500 }, { "epoch": 0.009386, "grad_norm": 0.12241526693105698, "learning_rate": 1e-05, "loss": 0.018, "step": 938600 }, { "epoch": 0.009387, "grad_norm": 0.14062145352363586, "learning_rate": 1e-05, "loss": 0.0185, "step": 938700 }, { "epoch": 0.009388, "grad_norm": 0.15056899189949036, "learning_rate": 1e-05, "loss": 0.0179, "step": 938800 }, { "epoch": 0.009389, "grad_norm": 0.17937175929546356, "learning_rate": 1e-05, "loss": 0.0179, "step": 938900 }, { "epoch": 0.00939, "grad_norm": 0.18851806223392487, "learning_rate": 1e-05, "loss": 0.0182, "step": 939000 }, { "epoch": 0.009391, "grad_norm": 0.1700427532196045, "learning_rate": 1e-05, "loss": 0.0181, "step": 939100 }, { "epoch": 0.009392, "grad_norm": 0.1652897745370865, "learning_rate": 1e-05, "loss": 0.0177, "step": 939200 }, { "epoch": 0.009393, "grad_norm": 0.15616154670715332, "learning_rate": 1e-05, "loss": 0.0185, "step": 939300 }, { "epoch": 0.009394, "grad_norm": 0.1020502969622612, "learning_rate": 1e-05, "loss": 0.0177, "step": 939400 }, { "epoch": 0.009395, "grad_norm": 0.1509954035282135, "learning_rate": 1e-05, "loss": 0.018, "step": 939500 }, { "epoch": 0.009396, "grad_norm": 0.11501530557870865, "learning_rate": 1e-05, "loss": 0.0182, "step": 939600 }, { "epoch": 0.009397, "grad_norm": 0.13742691278457642, "learning_rate": 1e-05, "loss": 0.0182, "step": 939700 }, { "epoch": 0.009398, "grad_norm": 0.15436850488185883, "learning_rate": 1e-05, "loss": 0.0184, "step": 939800 }, { "epoch": 0.009399, "grad_norm": 0.11451384425163269, "learning_rate": 1e-05, "loss": 0.018, "step": 939900 }, { "epoch": 0.0094, "grad_norm": 0.14588619768619537, "learning_rate": 1e-05, "loss": 0.0185, "step": 940000 }, { "epoch": 0.0094, "eval_loss": 0.015838859602808952, "eval_runtime": 191.3173, "eval_samples_per_second": 261.346, "eval_steps_per_second": 16.334, "step": 940000 }, { "epoch": 0.009401, "grad_norm": 0.11793120950460434, "learning_rate": 1e-05, "loss": 0.0187, "step": 940100 }, { "epoch": 0.009402, "grad_norm": 0.11050935834646225, "learning_rate": 1e-05, "loss": 0.0177, "step": 940200 }, { "epoch": 0.009403, "grad_norm": 0.14438027143478394, "learning_rate": 1e-05, "loss": 0.018, "step": 940300 }, { "epoch": 0.009404, "grad_norm": 0.18890702724456787, "learning_rate": 1e-05, "loss": 0.0184, "step": 940400 }, { "epoch": 0.009405, "grad_norm": 0.22736987471580505, "learning_rate": 1e-05, "loss": 0.0183, "step": 940500 }, { "epoch": 0.009406, "grad_norm": 0.15173248946666718, "learning_rate": 1e-05, "loss": 0.0182, "step": 940600 }, { "epoch": 0.009407, "grad_norm": 0.18988750874996185, "learning_rate": 1e-05, "loss": 0.0183, "step": 940700 }, { "epoch": 0.009408, "grad_norm": 0.19747765362262726, "learning_rate": 1e-05, "loss": 0.0179, "step": 940800 }, { "epoch": 0.009409, "grad_norm": 0.1614970713853836, "learning_rate": 1e-05, "loss": 0.018, "step": 940900 }, { "epoch": 0.00941, "grad_norm": 0.1305038332939148, "learning_rate": 1e-05, "loss": 0.0183, "step": 941000 }, { "epoch": 0.009411, "grad_norm": 0.1549704372882843, "learning_rate": 1e-05, "loss": 0.0178, "step": 941100 }, { "epoch": 0.009412, "grad_norm": 0.13940462470054626, "learning_rate": 1e-05, "loss": 0.0179, "step": 941200 }, { "epoch": 0.009413, "grad_norm": 0.13899250328540802, "learning_rate": 1e-05, "loss": 0.0184, "step": 941300 }, { "epoch": 0.009414, "grad_norm": 0.135349839925766, "learning_rate": 1e-05, "loss": 0.018, "step": 941400 }, { "epoch": 0.009415, "grad_norm": 0.1392451524734497, "learning_rate": 1e-05, "loss": 0.0183, "step": 941500 }, { "epoch": 0.009416, "grad_norm": 0.1468505859375, "learning_rate": 1e-05, "loss": 0.0181, "step": 941600 }, { "epoch": 0.009417, "grad_norm": 0.18143370747566223, "learning_rate": 1e-05, "loss": 0.0183, "step": 941700 }, { "epoch": 0.009418, "grad_norm": 0.13255265355110168, "learning_rate": 1e-05, "loss": 0.0179, "step": 941800 }, { "epoch": 0.009419, "grad_norm": 0.11997874826192856, "learning_rate": 1e-05, "loss": 0.0178, "step": 941900 }, { "epoch": 0.00942, "grad_norm": 0.1266726851463318, "learning_rate": 1e-05, "loss": 0.0178, "step": 942000 }, { "epoch": 0.009421, "grad_norm": 0.16673791408538818, "learning_rate": 1e-05, "loss": 0.0181, "step": 942100 }, { "epoch": 0.009422, "grad_norm": 0.18915756046772003, "learning_rate": 1e-05, "loss": 0.0179, "step": 942200 }, { "epoch": 0.009423, "grad_norm": 0.132243350148201, "learning_rate": 1e-05, "loss": 0.0179, "step": 942300 }, { "epoch": 0.009424, "grad_norm": 0.10712031275033951, "learning_rate": 1e-05, "loss": 0.0186, "step": 942400 }, { "epoch": 0.009425, "grad_norm": 0.17145071923732758, "learning_rate": 1e-05, "loss": 0.0181, "step": 942500 }, { "epoch": 0.009426, "grad_norm": 0.17142502963542938, "learning_rate": 1e-05, "loss": 0.0179, "step": 942600 }, { "epoch": 0.009427, "grad_norm": 0.11862899363040924, "learning_rate": 1e-05, "loss": 0.0179, "step": 942700 }, { "epoch": 0.009428, "grad_norm": 0.18932422995567322, "learning_rate": 1e-05, "loss": 0.0177, "step": 942800 }, { "epoch": 0.009429, "grad_norm": 0.12340790778398514, "learning_rate": 1e-05, "loss": 0.0177, "step": 942900 }, { "epoch": 0.00943, "grad_norm": 0.197589710354805, "learning_rate": 1e-05, "loss": 0.0176, "step": 943000 }, { "epoch": 0.009431, "grad_norm": 0.1460799276828766, "learning_rate": 1e-05, "loss": 0.0185, "step": 943100 }, { "epoch": 0.009432, "grad_norm": 0.12229296565055847, "learning_rate": 1e-05, "loss": 0.0179, "step": 943200 }, { "epoch": 0.009433, "grad_norm": 0.11746305227279663, "learning_rate": 1e-05, "loss": 0.018, "step": 943300 }, { "epoch": 0.009434, "grad_norm": 0.1319737285375595, "learning_rate": 1e-05, "loss": 0.0177, "step": 943400 }, { "epoch": 0.009435, "grad_norm": 0.12323155999183655, "learning_rate": 1e-05, "loss": 0.0181, "step": 943500 }, { "epoch": 0.009436, "grad_norm": 0.16949400305747986, "learning_rate": 1e-05, "loss": 0.0182, "step": 943600 }, { "epoch": 0.009437, "grad_norm": 0.16578827798366547, "learning_rate": 1e-05, "loss": 0.0178, "step": 943700 }, { "epoch": 0.009438, "grad_norm": 0.10304205119609833, "learning_rate": 1e-05, "loss": 0.0178, "step": 943800 }, { "epoch": 0.009439, "grad_norm": 0.1581662893295288, "learning_rate": 1e-05, "loss": 0.0186, "step": 943900 }, { "epoch": 0.00944, "grad_norm": 0.15317153930664062, "learning_rate": 1e-05, "loss": 0.0181, "step": 944000 }, { "epoch": 0.009441, "grad_norm": 0.14246517419815063, "learning_rate": 1e-05, "loss": 0.0176, "step": 944100 }, { "epoch": 0.009442, "grad_norm": 0.14863377809524536, "learning_rate": 1e-05, "loss": 0.0182, "step": 944200 }, { "epoch": 0.009443, "grad_norm": 0.1375097632408142, "learning_rate": 1e-05, "loss": 0.0176, "step": 944300 }, { "epoch": 0.009444, "grad_norm": 0.15790222585201263, "learning_rate": 1e-05, "loss": 0.018, "step": 944400 }, { "epoch": 0.009445, "grad_norm": 0.1705692857503891, "learning_rate": 1e-05, "loss": 0.0181, "step": 944500 }, { "epoch": 0.009446, "grad_norm": 0.17581632733345032, "learning_rate": 1e-05, "loss": 0.0178, "step": 944600 }, { "epoch": 0.009447, "grad_norm": 0.11708685010671616, "learning_rate": 1e-05, "loss": 0.018, "step": 944700 }, { "epoch": 0.009448, "grad_norm": 0.14109794795513153, "learning_rate": 1e-05, "loss": 0.018, "step": 944800 }, { "epoch": 0.009449, "grad_norm": 0.12718099355697632, "learning_rate": 1e-05, "loss": 0.0177, "step": 944900 }, { "epoch": 0.00945, "grad_norm": 0.14242209494113922, "learning_rate": 1e-05, "loss": 0.0178, "step": 945000 }, { "epoch": 0.009451, "grad_norm": 0.1657029092311859, "learning_rate": 1e-05, "loss": 0.0181, "step": 945100 }, { "epoch": 0.009452, "grad_norm": 0.1387983113527298, "learning_rate": 1e-05, "loss": 0.0179, "step": 945200 }, { "epoch": 0.009453, "grad_norm": 0.13325797021389008, "learning_rate": 1e-05, "loss": 0.0179, "step": 945300 }, { "epoch": 0.009454, "grad_norm": 0.14563493430614471, "learning_rate": 1e-05, "loss": 0.0177, "step": 945400 }, { "epoch": 0.009455, "grad_norm": 0.14003588259220123, "learning_rate": 1e-05, "loss": 0.0181, "step": 945500 }, { "epoch": 0.009456, "grad_norm": 0.211473286151886, "learning_rate": 1e-05, "loss": 0.0178, "step": 945600 }, { "epoch": 0.009457, "grad_norm": 0.14821314811706543, "learning_rate": 1e-05, "loss": 0.0179, "step": 945700 }, { "epoch": 0.009458, "grad_norm": 0.1504562795162201, "learning_rate": 1e-05, "loss": 0.0181, "step": 945800 }, { "epoch": 0.009459, "grad_norm": 0.16178929805755615, "learning_rate": 1e-05, "loss": 0.0177, "step": 945900 }, { "epoch": 0.00946, "grad_norm": 0.1603882610797882, "learning_rate": 1e-05, "loss": 0.0178, "step": 946000 }, { "epoch": 0.009461, "grad_norm": 0.21819761395454407, "learning_rate": 1e-05, "loss": 0.0177, "step": 946100 }, { "epoch": 0.009462, "grad_norm": 0.13563300669193268, "learning_rate": 1e-05, "loss": 0.0179, "step": 946200 }, { "epoch": 0.009463, "grad_norm": 0.13368982076644897, "learning_rate": 1e-05, "loss": 0.0179, "step": 946300 }, { "epoch": 0.009464, "grad_norm": 0.18668992817401886, "learning_rate": 1e-05, "loss": 0.0182, "step": 946400 }, { "epoch": 0.009465, "grad_norm": 0.1254568248987198, "learning_rate": 1e-05, "loss": 0.0177, "step": 946500 }, { "epoch": 0.009466, "grad_norm": 0.3115992248058319, "learning_rate": 1e-05, "loss": 0.0185, "step": 946600 }, { "epoch": 0.009467, "grad_norm": 0.16160601377487183, "learning_rate": 1e-05, "loss": 0.0185, "step": 946700 }, { "epoch": 0.009468, "grad_norm": 0.23021253943443298, "learning_rate": 1e-05, "loss": 0.0182, "step": 946800 }, { "epoch": 0.009469, "grad_norm": 0.17398318648338318, "learning_rate": 1e-05, "loss": 0.018, "step": 946900 }, { "epoch": 0.00947, "grad_norm": 0.12136880308389664, "learning_rate": 1e-05, "loss": 0.0182, "step": 947000 }, { "epoch": 0.009471, "grad_norm": 0.1627727746963501, "learning_rate": 1e-05, "loss": 0.0184, "step": 947100 }, { "epoch": 0.009472, "grad_norm": 0.13421747088432312, "learning_rate": 1e-05, "loss": 0.0174, "step": 947200 }, { "epoch": 0.009473, "grad_norm": 0.17403683066368103, "learning_rate": 1e-05, "loss": 0.018, "step": 947300 }, { "epoch": 0.009474, "grad_norm": 0.12243827432394028, "learning_rate": 1e-05, "loss": 0.0182, "step": 947400 }, { "epoch": 0.009475, "grad_norm": 0.10883220285177231, "learning_rate": 1e-05, "loss": 0.0181, "step": 947500 }, { "epoch": 0.009476, "grad_norm": 0.13728532195091248, "learning_rate": 1e-05, "loss": 0.0183, "step": 947600 }, { "epoch": 0.009477, "grad_norm": 0.14445815980434418, "learning_rate": 1e-05, "loss": 0.0177, "step": 947700 }, { "epoch": 0.009478, "grad_norm": 0.13360314071178436, "learning_rate": 1e-05, "loss": 0.0179, "step": 947800 }, { "epoch": 0.009479, "grad_norm": 0.13085371255874634, "learning_rate": 1e-05, "loss": 0.018, "step": 947900 }, { "epoch": 0.00948, "grad_norm": 0.19845075905323029, "learning_rate": 1e-05, "loss": 0.0185, "step": 948000 }, { "epoch": 0.009481, "grad_norm": 0.12791161239147186, "learning_rate": 1e-05, "loss": 0.0184, "step": 948100 }, { "epoch": 0.009482, "grad_norm": 0.156454935669899, "learning_rate": 1e-05, "loss": 0.0181, "step": 948200 }, { "epoch": 0.009483, "grad_norm": 0.14785659313201904, "learning_rate": 1e-05, "loss": 0.0179, "step": 948300 }, { "epoch": 0.009484, "grad_norm": 0.11905902624130249, "learning_rate": 1e-05, "loss": 0.0183, "step": 948400 }, { "epoch": 0.009485, "grad_norm": 0.11752021312713623, "learning_rate": 1e-05, "loss": 0.0182, "step": 948500 }, { "epoch": 0.009486, "grad_norm": 0.1353670060634613, "learning_rate": 1e-05, "loss": 0.018, "step": 948600 }, { "epoch": 0.009487, "grad_norm": 0.13574032485485077, "learning_rate": 1e-05, "loss": 0.0181, "step": 948700 }, { "epoch": 0.009488, "grad_norm": 0.10603903979063034, "learning_rate": 1e-05, "loss": 0.0181, "step": 948800 }, { "epoch": 0.009489, "grad_norm": 0.1055871769785881, "learning_rate": 1e-05, "loss": 0.0181, "step": 948900 }, { "epoch": 0.00949, "grad_norm": 0.16486436128616333, "learning_rate": 1e-05, "loss": 0.0175, "step": 949000 }, { "epoch": 0.009491, "grad_norm": 0.2534329891204834, "learning_rate": 1e-05, "loss": 0.0184, "step": 949100 }, { "epoch": 0.009492, "grad_norm": 0.1548013985157013, "learning_rate": 1e-05, "loss": 0.0188, "step": 949200 }, { "epoch": 0.009493, "grad_norm": 0.17351464927196503, "learning_rate": 1e-05, "loss": 0.0183, "step": 949300 }, { "epoch": 0.009494, "grad_norm": 0.21082308888435364, "learning_rate": 1e-05, "loss": 0.0186, "step": 949400 }, { "epoch": 0.009495, "grad_norm": 0.14468476176261902, "learning_rate": 1e-05, "loss": 0.018, "step": 949500 }, { "epoch": 0.009496, "grad_norm": 0.13380751013755798, "learning_rate": 1e-05, "loss": 0.018, "step": 949600 }, { "epoch": 0.009497, "grad_norm": 0.1507839560508728, "learning_rate": 1e-05, "loss": 0.0178, "step": 949700 }, { "epoch": 0.009498, "grad_norm": 0.17213378846645355, "learning_rate": 1e-05, "loss": 0.0178, "step": 949800 }, { "epoch": 0.009499, "grad_norm": 0.161167711019516, "learning_rate": 1e-05, "loss": 0.0183, "step": 949900 }, { "epoch": 0.0095, "grad_norm": 0.14871449768543243, "learning_rate": 1e-05, "loss": 0.0186, "step": 950000 }, { "epoch": 0.009501, "grad_norm": 0.17254063487052917, "learning_rate": 1e-05, "loss": 0.0182, "step": 950100 }, { "epoch": 0.009502, "grad_norm": 0.12380267679691315, "learning_rate": 1e-05, "loss": 0.018, "step": 950200 }, { "epoch": 0.009503, "grad_norm": 0.14276112616062164, "learning_rate": 1e-05, "loss": 0.0179, "step": 950300 }, { "epoch": 0.009504, "grad_norm": 0.20552778244018555, "learning_rate": 1e-05, "loss": 0.018, "step": 950400 }, { "epoch": 0.009505, "grad_norm": 0.16740301251411438, "learning_rate": 1e-05, "loss": 0.0177, "step": 950500 }, { "epoch": 0.009506, "grad_norm": 0.18112064898014069, "learning_rate": 1e-05, "loss": 0.0183, "step": 950600 }, { "epoch": 0.009507, "grad_norm": 0.13253802061080933, "learning_rate": 1e-05, "loss": 0.018, "step": 950700 }, { "epoch": 0.009508, "grad_norm": 0.1273380070924759, "learning_rate": 1e-05, "loss": 0.0178, "step": 950800 }, { "epoch": 0.009509, "grad_norm": 0.15191443264484406, "learning_rate": 1e-05, "loss": 0.0179, "step": 950900 }, { "epoch": 0.00951, "grad_norm": 0.1349819004535675, "learning_rate": 1e-05, "loss": 0.0178, "step": 951000 }, { "epoch": 0.009511, "grad_norm": 0.15290360152721405, "learning_rate": 1e-05, "loss": 0.0179, "step": 951100 }, { "epoch": 0.009512, "grad_norm": 0.13994021713733673, "learning_rate": 1e-05, "loss": 0.0174, "step": 951200 }, { "epoch": 0.009513, "grad_norm": 0.15382781624794006, "learning_rate": 1e-05, "loss": 0.018, "step": 951300 }, { "epoch": 0.009514, "grad_norm": 0.1740882396697998, "learning_rate": 1e-05, "loss": 0.0181, "step": 951400 }, { "epoch": 0.009515, "grad_norm": 0.18523475527763367, "learning_rate": 1e-05, "loss": 0.0177, "step": 951500 }, { "epoch": 0.009516, "grad_norm": 0.1334380954504013, "learning_rate": 1e-05, "loss": 0.0174, "step": 951600 }, { "epoch": 0.009517, "grad_norm": 0.14595051109790802, "learning_rate": 1e-05, "loss": 0.0183, "step": 951700 }, { "epoch": 0.009518, "grad_norm": 0.14485403895378113, "learning_rate": 1e-05, "loss": 0.0181, "step": 951800 }, { "epoch": 0.009519, "grad_norm": 0.11490979045629501, "learning_rate": 1e-05, "loss": 0.018, "step": 951900 }, { "epoch": 0.00952, "grad_norm": 0.12913402915000916, "learning_rate": 1e-05, "loss": 0.0175, "step": 952000 }, { "epoch": 0.009521, "grad_norm": 0.13765612244606018, "learning_rate": 1e-05, "loss": 0.0177, "step": 952100 }, { "epoch": 0.009522, "grad_norm": 0.1497364193201065, "learning_rate": 1e-05, "loss": 0.0178, "step": 952200 }, { "epoch": 0.009523, "grad_norm": 0.14224587380886078, "learning_rate": 1e-05, "loss": 0.0182, "step": 952300 }, { "epoch": 0.009524, "grad_norm": 0.09687583893537521, "learning_rate": 1e-05, "loss": 0.0182, "step": 952400 }, { "epoch": 0.009525, "grad_norm": 0.1987210065126419, "learning_rate": 1e-05, "loss": 0.0181, "step": 952500 }, { "epoch": 0.009526, "grad_norm": 0.1698167771100998, "learning_rate": 1e-05, "loss": 0.0183, "step": 952600 }, { "epoch": 0.009527, "grad_norm": 0.12467025220394135, "learning_rate": 1e-05, "loss": 0.0175, "step": 952700 }, { "epoch": 0.009528, "grad_norm": 0.13725455105304718, "learning_rate": 1e-05, "loss": 0.0177, "step": 952800 }, { "epoch": 0.009529, "grad_norm": 0.1841859221458435, "learning_rate": 1e-05, "loss": 0.0178, "step": 952900 }, { "epoch": 0.00953, "grad_norm": 0.15734036266803741, "learning_rate": 1e-05, "loss": 0.0177, "step": 953000 }, { "epoch": 0.009531, "grad_norm": 0.2655043303966522, "learning_rate": 1e-05, "loss": 0.0181, "step": 953100 }, { "epoch": 0.009532, "grad_norm": 0.1578027606010437, "learning_rate": 1e-05, "loss": 0.0178, "step": 953200 }, { "epoch": 0.009533, "grad_norm": 0.174931600689888, "learning_rate": 1e-05, "loss": 0.0176, "step": 953300 }, { "epoch": 0.009534, "grad_norm": 0.13093125820159912, "learning_rate": 1e-05, "loss": 0.0179, "step": 953400 }, { "epoch": 0.009535, "grad_norm": 0.09978274255990982, "learning_rate": 1e-05, "loss": 0.0176, "step": 953500 }, { "epoch": 0.009536, "grad_norm": 0.14106376469135284, "learning_rate": 1e-05, "loss": 0.0181, "step": 953600 }, { "epoch": 0.009537, "grad_norm": 0.11955684423446655, "learning_rate": 1e-05, "loss": 0.0181, "step": 953700 }, { "epoch": 0.009538, "grad_norm": 0.14301416277885437, "learning_rate": 1e-05, "loss": 0.018, "step": 953800 }, { "epoch": 0.009539, "grad_norm": 0.15769271552562714, "learning_rate": 1e-05, "loss": 0.0182, "step": 953900 }, { "epoch": 0.00954, "grad_norm": 0.1648533195257187, "learning_rate": 1e-05, "loss": 0.0182, "step": 954000 }, { "epoch": 0.009541, "grad_norm": 0.15114901959896088, "learning_rate": 1e-05, "loss": 0.0179, "step": 954100 }, { "epoch": 0.009542, "grad_norm": 0.15082402527332306, "learning_rate": 1e-05, "loss": 0.0183, "step": 954200 }, { "epoch": 0.009543, "grad_norm": 0.155763640999794, "learning_rate": 1e-05, "loss": 0.0182, "step": 954300 }, { "epoch": 0.009544, "grad_norm": 0.12508313357830048, "learning_rate": 1e-05, "loss": 0.0182, "step": 954400 }, { "epoch": 0.009545, "grad_norm": 0.11934836208820343, "learning_rate": 1e-05, "loss": 0.018, "step": 954500 }, { "epoch": 0.009546, "grad_norm": 0.15714412927627563, "learning_rate": 1e-05, "loss": 0.0181, "step": 954600 }, { "epoch": 0.009547, "grad_norm": 0.1475592851638794, "learning_rate": 1e-05, "loss": 0.0176, "step": 954700 }, { "epoch": 0.009548, "grad_norm": 0.13826996088027954, "learning_rate": 1e-05, "loss": 0.0179, "step": 954800 }, { "epoch": 0.009549, "grad_norm": 0.11912572383880615, "learning_rate": 1e-05, "loss": 0.0181, "step": 954900 }, { "epoch": 0.00955, "grad_norm": 0.120777428150177, "learning_rate": 1e-05, "loss": 0.018, "step": 955000 }, { "epoch": 0.009551, "grad_norm": 0.11317130923271179, "learning_rate": 1e-05, "loss": 0.0178, "step": 955100 }, { "epoch": 0.009552, "grad_norm": 0.1655065417289734, "learning_rate": 1e-05, "loss": 0.0179, "step": 955200 }, { "epoch": 0.009553, "grad_norm": 0.15075364708900452, "learning_rate": 1e-05, "loss": 0.0177, "step": 955300 }, { "epoch": 0.009554, "grad_norm": 0.1484041064977646, "learning_rate": 1e-05, "loss": 0.0181, "step": 955400 }, { "epoch": 0.009555, "grad_norm": 0.17565913498401642, "learning_rate": 1e-05, "loss": 0.0178, "step": 955500 }, { "epoch": 0.009556, "grad_norm": 0.1454722285270691, "learning_rate": 1e-05, "loss": 0.018, "step": 955600 }, { "epoch": 0.009557, "grad_norm": 0.17111198604106903, "learning_rate": 1e-05, "loss": 0.0181, "step": 955700 }, { "epoch": 0.009558, "grad_norm": 0.14949911832809448, "learning_rate": 1e-05, "loss": 0.0181, "step": 955800 }, { "epoch": 0.009559, "grad_norm": 0.15436339378356934, "learning_rate": 1e-05, "loss": 0.0179, "step": 955900 }, { "epoch": 0.00956, "grad_norm": 0.17048072814941406, "learning_rate": 1e-05, "loss": 0.0181, "step": 956000 }, { "epoch": 0.009561, "grad_norm": 0.13304750621318817, "learning_rate": 1e-05, "loss": 0.0181, "step": 956100 }, { "epoch": 0.009562, "grad_norm": 0.12560956180095673, "learning_rate": 1e-05, "loss": 0.018, "step": 956200 }, { "epoch": 0.009563, "grad_norm": 0.10967866331338882, "learning_rate": 1e-05, "loss": 0.0176, "step": 956300 }, { "epoch": 0.009564, "grad_norm": 0.1288253217935562, "learning_rate": 1e-05, "loss": 0.0179, "step": 956400 }, { "epoch": 0.009565, "grad_norm": 0.12134408950805664, "learning_rate": 1e-05, "loss": 0.0176, "step": 956500 }, { "epoch": 0.009566, "grad_norm": 0.12199041247367859, "learning_rate": 1e-05, "loss": 0.0182, "step": 956600 }, { "epoch": 0.009567, "grad_norm": 0.136484757065773, "learning_rate": 1e-05, "loss": 0.018, "step": 956700 }, { "epoch": 0.009568, "grad_norm": 0.11849475651979446, "learning_rate": 1e-05, "loss": 0.0179, "step": 956800 }, { "epoch": 0.009569, "grad_norm": 0.10444319993257523, "learning_rate": 1e-05, "loss": 0.0184, "step": 956900 }, { "epoch": 0.00957, "grad_norm": 0.1542115956544876, "learning_rate": 1e-05, "loss": 0.0173, "step": 957000 }, { "epoch": 0.009571, "grad_norm": 0.15491606295108795, "learning_rate": 1e-05, "loss": 0.0182, "step": 957100 }, { "epoch": 0.009572, "grad_norm": 0.15103894472122192, "learning_rate": 1e-05, "loss": 0.018, "step": 957200 }, { "epoch": 0.009573, "grad_norm": 0.17892113327980042, "learning_rate": 1e-05, "loss": 0.0183, "step": 957300 }, { "epoch": 0.009574, "grad_norm": 0.12868262827396393, "learning_rate": 1e-05, "loss": 0.0175, "step": 957400 }, { "epoch": 0.009575, "grad_norm": 0.137395441532135, "learning_rate": 1e-05, "loss": 0.0181, "step": 957500 }, { "epoch": 0.009576, "grad_norm": 0.10323211550712585, "learning_rate": 1e-05, "loss": 0.0177, "step": 957600 }, { "epoch": 0.009577, "grad_norm": 0.15083546936511993, "learning_rate": 1e-05, "loss": 0.018, "step": 957700 }, { "epoch": 0.009578, "grad_norm": 0.12969204783439636, "learning_rate": 1e-05, "loss": 0.0177, "step": 957800 }, { "epoch": 0.009579, "grad_norm": 0.20074324309825897, "learning_rate": 1e-05, "loss": 0.0179, "step": 957900 }, { "epoch": 0.00958, "grad_norm": 0.1492719054222107, "learning_rate": 1e-05, "loss": 0.0177, "step": 958000 }, { "epoch": 0.009581, "grad_norm": 0.12614163756370544, "learning_rate": 1e-05, "loss": 0.0178, "step": 958100 }, { "epoch": 0.009582, "grad_norm": 0.2110724002122879, "learning_rate": 1e-05, "loss": 0.0179, "step": 958200 }, { "epoch": 0.009583, "grad_norm": 0.2728024423122406, "learning_rate": 1e-05, "loss": 0.018, "step": 958300 }, { "epoch": 0.009584, "grad_norm": 0.15139628946781158, "learning_rate": 1e-05, "loss": 0.0176, "step": 958400 }, { "epoch": 0.009585, "grad_norm": 0.13178637623786926, "learning_rate": 1e-05, "loss": 0.018, "step": 958500 }, { "epoch": 0.009586, "grad_norm": 0.13018086552619934, "learning_rate": 1e-05, "loss": 0.0182, "step": 958600 }, { "epoch": 0.009587, "grad_norm": 0.1414661407470703, "learning_rate": 1e-05, "loss": 0.0177, "step": 958700 }, { "epoch": 0.009588, "grad_norm": 0.11952541023492813, "learning_rate": 1e-05, "loss": 0.0175, "step": 958800 }, { "epoch": 0.009589, "grad_norm": 0.14213170111179352, "learning_rate": 1e-05, "loss": 0.0178, "step": 958900 }, { "epoch": 0.00959, "grad_norm": 0.24848879873752594, "learning_rate": 1e-05, "loss": 0.0178, "step": 959000 }, { "epoch": 0.009591, "grad_norm": 0.16298112273216248, "learning_rate": 1e-05, "loss": 0.0176, "step": 959100 }, { "epoch": 0.009592, "grad_norm": 0.13470295071601868, "learning_rate": 1e-05, "loss": 0.0176, "step": 959200 }, { "epoch": 0.009593, "grad_norm": 0.17406019568443298, "learning_rate": 1e-05, "loss": 0.0179, "step": 959300 }, { "epoch": 0.009594, "grad_norm": 0.12250766158103943, "learning_rate": 1e-05, "loss": 0.0175, "step": 959400 }, { "epoch": 0.009595, "grad_norm": 0.1506509780883789, "learning_rate": 1e-05, "loss": 0.0176, "step": 959500 }, { "epoch": 0.009596, "grad_norm": 0.10630518198013306, "learning_rate": 1e-05, "loss": 0.0177, "step": 959600 }, { "epoch": 0.009597, "grad_norm": 0.16946232318878174, "learning_rate": 1e-05, "loss": 0.0174, "step": 959700 }, { "epoch": 0.009598, "grad_norm": 0.14224779605865479, "learning_rate": 1e-05, "loss": 0.0177, "step": 959800 }, { "epoch": 0.009599, "grad_norm": 0.1665811389684677, "learning_rate": 1e-05, "loss": 0.0181, "step": 959900 }, { "epoch": 0.0096, "grad_norm": 0.12815113365650177, "learning_rate": 1e-05, "loss": 0.0184, "step": 960000 }, { "epoch": 0.0096, "eval_loss": 0.016050076112151146, "eval_runtime": 191.901, "eval_samples_per_second": 260.551, "eval_steps_per_second": 16.284, "step": 960000 }, { "epoch": 0.009601, "grad_norm": 0.1375654637813568, "learning_rate": 1e-05, "loss": 0.0178, "step": 960100 }, { "epoch": 0.009602, "grad_norm": 0.11507996916770935, "learning_rate": 1e-05, "loss": 0.0184, "step": 960200 }, { "epoch": 0.009603, "grad_norm": 0.2760280668735504, "learning_rate": 1e-05, "loss": 0.0177, "step": 960300 }, { "epoch": 0.009604, "grad_norm": 0.1614145040512085, "learning_rate": 1e-05, "loss": 0.0178, "step": 960400 }, { "epoch": 0.009605, "grad_norm": 0.1277671456336975, "learning_rate": 1e-05, "loss": 0.0175, "step": 960500 }, { "epoch": 0.009606, "grad_norm": 0.18082669377326965, "learning_rate": 1e-05, "loss": 0.018, "step": 960600 }, { "epoch": 0.009607, "grad_norm": 0.12256241589784622, "learning_rate": 1e-05, "loss": 0.0175, "step": 960700 }, { "epoch": 0.009608, "grad_norm": 0.25798675417900085, "learning_rate": 1e-05, "loss": 0.0178, "step": 960800 }, { "epoch": 0.009609, "grad_norm": 0.15810610353946686, "learning_rate": 1e-05, "loss": 0.0181, "step": 960900 }, { "epoch": 0.00961, "grad_norm": 0.1935126930475235, "learning_rate": 1e-05, "loss": 0.0182, "step": 961000 }, { "epoch": 0.009611, "grad_norm": 0.19467249512672424, "learning_rate": 1e-05, "loss": 0.0184, "step": 961100 }, { "epoch": 0.009612, "grad_norm": 0.12765026092529297, "learning_rate": 1e-05, "loss": 0.0178, "step": 961200 }, { "epoch": 0.009613, "grad_norm": 0.15376392006874084, "learning_rate": 1e-05, "loss": 0.0175, "step": 961300 }, { "epoch": 0.009614, "grad_norm": 0.14208106696605682, "learning_rate": 1e-05, "loss": 0.0179, "step": 961400 }, { "epoch": 0.009615, "grad_norm": 0.1909850537776947, "learning_rate": 1e-05, "loss": 0.0172, "step": 961500 }, { "epoch": 0.009616, "grad_norm": 0.13632066547870636, "learning_rate": 1e-05, "loss": 0.0181, "step": 961600 }, { "epoch": 0.009617, "grad_norm": 0.1252330243587494, "learning_rate": 1e-05, "loss": 0.0181, "step": 961700 }, { "epoch": 0.009618, "grad_norm": 0.12592095136642456, "learning_rate": 1e-05, "loss": 0.0178, "step": 961800 }, { "epoch": 0.009619, "grad_norm": 0.1266692727804184, "learning_rate": 1e-05, "loss": 0.0177, "step": 961900 }, { "epoch": 0.00962, "grad_norm": 0.12575441598892212, "learning_rate": 1e-05, "loss": 0.0179, "step": 962000 }, { "epoch": 0.009621, "grad_norm": 0.1342380791902542, "learning_rate": 1e-05, "loss": 0.0179, "step": 962100 }, { "epoch": 0.009622, "grad_norm": 0.1601039171218872, "learning_rate": 1e-05, "loss": 0.018, "step": 962200 }, { "epoch": 0.009623, "grad_norm": 0.1805805265903473, "learning_rate": 1e-05, "loss": 0.0179, "step": 962300 }, { "epoch": 0.009624, "grad_norm": 0.11358647793531418, "learning_rate": 1e-05, "loss": 0.0182, "step": 962400 }, { "epoch": 0.009625, "grad_norm": 0.14184656739234924, "learning_rate": 1e-05, "loss": 0.018, "step": 962500 }, { "epoch": 0.009626, "grad_norm": 0.1183871403336525, "learning_rate": 1e-05, "loss": 0.0176, "step": 962600 }, { "epoch": 0.009627, "grad_norm": 0.1487293690443039, "learning_rate": 1e-05, "loss": 0.0179, "step": 962700 }, { "epoch": 0.009628, "grad_norm": 0.20307579636573792, "learning_rate": 1e-05, "loss": 0.018, "step": 962800 }, { "epoch": 0.009629, "grad_norm": 0.12775494158267975, "learning_rate": 1e-05, "loss": 0.0178, "step": 962900 }, { "epoch": 0.00963, "grad_norm": 0.145344078540802, "learning_rate": 1e-05, "loss": 0.0179, "step": 963000 }, { "epoch": 0.009631, "grad_norm": 0.12564542889595032, "learning_rate": 1e-05, "loss": 0.0183, "step": 963100 }, { "epoch": 0.009632, "grad_norm": 0.1316869854927063, "learning_rate": 1e-05, "loss": 0.018, "step": 963200 }, { "epoch": 0.009633, "grad_norm": 0.20443272590637207, "learning_rate": 1e-05, "loss": 0.0178, "step": 963300 }, { "epoch": 0.009634, "grad_norm": 0.13617397844791412, "learning_rate": 1e-05, "loss": 0.0183, "step": 963400 }, { "epoch": 0.009635, "grad_norm": 0.19198310375213623, "learning_rate": 1e-05, "loss": 0.0182, "step": 963500 }, { "epoch": 0.009636, "grad_norm": 0.13345414400100708, "learning_rate": 1e-05, "loss": 0.0182, "step": 963600 }, { "epoch": 0.009637, "grad_norm": 0.13906845450401306, "learning_rate": 1e-05, "loss": 0.0182, "step": 963700 }, { "epoch": 0.009638, "grad_norm": 0.1402391493320465, "learning_rate": 1e-05, "loss": 0.018, "step": 963800 }, { "epoch": 0.009639, "grad_norm": 0.19737493991851807, "learning_rate": 1e-05, "loss": 0.0181, "step": 963900 }, { "epoch": 0.00964, "grad_norm": 0.1506827473640442, "learning_rate": 1e-05, "loss": 0.018, "step": 964000 }, { "epoch": 0.009641, "grad_norm": 0.13420754671096802, "learning_rate": 1e-05, "loss": 0.0175, "step": 964100 }, { "epoch": 0.009642, "grad_norm": 0.15917454659938812, "learning_rate": 1e-05, "loss": 0.0179, "step": 964200 }, { "epoch": 0.009643, "grad_norm": 0.12029775232076645, "learning_rate": 1e-05, "loss": 0.018, "step": 964300 }, { "epoch": 0.009644, "grad_norm": 0.09702417254447937, "learning_rate": 1e-05, "loss": 0.0183, "step": 964400 }, { "epoch": 0.009645, "grad_norm": 0.14913353323936462, "learning_rate": 1e-05, "loss": 0.0176, "step": 964500 }, { "epoch": 0.009646, "grad_norm": 0.14353936910629272, "learning_rate": 1e-05, "loss": 0.018, "step": 964600 }, { "epoch": 0.009647, "grad_norm": 0.15647195279598236, "learning_rate": 1e-05, "loss": 0.0185, "step": 964700 }, { "epoch": 0.009648, "grad_norm": 0.12600435316562653, "learning_rate": 1e-05, "loss": 0.0179, "step": 964800 }, { "epoch": 0.009649, "grad_norm": 0.1299232840538025, "learning_rate": 1e-05, "loss": 0.0179, "step": 964900 }, { "epoch": 0.00965, "grad_norm": 0.13706104457378387, "learning_rate": 1e-05, "loss": 0.0176, "step": 965000 }, { "epoch": 0.009651, "grad_norm": 0.18650642037391663, "learning_rate": 1e-05, "loss": 0.0182, "step": 965100 }, { "epoch": 0.009652, "grad_norm": 0.18367671966552734, "learning_rate": 1e-05, "loss": 0.0177, "step": 965200 }, { "epoch": 0.009653, "grad_norm": 0.12339954823255539, "learning_rate": 1e-05, "loss": 0.0179, "step": 965300 }, { "epoch": 0.009654, "grad_norm": 0.11875072121620178, "learning_rate": 1e-05, "loss": 0.0175, "step": 965400 }, { "epoch": 0.009655, "grad_norm": 0.14237023890018463, "learning_rate": 1e-05, "loss": 0.0179, "step": 965500 }, { "epoch": 0.009656, "grad_norm": 0.1565982550382614, "learning_rate": 1e-05, "loss": 0.0177, "step": 965600 }, { "epoch": 0.009657, "grad_norm": 0.1430491805076599, "learning_rate": 1e-05, "loss": 0.0177, "step": 965700 }, { "epoch": 0.009658, "grad_norm": 0.10180248320102692, "learning_rate": 1e-05, "loss": 0.0181, "step": 965800 }, { "epoch": 0.009659, "grad_norm": 0.15974557399749756, "learning_rate": 1e-05, "loss": 0.0178, "step": 965900 }, { "epoch": 0.00966, "grad_norm": 0.2282423973083496, "learning_rate": 1e-05, "loss": 0.0179, "step": 966000 }, { "epoch": 0.009661, "grad_norm": 0.14173637330532074, "learning_rate": 1e-05, "loss": 0.0181, "step": 966100 }, { "epoch": 0.009662, "grad_norm": 0.15193715691566467, "learning_rate": 1e-05, "loss": 0.0177, "step": 966200 }, { "epoch": 0.009663, "grad_norm": 0.12508799135684967, "learning_rate": 1e-05, "loss": 0.0183, "step": 966300 }, { "epoch": 0.009664, "grad_norm": 0.17995114624500275, "learning_rate": 1e-05, "loss": 0.0179, "step": 966400 }, { "epoch": 0.009665, "grad_norm": 0.1284942477941513, "learning_rate": 1e-05, "loss": 0.0179, "step": 966500 }, { "epoch": 0.009666, "grad_norm": 0.17536459863185883, "learning_rate": 1e-05, "loss": 0.0178, "step": 966600 }, { "epoch": 0.009667, "grad_norm": 0.1165534034371376, "learning_rate": 1e-05, "loss": 0.018, "step": 966700 }, { "epoch": 0.009668, "grad_norm": 0.12305973470211029, "learning_rate": 1e-05, "loss": 0.0177, "step": 966800 }, { "epoch": 0.009669, "grad_norm": 0.164478600025177, "learning_rate": 1e-05, "loss": 0.0175, "step": 966900 }, { "epoch": 0.00967, "grad_norm": 0.12029968947172165, "learning_rate": 1e-05, "loss": 0.0175, "step": 967000 }, { "epoch": 0.009671, "grad_norm": 0.1656678318977356, "learning_rate": 1e-05, "loss": 0.0178, "step": 967100 }, { "epoch": 0.009672, "grad_norm": 0.19378836452960968, "learning_rate": 1e-05, "loss": 0.0173, "step": 967200 }, { "epoch": 0.009673, "grad_norm": 0.12959307432174683, "learning_rate": 1e-05, "loss": 0.0178, "step": 967300 }, { "epoch": 0.009674, "grad_norm": 0.1528695523738861, "learning_rate": 1e-05, "loss": 0.0176, "step": 967400 }, { "epoch": 0.009675, "grad_norm": 0.12391874194145203, "learning_rate": 1e-05, "loss": 0.0185, "step": 967500 }, { "epoch": 0.009676, "grad_norm": 0.12753510475158691, "learning_rate": 1e-05, "loss": 0.0179, "step": 967600 }, { "epoch": 0.009677, "grad_norm": 0.11568232625722885, "learning_rate": 1e-05, "loss": 0.0181, "step": 967700 }, { "epoch": 0.009678, "grad_norm": 0.15360622107982635, "learning_rate": 1e-05, "loss": 0.0181, "step": 967800 }, { "epoch": 0.009679, "grad_norm": 0.17073126137256622, "learning_rate": 1e-05, "loss": 0.0181, "step": 967900 }, { "epoch": 0.00968, "grad_norm": 0.130234494805336, "learning_rate": 1e-05, "loss": 0.018, "step": 968000 }, { "epoch": 0.009681, "grad_norm": 0.19802166521549225, "learning_rate": 1e-05, "loss": 0.0178, "step": 968100 }, { "epoch": 0.009682, "grad_norm": 0.15447457134723663, "learning_rate": 1e-05, "loss": 0.0179, "step": 968200 }, { "epoch": 0.009683, "grad_norm": 0.12741445004940033, "learning_rate": 1e-05, "loss": 0.0179, "step": 968300 }, { "epoch": 0.009684, "grad_norm": 0.154180109500885, "learning_rate": 1e-05, "loss": 0.0179, "step": 968400 }, { "epoch": 0.009685, "grad_norm": 0.1548158973455429, "learning_rate": 1e-05, "loss": 0.0178, "step": 968500 }, { "epoch": 0.009686, "grad_norm": 0.1521531641483307, "learning_rate": 1e-05, "loss": 0.0182, "step": 968600 }, { "epoch": 0.009687, "grad_norm": 0.14152328670024872, "learning_rate": 1e-05, "loss": 0.0177, "step": 968700 }, { "epoch": 0.009688, "grad_norm": 0.1659325361251831, "learning_rate": 1e-05, "loss": 0.0178, "step": 968800 }, { "epoch": 0.009689, "grad_norm": 0.1426624208688736, "learning_rate": 1e-05, "loss": 0.0176, "step": 968900 }, { "epoch": 0.00969, "grad_norm": 0.17455023527145386, "learning_rate": 1e-05, "loss": 0.0178, "step": 969000 }, { "epoch": 0.009691, "grad_norm": 0.15004093945026398, "learning_rate": 1e-05, "loss": 0.0178, "step": 969100 }, { "epoch": 0.009692, "grad_norm": 0.10602996498346329, "learning_rate": 1e-05, "loss": 0.018, "step": 969200 }, { "epoch": 0.009693, "grad_norm": 0.12933407723903656, "learning_rate": 1e-05, "loss": 0.0178, "step": 969300 }, { "epoch": 0.009694, "grad_norm": 0.167331725358963, "learning_rate": 1e-05, "loss": 0.0182, "step": 969400 }, { "epoch": 0.009695, "grad_norm": 0.1286153346300125, "learning_rate": 1e-05, "loss": 0.018, "step": 969500 }, { "epoch": 0.009696, "grad_norm": 0.14800603687763214, "learning_rate": 1e-05, "loss": 0.0174, "step": 969600 }, { "epoch": 0.009697, "grad_norm": 0.13064932823181152, "learning_rate": 1e-05, "loss": 0.0176, "step": 969700 }, { "epoch": 0.009698, "grad_norm": 0.12100642174482346, "learning_rate": 1e-05, "loss": 0.0183, "step": 969800 }, { "epoch": 0.009699, "grad_norm": 0.15577559173107147, "learning_rate": 1e-05, "loss": 0.0171, "step": 969900 }, { "epoch": 0.0097, "grad_norm": 0.1616448611021042, "learning_rate": 1e-05, "loss": 0.0178, "step": 970000 }, { "epoch": 0.009701, "grad_norm": 0.15516090393066406, "learning_rate": 1e-05, "loss": 0.0178, "step": 970100 }, { "epoch": 0.009702, "grad_norm": 0.217764213681221, "learning_rate": 1e-05, "loss": 0.0179, "step": 970200 }, { "epoch": 0.009703, "grad_norm": 0.13523469865322113, "learning_rate": 1e-05, "loss": 0.018, "step": 970300 }, { "epoch": 0.009704, "grad_norm": 0.17062900960445404, "learning_rate": 1e-05, "loss": 0.018, "step": 970400 }, { "epoch": 0.009705, "grad_norm": 0.14987072348594666, "learning_rate": 1e-05, "loss": 0.0179, "step": 970500 }, { "epoch": 0.009706, "grad_norm": 0.18438903987407684, "learning_rate": 1e-05, "loss": 0.0181, "step": 970600 }, { "epoch": 0.009707, "grad_norm": 0.10851503163576126, "learning_rate": 1e-05, "loss": 0.0179, "step": 970700 }, { "epoch": 0.009708, "grad_norm": 0.12944602966308594, "learning_rate": 1e-05, "loss": 0.0182, "step": 970800 }, { "epoch": 0.009709, "grad_norm": 0.14124555885791779, "learning_rate": 1e-05, "loss": 0.0178, "step": 970900 }, { "epoch": 0.00971, "grad_norm": 0.13854315876960754, "learning_rate": 1e-05, "loss": 0.0179, "step": 971000 }, { "epoch": 0.009711, "grad_norm": 0.13709117472171783, "learning_rate": 1e-05, "loss": 0.018, "step": 971100 }, { "epoch": 0.009712, "grad_norm": 0.1371414214372635, "learning_rate": 1e-05, "loss": 0.018, "step": 971200 }, { "epoch": 0.009713, "grad_norm": 0.1327771544456482, "learning_rate": 1e-05, "loss": 0.0173, "step": 971300 }, { "epoch": 0.009714, "grad_norm": 0.19659963250160217, "learning_rate": 1e-05, "loss": 0.0173, "step": 971400 }, { "epoch": 0.009715, "grad_norm": 0.12944260239601135, "learning_rate": 1e-05, "loss": 0.0172, "step": 971500 }, { "epoch": 0.009716, "grad_norm": 0.15487928688526154, "learning_rate": 1e-05, "loss": 0.0183, "step": 971600 }, { "epoch": 0.009717, "grad_norm": 0.12770423293113708, "learning_rate": 1e-05, "loss": 0.0179, "step": 971700 }, { "epoch": 0.009718, "grad_norm": 0.14355182647705078, "learning_rate": 1e-05, "loss": 0.0178, "step": 971800 }, { "epoch": 0.009719, "grad_norm": 0.17476782202720642, "learning_rate": 1e-05, "loss": 0.0181, "step": 971900 }, { "epoch": 0.00972, "grad_norm": 0.13755956292152405, "learning_rate": 1e-05, "loss": 0.0181, "step": 972000 }, { "epoch": 0.009721, "grad_norm": 0.13338333368301392, "learning_rate": 1e-05, "loss": 0.0177, "step": 972100 }, { "epoch": 0.009722, "grad_norm": 0.15080995857715607, "learning_rate": 1e-05, "loss": 0.0178, "step": 972200 }, { "epoch": 0.009723, "grad_norm": 0.19369760155677795, "learning_rate": 1e-05, "loss": 0.0174, "step": 972300 }, { "epoch": 0.009724, "grad_norm": 0.16594068706035614, "learning_rate": 1e-05, "loss": 0.0179, "step": 972400 }, { "epoch": 0.009725, "grad_norm": 0.10424978286027908, "learning_rate": 1e-05, "loss": 0.018, "step": 972500 }, { "epoch": 0.009726, "grad_norm": 0.1126861646771431, "learning_rate": 1e-05, "loss": 0.0181, "step": 972600 }, { "epoch": 0.009727, "grad_norm": 0.16980551183223724, "learning_rate": 1e-05, "loss": 0.018, "step": 972700 }, { "epoch": 0.009728, "grad_norm": 0.13042405247688293, "learning_rate": 1e-05, "loss": 0.0179, "step": 972800 }, { "epoch": 0.009729, "grad_norm": 0.12790170311927795, "learning_rate": 1e-05, "loss": 0.0178, "step": 972900 }, { "epoch": 0.00973, "grad_norm": 0.1239447146654129, "learning_rate": 1e-05, "loss": 0.0178, "step": 973000 }, { "epoch": 0.009731, "grad_norm": 0.14913016557693481, "learning_rate": 1e-05, "loss": 0.0178, "step": 973100 }, { "epoch": 0.009732, "grad_norm": 0.15074753761291504, "learning_rate": 1e-05, "loss": 0.0176, "step": 973200 }, { "epoch": 0.009733, "grad_norm": 0.1595781147480011, "learning_rate": 1e-05, "loss": 0.0175, "step": 973300 }, { "epoch": 0.009734, "grad_norm": 0.12206005305051804, "learning_rate": 1e-05, "loss": 0.0178, "step": 973400 }, { "epoch": 0.009735, "grad_norm": 0.16680170595645905, "learning_rate": 1e-05, "loss": 0.0176, "step": 973500 }, { "epoch": 0.009736, "grad_norm": 0.10693177580833435, "learning_rate": 1e-05, "loss": 0.0176, "step": 973600 }, { "epoch": 0.009737, "grad_norm": 0.16245052218437195, "learning_rate": 1e-05, "loss": 0.0175, "step": 973700 }, { "epoch": 0.009738, "grad_norm": 0.14071683585643768, "learning_rate": 1e-05, "loss": 0.0183, "step": 973800 }, { "epoch": 0.009739, "grad_norm": 0.15149462223052979, "learning_rate": 1e-05, "loss": 0.0181, "step": 973900 }, { "epoch": 0.00974, "grad_norm": 0.12518811225891113, "learning_rate": 1e-05, "loss": 0.0178, "step": 974000 }, { "epoch": 0.009741, "grad_norm": 0.15807048976421356, "learning_rate": 1e-05, "loss": 0.0176, "step": 974100 }, { "epoch": 0.009742, "grad_norm": 0.17603187263011932, "learning_rate": 1e-05, "loss": 0.0179, "step": 974200 }, { "epoch": 0.009743, "grad_norm": 0.11634425818920135, "learning_rate": 1e-05, "loss": 0.0178, "step": 974300 }, { "epoch": 0.009744, "grad_norm": 0.13175879418849945, "learning_rate": 1e-05, "loss": 0.0179, "step": 974400 }, { "epoch": 0.009745, "grad_norm": 0.1499554067850113, "learning_rate": 1e-05, "loss": 0.0178, "step": 974500 }, { "epoch": 0.009746, "grad_norm": 0.14173611998558044, "learning_rate": 1e-05, "loss": 0.0178, "step": 974600 }, { "epoch": 0.009747, "grad_norm": 0.13592113554477692, "learning_rate": 1e-05, "loss": 0.0179, "step": 974700 }, { "epoch": 0.009748, "grad_norm": 0.11752822995185852, "learning_rate": 1e-05, "loss": 0.0175, "step": 974800 }, { "epoch": 0.009749, "grad_norm": 0.12610507011413574, "learning_rate": 1e-05, "loss": 0.018, "step": 974900 }, { "epoch": 0.00975, "grad_norm": 0.11269000917673111, "learning_rate": 1e-05, "loss": 0.0181, "step": 975000 }, { "epoch": 0.009751, "grad_norm": 0.1319676637649536, "learning_rate": 1e-05, "loss": 0.0177, "step": 975100 }, { "epoch": 0.009752, "grad_norm": 0.14027026295661926, "learning_rate": 1e-05, "loss": 0.0178, "step": 975200 }, { "epoch": 0.009753, "grad_norm": 0.17258070409297943, "learning_rate": 1e-05, "loss": 0.0183, "step": 975300 }, { "epoch": 0.009754, "grad_norm": 0.1471002846956253, "learning_rate": 1e-05, "loss": 0.0179, "step": 975400 }, { "epoch": 0.009755, "grad_norm": 0.11939527094364166, "learning_rate": 1e-05, "loss": 0.018, "step": 975500 }, { "epoch": 0.009756, "grad_norm": 0.12720711529254913, "learning_rate": 1e-05, "loss": 0.0177, "step": 975600 }, { "epoch": 0.009757, "grad_norm": 0.1265941858291626, "learning_rate": 1e-05, "loss": 0.018, "step": 975700 }, { "epoch": 0.009758, "grad_norm": 0.13533280789852142, "learning_rate": 1e-05, "loss": 0.0175, "step": 975800 }, { "epoch": 0.009759, "grad_norm": 0.1543136090040207, "learning_rate": 1e-05, "loss": 0.0176, "step": 975900 }, { "epoch": 0.00976, "grad_norm": 0.3870061933994293, "learning_rate": 1e-05, "loss": 0.0182, "step": 976000 }, { "epoch": 0.009761, "grad_norm": 0.11066586524248123, "learning_rate": 1e-05, "loss": 0.0179, "step": 976100 }, { "epoch": 0.009762, "grad_norm": 0.13933567702770233, "learning_rate": 1e-05, "loss": 0.0177, "step": 976200 }, { "epoch": 0.009763, "grad_norm": 0.13119535148143768, "learning_rate": 1e-05, "loss": 0.0174, "step": 976300 }, { "epoch": 0.009764, "grad_norm": 0.13569818437099457, "learning_rate": 1e-05, "loss": 0.0176, "step": 976400 }, { "epoch": 0.009765, "grad_norm": 0.13802199065685272, "learning_rate": 1e-05, "loss": 0.0182, "step": 976500 }, { "epoch": 0.009766, "grad_norm": 0.14216215908527374, "learning_rate": 1e-05, "loss": 0.0176, "step": 976600 }, { "epoch": 0.009767, "grad_norm": 0.15681639313697815, "learning_rate": 1e-05, "loss": 0.0179, "step": 976700 }, { "epoch": 0.009768, "grad_norm": 0.13549788296222687, "learning_rate": 1e-05, "loss": 0.0177, "step": 976800 }, { "epoch": 0.009769, "grad_norm": 0.1683572679758072, "learning_rate": 1e-05, "loss": 0.0183, "step": 976900 }, { "epoch": 0.00977, "grad_norm": 0.15745477378368378, "learning_rate": 1e-05, "loss": 0.018, "step": 977000 }, { "epoch": 0.009771, "grad_norm": 0.13545888662338257, "learning_rate": 1e-05, "loss": 0.018, "step": 977100 }, { "epoch": 0.009772, "grad_norm": 0.13827897608280182, "learning_rate": 1e-05, "loss": 0.0174, "step": 977200 }, { "epoch": 0.009773, "grad_norm": 0.1463233381509781, "learning_rate": 1e-05, "loss": 0.0178, "step": 977300 }, { "epoch": 0.009774, "grad_norm": 0.12432350963354111, "learning_rate": 1e-05, "loss": 0.018, "step": 977400 }, { "epoch": 0.009775, "grad_norm": 0.14162762463092804, "learning_rate": 1e-05, "loss": 0.0179, "step": 977500 }, { "epoch": 0.009776, "grad_norm": 0.13470017910003662, "learning_rate": 1e-05, "loss": 0.018, "step": 977600 }, { "epoch": 0.009777, "grad_norm": 0.11150386929512024, "learning_rate": 1e-05, "loss": 0.0179, "step": 977700 }, { "epoch": 0.009778, "grad_norm": 0.16437433660030365, "learning_rate": 1e-05, "loss": 0.0176, "step": 977800 }, { "epoch": 0.009779, "grad_norm": 0.12057038396596909, "learning_rate": 1e-05, "loss": 0.0176, "step": 977900 }, { "epoch": 0.00978, "grad_norm": 0.21697735786437988, "learning_rate": 1e-05, "loss": 0.0175, "step": 978000 }, { "epoch": 0.009781, "grad_norm": 0.1316281259059906, "learning_rate": 1e-05, "loss": 0.0178, "step": 978100 }, { "epoch": 0.009782, "grad_norm": 0.14583849906921387, "learning_rate": 1e-05, "loss": 0.0176, "step": 978200 }, { "epoch": 0.009783, "grad_norm": 0.19838818907737732, "learning_rate": 1e-05, "loss": 0.0177, "step": 978300 }, { "epoch": 0.009784, "grad_norm": 0.13587327301502228, "learning_rate": 1e-05, "loss": 0.0178, "step": 978400 }, { "epoch": 0.009785, "grad_norm": 0.14259693026542664, "learning_rate": 1e-05, "loss": 0.0174, "step": 978500 }, { "epoch": 0.009786, "grad_norm": 0.18168246746063232, "learning_rate": 1e-05, "loss": 0.0173, "step": 978600 }, { "epoch": 0.009787, "grad_norm": 0.1310284286737442, "learning_rate": 1e-05, "loss": 0.0177, "step": 978700 }, { "epoch": 0.009788, "grad_norm": 0.160218745470047, "learning_rate": 1e-05, "loss": 0.0179, "step": 978800 }, { "epoch": 0.009789, "grad_norm": 0.141554057598114, "learning_rate": 1e-05, "loss": 0.0182, "step": 978900 }, { "epoch": 0.00979, "grad_norm": 0.12590841948986053, "learning_rate": 1e-05, "loss": 0.0181, "step": 979000 }, { "epoch": 0.009791, "grad_norm": 0.13056248426437378, "learning_rate": 1e-05, "loss": 0.0173, "step": 979100 }, { "epoch": 0.009792, "grad_norm": 0.146662175655365, "learning_rate": 1e-05, "loss": 0.0176, "step": 979200 }, { "epoch": 0.009793, "grad_norm": 0.1644611656665802, "learning_rate": 1e-05, "loss": 0.0181, "step": 979300 }, { "epoch": 0.009794, "grad_norm": 0.142594113945961, "learning_rate": 1e-05, "loss": 0.0179, "step": 979400 }, { "epoch": 0.009795, "grad_norm": 0.10752857476472855, "learning_rate": 1e-05, "loss": 0.0176, "step": 979500 }, { "epoch": 0.009796, "grad_norm": 0.2066269963979721, "learning_rate": 1e-05, "loss": 0.0177, "step": 979600 }, { "epoch": 0.009797, "grad_norm": 0.11140887439250946, "learning_rate": 1e-05, "loss": 0.0178, "step": 979700 }, { "epoch": 0.009798, "grad_norm": 0.1718418002128601, "learning_rate": 1e-05, "loss": 0.0176, "step": 979800 }, { "epoch": 0.009799, "grad_norm": 0.14062893390655518, "learning_rate": 1e-05, "loss": 0.0183, "step": 979900 }, { "epoch": 0.0098, "grad_norm": 0.18130186200141907, "learning_rate": 1e-05, "loss": 0.0178, "step": 980000 }, { "epoch": 0.0098, "eval_loss": 0.016230540350079536, "eval_runtime": 193.2806, "eval_samples_per_second": 258.691, "eval_steps_per_second": 16.168, "step": 980000 }, { "epoch": 0.009801, "grad_norm": 0.17742988467216492, "learning_rate": 1e-05, "loss": 0.0176, "step": 980100 }, { "epoch": 0.009802, "grad_norm": 0.20130252838134766, "learning_rate": 1e-05, "loss": 0.0175, "step": 980200 }, { "epoch": 0.009803, "grad_norm": 0.18234063684940338, "learning_rate": 1e-05, "loss": 0.018, "step": 980300 }, { "epoch": 0.009804, "grad_norm": 0.12446579337120056, "learning_rate": 1e-05, "loss": 0.0177, "step": 980400 }, { "epoch": 0.009805, "grad_norm": 0.20879025757312775, "learning_rate": 1e-05, "loss": 0.0178, "step": 980500 }, { "epoch": 0.009806, "grad_norm": 0.1530422568321228, "learning_rate": 1e-05, "loss": 0.0182, "step": 980600 }, { "epoch": 0.009807, "grad_norm": 0.15959975123405457, "learning_rate": 1e-05, "loss": 0.0177, "step": 980700 }, { "epoch": 0.009808, "grad_norm": 0.15434546768665314, "learning_rate": 1e-05, "loss": 0.0181, "step": 980800 }, { "epoch": 0.009809, "grad_norm": 0.13207174837589264, "learning_rate": 1e-05, "loss": 0.0178, "step": 980900 }, { "epoch": 0.00981, "grad_norm": 0.13642621040344238, "learning_rate": 1e-05, "loss": 0.0176, "step": 981000 }, { "epoch": 0.009811, "grad_norm": 0.15605708956718445, "learning_rate": 1e-05, "loss": 0.0177, "step": 981100 }, { "epoch": 0.009812, "grad_norm": 0.15820112824440002, "learning_rate": 1e-05, "loss": 0.0178, "step": 981200 }, { "epoch": 0.009813, "grad_norm": 0.1315092295408249, "learning_rate": 1e-05, "loss": 0.0178, "step": 981300 }, { "epoch": 0.009814, "grad_norm": 0.16758953034877777, "learning_rate": 1e-05, "loss": 0.0171, "step": 981400 }, { "epoch": 0.009815, "grad_norm": 0.14983533322811127, "learning_rate": 1e-05, "loss": 0.0178, "step": 981500 }, { "epoch": 0.009816, "grad_norm": 0.1471928209066391, "learning_rate": 1e-05, "loss": 0.0176, "step": 981600 }, { "epoch": 0.009817, "grad_norm": 0.1485586166381836, "learning_rate": 1e-05, "loss": 0.0181, "step": 981700 }, { "epoch": 0.009818, "grad_norm": 0.12604248523712158, "learning_rate": 1e-05, "loss": 0.0182, "step": 981800 }, { "epoch": 0.009819, "grad_norm": 0.14156559109687805, "learning_rate": 1e-05, "loss": 0.0177, "step": 981900 }, { "epoch": 0.00982, "grad_norm": 0.14054641127586365, "learning_rate": 1e-05, "loss": 0.018, "step": 982000 }, { "epoch": 0.009821, "grad_norm": 0.14146648347377777, "learning_rate": 1e-05, "loss": 0.0179, "step": 982100 }, { "epoch": 0.009822, "grad_norm": 0.11647078394889832, "learning_rate": 1e-05, "loss": 0.0181, "step": 982200 }, { "epoch": 0.009823, "grad_norm": 0.11631416529417038, "learning_rate": 1e-05, "loss": 0.018, "step": 982300 }, { "epoch": 0.009824, "grad_norm": 0.14718590676784515, "learning_rate": 1e-05, "loss": 0.0183, "step": 982400 }, { "epoch": 0.009825, "grad_norm": 0.12211163341999054, "learning_rate": 1e-05, "loss": 0.0175, "step": 982500 }, { "epoch": 0.009826, "grad_norm": 0.17270207405090332, "learning_rate": 1e-05, "loss": 0.0175, "step": 982600 }, { "epoch": 0.009827, "grad_norm": 0.12691718339920044, "learning_rate": 1e-05, "loss": 0.0181, "step": 982700 }, { "epoch": 0.009828, "grad_norm": 0.1317170113325119, "learning_rate": 1e-05, "loss": 0.0174, "step": 982800 }, { "epoch": 0.009829, "grad_norm": 0.1490432322025299, "learning_rate": 1e-05, "loss": 0.0179, "step": 982900 }, { "epoch": 0.00983, "grad_norm": 0.17469888925552368, "learning_rate": 1e-05, "loss": 0.0181, "step": 983000 }, { "epoch": 0.009831, "grad_norm": 0.10010451823472977, "learning_rate": 1e-05, "loss": 0.0179, "step": 983100 }, { "epoch": 0.009832, "grad_norm": 0.10846380144357681, "learning_rate": 1e-05, "loss": 0.0177, "step": 983200 }, { "epoch": 0.009833, "grad_norm": 0.1378488838672638, "learning_rate": 1e-05, "loss": 0.018, "step": 983300 }, { "epoch": 0.009834, "grad_norm": 0.19960790872573853, "learning_rate": 1e-05, "loss": 0.0181, "step": 983400 }, { "epoch": 0.009835, "grad_norm": 0.20558622479438782, "learning_rate": 1e-05, "loss": 0.0177, "step": 983500 }, { "epoch": 0.009836, "grad_norm": 0.1457175463438034, "learning_rate": 1e-05, "loss": 0.0177, "step": 983600 }, { "epoch": 0.009837, "grad_norm": 0.157062366604805, "learning_rate": 1e-05, "loss": 0.0177, "step": 983700 }, { "epoch": 0.009838, "grad_norm": 0.12620607018470764, "learning_rate": 1e-05, "loss": 0.0174, "step": 983800 }, { "epoch": 0.009839, "grad_norm": 0.1252482682466507, "learning_rate": 1e-05, "loss": 0.0178, "step": 983900 }, { "epoch": 0.00984, "grad_norm": 0.13269765675067902, "learning_rate": 1e-05, "loss": 0.0178, "step": 984000 }, { "epoch": 0.009841, "grad_norm": 0.19861248135566711, "learning_rate": 1e-05, "loss": 0.0181, "step": 984100 }, { "epoch": 0.009842, "grad_norm": 0.15848408639431, "learning_rate": 1e-05, "loss": 0.0176, "step": 984200 }, { "epoch": 0.009843, "grad_norm": 0.14720210433006287, "learning_rate": 1e-05, "loss": 0.0179, "step": 984300 }, { "epoch": 0.009844, "grad_norm": 0.17213715612888336, "learning_rate": 1e-05, "loss": 0.0175, "step": 984400 }, { "epoch": 0.009845, "grad_norm": 0.14525888860225677, "learning_rate": 1e-05, "loss": 0.018, "step": 984500 }, { "epoch": 0.009846, "grad_norm": 0.12029461562633514, "learning_rate": 1e-05, "loss": 0.0178, "step": 984600 }, { "epoch": 0.009847, "grad_norm": 0.10642600059509277, "learning_rate": 1e-05, "loss": 0.0176, "step": 984700 }, { "epoch": 0.009848, "grad_norm": 0.11030121147632599, "learning_rate": 1e-05, "loss": 0.0178, "step": 984800 }, { "epoch": 0.009849, "grad_norm": 0.12030531466007233, "learning_rate": 1e-05, "loss": 0.0175, "step": 984900 }, { "epoch": 0.00985, "grad_norm": 0.11663129925727844, "learning_rate": 1e-05, "loss": 0.0178, "step": 985000 }, { "epoch": 0.009851, "grad_norm": 0.12932899594306946, "learning_rate": 1e-05, "loss": 0.0179, "step": 985100 }, { "epoch": 0.009852, "grad_norm": 0.11066880077123642, "learning_rate": 1e-05, "loss": 0.0172, "step": 985200 }, { "epoch": 0.009853, "grad_norm": 0.2906988561153412, "learning_rate": 1e-05, "loss": 0.0176, "step": 985300 }, { "epoch": 0.009854, "grad_norm": 0.1461118906736374, "learning_rate": 1e-05, "loss": 0.0176, "step": 985400 }, { "epoch": 0.009855, "grad_norm": 0.14277899265289307, "learning_rate": 1e-05, "loss": 0.018, "step": 985500 }, { "epoch": 0.009856, "grad_norm": 0.176434725522995, "learning_rate": 1e-05, "loss": 0.0179, "step": 985600 }, { "epoch": 0.009857, "grad_norm": 0.14794251322746277, "learning_rate": 1e-05, "loss": 0.0179, "step": 985700 }, { "epoch": 0.009858, "grad_norm": 0.20957228541374207, "learning_rate": 1e-05, "loss": 0.018, "step": 985800 }, { "epoch": 0.009859, "grad_norm": 0.11549887806177139, "learning_rate": 1e-05, "loss": 0.0177, "step": 985900 }, { "epoch": 0.00986, "grad_norm": 0.1363794356584549, "learning_rate": 1e-05, "loss": 0.0177, "step": 986000 }, { "epoch": 0.009861, "grad_norm": 0.1532014161348343, "learning_rate": 1e-05, "loss": 0.0174, "step": 986100 }, { "epoch": 0.009862, "grad_norm": 0.15005096793174744, "learning_rate": 1e-05, "loss": 0.018, "step": 986200 }, { "epoch": 0.009863, "grad_norm": 0.20423533022403717, "learning_rate": 1e-05, "loss": 0.0176, "step": 986300 }, { "epoch": 0.009864, "grad_norm": 0.13826914131641388, "learning_rate": 1e-05, "loss": 0.0175, "step": 986400 }, { "epoch": 0.009865, "grad_norm": 0.1425575464963913, "learning_rate": 1e-05, "loss": 0.0178, "step": 986500 }, { "epoch": 0.009866, "grad_norm": 0.1683439314365387, "learning_rate": 1e-05, "loss": 0.0176, "step": 986600 }, { "epoch": 0.009867, "grad_norm": 0.1409069448709488, "learning_rate": 1e-05, "loss": 0.0173, "step": 986700 }, { "epoch": 0.009868, "grad_norm": 0.11842384934425354, "learning_rate": 1e-05, "loss": 0.018, "step": 986800 }, { "epoch": 0.009869, "grad_norm": 0.14205116033554077, "learning_rate": 1e-05, "loss": 0.0174, "step": 986900 }, { "epoch": 0.00987, "grad_norm": 0.13570451736450195, "learning_rate": 1e-05, "loss": 0.0177, "step": 987000 }, { "epoch": 0.009871, "grad_norm": 0.12332887202501297, "learning_rate": 1e-05, "loss": 0.0172, "step": 987100 }, { "epoch": 0.009872, "grad_norm": 0.1769087016582489, "learning_rate": 1e-05, "loss": 0.018, "step": 987200 }, { "epoch": 0.009873, "grad_norm": 0.1443926841020584, "learning_rate": 1e-05, "loss": 0.0179, "step": 987300 }, { "epoch": 0.009874, "grad_norm": 0.15139319002628326, "learning_rate": 1e-05, "loss": 0.0179, "step": 987400 }, { "epoch": 0.009875, "grad_norm": 0.1496719866991043, "learning_rate": 1e-05, "loss": 0.0176, "step": 987500 }, { "epoch": 0.009876, "grad_norm": 0.15274734795093536, "learning_rate": 1e-05, "loss": 0.0178, "step": 987600 }, { "epoch": 0.009877, "grad_norm": 0.15102897584438324, "learning_rate": 1e-05, "loss": 0.0175, "step": 987700 }, { "epoch": 0.009878, "grad_norm": 0.12215752899646759, "learning_rate": 1e-05, "loss": 0.0176, "step": 987800 }, { "epoch": 0.009879, "grad_norm": 0.13123396039009094, "learning_rate": 1e-05, "loss": 0.0178, "step": 987900 }, { "epoch": 0.00988, "grad_norm": 0.1514354795217514, "learning_rate": 1e-05, "loss": 0.0181, "step": 988000 }, { "epoch": 0.009881, "grad_norm": 0.11627977341413498, "learning_rate": 1e-05, "loss": 0.0175, "step": 988100 }, { "epoch": 0.009882, "grad_norm": 0.13650557398796082, "learning_rate": 1e-05, "loss": 0.0174, "step": 988200 }, { "epoch": 0.009883, "grad_norm": 0.12001965194940567, "learning_rate": 1e-05, "loss": 0.0174, "step": 988300 }, { "epoch": 0.009884, "grad_norm": 0.15418042242527008, "learning_rate": 1e-05, "loss": 0.018, "step": 988400 }, { "epoch": 0.009885, "grad_norm": 0.2064676731824875, "learning_rate": 1e-05, "loss": 0.0178, "step": 988500 }, { "epoch": 0.009886, "grad_norm": 0.13495232164859772, "learning_rate": 1e-05, "loss": 0.0175, "step": 988600 }, { "epoch": 0.009887, "grad_norm": 0.12551827728748322, "learning_rate": 1e-05, "loss": 0.0178, "step": 988700 }, { "epoch": 0.009888, "grad_norm": 0.17036981880664825, "learning_rate": 1e-05, "loss": 0.0178, "step": 988800 }, { "epoch": 0.009889, "grad_norm": 0.12408590316772461, "learning_rate": 1e-05, "loss": 0.0183, "step": 988900 }, { "epoch": 0.00989, "grad_norm": 0.19467498362064362, "learning_rate": 1e-05, "loss": 0.0176, "step": 989000 }, { "epoch": 0.009891, "grad_norm": 0.13141728937625885, "learning_rate": 1e-05, "loss": 0.0174, "step": 989100 }, { "epoch": 0.009892, "grad_norm": 0.1516573280096054, "learning_rate": 1e-05, "loss": 0.0179, "step": 989200 }, { "epoch": 0.009893, "grad_norm": 0.12567465007305145, "learning_rate": 1e-05, "loss": 0.0177, "step": 989300 }, { "epoch": 0.009894, "grad_norm": 0.1623883992433548, "learning_rate": 1e-05, "loss": 0.0175, "step": 989400 }, { "epoch": 0.009895, "grad_norm": 0.14890146255493164, "learning_rate": 1e-05, "loss": 0.018, "step": 989500 }, { "epoch": 0.009896, "grad_norm": 0.14091844856739044, "learning_rate": 1e-05, "loss": 0.0179, "step": 989600 }, { "epoch": 0.009897, "grad_norm": 0.10508344322443008, "learning_rate": 1e-05, "loss": 0.0178, "step": 989700 }, { "epoch": 0.009898, "grad_norm": 0.14180517196655273, "learning_rate": 1e-05, "loss": 0.0179, "step": 989800 }, { "epoch": 0.009899, "grad_norm": 0.1301991045475006, "learning_rate": 1e-05, "loss": 0.0177, "step": 989900 }, { "epoch": 0.0099, "grad_norm": 0.14139620959758759, "learning_rate": 1e-05, "loss": 0.0174, "step": 990000 }, { "epoch": 0.009901, "grad_norm": 0.14801116287708282, "learning_rate": 1e-05, "loss": 0.0175, "step": 990100 }, { "epoch": 0.009902, "grad_norm": 0.13594812154769897, "learning_rate": 1e-05, "loss": 0.0178, "step": 990200 }, { "epoch": 0.009903, "grad_norm": 0.13415086269378662, "learning_rate": 1e-05, "loss": 0.018, "step": 990300 }, { "epoch": 0.009904, "grad_norm": 0.11056048423051834, "learning_rate": 1e-05, "loss": 0.0181, "step": 990400 }, { "epoch": 0.009905, "grad_norm": 0.2508842647075653, "learning_rate": 1e-05, "loss": 0.0179, "step": 990500 }, { "epoch": 0.009906, "grad_norm": 0.13144679367542267, "learning_rate": 1e-05, "loss": 0.0178, "step": 990600 }, { "epoch": 0.009907, "grad_norm": 0.16997791826725006, "learning_rate": 1e-05, "loss": 0.018, "step": 990700 }, { "epoch": 0.009908, "grad_norm": 0.14725832641124725, "learning_rate": 1e-05, "loss": 0.0172, "step": 990800 }, { "epoch": 0.009909, "grad_norm": 0.19668659567832947, "learning_rate": 1e-05, "loss": 0.0174, "step": 990900 }, { "epoch": 0.00991, "grad_norm": 0.15474288165569305, "learning_rate": 1e-05, "loss": 0.0177, "step": 991000 }, { "epoch": 0.009911, "grad_norm": 0.13303501904010773, "learning_rate": 1e-05, "loss": 0.0176, "step": 991100 }, { "epoch": 0.009912, "grad_norm": 0.10955076664686203, "learning_rate": 1e-05, "loss": 0.0177, "step": 991200 }, { "epoch": 0.009913, "grad_norm": 0.14957353472709656, "learning_rate": 1e-05, "loss": 0.0176, "step": 991300 }, { "epoch": 0.009914, "grad_norm": 0.16088584065437317, "learning_rate": 1e-05, "loss": 0.0182, "step": 991400 }, { "epoch": 0.009915, "grad_norm": 0.14280326664447784, "learning_rate": 1e-05, "loss": 0.018, "step": 991500 }, { "epoch": 0.009916, "grad_norm": 0.15463459491729736, "learning_rate": 1e-05, "loss": 0.0183, "step": 991600 }, { "epoch": 0.009917, "grad_norm": 0.14990299940109253, "learning_rate": 1e-05, "loss": 0.0174, "step": 991700 }, { "epoch": 0.009918, "grad_norm": 0.1421930193901062, "learning_rate": 1e-05, "loss": 0.0176, "step": 991800 }, { "epoch": 0.009919, "grad_norm": 0.12555918097496033, "learning_rate": 1e-05, "loss": 0.018, "step": 991900 }, { "epoch": 0.00992, "grad_norm": 0.14472267031669617, "learning_rate": 1e-05, "loss": 0.0175, "step": 992000 }, { "epoch": 0.009921, "grad_norm": 0.1274133324623108, "learning_rate": 1e-05, "loss": 0.0183, "step": 992100 }, { "epoch": 0.009922, "grad_norm": 0.13342596590518951, "learning_rate": 1e-05, "loss": 0.018, "step": 992200 }, { "epoch": 0.009923, "grad_norm": 0.11091531068086624, "learning_rate": 1e-05, "loss": 0.0182, "step": 992300 }, { "epoch": 0.009924, "grad_norm": 0.14202776551246643, "learning_rate": 1e-05, "loss": 0.0176, "step": 992400 }, { "epoch": 0.009925, "grad_norm": 0.12998352944850922, "learning_rate": 1e-05, "loss": 0.0174, "step": 992500 }, { "epoch": 0.009926, "grad_norm": 0.1272992640733719, "learning_rate": 1e-05, "loss": 0.0179, "step": 992600 }, { "epoch": 0.009927, "grad_norm": 0.12614740431308746, "learning_rate": 1e-05, "loss": 0.0178, "step": 992700 }, { "epoch": 0.009928, "grad_norm": 0.15571163594722748, "learning_rate": 1e-05, "loss": 0.0179, "step": 992800 }, { "epoch": 0.009929, "grad_norm": 0.16228051483631134, "learning_rate": 1e-05, "loss": 0.0178, "step": 992900 }, { "epoch": 0.00993, "grad_norm": 0.15377365052700043, "learning_rate": 1e-05, "loss": 0.0176, "step": 993000 }, { "epoch": 0.009931, "grad_norm": 0.13828237354755402, "learning_rate": 1e-05, "loss": 0.0175, "step": 993100 }, { "epoch": 0.009932, "grad_norm": 0.13194149732589722, "learning_rate": 1e-05, "loss": 0.0177, "step": 993200 }, { "epoch": 0.009933, "grad_norm": 0.13856510818004608, "learning_rate": 1e-05, "loss": 0.0174, "step": 993300 }, { "epoch": 0.009934, "grad_norm": 0.10831373929977417, "learning_rate": 1e-05, "loss": 0.0175, "step": 993400 }, { "epoch": 0.009935, "grad_norm": 0.16341303288936615, "learning_rate": 1e-05, "loss": 0.0178, "step": 993500 }, { "epoch": 0.009936, "grad_norm": 0.11407075822353363, "learning_rate": 1e-05, "loss": 0.0178, "step": 993600 }, { "epoch": 0.009937, "grad_norm": 0.14793018996715546, "learning_rate": 1e-05, "loss": 0.0176, "step": 993700 }, { "epoch": 0.009938, "grad_norm": 0.16698937118053436, "learning_rate": 1e-05, "loss": 0.0177, "step": 993800 }, { "epoch": 0.009939, "grad_norm": 0.16812531650066376, "learning_rate": 1e-05, "loss": 0.018, "step": 993900 }, { "epoch": 0.00994, "grad_norm": 0.12976470589637756, "learning_rate": 1e-05, "loss": 0.0177, "step": 994000 }, { "epoch": 0.009941, "grad_norm": 0.1497420370578766, "learning_rate": 1e-05, "loss": 0.0177, "step": 994100 }, { "epoch": 0.009942, "grad_norm": 0.1458023637533188, "learning_rate": 1e-05, "loss": 0.0178, "step": 994200 }, { "epoch": 0.009943, "grad_norm": 0.11970306187868118, "learning_rate": 1e-05, "loss": 0.0181, "step": 994300 }, { "epoch": 0.009944, "grad_norm": 0.16291090846061707, "learning_rate": 1e-05, "loss": 0.0179, "step": 994400 }, { "epoch": 0.009945, "grad_norm": 0.16305851936340332, "learning_rate": 1e-05, "loss": 0.0172, "step": 994500 }, { "epoch": 0.009946, "grad_norm": 0.15789452195167542, "learning_rate": 1e-05, "loss": 0.018, "step": 994600 }, { "epoch": 0.009947, "grad_norm": 0.1554637998342514, "learning_rate": 1e-05, "loss": 0.0176, "step": 994700 }, { "epoch": 0.009948, "grad_norm": 0.17559874057769775, "learning_rate": 1e-05, "loss": 0.0178, "step": 994800 }, { "epoch": 0.009949, "grad_norm": 0.16068091988563538, "learning_rate": 1e-05, "loss": 0.0177, "step": 994900 }, { "epoch": 0.00995, "grad_norm": 0.18692880868911743, "learning_rate": 1e-05, "loss": 0.0176, "step": 995000 }, { "epoch": 0.009951, "grad_norm": 0.14853154122829437, "learning_rate": 1e-05, "loss": 0.018, "step": 995100 }, { "epoch": 0.009952, "grad_norm": 0.13028135895729065, "learning_rate": 1e-05, "loss": 0.0177, "step": 995200 }, { "epoch": 0.009953, "grad_norm": 0.1370738446712494, "learning_rate": 1e-05, "loss": 0.0175, "step": 995300 }, { "epoch": 0.009954, "grad_norm": 0.1431790143251419, "learning_rate": 1e-05, "loss": 0.0184, "step": 995400 }, { "epoch": 0.009955, "grad_norm": 0.12926821410655975, "learning_rate": 1e-05, "loss": 0.0178, "step": 995500 }, { "epoch": 0.009956, "grad_norm": 0.2121957391500473, "learning_rate": 1e-05, "loss": 0.0177, "step": 995600 }, { "epoch": 0.009957, "grad_norm": 0.11526373773813248, "learning_rate": 1e-05, "loss": 0.0176, "step": 995700 }, { "epoch": 0.009958, "grad_norm": 0.11836830526590347, "learning_rate": 1e-05, "loss": 0.0178, "step": 995800 }, { "epoch": 0.009959, "grad_norm": 0.12821626663208008, "learning_rate": 1e-05, "loss": 0.0176, "step": 995900 }, { "epoch": 0.00996, "grad_norm": 0.14616426825523376, "learning_rate": 1e-05, "loss": 0.0174, "step": 996000 }, { "epoch": 0.009961, "grad_norm": 0.1805770993232727, "learning_rate": 1e-05, "loss": 0.018, "step": 996100 }, { "epoch": 0.009962, "grad_norm": 0.16658417880535126, "learning_rate": 1e-05, "loss": 0.0176, "step": 996200 }, { "epoch": 0.009963, "grad_norm": 0.14888067543506622, "learning_rate": 1e-05, "loss": 0.0174, "step": 996300 }, { "epoch": 0.009964, "grad_norm": 0.136169895529747, "learning_rate": 1e-05, "loss": 0.0176, "step": 996400 }, { "epoch": 0.009965, "grad_norm": 0.11328653991222382, "learning_rate": 1e-05, "loss": 0.0176, "step": 996500 }, { "epoch": 0.009966, "grad_norm": 0.1495729386806488, "learning_rate": 1e-05, "loss": 0.0177, "step": 996600 }, { "epoch": 0.009967, "grad_norm": 0.16054333746433258, "learning_rate": 1e-05, "loss": 0.0174, "step": 996700 }, { "epoch": 0.009968, "grad_norm": 0.15570423007011414, "learning_rate": 1e-05, "loss": 0.0182, "step": 996800 }, { "epoch": 0.009969, "grad_norm": 0.1712031215429306, "learning_rate": 1e-05, "loss": 0.0173, "step": 996900 }, { "epoch": 0.00997, "grad_norm": 0.1508294641971588, "learning_rate": 1e-05, "loss": 0.0175, "step": 997000 }, { "epoch": 0.009971, "grad_norm": 0.13266651332378387, "learning_rate": 1e-05, "loss": 0.0174, "step": 997100 }, { "epoch": 0.009972, "grad_norm": 0.12819764018058777, "learning_rate": 1e-05, "loss": 0.018, "step": 997200 }, { "epoch": 0.009973, "grad_norm": 0.14046382904052734, "learning_rate": 1e-05, "loss": 0.0179, "step": 997300 }, { "epoch": 0.009974, "grad_norm": 0.1358417421579361, "learning_rate": 1e-05, "loss": 0.018, "step": 997400 }, { "epoch": 0.009975, "grad_norm": 0.15806944668293, "learning_rate": 1e-05, "loss": 0.0175, "step": 997500 }, { "epoch": 0.009976, "grad_norm": 0.1443408727645874, "learning_rate": 1e-05, "loss": 0.018, "step": 997600 }, { "epoch": 0.009977, "grad_norm": 0.129330575466156, "learning_rate": 1e-05, "loss": 0.0182, "step": 997700 }, { "epoch": 0.009978, "grad_norm": 0.14039374887943268, "learning_rate": 1e-05, "loss": 0.0176, "step": 997800 }, { "epoch": 0.009979, "grad_norm": 0.15116272866725922, "learning_rate": 1e-05, "loss": 0.0176, "step": 997900 }, { "epoch": 0.00998, "grad_norm": 0.21422210335731506, "learning_rate": 1e-05, "loss": 0.0178, "step": 998000 }, { "epoch": 0.009981, "grad_norm": 0.14855067431926727, "learning_rate": 1e-05, "loss": 0.0174, "step": 998100 }, { "epoch": 0.009982, "grad_norm": 0.14174138009548187, "learning_rate": 1e-05, "loss": 0.0177, "step": 998200 }, { "epoch": 0.009983, "grad_norm": 0.12495587021112442, "learning_rate": 1e-05, "loss": 0.0178, "step": 998300 }, { "epoch": 0.009984, "grad_norm": 0.15814109146595, "learning_rate": 1e-05, "loss": 0.018, "step": 998400 }, { "epoch": 0.009985, "grad_norm": 0.17627373337745667, "learning_rate": 1e-05, "loss": 0.0179, "step": 998500 }, { "epoch": 0.009986, "grad_norm": 0.11817760020494461, "learning_rate": 1e-05, "loss": 0.0176, "step": 998600 }, { "epoch": 0.009987, "grad_norm": 0.118190698325634, "learning_rate": 1e-05, "loss": 0.0173, "step": 998700 }, { "epoch": 0.009988, "grad_norm": 0.14696422219276428, "learning_rate": 1e-05, "loss": 0.0182, "step": 998800 }, { "epoch": 0.009989, "grad_norm": 0.13177813589572906, "learning_rate": 1e-05, "loss": 0.0181, "step": 998900 }, { "epoch": 0.00999, "grad_norm": 0.14606986939907074, "learning_rate": 1e-05, "loss": 0.0174, "step": 999000 }, { "epoch": 0.009991, "grad_norm": 0.13002532720565796, "learning_rate": 1e-05, "loss": 0.0173, "step": 999100 }, { "epoch": 0.009992, "grad_norm": 0.15128880739212036, "learning_rate": 1e-05, "loss": 0.018, "step": 999200 }, { "epoch": 0.009993, "grad_norm": 0.10601651668548584, "learning_rate": 1e-05, "loss": 0.0181, "step": 999300 }, { "epoch": 0.009994, "grad_norm": 0.13263332843780518, "learning_rate": 1e-05, "loss": 0.0173, "step": 999400 }, { "epoch": 0.009995, "grad_norm": 0.13653919100761414, "learning_rate": 1e-05, "loss": 0.0176, "step": 999500 }, { "epoch": 0.009996, "grad_norm": 0.14669930934906006, "learning_rate": 1e-05, "loss": 0.0175, "step": 999600 }, { "epoch": 0.009997, "grad_norm": 0.12468747049570084, "learning_rate": 1e-05, "loss": 0.018, "step": 999700 }, { "epoch": 0.009998, "grad_norm": 0.11297482252120972, "learning_rate": 1e-05, "loss": 0.0174, "step": 999800 }, { "epoch": 0.009999, "grad_norm": 0.1380089372396469, "learning_rate": 1e-05, "loss": 0.0177, "step": 999900 }, { "epoch": 0.01, "grad_norm": 0.14848780632019043, "learning_rate": 1e-05, "loss": 0.0178, "step": 1000000 }, { "epoch": 0.01, "eval_loss": 0.015677427873015404, "eval_runtime": 194.3826, "eval_samples_per_second": 257.225, "eval_steps_per_second": 16.077, "step": 1000000 }, { "epoch": 0.010001, "grad_norm": 0.11952350288629532, "learning_rate": 1e-05, "loss": 0.0178, "step": 1000100 }, { "epoch": 0.010002, "grad_norm": 0.13682597875595093, "learning_rate": 1e-05, "loss": 0.0175, "step": 1000200 }, { "epoch": 0.010003, "grad_norm": 0.17075768113136292, "learning_rate": 1e-05, "loss": 0.0174, "step": 1000300 }, { "epoch": 0.010004, "grad_norm": 0.12981368601322174, "learning_rate": 1e-05, "loss": 0.0176, "step": 1000400 }, { "epoch": 0.010005, "grad_norm": 0.09527723491191864, "learning_rate": 1e-05, "loss": 0.0176, "step": 1000500 }, { "epoch": 0.010006, "grad_norm": 0.11236711591482162, "learning_rate": 1e-05, "loss": 0.0178, "step": 1000600 }, { "epoch": 0.010007, "grad_norm": 0.14195124804973602, "learning_rate": 1e-05, "loss": 0.0176, "step": 1000700 }, { "epoch": 0.010008, "grad_norm": 0.10858549177646637, "learning_rate": 1e-05, "loss": 0.0178, "step": 1000800 }, { "epoch": 0.010009, "grad_norm": 0.12160372734069824, "learning_rate": 1e-05, "loss": 0.0179, "step": 1000900 }, { "epoch": 0.01001, "grad_norm": 0.16441255807876587, "learning_rate": 1e-05, "loss": 0.0174, "step": 1001000 }, { "epoch": 0.010011, "grad_norm": 0.12393173575401306, "learning_rate": 1e-05, "loss": 0.0177, "step": 1001100 }, { "epoch": 0.010012, "grad_norm": 0.1558436155319214, "learning_rate": 1e-05, "loss": 0.0174, "step": 1001200 }, { "epoch": 0.010013, "grad_norm": 0.11830200999975204, "learning_rate": 1e-05, "loss": 0.0174, "step": 1001300 }, { "epoch": 0.010014, "grad_norm": 0.16578808426856995, "learning_rate": 1e-05, "loss": 0.0181, "step": 1001400 }, { "epoch": 0.010015, "grad_norm": 0.1622014343738556, "learning_rate": 1e-05, "loss": 0.0176, "step": 1001500 }, { "epoch": 0.010016, "grad_norm": 0.10572869330644608, "learning_rate": 1e-05, "loss": 0.0179, "step": 1001600 }, { "epoch": 0.010017, "grad_norm": 0.1361035406589508, "learning_rate": 1e-05, "loss": 0.0179, "step": 1001700 }, { "epoch": 0.010018, "grad_norm": 0.12091025710105896, "learning_rate": 1e-05, "loss": 0.0176, "step": 1001800 }, { "epoch": 0.010019, "grad_norm": 0.14585508406162262, "learning_rate": 1e-05, "loss": 0.0173, "step": 1001900 }, { "epoch": 0.01002, "grad_norm": 0.11678944528102875, "learning_rate": 1e-05, "loss": 0.0174, "step": 1002000 }, { "epoch": 0.010021, "grad_norm": 0.10246483981609344, "learning_rate": 1e-05, "loss": 0.0176, "step": 1002100 }, { "epoch": 0.010022, "grad_norm": 0.17445562779903412, "learning_rate": 1e-05, "loss": 0.0176, "step": 1002200 }, { "epoch": 0.010023, "grad_norm": 0.15893420577049255, "learning_rate": 1e-05, "loss": 0.0178, "step": 1002300 }, { "epoch": 0.010024, "grad_norm": 0.12339092046022415, "learning_rate": 1e-05, "loss": 0.0177, "step": 1002400 }, { "epoch": 0.010025, "grad_norm": 0.1377716213464737, "learning_rate": 1e-05, "loss": 0.0176, "step": 1002500 }, { "epoch": 0.010026, "grad_norm": 0.15385554730892181, "learning_rate": 1e-05, "loss": 0.0177, "step": 1002600 }, { "epoch": 0.010027, "grad_norm": 0.1495867222547531, "learning_rate": 1e-05, "loss": 0.0176, "step": 1002700 }, { "epoch": 0.010028, "grad_norm": 0.12728027999401093, "learning_rate": 1e-05, "loss": 0.0168, "step": 1002800 }, { "epoch": 0.010029, "grad_norm": 0.14245231449604034, "learning_rate": 1e-05, "loss": 0.0182, "step": 1002900 }, { "epoch": 0.01003, "grad_norm": 0.1173754408955574, "learning_rate": 1e-05, "loss": 0.0177, "step": 1003000 }, { "epoch": 0.010031, "grad_norm": 0.16216987371444702, "learning_rate": 1e-05, "loss": 0.0176, "step": 1003100 }, { "epoch": 0.010032, "grad_norm": 0.13703711330890656, "learning_rate": 1e-05, "loss": 0.0176, "step": 1003200 }, { "epoch": 0.010033, "grad_norm": 0.1526353359222412, "learning_rate": 1e-05, "loss": 0.0177, "step": 1003300 }, { "epoch": 0.010034, "grad_norm": 0.10624656826257706, "learning_rate": 1e-05, "loss": 0.0177, "step": 1003400 }, { "epoch": 0.010035, "grad_norm": 0.13986603915691376, "learning_rate": 1e-05, "loss": 0.0174, "step": 1003500 }, { "epoch": 0.010036, "grad_norm": 0.14384345710277557, "learning_rate": 1e-05, "loss": 0.0172, "step": 1003600 }, { "epoch": 0.010037, "grad_norm": 0.12093856185674667, "learning_rate": 1e-05, "loss": 0.0175, "step": 1003700 }, { "epoch": 0.010038, "grad_norm": 0.16059713065624237, "learning_rate": 1e-05, "loss": 0.0181, "step": 1003800 }, { "epoch": 0.010039, "grad_norm": 0.21251815557479858, "learning_rate": 1e-05, "loss": 0.018, "step": 1003900 }, { "epoch": 0.01004, "grad_norm": 0.16603530943393707, "learning_rate": 1e-05, "loss": 0.0175, "step": 1004000 }, { "epoch": 0.010041, "grad_norm": 0.1870737373828888, "learning_rate": 1e-05, "loss": 0.0179, "step": 1004100 }, { "epoch": 0.010042, "grad_norm": 0.1422659307718277, "learning_rate": 1e-05, "loss": 0.0176, "step": 1004200 }, { "epoch": 0.010043, "grad_norm": 0.15785014629364014, "learning_rate": 1e-05, "loss": 0.0175, "step": 1004300 }, { "epoch": 0.010044, "grad_norm": 0.16008518636226654, "learning_rate": 1e-05, "loss": 0.0177, "step": 1004400 }, { "epoch": 0.010045, "grad_norm": 0.12294738739728928, "learning_rate": 1e-05, "loss": 0.0175, "step": 1004500 }, { "epoch": 0.010046, "grad_norm": 0.10401184111833572, "learning_rate": 1e-05, "loss": 0.0173, "step": 1004600 }, { "epoch": 0.010047, "grad_norm": 0.14548532664775848, "learning_rate": 1e-05, "loss": 0.0173, "step": 1004700 }, { "epoch": 0.010048, "grad_norm": 0.1200275868177414, "learning_rate": 1e-05, "loss": 0.0175, "step": 1004800 }, { "epoch": 0.010049, "grad_norm": 0.16970030963420868, "learning_rate": 1e-05, "loss": 0.0178, "step": 1004900 }, { "epoch": 0.01005, "grad_norm": 0.17139367759227753, "learning_rate": 1e-05, "loss": 0.0175, "step": 1005000 }, { "epoch": 0.010051, "grad_norm": 0.14196571707725525, "learning_rate": 1e-05, "loss": 0.017, "step": 1005100 }, { "epoch": 0.010052, "grad_norm": 0.13109299540519714, "learning_rate": 1e-05, "loss": 0.0176, "step": 1005200 }, { "epoch": 0.010053, "grad_norm": 0.19775064289569855, "learning_rate": 1e-05, "loss": 0.0177, "step": 1005300 }, { "epoch": 0.010054, "grad_norm": 0.1489570289850235, "learning_rate": 1e-05, "loss": 0.018, "step": 1005400 }, { "epoch": 0.010055, "grad_norm": 0.150162011384964, "learning_rate": 1e-05, "loss": 0.0178, "step": 1005500 }, { "epoch": 0.010056, "grad_norm": 0.1500353366136551, "learning_rate": 1e-05, "loss": 0.0182, "step": 1005600 }, { "epoch": 0.010057, "grad_norm": 0.18692660331726074, "learning_rate": 1e-05, "loss": 0.0173, "step": 1005700 }, { "epoch": 0.010058, "grad_norm": 0.13979411125183105, "learning_rate": 1e-05, "loss": 0.0178, "step": 1005800 }, { "epoch": 0.010059, "grad_norm": 0.11406039446592331, "learning_rate": 1e-05, "loss": 0.018, "step": 1005900 }, { "epoch": 0.01006, "grad_norm": 0.11750727146863937, "learning_rate": 1e-05, "loss": 0.0171, "step": 1006000 }, { "epoch": 0.010061, "grad_norm": 0.12330952286720276, "learning_rate": 1e-05, "loss": 0.0177, "step": 1006100 }, { "epoch": 0.010062, "grad_norm": 0.1374475657939911, "learning_rate": 1e-05, "loss": 0.0176, "step": 1006200 }, { "epoch": 0.010063, "grad_norm": 0.13846825063228607, "learning_rate": 1e-05, "loss": 0.018, "step": 1006300 }, { "epoch": 0.010064, "grad_norm": 0.17819225788116455, "learning_rate": 1e-05, "loss": 0.0171, "step": 1006400 }, { "epoch": 0.010065, "grad_norm": 0.1665199100971222, "learning_rate": 1e-05, "loss": 0.0174, "step": 1006500 }, { "epoch": 0.010066, "grad_norm": 0.11915154755115509, "learning_rate": 1e-05, "loss": 0.017, "step": 1006600 }, { "epoch": 0.010067, "grad_norm": 0.13354997336864471, "learning_rate": 1e-05, "loss": 0.018, "step": 1006700 }, { "epoch": 0.010068, "grad_norm": 0.12235656380653381, "learning_rate": 1e-05, "loss": 0.0177, "step": 1006800 }, { "epoch": 0.010069, "grad_norm": 0.13254553079605103, "learning_rate": 1e-05, "loss": 0.0177, "step": 1006900 }, { "epoch": 0.01007, "grad_norm": 0.11989499628543854, "learning_rate": 1e-05, "loss": 0.0174, "step": 1007000 }, { "epoch": 0.010071, "grad_norm": 0.16093315184116364, "learning_rate": 1e-05, "loss": 0.0179, "step": 1007100 }, { "epoch": 0.010072, "grad_norm": 0.134983092546463, "learning_rate": 1e-05, "loss": 0.0177, "step": 1007200 }, { "epoch": 0.010073, "grad_norm": 0.16790834069252014, "learning_rate": 1e-05, "loss": 0.0175, "step": 1007300 }, { "epoch": 0.010074, "grad_norm": 0.15677960216999054, "learning_rate": 1e-05, "loss": 0.0178, "step": 1007400 }, { "epoch": 0.010075, "grad_norm": 0.1312137246131897, "learning_rate": 1e-05, "loss": 0.0179, "step": 1007500 }, { "epoch": 0.010076, "grad_norm": 0.15454867482185364, "learning_rate": 1e-05, "loss": 0.0176, "step": 1007600 }, { "epoch": 0.010077, "grad_norm": 0.16393987834453583, "learning_rate": 1e-05, "loss": 0.0179, "step": 1007700 }, { "epoch": 0.010078, "grad_norm": 0.20540626347064972, "learning_rate": 1e-05, "loss": 0.0173, "step": 1007800 }, { "epoch": 0.010079, "grad_norm": 0.14700911939144135, "learning_rate": 1e-05, "loss": 0.0174, "step": 1007900 }, { "epoch": 0.01008, "grad_norm": 0.1262180656194687, "learning_rate": 1e-05, "loss": 0.0181, "step": 1008000 }, { "epoch": 0.010081, "grad_norm": 0.14022059738636017, "learning_rate": 1e-05, "loss": 0.0177, "step": 1008100 }, { "epoch": 0.010082, "grad_norm": 0.1544470340013504, "learning_rate": 1e-05, "loss": 0.0178, "step": 1008200 }, { "epoch": 0.010083, "grad_norm": 0.12518763542175293, "learning_rate": 1e-05, "loss": 0.0176, "step": 1008300 }, { "epoch": 0.010084, "grad_norm": 0.08402888476848602, "learning_rate": 1e-05, "loss": 0.0179, "step": 1008400 }, { "epoch": 0.010085, "grad_norm": 0.12616044282913208, "learning_rate": 1e-05, "loss": 0.0178, "step": 1008500 }, { "epoch": 0.010086, "grad_norm": 0.1349126547574997, "learning_rate": 1e-05, "loss": 0.0174, "step": 1008600 }, { "epoch": 0.010087, "grad_norm": 0.12409678101539612, "learning_rate": 1e-05, "loss": 0.0173, "step": 1008700 }, { "epoch": 0.010088, "grad_norm": 0.1272343248128891, "learning_rate": 1e-05, "loss": 0.0177, "step": 1008800 }, { "epoch": 0.010089, "grad_norm": 0.13566075265407562, "learning_rate": 1e-05, "loss": 0.0184, "step": 1008900 }, { "epoch": 0.01009, "grad_norm": 0.1380251795053482, "learning_rate": 1e-05, "loss": 0.0174, "step": 1009000 }, { "epoch": 0.010091, "grad_norm": 0.11821264773607254, "learning_rate": 1e-05, "loss": 0.0175, "step": 1009100 }, { "epoch": 0.010092, "grad_norm": 0.12889741361141205, "learning_rate": 1e-05, "loss": 0.0172, "step": 1009200 }, { "epoch": 0.010093, "grad_norm": 0.13462994992733002, "learning_rate": 1e-05, "loss": 0.0177, "step": 1009300 }, { "epoch": 0.010094, "grad_norm": 0.1533074527978897, "learning_rate": 1e-05, "loss": 0.0181, "step": 1009400 }, { "epoch": 0.010095, "grad_norm": 0.12093422561883926, "learning_rate": 1e-05, "loss": 0.0176, "step": 1009500 }, { "epoch": 0.010096, "grad_norm": 0.0921555757522583, "learning_rate": 1e-05, "loss": 0.0176, "step": 1009600 }, { "epoch": 0.010097, "grad_norm": 0.13634364306926727, "learning_rate": 1e-05, "loss": 0.0178, "step": 1009700 }, { "epoch": 0.010098, "grad_norm": 0.12508073449134827, "learning_rate": 1e-05, "loss": 0.0175, "step": 1009800 }, { "epoch": 0.010099, "grad_norm": 0.1686936318874359, "learning_rate": 1e-05, "loss": 0.0177, "step": 1009900 }, { "epoch": 0.0101, "grad_norm": 0.15782853960990906, "learning_rate": 1e-05, "loss": 0.0174, "step": 1010000 }, { "epoch": 0.010101, "grad_norm": 0.11575949937105179, "learning_rate": 1e-05, "loss": 0.0174, "step": 1010100 }, { "epoch": 0.010102, "grad_norm": 0.12385135143995285, "learning_rate": 1e-05, "loss": 0.0177, "step": 1010200 }, { "epoch": 0.010103, "grad_norm": 0.15678460896015167, "learning_rate": 1e-05, "loss": 0.0182, "step": 1010300 }, { "epoch": 0.010104, "grad_norm": 0.17237980663776398, "learning_rate": 1e-05, "loss": 0.0175, "step": 1010400 }, { "epoch": 0.010105, "grad_norm": 0.10743454098701477, "learning_rate": 1e-05, "loss": 0.0173, "step": 1010500 }, { "epoch": 0.010106, "grad_norm": 0.17658963799476624, "learning_rate": 1e-05, "loss": 0.0175, "step": 1010600 }, { "epoch": 0.010107, "grad_norm": 0.12610791623592377, "learning_rate": 1e-05, "loss": 0.0177, "step": 1010700 }, { "epoch": 0.010108, "grad_norm": 0.17434284090995789, "learning_rate": 1e-05, "loss": 0.0176, "step": 1010800 }, { "epoch": 0.010109, "grad_norm": 0.1733924299478531, "learning_rate": 1e-05, "loss": 0.0173, "step": 1010900 }, { "epoch": 0.01011, "grad_norm": 0.16822007298469543, "learning_rate": 1e-05, "loss": 0.0175, "step": 1011000 }, { "epoch": 0.010111, "grad_norm": 0.14988751709461212, "learning_rate": 1e-05, "loss": 0.0177, "step": 1011100 }, { "epoch": 0.010112, "grad_norm": 0.12048187851905823, "learning_rate": 1e-05, "loss": 0.0174, "step": 1011200 }, { "epoch": 0.010113, "grad_norm": 0.17067661881446838, "learning_rate": 1e-05, "loss": 0.0173, "step": 1011300 }, { "epoch": 0.010114, "grad_norm": 0.1640622466802597, "learning_rate": 1e-05, "loss": 0.0177, "step": 1011400 }, { "epoch": 0.010115, "grad_norm": 0.1303969919681549, "learning_rate": 1e-05, "loss": 0.0171, "step": 1011500 }, { "epoch": 0.010116, "grad_norm": 0.1687624454498291, "learning_rate": 1e-05, "loss": 0.0177, "step": 1011600 }, { "epoch": 0.010117, "grad_norm": 0.17299102246761322, "learning_rate": 1e-05, "loss": 0.0173, "step": 1011700 }, { "epoch": 0.010118, "grad_norm": 0.11646290123462677, "learning_rate": 1e-05, "loss": 0.0178, "step": 1011800 }, { "epoch": 0.010119, "grad_norm": 0.15238027274608612, "learning_rate": 1e-05, "loss": 0.0177, "step": 1011900 }, { "epoch": 0.01012, "grad_norm": 0.10595665127038956, "learning_rate": 1e-05, "loss": 0.018, "step": 1012000 }, { "epoch": 0.010121, "grad_norm": 0.1380605846643448, "learning_rate": 1e-05, "loss": 0.0176, "step": 1012100 }, { "epoch": 0.010122, "grad_norm": 0.12657637894153595, "learning_rate": 1e-05, "loss": 0.0173, "step": 1012200 }, { "epoch": 0.010123, "grad_norm": 0.13191817700862885, "learning_rate": 1e-05, "loss": 0.0177, "step": 1012300 }, { "epoch": 0.010124, "grad_norm": 0.17509318888187408, "learning_rate": 1e-05, "loss": 0.0184, "step": 1012400 }, { "epoch": 0.010125, "grad_norm": 0.1307971179485321, "learning_rate": 1e-05, "loss": 0.0176, "step": 1012500 }, { "epoch": 0.010126, "grad_norm": 0.1558491289615631, "learning_rate": 1e-05, "loss": 0.0177, "step": 1012600 }, { "epoch": 0.010127, "grad_norm": 0.15611694753170013, "learning_rate": 1e-05, "loss": 0.0177, "step": 1012700 }, { "epoch": 0.010128, "grad_norm": 0.13222725689411163, "learning_rate": 1e-05, "loss": 0.0176, "step": 1012800 }, { "epoch": 0.010129, "grad_norm": 0.1422492414712906, "learning_rate": 1e-05, "loss": 0.0176, "step": 1012900 }, { "epoch": 0.01013, "grad_norm": 0.16288477182388306, "learning_rate": 1e-05, "loss": 0.0172, "step": 1013000 }, { "epoch": 0.010131, "grad_norm": 0.14505308866500854, "learning_rate": 1e-05, "loss": 0.0175, "step": 1013100 }, { "epoch": 0.010132, "grad_norm": 0.2075696587562561, "learning_rate": 1e-05, "loss": 0.0172, "step": 1013200 }, { "epoch": 0.010133, "grad_norm": 0.1384744644165039, "learning_rate": 1e-05, "loss": 0.018, "step": 1013300 }, { "epoch": 0.010134, "grad_norm": 0.16835173964500427, "learning_rate": 1e-05, "loss": 0.0177, "step": 1013400 }, { "epoch": 0.010135, "grad_norm": 0.140804722905159, "learning_rate": 1e-05, "loss": 0.0179, "step": 1013500 }, { "epoch": 0.010136, "grad_norm": 0.13522456586360931, "learning_rate": 1e-05, "loss": 0.0176, "step": 1013600 }, { "epoch": 0.010137, "grad_norm": 0.20016250014305115, "learning_rate": 1e-05, "loss": 0.0174, "step": 1013700 }, { "epoch": 0.010138, "grad_norm": 0.1276240199804306, "learning_rate": 1e-05, "loss": 0.0177, "step": 1013800 }, { "epoch": 0.010139, "grad_norm": 0.1379183828830719, "learning_rate": 1e-05, "loss": 0.0175, "step": 1013900 }, { "epoch": 0.01014, "grad_norm": 0.14574769139289856, "learning_rate": 1e-05, "loss": 0.0174, "step": 1014000 }, { "epoch": 0.010141, "grad_norm": 0.16396468877792358, "learning_rate": 1e-05, "loss": 0.0176, "step": 1014100 }, { "epoch": 0.010142, "grad_norm": 0.10676564276218414, "learning_rate": 1e-05, "loss": 0.0173, "step": 1014200 }, { "epoch": 0.010143, "grad_norm": 0.1474931687116623, "learning_rate": 1e-05, "loss": 0.0174, "step": 1014300 }, { "epoch": 0.010144, "grad_norm": 0.15035535395145416, "learning_rate": 1e-05, "loss": 0.0176, "step": 1014400 }, { "epoch": 0.010145, "grad_norm": 0.1380535215139389, "learning_rate": 1e-05, "loss": 0.0173, "step": 1014500 }, { "epoch": 0.010146, "grad_norm": 0.15102533996105194, "learning_rate": 1e-05, "loss": 0.0179, "step": 1014600 }, { "epoch": 0.010147, "grad_norm": 0.16804002225399017, "learning_rate": 1e-05, "loss": 0.0178, "step": 1014700 }, { "epoch": 0.010148, "grad_norm": 0.1356675773859024, "learning_rate": 1e-05, "loss": 0.018, "step": 1014800 }, { "epoch": 0.010149, "grad_norm": 0.16086618602275848, "learning_rate": 1e-05, "loss": 0.0178, "step": 1014900 }, { "epoch": 0.01015, "grad_norm": 0.15332187712192535, "learning_rate": 1e-05, "loss": 0.0175, "step": 1015000 }, { "epoch": 0.010151, "grad_norm": 0.15521931648254395, "learning_rate": 1e-05, "loss": 0.018, "step": 1015100 }, { "epoch": 0.010152, "grad_norm": 0.15687605738639832, "learning_rate": 1e-05, "loss": 0.018, "step": 1015200 }, { "epoch": 0.010153, "grad_norm": 0.14084093272686005, "learning_rate": 1e-05, "loss": 0.0172, "step": 1015300 }, { "epoch": 0.010154, "grad_norm": 0.16419555246829987, "learning_rate": 1e-05, "loss": 0.0175, "step": 1015400 }, { "epoch": 0.010155, "grad_norm": 0.15817846357822418, "learning_rate": 1e-05, "loss": 0.0175, "step": 1015500 }, { "epoch": 0.010156, "grad_norm": 0.10072106122970581, "learning_rate": 1e-05, "loss": 0.0182, "step": 1015600 }, { "epoch": 0.010157, "grad_norm": 0.14748477935791016, "learning_rate": 1e-05, "loss": 0.0173, "step": 1015700 }, { "epoch": 0.010158, "grad_norm": 0.11968681961297989, "learning_rate": 1e-05, "loss": 0.0175, "step": 1015800 }, { "epoch": 0.010159, "grad_norm": 0.13643746078014374, "learning_rate": 1e-05, "loss": 0.0174, "step": 1015900 }, { "epoch": 0.01016, "grad_norm": 0.1108623743057251, "learning_rate": 1e-05, "loss": 0.0176, "step": 1016000 }, { "epoch": 0.010161, "grad_norm": 0.14656610786914825, "learning_rate": 1e-05, "loss": 0.0172, "step": 1016100 }, { "epoch": 0.010162, "grad_norm": 0.13292884826660156, "learning_rate": 1e-05, "loss": 0.0179, "step": 1016200 }, { "epoch": 0.010163, "grad_norm": 0.13193513453006744, "learning_rate": 1e-05, "loss": 0.0174, "step": 1016300 }, { "epoch": 0.010164, "grad_norm": 0.1475083827972412, "learning_rate": 1e-05, "loss": 0.0179, "step": 1016400 }, { "epoch": 0.010165, "grad_norm": 0.1728820502758026, "learning_rate": 1e-05, "loss": 0.0173, "step": 1016500 }, { "epoch": 0.010166, "grad_norm": 0.14163751900196075, "learning_rate": 1e-05, "loss": 0.0177, "step": 1016600 }, { "epoch": 0.010167, "grad_norm": 0.16882435977458954, "learning_rate": 1e-05, "loss": 0.0175, "step": 1016700 }, { "epoch": 0.010168, "grad_norm": 0.14792400598526, "learning_rate": 1e-05, "loss": 0.0178, "step": 1016800 }, { "epoch": 0.010169, "grad_norm": 0.10700967162847519, "learning_rate": 1e-05, "loss": 0.0178, "step": 1016900 }, { "epoch": 0.01017, "grad_norm": 0.13037611544132233, "learning_rate": 1e-05, "loss": 0.0179, "step": 1017000 }, { "epoch": 0.010171, "grad_norm": 0.11280237883329391, "learning_rate": 1e-05, "loss": 0.0173, "step": 1017100 }, { "epoch": 0.010172, "grad_norm": 0.16534553468227386, "learning_rate": 1e-05, "loss": 0.0179, "step": 1017200 }, { "epoch": 0.010173, "grad_norm": 0.17858082056045532, "learning_rate": 1e-05, "loss": 0.0175, "step": 1017300 }, { "epoch": 0.010174, "grad_norm": 0.11898922175168991, "learning_rate": 1e-05, "loss": 0.0173, "step": 1017400 }, { "epoch": 0.010175, "grad_norm": 0.18904800713062286, "learning_rate": 1e-05, "loss": 0.0178, "step": 1017500 }, { "epoch": 0.010176, "grad_norm": 0.15978045761585236, "learning_rate": 1e-05, "loss": 0.0174, "step": 1017600 }, { "epoch": 0.010177, "grad_norm": 0.1099935993552208, "learning_rate": 1e-05, "loss": 0.0176, "step": 1017700 }, { "epoch": 0.010178, "grad_norm": 0.14623913168907166, "learning_rate": 1e-05, "loss": 0.0171, "step": 1017800 }, { "epoch": 0.010179, "grad_norm": 0.15568853914737701, "learning_rate": 1e-05, "loss": 0.0176, "step": 1017900 }, { "epoch": 0.01018, "grad_norm": 0.17557813227176666, "learning_rate": 1e-05, "loss": 0.0178, "step": 1018000 }, { "epoch": 0.010181, "grad_norm": 0.16670294106006622, "learning_rate": 1e-05, "loss": 0.0185, "step": 1018100 }, { "epoch": 0.010182, "grad_norm": 0.13107462227344513, "learning_rate": 1e-05, "loss": 0.0176, "step": 1018200 }, { "epoch": 0.010183, "grad_norm": 0.14889146387577057, "learning_rate": 1e-05, "loss": 0.0176, "step": 1018300 }, { "epoch": 0.010184, "grad_norm": 0.14253850281238556, "learning_rate": 1e-05, "loss": 0.0173, "step": 1018400 }, { "epoch": 0.010185, "grad_norm": 0.17000509798526764, "learning_rate": 1e-05, "loss": 0.0176, "step": 1018500 }, { "epoch": 0.010186, "grad_norm": 0.10491350293159485, "learning_rate": 1e-05, "loss": 0.0177, "step": 1018600 }, { "epoch": 0.010187, "grad_norm": 0.11601057648658752, "learning_rate": 1e-05, "loss": 0.0176, "step": 1018700 }, { "epoch": 0.010188, "grad_norm": 0.1362113058567047, "learning_rate": 1e-05, "loss": 0.0177, "step": 1018800 }, { "epoch": 0.010189, "grad_norm": 0.16518332064151764, "learning_rate": 1e-05, "loss": 0.0178, "step": 1018900 }, { "epoch": 0.01019, "grad_norm": 0.13615034520626068, "learning_rate": 1e-05, "loss": 0.0171, "step": 1019000 }, { "epoch": 0.010191, "grad_norm": 0.17150871455669403, "learning_rate": 1e-05, "loss": 0.0171, "step": 1019100 }, { "epoch": 0.010192, "grad_norm": 0.1478675752878189, "learning_rate": 1e-05, "loss": 0.0174, "step": 1019200 }, { "epoch": 0.010193, "grad_norm": 0.14809074997901917, "learning_rate": 1e-05, "loss": 0.0174, "step": 1019300 }, { "epoch": 0.010194, "grad_norm": 0.11580786854028702, "learning_rate": 1e-05, "loss": 0.0176, "step": 1019400 }, { "epoch": 0.010195, "grad_norm": 0.15558244287967682, "learning_rate": 1e-05, "loss": 0.0179, "step": 1019500 }, { "epoch": 0.010196, "grad_norm": 0.17223723232746124, "learning_rate": 1e-05, "loss": 0.0173, "step": 1019600 }, { "epoch": 0.010197, "grad_norm": 0.1576339155435562, "learning_rate": 1e-05, "loss": 0.0178, "step": 1019700 }, { "epoch": 0.010198, "grad_norm": 0.1600988358259201, "learning_rate": 1e-05, "loss": 0.0174, "step": 1019800 }, { "epoch": 0.010199, "grad_norm": 0.17330797016620636, "learning_rate": 1e-05, "loss": 0.0177, "step": 1019900 }, { "epoch": 0.0102, "grad_norm": 0.1721992790699005, "learning_rate": 1e-05, "loss": 0.0175, "step": 1020000 }, { "epoch": 0.0102, "eval_loss": 0.015374348498880863, "eval_runtime": 188.0283, "eval_samples_per_second": 265.917, "eval_steps_per_second": 16.62, "step": 1020000 }, { "epoch": 0.010201, "grad_norm": 0.1695314347743988, "learning_rate": 1e-05, "loss": 0.0177, "step": 1020100 }, { "epoch": 0.010202, "grad_norm": 0.15517336130142212, "learning_rate": 1e-05, "loss": 0.0171, "step": 1020200 }, { "epoch": 0.010203, "grad_norm": 0.11508290469646454, "learning_rate": 1e-05, "loss": 0.0173, "step": 1020300 }, { "epoch": 0.010204, "grad_norm": 0.15662556886672974, "learning_rate": 1e-05, "loss": 0.018, "step": 1020400 }, { "epoch": 0.010205, "grad_norm": 0.22937677800655365, "learning_rate": 1e-05, "loss": 0.0176, "step": 1020500 }, { "epoch": 0.010206, "grad_norm": 0.14078831672668457, "learning_rate": 1e-05, "loss": 0.0171, "step": 1020600 }, { "epoch": 0.010207, "grad_norm": 0.14299756288528442, "learning_rate": 1e-05, "loss": 0.0174, "step": 1020700 }, { "epoch": 0.010208, "grad_norm": 0.1581902652978897, "learning_rate": 1e-05, "loss": 0.0175, "step": 1020800 }, { "epoch": 0.010209, "grad_norm": 0.1584523320198059, "learning_rate": 1e-05, "loss": 0.0178, "step": 1020900 }, { "epoch": 0.01021, "grad_norm": 0.12622815370559692, "learning_rate": 1e-05, "loss": 0.0171, "step": 1021000 }, { "epoch": 0.010211, "grad_norm": 0.12147415429353714, "learning_rate": 1e-05, "loss": 0.0178, "step": 1021100 }, { "epoch": 0.010212, "grad_norm": 0.13829848170280457, "learning_rate": 1e-05, "loss": 0.0181, "step": 1021200 }, { "epoch": 0.010213, "grad_norm": 0.19012412428855896, "learning_rate": 1e-05, "loss": 0.0174, "step": 1021300 }, { "epoch": 0.010214, "grad_norm": 0.11388678848743439, "learning_rate": 1e-05, "loss": 0.0175, "step": 1021400 }, { "epoch": 0.010215, "grad_norm": 0.1554012894630432, "learning_rate": 1e-05, "loss": 0.0173, "step": 1021500 }, { "epoch": 0.010216, "grad_norm": 0.10904356092214584, "learning_rate": 1e-05, "loss": 0.0174, "step": 1021600 }, { "epoch": 0.010217, "grad_norm": 0.16520290076732635, "learning_rate": 1e-05, "loss": 0.0174, "step": 1021700 }, { "epoch": 0.010218, "grad_norm": 0.11549406498670578, "learning_rate": 1e-05, "loss": 0.0177, "step": 1021800 }, { "epoch": 0.010219, "grad_norm": 0.14829355478286743, "learning_rate": 1e-05, "loss": 0.0169, "step": 1021900 }, { "epoch": 0.01022, "grad_norm": 0.1613519787788391, "learning_rate": 1e-05, "loss": 0.0176, "step": 1022000 }, { "epoch": 0.010221, "grad_norm": 0.19085785746574402, "learning_rate": 1e-05, "loss": 0.0177, "step": 1022100 }, { "epoch": 0.010222, "grad_norm": 0.11435044556856155, "learning_rate": 1e-05, "loss": 0.0173, "step": 1022200 }, { "epoch": 0.010223, "grad_norm": 0.10486282408237457, "learning_rate": 1e-05, "loss": 0.0174, "step": 1022300 }, { "epoch": 0.010224, "grad_norm": 0.1864302009344101, "learning_rate": 1e-05, "loss": 0.0175, "step": 1022400 }, { "epoch": 0.010225, "grad_norm": 0.12174709886312485, "learning_rate": 1e-05, "loss": 0.0174, "step": 1022500 }, { "epoch": 0.010226, "grad_norm": 0.20557929575443268, "learning_rate": 1e-05, "loss": 0.0177, "step": 1022600 }, { "epoch": 0.010227, "grad_norm": 0.17181093990802765, "learning_rate": 1e-05, "loss": 0.0172, "step": 1022700 }, { "epoch": 0.010228, "grad_norm": 0.15395976603031158, "learning_rate": 1e-05, "loss": 0.0175, "step": 1022800 }, { "epoch": 0.010229, "grad_norm": 0.21198265254497528, "learning_rate": 1e-05, "loss": 0.017, "step": 1022900 }, { "epoch": 0.01023, "grad_norm": 0.10893562436103821, "learning_rate": 1e-05, "loss": 0.0173, "step": 1023000 }, { "epoch": 0.010231, "grad_norm": 0.15863724052906036, "learning_rate": 1e-05, "loss": 0.0178, "step": 1023100 }, { "epoch": 0.010232, "grad_norm": 0.1722174882888794, "learning_rate": 1e-05, "loss": 0.0177, "step": 1023200 }, { "epoch": 0.010233, "grad_norm": 0.1532629281282425, "learning_rate": 1e-05, "loss": 0.0177, "step": 1023300 }, { "epoch": 0.010234, "grad_norm": 0.1435515433549881, "learning_rate": 1e-05, "loss": 0.0174, "step": 1023400 }, { "epoch": 0.010235, "grad_norm": 0.10047304630279541, "learning_rate": 1e-05, "loss": 0.0176, "step": 1023500 }, { "epoch": 0.010236, "grad_norm": 0.12318634241819382, "learning_rate": 1e-05, "loss": 0.0177, "step": 1023600 }, { "epoch": 0.010237, "grad_norm": 0.13339154422283173, "learning_rate": 1e-05, "loss": 0.0174, "step": 1023700 }, { "epoch": 0.010238, "grad_norm": 0.15397676825523376, "learning_rate": 1e-05, "loss": 0.0176, "step": 1023800 }, { "epoch": 0.010239, "grad_norm": 0.135126993060112, "learning_rate": 1e-05, "loss": 0.0174, "step": 1023900 }, { "epoch": 0.01024, "grad_norm": 0.12342015653848648, "learning_rate": 1e-05, "loss": 0.0174, "step": 1024000 }, { "epoch": 0.010241, "grad_norm": 0.12925592064857483, "learning_rate": 1e-05, "loss": 0.0173, "step": 1024100 }, { "epoch": 0.010242, "grad_norm": 0.12787459790706635, "learning_rate": 1e-05, "loss": 0.0175, "step": 1024200 }, { "epoch": 0.010243, "grad_norm": 0.18887200951576233, "learning_rate": 1e-05, "loss": 0.0177, "step": 1024300 }, { "epoch": 0.010244, "grad_norm": 0.1408914178609848, "learning_rate": 1e-05, "loss": 0.0179, "step": 1024400 }, { "epoch": 0.010245, "grad_norm": 0.13553117215633392, "learning_rate": 1e-05, "loss": 0.0173, "step": 1024500 }, { "epoch": 0.010246, "grad_norm": 0.11199691146612167, "learning_rate": 1e-05, "loss": 0.0176, "step": 1024600 }, { "epoch": 0.010247, "grad_norm": 0.11907190084457397, "learning_rate": 1e-05, "loss": 0.0176, "step": 1024700 }, { "epoch": 0.010248, "grad_norm": 0.12312062829732895, "learning_rate": 1e-05, "loss": 0.0174, "step": 1024800 }, { "epoch": 0.010249, "grad_norm": 0.1256524622440338, "learning_rate": 1e-05, "loss": 0.0175, "step": 1024900 }, { "epoch": 0.01025, "grad_norm": 0.14651979506015778, "learning_rate": 1e-05, "loss": 0.0178, "step": 1025000 }, { "epoch": 0.010251, "grad_norm": 0.1542985588312149, "learning_rate": 1e-05, "loss": 0.0179, "step": 1025100 }, { "epoch": 0.010252, "grad_norm": 0.16999739408493042, "learning_rate": 1e-05, "loss": 0.0175, "step": 1025200 }, { "epoch": 0.010253, "grad_norm": 0.14983037114143372, "learning_rate": 1e-05, "loss": 0.0176, "step": 1025300 }, { "epoch": 0.010254, "grad_norm": 0.17199882864952087, "learning_rate": 1e-05, "loss": 0.0175, "step": 1025400 }, { "epoch": 0.010255, "grad_norm": 0.12360895425081253, "learning_rate": 1e-05, "loss": 0.0177, "step": 1025500 }, { "epoch": 0.010256, "grad_norm": 0.14050738513469696, "learning_rate": 1e-05, "loss": 0.0177, "step": 1025600 }, { "epoch": 0.010257, "grad_norm": 0.13840697705745697, "learning_rate": 1e-05, "loss": 0.0177, "step": 1025700 }, { "epoch": 0.010258, "grad_norm": 0.18315459787845612, "learning_rate": 1e-05, "loss": 0.018, "step": 1025800 }, { "epoch": 0.010259, "grad_norm": 0.1727372109889984, "learning_rate": 1e-05, "loss": 0.0178, "step": 1025900 }, { "epoch": 0.01026, "grad_norm": 0.10487888008356094, "learning_rate": 1e-05, "loss": 0.0175, "step": 1026000 }, { "epoch": 0.010261, "grad_norm": 0.3162733018398285, "learning_rate": 1e-05, "loss": 0.0171, "step": 1026100 }, { "epoch": 0.010262, "grad_norm": 0.2029065042734146, "learning_rate": 1e-05, "loss": 0.0177, "step": 1026200 }, { "epoch": 0.010263, "grad_norm": 0.13037654757499695, "learning_rate": 1e-05, "loss": 0.0177, "step": 1026300 }, { "epoch": 0.010264, "grad_norm": 0.16697372496128082, "learning_rate": 1e-05, "loss": 0.0174, "step": 1026400 }, { "epoch": 0.010265, "grad_norm": 0.13185474276542664, "learning_rate": 1e-05, "loss": 0.0172, "step": 1026500 }, { "epoch": 0.010266, "grad_norm": 0.17745539546012878, "learning_rate": 1e-05, "loss": 0.0173, "step": 1026600 }, { "epoch": 0.010267, "grad_norm": 0.16540952026844025, "learning_rate": 1e-05, "loss": 0.0173, "step": 1026700 }, { "epoch": 0.010268, "grad_norm": 0.12098090350627899, "learning_rate": 1e-05, "loss": 0.0179, "step": 1026800 }, { "epoch": 0.010269, "grad_norm": 0.12773685157299042, "learning_rate": 1e-05, "loss": 0.0177, "step": 1026900 }, { "epoch": 0.01027, "grad_norm": 0.1614050418138504, "learning_rate": 1e-05, "loss": 0.0174, "step": 1027000 }, { "epoch": 0.010271, "grad_norm": 0.1255476474761963, "learning_rate": 1e-05, "loss": 0.0173, "step": 1027100 }, { "epoch": 0.010272, "grad_norm": 0.15028595924377441, "learning_rate": 1e-05, "loss": 0.0178, "step": 1027200 }, { "epoch": 0.010273, "grad_norm": 0.15383394062519073, "learning_rate": 1e-05, "loss": 0.0173, "step": 1027300 }, { "epoch": 0.010274, "grad_norm": 0.1264660507440567, "learning_rate": 1e-05, "loss": 0.0177, "step": 1027400 }, { "epoch": 0.010275, "grad_norm": 0.21653634309768677, "learning_rate": 1e-05, "loss": 0.0177, "step": 1027500 }, { "epoch": 0.010276, "grad_norm": 0.17192146182060242, "learning_rate": 1e-05, "loss": 0.0173, "step": 1027600 }, { "epoch": 0.010277, "grad_norm": 0.19942183792591095, "learning_rate": 1e-05, "loss": 0.0174, "step": 1027700 }, { "epoch": 0.010278, "grad_norm": 0.189261332154274, "learning_rate": 1e-05, "loss": 0.0172, "step": 1027800 }, { "epoch": 0.010279, "grad_norm": 0.12700606882572174, "learning_rate": 1e-05, "loss": 0.0179, "step": 1027900 }, { "epoch": 0.01028, "grad_norm": 0.11024083942174911, "learning_rate": 1e-05, "loss": 0.0176, "step": 1028000 }, { "epoch": 0.010281, "grad_norm": 0.16464421153068542, "learning_rate": 1e-05, "loss": 0.0175, "step": 1028100 }, { "epoch": 0.010282, "grad_norm": 0.20630136132240295, "learning_rate": 1e-05, "loss": 0.0171, "step": 1028200 }, { "epoch": 0.010283, "grad_norm": 0.23076264560222626, "learning_rate": 1e-05, "loss": 0.0176, "step": 1028300 }, { "epoch": 0.010284, "grad_norm": 0.17861124873161316, "learning_rate": 1e-05, "loss": 0.0177, "step": 1028400 }, { "epoch": 0.010285, "grad_norm": 0.13409478962421417, "learning_rate": 1e-05, "loss": 0.0171, "step": 1028500 }, { "epoch": 0.010286, "grad_norm": 0.17337603867053986, "learning_rate": 1e-05, "loss": 0.0174, "step": 1028600 }, { "epoch": 0.010287, "grad_norm": 0.12962718307971954, "learning_rate": 1e-05, "loss": 0.0177, "step": 1028700 }, { "epoch": 0.010288, "grad_norm": 0.17625604569911957, "learning_rate": 1e-05, "loss": 0.0175, "step": 1028800 }, { "epoch": 0.010289, "grad_norm": 0.18782682716846466, "learning_rate": 1e-05, "loss": 0.0173, "step": 1028900 }, { "epoch": 0.01029, "grad_norm": 0.14847099781036377, "learning_rate": 1e-05, "loss": 0.0176, "step": 1029000 }, { "epoch": 0.010291, "grad_norm": 0.1283634752035141, "learning_rate": 1e-05, "loss": 0.0175, "step": 1029100 }, { "epoch": 0.010292, "grad_norm": 0.13324511051177979, "learning_rate": 1e-05, "loss": 0.0174, "step": 1029200 }, { "epoch": 0.010293, "grad_norm": 0.117378368973732, "learning_rate": 1e-05, "loss": 0.0172, "step": 1029300 }, { "epoch": 0.010294, "grad_norm": 0.14466997981071472, "learning_rate": 1e-05, "loss": 0.0172, "step": 1029400 }, { "epoch": 0.010295, "grad_norm": 0.1402762234210968, "learning_rate": 1e-05, "loss": 0.0174, "step": 1029500 }, { "epoch": 0.010296, "grad_norm": 0.13686709105968475, "learning_rate": 1e-05, "loss": 0.0178, "step": 1029600 }, { "epoch": 0.010297, "grad_norm": 0.17478781938552856, "learning_rate": 1e-05, "loss": 0.0176, "step": 1029700 }, { "epoch": 0.010298, "grad_norm": 0.15509609878063202, "learning_rate": 1e-05, "loss": 0.0179, "step": 1029800 }, { "epoch": 0.010299, "grad_norm": 0.09297945350408554, "learning_rate": 1e-05, "loss": 0.0176, "step": 1029900 }, { "epoch": 0.0103, "grad_norm": 0.10621003061532974, "learning_rate": 1e-05, "loss": 0.0171, "step": 1030000 }, { "epoch": 0.010301, "grad_norm": 0.1446264237165451, "learning_rate": 1e-05, "loss": 0.0177, "step": 1030100 }, { "epoch": 0.010302, "grad_norm": 0.15316180884838104, "learning_rate": 1e-05, "loss": 0.0174, "step": 1030200 }, { "epoch": 0.010303, "grad_norm": 0.14136169850826263, "learning_rate": 1e-05, "loss": 0.0177, "step": 1030300 }, { "epoch": 0.010304, "grad_norm": 0.17559783160686493, "learning_rate": 1e-05, "loss": 0.0174, "step": 1030400 }, { "epoch": 0.010305, "grad_norm": 0.14677689969539642, "learning_rate": 1e-05, "loss": 0.0173, "step": 1030500 }, { "epoch": 0.010306, "grad_norm": 0.12363976985216141, "learning_rate": 1e-05, "loss": 0.0179, "step": 1030600 }, { "epoch": 0.010307, "grad_norm": 0.192070871591568, "learning_rate": 1e-05, "loss": 0.0171, "step": 1030700 }, { "epoch": 0.010308, "grad_norm": 0.12203861773014069, "learning_rate": 1e-05, "loss": 0.0174, "step": 1030800 }, { "epoch": 0.010309, "grad_norm": 0.10513618588447571, "learning_rate": 1e-05, "loss": 0.0177, "step": 1030900 }, { "epoch": 0.01031, "grad_norm": 0.16288168728351593, "learning_rate": 1e-05, "loss": 0.0174, "step": 1031000 }, { "epoch": 0.010311, "grad_norm": 0.11138980090618134, "learning_rate": 1e-05, "loss": 0.0179, "step": 1031100 }, { "epoch": 0.010312, "grad_norm": 0.10406161844730377, "learning_rate": 1e-05, "loss": 0.0171, "step": 1031200 }, { "epoch": 0.010313, "grad_norm": 0.1100248396396637, "learning_rate": 1e-05, "loss": 0.0175, "step": 1031300 }, { "epoch": 0.010314, "grad_norm": 0.11704910546541214, "learning_rate": 1e-05, "loss": 0.0175, "step": 1031400 }, { "epoch": 0.010315, "grad_norm": 0.25532400608062744, "learning_rate": 1e-05, "loss": 0.0172, "step": 1031500 }, { "epoch": 0.010316, "grad_norm": 0.12805978953838348, "learning_rate": 1e-05, "loss": 0.0179, "step": 1031600 }, { "epoch": 0.010317, "grad_norm": 0.15268856287002563, "learning_rate": 1e-05, "loss": 0.0173, "step": 1031700 }, { "epoch": 0.010318, "grad_norm": 0.18037357926368713, "learning_rate": 1e-05, "loss": 0.0175, "step": 1031800 }, { "epoch": 0.010319, "grad_norm": 0.16061517596244812, "learning_rate": 1e-05, "loss": 0.0176, "step": 1031900 }, { "epoch": 0.01032, "grad_norm": 0.17663991451263428, "learning_rate": 1e-05, "loss": 0.0174, "step": 1032000 }, { "epoch": 0.010321, "grad_norm": 0.15765678882598877, "learning_rate": 1e-05, "loss": 0.0177, "step": 1032100 }, { "epoch": 0.010322, "grad_norm": 0.15554963052272797, "learning_rate": 1e-05, "loss": 0.0178, "step": 1032200 }, { "epoch": 0.010323, "grad_norm": 0.12958350777626038, "learning_rate": 1e-05, "loss": 0.0173, "step": 1032300 }, { "epoch": 0.010324, "grad_norm": 0.1783933937549591, "learning_rate": 1e-05, "loss": 0.0175, "step": 1032400 }, { "epoch": 0.010325, "grad_norm": 0.1693446785211563, "learning_rate": 1e-05, "loss": 0.0175, "step": 1032500 }, { "epoch": 0.010326, "grad_norm": 0.14653603732585907, "learning_rate": 1e-05, "loss": 0.0179, "step": 1032600 }, { "epoch": 0.010327, "grad_norm": 0.15547038614749908, "learning_rate": 1e-05, "loss": 0.0173, "step": 1032700 }, { "epoch": 0.010328, "grad_norm": 0.15893499553203583, "learning_rate": 1e-05, "loss": 0.0177, "step": 1032800 }, { "epoch": 0.010329, "grad_norm": 0.17921751737594604, "learning_rate": 1e-05, "loss": 0.0174, "step": 1032900 }, { "epoch": 0.01033, "grad_norm": 0.14345772564411163, "learning_rate": 1e-05, "loss": 0.0175, "step": 1033000 }, { "epoch": 0.010331, "grad_norm": 0.12460330128669739, "learning_rate": 1e-05, "loss": 0.0174, "step": 1033100 }, { "epoch": 0.010332, "grad_norm": 0.1252346932888031, "learning_rate": 1e-05, "loss": 0.0174, "step": 1033200 }, { "epoch": 0.010333, "grad_norm": 0.11417403817176819, "learning_rate": 1e-05, "loss": 0.0175, "step": 1033300 }, { "epoch": 0.010334, "grad_norm": 0.11377641558647156, "learning_rate": 1e-05, "loss": 0.0177, "step": 1033400 }, { "epoch": 0.010335, "grad_norm": 0.14557693898677826, "learning_rate": 1e-05, "loss": 0.0173, "step": 1033500 }, { "epoch": 0.010336, "grad_norm": 0.12105031311511993, "learning_rate": 1e-05, "loss": 0.0175, "step": 1033600 }, { "epoch": 0.010337, "grad_norm": 0.15691952407360077, "learning_rate": 1e-05, "loss": 0.0178, "step": 1033700 }, { "epoch": 0.010338, "grad_norm": 0.10338311642408371, "learning_rate": 1e-05, "loss": 0.0175, "step": 1033800 }, { "epoch": 0.010339, "grad_norm": 0.1639430969953537, "learning_rate": 1e-05, "loss": 0.0175, "step": 1033900 }, { "epoch": 0.01034, "grad_norm": 0.21278496086597443, "learning_rate": 1e-05, "loss": 0.0176, "step": 1034000 }, { "epoch": 0.010341, "grad_norm": 0.14229896664619446, "learning_rate": 1e-05, "loss": 0.0172, "step": 1034100 }, { "epoch": 0.010342, "grad_norm": 0.13286440074443817, "learning_rate": 1e-05, "loss": 0.0175, "step": 1034200 }, { "epoch": 0.010343, "grad_norm": 0.13113000988960266, "learning_rate": 1e-05, "loss": 0.0177, "step": 1034300 }, { "epoch": 0.010344, "grad_norm": 0.11687090992927551, "learning_rate": 1e-05, "loss": 0.0176, "step": 1034400 }, { "epoch": 0.010345, "grad_norm": 0.15012167394161224, "learning_rate": 1e-05, "loss": 0.0174, "step": 1034500 }, { "epoch": 0.010346, "grad_norm": 0.14741457998752594, "learning_rate": 1e-05, "loss": 0.0176, "step": 1034600 }, { "epoch": 0.010347, "grad_norm": 0.1615910530090332, "learning_rate": 1e-05, "loss": 0.0175, "step": 1034700 }, { "epoch": 0.010348, "grad_norm": 0.13978075981140137, "learning_rate": 1e-05, "loss": 0.0174, "step": 1034800 }, { "epoch": 0.010349, "grad_norm": 0.1367821991443634, "learning_rate": 1e-05, "loss": 0.0173, "step": 1034900 }, { "epoch": 0.01035, "grad_norm": 0.10796761512756348, "learning_rate": 1e-05, "loss": 0.0173, "step": 1035000 }, { "epoch": 0.010351, "grad_norm": 0.11069205403327942, "learning_rate": 1e-05, "loss": 0.0176, "step": 1035100 }, { "epoch": 0.010352, "grad_norm": 0.1171109601855278, "learning_rate": 1e-05, "loss": 0.0173, "step": 1035200 }, { "epoch": 0.010353, "grad_norm": 0.1358485370874405, "learning_rate": 1e-05, "loss": 0.0172, "step": 1035300 }, { "epoch": 0.010354, "grad_norm": 0.16330265998840332, "learning_rate": 1e-05, "loss": 0.0174, "step": 1035400 }, { "epoch": 0.010355, "grad_norm": 0.13341465592384338, "learning_rate": 1e-05, "loss": 0.0177, "step": 1035500 }, { "epoch": 0.010356, "grad_norm": 0.1763749122619629, "learning_rate": 1e-05, "loss": 0.0176, "step": 1035600 }, { "epoch": 0.010357, "grad_norm": 0.16426655650138855, "learning_rate": 1e-05, "loss": 0.0173, "step": 1035700 }, { "epoch": 0.010358, "grad_norm": 0.11113731563091278, "learning_rate": 1e-05, "loss": 0.0171, "step": 1035800 }, { "epoch": 0.010359, "grad_norm": 0.11484696716070175, "learning_rate": 1e-05, "loss": 0.0178, "step": 1035900 }, { "epoch": 0.01036, "grad_norm": 0.11203271895647049, "learning_rate": 1e-05, "loss": 0.0173, "step": 1036000 }, { "epoch": 0.010361, "grad_norm": 0.1679798811674118, "learning_rate": 1e-05, "loss": 0.0174, "step": 1036100 }, { "epoch": 0.010362, "grad_norm": 0.17455658316612244, "learning_rate": 1e-05, "loss": 0.0175, "step": 1036200 }, { "epoch": 0.010363, "grad_norm": 0.07925804704427719, "learning_rate": 1e-05, "loss": 0.0175, "step": 1036300 }, { "epoch": 0.010364, "grad_norm": 0.13884788751602173, "learning_rate": 1e-05, "loss": 0.0171, "step": 1036400 }, { "epoch": 0.010365, "grad_norm": 0.11854597181081772, "learning_rate": 1e-05, "loss": 0.0173, "step": 1036500 }, { "epoch": 0.010366, "grad_norm": 0.15603503584861755, "learning_rate": 1e-05, "loss": 0.0173, "step": 1036600 }, { "epoch": 0.010367, "grad_norm": 0.15654335916042328, "learning_rate": 1e-05, "loss": 0.0173, "step": 1036700 }, { "epoch": 0.010368, "grad_norm": 0.15269815921783447, "learning_rate": 1e-05, "loss": 0.018, "step": 1036800 }, { "epoch": 0.010369, "grad_norm": 0.2658461928367615, "learning_rate": 1e-05, "loss": 0.0176, "step": 1036900 }, { "epoch": 0.01037, "grad_norm": 0.12896274030208588, "learning_rate": 1e-05, "loss": 0.0173, "step": 1037000 }, { "epoch": 0.010371, "grad_norm": 0.12144401669502258, "learning_rate": 1e-05, "loss": 0.0175, "step": 1037100 }, { "epoch": 0.010372, "grad_norm": 0.24856944382190704, "learning_rate": 1e-05, "loss": 0.0174, "step": 1037200 }, { "epoch": 0.010373, "grad_norm": 0.17754030227661133, "learning_rate": 1e-05, "loss": 0.0179, "step": 1037300 }, { "epoch": 0.010374, "grad_norm": 0.1495322287082672, "learning_rate": 1e-05, "loss": 0.0178, "step": 1037400 }, { "epoch": 0.010375, "grad_norm": 0.16183167695999146, "learning_rate": 1e-05, "loss": 0.0172, "step": 1037500 }, { "epoch": 0.010376, "grad_norm": 0.16674087941646576, "learning_rate": 1e-05, "loss": 0.0174, "step": 1037600 }, { "epoch": 0.010377, "grad_norm": 0.15114524960517883, "learning_rate": 1e-05, "loss": 0.0174, "step": 1037700 }, { "epoch": 0.010378, "grad_norm": 0.15317130088806152, "learning_rate": 1e-05, "loss": 0.0173, "step": 1037800 }, { "epoch": 0.010379, "grad_norm": 0.26734820008277893, "learning_rate": 1e-05, "loss": 0.0174, "step": 1037900 }, { "epoch": 0.01038, "grad_norm": 0.13093949854373932, "learning_rate": 1e-05, "loss": 0.0177, "step": 1038000 }, { "epoch": 0.010381, "grad_norm": 0.17457695305347443, "learning_rate": 1e-05, "loss": 0.0171, "step": 1038100 }, { "epoch": 0.010382, "grad_norm": 0.16390614211559296, "learning_rate": 1e-05, "loss": 0.018, "step": 1038200 }, { "epoch": 0.010383, "grad_norm": 0.1280159056186676, "learning_rate": 1e-05, "loss": 0.0175, "step": 1038300 }, { "epoch": 0.010384, "grad_norm": 0.15936259925365448, "learning_rate": 1e-05, "loss": 0.0174, "step": 1038400 }, { "epoch": 0.010385, "grad_norm": 0.12854497134685516, "learning_rate": 1e-05, "loss": 0.0175, "step": 1038500 }, { "epoch": 0.010386, "grad_norm": 0.16027872264385223, "learning_rate": 1e-05, "loss": 0.0173, "step": 1038600 }, { "epoch": 0.010387, "grad_norm": 0.13926972448825836, "learning_rate": 1e-05, "loss": 0.0177, "step": 1038700 }, { "epoch": 0.010388, "grad_norm": 0.12036896497011185, "learning_rate": 1e-05, "loss": 0.0172, "step": 1038800 }, { "epoch": 0.010389, "grad_norm": 0.14281065762043, "learning_rate": 1e-05, "loss": 0.0176, "step": 1038900 }, { "epoch": 0.01039, "grad_norm": 0.1637876331806183, "learning_rate": 1e-05, "loss": 0.0173, "step": 1039000 }, { "epoch": 0.010391, "grad_norm": 0.15413422882556915, "learning_rate": 1e-05, "loss": 0.0172, "step": 1039100 }, { "epoch": 0.010392, "grad_norm": 0.11329230666160583, "learning_rate": 1e-05, "loss": 0.0173, "step": 1039200 }, { "epoch": 0.010393, "grad_norm": 0.12619423866271973, "learning_rate": 1e-05, "loss": 0.0173, "step": 1039300 }, { "epoch": 0.010394, "grad_norm": 0.12409593164920807, "learning_rate": 1e-05, "loss": 0.0175, "step": 1039400 }, { "epoch": 0.010395, "grad_norm": 0.14829093217849731, "learning_rate": 1e-05, "loss": 0.0171, "step": 1039500 }, { "epoch": 0.010396, "grad_norm": 0.12498482316732407, "learning_rate": 1e-05, "loss": 0.0174, "step": 1039600 }, { "epoch": 0.010397, "grad_norm": 0.11874280869960785, "learning_rate": 1e-05, "loss": 0.0178, "step": 1039700 }, { "epoch": 0.010398, "grad_norm": 0.15854093432426453, "learning_rate": 1e-05, "loss": 0.0168, "step": 1039800 }, { "epoch": 0.010399, "grad_norm": 0.14881226420402527, "learning_rate": 1e-05, "loss": 0.0176, "step": 1039900 }, { "epoch": 0.0104, "grad_norm": 0.14293888211250305, "learning_rate": 1e-05, "loss": 0.0176, "step": 1040000 }, { "epoch": 0.0104, "eval_loss": 0.015368642285466194, "eval_runtime": 189.2554, "eval_samples_per_second": 264.193, "eval_steps_per_second": 16.512, "step": 1040000 }, { "epoch": 0.010401, "grad_norm": 0.13580529391765594, "learning_rate": 1e-05, "loss": 0.0172, "step": 1040100 }, { "epoch": 0.010402, "grad_norm": 0.09819935262203217, "learning_rate": 1e-05, "loss": 0.0177, "step": 1040200 }, { "epoch": 0.010403, "grad_norm": 0.13110040128231049, "learning_rate": 1e-05, "loss": 0.0171, "step": 1040300 }, { "epoch": 0.010404, "grad_norm": 0.13078509271144867, "learning_rate": 1e-05, "loss": 0.0175, "step": 1040400 }, { "epoch": 0.010405, "grad_norm": 0.12224096059799194, "learning_rate": 1e-05, "loss": 0.0175, "step": 1040500 }, { "epoch": 0.010406, "grad_norm": 0.10549431294202805, "learning_rate": 1e-05, "loss": 0.0175, "step": 1040600 }, { "epoch": 0.010407, "grad_norm": 0.12264421582221985, "learning_rate": 1e-05, "loss": 0.0176, "step": 1040700 }, { "epoch": 0.010408, "grad_norm": 0.1282280534505844, "learning_rate": 1e-05, "loss": 0.0174, "step": 1040800 }, { "epoch": 0.010409, "grad_norm": 0.13866712152957916, "learning_rate": 1e-05, "loss": 0.0172, "step": 1040900 }, { "epoch": 0.01041, "grad_norm": 0.14849333465099335, "learning_rate": 1e-05, "loss": 0.0174, "step": 1041000 }, { "epoch": 0.010411, "grad_norm": 0.14052098989486694, "learning_rate": 1e-05, "loss": 0.0174, "step": 1041100 }, { "epoch": 0.010412, "grad_norm": 0.22518737614154816, "learning_rate": 1e-05, "loss": 0.0176, "step": 1041200 }, { "epoch": 0.010413, "grad_norm": 0.1406812071800232, "learning_rate": 1e-05, "loss": 0.0174, "step": 1041300 }, { "epoch": 0.010414, "grad_norm": 0.11507716774940491, "learning_rate": 1e-05, "loss": 0.0179, "step": 1041400 }, { "epoch": 0.010415, "grad_norm": 0.1567552089691162, "learning_rate": 1e-05, "loss": 0.0173, "step": 1041500 }, { "epoch": 0.010416, "grad_norm": 0.12768223881721497, "learning_rate": 1e-05, "loss": 0.0177, "step": 1041600 }, { "epoch": 0.010417, "grad_norm": 0.1468605101108551, "learning_rate": 1e-05, "loss": 0.0176, "step": 1041700 }, { "epoch": 0.010418, "grad_norm": 0.1815379410982132, "learning_rate": 1e-05, "loss": 0.0178, "step": 1041800 }, { "epoch": 0.010419, "grad_norm": 0.15017196536064148, "learning_rate": 1e-05, "loss": 0.0174, "step": 1041900 }, { "epoch": 0.01042, "grad_norm": 0.09092283248901367, "learning_rate": 1e-05, "loss": 0.0173, "step": 1042000 }, { "epoch": 0.010421, "grad_norm": 0.19985665380954742, "learning_rate": 1e-05, "loss": 0.0177, "step": 1042100 }, { "epoch": 0.010422, "grad_norm": 0.13604575395584106, "learning_rate": 1e-05, "loss": 0.0174, "step": 1042200 }, { "epoch": 0.010423, "grad_norm": 0.13606378436088562, "learning_rate": 1e-05, "loss": 0.0173, "step": 1042300 }, { "epoch": 0.010424, "grad_norm": 0.12187448143959045, "learning_rate": 1e-05, "loss": 0.0174, "step": 1042400 }, { "epoch": 0.010425, "grad_norm": 0.11448171734809875, "learning_rate": 1e-05, "loss": 0.0176, "step": 1042500 }, { "epoch": 0.010426, "grad_norm": 0.18738871812820435, "learning_rate": 1e-05, "loss": 0.0173, "step": 1042600 }, { "epoch": 0.010427, "grad_norm": 0.15706346929073334, "learning_rate": 1e-05, "loss": 0.0172, "step": 1042700 }, { "epoch": 0.010428, "grad_norm": 0.1378595232963562, "learning_rate": 1e-05, "loss": 0.0173, "step": 1042800 }, { "epoch": 0.010429, "grad_norm": 0.16721142828464508, "learning_rate": 1e-05, "loss": 0.0174, "step": 1042900 }, { "epoch": 0.01043, "grad_norm": 0.11430317908525467, "learning_rate": 1e-05, "loss": 0.0174, "step": 1043000 }, { "epoch": 0.010431, "grad_norm": 0.14520157873630524, "learning_rate": 1e-05, "loss": 0.0174, "step": 1043100 }, { "epoch": 0.010432, "grad_norm": 0.17685256898403168, "learning_rate": 1e-05, "loss": 0.0174, "step": 1043200 }, { "epoch": 0.010433, "grad_norm": 0.1102147027850151, "learning_rate": 1e-05, "loss": 0.0173, "step": 1043300 }, { "epoch": 0.010434, "grad_norm": 0.12426308542490005, "learning_rate": 1e-05, "loss": 0.0176, "step": 1043400 }, { "epoch": 0.010435, "grad_norm": 0.11751005053520203, "learning_rate": 1e-05, "loss": 0.0177, "step": 1043500 }, { "epoch": 0.010436, "grad_norm": 0.14220470190048218, "learning_rate": 1e-05, "loss": 0.0174, "step": 1043600 }, { "epoch": 0.010437, "grad_norm": 0.1406431943178177, "learning_rate": 1e-05, "loss": 0.0175, "step": 1043700 }, { "epoch": 0.010438, "grad_norm": 0.15251487493515015, "learning_rate": 1e-05, "loss": 0.0174, "step": 1043800 }, { "epoch": 0.010439, "grad_norm": 0.2027580738067627, "learning_rate": 1e-05, "loss": 0.0172, "step": 1043900 }, { "epoch": 0.01044, "grad_norm": 0.17310121655464172, "learning_rate": 1e-05, "loss": 0.0177, "step": 1044000 }, { "epoch": 0.010441, "grad_norm": 0.1344534009695053, "learning_rate": 1e-05, "loss": 0.0176, "step": 1044100 }, { "epoch": 0.010442, "grad_norm": 0.13319924473762512, "learning_rate": 1e-05, "loss": 0.0171, "step": 1044200 }, { "epoch": 0.010443, "grad_norm": 0.1750355362892151, "learning_rate": 1e-05, "loss": 0.0178, "step": 1044300 }, { "epoch": 0.010444, "grad_norm": 0.10491763055324554, "learning_rate": 1e-05, "loss": 0.0172, "step": 1044400 }, { "epoch": 0.010445, "grad_norm": 0.1847921460866928, "learning_rate": 1e-05, "loss": 0.0175, "step": 1044500 }, { "epoch": 0.010446, "grad_norm": 0.1762174516916275, "learning_rate": 1e-05, "loss": 0.0173, "step": 1044600 }, { "epoch": 0.010447, "grad_norm": 0.16125527024269104, "learning_rate": 1e-05, "loss": 0.0175, "step": 1044700 }, { "epoch": 0.010448, "grad_norm": 0.13327910006046295, "learning_rate": 1e-05, "loss": 0.0175, "step": 1044800 }, { "epoch": 0.010449, "grad_norm": 0.16857805848121643, "learning_rate": 1e-05, "loss": 0.0172, "step": 1044900 }, { "epoch": 0.01045, "grad_norm": 0.13828708231449127, "learning_rate": 1e-05, "loss": 0.0178, "step": 1045000 }, { "epoch": 0.010451, "grad_norm": 0.19947895407676697, "learning_rate": 1e-05, "loss": 0.0174, "step": 1045100 }, { "epoch": 0.010452, "grad_norm": 0.14453630149364471, "learning_rate": 1e-05, "loss": 0.0174, "step": 1045200 }, { "epoch": 0.010453, "grad_norm": 0.1650848537683487, "learning_rate": 1e-05, "loss": 0.0176, "step": 1045300 }, { "epoch": 0.010454, "grad_norm": 0.12582989037036896, "learning_rate": 1e-05, "loss": 0.0173, "step": 1045400 }, { "epoch": 0.010455, "grad_norm": 0.12453660368919373, "learning_rate": 1e-05, "loss": 0.0174, "step": 1045500 }, { "epoch": 0.010456, "grad_norm": 0.1184794008731842, "learning_rate": 1e-05, "loss": 0.0173, "step": 1045600 }, { "epoch": 0.010457, "grad_norm": 0.13146625459194183, "learning_rate": 1e-05, "loss": 0.0179, "step": 1045700 }, { "epoch": 0.010458, "grad_norm": 0.14135707914829254, "learning_rate": 1e-05, "loss": 0.0171, "step": 1045800 }, { "epoch": 0.010459, "grad_norm": 0.14210845530033112, "learning_rate": 1e-05, "loss": 0.0176, "step": 1045900 }, { "epoch": 0.01046, "grad_norm": 0.12389176338911057, "learning_rate": 1e-05, "loss": 0.0172, "step": 1046000 }, { "epoch": 0.010461, "grad_norm": 0.16718535125255585, "learning_rate": 1e-05, "loss": 0.017, "step": 1046100 }, { "epoch": 0.010462, "grad_norm": 0.130445659160614, "learning_rate": 1e-05, "loss": 0.0174, "step": 1046200 }, { "epoch": 0.010463, "grad_norm": 0.10770788788795471, "learning_rate": 1e-05, "loss": 0.0176, "step": 1046300 }, { "epoch": 0.010464, "grad_norm": 0.12374358624219894, "learning_rate": 1e-05, "loss": 0.0177, "step": 1046400 }, { "epoch": 0.010465, "grad_norm": 0.1530674695968628, "learning_rate": 1e-05, "loss": 0.0177, "step": 1046500 }, { "epoch": 0.010466, "grad_norm": 0.1289099156856537, "learning_rate": 1e-05, "loss": 0.0177, "step": 1046600 }, { "epoch": 0.010467, "grad_norm": 0.13210482895374298, "learning_rate": 1e-05, "loss": 0.0174, "step": 1046700 }, { "epoch": 0.010468, "grad_norm": 0.16716860234737396, "learning_rate": 1e-05, "loss": 0.0175, "step": 1046800 }, { "epoch": 0.010469, "grad_norm": 0.11164829134941101, "learning_rate": 1e-05, "loss": 0.0174, "step": 1046900 }, { "epoch": 0.01047, "grad_norm": 0.17741140723228455, "learning_rate": 1e-05, "loss": 0.0173, "step": 1047000 }, { "epoch": 0.010471, "grad_norm": 0.15623873472213745, "learning_rate": 1e-05, "loss": 0.0175, "step": 1047100 }, { "epoch": 0.010472, "grad_norm": 0.1849936842918396, "learning_rate": 1e-05, "loss": 0.0177, "step": 1047200 }, { "epoch": 0.010473, "grad_norm": 0.12719640135765076, "learning_rate": 1e-05, "loss": 0.0176, "step": 1047300 }, { "epoch": 0.010474, "grad_norm": 0.12879854440689087, "learning_rate": 1e-05, "loss": 0.0175, "step": 1047400 }, { "epoch": 0.010475, "grad_norm": 0.12092121690511703, "learning_rate": 1e-05, "loss": 0.0174, "step": 1047500 }, { "epoch": 0.010476, "grad_norm": 0.12515266239643097, "learning_rate": 1e-05, "loss": 0.018, "step": 1047600 }, { "epoch": 0.010477, "grad_norm": 0.10206804424524307, "learning_rate": 1e-05, "loss": 0.0169, "step": 1047700 }, { "epoch": 0.010478, "grad_norm": 0.12194743752479553, "learning_rate": 1e-05, "loss": 0.0174, "step": 1047800 }, { "epoch": 0.010479, "grad_norm": 0.14760752022266388, "learning_rate": 1e-05, "loss": 0.0175, "step": 1047900 }, { "epoch": 0.01048, "grad_norm": 0.11591548472642899, "learning_rate": 1e-05, "loss": 0.0171, "step": 1048000 }, { "epoch": 0.010481, "grad_norm": 0.13478291034698486, "learning_rate": 1e-05, "loss": 0.0174, "step": 1048100 }, { "epoch": 0.010482, "grad_norm": 0.1388506293296814, "learning_rate": 1e-05, "loss": 0.0175, "step": 1048200 }, { "epoch": 0.010483, "grad_norm": 0.11157665401697159, "learning_rate": 1e-05, "loss": 0.0171, "step": 1048300 }, { "epoch": 0.010484, "grad_norm": 0.11446770280599594, "learning_rate": 1e-05, "loss": 0.0174, "step": 1048400 }, { "epoch": 0.010485, "grad_norm": 0.1232234463095665, "learning_rate": 1e-05, "loss": 0.0173, "step": 1048500 }, { "epoch": 0.010486, "grad_norm": 0.12477286905050278, "learning_rate": 1e-05, "loss": 0.0176, "step": 1048600 }, { "epoch": 0.010487, "grad_norm": 0.1230551078915596, "learning_rate": 1e-05, "loss": 0.0174, "step": 1048700 }, { "epoch": 0.010488, "grad_norm": 0.11607471108436584, "learning_rate": 1e-05, "loss": 0.0175, "step": 1048800 }, { "epoch": 0.010489, "grad_norm": 0.1042313501238823, "learning_rate": 1e-05, "loss": 0.0177, "step": 1048900 }, { "epoch": 0.01049, "grad_norm": 0.1402939260005951, "learning_rate": 1e-05, "loss": 0.018, "step": 1049000 }, { "epoch": 0.010491, "grad_norm": 0.15108722448349, "learning_rate": 1e-05, "loss": 0.018, "step": 1049100 }, { "epoch": 0.010492, "grad_norm": 0.18152651190757751, "learning_rate": 1e-05, "loss": 0.0172, "step": 1049200 }, { "epoch": 0.010493, "grad_norm": 0.12755745649337769, "learning_rate": 1e-05, "loss": 0.0174, "step": 1049300 }, { "epoch": 0.010494, "grad_norm": 0.1010243147611618, "learning_rate": 1e-05, "loss": 0.0175, "step": 1049400 }, { "epoch": 0.010495, "grad_norm": 0.14523595571517944, "learning_rate": 1e-05, "loss": 0.0172, "step": 1049500 }, { "epoch": 0.010496, "grad_norm": 0.14023351669311523, "learning_rate": 1e-05, "loss": 0.017, "step": 1049600 }, { "epoch": 0.010497, "grad_norm": 0.13812756538391113, "learning_rate": 1e-05, "loss": 0.0173, "step": 1049700 }, { "epoch": 0.010498, "grad_norm": 0.17001047730445862, "learning_rate": 1e-05, "loss": 0.0175, "step": 1049800 }, { "epoch": 0.010499, "grad_norm": 0.16609716415405273, "learning_rate": 1e-05, "loss": 0.0175, "step": 1049900 }, { "epoch": 0.0105, "grad_norm": 0.14568237960338593, "learning_rate": 1e-05, "loss": 0.0174, "step": 1050000 }, { "epoch": 0.010501, "grad_norm": 0.14112083613872528, "learning_rate": 1e-05, "loss": 0.0168, "step": 1050100 }, { "epoch": 0.010502, "grad_norm": 0.12644249200820923, "learning_rate": 1e-05, "loss": 0.017, "step": 1050200 }, { "epoch": 0.010503, "grad_norm": 0.14562413096427917, "learning_rate": 1e-05, "loss": 0.0177, "step": 1050300 }, { "epoch": 0.010504, "grad_norm": 0.1601952165365219, "learning_rate": 1e-05, "loss": 0.0174, "step": 1050400 }, { "epoch": 0.010505, "grad_norm": 0.13049687445163727, "learning_rate": 1e-05, "loss": 0.0176, "step": 1050500 }, { "epoch": 0.010506, "grad_norm": 0.10640658438205719, "learning_rate": 1e-05, "loss": 0.0175, "step": 1050600 }, { "epoch": 0.010507, "grad_norm": 0.1757018268108368, "learning_rate": 1e-05, "loss": 0.0173, "step": 1050700 }, { "epoch": 0.010508, "grad_norm": 0.17482681572437286, "learning_rate": 1e-05, "loss": 0.0175, "step": 1050800 }, { "epoch": 0.010509, "grad_norm": 0.14726047217845917, "learning_rate": 1e-05, "loss": 0.0177, "step": 1050900 }, { "epoch": 0.01051, "grad_norm": 0.12647323310375214, "learning_rate": 1e-05, "loss": 0.0175, "step": 1051000 }, { "epoch": 0.010511, "grad_norm": 0.12052825093269348, "learning_rate": 1e-05, "loss": 0.0169, "step": 1051100 }, { "epoch": 0.010512, "grad_norm": 0.16239094734191895, "learning_rate": 1e-05, "loss": 0.0175, "step": 1051200 }, { "epoch": 0.010513, "grad_norm": 0.15146537125110626, "learning_rate": 1e-05, "loss": 0.0175, "step": 1051300 }, { "epoch": 0.010514, "grad_norm": 0.1346597671508789, "learning_rate": 1e-05, "loss": 0.0171, "step": 1051400 }, { "epoch": 0.010515, "grad_norm": 0.1468886137008667, "learning_rate": 1e-05, "loss": 0.0175, "step": 1051500 }, { "epoch": 0.010516, "grad_norm": 0.1264266073703766, "learning_rate": 1e-05, "loss": 0.0172, "step": 1051600 }, { "epoch": 0.010517, "grad_norm": 0.1480812281370163, "learning_rate": 1e-05, "loss": 0.0172, "step": 1051700 }, { "epoch": 0.010518, "grad_norm": 0.16107264161109924, "learning_rate": 1e-05, "loss": 0.0171, "step": 1051800 }, { "epoch": 0.010519, "grad_norm": 0.10910248011350632, "learning_rate": 1e-05, "loss": 0.0173, "step": 1051900 }, { "epoch": 0.01052, "grad_norm": 0.14812889695167542, "learning_rate": 1e-05, "loss": 0.018, "step": 1052000 }, { "epoch": 0.010521, "grad_norm": 0.136376291513443, "learning_rate": 1e-05, "loss": 0.0174, "step": 1052100 }, { "epoch": 0.010522, "grad_norm": 0.12796570360660553, "learning_rate": 1e-05, "loss": 0.0172, "step": 1052200 }, { "epoch": 0.010523, "grad_norm": 0.13378475606441498, "learning_rate": 1e-05, "loss": 0.0171, "step": 1052300 }, { "epoch": 0.010524, "grad_norm": 0.11981290578842163, "learning_rate": 1e-05, "loss": 0.0174, "step": 1052400 }, { "epoch": 0.010525, "grad_norm": 0.16338811814785004, "learning_rate": 1e-05, "loss": 0.0175, "step": 1052500 }, { "epoch": 0.010526, "grad_norm": 0.21461904048919678, "learning_rate": 1e-05, "loss": 0.0172, "step": 1052600 }, { "epoch": 0.010527, "grad_norm": 0.12773075699806213, "learning_rate": 1e-05, "loss": 0.0174, "step": 1052700 }, { "epoch": 0.010528, "grad_norm": 0.15571141242980957, "learning_rate": 1e-05, "loss": 0.0173, "step": 1052800 }, { "epoch": 0.010529, "grad_norm": 0.13842815160751343, "learning_rate": 1e-05, "loss": 0.0175, "step": 1052900 }, { "epoch": 0.01053, "grad_norm": 0.11295677721500397, "learning_rate": 1e-05, "loss": 0.0174, "step": 1053000 }, { "epoch": 0.010531, "grad_norm": 0.1390824019908905, "learning_rate": 1e-05, "loss": 0.0181, "step": 1053100 }, { "epoch": 0.010532, "grad_norm": 0.12731394171714783, "learning_rate": 1e-05, "loss": 0.0172, "step": 1053200 }, { "epoch": 0.010533, "grad_norm": 0.15064309537410736, "learning_rate": 1e-05, "loss": 0.0174, "step": 1053300 }, { "epoch": 0.010534, "grad_norm": 0.15762650966644287, "learning_rate": 1e-05, "loss": 0.017, "step": 1053400 }, { "epoch": 0.010535, "grad_norm": 0.10762911289930344, "learning_rate": 1e-05, "loss": 0.0174, "step": 1053500 }, { "epoch": 0.010536, "grad_norm": 0.09981267154216766, "learning_rate": 1e-05, "loss": 0.0174, "step": 1053600 }, { "epoch": 0.010537, "grad_norm": 0.17118598520755768, "learning_rate": 1e-05, "loss": 0.0176, "step": 1053700 }, { "epoch": 0.010538, "grad_norm": 0.1320803165435791, "learning_rate": 1e-05, "loss": 0.0176, "step": 1053800 }, { "epoch": 0.010539, "grad_norm": 0.1266605257987976, "learning_rate": 1e-05, "loss": 0.0173, "step": 1053900 }, { "epoch": 0.01054, "grad_norm": 0.10390046238899231, "learning_rate": 1e-05, "loss": 0.0175, "step": 1054000 }, { "epoch": 0.010541, "grad_norm": 0.12316720187664032, "learning_rate": 1e-05, "loss": 0.0173, "step": 1054100 }, { "epoch": 0.010542, "grad_norm": 0.10728579759597778, "learning_rate": 1e-05, "loss": 0.0178, "step": 1054200 }, { "epoch": 0.010543, "grad_norm": 0.1244812086224556, "learning_rate": 1e-05, "loss": 0.0173, "step": 1054300 }, { "epoch": 0.010544, "grad_norm": 0.16185104846954346, "learning_rate": 1e-05, "loss": 0.0171, "step": 1054400 }, { "epoch": 0.010545, "grad_norm": 0.1483929306268692, "learning_rate": 1e-05, "loss": 0.0171, "step": 1054500 }, { "epoch": 0.010546, "grad_norm": 0.15555277466773987, "learning_rate": 1e-05, "loss": 0.0174, "step": 1054600 }, { "epoch": 0.010547, "grad_norm": 0.13011476397514343, "learning_rate": 1e-05, "loss": 0.0174, "step": 1054700 }, { "epoch": 0.010548, "grad_norm": 0.11427925527095795, "learning_rate": 1e-05, "loss": 0.0178, "step": 1054800 }, { "epoch": 0.010549, "grad_norm": 0.13193416595458984, "learning_rate": 1e-05, "loss": 0.0172, "step": 1054900 }, { "epoch": 0.01055, "grad_norm": 0.12113174051046371, "learning_rate": 1e-05, "loss": 0.0173, "step": 1055000 }, { "epoch": 0.010551, "grad_norm": 0.1408478170633316, "learning_rate": 1e-05, "loss": 0.0171, "step": 1055100 }, { "epoch": 0.010552, "grad_norm": 0.13804522156715393, "learning_rate": 1e-05, "loss": 0.0174, "step": 1055200 }, { "epoch": 0.010553, "grad_norm": 0.12379052489995956, "learning_rate": 1e-05, "loss": 0.0174, "step": 1055300 }, { "epoch": 0.010554, "grad_norm": 0.11869920790195465, "learning_rate": 1e-05, "loss": 0.0174, "step": 1055400 }, { "epoch": 0.010555, "grad_norm": 0.19021157920360565, "learning_rate": 1e-05, "loss": 0.0174, "step": 1055500 }, { "epoch": 0.010556, "grad_norm": 0.13680802285671234, "learning_rate": 1e-05, "loss": 0.0175, "step": 1055600 }, { "epoch": 0.010557, "grad_norm": 0.17780929803848267, "learning_rate": 1e-05, "loss": 0.017, "step": 1055700 }, { "epoch": 0.010558, "grad_norm": 0.11672142148017883, "learning_rate": 1e-05, "loss": 0.0175, "step": 1055800 }, { "epoch": 0.010559, "grad_norm": 0.14288438856601715, "learning_rate": 1e-05, "loss": 0.018, "step": 1055900 }, { "epoch": 0.01056, "grad_norm": 0.10897313803434372, "learning_rate": 1e-05, "loss": 0.0173, "step": 1056000 }, { "epoch": 0.010561, "grad_norm": 0.17318663001060486, "learning_rate": 1e-05, "loss": 0.0175, "step": 1056100 }, { "epoch": 0.010562, "grad_norm": 0.1678372621536255, "learning_rate": 1e-05, "loss": 0.0171, "step": 1056200 }, { "epoch": 0.010563, "grad_norm": 0.16312584280967712, "learning_rate": 1e-05, "loss": 0.0176, "step": 1056300 }, { "epoch": 0.010564, "grad_norm": 0.10887308418750763, "learning_rate": 1e-05, "loss": 0.0171, "step": 1056400 }, { "epoch": 0.010565, "grad_norm": 0.11272864043712616, "learning_rate": 1e-05, "loss": 0.0176, "step": 1056500 }, { "epoch": 0.010566, "grad_norm": 0.13766300678253174, "learning_rate": 1e-05, "loss": 0.0173, "step": 1056600 }, { "epoch": 0.010567, "grad_norm": 0.16155682504177094, "learning_rate": 1e-05, "loss": 0.0173, "step": 1056700 }, { "epoch": 0.010568, "grad_norm": 0.1424289345741272, "learning_rate": 1e-05, "loss": 0.0175, "step": 1056800 }, { "epoch": 0.010569, "grad_norm": 0.14452727138996124, "learning_rate": 1e-05, "loss": 0.0177, "step": 1056900 }, { "epoch": 0.01057, "grad_norm": 0.12253423035144806, "learning_rate": 1e-05, "loss": 0.0176, "step": 1057000 }, { "epoch": 0.010571, "grad_norm": 0.19532278180122375, "learning_rate": 1e-05, "loss": 0.0171, "step": 1057100 }, { "epoch": 0.010572, "grad_norm": 0.1362159252166748, "learning_rate": 1e-05, "loss": 0.0175, "step": 1057200 }, { "epoch": 0.010573, "grad_norm": 0.10967010259628296, "learning_rate": 1e-05, "loss": 0.0175, "step": 1057300 }, { "epoch": 0.010574, "grad_norm": 0.11368265748023987, "learning_rate": 1e-05, "loss": 0.0177, "step": 1057400 }, { "epoch": 0.010575, "grad_norm": 0.10396154224872589, "learning_rate": 1e-05, "loss": 0.0176, "step": 1057500 }, { "epoch": 0.010576, "grad_norm": 0.08038429170846939, "learning_rate": 1e-05, "loss": 0.0178, "step": 1057600 }, { "epoch": 0.010577, "grad_norm": 0.21615161001682281, "learning_rate": 1e-05, "loss": 0.0177, "step": 1057700 }, { "epoch": 0.010578, "grad_norm": 0.1144191175699234, "learning_rate": 1e-05, "loss": 0.0174, "step": 1057800 }, { "epoch": 0.010579, "grad_norm": 0.11580310761928558, "learning_rate": 1e-05, "loss": 0.0173, "step": 1057900 }, { "epoch": 0.01058, "grad_norm": 0.12283748388290405, "learning_rate": 1e-05, "loss": 0.0178, "step": 1058000 }, { "epoch": 0.010581, "grad_norm": 0.1458136886358261, "learning_rate": 1e-05, "loss": 0.0175, "step": 1058100 }, { "epoch": 0.010582, "grad_norm": 0.12495569884777069, "learning_rate": 1e-05, "loss": 0.0178, "step": 1058200 }, { "epoch": 0.010583, "grad_norm": 0.10149644315242767, "learning_rate": 1e-05, "loss": 0.0175, "step": 1058300 }, { "epoch": 0.010584, "grad_norm": 0.10622517764568329, "learning_rate": 1e-05, "loss": 0.0171, "step": 1058400 }, { "epoch": 0.010585, "grad_norm": 0.13656994700431824, "learning_rate": 1e-05, "loss": 0.0178, "step": 1058500 }, { "epoch": 0.010586, "grad_norm": 0.0998438373208046, "learning_rate": 1e-05, "loss": 0.0172, "step": 1058600 }, { "epoch": 0.010587, "grad_norm": 0.14571861922740936, "learning_rate": 1e-05, "loss": 0.0172, "step": 1058700 }, { "epoch": 0.010588, "grad_norm": 0.192903071641922, "learning_rate": 1e-05, "loss": 0.0176, "step": 1058800 }, { "epoch": 0.010589, "grad_norm": 0.15157076716423035, "learning_rate": 1e-05, "loss": 0.0176, "step": 1058900 }, { "epoch": 0.01059, "grad_norm": 0.15938575565814972, "learning_rate": 1e-05, "loss": 0.0176, "step": 1059000 }, { "epoch": 0.010591, "grad_norm": 0.23804549872875214, "learning_rate": 1e-05, "loss": 0.0173, "step": 1059100 }, { "epoch": 0.010592, "grad_norm": 0.17320606112480164, "learning_rate": 1e-05, "loss": 0.017, "step": 1059200 }, { "epoch": 0.010593, "grad_norm": 0.13806764781475067, "learning_rate": 1e-05, "loss": 0.0171, "step": 1059300 }, { "epoch": 0.010594, "grad_norm": 0.10507871210575104, "learning_rate": 1e-05, "loss": 0.0174, "step": 1059400 }, { "epoch": 0.010595, "grad_norm": 0.1613716036081314, "learning_rate": 1e-05, "loss": 0.0171, "step": 1059500 }, { "epoch": 0.010596, "grad_norm": 0.13601332902908325, "learning_rate": 1e-05, "loss": 0.0173, "step": 1059600 }, { "epoch": 0.010597, "grad_norm": 0.1478947401046753, "learning_rate": 1e-05, "loss": 0.0175, "step": 1059700 }, { "epoch": 0.010598, "grad_norm": 0.12687547504901886, "learning_rate": 1e-05, "loss": 0.0172, "step": 1059800 }, { "epoch": 0.010599, "grad_norm": 0.16448672115802765, "learning_rate": 1e-05, "loss": 0.018, "step": 1059900 }, { "epoch": 0.0106, "grad_norm": 0.1022099182009697, "learning_rate": 1e-05, "loss": 0.017, "step": 1060000 }, { "epoch": 0.0106, "eval_loss": 0.015300117433071136, "eval_runtime": 189.3545, "eval_samples_per_second": 264.055, "eval_steps_per_second": 16.503, "step": 1060000 }, { "epoch": 0.010601, "grad_norm": 0.1914234310388565, "learning_rate": 1e-05, "loss": 0.0173, "step": 1060100 }, { "epoch": 0.010602, "grad_norm": 0.14736506342887878, "learning_rate": 1e-05, "loss": 0.0171, "step": 1060200 }, { "epoch": 0.010603, "grad_norm": 0.1365083009004593, "learning_rate": 1e-05, "loss": 0.0175, "step": 1060300 }, { "epoch": 0.010604, "grad_norm": 0.1593974232673645, "learning_rate": 1e-05, "loss": 0.0173, "step": 1060400 }, { "epoch": 0.010605, "grad_norm": 0.15271779894828796, "learning_rate": 1e-05, "loss": 0.0169, "step": 1060500 }, { "epoch": 0.010606, "grad_norm": 0.14827823638916016, "learning_rate": 1e-05, "loss": 0.0171, "step": 1060600 }, { "epoch": 0.010607, "grad_norm": 0.1270606517791748, "learning_rate": 1e-05, "loss": 0.0174, "step": 1060700 }, { "epoch": 0.010608, "grad_norm": 0.10473182797431946, "learning_rate": 1e-05, "loss": 0.0171, "step": 1060800 }, { "epoch": 0.010609, "grad_norm": 0.1376986801624298, "learning_rate": 1e-05, "loss": 0.0172, "step": 1060900 }, { "epoch": 0.01061, "grad_norm": 0.19425876438617706, "learning_rate": 1e-05, "loss": 0.0174, "step": 1061000 }, { "epoch": 0.010611, "grad_norm": 0.12582992017269135, "learning_rate": 1e-05, "loss": 0.0174, "step": 1061100 }, { "epoch": 0.010612, "grad_norm": 0.17729590833187103, "learning_rate": 1e-05, "loss": 0.0172, "step": 1061200 }, { "epoch": 0.010613, "grad_norm": 0.11482353508472443, "learning_rate": 1e-05, "loss": 0.0172, "step": 1061300 }, { "epoch": 0.010614, "grad_norm": 0.1444118320941925, "learning_rate": 1e-05, "loss": 0.0177, "step": 1061400 }, { "epoch": 0.010615, "grad_norm": 0.12450309097766876, "learning_rate": 1e-05, "loss": 0.0174, "step": 1061500 }, { "epoch": 0.010616, "grad_norm": 0.1506439596414566, "learning_rate": 1e-05, "loss": 0.0173, "step": 1061600 }, { "epoch": 0.010617, "grad_norm": 0.11803510040044785, "learning_rate": 1e-05, "loss": 0.0178, "step": 1061700 }, { "epoch": 0.010618, "grad_norm": 0.10913629829883575, "learning_rate": 1e-05, "loss": 0.0173, "step": 1061800 }, { "epoch": 0.010619, "grad_norm": 0.1605820208787918, "learning_rate": 1e-05, "loss": 0.0171, "step": 1061900 }, { "epoch": 0.01062, "grad_norm": 0.17894940078258514, "learning_rate": 1e-05, "loss": 0.0174, "step": 1062000 }, { "epoch": 0.010621, "grad_norm": 0.14812275767326355, "learning_rate": 1e-05, "loss": 0.017, "step": 1062100 }, { "epoch": 0.010622, "grad_norm": 0.10608285665512085, "learning_rate": 1e-05, "loss": 0.0175, "step": 1062200 }, { "epoch": 0.010623, "grad_norm": 0.17750461399555206, "learning_rate": 1e-05, "loss": 0.0165, "step": 1062300 }, { "epoch": 0.010624, "grad_norm": 0.14720496535301208, "learning_rate": 1e-05, "loss": 0.0171, "step": 1062400 }, { "epoch": 0.010625, "grad_norm": 0.12365583330392838, "learning_rate": 1e-05, "loss": 0.0174, "step": 1062500 }, { "epoch": 0.010626, "grad_norm": 0.12131417542695999, "learning_rate": 1e-05, "loss": 0.0171, "step": 1062600 }, { "epoch": 0.010627, "grad_norm": 0.15227176249027252, "learning_rate": 1e-05, "loss": 0.0175, "step": 1062700 }, { "epoch": 0.010628, "grad_norm": 0.16861988604068756, "learning_rate": 1e-05, "loss": 0.0172, "step": 1062800 }, { "epoch": 0.010629, "grad_norm": 0.1620737463235855, "learning_rate": 1e-05, "loss": 0.0174, "step": 1062900 }, { "epoch": 0.01063, "grad_norm": 0.16171739995479584, "learning_rate": 1e-05, "loss": 0.0176, "step": 1063000 }, { "epoch": 0.010631, "grad_norm": 0.14427241683006287, "learning_rate": 1e-05, "loss": 0.0173, "step": 1063100 }, { "epoch": 0.010632, "grad_norm": 0.12708748877048492, "learning_rate": 1e-05, "loss": 0.0174, "step": 1063200 }, { "epoch": 0.010633, "grad_norm": 0.12943580746650696, "learning_rate": 1e-05, "loss": 0.0176, "step": 1063300 }, { "epoch": 0.010634, "grad_norm": 0.1376064270734787, "learning_rate": 1e-05, "loss": 0.0175, "step": 1063400 }, { "epoch": 0.010635, "grad_norm": 0.12757781147956848, "learning_rate": 1e-05, "loss": 0.0172, "step": 1063500 }, { "epoch": 0.010636, "grad_norm": 0.15691637992858887, "learning_rate": 1e-05, "loss": 0.0173, "step": 1063600 }, { "epoch": 0.010637, "grad_norm": 0.19281776249408722, "learning_rate": 1e-05, "loss": 0.0171, "step": 1063700 }, { "epoch": 0.010638, "grad_norm": 0.21174567937850952, "learning_rate": 1e-05, "loss": 0.0177, "step": 1063800 }, { "epoch": 0.010639, "grad_norm": 0.1540903002023697, "learning_rate": 1e-05, "loss": 0.0175, "step": 1063900 }, { "epoch": 0.01064, "grad_norm": 0.16283611953258514, "learning_rate": 1e-05, "loss": 0.0174, "step": 1064000 }, { "epoch": 0.010641, "grad_norm": 0.15131883323192596, "learning_rate": 1e-05, "loss": 0.0179, "step": 1064100 }, { "epoch": 0.010642, "grad_norm": 0.1266225129365921, "learning_rate": 1e-05, "loss": 0.0176, "step": 1064200 }, { "epoch": 0.010643, "grad_norm": 0.2297109067440033, "learning_rate": 1e-05, "loss": 0.0174, "step": 1064300 }, { "epoch": 0.010644, "grad_norm": 0.13194406032562256, "learning_rate": 1e-05, "loss": 0.0174, "step": 1064400 }, { "epoch": 0.010645, "grad_norm": 0.15993112325668335, "learning_rate": 1e-05, "loss": 0.0173, "step": 1064500 }, { "epoch": 0.010646, "grad_norm": 0.18062075972557068, "learning_rate": 1e-05, "loss": 0.0175, "step": 1064600 }, { "epoch": 0.010647, "grad_norm": 0.17852173745632172, "learning_rate": 1e-05, "loss": 0.0176, "step": 1064700 }, { "epoch": 0.010648, "grad_norm": 0.13789749145507812, "learning_rate": 1e-05, "loss": 0.017, "step": 1064800 }, { "epoch": 0.010649, "grad_norm": 0.19738918542861938, "learning_rate": 1e-05, "loss": 0.0174, "step": 1064900 }, { "epoch": 0.01065, "grad_norm": 0.13456831872463226, "learning_rate": 1e-05, "loss": 0.017, "step": 1065000 }, { "epoch": 0.010651, "grad_norm": 0.16378338634967804, "learning_rate": 1e-05, "loss": 0.0171, "step": 1065100 }, { "epoch": 0.010652, "grad_norm": 0.13614389300346375, "learning_rate": 1e-05, "loss": 0.0174, "step": 1065200 }, { "epoch": 0.010653, "grad_norm": 0.14385917782783508, "learning_rate": 1e-05, "loss": 0.0171, "step": 1065300 }, { "epoch": 0.010654, "grad_norm": 0.12482054531574249, "learning_rate": 1e-05, "loss": 0.0176, "step": 1065400 }, { "epoch": 0.010655, "grad_norm": 0.1825328916311264, "learning_rate": 1e-05, "loss": 0.0172, "step": 1065500 }, { "epoch": 0.010656, "grad_norm": 0.10458457469940186, "learning_rate": 1e-05, "loss": 0.0172, "step": 1065600 }, { "epoch": 0.010657, "grad_norm": 0.1412176936864853, "learning_rate": 1e-05, "loss": 0.0181, "step": 1065700 }, { "epoch": 0.010658, "grad_norm": 0.13363713026046753, "learning_rate": 1e-05, "loss": 0.0172, "step": 1065800 }, { "epoch": 0.010659, "grad_norm": 0.15236639976501465, "learning_rate": 1e-05, "loss": 0.0174, "step": 1065900 }, { "epoch": 0.01066, "grad_norm": 0.09874068945646286, "learning_rate": 1e-05, "loss": 0.0172, "step": 1066000 }, { "epoch": 0.010661, "grad_norm": 0.15078607201576233, "learning_rate": 1e-05, "loss": 0.0168, "step": 1066100 }, { "epoch": 0.010662, "grad_norm": 0.13991853594779968, "learning_rate": 1e-05, "loss": 0.018, "step": 1066200 }, { "epoch": 0.010663, "grad_norm": 0.20126120746135712, "learning_rate": 1e-05, "loss": 0.0175, "step": 1066300 }, { "epoch": 0.010664, "grad_norm": 0.12009242922067642, "learning_rate": 1e-05, "loss": 0.0172, "step": 1066400 }, { "epoch": 0.010665, "grad_norm": 0.14202141761779785, "learning_rate": 1e-05, "loss": 0.0171, "step": 1066500 }, { "epoch": 0.010666, "grad_norm": 0.14733734726905823, "learning_rate": 1e-05, "loss": 0.0172, "step": 1066600 }, { "epoch": 0.010667, "grad_norm": 0.11882024258375168, "learning_rate": 1e-05, "loss": 0.0175, "step": 1066700 }, { "epoch": 0.010668, "grad_norm": 0.13599441945552826, "learning_rate": 1e-05, "loss": 0.0172, "step": 1066800 }, { "epoch": 0.010669, "grad_norm": 0.14530271291732788, "learning_rate": 1e-05, "loss": 0.0173, "step": 1066900 }, { "epoch": 0.01067, "grad_norm": 0.15988706052303314, "learning_rate": 1e-05, "loss": 0.0172, "step": 1067000 }, { "epoch": 0.010671, "grad_norm": 0.14239679276943207, "learning_rate": 1e-05, "loss": 0.0175, "step": 1067100 }, { "epoch": 0.010672, "grad_norm": 0.1767028272151947, "learning_rate": 1e-05, "loss": 0.0176, "step": 1067200 }, { "epoch": 0.010673, "grad_norm": 0.12981949746608734, "learning_rate": 1e-05, "loss": 0.017, "step": 1067300 }, { "epoch": 0.010674, "grad_norm": 0.1661832183599472, "learning_rate": 1e-05, "loss": 0.0172, "step": 1067400 }, { "epoch": 0.010675, "grad_norm": 0.11068834364414215, "learning_rate": 1e-05, "loss": 0.0176, "step": 1067500 }, { "epoch": 0.010676, "grad_norm": 0.167302206158638, "learning_rate": 1e-05, "loss": 0.017, "step": 1067600 }, { "epoch": 0.010677, "grad_norm": 0.14398111402988434, "learning_rate": 1e-05, "loss": 0.0172, "step": 1067700 }, { "epoch": 0.010678, "grad_norm": 0.1664695292711258, "learning_rate": 1e-05, "loss": 0.0173, "step": 1067800 }, { "epoch": 0.010679, "grad_norm": 0.13171231746673584, "learning_rate": 1e-05, "loss": 0.0178, "step": 1067900 }, { "epoch": 0.01068, "grad_norm": 0.13609369099140167, "learning_rate": 1e-05, "loss": 0.0174, "step": 1068000 }, { "epoch": 0.010681, "grad_norm": 0.11731936782598495, "learning_rate": 1e-05, "loss": 0.0171, "step": 1068100 }, { "epoch": 0.010682, "grad_norm": 0.17468702793121338, "learning_rate": 1e-05, "loss": 0.0176, "step": 1068200 }, { "epoch": 0.010683, "grad_norm": 0.1390373706817627, "learning_rate": 1e-05, "loss": 0.0169, "step": 1068300 }, { "epoch": 0.010684, "grad_norm": 0.1501939594745636, "learning_rate": 1e-05, "loss": 0.0173, "step": 1068400 }, { "epoch": 0.010685, "grad_norm": 0.10719031095504761, "learning_rate": 1e-05, "loss": 0.0176, "step": 1068500 }, { "epoch": 0.010686, "grad_norm": 0.15234015882015228, "learning_rate": 1e-05, "loss": 0.0177, "step": 1068600 }, { "epoch": 0.010687, "grad_norm": 0.2235763818025589, "learning_rate": 1e-05, "loss": 0.0175, "step": 1068700 }, { "epoch": 0.010688, "grad_norm": 0.13081733882427216, "learning_rate": 1e-05, "loss": 0.0174, "step": 1068800 }, { "epoch": 0.010689, "grad_norm": 0.13643459975719452, "learning_rate": 1e-05, "loss": 0.0168, "step": 1068900 }, { "epoch": 0.01069, "grad_norm": 0.15732060372829437, "learning_rate": 1e-05, "loss": 0.017, "step": 1069000 }, { "epoch": 0.010691, "grad_norm": 0.162381112575531, "learning_rate": 1e-05, "loss": 0.0172, "step": 1069100 }, { "epoch": 0.010692, "grad_norm": 0.14816808700561523, "learning_rate": 1e-05, "loss": 0.017, "step": 1069200 }, { "epoch": 0.010693, "grad_norm": 0.14549751579761505, "learning_rate": 1e-05, "loss": 0.0172, "step": 1069300 }, { "epoch": 0.010694, "grad_norm": 0.13593803346157074, "learning_rate": 1e-05, "loss": 0.0172, "step": 1069400 }, { "epoch": 0.010695, "grad_norm": 0.12719644606113434, "learning_rate": 1e-05, "loss": 0.0169, "step": 1069500 }, { "epoch": 0.010696, "grad_norm": 0.11802882701158524, "learning_rate": 1e-05, "loss": 0.0175, "step": 1069600 }, { "epoch": 0.010697, "grad_norm": 0.19529326260089874, "learning_rate": 1e-05, "loss": 0.0176, "step": 1069700 }, { "epoch": 0.010698, "grad_norm": 0.13700349628925323, "learning_rate": 1e-05, "loss": 0.0179, "step": 1069800 }, { "epoch": 0.010699, "grad_norm": 0.17435023188591003, "learning_rate": 1e-05, "loss": 0.0174, "step": 1069900 }, { "epoch": 0.0107, "grad_norm": 0.15389809012413025, "learning_rate": 1e-05, "loss": 0.0174, "step": 1070000 }, { "epoch": 0.010701, "grad_norm": 0.11768143624067307, "learning_rate": 1e-05, "loss": 0.017, "step": 1070100 }, { "epoch": 0.010702, "grad_norm": 0.09940724074840546, "learning_rate": 1e-05, "loss": 0.0171, "step": 1070200 }, { "epoch": 0.010703, "grad_norm": 0.137431800365448, "learning_rate": 1e-05, "loss": 0.0175, "step": 1070300 }, { "epoch": 0.010704, "grad_norm": 0.11600134521722794, "learning_rate": 1e-05, "loss": 0.0175, "step": 1070400 }, { "epoch": 0.010705, "grad_norm": 0.14279569685459137, "learning_rate": 1e-05, "loss": 0.0169, "step": 1070500 }, { "epoch": 0.010706, "grad_norm": 0.14372943341732025, "learning_rate": 1e-05, "loss": 0.0176, "step": 1070600 }, { "epoch": 0.010707, "grad_norm": 0.14175759255886078, "learning_rate": 1e-05, "loss": 0.0171, "step": 1070700 }, { "epoch": 0.010708, "grad_norm": 0.12614548206329346, "learning_rate": 1e-05, "loss": 0.0171, "step": 1070800 }, { "epoch": 0.010709, "grad_norm": 0.18713335692882538, "learning_rate": 1e-05, "loss": 0.0174, "step": 1070900 }, { "epoch": 0.01071, "grad_norm": 0.15668080747127533, "learning_rate": 1e-05, "loss": 0.0176, "step": 1071000 }, { "epoch": 0.010711, "grad_norm": 0.12929114699363708, "learning_rate": 1e-05, "loss": 0.0172, "step": 1071100 }, { "epoch": 0.010712, "grad_norm": 0.14865398406982422, "learning_rate": 1e-05, "loss": 0.0173, "step": 1071200 }, { "epoch": 0.010713, "grad_norm": 0.16570597887039185, "learning_rate": 1e-05, "loss": 0.0176, "step": 1071300 }, { "epoch": 0.010714, "grad_norm": 0.13576583564281464, "learning_rate": 1e-05, "loss": 0.0175, "step": 1071400 }, { "epoch": 0.010715, "grad_norm": 0.11251155287027359, "learning_rate": 1e-05, "loss": 0.017, "step": 1071500 }, { "epoch": 0.010716, "grad_norm": 0.15896672010421753, "learning_rate": 1e-05, "loss": 0.0169, "step": 1071600 }, { "epoch": 0.010717, "grad_norm": 0.1386905312538147, "learning_rate": 1e-05, "loss": 0.0172, "step": 1071700 }, { "epoch": 0.010718, "grad_norm": 0.1261800080537796, "learning_rate": 1e-05, "loss": 0.0174, "step": 1071800 }, { "epoch": 0.010719, "grad_norm": 0.19244256615638733, "learning_rate": 1e-05, "loss": 0.0174, "step": 1071900 }, { "epoch": 0.01072, "grad_norm": 0.17344562709331512, "learning_rate": 1e-05, "loss": 0.017, "step": 1072000 }, { "epoch": 0.010721, "grad_norm": 0.2646280825138092, "learning_rate": 1e-05, "loss": 0.0175, "step": 1072100 }, { "epoch": 0.010722, "grad_norm": 0.19306744635105133, "learning_rate": 1e-05, "loss": 0.0175, "step": 1072200 }, { "epoch": 0.010723, "grad_norm": 0.12044854462146759, "learning_rate": 1e-05, "loss": 0.0175, "step": 1072300 }, { "epoch": 0.010724, "grad_norm": 0.1251671016216278, "learning_rate": 1e-05, "loss": 0.0172, "step": 1072400 }, { "epoch": 0.010725, "grad_norm": 0.14582759141921997, "learning_rate": 1e-05, "loss": 0.0173, "step": 1072500 }, { "epoch": 0.010726, "grad_norm": 0.1175781860947609, "learning_rate": 1e-05, "loss": 0.0176, "step": 1072600 }, { "epoch": 0.010727, "grad_norm": 0.10493814945220947, "learning_rate": 1e-05, "loss": 0.017, "step": 1072700 }, { "epoch": 0.010728, "grad_norm": 0.19804684817790985, "learning_rate": 1e-05, "loss": 0.0173, "step": 1072800 }, { "epoch": 0.010729, "grad_norm": 0.15995366871356964, "learning_rate": 1e-05, "loss": 0.0174, "step": 1072900 }, { "epoch": 0.01073, "grad_norm": 0.14134448766708374, "learning_rate": 1e-05, "loss": 0.0172, "step": 1073000 }, { "epoch": 0.010731, "grad_norm": 0.11299267411231995, "learning_rate": 1e-05, "loss": 0.017, "step": 1073100 }, { "epoch": 0.010732, "grad_norm": 0.16824088990688324, "learning_rate": 1e-05, "loss": 0.0176, "step": 1073200 }, { "epoch": 0.010733, "grad_norm": 0.1448025107383728, "learning_rate": 1e-05, "loss": 0.0173, "step": 1073300 }, { "epoch": 0.010734, "grad_norm": 0.10438265651464462, "learning_rate": 1e-05, "loss": 0.0173, "step": 1073400 }, { "epoch": 0.010735, "grad_norm": 0.16071277856826782, "learning_rate": 1e-05, "loss": 0.0173, "step": 1073500 }, { "epoch": 0.010736, "grad_norm": 0.1353500485420227, "learning_rate": 1e-05, "loss": 0.0173, "step": 1073600 }, { "epoch": 0.010737, "grad_norm": 0.10168413817882538, "learning_rate": 1e-05, "loss": 0.0173, "step": 1073700 }, { "epoch": 0.010738, "grad_norm": 0.114518903195858, "learning_rate": 1e-05, "loss": 0.0173, "step": 1073800 }, { "epoch": 0.010739, "grad_norm": 0.13116154074668884, "learning_rate": 1e-05, "loss": 0.0176, "step": 1073900 }, { "epoch": 0.01074, "grad_norm": 0.1700521558523178, "learning_rate": 1e-05, "loss": 0.0171, "step": 1074000 }, { "epoch": 0.010741, "grad_norm": 0.18371504545211792, "learning_rate": 1e-05, "loss": 0.017, "step": 1074100 }, { "epoch": 0.010742, "grad_norm": 0.1726446896791458, "learning_rate": 1e-05, "loss": 0.0175, "step": 1074200 }, { "epoch": 0.010743, "grad_norm": 0.13175897300243378, "learning_rate": 1e-05, "loss": 0.0176, "step": 1074300 }, { "epoch": 0.010744, "grad_norm": 0.1470295488834381, "learning_rate": 1e-05, "loss": 0.0178, "step": 1074400 }, { "epoch": 0.010745, "grad_norm": 0.17839021980762482, "learning_rate": 1e-05, "loss": 0.0172, "step": 1074500 }, { "epoch": 0.010746, "grad_norm": 0.1531170755624771, "learning_rate": 1e-05, "loss": 0.0172, "step": 1074600 }, { "epoch": 0.010747, "grad_norm": 0.1494293510913849, "learning_rate": 1e-05, "loss": 0.0171, "step": 1074700 }, { "epoch": 0.010748, "grad_norm": 0.14532366394996643, "learning_rate": 1e-05, "loss": 0.0179, "step": 1074800 }, { "epoch": 0.010749, "grad_norm": 0.17337347567081451, "learning_rate": 1e-05, "loss": 0.0174, "step": 1074900 }, { "epoch": 0.01075, "grad_norm": 0.1969897747039795, "learning_rate": 1e-05, "loss": 0.0169, "step": 1075000 }, { "epoch": 0.010751, "grad_norm": 0.10718022286891937, "learning_rate": 1e-05, "loss": 0.0173, "step": 1075100 }, { "epoch": 0.010752, "grad_norm": 0.1617673635482788, "learning_rate": 1e-05, "loss": 0.0177, "step": 1075200 }, { "epoch": 0.010753, "grad_norm": 0.1946047991514206, "learning_rate": 1e-05, "loss": 0.0173, "step": 1075300 }, { "epoch": 0.010754, "grad_norm": 0.21727097034454346, "learning_rate": 1e-05, "loss": 0.017, "step": 1075400 }, { "epoch": 0.010755, "grad_norm": 0.17297257483005524, "learning_rate": 1e-05, "loss": 0.0171, "step": 1075500 }, { "epoch": 0.010756, "grad_norm": 0.13625873625278473, "learning_rate": 1e-05, "loss": 0.0174, "step": 1075600 }, { "epoch": 0.010757, "grad_norm": 0.12764215469360352, "learning_rate": 1e-05, "loss": 0.0173, "step": 1075700 }, { "epoch": 0.010758, "grad_norm": 0.13904501497745514, "learning_rate": 1e-05, "loss": 0.0173, "step": 1075800 }, { "epoch": 0.010759, "grad_norm": 0.1271209418773651, "learning_rate": 1e-05, "loss": 0.0173, "step": 1075900 }, { "epoch": 0.01076, "grad_norm": 0.1218172162771225, "learning_rate": 1e-05, "loss": 0.0174, "step": 1076000 }, { "epoch": 0.010761, "grad_norm": 0.11079609394073486, "learning_rate": 1e-05, "loss": 0.0171, "step": 1076100 }, { "epoch": 0.010762, "grad_norm": 0.1376122236251831, "learning_rate": 1e-05, "loss": 0.0168, "step": 1076200 }, { "epoch": 0.010763, "grad_norm": 0.16553722321987152, "learning_rate": 1e-05, "loss": 0.0172, "step": 1076300 }, { "epoch": 0.010764, "grad_norm": 0.13416770100593567, "learning_rate": 1e-05, "loss": 0.0172, "step": 1076400 }, { "epoch": 0.010765, "grad_norm": 0.2710491120815277, "learning_rate": 1e-05, "loss": 0.0169, "step": 1076500 }, { "epoch": 0.010766, "grad_norm": 0.11283021420240402, "learning_rate": 1e-05, "loss": 0.0167, "step": 1076600 }, { "epoch": 0.010767, "grad_norm": 0.13322138786315918, "learning_rate": 1e-05, "loss": 0.0172, "step": 1076700 }, { "epoch": 0.010768, "grad_norm": 0.19792330265045166, "learning_rate": 1e-05, "loss": 0.0175, "step": 1076800 }, { "epoch": 0.010769, "grad_norm": 0.09551171958446503, "learning_rate": 1e-05, "loss": 0.017, "step": 1076900 }, { "epoch": 0.01077, "grad_norm": 0.13088113069534302, "learning_rate": 1e-05, "loss": 0.0171, "step": 1077000 }, { "epoch": 0.010771, "grad_norm": 0.16498799622058868, "learning_rate": 1e-05, "loss": 0.0168, "step": 1077100 }, { "epoch": 0.010772, "grad_norm": 0.14131101965904236, "learning_rate": 1e-05, "loss": 0.0176, "step": 1077200 }, { "epoch": 0.010773, "grad_norm": 0.1434866040945053, "learning_rate": 1e-05, "loss": 0.0172, "step": 1077300 }, { "epoch": 0.010774, "grad_norm": 0.19087544083595276, "learning_rate": 1e-05, "loss": 0.0171, "step": 1077400 }, { "epoch": 0.010775, "grad_norm": 0.18021297454833984, "learning_rate": 1e-05, "loss": 0.0173, "step": 1077500 }, { "epoch": 0.010776, "grad_norm": 0.1502736657857895, "learning_rate": 1e-05, "loss": 0.0173, "step": 1077600 }, { "epoch": 0.010777, "grad_norm": 0.10782192647457123, "learning_rate": 1e-05, "loss": 0.0169, "step": 1077700 }, { "epoch": 0.010778, "grad_norm": 0.12479464709758759, "learning_rate": 1e-05, "loss": 0.0172, "step": 1077800 }, { "epoch": 0.010779, "grad_norm": 0.17510761320590973, "learning_rate": 1e-05, "loss": 0.0175, "step": 1077900 }, { "epoch": 0.01078, "grad_norm": 0.13771387934684753, "learning_rate": 1e-05, "loss": 0.0174, "step": 1078000 }, { "epoch": 0.010781, "grad_norm": 0.17351606488227844, "learning_rate": 1e-05, "loss": 0.0173, "step": 1078100 }, { "epoch": 0.010782, "grad_norm": 0.1653687208890915, "learning_rate": 1e-05, "loss": 0.0172, "step": 1078200 }, { "epoch": 0.010783, "grad_norm": 0.14525292813777924, "learning_rate": 1e-05, "loss": 0.0173, "step": 1078300 }, { "epoch": 0.010784, "grad_norm": 0.13921789824962616, "learning_rate": 1e-05, "loss": 0.0172, "step": 1078400 }, { "epoch": 0.010785, "grad_norm": 0.1350422501564026, "learning_rate": 1e-05, "loss": 0.0169, "step": 1078500 }, { "epoch": 0.010786, "grad_norm": 0.14318601787090302, "learning_rate": 1e-05, "loss": 0.0173, "step": 1078600 }, { "epoch": 0.010787, "grad_norm": 0.15197736024856567, "learning_rate": 1e-05, "loss": 0.0171, "step": 1078700 }, { "epoch": 0.010788, "grad_norm": 0.19261205196380615, "learning_rate": 1e-05, "loss": 0.0172, "step": 1078800 }, { "epoch": 0.010789, "grad_norm": 0.13564424216747284, "learning_rate": 1e-05, "loss": 0.0174, "step": 1078900 }, { "epoch": 0.01079, "grad_norm": 0.10419803112745285, "learning_rate": 1e-05, "loss": 0.0169, "step": 1079000 }, { "epoch": 0.010791, "grad_norm": 0.14431774616241455, "learning_rate": 1e-05, "loss": 0.0172, "step": 1079100 }, { "epoch": 0.010792, "grad_norm": 0.18477605283260345, "learning_rate": 1e-05, "loss": 0.0173, "step": 1079200 }, { "epoch": 0.010793, "grad_norm": 0.18673229217529297, "learning_rate": 1e-05, "loss": 0.0169, "step": 1079300 }, { "epoch": 0.010794, "grad_norm": 0.15092137455940247, "learning_rate": 1e-05, "loss": 0.0173, "step": 1079400 }, { "epoch": 0.010795, "grad_norm": 0.1397489458322525, "learning_rate": 1e-05, "loss": 0.0172, "step": 1079500 }, { "epoch": 0.010796, "grad_norm": 0.12334859371185303, "learning_rate": 1e-05, "loss": 0.0175, "step": 1079600 }, { "epoch": 0.010797, "grad_norm": 0.11384154856204987, "learning_rate": 1e-05, "loss": 0.0175, "step": 1079700 }, { "epoch": 0.010798, "grad_norm": 0.12337977439165115, "learning_rate": 1e-05, "loss": 0.0173, "step": 1079800 }, { "epoch": 0.010799, "grad_norm": 0.11201358586549759, "learning_rate": 1e-05, "loss": 0.0176, "step": 1079900 }, { "epoch": 0.0108, "grad_norm": 0.11204622685909271, "learning_rate": 1e-05, "loss": 0.0172, "step": 1080000 }, { "epoch": 0.0108, "eval_loss": 0.01503554917871952, "eval_runtime": 185.4316, "eval_samples_per_second": 269.641, "eval_steps_per_second": 16.853, "step": 1080000 }, { "epoch": 0.010801, "grad_norm": 0.13304023444652557, "learning_rate": 1e-05, "loss": 0.0171, "step": 1080100 }, { "epoch": 0.010802, "grad_norm": 0.15855318307876587, "learning_rate": 1e-05, "loss": 0.0175, "step": 1080200 }, { "epoch": 0.010803, "grad_norm": 0.17021867632865906, "learning_rate": 1e-05, "loss": 0.0171, "step": 1080300 }, { "epoch": 0.010804, "grad_norm": 0.21918949484825134, "learning_rate": 1e-05, "loss": 0.017, "step": 1080400 }, { "epoch": 0.010805, "grad_norm": 0.19696612656116486, "learning_rate": 1e-05, "loss": 0.0176, "step": 1080500 }, { "epoch": 0.010806, "grad_norm": 0.12583917379379272, "learning_rate": 1e-05, "loss": 0.0168, "step": 1080600 }, { "epoch": 0.010807, "grad_norm": 0.1619870364665985, "learning_rate": 1e-05, "loss": 0.0173, "step": 1080700 }, { "epoch": 0.010808, "grad_norm": 0.14472341537475586, "learning_rate": 1e-05, "loss": 0.0174, "step": 1080800 }, { "epoch": 0.010809, "grad_norm": 0.12304122000932693, "learning_rate": 1e-05, "loss": 0.017, "step": 1080900 }, { "epoch": 0.01081, "grad_norm": 0.2088502049446106, "learning_rate": 1e-05, "loss": 0.0176, "step": 1081000 }, { "epoch": 0.010811, "grad_norm": 0.10933902114629745, "learning_rate": 1e-05, "loss": 0.0174, "step": 1081100 }, { "epoch": 0.010812, "grad_norm": 0.13717377185821533, "learning_rate": 1e-05, "loss": 0.0175, "step": 1081200 }, { "epoch": 0.010813, "grad_norm": 0.17077921330928802, "learning_rate": 1e-05, "loss": 0.017, "step": 1081300 }, { "epoch": 0.010814, "grad_norm": 0.1262265294790268, "learning_rate": 1e-05, "loss": 0.0168, "step": 1081400 }, { "epoch": 0.010815, "grad_norm": 0.13801002502441406, "learning_rate": 1e-05, "loss": 0.017, "step": 1081500 }, { "epoch": 0.010816, "grad_norm": 0.1286059021949768, "learning_rate": 1e-05, "loss": 0.0176, "step": 1081600 }, { "epoch": 0.010817, "grad_norm": 0.17537128925323486, "learning_rate": 1e-05, "loss": 0.0174, "step": 1081700 }, { "epoch": 0.010818, "grad_norm": 0.21347105503082275, "learning_rate": 1e-05, "loss": 0.0169, "step": 1081800 }, { "epoch": 0.010819, "grad_norm": 0.14944855868816376, "learning_rate": 1e-05, "loss": 0.0172, "step": 1081900 }, { "epoch": 0.01082, "grad_norm": 0.11571269482374191, "learning_rate": 1e-05, "loss": 0.0172, "step": 1082000 }, { "epoch": 0.010821, "grad_norm": 0.1582699716091156, "learning_rate": 1e-05, "loss": 0.0178, "step": 1082100 }, { "epoch": 0.010822, "grad_norm": 0.11505886167287827, "learning_rate": 1e-05, "loss": 0.0171, "step": 1082200 }, { "epoch": 0.010823, "grad_norm": 0.1611747294664383, "learning_rate": 1e-05, "loss": 0.017, "step": 1082300 }, { "epoch": 0.010824, "grad_norm": 0.13131651282310486, "learning_rate": 1e-05, "loss": 0.0172, "step": 1082400 }, { "epoch": 0.010825, "grad_norm": 0.11577179282903671, "learning_rate": 1e-05, "loss": 0.0169, "step": 1082500 }, { "epoch": 0.010826, "grad_norm": 0.14672844111919403, "learning_rate": 1e-05, "loss": 0.0175, "step": 1082600 }, { "epoch": 0.010827, "grad_norm": 0.17634400725364685, "learning_rate": 1e-05, "loss": 0.0177, "step": 1082700 }, { "epoch": 0.010828, "grad_norm": 0.1511344611644745, "learning_rate": 1e-05, "loss": 0.0168, "step": 1082800 }, { "epoch": 0.010829, "grad_norm": 0.09893634915351868, "learning_rate": 1e-05, "loss": 0.0178, "step": 1082900 }, { "epoch": 0.01083, "grad_norm": 0.13689760863780975, "learning_rate": 1e-05, "loss": 0.0175, "step": 1083000 }, { "epoch": 0.010831, "grad_norm": 0.12067126482725143, "learning_rate": 1e-05, "loss": 0.0173, "step": 1083100 }, { "epoch": 0.010832, "grad_norm": 0.16405026614665985, "learning_rate": 1e-05, "loss": 0.017, "step": 1083200 }, { "epoch": 0.010833, "grad_norm": 0.18052497506141663, "learning_rate": 1e-05, "loss": 0.0169, "step": 1083300 }, { "epoch": 0.010834, "grad_norm": 0.12580250203609467, "learning_rate": 1e-05, "loss": 0.0175, "step": 1083400 }, { "epoch": 0.010835, "grad_norm": 0.14806139469146729, "learning_rate": 1e-05, "loss": 0.0176, "step": 1083500 }, { "epoch": 0.010836, "grad_norm": 0.15073969960212708, "learning_rate": 1e-05, "loss": 0.0174, "step": 1083600 }, { "epoch": 0.010837, "grad_norm": 0.15377607941627502, "learning_rate": 1e-05, "loss": 0.0169, "step": 1083700 }, { "epoch": 0.010838, "grad_norm": 0.13288433849811554, "learning_rate": 1e-05, "loss": 0.0174, "step": 1083800 }, { "epoch": 0.010839, "grad_norm": 0.18443776667118073, "learning_rate": 1e-05, "loss": 0.017, "step": 1083900 }, { "epoch": 0.01084, "grad_norm": 0.15282581746578217, "learning_rate": 1e-05, "loss": 0.0172, "step": 1084000 }, { "epoch": 0.010841, "grad_norm": 0.12487483024597168, "learning_rate": 1e-05, "loss": 0.017, "step": 1084100 }, { "epoch": 0.010842, "grad_norm": 0.14768631756305695, "learning_rate": 1e-05, "loss": 0.0174, "step": 1084200 }, { "epoch": 0.010843, "grad_norm": 0.12321928143501282, "learning_rate": 1e-05, "loss": 0.017, "step": 1084300 }, { "epoch": 0.010844, "grad_norm": 0.1272806078195572, "learning_rate": 1e-05, "loss": 0.0177, "step": 1084400 }, { "epoch": 0.010845, "grad_norm": 0.1309462934732437, "learning_rate": 1e-05, "loss": 0.0169, "step": 1084500 }, { "epoch": 0.010846, "grad_norm": 0.16111576557159424, "learning_rate": 1e-05, "loss": 0.0175, "step": 1084600 }, { "epoch": 0.010847, "grad_norm": 0.09331147372722626, "learning_rate": 1e-05, "loss": 0.0172, "step": 1084700 }, { "epoch": 0.010848, "grad_norm": 0.12600713968276978, "learning_rate": 1e-05, "loss": 0.0174, "step": 1084800 }, { "epoch": 0.010849, "grad_norm": 0.11440177261829376, "learning_rate": 1e-05, "loss": 0.0171, "step": 1084900 }, { "epoch": 0.01085, "grad_norm": 0.1353342980146408, "learning_rate": 1e-05, "loss": 0.017, "step": 1085000 }, { "epoch": 0.010851, "grad_norm": 0.09122303128242493, "learning_rate": 1e-05, "loss": 0.017, "step": 1085100 }, { "epoch": 0.010852, "grad_norm": 0.15389195084571838, "learning_rate": 1e-05, "loss": 0.0175, "step": 1085200 }, { "epoch": 0.010853, "grad_norm": 0.11351118981838226, "learning_rate": 1e-05, "loss": 0.0174, "step": 1085300 }, { "epoch": 0.010854, "grad_norm": 0.11524728685617447, "learning_rate": 1e-05, "loss": 0.0173, "step": 1085400 }, { "epoch": 0.010855, "grad_norm": 0.10661421716213226, "learning_rate": 1e-05, "loss": 0.017, "step": 1085500 }, { "epoch": 0.010856, "grad_norm": 0.13403624296188354, "learning_rate": 1e-05, "loss": 0.0174, "step": 1085600 }, { "epoch": 0.010857, "grad_norm": 0.14199113845825195, "learning_rate": 1e-05, "loss": 0.0174, "step": 1085700 }, { "epoch": 0.010858, "grad_norm": 0.13180585205554962, "learning_rate": 1e-05, "loss": 0.0172, "step": 1085800 }, { "epoch": 0.010859, "grad_norm": 0.2215389758348465, "learning_rate": 1e-05, "loss": 0.0172, "step": 1085900 }, { "epoch": 0.01086, "grad_norm": 0.14047206938266754, "learning_rate": 1e-05, "loss": 0.0175, "step": 1086000 }, { "epoch": 0.010861, "grad_norm": 0.11892475187778473, "learning_rate": 1e-05, "loss": 0.0175, "step": 1086100 }, { "epoch": 0.010862, "grad_norm": 0.18811114132404327, "learning_rate": 1e-05, "loss": 0.0171, "step": 1086200 }, { "epoch": 0.010863, "grad_norm": 0.1844596266746521, "learning_rate": 1e-05, "loss": 0.0177, "step": 1086300 }, { "epoch": 0.010864, "grad_norm": 0.22074510157108307, "learning_rate": 1e-05, "loss": 0.0178, "step": 1086400 }, { "epoch": 0.010865, "grad_norm": 0.13417696952819824, "learning_rate": 1e-05, "loss": 0.0168, "step": 1086500 }, { "epoch": 0.010866, "grad_norm": 0.18649770319461823, "learning_rate": 1e-05, "loss": 0.0168, "step": 1086600 }, { "epoch": 0.010867, "grad_norm": 0.11740618944168091, "learning_rate": 1e-05, "loss": 0.0171, "step": 1086700 }, { "epoch": 0.010868, "grad_norm": 0.17026883363723755, "learning_rate": 1e-05, "loss": 0.0169, "step": 1086800 }, { "epoch": 0.010869, "grad_norm": 0.15170517563819885, "learning_rate": 1e-05, "loss": 0.0171, "step": 1086900 }, { "epoch": 0.01087, "grad_norm": 0.10992207378149033, "learning_rate": 1e-05, "loss": 0.0173, "step": 1087000 }, { "epoch": 0.010871, "grad_norm": 0.13282859325408936, "learning_rate": 1e-05, "loss": 0.0173, "step": 1087100 }, { "epoch": 0.010872, "grad_norm": 0.13664624094963074, "learning_rate": 1e-05, "loss": 0.0174, "step": 1087200 }, { "epoch": 0.010873, "grad_norm": 0.15507379174232483, "learning_rate": 1e-05, "loss": 0.0176, "step": 1087300 }, { "epoch": 0.010874, "grad_norm": 0.2256808578968048, "learning_rate": 1e-05, "loss": 0.0167, "step": 1087400 }, { "epoch": 0.010875, "grad_norm": 0.1409025937318802, "learning_rate": 1e-05, "loss": 0.017, "step": 1087500 }, { "epoch": 0.010876, "grad_norm": 0.1601884365081787, "learning_rate": 1e-05, "loss": 0.0177, "step": 1087600 }, { "epoch": 0.010877, "grad_norm": 0.1529698222875595, "learning_rate": 1e-05, "loss": 0.018, "step": 1087700 }, { "epoch": 0.010878, "grad_norm": 0.11555805802345276, "learning_rate": 1e-05, "loss": 0.0172, "step": 1087800 }, { "epoch": 0.010879, "grad_norm": 0.10735109448432922, "learning_rate": 1e-05, "loss": 0.0175, "step": 1087900 }, { "epoch": 0.01088, "grad_norm": 0.15398433804512024, "learning_rate": 1e-05, "loss": 0.0171, "step": 1088000 }, { "epoch": 0.010881, "grad_norm": 0.18778257071971893, "learning_rate": 1e-05, "loss": 0.0171, "step": 1088100 }, { "epoch": 0.010882, "grad_norm": 0.0913759395480156, "learning_rate": 1e-05, "loss": 0.017, "step": 1088200 }, { "epoch": 0.010883, "grad_norm": 0.13877922296524048, "learning_rate": 1e-05, "loss": 0.0172, "step": 1088300 }, { "epoch": 0.010884, "grad_norm": 0.15668627619743347, "learning_rate": 1e-05, "loss": 0.0171, "step": 1088400 }, { "epoch": 0.010885, "grad_norm": 0.11946287006139755, "learning_rate": 1e-05, "loss": 0.0175, "step": 1088500 }, { "epoch": 0.010886, "grad_norm": 0.14420759677886963, "learning_rate": 1e-05, "loss": 0.0173, "step": 1088600 }, { "epoch": 0.010887, "grad_norm": 0.12893931567668915, "learning_rate": 1e-05, "loss": 0.0173, "step": 1088700 }, { "epoch": 0.010888, "grad_norm": 0.15178434550762177, "learning_rate": 1e-05, "loss": 0.0171, "step": 1088800 }, { "epoch": 0.010889, "grad_norm": 0.1532706469297409, "learning_rate": 1e-05, "loss": 0.017, "step": 1088900 }, { "epoch": 0.01089, "grad_norm": 0.12952762842178345, "learning_rate": 1e-05, "loss": 0.017, "step": 1089000 }, { "epoch": 0.010891, "grad_norm": 0.12814754247665405, "learning_rate": 1e-05, "loss": 0.0171, "step": 1089100 }, { "epoch": 0.010892, "grad_norm": 0.10446563363075256, "learning_rate": 1e-05, "loss": 0.0175, "step": 1089200 }, { "epoch": 0.010893, "grad_norm": 0.16297686100006104, "learning_rate": 1e-05, "loss": 0.0174, "step": 1089300 }, { "epoch": 0.010894, "grad_norm": 0.16095395386219025, "learning_rate": 1e-05, "loss": 0.017, "step": 1089400 }, { "epoch": 0.010895, "grad_norm": 0.17660295963287354, "learning_rate": 1e-05, "loss": 0.0174, "step": 1089500 }, { "epoch": 0.010896, "grad_norm": 0.1140505000948906, "learning_rate": 1e-05, "loss": 0.0176, "step": 1089600 }, { "epoch": 0.010897, "grad_norm": 0.12459470331668854, "learning_rate": 1e-05, "loss": 0.017, "step": 1089700 }, { "epoch": 0.010898, "grad_norm": 0.1613878607749939, "learning_rate": 1e-05, "loss": 0.0169, "step": 1089800 }, { "epoch": 0.010899, "grad_norm": 0.13809040188789368, "learning_rate": 1e-05, "loss": 0.0172, "step": 1089900 }, { "epoch": 0.0109, "grad_norm": 0.15723541378974915, "learning_rate": 1e-05, "loss": 0.0176, "step": 1090000 }, { "epoch": 0.010901, "grad_norm": 0.09591066092252731, "learning_rate": 1e-05, "loss": 0.017, "step": 1090100 }, { "epoch": 0.010902, "grad_norm": 0.10585977882146835, "learning_rate": 1e-05, "loss": 0.017, "step": 1090200 }, { "epoch": 0.010903, "grad_norm": 0.13060356676578522, "learning_rate": 1e-05, "loss": 0.0173, "step": 1090300 }, { "epoch": 0.010904, "grad_norm": 0.1668078899383545, "learning_rate": 1e-05, "loss": 0.0175, "step": 1090400 }, { "epoch": 0.010905, "grad_norm": 0.1291077733039856, "learning_rate": 1e-05, "loss": 0.0173, "step": 1090500 }, { "epoch": 0.010906, "grad_norm": 0.163765087723732, "learning_rate": 1e-05, "loss": 0.0171, "step": 1090600 }, { "epoch": 0.010907, "grad_norm": 0.10242309421300888, "learning_rate": 1e-05, "loss": 0.0173, "step": 1090700 }, { "epoch": 0.010908, "grad_norm": 0.13182824850082397, "learning_rate": 1e-05, "loss": 0.017, "step": 1090800 }, { "epoch": 0.010909, "grad_norm": 0.13648982346057892, "learning_rate": 1e-05, "loss": 0.0169, "step": 1090900 }, { "epoch": 0.01091, "grad_norm": 0.13561245799064636, "learning_rate": 1e-05, "loss": 0.0172, "step": 1091000 }, { "epoch": 0.010911, "grad_norm": 0.17514483630657196, "learning_rate": 1e-05, "loss": 0.017, "step": 1091100 }, { "epoch": 0.010912, "grad_norm": 0.17622199654579163, "learning_rate": 1e-05, "loss": 0.0172, "step": 1091200 }, { "epoch": 0.010913, "grad_norm": 0.1625063419342041, "learning_rate": 1e-05, "loss": 0.0171, "step": 1091300 }, { "epoch": 0.010914, "grad_norm": 0.15657182037830353, "learning_rate": 1e-05, "loss": 0.0173, "step": 1091400 }, { "epoch": 0.010915, "grad_norm": 0.17011167109012604, "learning_rate": 1e-05, "loss": 0.0176, "step": 1091500 }, { "epoch": 0.010916, "grad_norm": 0.11670643091201782, "learning_rate": 1e-05, "loss": 0.017, "step": 1091600 }, { "epoch": 0.010917, "grad_norm": 0.1542414426803589, "learning_rate": 1e-05, "loss": 0.0169, "step": 1091700 }, { "epoch": 0.010918, "grad_norm": 0.10906277596950531, "learning_rate": 1e-05, "loss": 0.0169, "step": 1091800 }, { "epoch": 0.010919, "grad_norm": 0.2045799344778061, "learning_rate": 1e-05, "loss": 0.017, "step": 1091900 }, { "epoch": 0.01092, "grad_norm": 0.128950297832489, "learning_rate": 1e-05, "loss": 0.0171, "step": 1092000 }, { "epoch": 0.010921, "grad_norm": 0.13405664265155792, "learning_rate": 1e-05, "loss": 0.0174, "step": 1092100 }, { "epoch": 0.010922, "grad_norm": 0.1546197384595871, "learning_rate": 1e-05, "loss": 0.017, "step": 1092200 }, { "epoch": 0.010923, "grad_norm": 0.1367926448583603, "learning_rate": 1e-05, "loss": 0.0175, "step": 1092300 }, { "epoch": 0.010924, "grad_norm": 0.15711157023906708, "learning_rate": 1e-05, "loss": 0.0176, "step": 1092400 }, { "epoch": 0.010925, "grad_norm": 0.1564728021621704, "learning_rate": 1e-05, "loss": 0.0166, "step": 1092500 }, { "epoch": 0.010926, "grad_norm": 0.16672579944133759, "learning_rate": 1e-05, "loss": 0.017, "step": 1092600 }, { "epoch": 0.010927, "grad_norm": 0.105653315782547, "learning_rate": 1e-05, "loss": 0.0173, "step": 1092700 }, { "epoch": 0.010928, "grad_norm": 0.14017416536808014, "learning_rate": 1e-05, "loss": 0.0172, "step": 1092800 }, { "epoch": 0.010929, "grad_norm": 0.13586749136447906, "learning_rate": 1e-05, "loss": 0.0173, "step": 1092900 }, { "epoch": 0.01093, "grad_norm": 0.16037669777870178, "learning_rate": 1e-05, "loss": 0.0173, "step": 1093000 }, { "epoch": 0.010931, "grad_norm": 0.12741760909557343, "learning_rate": 1e-05, "loss": 0.0174, "step": 1093100 }, { "epoch": 0.010932, "grad_norm": 0.1241641715168953, "learning_rate": 1e-05, "loss": 0.0171, "step": 1093200 }, { "epoch": 0.010933, "grad_norm": 0.10272790491580963, "learning_rate": 1e-05, "loss": 0.0178, "step": 1093300 }, { "epoch": 0.010934, "grad_norm": 0.15526042878627777, "learning_rate": 1e-05, "loss": 0.0172, "step": 1093400 }, { "epoch": 0.010935, "grad_norm": 0.19014157354831696, "learning_rate": 1e-05, "loss": 0.0176, "step": 1093500 }, { "epoch": 0.010936, "grad_norm": 0.11338913440704346, "learning_rate": 1e-05, "loss": 0.017, "step": 1093600 }, { "epoch": 0.010937, "grad_norm": 0.1537153422832489, "learning_rate": 1e-05, "loss": 0.0177, "step": 1093700 }, { "epoch": 0.010938, "grad_norm": 0.11516964435577393, "learning_rate": 1e-05, "loss": 0.017, "step": 1093800 }, { "epoch": 0.010939, "grad_norm": 0.10628233850002289, "learning_rate": 1e-05, "loss": 0.0171, "step": 1093900 }, { "epoch": 0.01094, "grad_norm": 0.1358746588230133, "learning_rate": 1e-05, "loss": 0.0173, "step": 1094000 }, { "epoch": 0.010941, "grad_norm": 0.13360324501991272, "learning_rate": 1e-05, "loss": 0.0171, "step": 1094100 }, { "epoch": 0.010942, "grad_norm": 0.19204020500183105, "learning_rate": 1e-05, "loss": 0.0167, "step": 1094200 }, { "epoch": 0.010943, "grad_norm": 0.12117959558963776, "learning_rate": 1e-05, "loss": 0.017, "step": 1094300 }, { "epoch": 0.010944, "grad_norm": 0.1392979472875595, "learning_rate": 1e-05, "loss": 0.0174, "step": 1094400 }, { "epoch": 0.010945, "grad_norm": 0.15185104310512543, "learning_rate": 1e-05, "loss": 0.0173, "step": 1094500 }, { "epoch": 0.010946, "grad_norm": 0.11331585794687271, "learning_rate": 1e-05, "loss": 0.0174, "step": 1094600 }, { "epoch": 0.010947, "grad_norm": 0.12322938442230225, "learning_rate": 1e-05, "loss": 0.0175, "step": 1094700 }, { "epoch": 0.010948, "grad_norm": 0.14820633828639984, "learning_rate": 1e-05, "loss": 0.0177, "step": 1094800 }, { "epoch": 0.010949, "grad_norm": 0.12151075154542923, "learning_rate": 1e-05, "loss": 0.0172, "step": 1094900 }, { "epoch": 0.01095, "grad_norm": 0.11638996005058289, "learning_rate": 1e-05, "loss": 0.0176, "step": 1095000 }, { "epoch": 0.010951, "grad_norm": 0.17508606612682343, "learning_rate": 1e-05, "loss": 0.0174, "step": 1095100 }, { "epoch": 0.010952, "grad_norm": 0.09971053898334503, "learning_rate": 1e-05, "loss": 0.0173, "step": 1095200 }, { "epoch": 0.010953, "grad_norm": 0.12634612619876862, "learning_rate": 1e-05, "loss": 0.0172, "step": 1095300 }, { "epoch": 0.010954, "grad_norm": 0.1213083267211914, "learning_rate": 1e-05, "loss": 0.017, "step": 1095400 }, { "epoch": 0.010955, "grad_norm": 0.13718144595623016, "learning_rate": 1e-05, "loss": 0.0173, "step": 1095500 }, { "epoch": 0.010956, "grad_norm": 0.1404784619808197, "learning_rate": 1e-05, "loss": 0.0172, "step": 1095600 }, { "epoch": 0.010957, "grad_norm": 0.11809555441141129, "learning_rate": 1e-05, "loss": 0.0172, "step": 1095700 }, { "epoch": 0.010958, "grad_norm": 0.18463875353336334, "learning_rate": 1e-05, "loss": 0.0174, "step": 1095800 }, { "epoch": 0.010959, "grad_norm": 0.12225694954395294, "learning_rate": 1e-05, "loss": 0.017, "step": 1095900 }, { "epoch": 0.01096, "grad_norm": 0.10889337211847305, "learning_rate": 1e-05, "loss": 0.0171, "step": 1096000 }, { "epoch": 0.010961, "grad_norm": 0.1929037719964981, "learning_rate": 1e-05, "loss": 0.0171, "step": 1096100 }, { "epoch": 0.010962, "grad_norm": 0.12589691579341888, "learning_rate": 1e-05, "loss": 0.0169, "step": 1096200 }, { "epoch": 0.010963, "grad_norm": 0.15083050727844238, "learning_rate": 1e-05, "loss": 0.0175, "step": 1096300 }, { "epoch": 0.010964, "grad_norm": 0.1342569887638092, "learning_rate": 1e-05, "loss": 0.0172, "step": 1096400 }, { "epoch": 0.010965, "grad_norm": 0.12722238898277283, "learning_rate": 1e-05, "loss": 0.0169, "step": 1096500 }, { "epoch": 0.010966, "grad_norm": 0.1316491663455963, "learning_rate": 1e-05, "loss": 0.0174, "step": 1096600 }, { "epoch": 0.010967, "grad_norm": 0.137531116604805, "learning_rate": 1e-05, "loss": 0.0175, "step": 1096700 }, { "epoch": 0.010968, "grad_norm": 0.11659149080514908, "learning_rate": 1e-05, "loss": 0.0167, "step": 1096800 }, { "epoch": 0.010969, "grad_norm": 0.1824338287115097, "learning_rate": 1e-05, "loss": 0.0168, "step": 1096900 }, { "epoch": 0.01097, "grad_norm": 0.10809279978275299, "learning_rate": 1e-05, "loss": 0.0174, "step": 1097000 }, { "epoch": 0.010971, "grad_norm": 0.1468055546283722, "learning_rate": 1e-05, "loss": 0.0172, "step": 1097100 }, { "epoch": 0.010972, "grad_norm": 0.14760185778141022, "learning_rate": 1e-05, "loss": 0.0174, "step": 1097200 }, { "epoch": 0.010973, "grad_norm": 0.12494143843650818, "learning_rate": 1e-05, "loss": 0.0172, "step": 1097300 }, { "epoch": 0.010974, "grad_norm": 0.24947722256183624, "learning_rate": 1e-05, "loss": 0.0173, "step": 1097400 }, { "epoch": 0.010975, "grad_norm": 0.15427327156066895, "learning_rate": 1e-05, "loss": 0.017, "step": 1097500 }, { "epoch": 0.010976, "grad_norm": 0.1453981101512909, "learning_rate": 1e-05, "loss": 0.017, "step": 1097600 }, { "epoch": 0.010977, "grad_norm": 0.12696854770183563, "learning_rate": 1e-05, "loss": 0.0171, "step": 1097700 }, { "epoch": 0.010978, "grad_norm": 0.2170618623495102, "learning_rate": 1e-05, "loss": 0.017, "step": 1097800 }, { "epoch": 0.010979, "grad_norm": 0.14713190495967865, "learning_rate": 1e-05, "loss": 0.0169, "step": 1097900 }, { "epoch": 0.01098, "grad_norm": 0.14603115618228912, "learning_rate": 1e-05, "loss": 0.0174, "step": 1098000 }, { "epoch": 0.010981, "grad_norm": 0.09954600781202316, "learning_rate": 1e-05, "loss": 0.017, "step": 1098100 }, { "epoch": 0.010982, "grad_norm": 0.13037699460983276, "learning_rate": 1e-05, "loss": 0.0168, "step": 1098200 }, { "epoch": 0.010983, "grad_norm": 0.12883739173412323, "learning_rate": 1e-05, "loss": 0.0175, "step": 1098300 }, { "epoch": 0.010984, "grad_norm": 0.1543600857257843, "learning_rate": 1e-05, "loss": 0.0174, "step": 1098400 }, { "epoch": 0.010985, "grad_norm": 0.15962526202201843, "learning_rate": 1e-05, "loss": 0.0171, "step": 1098500 }, { "epoch": 0.010986, "grad_norm": 0.14292670786380768, "learning_rate": 1e-05, "loss": 0.0173, "step": 1098600 }, { "epoch": 0.010987, "grad_norm": 0.14670208096504211, "learning_rate": 1e-05, "loss": 0.0173, "step": 1098700 }, { "epoch": 0.010988, "grad_norm": 0.15138506889343262, "learning_rate": 1e-05, "loss": 0.0173, "step": 1098800 }, { "epoch": 0.010989, "grad_norm": 0.12359067797660828, "learning_rate": 1e-05, "loss": 0.0172, "step": 1098900 }, { "epoch": 0.01099, "grad_norm": 0.17288704216480255, "learning_rate": 1e-05, "loss": 0.0171, "step": 1099000 }, { "epoch": 0.010991, "grad_norm": 0.12242109328508377, "learning_rate": 1e-05, "loss": 0.0172, "step": 1099100 }, { "epoch": 0.010992, "grad_norm": 0.1087186262011528, "learning_rate": 1e-05, "loss": 0.017, "step": 1099200 }, { "epoch": 0.010993, "grad_norm": 0.2058572620153427, "learning_rate": 1e-05, "loss": 0.017, "step": 1099300 }, { "epoch": 0.010994, "grad_norm": 0.135991171002388, "learning_rate": 1e-05, "loss": 0.017, "step": 1099400 }, { "epoch": 0.010995, "grad_norm": 0.12115073949098587, "learning_rate": 1e-05, "loss": 0.0171, "step": 1099500 }, { "epoch": 0.010996, "grad_norm": 0.16950280964374542, "learning_rate": 1e-05, "loss": 0.0174, "step": 1099600 }, { "epoch": 0.010997, "grad_norm": 0.13233448565006256, "learning_rate": 1e-05, "loss": 0.0174, "step": 1099700 }, { "epoch": 0.010998, "grad_norm": 0.1732763946056366, "learning_rate": 1e-05, "loss": 0.0172, "step": 1099800 }, { "epoch": 0.010999, "grad_norm": 0.13910819590091705, "learning_rate": 1e-05, "loss": 0.0172, "step": 1099900 }, { "epoch": 0.011, "grad_norm": 0.11888720840215683, "learning_rate": 1e-05, "loss": 0.0169, "step": 1100000 }, { "epoch": 0.011, "eval_loss": 0.015164710581302643, "eval_runtime": 183.5786, "eval_samples_per_second": 272.363, "eval_steps_per_second": 17.023, "step": 1100000 }, { "epoch": 0.011001, "grad_norm": 0.1091485247015953, "learning_rate": 1e-05, "loss": 0.0174, "step": 1100100 }, { "epoch": 0.011002, "grad_norm": 0.1461533010005951, "learning_rate": 1e-05, "loss": 0.0168, "step": 1100200 }, { "epoch": 0.011003, "grad_norm": 0.18116767704486847, "learning_rate": 1e-05, "loss": 0.0169, "step": 1100300 }, { "epoch": 0.011004, "grad_norm": 0.13349997997283936, "learning_rate": 1e-05, "loss": 0.0171, "step": 1100400 }, { "epoch": 0.011005, "grad_norm": 0.11868207156658173, "learning_rate": 1e-05, "loss": 0.0173, "step": 1100500 }, { "epoch": 0.011006, "grad_norm": 0.18335619568824768, "learning_rate": 1e-05, "loss": 0.0172, "step": 1100600 }, { "epoch": 0.011007, "grad_norm": 0.18616658449172974, "learning_rate": 1e-05, "loss": 0.017, "step": 1100700 }, { "epoch": 0.011008, "grad_norm": 0.1021137610077858, "learning_rate": 1e-05, "loss": 0.0171, "step": 1100800 }, { "epoch": 0.011009, "grad_norm": 0.12377957254648209, "learning_rate": 1e-05, "loss": 0.0172, "step": 1100900 }, { "epoch": 0.01101, "grad_norm": 0.14180658757686615, "learning_rate": 1e-05, "loss": 0.0167, "step": 1101000 }, { "epoch": 0.011011, "grad_norm": 0.20247866213321686, "learning_rate": 1e-05, "loss": 0.017, "step": 1101100 }, { "epoch": 0.011012, "grad_norm": 0.15414024889469147, "learning_rate": 1e-05, "loss": 0.0172, "step": 1101200 }, { "epoch": 0.011013, "grad_norm": 0.09656801074743271, "learning_rate": 1e-05, "loss": 0.017, "step": 1101300 }, { "epoch": 0.011014, "grad_norm": 0.11710761487483978, "learning_rate": 1e-05, "loss": 0.0171, "step": 1101400 }, { "epoch": 0.011015, "grad_norm": 0.12052375078201294, "learning_rate": 1e-05, "loss": 0.0167, "step": 1101500 }, { "epoch": 0.011016, "grad_norm": 0.16464820504188538, "learning_rate": 1e-05, "loss": 0.017, "step": 1101600 }, { "epoch": 0.011017, "grad_norm": 0.16303031146526337, "learning_rate": 1e-05, "loss": 0.0173, "step": 1101700 }, { "epoch": 0.011018, "grad_norm": 0.12489766627550125, "learning_rate": 1e-05, "loss": 0.0169, "step": 1101800 }, { "epoch": 0.011019, "grad_norm": 0.12328203022480011, "learning_rate": 1e-05, "loss": 0.0168, "step": 1101900 }, { "epoch": 0.01102, "grad_norm": 0.15686015784740448, "learning_rate": 1e-05, "loss": 0.0174, "step": 1102000 }, { "epoch": 0.011021, "grad_norm": 0.13664819300174713, "learning_rate": 1e-05, "loss": 0.0173, "step": 1102100 }, { "epoch": 0.011022, "grad_norm": 0.117189921438694, "learning_rate": 1e-05, "loss": 0.0177, "step": 1102200 }, { "epoch": 0.011023, "grad_norm": 0.2145795375108719, "learning_rate": 1e-05, "loss": 0.0169, "step": 1102300 }, { "epoch": 0.011024, "grad_norm": 0.12799373269081116, "learning_rate": 1e-05, "loss": 0.0175, "step": 1102400 }, { "epoch": 0.011025, "grad_norm": 0.13451629877090454, "learning_rate": 1e-05, "loss": 0.0174, "step": 1102500 }, { "epoch": 0.011026, "grad_norm": 0.11533129960298538, "learning_rate": 1e-05, "loss": 0.0176, "step": 1102600 }, { "epoch": 0.011027, "grad_norm": 0.13257604837417603, "learning_rate": 1e-05, "loss": 0.0172, "step": 1102700 }, { "epoch": 0.011028, "grad_norm": 0.1955384910106659, "learning_rate": 1e-05, "loss": 0.0168, "step": 1102800 }, { "epoch": 0.011029, "grad_norm": 0.11701028794050217, "learning_rate": 1e-05, "loss": 0.0171, "step": 1102900 }, { "epoch": 0.01103, "grad_norm": 0.18241776525974274, "learning_rate": 1e-05, "loss": 0.0173, "step": 1103000 }, { "epoch": 0.011031, "grad_norm": 0.12785549461841583, "learning_rate": 1e-05, "loss": 0.0178, "step": 1103100 }, { "epoch": 0.011032, "grad_norm": 0.11966261267662048, "learning_rate": 1e-05, "loss": 0.0172, "step": 1103200 }, { "epoch": 0.011033, "grad_norm": 0.2418939769268036, "learning_rate": 1e-05, "loss": 0.0171, "step": 1103300 }, { "epoch": 0.011034, "grad_norm": 0.14006955921649933, "learning_rate": 1e-05, "loss": 0.0172, "step": 1103400 }, { "epoch": 0.011035, "grad_norm": 0.13745185732841492, "learning_rate": 1e-05, "loss": 0.0171, "step": 1103500 }, { "epoch": 0.011036, "grad_norm": 0.1702473759651184, "learning_rate": 1e-05, "loss": 0.0171, "step": 1103600 }, { "epoch": 0.011037, "grad_norm": 0.14906254410743713, "learning_rate": 1e-05, "loss": 0.0172, "step": 1103700 }, { "epoch": 0.011038, "grad_norm": 0.12921786308288574, "learning_rate": 1e-05, "loss": 0.0174, "step": 1103800 }, { "epoch": 0.011039, "grad_norm": 0.1741725653409958, "learning_rate": 1e-05, "loss": 0.0173, "step": 1103900 }, { "epoch": 0.01104, "grad_norm": 0.1326061487197876, "learning_rate": 1e-05, "loss": 0.017, "step": 1104000 }, { "epoch": 0.011041, "grad_norm": 0.13540348410606384, "learning_rate": 1e-05, "loss": 0.0168, "step": 1104100 }, { "epoch": 0.011042, "grad_norm": 0.13346368074417114, "learning_rate": 1e-05, "loss": 0.0171, "step": 1104200 }, { "epoch": 0.011043, "grad_norm": 0.1607685536146164, "learning_rate": 1e-05, "loss": 0.0173, "step": 1104300 }, { "epoch": 0.011044, "grad_norm": 0.13783195614814758, "learning_rate": 1e-05, "loss": 0.0173, "step": 1104400 }, { "epoch": 0.011045, "grad_norm": 0.15774209797382355, "learning_rate": 1e-05, "loss": 0.017, "step": 1104500 }, { "epoch": 0.011046, "grad_norm": 0.11610382050275803, "learning_rate": 1e-05, "loss": 0.0168, "step": 1104600 }, { "epoch": 0.011047, "grad_norm": 0.1136493980884552, "learning_rate": 1e-05, "loss": 0.017, "step": 1104700 }, { "epoch": 0.011048, "grad_norm": 0.11396181583404541, "learning_rate": 1e-05, "loss": 0.0174, "step": 1104800 }, { "epoch": 0.011049, "grad_norm": 0.15451252460479736, "learning_rate": 1e-05, "loss": 0.0169, "step": 1104900 }, { "epoch": 0.01105, "grad_norm": 0.11693214625120163, "learning_rate": 1e-05, "loss": 0.0171, "step": 1105000 }, { "epoch": 0.011051, "grad_norm": 0.12868213653564453, "learning_rate": 1e-05, "loss": 0.0166, "step": 1105100 }, { "epoch": 0.011052, "grad_norm": 0.10384206473827362, "learning_rate": 1e-05, "loss": 0.0169, "step": 1105200 }, { "epoch": 0.011053, "grad_norm": 0.1245863065123558, "learning_rate": 1e-05, "loss": 0.0178, "step": 1105300 }, { "epoch": 0.011054, "grad_norm": 0.13096322119235992, "learning_rate": 1e-05, "loss": 0.0174, "step": 1105400 }, { "epoch": 0.011055, "grad_norm": 0.15160125494003296, "learning_rate": 1e-05, "loss": 0.0171, "step": 1105500 }, { "epoch": 0.011056, "grad_norm": 0.26715153455734253, "learning_rate": 1e-05, "loss": 0.0172, "step": 1105600 }, { "epoch": 0.011057, "grad_norm": 0.1226559430360794, "learning_rate": 1e-05, "loss": 0.0172, "step": 1105700 }, { "epoch": 0.011058, "grad_norm": 0.12861308455467224, "learning_rate": 1e-05, "loss": 0.0173, "step": 1105800 }, { "epoch": 0.011059, "grad_norm": 0.122829370200634, "learning_rate": 1e-05, "loss": 0.0168, "step": 1105900 }, { "epoch": 0.01106, "grad_norm": 0.14816804230213165, "learning_rate": 1e-05, "loss": 0.0176, "step": 1106000 }, { "epoch": 0.011061, "grad_norm": 0.3052058219909668, "learning_rate": 1e-05, "loss": 0.0174, "step": 1106100 }, { "epoch": 0.011062, "grad_norm": 0.16410349309444427, "learning_rate": 1e-05, "loss": 0.0173, "step": 1106200 }, { "epoch": 0.011063, "grad_norm": 0.1540878266096115, "learning_rate": 1e-05, "loss": 0.0174, "step": 1106300 }, { "epoch": 0.011064, "grad_norm": 0.11424756050109863, "learning_rate": 1e-05, "loss": 0.0174, "step": 1106400 }, { "epoch": 0.011065, "grad_norm": 0.146995410323143, "learning_rate": 1e-05, "loss": 0.017, "step": 1106500 }, { "epoch": 0.011066, "grad_norm": 0.11237838864326477, "learning_rate": 1e-05, "loss": 0.0167, "step": 1106600 }, { "epoch": 0.011067, "grad_norm": 0.0942423939704895, "learning_rate": 1e-05, "loss": 0.0173, "step": 1106700 }, { "epoch": 0.011068, "grad_norm": 0.15416769683361053, "learning_rate": 1e-05, "loss": 0.0171, "step": 1106800 }, { "epoch": 0.011069, "grad_norm": 0.11465846747159958, "learning_rate": 1e-05, "loss": 0.017, "step": 1106900 }, { "epoch": 0.01107, "grad_norm": 0.1306471973657608, "learning_rate": 1e-05, "loss": 0.0172, "step": 1107000 }, { "epoch": 0.011071, "grad_norm": 0.154327392578125, "learning_rate": 1e-05, "loss": 0.0168, "step": 1107100 }, { "epoch": 0.011072, "grad_norm": 0.1887975037097931, "learning_rate": 1e-05, "loss": 0.0173, "step": 1107200 }, { "epoch": 0.011073, "grad_norm": 0.13107439875602722, "learning_rate": 1e-05, "loss": 0.0171, "step": 1107300 }, { "epoch": 0.011074, "grad_norm": 0.15034934878349304, "learning_rate": 1e-05, "loss": 0.0173, "step": 1107400 }, { "epoch": 0.011075, "grad_norm": 0.15952812135219574, "learning_rate": 1e-05, "loss": 0.0173, "step": 1107500 }, { "epoch": 0.011076, "grad_norm": 0.1133982315659523, "learning_rate": 1e-05, "loss": 0.0173, "step": 1107600 }, { "epoch": 0.011077, "grad_norm": 0.150385782122612, "learning_rate": 1e-05, "loss": 0.0168, "step": 1107700 }, { "epoch": 0.011078, "grad_norm": 0.15175525844097137, "learning_rate": 1e-05, "loss": 0.0167, "step": 1107800 }, { "epoch": 0.011079, "grad_norm": 0.13613972067832947, "learning_rate": 1e-05, "loss": 0.0174, "step": 1107900 }, { "epoch": 0.01108, "grad_norm": 0.12326422333717346, "learning_rate": 1e-05, "loss": 0.017, "step": 1108000 }, { "epoch": 0.011081, "grad_norm": 0.17167292535305023, "learning_rate": 1e-05, "loss": 0.017, "step": 1108100 }, { "epoch": 0.011082, "grad_norm": 0.1026163399219513, "learning_rate": 1e-05, "loss": 0.0171, "step": 1108200 }, { "epoch": 0.011083, "grad_norm": 0.1510901004076004, "learning_rate": 1e-05, "loss": 0.0178, "step": 1108300 }, { "epoch": 0.011084, "grad_norm": 0.12822352349758148, "learning_rate": 1e-05, "loss": 0.0173, "step": 1108400 }, { "epoch": 0.011085, "grad_norm": 0.10933881998062134, "learning_rate": 1e-05, "loss": 0.0169, "step": 1108500 }, { "epoch": 0.011086, "grad_norm": 0.12350458651781082, "learning_rate": 1e-05, "loss": 0.0174, "step": 1108600 }, { "epoch": 0.011087, "grad_norm": 0.10214051604270935, "learning_rate": 1e-05, "loss": 0.0172, "step": 1108700 }, { "epoch": 0.011088, "grad_norm": 0.11270057410001755, "learning_rate": 1e-05, "loss": 0.0171, "step": 1108800 }, { "epoch": 0.011089, "grad_norm": 0.12905293703079224, "learning_rate": 1e-05, "loss": 0.0166, "step": 1108900 }, { "epoch": 0.01109, "grad_norm": 0.19127927720546722, "learning_rate": 1e-05, "loss": 0.0169, "step": 1109000 }, { "epoch": 0.011091, "grad_norm": 0.14812923967838287, "learning_rate": 1e-05, "loss": 0.0172, "step": 1109100 }, { "epoch": 0.011092, "grad_norm": 0.14628289639949799, "learning_rate": 1e-05, "loss": 0.0168, "step": 1109200 }, { "epoch": 0.011093, "grad_norm": 0.1385393738746643, "learning_rate": 1e-05, "loss": 0.0171, "step": 1109300 }, { "epoch": 0.011094, "grad_norm": 0.11509137600660324, "learning_rate": 1e-05, "loss": 0.0171, "step": 1109400 }, { "epoch": 0.011095, "grad_norm": 0.16451281309127808, "learning_rate": 1e-05, "loss": 0.0174, "step": 1109500 }, { "epoch": 0.011096, "grad_norm": 0.16712158918380737, "learning_rate": 1e-05, "loss": 0.017, "step": 1109600 }, { "epoch": 0.011097, "grad_norm": 0.16622091829776764, "learning_rate": 1e-05, "loss": 0.0172, "step": 1109700 }, { "epoch": 0.011098, "grad_norm": 0.13395963609218597, "learning_rate": 1e-05, "loss": 0.0169, "step": 1109800 }, { "epoch": 0.011099, "grad_norm": 0.13270656764507294, "learning_rate": 1e-05, "loss": 0.0172, "step": 1109900 }, { "epoch": 0.0111, "grad_norm": 0.12294886261224747, "learning_rate": 1e-05, "loss": 0.0171, "step": 1110000 }, { "epoch": 0.011101, "grad_norm": 0.1321762353181839, "learning_rate": 1e-05, "loss": 0.0171, "step": 1110100 }, { "epoch": 0.011102, "grad_norm": 0.2502821981906891, "learning_rate": 1e-05, "loss": 0.0173, "step": 1110200 }, { "epoch": 0.011103, "grad_norm": 0.14565543830394745, "learning_rate": 1e-05, "loss": 0.017, "step": 1110300 }, { "epoch": 0.011104, "grad_norm": 0.12748417258262634, "learning_rate": 1e-05, "loss": 0.0176, "step": 1110400 }, { "epoch": 0.011105, "grad_norm": 0.13790223002433777, "learning_rate": 1e-05, "loss": 0.0172, "step": 1110500 }, { "epoch": 0.011106, "grad_norm": 0.13188207149505615, "learning_rate": 1e-05, "loss": 0.0167, "step": 1110600 }, { "epoch": 0.011107, "grad_norm": 0.13733543455600739, "learning_rate": 1e-05, "loss": 0.0169, "step": 1110700 }, { "epoch": 0.011108, "grad_norm": 0.1337336301803589, "learning_rate": 1e-05, "loss": 0.0172, "step": 1110800 }, { "epoch": 0.011109, "grad_norm": 0.12175700068473816, "learning_rate": 1e-05, "loss": 0.0172, "step": 1110900 }, { "epoch": 0.01111, "grad_norm": 0.1186102032661438, "learning_rate": 1e-05, "loss": 0.0171, "step": 1111000 }, { "epoch": 0.011111, "grad_norm": 0.15879392623901367, "learning_rate": 1e-05, "loss": 0.0172, "step": 1111100 }, { "epoch": 0.011112, "grad_norm": 0.1172216460108757, "learning_rate": 1e-05, "loss": 0.017, "step": 1111200 }, { "epoch": 0.011113, "grad_norm": 0.12441107630729675, "learning_rate": 1e-05, "loss": 0.017, "step": 1111300 }, { "epoch": 0.011114, "grad_norm": 0.12366344034671783, "learning_rate": 1e-05, "loss": 0.017, "step": 1111400 }, { "epoch": 0.011115, "grad_norm": 0.11972589790821075, "learning_rate": 1e-05, "loss": 0.0172, "step": 1111500 }, { "epoch": 0.011116, "grad_norm": 0.18773861229419708, "learning_rate": 1e-05, "loss": 0.0171, "step": 1111600 }, { "epoch": 0.011117, "grad_norm": 0.1448446661233902, "learning_rate": 1e-05, "loss": 0.0174, "step": 1111700 }, { "epoch": 0.011118, "grad_norm": 0.15515200793743134, "learning_rate": 1e-05, "loss": 0.0169, "step": 1111800 }, { "epoch": 0.011119, "grad_norm": 0.19447174668312073, "learning_rate": 1e-05, "loss": 0.0172, "step": 1111900 }, { "epoch": 0.01112, "grad_norm": 0.1177198514342308, "learning_rate": 1e-05, "loss": 0.0172, "step": 1112000 }, { "epoch": 0.011121, "grad_norm": 0.13857127726078033, "learning_rate": 1e-05, "loss": 0.017, "step": 1112100 }, { "epoch": 0.011122, "grad_norm": 0.14380569756031036, "learning_rate": 1e-05, "loss": 0.0165, "step": 1112200 }, { "epoch": 0.011123, "grad_norm": 0.14927606284618378, "learning_rate": 1e-05, "loss": 0.017, "step": 1112300 }, { "epoch": 0.011124, "grad_norm": 0.13969853520393372, "learning_rate": 1e-05, "loss": 0.0173, "step": 1112400 }, { "epoch": 0.011125, "grad_norm": 0.13895711302757263, "learning_rate": 1e-05, "loss": 0.0175, "step": 1112500 }, { "epoch": 0.011126, "grad_norm": 0.13866353034973145, "learning_rate": 1e-05, "loss": 0.0174, "step": 1112600 }, { "epoch": 0.011127, "grad_norm": 0.2214490920305252, "learning_rate": 1e-05, "loss": 0.0168, "step": 1112700 }, { "epoch": 0.011128, "grad_norm": 0.15977153182029724, "learning_rate": 1e-05, "loss": 0.0168, "step": 1112800 }, { "epoch": 0.011129, "grad_norm": 0.10422132164239883, "learning_rate": 1e-05, "loss": 0.0169, "step": 1112900 }, { "epoch": 0.01113, "grad_norm": 0.1428787112236023, "learning_rate": 1e-05, "loss": 0.0169, "step": 1113000 }, { "epoch": 0.011131, "grad_norm": 0.1400957703590393, "learning_rate": 1e-05, "loss": 0.017, "step": 1113100 }, { "epoch": 0.011132, "grad_norm": 0.18504878878593445, "learning_rate": 1e-05, "loss": 0.0169, "step": 1113200 }, { "epoch": 0.011133, "grad_norm": 0.21661652624607086, "learning_rate": 1e-05, "loss": 0.017, "step": 1113300 }, { "epoch": 0.011134, "grad_norm": 0.1255376785993576, "learning_rate": 1e-05, "loss": 0.0167, "step": 1113400 }, { "epoch": 0.011135, "grad_norm": 0.14084547758102417, "learning_rate": 1e-05, "loss": 0.0169, "step": 1113500 }, { "epoch": 0.011136, "grad_norm": 0.12207011878490448, "learning_rate": 1e-05, "loss": 0.0167, "step": 1113600 }, { "epoch": 0.011137, "grad_norm": 0.17114651203155518, "learning_rate": 1e-05, "loss": 0.0171, "step": 1113700 }, { "epoch": 0.011138, "grad_norm": 0.13341349363327026, "learning_rate": 1e-05, "loss": 0.0173, "step": 1113800 }, { "epoch": 0.011139, "grad_norm": 0.12601310014724731, "learning_rate": 1e-05, "loss": 0.017, "step": 1113900 }, { "epoch": 0.01114, "grad_norm": 0.11021117866039276, "learning_rate": 1e-05, "loss": 0.017, "step": 1114000 }, { "epoch": 0.011141, "grad_norm": 0.14740292727947235, "learning_rate": 1e-05, "loss": 0.0173, "step": 1114100 }, { "epoch": 0.011142, "grad_norm": 0.15652336180210114, "learning_rate": 1e-05, "loss": 0.0167, "step": 1114200 }, { "epoch": 0.011143, "grad_norm": 0.12950006127357483, "learning_rate": 1e-05, "loss": 0.017, "step": 1114300 }, { "epoch": 0.011144, "grad_norm": 0.15118977427482605, "learning_rate": 1e-05, "loss": 0.0171, "step": 1114400 }, { "epoch": 0.011145, "grad_norm": 0.12628324329853058, "learning_rate": 1e-05, "loss": 0.0174, "step": 1114500 }, { "epoch": 0.011146, "grad_norm": 0.21156390011310577, "learning_rate": 1e-05, "loss": 0.0171, "step": 1114600 }, { "epoch": 0.011147, "grad_norm": 0.13159441947937012, "learning_rate": 1e-05, "loss": 0.0171, "step": 1114700 }, { "epoch": 0.011148, "grad_norm": 0.14625290036201477, "learning_rate": 1e-05, "loss": 0.0171, "step": 1114800 }, { "epoch": 0.011149, "grad_norm": 0.08922378718852997, "learning_rate": 1e-05, "loss": 0.0173, "step": 1114900 }, { "epoch": 0.01115, "grad_norm": 0.14147210121154785, "learning_rate": 1e-05, "loss": 0.0173, "step": 1115000 }, { "epoch": 0.011151, "grad_norm": 0.14407594501972198, "learning_rate": 1e-05, "loss": 0.0167, "step": 1115100 }, { "epoch": 0.011152, "grad_norm": 0.1153828576207161, "learning_rate": 1e-05, "loss": 0.0173, "step": 1115200 }, { "epoch": 0.011153, "grad_norm": 0.18106627464294434, "learning_rate": 1e-05, "loss": 0.0171, "step": 1115300 }, { "epoch": 0.011154, "grad_norm": 0.10319028049707413, "learning_rate": 1e-05, "loss": 0.0171, "step": 1115400 }, { "epoch": 0.011155, "grad_norm": 0.10717853158712387, "learning_rate": 1e-05, "loss": 0.0167, "step": 1115500 }, { "epoch": 0.011156, "grad_norm": 0.17786237597465515, "learning_rate": 1e-05, "loss": 0.0171, "step": 1115600 }, { "epoch": 0.011157, "grad_norm": 0.14671847224235535, "learning_rate": 1e-05, "loss": 0.0174, "step": 1115700 }, { "epoch": 0.011158, "grad_norm": 0.1360320746898651, "learning_rate": 1e-05, "loss": 0.0173, "step": 1115800 }, { "epoch": 0.011159, "grad_norm": 0.16332855820655823, "learning_rate": 1e-05, "loss": 0.0166, "step": 1115900 }, { "epoch": 0.01116, "grad_norm": 0.0886194109916687, "learning_rate": 1e-05, "loss": 0.017, "step": 1116000 }, { "epoch": 0.011161, "grad_norm": 0.11078992486000061, "learning_rate": 1e-05, "loss": 0.0172, "step": 1116100 }, { "epoch": 0.011162, "grad_norm": 0.12649224698543549, "learning_rate": 1e-05, "loss": 0.0173, "step": 1116200 }, { "epoch": 0.011163, "grad_norm": 0.08840081840753555, "learning_rate": 1e-05, "loss": 0.0173, "step": 1116300 }, { "epoch": 0.011164, "grad_norm": 0.11028505861759186, "learning_rate": 1e-05, "loss": 0.0168, "step": 1116400 }, { "epoch": 0.011165, "grad_norm": 0.1052124947309494, "learning_rate": 1e-05, "loss": 0.0171, "step": 1116500 }, { "epoch": 0.011166, "grad_norm": 0.17098194360733032, "learning_rate": 1e-05, "loss": 0.017, "step": 1116600 }, { "epoch": 0.011167, "grad_norm": 0.11416328698396683, "learning_rate": 1e-05, "loss": 0.0167, "step": 1116700 }, { "epoch": 0.011168, "grad_norm": 0.1151508018374443, "learning_rate": 1e-05, "loss": 0.0173, "step": 1116800 }, { "epoch": 0.011169, "grad_norm": 0.17043191194534302, "learning_rate": 1e-05, "loss": 0.0173, "step": 1116900 }, { "epoch": 0.01117, "grad_norm": 0.14887669682502747, "learning_rate": 1e-05, "loss": 0.0171, "step": 1117000 }, { "epoch": 0.011171, "grad_norm": 0.1114635095000267, "learning_rate": 1e-05, "loss": 0.0173, "step": 1117100 }, { "epoch": 0.011172, "grad_norm": 0.11553972959518433, "learning_rate": 1e-05, "loss": 0.0176, "step": 1117200 }, { "epoch": 0.011173, "grad_norm": 0.11907055974006653, "learning_rate": 1e-05, "loss": 0.017, "step": 1117300 }, { "epoch": 0.011174, "grad_norm": 0.15594276785850525, "learning_rate": 1e-05, "loss": 0.0169, "step": 1117400 }, { "epoch": 0.011175, "grad_norm": 0.12948302924633026, "learning_rate": 1e-05, "loss": 0.0169, "step": 1117500 }, { "epoch": 0.011176, "grad_norm": 0.12161881476640701, "learning_rate": 1e-05, "loss": 0.0175, "step": 1117600 }, { "epoch": 0.011177, "grad_norm": 0.17511427402496338, "learning_rate": 1e-05, "loss": 0.017, "step": 1117700 }, { "epoch": 0.011178, "grad_norm": 0.14192865788936615, "learning_rate": 1e-05, "loss": 0.017, "step": 1117800 }, { "epoch": 0.011179, "grad_norm": 0.1468031108379364, "learning_rate": 1e-05, "loss": 0.0168, "step": 1117900 }, { "epoch": 0.01118, "grad_norm": 0.14029733836650848, "learning_rate": 1e-05, "loss": 0.0168, "step": 1118000 }, { "epoch": 0.011181, "grad_norm": 0.1446666419506073, "learning_rate": 1e-05, "loss": 0.0172, "step": 1118100 }, { "epoch": 0.011182, "grad_norm": 0.1274413764476776, "learning_rate": 1e-05, "loss": 0.017, "step": 1118200 }, { "epoch": 0.011183, "grad_norm": 0.12777601182460785, "learning_rate": 1e-05, "loss": 0.017, "step": 1118300 }, { "epoch": 0.011184, "grad_norm": 0.1816999316215515, "learning_rate": 1e-05, "loss": 0.0175, "step": 1118400 }, { "epoch": 0.011185, "grad_norm": 0.163295179605484, "learning_rate": 1e-05, "loss": 0.0173, "step": 1118500 }, { "epoch": 0.011186, "grad_norm": 0.11254440248012543, "learning_rate": 1e-05, "loss": 0.0174, "step": 1118600 }, { "epoch": 0.011187, "grad_norm": 0.21450930833816528, "learning_rate": 1e-05, "loss": 0.0168, "step": 1118700 }, { "epoch": 0.011188, "grad_norm": 0.1020445004105568, "learning_rate": 1e-05, "loss": 0.0166, "step": 1118800 }, { "epoch": 0.011189, "grad_norm": 0.1434522271156311, "learning_rate": 1e-05, "loss": 0.0169, "step": 1118900 }, { "epoch": 0.01119, "grad_norm": 0.1568278819322586, "learning_rate": 1e-05, "loss": 0.0174, "step": 1119000 }, { "epoch": 0.011191, "grad_norm": 0.1497504711151123, "learning_rate": 1e-05, "loss": 0.0169, "step": 1119100 }, { "epoch": 0.011192, "grad_norm": 0.13873915374279022, "learning_rate": 1e-05, "loss": 0.0171, "step": 1119200 }, { "epoch": 0.011193, "grad_norm": 0.1298673301935196, "learning_rate": 1e-05, "loss": 0.017, "step": 1119300 }, { "epoch": 0.011194, "grad_norm": 0.15957039594650269, "learning_rate": 1e-05, "loss": 0.017, "step": 1119400 }, { "epoch": 0.011195, "grad_norm": 0.13174790143966675, "learning_rate": 1e-05, "loss": 0.0168, "step": 1119500 }, { "epoch": 0.011196, "grad_norm": 0.16137365996837616, "learning_rate": 1e-05, "loss": 0.017, "step": 1119600 }, { "epoch": 0.011197, "grad_norm": 0.1168437972664833, "learning_rate": 1e-05, "loss": 0.0166, "step": 1119700 }, { "epoch": 0.011198, "grad_norm": 0.12797003984451294, "learning_rate": 1e-05, "loss": 0.0167, "step": 1119800 }, { "epoch": 0.011199, "grad_norm": 0.13630534708499908, "learning_rate": 1e-05, "loss": 0.0173, "step": 1119900 }, { "epoch": 0.0112, "grad_norm": 0.12809821963310242, "learning_rate": 1e-05, "loss": 0.0169, "step": 1120000 }, { "epoch": 0.0112, "eval_loss": 0.015287606045603752, "eval_runtime": 167.8074, "eval_samples_per_second": 297.961, "eval_steps_per_second": 18.623, "step": 1120000 }, { "epoch": 0.011201, "grad_norm": 0.09063588082790375, "learning_rate": 1e-05, "loss": 0.0172, "step": 1120100 }, { "epoch": 0.011202, "grad_norm": 0.13943906128406525, "learning_rate": 1e-05, "loss": 0.0169, "step": 1120200 }, { "epoch": 0.011203, "grad_norm": 0.17318592965602875, "learning_rate": 1e-05, "loss": 0.0168, "step": 1120300 }, { "epoch": 0.011204, "grad_norm": 0.1452368050813675, "learning_rate": 1e-05, "loss": 0.0173, "step": 1120400 }, { "epoch": 0.011205, "grad_norm": 0.127362459897995, "learning_rate": 1e-05, "loss": 0.0171, "step": 1120500 }, { "epoch": 0.011206, "grad_norm": 0.11183317750692368, "learning_rate": 1e-05, "loss": 0.017, "step": 1120600 }, { "epoch": 0.011207, "grad_norm": 0.11396680772304535, "learning_rate": 1e-05, "loss": 0.0171, "step": 1120700 }, { "epoch": 0.011208, "grad_norm": 0.12333860248327255, "learning_rate": 1e-05, "loss": 0.0173, "step": 1120800 }, { "epoch": 0.011209, "grad_norm": 0.10730709880590439, "learning_rate": 1e-05, "loss": 0.0167, "step": 1120900 }, { "epoch": 0.01121, "grad_norm": 0.21195922791957855, "learning_rate": 1e-05, "loss": 0.0169, "step": 1121000 }, { "epoch": 0.011211, "grad_norm": 0.12162138521671295, "learning_rate": 1e-05, "loss": 0.017, "step": 1121100 }, { "epoch": 0.011212, "grad_norm": 0.1178150475025177, "learning_rate": 1e-05, "loss": 0.0165, "step": 1121200 }, { "epoch": 0.011213, "grad_norm": 0.1086525246500969, "learning_rate": 1e-05, "loss": 0.0168, "step": 1121300 }, { "epoch": 0.011214, "grad_norm": 0.1470380574464798, "learning_rate": 1e-05, "loss": 0.0169, "step": 1121400 }, { "epoch": 0.011215, "grad_norm": 0.1705387383699417, "learning_rate": 1e-05, "loss": 0.017, "step": 1121500 }, { "epoch": 0.011216, "grad_norm": 0.14958485960960388, "learning_rate": 1e-05, "loss": 0.0171, "step": 1121600 }, { "epoch": 0.011217, "grad_norm": 0.14746691286563873, "learning_rate": 1e-05, "loss": 0.0171, "step": 1121700 }, { "epoch": 0.011218, "grad_norm": 0.16806983947753906, "learning_rate": 1e-05, "loss": 0.017, "step": 1121800 }, { "epoch": 0.011219, "grad_norm": 0.17904037237167358, "learning_rate": 1e-05, "loss": 0.0173, "step": 1121900 }, { "epoch": 0.01122, "grad_norm": 0.15847726166248322, "learning_rate": 1e-05, "loss": 0.0174, "step": 1122000 }, { "epoch": 0.011221, "grad_norm": 0.1215418353676796, "learning_rate": 1e-05, "loss": 0.0172, "step": 1122100 }, { "epoch": 0.011222, "grad_norm": 0.1523095816373825, "learning_rate": 1e-05, "loss": 0.0171, "step": 1122200 }, { "epoch": 0.011223, "grad_norm": 0.10258562117815018, "learning_rate": 1e-05, "loss": 0.0169, "step": 1122300 }, { "epoch": 0.011224, "grad_norm": 0.1490822434425354, "learning_rate": 1e-05, "loss": 0.0169, "step": 1122400 }, { "epoch": 0.011225, "grad_norm": 0.1686617136001587, "learning_rate": 1e-05, "loss": 0.0173, "step": 1122500 }, { "epoch": 0.011226, "grad_norm": 0.15074171125888824, "learning_rate": 1e-05, "loss": 0.0172, "step": 1122600 }, { "epoch": 0.011227, "grad_norm": 0.09687899053096771, "learning_rate": 1e-05, "loss": 0.017, "step": 1122700 }, { "epoch": 0.011228, "grad_norm": 0.11583299934864044, "learning_rate": 1e-05, "loss": 0.0173, "step": 1122800 }, { "epoch": 0.011229, "grad_norm": 0.14795713126659393, "learning_rate": 1e-05, "loss": 0.0173, "step": 1122900 }, { "epoch": 0.01123, "grad_norm": 0.18081627786159515, "learning_rate": 1e-05, "loss": 0.0172, "step": 1123000 }, { "epoch": 0.011231, "grad_norm": 0.10216055810451508, "learning_rate": 1e-05, "loss": 0.0173, "step": 1123100 }, { "epoch": 0.011232, "grad_norm": 0.13851411640644073, "learning_rate": 1e-05, "loss": 0.017, "step": 1123200 }, { "epoch": 0.011233, "grad_norm": 0.12262114137411118, "learning_rate": 1e-05, "loss": 0.0164, "step": 1123300 }, { "epoch": 0.011234, "grad_norm": 0.11051113903522491, "learning_rate": 1e-05, "loss": 0.0171, "step": 1123400 }, { "epoch": 0.011235, "grad_norm": 0.10449884086847305, "learning_rate": 1e-05, "loss": 0.0177, "step": 1123500 }, { "epoch": 0.011236, "grad_norm": 0.11997269093990326, "learning_rate": 1e-05, "loss": 0.0165, "step": 1123600 }, { "epoch": 0.011237, "grad_norm": 0.15489737689495087, "learning_rate": 1e-05, "loss": 0.0169, "step": 1123700 }, { "epoch": 0.011238, "grad_norm": 0.13791613280773163, "learning_rate": 1e-05, "loss": 0.0174, "step": 1123800 }, { "epoch": 0.011239, "grad_norm": 0.14277717471122742, "learning_rate": 1e-05, "loss": 0.0169, "step": 1123900 }, { "epoch": 0.01124, "grad_norm": 0.11832166463136673, "learning_rate": 1e-05, "loss": 0.0171, "step": 1124000 }, { "epoch": 0.011241, "grad_norm": 0.1637575626373291, "learning_rate": 1e-05, "loss": 0.0175, "step": 1124100 }, { "epoch": 0.011242, "grad_norm": 0.18368655443191528, "learning_rate": 1e-05, "loss": 0.0169, "step": 1124200 }, { "epoch": 0.011243, "grad_norm": 0.14841574430465698, "learning_rate": 1e-05, "loss": 0.0171, "step": 1124300 }, { "epoch": 0.011244, "grad_norm": 0.10886381566524506, "learning_rate": 1e-05, "loss": 0.0172, "step": 1124400 }, { "epoch": 0.011245, "grad_norm": 0.15116022527217865, "learning_rate": 1e-05, "loss": 0.0173, "step": 1124500 }, { "epoch": 0.011246, "grad_norm": 0.13534030318260193, "learning_rate": 1e-05, "loss": 0.0174, "step": 1124600 }, { "epoch": 0.011247, "grad_norm": 0.10634259134531021, "learning_rate": 1e-05, "loss": 0.0171, "step": 1124700 }, { "epoch": 0.011248, "grad_norm": 0.17869974672794342, "learning_rate": 1e-05, "loss": 0.0168, "step": 1124800 }, { "epoch": 0.011249, "grad_norm": 0.1830439418554306, "learning_rate": 1e-05, "loss": 0.0167, "step": 1124900 }, { "epoch": 0.01125, "grad_norm": 0.18736886978149414, "learning_rate": 1e-05, "loss": 0.0168, "step": 1125000 }, { "epoch": 0.011251, "grad_norm": 0.08403582125902176, "learning_rate": 1e-05, "loss": 0.0165, "step": 1125100 }, { "epoch": 0.011252, "grad_norm": 0.1230316236615181, "learning_rate": 1e-05, "loss": 0.0173, "step": 1125200 }, { "epoch": 0.011253, "grad_norm": 0.11458443850278854, "learning_rate": 1e-05, "loss": 0.0167, "step": 1125300 }, { "epoch": 0.011254, "grad_norm": 0.15862858295440674, "learning_rate": 1e-05, "loss": 0.0168, "step": 1125400 }, { "epoch": 0.011255, "grad_norm": 0.12172126770019531, "learning_rate": 1e-05, "loss": 0.0169, "step": 1125500 }, { "epoch": 0.011256, "grad_norm": 0.19674251973628998, "learning_rate": 1e-05, "loss": 0.0169, "step": 1125600 }, { "epoch": 0.011257, "grad_norm": 0.1320592314004898, "learning_rate": 1e-05, "loss": 0.0171, "step": 1125700 }, { "epoch": 0.011258, "grad_norm": 0.11205053329467773, "learning_rate": 1e-05, "loss": 0.0167, "step": 1125800 }, { "epoch": 0.011259, "grad_norm": 0.14960449934005737, "learning_rate": 1e-05, "loss": 0.017, "step": 1125900 }, { "epoch": 0.01126, "grad_norm": 0.10495594888925552, "learning_rate": 1e-05, "loss": 0.0168, "step": 1126000 }, { "epoch": 0.011261, "grad_norm": 0.1236938014626503, "learning_rate": 1e-05, "loss": 0.017, "step": 1126100 }, { "epoch": 0.011262, "grad_norm": 0.17460402846336365, "learning_rate": 1e-05, "loss": 0.017, "step": 1126200 }, { "epoch": 0.011263, "grad_norm": 0.14298957586288452, "learning_rate": 1e-05, "loss": 0.0164, "step": 1126300 }, { "epoch": 0.011264, "grad_norm": 0.09709619730710983, "learning_rate": 1e-05, "loss": 0.0169, "step": 1126400 }, { "epoch": 0.011265, "grad_norm": 0.1376914530992508, "learning_rate": 1e-05, "loss": 0.0171, "step": 1126500 }, { "epoch": 0.011266, "grad_norm": 0.10703008621931076, "learning_rate": 1e-05, "loss": 0.0166, "step": 1126600 }, { "epoch": 0.011267, "grad_norm": 0.14323115348815918, "learning_rate": 1e-05, "loss": 0.017, "step": 1126700 }, { "epoch": 0.011268, "grad_norm": 0.14483052492141724, "learning_rate": 1e-05, "loss": 0.0172, "step": 1126800 }, { "epoch": 0.011269, "grad_norm": 0.10255639255046844, "learning_rate": 1e-05, "loss": 0.017, "step": 1126900 }, { "epoch": 0.01127, "grad_norm": 0.1635277420282364, "learning_rate": 1e-05, "loss": 0.0171, "step": 1127000 }, { "epoch": 0.011271, "grad_norm": 0.13560360670089722, "learning_rate": 1e-05, "loss": 0.0174, "step": 1127100 }, { "epoch": 0.011272, "grad_norm": 0.15436260402202606, "learning_rate": 1e-05, "loss": 0.0172, "step": 1127200 }, { "epoch": 0.011273, "grad_norm": 0.11363206803798676, "learning_rate": 1e-05, "loss": 0.0171, "step": 1127300 }, { "epoch": 0.011274, "grad_norm": 0.12254142016172409, "learning_rate": 1e-05, "loss": 0.0164, "step": 1127400 }, { "epoch": 0.011275, "grad_norm": 0.12402881681919098, "learning_rate": 1e-05, "loss": 0.0165, "step": 1127500 }, { "epoch": 0.011276, "grad_norm": 0.15002486109733582, "learning_rate": 1e-05, "loss": 0.017, "step": 1127600 }, { "epoch": 0.011277, "grad_norm": 0.14036637544631958, "learning_rate": 1e-05, "loss": 0.0169, "step": 1127700 }, { "epoch": 0.011278, "grad_norm": 0.14986664056777954, "learning_rate": 1e-05, "loss": 0.0175, "step": 1127800 }, { "epoch": 0.011279, "grad_norm": 0.15079636871814728, "learning_rate": 1e-05, "loss": 0.0172, "step": 1127900 }, { "epoch": 0.01128, "grad_norm": 0.1273769587278366, "learning_rate": 1e-05, "loss": 0.0171, "step": 1128000 }, { "epoch": 0.011281, "grad_norm": 0.1049274429678917, "learning_rate": 1e-05, "loss": 0.017, "step": 1128100 }, { "epoch": 0.011282, "grad_norm": 0.11931074410676956, "learning_rate": 1e-05, "loss": 0.0171, "step": 1128200 }, { "epoch": 0.011283, "grad_norm": 0.12931257486343384, "learning_rate": 1e-05, "loss": 0.0173, "step": 1128300 }, { "epoch": 0.011284, "grad_norm": 0.12949725985527039, "learning_rate": 1e-05, "loss": 0.017, "step": 1128400 }, { "epoch": 0.011285, "grad_norm": 0.11473546177148819, "learning_rate": 1e-05, "loss": 0.0171, "step": 1128500 }, { "epoch": 0.011286, "grad_norm": 0.16741804778575897, "learning_rate": 1e-05, "loss": 0.017, "step": 1128600 }, { "epoch": 0.011287, "grad_norm": 0.09538117051124573, "learning_rate": 1e-05, "loss": 0.017, "step": 1128700 }, { "epoch": 0.011288, "grad_norm": 0.13411422073841095, "learning_rate": 1e-05, "loss": 0.0167, "step": 1128800 }, { "epoch": 0.011289, "grad_norm": 0.11891023069620132, "learning_rate": 1e-05, "loss": 0.0175, "step": 1128900 }, { "epoch": 0.01129, "grad_norm": 0.12546950578689575, "learning_rate": 1e-05, "loss": 0.017, "step": 1129000 }, { "epoch": 0.011291, "grad_norm": 0.11392230540513992, "learning_rate": 1e-05, "loss": 0.017, "step": 1129100 }, { "epoch": 0.011292, "grad_norm": 0.1349812150001526, "learning_rate": 1e-05, "loss": 0.0168, "step": 1129200 }, { "epoch": 0.011293, "grad_norm": 0.11846213787794113, "learning_rate": 1e-05, "loss": 0.0172, "step": 1129300 }, { "epoch": 0.011294, "grad_norm": 0.1718623787164688, "learning_rate": 1e-05, "loss": 0.0173, "step": 1129400 }, { "epoch": 0.011295, "grad_norm": 0.10486960411071777, "learning_rate": 1e-05, "loss": 0.017, "step": 1129500 }, { "epoch": 0.011296, "grad_norm": 0.14088216423988342, "learning_rate": 1e-05, "loss": 0.0173, "step": 1129600 }, { "epoch": 0.011297, "grad_norm": 0.13113972544670105, "learning_rate": 1e-05, "loss": 0.0168, "step": 1129700 }, { "epoch": 0.011298, "grad_norm": 0.1321193128824234, "learning_rate": 1e-05, "loss": 0.0169, "step": 1129800 }, { "epoch": 0.011299, "grad_norm": 0.15848015248775482, "learning_rate": 1e-05, "loss": 0.0169, "step": 1129900 }, { "epoch": 0.0113, "grad_norm": 0.19547991454601288, "learning_rate": 1e-05, "loss": 0.0165, "step": 1130000 }, { "epoch": 0.011301, "grad_norm": 0.1174364909529686, "learning_rate": 1e-05, "loss": 0.017, "step": 1130100 }, { "epoch": 0.011302, "grad_norm": 0.13318724930286407, "learning_rate": 1e-05, "loss": 0.0167, "step": 1130200 }, { "epoch": 0.011303, "grad_norm": 0.13553965091705322, "learning_rate": 1e-05, "loss": 0.0176, "step": 1130300 }, { "epoch": 0.011304, "grad_norm": 0.16678650677204132, "learning_rate": 1e-05, "loss": 0.0172, "step": 1130400 }, { "epoch": 0.011305, "grad_norm": 0.1929180920124054, "learning_rate": 1e-05, "loss": 0.0172, "step": 1130500 }, { "epoch": 0.011306, "grad_norm": 0.12110816687345505, "learning_rate": 1e-05, "loss": 0.0166, "step": 1130600 }, { "epoch": 0.011307, "grad_norm": 0.1367226392030716, "learning_rate": 1e-05, "loss": 0.0168, "step": 1130700 }, { "epoch": 0.011308, "grad_norm": 0.14079773426055908, "learning_rate": 1e-05, "loss": 0.017, "step": 1130800 }, { "epoch": 0.011309, "grad_norm": 0.11440712213516235, "learning_rate": 1e-05, "loss": 0.017, "step": 1130900 }, { "epoch": 0.01131, "grad_norm": 0.14564000070095062, "learning_rate": 1e-05, "loss": 0.0169, "step": 1131000 }, { "epoch": 0.011311, "grad_norm": 0.14865519106388092, "learning_rate": 1e-05, "loss": 0.017, "step": 1131100 }, { "epoch": 0.011312, "grad_norm": 0.12073763459920883, "learning_rate": 1e-05, "loss": 0.0168, "step": 1131200 }, { "epoch": 0.011313, "grad_norm": 0.14672788977622986, "learning_rate": 1e-05, "loss": 0.0171, "step": 1131300 }, { "epoch": 0.011314, "grad_norm": 0.12927713990211487, "learning_rate": 1e-05, "loss": 0.0168, "step": 1131400 }, { "epoch": 0.011315, "grad_norm": 0.13303321599960327, "learning_rate": 1e-05, "loss": 0.0166, "step": 1131500 }, { "epoch": 0.011316, "grad_norm": 0.2440183460712433, "learning_rate": 1e-05, "loss": 0.0168, "step": 1131600 }, { "epoch": 0.011317, "grad_norm": 0.13427750766277313, "learning_rate": 1e-05, "loss": 0.0168, "step": 1131700 }, { "epoch": 0.011318, "grad_norm": 0.20584119856357574, "learning_rate": 1e-05, "loss": 0.0169, "step": 1131800 }, { "epoch": 0.011319, "grad_norm": 0.11467920988798141, "learning_rate": 1e-05, "loss": 0.017, "step": 1131900 }, { "epoch": 0.01132, "grad_norm": 0.1411975473165512, "learning_rate": 1e-05, "loss": 0.0173, "step": 1132000 }, { "epoch": 0.011321, "grad_norm": 0.14368632435798645, "learning_rate": 1e-05, "loss": 0.0169, "step": 1132100 }, { "epoch": 0.011322, "grad_norm": 0.1375512331724167, "learning_rate": 1e-05, "loss": 0.0169, "step": 1132200 }, { "epoch": 0.011323, "grad_norm": 0.11080422252416611, "learning_rate": 1e-05, "loss": 0.0172, "step": 1132300 }, { "epoch": 0.011324, "grad_norm": 0.12611211836338043, "learning_rate": 1e-05, "loss": 0.0169, "step": 1132400 }, { "epoch": 0.011325, "grad_norm": 0.10247102379798889, "learning_rate": 1e-05, "loss": 0.017, "step": 1132500 }, { "epoch": 0.011326, "grad_norm": 0.1358327567577362, "learning_rate": 1e-05, "loss": 0.0168, "step": 1132600 }, { "epoch": 0.011327, "grad_norm": 0.1599634736776352, "learning_rate": 1e-05, "loss": 0.0172, "step": 1132700 }, { "epoch": 0.011328, "grad_norm": 0.12613654136657715, "learning_rate": 1e-05, "loss": 0.0166, "step": 1132800 }, { "epoch": 0.011329, "grad_norm": 0.15971794724464417, "learning_rate": 1e-05, "loss": 0.0169, "step": 1132900 }, { "epoch": 0.01133, "grad_norm": 0.15001387894153595, "learning_rate": 1e-05, "loss": 0.0174, "step": 1133000 }, { "epoch": 0.011331, "grad_norm": 0.13013264536857605, "learning_rate": 1e-05, "loss": 0.017, "step": 1133100 }, { "epoch": 0.011332, "grad_norm": 0.14445777237415314, "learning_rate": 1e-05, "loss": 0.0166, "step": 1133200 }, { "epoch": 0.011333, "grad_norm": 0.13813233375549316, "learning_rate": 1e-05, "loss": 0.0171, "step": 1133300 }, { "epoch": 0.011334, "grad_norm": 0.16267399489879608, "learning_rate": 1e-05, "loss": 0.017, "step": 1133400 }, { "epoch": 0.011335, "grad_norm": 0.14795510470867157, "learning_rate": 1e-05, "loss": 0.0171, "step": 1133500 }, { "epoch": 0.011336, "grad_norm": 0.11415031552314758, "learning_rate": 1e-05, "loss": 0.0166, "step": 1133600 }, { "epoch": 0.011337, "grad_norm": 0.15094740688800812, "learning_rate": 1e-05, "loss": 0.017, "step": 1133700 }, { "epoch": 0.011338, "grad_norm": 0.10835467278957367, "learning_rate": 1e-05, "loss": 0.0172, "step": 1133800 }, { "epoch": 0.011339, "grad_norm": 0.19475580751895905, "learning_rate": 1e-05, "loss": 0.017, "step": 1133900 }, { "epoch": 0.01134, "grad_norm": 0.11596526950597763, "learning_rate": 1e-05, "loss": 0.017, "step": 1134000 }, { "epoch": 0.011341, "grad_norm": 0.09835969656705856, "learning_rate": 1e-05, "loss": 0.017, "step": 1134100 }, { "epoch": 0.011342, "grad_norm": 0.12963013350963593, "learning_rate": 1e-05, "loss": 0.017, "step": 1134200 }, { "epoch": 0.011343, "grad_norm": 0.11493764817714691, "learning_rate": 1e-05, "loss": 0.0169, "step": 1134300 }, { "epoch": 0.011344, "grad_norm": 0.16819368302822113, "learning_rate": 1e-05, "loss": 0.0176, "step": 1134400 }, { "epoch": 0.011345, "grad_norm": 0.14079982042312622, "learning_rate": 1e-05, "loss": 0.0173, "step": 1134500 }, { "epoch": 0.011346, "grad_norm": 0.17607782781124115, "learning_rate": 1e-05, "loss": 0.0174, "step": 1134600 }, { "epoch": 0.011347, "grad_norm": 0.17923256754875183, "learning_rate": 1e-05, "loss": 0.0172, "step": 1134700 }, { "epoch": 0.011348, "grad_norm": 0.10399632155895233, "learning_rate": 1e-05, "loss": 0.017, "step": 1134800 }, { "epoch": 0.011349, "grad_norm": 0.1215122640132904, "learning_rate": 1e-05, "loss": 0.0169, "step": 1134900 }, { "epoch": 0.01135, "grad_norm": 0.13617266714572906, "learning_rate": 1e-05, "loss": 0.0176, "step": 1135000 }, { "epoch": 0.011351, "grad_norm": 0.1761876940727234, "learning_rate": 1e-05, "loss": 0.017, "step": 1135100 }, { "epoch": 0.011352, "grad_norm": 0.13658931851387024, "learning_rate": 1e-05, "loss": 0.0169, "step": 1135200 }, { "epoch": 0.011353, "grad_norm": 0.11116798222064972, "learning_rate": 1e-05, "loss": 0.0173, "step": 1135300 }, { "epoch": 0.011354, "grad_norm": 0.14434443414211273, "learning_rate": 1e-05, "loss": 0.0166, "step": 1135400 }, { "epoch": 0.011355, "grad_norm": 0.20816203951835632, "learning_rate": 1e-05, "loss": 0.017, "step": 1135500 }, { "epoch": 0.011356, "grad_norm": 0.14001929759979248, "learning_rate": 1e-05, "loss": 0.0166, "step": 1135600 }, { "epoch": 0.011357, "grad_norm": 0.10645480453968048, "learning_rate": 1e-05, "loss": 0.0168, "step": 1135700 }, { "epoch": 0.011358, "grad_norm": 0.16393637657165527, "learning_rate": 1e-05, "loss": 0.0174, "step": 1135800 }, { "epoch": 0.011359, "grad_norm": 0.11381770670413971, "learning_rate": 1e-05, "loss": 0.0166, "step": 1135900 }, { "epoch": 0.01136, "grad_norm": 0.12064823508262634, "learning_rate": 1e-05, "loss": 0.017, "step": 1136000 }, { "epoch": 0.011361, "grad_norm": 0.15967540442943573, "learning_rate": 1e-05, "loss": 0.0167, "step": 1136100 }, { "epoch": 0.011362, "grad_norm": 0.18569375574588776, "learning_rate": 1e-05, "loss": 0.017, "step": 1136200 }, { "epoch": 0.011363, "grad_norm": 0.13788360357284546, "learning_rate": 1e-05, "loss": 0.0172, "step": 1136300 }, { "epoch": 0.011364, "grad_norm": 0.1368684470653534, "learning_rate": 1e-05, "loss": 0.0167, "step": 1136400 }, { "epoch": 0.011365, "grad_norm": 0.11564384400844574, "learning_rate": 1e-05, "loss": 0.0172, "step": 1136500 }, { "epoch": 0.011366, "grad_norm": 0.1463245451450348, "learning_rate": 1e-05, "loss": 0.0171, "step": 1136600 }, { "epoch": 0.011367, "grad_norm": 0.10650021582841873, "learning_rate": 1e-05, "loss": 0.0168, "step": 1136700 }, { "epoch": 0.011368, "grad_norm": 0.1157040148973465, "learning_rate": 1e-05, "loss": 0.0165, "step": 1136800 }, { "epoch": 0.011369, "grad_norm": 0.16200122237205505, "learning_rate": 1e-05, "loss": 0.0171, "step": 1136900 }, { "epoch": 0.01137, "grad_norm": 0.11958647519350052, "learning_rate": 1e-05, "loss": 0.0165, "step": 1137000 }, { "epoch": 0.011371, "grad_norm": 0.13013583421707153, "learning_rate": 1e-05, "loss": 0.0169, "step": 1137100 }, { "epoch": 0.011372, "grad_norm": 0.11175316572189331, "learning_rate": 1e-05, "loss": 0.0171, "step": 1137200 }, { "epoch": 0.011373, "grad_norm": 0.26297616958618164, "learning_rate": 1e-05, "loss": 0.0169, "step": 1137300 }, { "epoch": 0.011374, "grad_norm": 0.1153082326054573, "learning_rate": 1e-05, "loss": 0.0171, "step": 1137400 }, { "epoch": 0.011375, "grad_norm": 0.12205848842859268, "learning_rate": 1e-05, "loss": 0.0168, "step": 1137500 }, { "epoch": 0.011376, "grad_norm": 0.14772921800613403, "learning_rate": 1e-05, "loss": 0.0171, "step": 1137600 }, { "epoch": 0.011377, "grad_norm": 0.1377038210630417, "learning_rate": 1e-05, "loss": 0.017, "step": 1137700 }, { "epoch": 0.011378, "grad_norm": 0.18060466647148132, "learning_rate": 1e-05, "loss": 0.0168, "step": 1137800 }, { "epoch": 0.011379, "grad_norm": 0.1308259516954422, "learning_rate": 1e-05, "loss": 0.0167, "step": 1137900 }, { "epoch": 0.01138, "grad_norm": 0.11246690899133682, "learning_rate": 1e-05, "loss": 0.0174, "step": 1138000 }, { "epoch": 0.011381, "grad_norm": 0.11888374388217926, "learning_rate": 1e-05, "loss": 0.0169, "step": 1138100 }, { "epoch": 0.011382, "grad_norm": 0.1364215910434723, "learning_rate": 1e-05, "loss": 0.0169, "step": 1138200 }, { "epoch": 0.011383, "grad_norm": 0.11306270956993103, "learning_rate": 1e-05, "loss": 0.0169, "step": 1138300 }, { "epoch": 0.011384, "grad_norm": 0.10916703194379807, "learning_rate": 1e-05, "loss": 0.0169, "step": 1138400 }, { "epoch": 0.011385, "grad_norm": 0.16163156926631927, "learning_rate": 1e-05, "loss": 0.0169, "step": 1138500 }, { "epoch": 0.011386, "grad_norm": 0.13058212399482727, "learning_rate": 1e-05, "loss": 0.0169, "step": 1138600 }, { "epoch": 0.011387, "grad_norm": 0.11882205307483673, "learning_rate": 1e-05, "loss": 0.0167, "step": 1138700 }, { "epoch": 0.011388, "grad_norm": 0.14884643256664276, "learning_rate": 1e-05, "loss": 0.0172, "step": 1138800 }, { "epoch": 0.011389, "grad_norm": 0.13712480664253235, "learning_rate": 1e-05, "loss": 0.0167, "step": 1138900 }, { "epoch": 0.01139, "grad_norm": 0.15779553353786469, "learning_rate": 1e-05, "loss": 0.0171, "step": 1139000 }, { "epoch": 0.011391, "grad_norm": 0.10997942090034485, "learning_rate": 1e-05, "loss": 0.0174, "step": 1139100 }, { "epoch": 0.011392, "grad_norm": 0.12567465007305145, "learning_rate": 1e-05, "loss": 0.0171, "step": 1139200 }, { "epoch": 0.011393, "grad_norm": 0.19352926313877106, "learning_rate": 1e-05, "loss": 0.0169, "step": 1139300 }, { "epoch": 0.011394, "grad_norm": 0.10944530367851257, "learning_rate": 1e-05, "loss": 0.0169, "step": 1139400 }, { "epoch": 0.011395, "grad_norm": 0.13691604137420654, "learning_rate": 1e-05, "loss": 0.0167, "step": 1139500 }, { "epoch": 0.011396, "grad_norm": 0.11621107906103134, "learning_rate": 1e-05, "loss": 0.0168, "step": 1139600 }, { "epoch": 0.011397, "grad_norm": 0.14478951692581177, "learning_rate": 1e-05, "loss": 0.0167, "step": 1139700 }, { "epoch": 0.011398, "grad_norm": 0.23139184713363647, "learning_rate": 1e-05, "loss": 0.0174, "step": 1139800 }, { "epoch": 0.011399, "grad_norm": 0.21095965802669525, "learning_rate": 1e-05, "loss": 0.0172, "step": 1139900 }, { "epoch": 0.0114, "grad_norm": 0.17247094213962555, "learning_rate": 1e-05, "loss": 0.0168, "step": 1140000 }, { "epoch": 0.0114, "eval_loss": 0.01488504558801651, "eval_runtime": 174.0843, "eval_samples_per_second": 287.217, "eval_steps_per_second": 17.951, "step": 1140000 }, { "epoch": 0.011401, "grad_norm": 0.14041578769683838, "learning_rate": 1e-05, "loss": 0.017, "step": 1140100 }, { "epoch": 0.011402, "grad_norm": 0.2056211531162262, "learning_rate": 1e-05, "loss": 0.0173, "step": 1140200 }, { "epoch": 0.011403, "grad_norm": 0.18245379626750946, "learning_rate": 1e-05, "loss": 0.0169, "step": 1140300 }, { "epoch": 0.011404, "grad_norm": 0.12272041290998459, "learning_rate": 1e-05, "loss": 0.0167, "step": 1140400 }, { "epoch": 0.011405, "grad_norm": 0.15782156586647034, "learning_rate": 1e-05, "loss": 0.017, "step": 1140500 }, { "epoch": 0.011406, "grad_norm": 0.11372246593236923, "learning_rate": 1e-05, "loss": 0.0171, "step": 1140600 }, { "epoch": 0.011407, "grad_norm": 0.12753663957118988, "learning_rate": 1e-05, "loss": 0.0173, "step": 1140700 }, { "epoch": 0.011408, "grad_norm": 0.14147977530956268, "learning_rate": 1e-05, "loss": 0.0172, "step": 1140800 }, { "epoch": 0.011409, "grad_norm": 0.10080444067716599, "learning_rate": 1e-05, "loss": 0.017, "step": 1140900 }, { "epoch": 0.01141, "grad_norm": 0.13318295776844025, "learning_rate": 1e-05, "loss": 0.0169, "step": 1141000 }, { "epoch": 0.011411, "grad_norm": 0.19556483626365662, "learning_rate": 1e-05, "loss": 0.0167, "step": 1141100 }, { "epoch": 0.011412, "grad_norm": 0.12289344519376755, "learning_rate": 1e-05, "loss": 0.0169, "step": 1141200 }, { "epoch": 0.011413, "grad_norm": 0.13305822014808655, "learning_rate": 1e-05, "loss": 0.0168, "step": 1141300 }, { "epoch": 0.011414, "grad_norm": 0.12480240315198898, "learning_rate": 1e-05, "loss": 0.0174, "step": 1141400 }, { "epoch": 0.011415, "grad_norm": 0.10226883739233017, "learning_rate": 1e-05, "loss": 0.017, "step": 1141500 }, { "epoch": 0.011416, "grad_norm": 0.13029083609580994, "learning_rate": 1e-05, "loss": 0.0173, "step": 1141600 }, { "epoch": 0.011417, "grad_norm": 0.1396496444940567, "learning_rate": 1e-05, "loss": 0.0168, "step": 1141700 }, { "epoch": 0.011418, "grad_norm": 0.14370162785053253, "learning_rate": 1e-05, "loss": 0.0171, "step": 1141800 }, { "epoch": 0.011419, "grad_norm": 0.12659214437007904, "learning_rate": 1e-05, "loss": 0.0169, "step": 1141900 }, { "epoch": 0.01142, "grad_norm": 0.1746404469013214, "learning_rate": 1e-05, "loss": 0.0171, "step": 1142000 }, { "epoch": 0.011421, "grad_norm": 0.1337900161743164, "learning_rate": 1e-05, "loss": 0.017, "step": 1142100 }, { "epoch": 0.011422, "grad_norm": 0.14954328536987305, "learning_rate": 1e-05, "loss": 0.0172, "step": 1142200 }, { "epoch": 0.011423, "grad_norm": 0.13651220500469208, "learning_rate": 1e-05, "loss": 0.0168, "step": 1142300 }, { "epoch": 0.011424, "grad_norm": 0.11656204611063004, "learning_rate": 1e-05, "loss": 0.017, "step": 1142400 }, { "epoch": 0.011425, "grad_norm": 0.10978394001722336, "learning_rate": 1e-05, "loss": 0.0168, "step": 1142500 }, { "epoch": 0.011426, "grad_norm": 0.1684810221195221, "learning_rate": 1e-05, "loss": 0.0172, "step": 1142600 }, { "epoch": 0.011427, "grad_norm": 0.10778705030679703, "learning_rate": 1e-05, "loss": 0.0172, "step": 1142700 }, { "epoch": 0.011428, "grad_norm": 0.14017829298973083, "learning_rate": 1e-05, "loss": 0.0171, "step": 1142800 }, { "epoch": 0.011429, "grad_norm": 0.15369264781475067, "learning_rate": 1e-05, "loss": 0.017, "step": 1142900 }, { "epoch": 0.01143, "grad_norm": 0.13816437125205994, "learning_rate": 1e-05, "loss": 0.0173, "step": 1143000 }, { "epoch": 0.011431, "grad_norm": 0.15179292857646942, "learning_rate": 1e-05, "loss": 0.0168, "step": 1143100 }, { "epoch": 0.011432, "grad_norm": 0.1309405118227005, "learning_rate": 1e-05, "loss": 0.0168, "step": 1143200 }, { "epoch": 0.011433, "grad_norm": 0.11290021985769272, "learning_rate": 1e-05, "loss": 0.0165, "step": 1143300 }, { "epoch": 0.011434, "grad_norm": 0.13102523982524872, "learning_rate": 1e-05, "loss": 0.017, "step": 1143400 }, { "epoch": 0.011435, "grad_norm": 0.14651690423488617, "learning_rate": 1e-05, "loss": 0.0174, "step": 1143500 }, { "epoch": 0.011436, "grad_norm": 0.10508169233798981, "learning_rate": 1e-05, "loss": 0.0175, "step": 1143600 }, { "epoch": 0.011437, "grad_norm": 0.12607678771018982, "learning_rate": 1e-05, "loss": 0.0177, "step": 1143700 }, { "epoch": 0.011438, "grad_norm": 0.08744026720523834, "learning_rate": 1e-05, "loss": 0.0171, "step": 1143800 }, { "epoch": 0.011439, "grad_norm": 0.14271430671215057, "learning_rate": 1e-05, "loss": 0.0173, "step": 1143900 }, { "epoch": 0.01144, "grad_norm": 0.12357847392559052, "learning_rate": 1e-05, "loss": 0.017, "step": 1144000 }, { "epoch": 0.011441, "grad_norm": 0.11874079704284668, "learning_rate": 1e-05, "loss": 0.0169, "step": 1144100 }, { "epoch": 0.011442, "grad_norm": 0.11856932938098907, "learning_rate": 1e-05, "loss": 0.017, "step": 1144200 }, { "epoch": 0.011443, "grad_norm": 0.09640617668628693, "learning_rate": 1e-05, "loss": 0.0166, "step": 1144300 }, { "epoch": 0.011444, "grad_norm": 0.12599176168441772, "learning_rate": 1e-05, "loss": 0.0167, "step": 1144400 }, { "epoch": 0.011445, "grad_norm": 0.12858760356903076, "learning_rate": 1e-05, "loss": 0.0172, "step": 1144500 }, { "epoch": 0.011446, "grad_norm": 0.1416226476430893, "learning_rate": 1e-05, "loss": 0.0169, "step": 1144600 }, { "epoch": 0.011447, "grad_norm": 0.12169510126113892, "learning_rate": 1e-05, "loss": 0.017, "step": 1144700 }, { "epoch": 0.011448, "grad_norm": 0.13205832242965698, "learning_rate": 1e-05, "loss": 0.0174, "step": 1144800 }, { "epoch": 0.011449, "grad_norm": 0.11234971135854721, "learning_rate": 1e-05, "loss": 0.0171, "step": 1144900 }, { "epoch": 0.01145, "grad_norm": 0.14655540883541107, "learning_rate": 1e-05, "loss": 0.0175, "step": 1145000 }, { "epoch": 0.011451, "grad_norm": 0.1254601627588272, "learning_rate": 1e-05, "loss": 0.017, "step": 1145100 }, { "epoch": 0.011452, "grad_norm": 0.10217186063528061, "learning_rate": 1e-05, "loss": 0.017, "step": 1145200 }, { "epoch": 0.011453, "grad_norm": 0.11439372599124908, "learning_rate": 1e-05, "loss": 0.017, "step": 1145300 }, { "epoch": 0.011454, "grad_norm": 0.13497169315814972, "learning_rate": 1e-05, "loss": 0.017, "step": 1145400 }, { "epoch": 0.011455, "grad_norm": 0.15679234266281128, "learning_rate": 1e-05, "loss": 0.0172, "step": 1145500 }, { "epoch": 0.011456, "grad_norm": 0.1055111289024353, "learning_rate": 1e-05, "loss": 0.0171, "step": 1145600 }, { "epoch": 0.011457, "grad_norm": 0.1363486796617508, "learning_rate": 1e-05, "loss": 0.017, "step": 1145700 }, { "epoch": 0.011458, "grad_norm": 0.12562598288059235, "learning_rate": 1e-05, "loss": 0.0173, "step": 1145800 }, { "epoch": 0.011459, "grad_norm": 0.1487775444984436, "learning_rate": 1e-05, "loss": 0.0168, "step": 1145900 }, { "epoch": 0.01146, "grad_norm": 0.12119293957948685, "learning_rate": 1e-05, "loss": 0.0167, "step": 1146000 }, { "epoch": 0.011461, "grad_norm": 0.08600138127803802, "learning_rate": 1e-05, "loss": 0.0169, "step": 1146100 }, { "epoch": 0.011462, "grad_norm": 0.16873273253440857, "learning_rate": 1e-05, "loss": 0.0171, "step": 1146200 }, { "epoch": 0.011463, "grad_norm": 0.12617526948451996, "learning_rate": 1e-05, "loss": 0.0171, "step": 1146300 }, { "epoch": 0.011464, "grad_norm": 0.11782141029834747, "learning_rate": 1e-05, "loss": 0.0173, "step": 1146400 }, { "epoch": 0.011465, "grad_norm": 0.12383846193552017, "learning_rate": 1e-05, "loss": 0.0175, "step": 1146500 }, { "epoch": 0.011466, "grad_norm": 0.12821783125400543, "learning_rate": 1e-05, "loss": 0.017, "step": 1146600 }, { "epoch": 0.011467, "grad_norm": 0.1431439071893692, "learning_rate": 1e-05, "loss": 0.0175, "step": 1146700 }, { "epoch": 0.011468, "grad_norm": 0.1440025418996811, "learning_rate": 1e-05, "loss": 0.0171, "step": 1146800 }, { "epoch": 0.011469, "grad_norm": 0.09689315408468246, "learning_rate": 1e-05, "loss": 0.0172, "step": 1146900 }, { "epoch": 0.01147, "grad_norm": 0.12288029491901398, "learning_rate": 1e-05, "loss": 0.0172, "step": 1147000 }, { "epoch": 0.011471, "grad_norm": 0.11793211102485657, "learning_rate": 1e-05, "loss": 0.0168, "step": 1147100 }, { "epoch": 0.011472, "grad_norm": 0.10725709050893784, "learning_rate": 1e-05, "loss": 0.017, "step": 1147200 }, { "epoch": 0.011473, "grad_norm": 0.1550995409488678, "learning_rate": 1e-05, "loss": 0.017, "step": 1147300 }, { "epoch": 0.011474, "grad_norm": 0.13336896896362305, "learning_rate": 1e-05, "loss": 0.0167, "step": 1147400 }, { "epoch": 0.011475, "grad_norm": 0.16782015562057495, "learning_rate": 1e-05, "loss": 0.0171, "step": 1147500 }, { "epoch": 0.011476, "grad_norm": 0.10072468221187592, "learning_rate": 1e-05, "loss": 0.0171, "step": 1147600 }, { "epoch": 0.011477, "grad_norm": 0.18868134915828705, "learning_rate": 1e-05, "loss": 0.0168, "step": 1147700 }, { "epoch": 0.011478, "grad_norm": 0.13859322667121887, "learning_rate": 1e-05, "loss": 0.0167, "step": 1147800 }, { "epoch": 0.011479, "grad_norm": 0.09394567459821701, "learning_rate": 1e-05, "loss": 0.0167, "step": 1147900 }, { "epoch": 0.01148, "grad_norm": 0.11514028906822205, "learning_rate": 1e-05, "loss": 0.0167, "step": 1148000 }, { "epoch": 0.011481, "grad_norm": 0.15184226632118225, "learning_rate": 1e-05, "loss": 0.017, "step": 1148100 }, { "epoch": 0.011482, "grad_norm": 0.09230310469865799, "learning_rate": 1e-05, "loss": 0.0171, "step": 1148200 }, { "epoch": 0.011483, "grad_norm": 0.12060874700546265, "learning_rate": 1e-05, "loss": 0.0169, "step": 1148300 }, { "epoch": 0.011484, "grad_norm": 0.07826296985149384, "learning_rate": 1e-05, "loss": 0.0166, "step": 1148400 }, { "epoch": 0.011485, "grad_norm": 0.12439630180597305, "learning_rate": 1e-05, "loss": 0.017, "step": 1148500 }, { "epoch": 0.011486, "grad_norm": 0.10205159336328506, "learning_rate": 1e-05, "loss": 0.0168, "step": 1148600 }, { "epoch": 0.011487, "grad_norm": 0.10806619375944138, "learning_rate": 1e-05, "loss": 0.0167, "step": 1148700 }, { "epoch": 0.011488, "grad_norm": 0.18521924316883087, "learning_rate": 1e-05, "loss": 0.0172, "step": 1148800 }, { "epoch": 0.011489, "grad_norm": 0.12685875594615936, "learning_rate": 1e-05, "loss": 0.017, "step": 1148900 }, { "epoch": 0.01149, "grad_norm": 0.10996326804161072, "learning_rate": 1e-05, "loss": 0.017, "step": 1149000 }, { "epoch": 0.011491, "grad_norm": 0.14832176268100739, "learning_rate": 1e-05, "loss": 0.017, "step": 1149100 }, { "epoch": 0.011492, "grad_norm": 0.13710840046405792, "learning_rate": 1e-05, "loss": 0.0172, "step": 1149200 }, { "epoch": 0.011493, "grad_norm": 0.14704883098602295, "learning_rate": 1e-05, "loss": 0.0169, "step": 1149300 }, { "epoch": 0.011494, "grad_norm": 0.13265912234783173, "learning_rate": 1e-05, "loss": 0.0169, "step": 1149400 }, { "epoch": 0.011495, "grad_norm": 0.12097005546092987, "learning_rate": 1e-05, "loss": 0.0169, "step": 1149500 }, { "epoch": 0.011496, "grad_norm": 0.11977540701627731, "learning_rate": 1e-05, "loss": 0.0174, "step": 1149600 }, { "epoch": 0.011497, "grad_norm": 0.19094759225845337, "learning_rate": 1e-05, "loss": 0.0172, "step": 1149700 }, { "epoch": 0.011498, "grad_norm": 0.16750381886959076, "learning_rate": 1e-05, "loss": 0.0167, "step": 1149800 }, { "epoch": 0.011499, "grad_norm": 0.15411625802516937, "learning_rate": 1e-05, "loss": 0.0168, "step": 1149900 }, { "epoch": 0.0115, "grad_norm": 0.12079764902591705, "learning_rate": 1e-05, "loss": 0.0171, "step": 1150000 }, { "epoch": 0.011501, "grad_norm": 0.1491960883140564, "learning_rate": 1e-05, "loss": 0.0168, "step": 1150100 }, { "epoch": 0.011502, "grad_norm": 0.15694183111190796, "learning_rate": 1e-05, "loss": 0.0169, "step": 1150200 }, { "epoch": 0.011503, "grad_norm": 0.12309864163398743, "learning_rate": 1e-05, "loss": 0.0167, "step": 1150300 }, { "epoch": 0.011504, "grad_norm": 0.19817698001861572, "learning_rate": 1e-05, "loss": 0.0169, "step": 1150400 }, { "epoch": 0.011505, "grad_norm": 0.1587182581424713, "learning_rate": 1e-05, "loss": 0.017, "step": 1150500 }, { "epoch": 0.011506, "grad_norm": 0.11457626521587372, "learning_rate": 1e-05, "loss": 0.017, "step": 1150600 }, { "epoch": 0.011507, "grad_norm": 0.15512289106845856, "learning_rate": 1e-05, "loss": 0.0164, "step": 1150700 }, { "epoch": 0.011508, "grad_norm": 0.14177191257476807, "learning_rate": 1e-05, "loss": 0.0166, "step": 1150800 }, { "epoch": 0.011509, "grad_norm": 0.12255334109067917, "learning_rate": 1e-05, "loss": 0.0166, "step": 1150900 }, { "epoch": 0.01151, "grad_norm": 0.15826617181301117, "learning_rate": 1e-05, "loss": 0.017, "step": 1151000 }, { "epoch": 0.011511, "grad_norm": 0.1703743189573288, "learning_rate": 1e-05, "loss": 0.0169, "step": 1151100 }, { "epoch": 0.011512, "grad_norm": 0.11918558925390244, "learning_rate": 1e-05, "loss": 0.0169, "step": 1151200 }, { "epoch": 0.011513, "grad_norm": 0.12170081585645676, "learning_rate": 1e-05, "loss": 0.0174, "step": 1151300 }, { "epoch": 0.011514, "grad_norm": 0.1278006136417389, "learning_rate": 1e-05, "loss": 0.0171, "step": 1151400 }, { "epoch": 0.011515, "grad_norm": 0.15694402158260345, "learning_rate": 1e-05, "loss": 0.0167, "step": 1151500 }, { "epoch": 0.011516, "grad_norm": 0.125954732298851, "learning_rate": 1e-05, "loss": 0.0171, "step": 1151600 }, { "epoch": 0.011517, "grad_norm": 0.12070639431476593, "learning_rate": 1e-05, "loss": 0.017, "step": 1151700 }, { "epoch": 0.011518, "grad_norm": 0.09913370013237, "learning_rate": 1e-05, "loss": 0.017, "step": 1151800 }, { "epoch": 0.011519, "grad_norm": 0.14453202486038208, "learning_rate": 1e-05, "loss": 0.0169, "step": 1151900 }, { "epoch": 0.01152, "grad_norm": 0.12170803546905518, "learning_rate": 1e-05, "loss": 0.0171, "step": 1152000 }, { "epoch": 0.011521, "grad_norm": 0.09660517424345016, "learning_rate": 1e-05, "loss": 0.0172, "step": 1152100 }, { "epoch": 0.011522, "grad_norm": 0.10911038517951965, "learning_rate": 1e-05, "loss": 0.0169, "step": 1152200 }, { "epoch": 0.011523, "grad_norm": 0.11818278580904007, "learning_rate": 1e-05, "loss": 0.0167, "step": 1152300 }, { "epoch": 0.011524, "grad_norm": 0.13629767298698425, "learning_rate": 1e-05, "loss": 0.0171, "step": 1152400 }, { "epoch": 0.011525, "grad_norm": 0.16540230810642242, "learning_rate": 1e-05, "loss": 0.0168, "step": 1152500 }, { "epoch": 0.011526, "grad_norm": 0.14027836918830872, "learning_rate": 1e-05, "loss": 0.0171, "step": 1152600 }, { "epoch": 0.011527, "grad_norm": 0.13486453890800476, "learning_rate": 1e-05, "loss": 0.0172, "step": 1152700 }, { "epoch": 0.011528, "grad_norm": 0.13520467281341553, "learning_rate": 1e-05, "loss": 0.0164, "step": 1152800 }, { "epoch": 0.011529, "grad_norm": 0.09924006462097168, "learning_rate": 1e-05, "loss": 0.0169, "step": 1152900 }, { "epoch": 0.01153, "grad_norm": 0.11575281620025635, "learning_rate": 1e-05, "loss": 0.0171, "step": 1153000 }, { "epoch": 0.011531, "grad_norm": 0.11122308671474457, "learning_rate": 1e-05, "loss": 0.0172, "step": 1153100 }, { "epoch": 0.011532, "grad_norm": 0.2030836045742035, "learning_rate": 1e-05, "loss": 0.0165, "step": 1153200 }, { "epoch": 0.011533, "grad_norm": 0.13658791780471802, "learning_rate": 1e-05, "loss": 0.017, "step": 1153300 }, { "epoch": 0.011534, "grad_norm": 0.08720725774765015, "learning_rate": 1e-05, "loss": 0.0171, "step": 1153400 }, { "epoch": 0.011535, "grad_norm": 0.140541672706604, "learning_rate": 1e-05, "loss": 0.0171, "step": 1153500 }, { "epoch": 0.011536, "grad_norm": 0.11292672902345657, "learning_rate": 1e-05, "loss": 0.0171, "step": 1153600 }, { "epoch": 0.011537, "grad_norm": 0.14500479400157928, "learning_rate": 1e-05, "loss": 0.0173, "step": 1153700 }, { "epoch": 0.011538, "grad_norm": 0.136617049574852, "learning_rate": 1e-05, "loss": 0.0168, "step": 1153800 }, { "epoch": 0.011539, "grad_norm": 0.18509100377559662, "learning_rate": 1e-05, "loss": 0.0167, "step": 1153900 }, { "epoch": 0.01154, "grad_norm": 0.19471536576747894, "learning_rate": 1e-05, "loss": 0.017, "step": 1154000 }, { "epoch": 0.011541, "grad_norm": 0.09877510368824005, "learning_rate": 1e-05, "loss": 0.0168, "step": 1154100 }, { "epoch": 0.011542, "grad_norm": 0.11244404315948486, "learning_rate": 1e-05, "loss": 0.0167, "step": 1154200 }, { "epoch": 0.011543, "grad_norm": 0.1055702418088913, "learning_rate": 1e-05, "loss": 0.0165, "step": 1154300 }, { "epoch": 0.011544, "grad_norm": 0.16583037376403809, "learning_rate": 1e-05, "loss": 0.0167, "step": 1154400 }, { "epoch": 0.011545, "grad_norm": 0.10624352097511292, "learning_rate": 1e-05, "loss": 0.0169, "step": 1154500 }, { "epoch": 0.011546, "grad_norm": 0.13482698798179626, "learning_rate": 1e-05, "loss": 0.0169, "step": 1154600 }, { "epoch": 0.011547, "grad_norm": 0.12261596322059631, "learning_rate": 1e-05, "loss": 0.017, "step": 1154700 }, { "epoch": 0.011548, "grad_norm": 0.09884320199489594, "learning_rate": 1e-05, "loss": 0.0167, "step": 1154800 }, { "epoch": 0.011549, "grad_norm": 0.1419481486082077, "learning_rate": 1e-05, "loss": 0.0167, "step": 1154900 }, { "epoch": 0.01155, "grad_norm": 0.10443808138370514, "learning_rate": 1e-05, "loss": 0.0172, "step": 1155000 }, { "epoch": 0.011551, "grad_norm": 0.14859041571617126, "learning_rate": 1e-05, "loss": 0.0169, "step": 1155100 }, { "epoch": 0.011552, "grad_norm": 0.1257603019475937, "learning_rate": 1e-05, "loss": 0.0172, "step": 1155200 }, { "epoch": 0.011553, "grad_norm": 0.18029823899269104, "learning_rate": 1e-05, "loss": 0.0172, "step": 1155300 }, { "epoch": 0.011554, "grad_norm": 0.13305313885211945, "learning_rate": 1e-05, "loss": 0.0168, "step": 1155400 }, { "epoch": 0.011555, "grad_norm": 0.14905132353305817, "learning_rate": 1e-05, "loss": 0.0171, "step": 1155500 }, { "epoch": 0.011556, "grad_norm": 0.12598878145217896, "learning_rate": 1e-05, "loss": 0.0172, "step": 1155600 }, { "epoch": 0.011557, "grad_norm": 0.12357113510370255, "learning_rate": 1e-05, "loss": 0.0167, "step": 1155700 }, { "epoch": 0.011558, "grad_norm": 0.1423133909702301, "learning_rate": 1e-05, "loss": 0.0169, "step": 1155800 }, { "epoch": 0.011559, "grad_norm": 0.11463060975074768, "learning_rate": 1e-05, "loss": 0.0168, "step": 1155900 }, { "epoch": 0.01156, "grad_norm": 0.1374080628156662, "learning_rate": 1e-05, "loss": 0.017, "step": 1156000 }, { "epoch": 0.011561, "grad_norm": 0.17515811324119568, "learning_rate": 1e-05, "loss": 0.0169, "step": 1156100 }, { "epoch": 0.011562, "grad_norm": 0.1393454670906067, "learning_rate": 1e-05, "loss": 0.0172, "step": 1156200 }, { "epoch": 0.011563, "grad_norm": 0.14202426373958588, "learning_rate": 1e-05, "loss": 0.0168, "step": 1156300 }, { "epoch": 0.011564, "grad_norm": 0.12125104665756226, "learning_rate": 1e-05, "loss": 0.0165, "step": 1156400 }, { "epoch": 0.011565, "grad_norm": 0.15906521677970886, "learning_rate": 1e-05, "loss": 0.0166, "step": 1156500 }, { "epoch": 0.011566, "grad_norm": 0.12216199934482574, "learning_rate": 1e-05, "loss": 0.0167, "step": 1156600 }, { "epoch": 0.011567, "grad_norm": 0.14311066269874573, "learning_rate": 1e-05, "loss": 0.0168, "step": 1156700 }, { "epoch": 0.011568, "grad_norm": 0.12276378273963928, "learning_rate": 1e-05, "loss": 0.0169, "step": 1156800 }, { "epoch": 0.011569, "grad_norm": 0.12568962574005127, "learning_rate": 1e-05, "loss": 0.0174, "step": 1156900 }, { "epoch": 0.01157, "grad_norm": 0.12555183470249176, "learning_rate": 1e-05, "loss": 0.0169, "step": 1157000 }, { "epoch": 0.011571, "grad_norm": 0.1258268654346466, "learning_rate": 1e-05, "loss": 0.0173, "step": 1157100 }, { "epoch": 0.011572, "grad_norm": 0.11907074600458145, "learning_rate": 1e-05, "loss": 0.017, "step": 1157200 }, { "epoch": 0.011573, "grad_norm": 0.14995437860488892, "learning_rate": 1e-05, "loss": 0.017, "step": 1157300 }, { "epoch": 0.011574, "grad_norm": 0.15002556145191193, "learning_rate": 1e-05, "loss": 0.017, "step": 1157400 }, { "epoch": 0.011575, "grad_norm": 0.10962475091218948, "learning_rate": 1e-05, "loss": 0.0172, "step": 1157500 }, { "epoch": 0.011576, "grad_norm": 0.18583998084068298, "learning_rate": 1e-05, "loss": 0.017, "step": 1157600 }, { "epoch": 0.011577, "grad_norm": 0.15677767992019653, "learning_rate": 1e-05, "loss": 0.0168, "step": 1157700 }, { "epoch": 0.011578, "grad_norm": 0.13849863409996033, "learning_rate": 1e-05, "loss": 0.0169, "step": 1157800 }, { "epoch": 0.011579, "grad_norm": 0.11951656639575958, "learning_rate": 1e-05, "loss": 0.0165, "step": 1157900 }, { "epoch": 0.01158, "grad_norm": 0.10607751458883286, "learning_rate": 1e-05, "loss": 0.0168, "step": 1158000 }, { "epoch": 0.011581, "grad_norm": 0.12996050715446472, "learning_rate": 1e-05, "loss": 0.0167, "step": 1158100 }, { "epoch": 0.011582, "grad_norm": 0.1457892805337906, "learning_rate": 1e-05, "loss": 0.0168, "step": 1158200 }, { "epoch": 0.011583, "grad_norm": 0.17351049184799194, "learning_rate": 1e-05, "loss": 0.0171, "step": 1158300 }, { "epoch": 0.011584, "grad_norm": 0.13833743333816528, "learning_rate": 1e-05, "loss": 0.017, "step": 1158400 }, { "epoch": 0.011585, "grad_norm": 0.17302994430065155, "learning_rate": 1e-05, "loss": 0.0171, "step": 1158500 }, { "epoch": 0.011586, "grad_norm": 0.13569103181362152, "learning_rate": 1e-05, "loss": 0.0168, "step": 1158600 }, { "epoch": 0.011587, "grad_norm": 0.12231604754924774, "learning_rate": 1e-05, "loss": 0.0169, "step": 1158700 }, { "epoch": 0.011588, "grad_norm": 0.14116321504116058, "learning_rate": 1e-05, "loss": 0.0165, "step": 1158800 }, { "epoch": 0.011589, "grad_norm": 0.21032007038593292, "learning_rate": 1e-05, "loss": 0.0167, "step": 1158900 }, { "epoch": 0.01159, "grad_norm": 0.15558449923992157, "learning_rate": 1e-05, "loss": 0.017, "step": 1159000 }, { "epoch": 0.011591, "grad_norm": 0.13306882977485657, "learning_rate": 1e-05, "loss": 0.0173, "step": 1159100 }, { "epoch": 0.011592, "grad_norm": 0.10448577255010605, "learning_rate": 1e-05, "loss": 0.0168, "step": 1159200 }, { "epoch": 0.011593, "grad_norm": 0.14566752314567566, "learning_rate": 1e-05, "loss": 0.0167, "step": 1159300 }, { "epoch": 0.011594, "grad_norm": 0.16464507579803467, "learning_rate": 1e-05, "loss": 0.0164, "step": 1159400 }, { "epoch": 0.011595, "grad_norm": 0.13049891591072083, "learning_rate": 1e-05, "loss": 0.0174, "step": 1159500 }, { "epoch": 0.011596, "grad_norm": 0.1432104855775833, "learning_rate": 1e-05, "loss": 0.0169, "step": 1159600 }, { "epoch": 0.011597, "grad_norm": 0.11411581933498383, "learning_rate": 1e-05, "loss": 0.0168, "step": 1159700 }, { "epoch": 0.011598, "grad_norm": 0.13194076716899872, "learning_rate": 1e-05, "loss": 0.0171, "step": 1159800 }, { "epoch": 0.011599, "grad_norm": 0.19861312210559845, "learning_rate": 1e-05, "loss": 0.0173, "step": 1159900 }, { "epoch": 0.0116, "grad_norm": 0.10967374593019485, "learning_rate": 1e-05, "loss": 0.0168, "step": 1160000 }, { "epoch": 0.0116, "eval_loss": 0.015451012179255486, "eval_runtime": 189.1854, "eval_samples_per_second": 264.291, "eval_steps_per_second": 16.518, "step": 1160000 }, { "epoch": 0.011601, "grad_norm": 0.11620233207941055, "learning_rate": 1e-05, "loss": 0.0173, "step": 1160100 }, { "epoch": 0.011602, "grad_norm": 0.12117086350917816, "learning_rate": 1e-05, "loss": 0.0171, "step": 1160200 }, { "epoch": 0.011603, "grad_norm": 0.11896497756242752, "learning_rate": 1e-05, "loss": 0.0168, "step": 1160300 }, { "epoch": 0.011604, "grad_norm": 0.1356159895658493, "learning_rate": 1e-05, "loss": 0.0168, "step": 1160400 }, { "epoch": 0.011605, "grad_norm": 0.1113191470503807, "learning_rate": 1e-05, "loss": 0.0166, "step": 1160500 }, { "epoch": 0.011606, "grad_norm": 0.1407240331172943, "learning_rate": 1e-05, "loss": 0.0168, "step": 1160600 }, { "epoch": 0.011607, "grad_norm": 0.09026578813791275, "learning_rate": 1e-05, "loss": 0.0172, "step": 1160700 }, { "epoch": 0.011608, "grad_norm": 0.11116781085729599, "learning_rate": 1e-05, "loss": 0.0169, "step": 1160800 }, { "epoch": 0.011609, "grad_norm": 0.11791861802339554, "learning_rate": 1e-05, "loss": 0.0176, "step": 1160900 }, { "epoch": 0.01161, "grad_norm": 0.11174912750720978, "learning_rate": 1e-05, "loss": 0.0169, "step": 1161000 }, { "epoch": 0.011611, "grad_norm": 0.2239297330379486, "learning_rate": 1e-05, "loss": 0.0167, "step": 1161100 }, { "epoch": 0.011612, "grad_norm": 0.10821618139743805, "learning_rate": 1e-05, "loss": 0.0171, "step": 1161200 }, { "epoch": 0.011613, "grad_norm": 0.17932580411434174, "learning_rate": 1e-05, "loss": 0.017, "step": 1161300 }, { "epoch": 0.011614, "grad_norm": 0.11578369140625, "learning_rate": 1e-05, "loss": 0.0168, "step": 1161400 }, { "epoch": 0.011615, "grad_norm": 0.11636421829462051, "learning_rate": 1e-05, "loss": 0.0169, "step": 1161500 }, { "epoch": 0.011616, "grad_norm": 0.11189969629049301, "learning_rate": 1e-05, "loss": 0.0166, "step": 1161600 }, { "epoch": 0.011617, "grad_norm": 0.12036963552236557, "learning_rate": 1e-05, "loss": 0.0169, "step": 1161700 }, { "epoch": 0.011618, "grad_norm": 0.0986868143081665, "learning_rate": 1e-05, "loss": 0.017, "step": 1161800 }, { "epoch": 0.011619, "grad_norm": 0.14342622458934784, "learning_rate": 1e-05, "loss": 0.017, "step": 1161900 }, { "epoch": 0.01162, "grad_norm": 0.13772746920585632, "learning_rate": 1e-05, "loss": 0.0168, "step": 1162000 }, { "epoch": 0.011621, "grad_norm": 0.12759920954704285, "learning_rate": 1e-05, "loss": 0.0172, "step": 1162100 }, { "epoch": 0.011622, "grad_norm": 0.15415525436401367, "learning_rate": 1e-05, "loss": 0.0166, "step": 1162200 }, { "epoch": 0.011623, "grad_norm": 0.12171592563390732, "learning_rate": 1e-05, "loss": 0.0166, "step": 1162300 }, { "epoch": 0.011624, "grad_norm": 0.13969838619232178, "learning_rate": 1e-05, "loss": 0.0166, "step": 1162400 }, { "epoch": 0.011625, "grad_norm": 0.1175224632024765, "learning_rate": 1e-05, "loss": 0.017, "step": 1162500 }, { "epoch": 0.011626, "grad_norm": 0.128434419631958, "learning_rate": 1e-05, "loss": 0.0172, "step": 1162600 }, { "epoch": 0.011627, "grad_norm": 0.09072111546993256, "learning_rate": 1e-05, "loss": 0.017, "step": 1162700 }, { "epoch": 0.011628, "grad_norm": 0.1575378179550171, "learning_rate": 1e-05, "loss": 0.0172, "step": 1162800 }, { "epoch": 0.011629, "grad_norm": 0.13103151321411133, "learning_rate": 1e-05, "loss": 0.017, "step": 1162900 }, { "epoch": 0.01163, "grad_norm": 0.1879846453666687, "learning_rate": 1e-05, "loss": 0.0172, "step": 1163000 }, { "epoch": 0.011631, "grad_norm": 0.1294841766357422, "learning_rate": 1e-05, "loss": 0.017, "step": 1163100 }, { "epoch": 0.011632, "grad_norm": 0.12176620215177536, "learning_rate": 1e-05, "loss": 0.0174, "step": 1163200 }, { "epoch": 0.011633, "grad_norm": 0.11845620721578598, "learning_rate": 1e-05, "loss": 0.0168, "step": 1163300 }, { "epoch": 0.011634, "grad_norm": 0.1327875405550003, "learning_rate": 1e-05, "loss": 0.0169, "step": 1163400 }, { "epoch": 0.011635, "grad_norm": 0.1625329703092575, "learning_rate": 1e-05, "loss": 0.0167, "step": 1163500 }, { "epoch": 0.011636, "grad_norm": 0.12130255252122879, "learning_rate": 1e-05, "loss": 0.0167, "step": 1163600 }, { "epoch": 0.011637, "grad_norm": 0.14681793749332428, "learning_rate": 1e-05, "loss": 0.0166, "step": 1163700 }, { "epoch": 0.011638, "grad_norm": 0.12508171796798706, "learning_rate": 1e-05, "loss": 0.0167, "step": 1163800 }, { "epoch": 0.011639, "grad_norm": 0.16415554285049438, "learning_rate": 1e-05, "loss": 0.0168, "step": 1163900 }, { "epoch": 0.01164, "grad_norm": 0.11534056067466736, "learning_rate": 1e-05, "loss": 0.0169, "step": 1164000 }, { "epoch": 0.011641, "grad_norm": 0.1524718850851059, "learning_rate": 1e-05, "loss": 0.0172, "step": 1164100 }, { "epoch": 0.011642, "grad_norm": 0.1374698430299759, "learning_rate": 1e-05, "loss": 0.0168, "step": 1164200 }, { "epoch": 0.011643, "grad_norm": 0.17691610753536224, "learning_rate": 1e-05, "loss": 0.017, "step": 1164300 }, { "epoch": 0.011644, "grad_norm": 0.16697712242603302, "learning_rate": 1e-05, "loss": 0.0167, "step": 1164400 }, { "epoch": 0.011645, "grad_norm": 0.11751645803451538, "learning_rate": 1e-05, "loss": 0.0168, "step": 1164500 }, { "epoch": 0.011646, "grad_norm": 0.13650617003440857, "learning_rate": 1e-05, "loss": 0.0172, "step": 1164600 }, { "epoch": 0.011647, "grad_norm": 0.11907560378313065, "learning_rate": 1e-05, "loss": 0.017, "step": 1164700 }, { "epoch": 0.011648, "grad_norm": 0.09737752377986908, "learning_rate": 1e-05, "loss": 0.017, "step": 1164800 }, { "epoch": 0.011649, "grad_norm": 0.17338569462299347, "learning_rate": 1e-05, "loss": 0.0166, "step": 1164900 }, { "epoch": 0.01165, "grad_norm": 0.11473561823368073, "learning_rate": 1e-05, "loss": 0.0167, "step": 1165000 }, { "epoch": 0.011651, "grad_norm": 0.1160101443529129, "learning_rate": 1e-05, "loss": 0.0173, "step": 1165100 }, { "epoch": 0.011652, "grad_norm": 0.12615197896957397, "learning_rate": 1e-05, "loss": 0.0171, "step": 1165200 }, { "epoch": 0.011653, "grad_norm": 0.12422860413789749, "learning_rate": 1e-05, "loss": 0.0167, "step": 1165300 }, { "epoch": 0.011654, "grad_norm": 0.14425119757652283, "learning_rate": 1e-05, "loss": 0.0166, "step": 1165400 }, { "epoch": 0.011655, "grad_norm": 0.13326430320739746, "learning_rate": 1e-05, "loss": 0.0166, "step": 1165500 }, { "epoch": 0.011656, "grad_norm": 0.18308839201927185, "learning_rate": 1e-05, "loss": 0.0166, "step": 1165600 }, { "epoch": 0.011657, "grad_norm": 0.15285351872444153, "learning_rate": 1e-05, "loss": 0.017, "step": 1165700 }, { "epoch": 0.011658, "grad_norm": 0.14899888634681702, "learning_rate": 1e-05, "loss": 0.0167, "step": 1165800 }, { "epoch": 0.011659, "grad_norm": 0.12106669694185257, "learning_rate": 1e-05, "loss": 0.0165, "step": 1165900 }, { "epoch": 0.01166, "grad_norm": 0.14012600481510162, "learning_rate": 1e-05, "loss": 0.0164, "step": 1166000 }, { "epoch": 0.011661, "grad_norm": 0.1281329095363617, "learning_rate": 1e-05, "loss": 0.0166, "step": 1166100 }, { "epoch": 0.011662, "grad_norm": 0.1387287974357605, "learning_rate": 1e-05, "loss": 0.0165, "step": 1166200 }, { "epoch": 0.011663, "grad_norm": 0.17578372359275818, "learning_rate": 1e-05, "loss": 0.0169, "step": 1166300 }, { "epoch": 0.011664, "grad_norm": 0.17332668602466583, "learning_rate": 1e-05, "loss": 0.017, "step": 1166400 }, { "epoch": 0.011665, "grad_norm": 0.13466253876686096, "learning_rate": 1e-05, "loss": 0.0171, "step": 1166500 }, { "epoch": 0.011666, "grad_norm": 0.12749536335468292, "learning_rate": 1e-05, "loss": 0.0168, "step": 1166600 }, { "epoch": 0.011667, "grad_norm": 0.09603011608123779, "learning_rate": 1e-05, "loss": 0.0167, "step": 1166700 }, { "epoch": 0.011668, "grad_norm": 0.1371699571609497, "learning_rate": 1e-05, "loss": 0.0174, "step": 1166800 }, { "epoch": 0.011669, "grad_norm": 0.10657892376184464, "learning_rate": 1e-05, "loss": 0.0171, "step": 1166900 }, { "epoch": 0.01167, "grad_norm": 0.15924900770187378, "learning_rate": 1e-05, "loss": 0.0169, "step": 1167000 }, { "epoch": 0.011671, "grad_norm": 0.1649934947490692, "learning_rate": 1e-05, "loss": 0.0169, "step": 1167100 }, { "epoch": 0.011672, "grad_norm": 0.13037970662117004, "learning_rate": 1e-05, "loss": 0.0166, "step": 1167200 }, { "epoch": 0.011673, "grad_norm": 0.12294484674930573, "learning_rate": 1e-05, "loss": 0.0167, "step": 1167300 }, { "epoch": 0.011674, "grad_norm": 0.1375347077846527, "learning_rate": 1e-05, "loss": 0.0168, "step": 1167400 }, { "epoch": 0.011675, "grad_norm": 0.13005831837654114, "learning_rate": 1e-05, "loss": 0.0172, "step": 1167500 }, { "epoch": 0.011676, "grad_norm": 0.1318170726299286, "learning_rate": 1e-05, "loss": 0.017, "step": 1167600 }, { "epoch": 0.011677, "grad_norm": 0.2611021101474762, "learning_rate": 1e-05, "loss": 0.0172, "step": 1167700 }, { "epoch": 0.011678, "grad_norm": 0.16120398044586182, "learning_rate": 1e-05, "loss": 0.0169, "step": 1167800 }, { "epoch": 0.011679, "grad_norm": 0.09130186587572098, "learning_rate": 1e-05, "loss": 0.0164, "step": 1167900 }, { "epoch": 0.01168, "grad_norm": 0.14756077527999878, "learning_rate": 1e-05, "loss": 0.017, "step": 1168000 }, { "epoch": 0.011681, "grad_norm": 0.14350806176662445, "learning_rate": 1e-05, "loss": 0.0162, "step": 1168100 }, { "epoch": 0.011682, "grad_norm": 0.14293579757213593, "learning_rate": 1e-05, "loss": 0.0171, "step": 1168200 }, { "epoch": 0.011683, "grad_norm": 0.16173206269741058, "learning_rate": 1e-05, "loss": 0.0169, "step": 1168300 }, { "epoch": 0.011684, "grad_norm": 0.14137813448905945, "learning_rate": 1e-05, "loss": 0.0171, "step": 1168400 }, { "epoch": 0.011685, "grad_norm": 0.1285555064678192, "learning_rate": 1e-05, "loss": 0.0169, "step": 1168500 }, { "epoch": 0.011686, "grad_norm": 0.2380642145872116, "learning_rate": 1e-05, "loss": 0.0172, "step": 1168600 }, { "epoch": 0.011687, "grad_norm": 0.17000462114810944, "learning_rate": 1e-05, "loss": 0.0176, "step": 1168700 }, { "epoch": 0.011688, "grad_norm": 0.1372765302658081, "learning_rate": 1e-05, "loss": 0.0167, "step": 1168800 }, { "epoch": 0.011689, "grad_norm": 0.14422589540481567, "learning_rate": 1e-05, "loss": 0.0165, "step": 1168900 }, { "epoch": 0.01169, "grad_norm": 0.11456261575222015, "learning_rate": 1e-05, "loss": 0.0171, "step": 1169000 }, { "epoch": 0.011691, "grad_norm": 0.07954209297895432, "learning_rate": 1e-05, "loss": 0.0167, "step": 1169100 }, { "epoch": 0.011692, "grad_norm": 0.16201722621917725, "learning_rate": 1e-05, "loss": 0.0164, "step": 1169200 }, { "epoch": 0.011693, "grad_norm": 0.10117172449827194, "learning_rate": 1e-05, "loss": 0.0171, "step": 1169300 }, { "epoch": 0.011694, "grad_norm": 0.1248248815536499, "learning_rate": 1e-05, "loss": 0.017, "step": 1169400 }, { "epoch": 0.011695, "grad_norm": 0.1474696844816208, "learning_rate": 1e-05, "loss": 0.0163, "step": 1169500 }, { "epoch": 0.011696, "grad_norm": 0.10385023057460785, "learning_rate": 1e-05, "loss": 0.0172, "step": 1169600 }, { "epoch": 0.011697, "grad_norm": 0.11392860859632492, "learning_rate": 1e-05, "loss": 0.0169, "step": 1169700 }, { "epoch": 0.011698, "grad_norm": 0.13850222527980804, "learning_rate": 1e-05, "loss": 0.0166, "step": 1169800 }, { "epoch": 0.011699, "grad_norm": 0.13064444065093994, "learning_rate": 1e-05, "loss": 0.0174, "step": 1169900 }, { "epoch": 0.0117, "grad_norm": 0.1037297248840332, "learning_rate": 1e-05, "loss": 0.017, "step": 1170000 }, { "epoch": 0.011701, "grad_norm": 0.0932212844491005, "learning_rate": 1e-05, "loss": 0.0168, "step": 1170100 }, { "epoch": 0.011702, "grad_norm": 0.14123909175395966, "learning_rate": 1e-05, "loss": 0.0169, "step": 1170200 }, { "epoch": 0.011703, "grad_norm": 0.0995071604847908, "learning_rate": 1e-05, "loss": 0.0168, "step": 1170300 }, { "epoch": 0.011704, "grad_norm": 0.10928791761398315, "learning_rate": 1e-05, "loss": 0.0169, "step": 1170400 }, { "epoch": 0.011705, "grad_norm": 0.1917557567358017, "learning_rate": 1e-05, "loss": 0.0174, "step": 1170500 }, { "epoch": 0.011706, "grad_norm": 0.12892447412014008, "learning_rate": 1e-05, "loss": 0.0171, "step": 1170600 }, { "epoch": 0.011707, "grad_norm": 0.12995941936969757, "learning_rate": 1e-05, "loss": 0.0171, "step": 1170700 }, { "epoch": 0.011708, "grad_norm": 0.14961691200733185, "learning_rate": 1e-05, "loss": 0.0168, "step": 1170800 }, { "epoch": 0.011709, "grad_norm": 0.13692538440227509, "learning_rate": 1e-05, "loss": 0.0171, "step": 1170900 }, { "epoch": 0.01171, "grad_norm": 0.13660718500614166, "learning_rate": 1e-05, "loss": 0.017, "step": 1171000 }, { "epoch": 0.011711, "grad_norm": 0.13168936967849731, "learning_rate": 1e-05, "loss": 0.0167, "step": 1171100 }, { "epoch": 0.011712, "grad_norm": 0.12169624119997025, "learning_rate": 1e-05, "loss": 0.017, "step": 1171200 }, { "epoch": 0.011713, "grad_norm": 0.13080643117427826, "learning_rate": 1e-05, "loss": 0.017, "step": 1171300 }, { "epoch": 0.011714, "grad_norm": 0.1734146773815155, "learning_rate": 1e-05, "loss": 0.017, "step": 1171400 }, { "epoch": 0.011715, "grad_norm": 0.15436314046382904, "learning_rate": 1e-05, "loss": 0.0173, "step": 1171500 }, { "epoch": 0.011716, "grad_norm": 0.1452534943819046, "learning_rate": 1e-05, "loss": 0.017, "step": 1171600 }, { "epoch": 0.011717, "grad_norm": 0.11535606533288956, "learning_rate": 1e-05, "loss": 0.0166, "step": 1171700 }, { "epoch": 0.011718, "grad_norm": 0.10714204609394073, "learning_rate": 1e-05, "loss": 0.0171, "step": 1171800 }, { "epoch": 0.011719, "grad_norm": 0.094215989112854, "learning_rate": 1e-05, "loss": 0.0171, "step": 1171900 }, { "epoch": 0.01172, "grad_norm": 0.11793345212936401, "learning_rate": 1e-05, "loss": 0.0173, "step": 1172000 }, { "epoch": 0.011721, "grad_norm": 0.1251632273197174, "learning_rate": 1e-05, "loss": 0.0166, "step": 1172100 }, { "epoch": 0.011722, "grad_norm": 0.1512594372034073, "learning_rate": 1e-05, "loss": 0.0168, "step": 1172200 }, { "epoch": 0.011723, "grad_norm": 0.09456025063991547, "learning_rate": 1e-05, "loss": 0.0171, "step": 1172300 }, { "epoch": 0.011724, "grad_norm": 0.15639455616474152, "learning_rate": 1e-05, "loss": 0.0166, "step": 1172400 }, { "epoch": 0.011725, "grad_norm": 0.13109475374221802, "learning_rate": 1e-05, "loss": 0.0163, "step": 1172500 }, { "epoch": 0.011726, "grad_norm": 0.0989171639084816, "learning_rate": 1e-05, "loss": 0.0167, "step": 1172600 }, { "epoch": 0.011727, "grad_norm": 0.233717143535614, "learning_rate": 1e-05, "loss": 0.0169, "step": 1172700 }, { "epoch": 0.011728, "grad_norm": 0.200834259390831, "learning_rate": 1e-05, "loss": 0.0168, "step": 1172800 }, { "epoch": 0.011729, "grad_norm": 0.12525606155395508, "learning_rate": 1e-05, "loss": 0.0167, "step": 1172900 }, { "epoch": 0.01173, "grad_norm": 0.10598747432231903, "learning_rate": 1e-05, "loss": 0.0166, "step": 1173000 }, { "epoch": 0.011731, "grad_norm": 0.15068912506103516, "learning_rate": 1e-05, "loss": 0.0169, "step": 1173100 }, { "epoch": 0.011732, "grad_norm": 0.11380257457494736, "learning_rate": 1e-05, "loss": 0.0172, "step": 1173200 }, { "epoch": 0.011733, "grad_norm": 0.1559964418411255, "learning_rate": 1e-05, "loss": 0.0169, "step": 1173300 }, { "epoch": 0.011734, "grad_norm": 0.12145651876926422, "learning_rate": 1e-05, "loss": 0.017, "step": 1173400 }, { "epoch": 0.011735, "grad_norm": 0.133161723613739, "learning_rate": 1e-05, "loss": 0.0168, "step": 1173500 }, { "epoch": 0.011736, "grad_norm": 0.14257481694221497, "learning_rate": 1e-05, "loss": 0.0168, "step": 1173600 }, { "epoch": 0.011737, "grad_norm": 0.1305934637784958, "learning_rate": 1e-05, "loss": 0.0167, "step": 1173700 }, { "epoch": 0.011738, "grad_norm": 0.1316295713186264, "learning_rate": 1e-05, "loss": 0.0166, "step": 1173800 }, { "epoch": 0.011739, "grad_norm": 0.1102285236120224, "learning_rate": 1e-05, "loss": 0.0169, "step": 1173900 }, { "epoch": 0.01174, "grad_norm": 0.1439344882965088, "learning_rate": 1e-05, "loss": 0.0161, "step": 1174000 }, { "epoch": 0.011741, "grad_norm": 0.1923600137233734, "learning_rate": 1e-05, "loss": 0.017, "step": 1174100 }, { "epoch": 0.011742, "grad_norm": 0.16870944201946259, "learning_rate": 1e-05, "loss": 0.0169, "step": 1174200 }, { "epoch": 0.011743, "grad_norm": 0.14269182085990906, "learning_rate": 1e-05, "loss": 0.0166, "step": 1174300 }, { "epoch": 0.011744, "grad_norm": 0.1412336528301239, "learning_rate": 1e-05, "loss": 0.017, "step": 1174400 }, { "epoch": 0.011745, "grad_norm": 0.09888223558664322, "learning_rate": 1e-05, "loss": 0.0168, "step": 1174500 }, { "epoch": 0.011746, "grad_norm": 0.11363311111927032, "learning_rate": 1e-05, "loss": 0.017, "step": 1174600 }, { "epoch": 0.011747, "grad_norm": 0.1361258327960968, "learning_rate": 1e-05, "loss": 0.0172, "step": 1174700 }, { "epoch": 0.011748, "grad_norm": 0.18061071634292603, "learning_rate": 1e-05, "loss": 0.017, "step": 1174800 }, { "epoch": 0.011749, "grad_norm": 0.1218658909201622, "learning_rate": 1e-05, "loss": 0.0167, "step": 1174900 }, { "epoch": 0.01175, "grad_norm": 0.13280071318149567, "learning_rate": 1e-05, "loss": 0.0168, "step": 1175000 }, { "epoch": 0.011751, "grad_norm": 0.13348327577114105, "learning_rate": 1e-05, "loss": 0.0173, "step": 1175100 }, { "epoch": 0.011752, "grad_norm": 0.14920948445796967, "learning_rate": 1e-05, "loss": 0.0167, "step": 1175200 }, { "epoch": 0.011753, "grad_norm": 0.14900000393390656, "learning_rate": 1e-05, "loss": 0.0169, "step": 1175300 }, { "epoch": 0.011754, "grad_norm": 0.12930768728256226, "learning_rate": 1e-05, "loss": 0.0172, "step": 1175400 }, { "epoch": 0.011755, "grad_norm": 0.10966739803552628, "learning_rate": 1e-05, "loss": 0.0167, "step": 1175500 }, { "epoch": 0.011756, "grad_norm": 0.0988847017288208, "learning_rate": 1e-05, "loss": 0.017, "step": 1175600 }, { "epoch": 0.011757, "grad_norm": 0.1947464942932129, "learning_rate": 1e-05, "loss": 0.0171, "step": 1175700 }, { "epoch": 0.011758, "grad_norm": 0.0968438908457756, "learning_rate": 1e-05, "loss": 0.0171, "step": 1175800 }, { "epoch": 0.011759, "grad_norm": 0.142981618642807, "learning_rate": 1e-05, "loss": 0.0164, "step": 1175900 }, { "epoch": 0.01176, "grad_norm": 0.23176062107086182, "learning_rate": 1e-05, "loss": 0.017, "step": 1176000 }, { "epoch": 0.011761, "grad_norm": 0.10829086601734161, "learning_rate": 1e-05, "loss": 0.0167, "step": 1176100 }, { "epoch": 0.011762, "grad_norm": 0.1159345954656601, "learning_rate": 1e-05, "loss": 0.0169, "step": 1176200 }, { "epoch": 0.011763, "grad_norm": 0.13436754047870636, "learning_rate": 1e-05, "loss": 0.0165, "step": 1176300 }, { "epoch": 0.011764, "grad_norm": 0.1080852672457695, "learning_rate": 1e-05, "loss": 0.0169, "step": 1176400 }, { "epoch": 0.011765, "grad_norm": 0.15463513135910034, "learning_rate": 1e-05, "loss": 0.0167, "step": 1176500 }, { "epoch": 0.011766, "grad_norm": 0.13026265799999237, "learning_rate": 1e-05, "loss": 0.0165, "step": 1176600 }, { "epoch": 0.011767, "grad_norm": 0.1629854142665863, "learning_rate": 1e-05, "loss": 0.0168, "step": 1176700 }, { "epoch": 0.011768, "grad_norm": 0.13914060592651367, "learning_rate": 1e-05, "loss": 0.0167, "step": 1176800 }, { "epoch": 0.011769, "grad_norm": 0.16170355677604675, "learning_rate": 1e-05, "loss": 0.0168, "step": 1176900 }, { "epoch": 0.01177, "grad_norm": 0.08723961561918259, "learning_rate": 1e-05, "loss": 0.0165, "step": 1177000 }, { "epoch": 0.011771, "grad_norm": 0.12727871537208557, "learning_rate": 1e-05, "loss": 0.0169, "step": 1177100 }, { "epoch": 0.011772, "grad_norm": 0.12120237946510315, "learning_rate": 1e-05, "loss": 0.0169, "step": 1177200 }, { "epoch": 0.011773, "grad_norm": 0.160289004445076, "learning_rate": 1e-05, "loss": 0.0168, "step": 1177300 }, { "epoch": 0.011774, "grad_norm": 0.13158375024795532, "learning_rate": 1e-05, "loss": 0.0174, "step": 1177400 }, { "epoch": 0.011775, "grad_norm": 0.11174638569355011, "learning_rate": 1e-05, "loss": 0.0169, "step": 1177500 }, { "epoch": 0.011776, "grad_norm": 0.1436772644519806, "learning_rate": 1e-05, "loss": 0.0168, "step": 1177600 }, { "epoch": 0.011777, "grad_norm": 0.13151495158672333, "learning_rate": 1e-05, "loss": 0.017, "step": 1177700 }, { "epoch": 0.011778, "grad_norm": 0.11683475226163864, "learning_rate": 1e-05, "loss": 0.0165, "step": 1177800 }, { "epoch": 0.011779, "grad_norm": 0.13662178814411163, "learning_rate": 1e-05, "loss": 0.017, "step": 1177900 }, { "epoch": 0.01178, "grad_norm": 0.11396724730730057, "learning_rate": 1e-05, "loss": 0.0163, "step": 1178000 }, { "epoch": 0.011781, "grad_norm": 0.14623697102069855, "learning_rate": 1e-05, "loss": 0.0167, "step": 1178100 }, { "epoch": 0.011782, "grad_norm": 0.22220134735107422, "learning_rate": 1e-05, "loss": 0.0165, "step": 1178200 }, { "epoch": 0.011783, "grad_norm": 0.12344928085803986, "learning_rate": 1e-05, "loss": 0.0171, "step": 1178300 }, { "epoch": 0.011784, "grad_norm": 0.10050401091575623, "learning_rate": 1e-05, "loss": 0.017, "step": 1178400 }, { "epoch": 0.011785, "grad_norm": 0.13405555486679077, "learning_rate": 1e-05, "loss": 0.0171, "step": 1178500 }, { "epoch": 0.011786, "grad_norm": 0.15018776059150696, "learning_rate": 1e-05, "loss": 0.0169, "step": 1178600 }, { "epoch": 0.011787, "grad_norm": 0.12377027422189713, "learning_rate": 1e-05, "loss": 0.0167, "step": 1178700 }, { "epoch": 0.011788, "grad_norm": 0.18471582233905792, "learning_rate": 1e-05, "loss": 0.017, "step": 1178800 }, { "epoch": 0.011789, "grad_norm": 0.1353936493396759, "learning_rate": 1e-05, "loss": 0.0165, "step": 1178900 }, { "epoch": 0.01179, "grad_norm": 0.19306926429271698, "learning_rate": 1e-05, "loss": 0.0169, "step": 1179000 }, { "epoch": 0.011791, "grad_norm": 0.11720424890518188, "learning_rate": 1e-05, "loss": 0.0172, "step": 1179100 }, { "epoch": 0.011792, "grad_norm": 0.13854867219924927, "learning_rate": 1e-05, "loss": 0.0166, "step": 1179200 }, { "epoch": 0.011793, "grad_norm": 0.14492826163768768, "learning_rate": 1e-05, "loss": 0.0166, "step": 1179300 }, { "epoch": 0.011794, "grad_norm": 0.16925455629825592, "learning_rate": 1e-05, "loss": 0.017, "step": 1179400 }, { "epoch": 0.011795, "grad_norm": 0.10176948457956314, "learning_rate": 1e-05, "loss": 0.0166, "step": 1179500 }, { "epoch": 0.011796, "grad_norm": 0.14905428886413574, "learning_rate": 1e-05, "loss": 0.0166, "step": 1179600 }, { "epoch": 0.011797, "grad_norm": 0.17485518753528595, "learning_rate": 1e-05, "loss": 0.0169, "step": 1179700 }, { "epoch": 0.011798, "grad_norm": 0.10339540988206863, "learning_rate": 1e-05, "loss": 0.0167, "step": 1179800 }, { "epoch": 0.011799, "grad_norm": 0.13794445991516113, "learning_rate": 1e-05, "loss": 0.0166, "step": 1179900 }, { "epoch": 0.0118, "grad_norm": 0.12477602809667587, "learning_rate": 1e-05, "loss": 0.0167, "step": 1180000 }, { "epoch": 0.0118, "eval_loss": 0.014924289658665657, "eval_runtime": 189.2094, "eval_samples_per_second": 264.257, "eval_steps_per_second": 16.516, "step": 1180000 }, { "epoch": 0.011801, "grad_norm": 0.16252675652503967, "learning_rate": 1e-05, "loss": 0.017, "step": 1180100 }, { "epoch": 0.011802, "grad_norm": 0.1473822444677353, "learning_rate": 1e-05, "loss": 0.0164, "step": 1180200 }, { "epoch": 0.011803, "grad_norm": 0.14463016390800476, "learning_rate": 1e-05, "loss": 0.0164, "step": 1180300 }, { "epoch": 0.011804, "grad_norm": 0.08894936740398407, "learning_rate": 1e-05, "loss": 0.0169, "step": 1180400 }, { "epoch": 0.011805, "grad_norm": 0.17445281147956848, "learning_rate": 1e-05, "loss": 0.0164, "step": 1180500 }, { "epoch": 0.011806, "grad_norm": 0.14551866054534912, "learning_rate": 1e-05, "loss": 0.0169, "step": 1180600 }, { "epoch": 0.011807, "grad_norm": 0.1933366060256958, "learning_rate": 1e-05, "loss": 0.0165, "step": 1180700 }, { "epoch": 0.011808, "grad_norm": 0.12596265971660614, "learning_rate": 1e-05, "loss": 0.0171, "step": 1180800 }, { "epoch": 0.011809, "grad_norm": 0.09381230920553207, "learning_rate": 1e-05, "loss": 0.0169, "step": 1180900 }, { "epoch": 0.01181, "grad_norm": 0.15645550191402435, "learning_rate": 1e-05, "loss": 0.0167, "step": 1181000 }, { "epoch": 0.011811, "grad_norm": 0.0964626595377922, "learning_rate": 1e-05, "loss": 0.0166, "step": 1181100 }, { "epoch": 0.011812, "grad_norm": 0.14473383128643036, "learning_rate": 1e-05, "loss": 0.0171, "step": 1181200 }, { "epoch": 0.011813, "grad_norm": 0.19233672320842743, "learning_rate": 1e-05, "loss": 0.0167, "step": 1181300 }, { "epoch": 0.011814, "grad_norm": 0.09228183329105377, "learning_rate": 1e-05, "loss": 0.0164, "step": 1181400 }, { "epoch": 0.011815, "grad_norm": 0.15691964328289032, "learning_rate": 1e-05, "loss": 0.0165, "step": 1181500 }, { "epoch": 0.011816, "grad_norm": 0.13913275301456451, "learning_rate": 1e-05, "loss": 0.0169, "step": 1181600 }, { "epoch": 0.011817, "grad_norm": 0.11754409968852997, "learning_rate": 1e-05, "loss": 0.0167, "step": 1181700 }, { "epoch": 0.011818, "grad_norm": 0.1267780065536499, "learning_rate": 1e-05, "loss": 0.017, "step": 1181800 }, { "epoch": 0.011819, "grad_norm": 0.10035046190023422, "learning_rate": 1e-05, "loss": 0.017, "step": 1181900 }, { "epoch": 0.01182, "grad_norm": 0.12818269431591034, "learning_rate": 1e-05, "loss": 0.0168, "step": 1182000 }, { "epoch": 0.011821, "grad_norm": 0.11328757554292679, "learning_rate": 1e-05, "loss": 0.0167, "step": 1182100 }, { "epoch": 0.011822, "grad_norm": 0.16057947278022766, "learning_rate": 1e-05, "loss": 0.0168, "step": 1182200 }, { "epoch": 0.011823, "grad_norm": 0.11021552234888077, "learning_rate": 1e-05, "loss": 0.0167, "step": 1182300 }, { "epoch": 0.011824, "grad_norm": 0.11231599003076553, "learning_rate": 1e-05, "loss": 0.0168, "step": 1182400 }, { "epoch": 0.011825, "grad_norm": 0.15460768342018127, "learning_rate": 1e-05, "loss": 0.0166, "step": 1182500 }, { "epoch": 0.011826, "grad_norm": 0.11784128844738007, "learning_rate": 1e-05, "loss": 0.0167, "step": 1182600 }, { "epoch": 0.011827, "grad_norm": 0.16400142014026642, "learning_rate": 1e-05, "loss": 0.0169, "step": 1182700 }, { "epoch": 0.011828, "grad_norm": 0.11881192773580551, "learning_rate": 1e-05, "loss": 0.0167, "step": 1182800 }, { "epoch": 0.011829, "grad_norm": 0.11541695147752762, "learning_rate": 1e-05, "loss": 0.0167, "step": 1182900 }, { "epoch": 0.01183, "grad_norm": 0.1554231196641922, "learning_rate": 1e-05, "loss": 0.017, "step": 1183000 }, { "epoch": 0.011831, "grad_norm": 0.11347845941781998, "learning_rate": 1e-05, "loss": 0.0168, "step": 1183100 }, { "epoch": 0.011832, "grad_norm": 0.12699726223945618, "learning_rate": 1e-05, "loss": 0.0169, "step": 1183200 }, { "epoch": 0.011833, "grad_norm": 0.15752704441547394, "learning_rate": 1e-05, "loss": 0.0169, "step": 1183300 }, { "epoch": 0.011834, "grad_norm": 0.11574111878871918, "learning_rate": 1e-05, "loss": 0.0171, "step": 1183400 }, { "epoch": 0.011835, "grad_norm": 0.13187438249588013, "learning_rate": 1e-05, "loss": 0.0171, "step": 1183500 }, { "epoch": 0.011836, "grad_norm": 0.1329299807548523, "learning_rate": 1e-05, "loss": 0.0171, "step": 1183600 }, { "epoch": 0.011837, "grad_norm": 0.10596989095211029, "learning_rate": 1e-05, "loss": 0.017, "step": 1183700 }, { "epoch": 0.011838, "grad_norm": 0.1519593447446823, "learning_rate": 1e-05, "loss": 0.0167, "step": 1183800 }, { "epoch": 0.011839, "grad_norm": 0.18003487586975098, "learning_rate": 1e-05, "loss": 0.0167, "step": 1183900 }, { "epoch": 0.01184, "grad_norm": 0.13919369876384735, "learning_rate": 1e-05, "loss": 0.0167, "step": 1184000 }, { "epoch": 0.011841, "grad_norm": 0.11750081181526184, "learning_rate": 1e-05, "loss": 0.0171, "step": 1184100 }, { "epoch": 0.011842, "grad_norm": 0.16244499385356903, "learning_rate": 1e-05, "loss": 0.0171, "step": 1184200 }, { "epoch": 0.011843, "grad_norm": 0.15132594108581543, "learning_rate": 1e-05, "loss": 0.0167, "step": 1184300 }, { "epoch": 0.011844, "grad_norm": 0.09778079390525818, "learning_rate": 1e-05, "loss": 0.0169, "step": 1184400 }, { "epoch": 0.011845, "grad_norm": 0.17630790174007416, "learning_rate": 1e-05, "loss": 0.0169, "step": 1184500 }, { "epoch": 0.011846, "grad_norm": 0.12087170779705048, "learning_rate": 1e-05, "loss": 0.0171, "step": 1184600 }, { "epoch": 0.011847, "grad_norm": 0.09329478442668915, "learning_rate": 1e-05, "loss": 0.0169, "step": 1184700 }, { "epoch": 0.011848, "grad_norm": 0.11893750727176666, "learning_rate": 1e-05, "loss": 0.0164, "step": 1184800 }, { "epoch": 0.011849, "grad_norm": 0.14526760578155518, "learning_rate": 1e-05, "loss": 0.0165, "step": 1184900 }, { "epoch": 0.01185, "grad_norm": 0.10855072736740112, "learning_rate": 1e-05, "loss": 0.0168, "step": 1185000 }, { "epoch": 0.011851, "grad_norm": 0.12005794793367386, "learning_rate": 1e-05, "loss": 0.0167, "step": 1185100 }, { "epoch": 0.011852, "grad_norm": 0.16484887897968292, "learning_rate": 1e-05, "loss": 0.0172, "step": 1185200 }, { "epoch": 0.011853, "grad_norm": 0.09904816001653671, "learning_rate": 1e-05, "loss": 0.0166, "step": 1185300 }, { "epoch": 0.011854, "grad_norm": 0.11151108145713806, "learning_rate": 1e-05, "loss": 0.0167, "step": 1185400 }, { "epoch": 0.011855, "grad_norm": 0.19185025990009308, "learning_rate": 1e-05, "loss": 0.0166, "step": 1185500 }, { "epoch": 0.011856, "grad_norm": 0.13907162845134735, "learning_rate": 1e-05, "loss": 0.0167, "step": 1185600 }, { "epoch": 0.011857, "grad_norm": 0.10045190155506134, "learning_rate": 1e-05, "loss": 0.0169, "step": 1185700 }, { "epoch": 0.011858, "grad_norm": 0.11111525446176529, "learning_rate": 1e-05, "loss": 0.0167, "step": 1185800 }, { "epoch": 0.011859, "grad_norm": 0.11590731143951416, "learning_rate": 1e-05, "loss": 0.0169, "step": 1185900 }, { "epoch": 0.01186, "grad_norm": 0.12612390518188477, "learning_rate": 1e-05, "loss": 0.0167, "step": 1186000 }, { "epoch": 0.011861, "grad_norm": 0.12860219180583954, "learning_rate": 1e-05, "loss": 0.0171, "step": 1186100 }, { "epoch": 0.011862, "grad_norm": 0.18189120292663574, "learning_rate": 1e-05, "loss": 0.0164, "step": 1186200 }, { "epoch": 0.011863, "grad_norm": 0.156722292304039, "learning_rate": 1e-05, "loss": 0.0164, "step": 1186300 }, { "epoch": 0.011864, "grad_norm": 0.17492233216762543, "learning_rate": 1e-05, "loss": 0.0162, "step": 1186400 }, { "epoch": 0.011865, "grad_norm": 0.142281174659729, "learning_rate": 1e-05, "loss": 0.0167, "step": 1186500 }, { "epoch": 0.011866, "grad_norm": 0.15363357961177826, "learning_rate": 1e-05, "loss": 0.0169, "step": 1186600 }, { "epoch": 0.011867, "grad_norm": 0.11524546146392822, "learning_rate": 1e-05, "loss": 0.017, "step": 1186700 }, { "epoch": 0.011868, "grad_norm": 0.17341843247413635, "learning_rate": 1e-05, "loss": 0.0167, "step": 1186800 }, { "epoch": 0.011869, "grad_norm": 0.1050552949309349, "learning_rate": 1e-05, "loss": 0.0166, "step": 1186900 }, { "epoch": 0.01187, "grad_norm": 0.1665286123752594, "learning_rate": 1e-05, "loss": 0.017, "step": 1187000 }, { "epoch": 0.011871, "grad_norm": 0.13522644340991974, "learning_rate": 1e-05, "loss": 0.0167, "step": 1187100 }, { "epoch": 0.011872, "grad_norm": 0.1614864319562912, "learning_rate": 1e-05, "loss": 0.0167, "step": 1187200 }, { "epoch": 0.011873, "grad_norm": 0.12978392839431763, "learning_rate": 1e-05, "loss": 0.0168, "step": 1187300 }, { "epoch": 0.011874, "grad_norm": 0.1448657065629959, "learning_rate": 1e-05, "loss": 0.0168, "step": 1187400 }, { "epoch": 0.011875, "grad_norm": 0.15962332487106323, "learning_rate": 1e-05, "loss": 0.0171, "step": 1187500 }, { "epoch": 0.011876, "grad_norm": 0.14723791182041168, "learning_rate": 1e-05, "loss": 0.017, "step": 1187600 }, { "epoch": 0.011877, "grad_norm": 0.11865302920341492, "learning_rate": 1e-05, "loss": 0.0168, "step": 1187700 }, { "epoch": 0.011878, "grad_norm": 0.10812386870384216, "learning_rate": 1e-05, "loss": 0.0169, "step": 1187800 }, { "epoch": 0.011879, "grad_norm": 0.146334707736969, "learning_rate": 1e-05, "loss": 0.0169, "step": 1187900 }, { "epoch": 0.01188, "grad_norm": 0.09870865941047668, "learning_rate": 1e-05, "loss": 0.0165, "step": 1188000 }, { "epoch": 0.011881, "grad_norm": 0.10766094923019409, "learning_rate": 1e-05, "loss": 0.017, "step": 1188100 }, { "epoch": 0.011882, "grad_norm": 0.12890399992465973, "learning_rate": 1e-05, "loss": 0.0168, "step": 1188200 }, { "epoch": 0.011883, "grad_norm": 0.13720881938934326, "learning_rate": 1e-05, "loss": 0.0173, "step": 1188300 }, { "epoch": 0.011884, "grad_norm": 0.12274720519781113, "learning_rate": 1e-05, "loss": 0.0169, "step": 1188400 }, { "epoch": 0.011885, "grad_norm": 0.1314547061920166, "learning_rate": 1e-05, "loss": 0.0166, "step": 1188500 }, { "epoch": 0.011886, "grad_norm": 0.1159341037273407, "learning_rate": 1e-05, "loss": 0.0169, "step": 1188600 }, { "epoch": 0.011887, "grad_norm": 0.10722777247428894, "learning_rate": 1e-05, "loss": 0.0169, "step": 1188700 }, { "epoch": 0.011888, "grad_norm": 0.19496169686317444, "learning_rate": 1e-05, "loss": 0.0165, "step": 1188800 }, { "epoch": 0.011889, "grad_norm": 0.16430805623531342, "learning_rate": 1e-05, "loss": 0.0168, "step": 1188900 }, { "epoch": 0.01189, "grad_norm": 0.10871028900146484, "learning_rate": 1e-05, "loss": 0.0167, "step": 1189000 }, { "epoch": 0.011891, "grad_norm": 0.15021973848342896, "learning_rate": 1e-05, "loss": 0.0167, "step": 1189100 }, { "epoch": 0.011892, "grad_norm": 0.13173003494739532, "learning_rate": 1e-05, "loss": 0.0166, "step": 1189200 }, { "epoch": 0.011893, "grad_norm": 0.21125704050064087, "learning_rate": 1e-05, "loss": 0.0172, "step": 1189300 }, { "epoch": 0.011894, "grad_norm": 0.1273326575756073, "learning_rate": 1e-05, "loss": 0.0165, "step": 1189400 }, { "epoch": 0.011895, "grad_norm": 0.10536225885152817, "learning_rate": 1e-05, "loss": 0.0167, "step": 1189500 }, { "epoch": 0.011896, "grad_norm": 0.15058556199073792, "learning_rate": 1e-05, "loss": 0.0166, "step": 1189600 }, { "epoch": 0.011897, "grad_norm": 0.12002454698085785, "learning_rate": 1e-05, "loss": 0.0172, "step": 1189700 }, { "epoch": 0.011898, "grad_norm": 0.14819052815437317, "learning_rate": 1e-05, "loss": 0.0171, "step": 1189800 }, { "epoch": 0.011899, "grad_norm": 0.16826128959655762, "learning_rate": 1e-05, "loss": 0.017, "step": 1189900 }, { "epoch": 0.0119, "grad_norm": 0.11643534898757935, "learning_rate": 1e-05, "loss": 0.0165, "step": 1190000 }, { "epoch": 0.011901, "grad_norm": 0.16712148487567902, "learning_rate": 1e-05, "loss": 0.0167, "step": 1190100 }, { "epoch": 0.011902, "grad_norm": 0.14133431017398834, "learning_rate": 1e-05, "loss": 0.0167, "step": 1190200 }, { "epoch": 0.011903, "grad_norm": 0.10299797356128693, "learning_rate": 1e-05, "loss": 0.0167, "step": 1190300 }, { "epoch": 0.011904, "grad_norm": 0.12986911833286285, "learning_rate": 1e-05, "loss": 0.0169, "step": 1190400 }, { "epoch": 0.011905, "grad_norm": 0.12501128017902374, "learning_rate": 1e-05, "loss": 0.0172, "step": 1190500 }, { "epoch": 0.011906, "grad_norm": 0.24354179203510284, "learning_rate": 1e-05, "loss": 0.0167, "step": 1190600 }, { "epoch": 0.011907, "grad_norm": 0.1268087476491928, "learning_rate": 1e-05, "loss": 0.0165, "step": 1190700 }, { "epoch": 0.011908, "grad_norm": 0.11267440766096115, "learning_rate": 1e-05, "loss": 0.0166, "step": 1190800 }, { "epoch": 0.011909, "grad_norm": 0.14188794791698456, "learning_rate": 1e-05, "loss": 0.0164, "step": 1190900 }, { "epoch": 0.01191, "grad_norm": 0.11610504984855652, "learning_rate": 1e-05, "loss": 0.017, "step": 1191000 }, { "epoch": 0.011911, "grad_norm": 0.1762504130601883, "learning_rate": 1e-05, "loss": 0.0166, "step": 1191100 }, { "epoch": 0.011912, "grad_norm": 0.11020927876234055, "learning_rate": 1e-05, "loss": 0.017, "step": 1191200 }, { "epoch": 0.011913, "grad_norm": 0.21736754477024078, "learning_rate": 1e-05, "loss": 0.0168, "step": 1191300 }, { "epoch": 0.011914, "grad_norm": 0.12422352284193039, "learning_rate": 1e-05, "loss": 0.0168, "step": 1191400 }, { "epoch": 0.011915, "grad_norm": 0.14049804210662842, "learning_rate": 1e-05, "loss": 0.0167, "step": 1191500 }, { "epoch": 0.011916, "grad_norm": 0.22207772731781006, "learning_rate": 1e-05, "loss": 0.0165, "step": 1191600 }, { "epoch": 0.011917, "grad_norm": 0.10259600728750229, "learning_rate": 1e-05, "loss": 0.0171, "step": 1191700 }, { "epoch": 0.011918, "grad_norm": 0.09920425713062286, "learning_rate": 1e-05, "loss": 0.0171, "step": 1191800 }, { "epoch": 0.011919, "grad_norm": 0.14237448573112488, "learning_rate": 1e-05, "loss": 0.017, "step": 1191900 }, { "epoch": 0.01192, "grad_norm": 0.1178436428308487, "learning_rate": 1e-05, "loss": 0.0166, "step": 1192000 }, { "epoch": 0.011921, "grad_norm": 0.14689959585666656, "learning_rate": 1e-05, "loss": 0.0173, "step": 1192100 }, { "epoch": 0.011922, "grad_norm": 0.14301112294197083, "learning_rate": 1e-05, "loss": 0.0173, "step": 1192200 }, { "epoch": 0.011923, "grad_norm": 0.11848088353872299, "learning_rate": 1e-05, "loss": 0.0165, "step": 1192300 }, { "epoch": 0.011924, "grad_norm": 0.17333930730819702, "learning_rate": 1e-05, "loss": 0.0168, "step": 1192400 }, { "epoch": 0.011925, "grad_norm": 0.16843682527542114, "learning_rate": 1e-05, "loss": 0.0167, "step": 1192500 }, { "epoch": 0.011926, "grad_norm": 0.1491720825433731, "learning_rate": 1e-05, "loss": 0.0166, "step": 1192600 }, { "epoch": 0.011927, "grad_norm": 0.11050979793071747, "learning_rate": 1e-05, "loss": 0.0162, "step": 1192700 }, { "epoch": 0.011928, "grad_norm": 0.17255248129367828, "learning_rate": 1e-05, "loss": 0.0173, "step": 1192800 }, { "epoch": 0.011929, "grad_norm": 0.1302196979522705, "learning_rate": 1e-05, "loss": 0.0168, "step": 1192900 }, { "epoch": 0.01193, "grad_norm": 0.15389510989189148, "learning_rate": 1e-05, "loss": 0.0167, "step": 1193000 }, { "epoch": 0.011931, "grad_norm": 0.1166398674249649, "learning_rate": 1e-05, "loss": 0.017, "step": 1193100 }, { "epoch": 0.011932, "grad_norm": 0.16499000787734985, "learning_rate": 1e-05, "loss": 0.017, "step": 1193200 }, { "epoch": 0.011933, "grad_norm": 0.09456483274698257, "learning_rate": 1e-05, "loss": 0.0164, "step": 1193300 }, { "epoch": 0.011934, "grad_norm": 0.14116239547729492, "learning_rate": 1e-05, "loss": 0.0168, "step": 1193400 }, { "epoch": 0.011935, "grad_norm": 0.12920206785202026, "learning_rate": 1e-05, "loss": 0.0167, "step": 1193500 }, { "epoch": 0.011936, "grad_norm": 0.1325928419828415, "learning_rate": 1e-05, "loss": 0.0168, "step": 1193600 }, { "epoch": 0.011937, "grad_norm": 0.12448328733444214, "learning_rate": 1e-05, "loss": 0.0165, "step": 1193700 }, { "epoch": 0.011938, "grad_norm": 0.11163333058357239, "learning_rate": 1e-05, "loss": 0.0171, "step": 1193800 }, { "epoch": 0.011939, "grad_norm": 0.14191125333309174, "learning_rate": 1e-05, "loss": 0.0172, "step": 1193900 }, { "epoch": 0.01194, "grad_norm": 0.14852677285671234, "learning_rate": 1e-05, "loss": 0.0166, "step": 1194000 }, { "epoch": 0.011941, "grad_norm": 0.16963475942611694, "learning_rate": 1e-05, "loss": 0.0171, "step": 1194100 }, { "epoch": 0.011942, "grad_norm": 0.15804769098758698, "learning_rate": 1e-05, "loss": 0.0165, "step": 1194200 }, { "epoch": 0.011943, "grad_norm": 0.11641165614128113, "learning_rate": 1e-05, "loss": 0.0166, "step": 1194300 }, { "epoch": 0.011944, "grad_norm": 0.15543609857559204, "learning_rate": 1e-05, "loss": 0.0166, "step": 1194400 }, { "epoch": 0.011945, "grad_norm": 0.13667556643486023, "learning_rate": 1e-05, "loss": 0.0168, "step": 1194500 }, { "epoch": 0.011946, "grad_norm": 0.14675575494766235, "learning_rate": 1e-05, "loss": 0.017, "step": 1194600 }, { "epoch": 0.011947, "grad_norm": 0.12317287176847458, "learning_rate": 1e-05, "loss": 0.0166, "step": 1194700 }, { "epoch": 0.011948, "grad_norm": 0.1158578097820282, "learning_rate": 1e-05, "loss": 0.0166, "step": 1194800 }, { "epoch": 0.011949, "grad_norm": 0.12321803718805313, "learning_rate": 1e-05, "loss": 0.0167, "step": 1194900 }, { "epoch": 0.01195, "grad_norm": 0.14847537875175476, "learning_rate": 1e-05, "loss": 0.0169, "step": 1195000 }, { "epoch": 0.011951, "grad_norm": 0.12535572052001953, "learning_rate": 1e-05, "loss": 0.0167, "step": 1195100 }, { "epoch": 0.011952, "grad_norm": 0.12695170938968658, "learning_rate": 1e-05, "loss": 0.0166, "step": 1195200 }, { "epoch": 0.011953, "grad_norm": 0.10673629492521286, "learning_rate": 1e-05, "loss": 0.0166, "step": 1195300 }, { "epoch": 0.011954, "grad_norm": 0.17228779196739197, "learning_rate": 1e-05, "loss": 0.0163, "step": 1195400 }, { "epoch": 0.011955, "grad_norm": 0.1640133559703827, "learning_rate": 1e-05, "loss": 0.0169, "step": 1195500 }, { "epoch": 0.011956, "grad_norm": 0.1050846129655838, "learning_rate": 1e-05, "loss": 0.0166, "step": 1195600 }, { "epoch": 0.011957, "grad_norm": 0.13445957005023956, "learning_rate": 1e-05, "loss": 0.0164, "step": 1195700 }, { "epoch": 0.011958, "grad_norm": 0.11114504188299179, "learning_rate": 1e-05, "loss": 0.0167, "step": 1195800 }, { "epoch": 0.011959, "grad_norm": 0.17909646034240723, "learning_rate": 1e-05, "loss": 0.0169, "step": 1195900 }, { "epoch": 0.01196, "grad_norm": 0.159846693277359, "learning_rate": 1e-05, "loss": 0.017, "step": 1196000 }, { "epoch": 0.011961, "grad_norm": 0.16887369751930237, "learning_rate": 1e-05, "loss": 0.0171, "step": 1196100 }, { "epoch": 0.011962, "grad_norm": 0.10436786711215973, "learning_rate": 1e-05, "loss": 0.0165, "step": 1196200 }, { "epoch": 0.011963, "grad_norm": 0.11556665599346161, "learning_rate": 1e-05, "loss": 0.0165, "step": 1196300 }, { "epoch": 0.011964, "grad_norm": 0.17017242312431335, "learning_rate": 1e-05, "loss": 0.0163, "step": 1196400 }, { "epoch": 0.011965, "grad_norm": 0.1232890710234642, "learning_rate": 1e-05, "loss": 0.0166, "step": 1196500 }, { "epoch": 0.011966, "grad_norm": 0.10316316783428192, "learning_rate": 1e-05, "loss": 0.0168, "step": 1196600 }, { "epoch": 0.011967, "grad_norm": 0.12208861112594604, "learning_rate": 1e-05, "loss": 0.0165, "step": 1196700 }, { "epoch": 0.011968, "grad_norm": 0.11215940117835999, "learning_rate": 1e-05, "loss": 0.0166, "step": 1196800 }, { "epoch": 0.011969, "grad_norm": 0.13625679910182953, "learning_rate": 1e-05, "loss": 0.0166, "step": 1196900 }, { "epoch": 0.01197, "grad_norm": 0.1320088505744934, "learning_rate": 1e-05, "loss": 0.0167, "step": 1197000 }, { "epoch": 0.011971, "grad_norm": 0.11331596225500107, "learning_rate": 1e-05, "loss": 0.0165, "step": 1197100 }, { "epoch": 0.011972, "grad_norm": 0.1274658888578415, "learning_rate": 1e-05, "loss": 0.0169, "step": 1197200 }, { "epoch": 0.011973, "grad_norm": 0.15145401656627655, "learning_rate": 1e-05, "loss": 0.0169, "step": 1197300 }, { "epoch": 0.011974, "grad_norm": 0.1357748657464981, "learning_rate": 1e-05, "loss": 0.0164, "step": 1197400 }, { "epoch": 0.011975, "grad_norm": 0.14019443094730377, "learning_rate": 1e-05, "loss": 0.0168, "step": 1197500 }, { "epoch": 0.011976, "grad_norm": 0.11156771332025528, "learning_rate": 1e-05, "loss": 0.0171, "step": 1197600 }, { "epoch": 0.011977, "grad_norm": 0.11515803635120392, "learning_rate": 1e-05, "loss": 0.0171, "step": 1197700 }, { "epoch": 0.011978, "grad_norm": 0.12102801352739334, "learning_rate": 1e-05, "loss": 0.017, "step": 1197800 }, { "epoch": 0.011979, "grad_norm": 0.14640487730503082, "learning_rate": 1e-05, "loss": 0.0169, "step": 1197900 }, { "epoch": 0.01198, "grad_norm": 0.12998026609420776, "learning_rate": 1e-05, "loss": 0.0165, "step": 1198000 }, { "epoch": 0.011981, "grad_norm": 0.1790340542793274, "learning_rate": 1e-05, "loss": 0.0168, "step": 1198100 }, { "epoch": 0.011982, "grad_norm": 0.10772202908992767, "learning_rate": 1e-05, "loss": 0.0165, "step": 1198200 }, { "epoch": 0.011983, "grad_norm": 0.1283879280090332, "learning_rate": 1e-05, "loss": 0.0169, "step": 1198300 }, { "epoch": 0.011984, "grad_norm": 0.11450960487127304, "learning_rate": 1e-05, "loss": 0.0168, "step": 1198400 }, { "epoch": 0.011985, "grad_norm": 0.12765125930309296, "learning_rate": 1e-05, "loss": 0.0166, "step": 1198500 }, { "epoch": 0.011986, "grad_norm": 0.13437388837337494, "learning_rate": 1e-05, "loss": 0.0169, "step": 1198600 }, { "epoch": 0.011987, "grad_norm": 0.0993192195892334, "learning_rate": 1e-05, "loss": 0.0165, "step": 1198700 }, { "epoch": 0.011988, "grad_norm": 0.12928719818592072, "learning_rate": 1e-05, "loss": 0.0163, "step": 1198800 }, { "epoch": 0.011989, "grad_norm": 0.13342782855033875, "learning_rate": 1e-05, "loss": 0.0168, "step": 1198900 }, { "epoch": 0.01199, "grad_norm": 0.10666661709547043, "learning_rate": 1e-05, "loss": 0.017, "step": 1199000 }, { "epoch": 0.011991, "grad_norm": 0.1736457645893097, "learning_rate": 1e-05, "loss": 0.0166, "step": 1199100 }, { "epoch": 0.011992, "grad_norm": 0.15409226715564728, "learning_rate": 1e-05, "loss": 0.0163, "step": 1199200 }, { "epoch": 0.011993, "grad_norm": 0.12451795488595963, "learning_rate": 1e-05, "loss": 0.0164, "step": 1199300 }, { "epoch": 0.011994, "grad_norm": 0.12548522651195526, "learning_rate": 1e-05, "loss": 0.0169, "step": 1199400 }, { "epoch": 0.011995, "grad_norm": 0.10486599057912827, "learning_rate": 1e-05, "loss": 0.0165, "step": 1199500 }, { "epoch": 0.011996, "grad_norm": 0.135768324136734, "learning_rate": 1e-05, "loss": 0.0166, "step": 1199600 }, { "epoch": 0.011997, "grad_norm": 0.11539030075073242, "learning_rate": 1e-05, "loss": 0.0172, "step": 1199700 }, { "epoch": 0.011998, "grad_norm": 0.13478009402751923, "learning_rate": 1e-05, "loss": 0.0165, "step": 1199800 }, { "epoch": 0.011999, "grad_norm": 0.145995631814003, "learning_rate": 1e-05, "loss": 0.0167, "step": 1199900 }, { "epoch": 0.012, "grad_norm": 0.11513392627239227, "learning_rate": 1e-05, "loss": 0.0164, "step": 1200000 }, { "epoch": 0.012, "eval_loss": 0.014706585556268692, "eval_runtime": 187.3529, "eval_samples_per_second": 266.876, "eval_steps_per_second": 16.68, "step": 1200000 }, { "epoch": 0.012001, "grad_norm": 0.14404995739459991, "learning_rate": 1e-05, "loss": 0.0168, "step": 1200100 }, { "epoch": 0.012002, "grad_norm": 0.142418771982193, "learning_rate": 1e-05, "loss": 0.0169, "step": 1200200 }, { "epoch": 0.012003, "grad_norm": 0.17927217483520508, "learning_rate": 1e-05, "loss": 0.0167, "step": 1200300 }, { "epoch": 0.012004, "grad_norm": 0.11811665445566177, "learning_rate": 1e-05, "loss": 0.0164, "step": 1200400 }, { "epoch": 0.012005, "grad_norm": 0.1346016824245453, "learning_rate": 1e-05, "loss": 0.017, "step": 1200500 }, { "epoch": 0.012006, "grad_norm": 0.18777363002300262, "learning_rate": 1e-05, "loss": 0.0164, "step": 1200600 }, { "epoch": 0.012007, "grad_norm": 0.14997604489326477, "learning_rate": 1e-05, "loss": 0.0167, "step": 1200700 }, { "epoch": 0.012008, "grad_norm": 0.1984741985797882, "learning_rate": 1e-05, "loss": 0.0171, "step": 1200800 }, { "epoch": 0.012009, "grad_norm": 0.12745226919651031, "learning_rate": 1e-05, "loss": 0.0164, "step": 1200900 }, { "epoch": 0.01201, "grad_norm": 0.141198992729187, "learning_rate": 1e-05, "loss": 0.0166, "step": 1201000 }, { "epoch": 0.012011, "grad_norm": 0.12028905004262924, "learning_rate": 1e-05, "loss": 0.0168, "step": 1201100 }, { "epoch": 0.012012, "grad_norm": 0.1283135861158371, "learning_rate": 1e-05, "loss": 0.0168, "step": 1201200 }, { "epoch": 0.012013, "grad_norm": 0.18308821320533752, "learning_rate": 1e-05, "loss": 0.0165, "step": 1201300 }, { "epoch": 0.012014, "grad_norm": 0.08968440443277359, "learning_rate": 1e-05, "loss": 0.0166, "step": 1201400 }, { "epoch": 0.012015, "grad_norm": 0.11232523620128632, "learning_rate": 1e-05, "loss": 0.0167, "step": 1201500 }, { "epoch": 0.012016, "grad_norm": 0.14779126644134521, "learning_rate": 1e-05, "loss": 0.0169, "step": 1201600 }, { "epoch": 0.012017, "grad_norm": 0.14631012082099915, "learning_rate": 1e-05, "loss": 0.0167, "step": 1201700 }, { "epoch": 0.012018, "grad_norm": 0.13522560894489288, "learning_rate": 1e-05, "loss": 0.0163, "step": 1201800 }, { "epoch": 0.012019, "grad_norm": 0.1406165212392807, "learning_rate": 1e-05, "loss": 0.0167, "step": 1201900 }, { "epoch": 0.01202, "grad_norm": 0.139296755194664, "learning_rate": 1e-05, "loss": 0.0174, "step": 1202000 }, { "epoch": 0.012021, "grad_norm": 0.17596475780010223, "learning_rate": 1e-05, "loss": 0.0164, "step": 1202100 }, { "epoch": 0.012022, "grad_norm": 0.13216130435466766, "learning_rate": 1e-05, "loss": 0.0166, "step": 1202200 }, { "epoch": 0.012023, "grad_norm": 0.170655757188797, "learning_rate": 1e-05, "loss": 0.017, "step": 1202300 }, { "epoch": 0.012024, "grad_norm": 0.15912528336048126, "learning_rate": 1e-05, "loss": 0.0162, "step": 1202400 }, { "epoch": 0.012025, "grad_norm": 0.12798205018043518, "learning_rate": 1e-05, "loss": 0.0166, "step": 1202500 }, { "epoch": 0.012026, "grad_norm": 0.1303805410861969, "learning_rate": 1e-05, "loss": 0.0168, "step": 1202600 }, { "epoch": 0.012027, "grad_norm": 0.2073034644126892, "learning_rate": 1e-05, "loss": 0.0168, "step": 1202700 }, { "epoch": 0.012028, "grad_norm": 0.1464296281337738, "learning_rate": 1e-05, "loss": 0.0165, "step": 1202800 }, { "epoch": 0.012029, "grad_norm": 0.1614082157611847, "learning_rate": 1e-05, "loss": 0.0167, "step": 1202900 }, { "epoch": 0.01203, "grad_norm": 0.16222982108592987, "learning_rate": 1e-05, "loss": 0.0167, "step": 1203000 }, { "epoch": 0.012031, "grad_norm": 0.13617777824401855, "learning_rate": 1e-05, "loss": 0.0168, "step": 1203100 }, { "epoch": 0.012032, "grad_norm": 0.15030916035175323, "learning_rate": 1e-05, "loss": 0.0165, "step": 1203200 }, { "epoch": 0.012033, "grad_norm": 0.19970600306987762, "learning_rate": 1e-05, "loss": 0.017, "step": 1203300 }, { "epoch": 0.012034, "grad_norm": 0.13554109632968903, "learning_rate": 1e-05, "loss": 0.0169, "step": 1203400 }, { "epoch": 0.012035, "grad_norm": 0.13934408128261566, "learning_rate": 1e-05, "loss": 0.0167, "step": 1203500 }, { "epoch": 0.012036, "grad_norm": 0.1297684907913208, "learning_rate": 1e-05, "loss": 0.0168, "step": 1203600 }, { "epoch": 0.012037, "grad_norm": 0.10141365975141525, "learning_rate": 1e-05, "loss": 0.0165, "step": 1203700 }, { "epoch": 0.012038, "grad_norm": 0.13906808197498322, "learning_rate": 1e-05, "loss": 0.0166, "step": 1203800 }, { "epoch": 0.012039, "grad_norm": 0.1666269153356552, "learning_rate": 1e-05, "loss": 0.0167, "step": 1203900 }, { "epoch": 0.01204, "grad_norm": 0.13140156865119934, "learning_rate": 1e-05, "loss": 0.0166, "step": 1204000 }, { "epoch": 0.012041, "grad_norm": 0.13209408521652222, "learning_rate": 1e-05, "loss": 0.0165, "step": 1204100 }, { "epoch": 0.012042, "grad_norm": 0.16950468719005585, "learning_rate": 1e-05, "loss": 0.0166, "step": 1204200 }, { "epoch": 0.012043, "grad_norm": 0.18820008635520935, "learning_rate": 1e-05, "loss": 0.0167, "step": 1204300 }, { "epoch": 0.012044, "grad_norm": 0.10760045796632767, "learning_rate": 1e-05, "loss": 0.0165, "step": 1204400 }, { "epoch": 0.012045, "grad_norm": 0.17801836133003235, "learning_rate": 1e-05, "loss": 0.0168, "step": 1204500 }, { "epoch": 0.012046, "grad_norm": 0.1290166974067688, "learning_rate": 1e-05, "loss": 0.017, "step": 1204600 }, { "epoch": 0.012047, "grad_norm": 0.14603769779205322, "learning_rate": 1e-05, "loss": 0.0168, "step": 1204700 }, { "epoch": 0.012048, "grad_norm": 0.12670490145683289, "learning_rate": 1e-05, "loss": 0.0166, "step": 1204800 }, { "epoch": 0.012049, "grad_norm": 0.12133526802062988, "learning_rate": 1e-05, "loss": 0.0169, "step": 1204900 }, { "epoch": 0.01205, "grad_norm": 0.14104501903057098, "learning_rate": 1e-05, "loss": 0.0165, "step": 1205000 }, { "epoch": 0.012051, "grad_norm": 0.10385686159133911, "learning_rate": 1e-05, "loss": 0.0166, "step": 1205100 }, { "epoch": 0.012052, "grad_norm": 0.16703738272190094, "learning_rate": 1e-05, "loss": 0.0166, "step": 1205200 }, { "epoch": 0.012053, "grad_norm": 0.1510094255208969, "learning_rate": 1e-05, "loss": 0.0165, "step": 1205300 }, { "epoch": 0.012054, "grad_norm": 0.12585869431495667, "learning_rate": 1e-05, "loss": 0.0171, "step": 1205400 }, { "epoch": 0.012055, "grad_norm": 0.19592173397541046, "learning_rate": 1e-05, "loss": 0.0165, "step": 1205500 }, { "epoch": 0.012056, "grad_norm": 0.12790891528129578, "learning_rate": 1e-05, "loss": 0.0168, "step": 1205600 }, { "epoch": 0.012057, "grad_norm": 0.14024902880191803, "learning_rate": 1e-05, "loss": 0.017, "step": 1205700 }, { "epoch": 0.012058, "grad_norm": 0.1110956072807312, "learning_rate": 1e-05, "loss": 0.0161, "step": 1205800 }, { "epoch": 0.012059, "grad_norm": 0.10311520099639893, "learning_rate": 1e-05, "loss": 0.0171, "step": 1205900 }, { "epoch": 0.01206, "grad_norm": 0.14057418704032898, "learning_rate": 1e-05, "loss": 0.0166, "step": 1206000 }, { "epoch": 0.012061, "grad_norm": 0.14866602420806885, "learning_rate": 1e-05, "loss": 0.0172, "step": 1206100 }, { "epoch": 0.012062, "grad_norm": 0.15765097737312317, "learning_rate": 1e-05, "loss": 0.0168, "step": 1206200 }, { "epoch": 0.012063, "grad_norm": 0.16957764327526093, "learning_rate": 1e-05, "loss": 0.0165, "step": 1206300 }, { "epoch": 0.012064, "grad_norm": 0.11388910561800003, "learning_rate": 1e-05, "loss": 0.0171, "step": 1206400 }, { "epoch": 0.012065, "grad_norm": 0.107940174639225, "learning_rate": 1e-05, "loss": 0.0169, "step": 1206500 }, { "epoch": 0.012066, "grad_norm": 0.12754173576831818, "learning_rate": 1e-05, "loss": 0.0169, "step": 1206600 }, { "epoch": 0.012067, "grad_norm": 0.1123332753777504, "learning_rate": 1e-05, "loss": 0.0167, "step": 1206700 }, { "epoch": 0.012068, "grad_norm": 0.23252524435520172, "learning_rate": 1e-05, "loss": 0.0169, "step": 1206800 }, { "epoch": 0.012069, "grad_norm": 0.21090523898601532, "learning_rate": 1e-05, "loss": 0.0167, "step": 1206900 }, { "epoch": 0.01207, "grad_norm": 0.13948559761047363, "learning_rate": 1e-05, "loss": 0.0166, "step": 1207000 }, { "epoch": 0.012071, "grad_norm": 0.16115550696849823, "learning_rate": 1e-05, "loss": 0.0165, "step": 1207100 }, { "epoch": 0.012072, "grad_norm": 0.2028600126504898, "learning_rate": 1e-05, "loss": 0.0163, "step": 1207200 }, { "epoch": 0.012073, "grad_norm": 0.1669304519891739, "learning_rate": 1e-05, "loss": 0.0164, "step": 1207300 }, { "epoch": 0.012074, "grad_norm": 0.20607459545135498, "learning_rate": 1e-05, "loss": 0.0169, "step": 1207400 }, { "epoch": 0.012075, "grad_norm": 0.13028477132320404, "learning_rate": 1e-05, "loss": 0.0165, "step": 1207500 }, { "epoch": 0.012076, "grad_norm": 0.13052546977996826, "learning_rate": 1e-05, "loss": 0.0164, "step": 1207600 }, { "epoch": 0.012077, "grad_norm": 0.19329792261123657, "learning_rate": 1e-05, "loss": 0.0166, "step": 1207700 }, { "epoch": 0.012078, "grad_norm": 0.11747283488512039, "learning_rate": 1e-05, "loss": 0.0167, "step": 1207800 }, { "epoch": 0.012079, "grad_norm": 0.12845565378665924, "learning_rate": 1e-05, "loss": 0.0164, "step": 1207900 }, { "epoch": 0.01208, "grad_norm": 0.15835730731487274, "learning_rate": 1e-05, "loss": 0.0167, "step": 1208000 }, { "epoch": 0.012081, "grad_norm": 0.15932190418243408, "learning_rate": 1e-05, "loss": 0.0168, "step": 1208100 }, { "epoch": 0.012082, "grad_norm": 0.10274434834718704, "learning_rate": 1e-05, "loss": 0.0165, "step": 1208200 }, { "epoch": 0.012083, "grad_norm": 0.10520465672016144, "learning_rate": 1e-05, "loss": 0.0164, "step": 1208300 }, { "epoch": 0.012084, "grad_norm": 0.13782210648059845, "learning_rate": 1e-05, "loss": 0.0166, "step": 1208400 }, { "epoch": 0.012085, "grad_norm": 0.1268666833639145, "learning_rate": 1e-05, "loss": 0.0165, "step": 1208500 }, { "epoch": 0.012086, "grad_norm": 0.13191552460193634, "learning_rate": 1e-05, "loss": 0.0167, "step": 1208600 }, { "epoch": 0.012087, "grad_norm": 0.11114289611577988, "learning_rate": 1e-05, "loss": 0.0165, "step": 1208700 }, { "epoch": 0.012088, "grad_norm": 0.09573089331388474, "learning_rate": 1e-05, "loss": 0.0166, "step": 1208800 }, { "epoch": 0.012089, "grad_norm": 0.11679085344076157, "learning_rate": 1e-05, "loss": 0.0164, "step": 1208900 }, { "epoch": 0.01209, "grad_norm": 0.11636869609355927, "learning_rate": 1e-05, "loss": 0.0166, "step": 1209000 }, { "epoch": 0.012091, "grad_norm": 0.1495743989944458, "learning_rate": 1e-05, "loss": 0.0166, "step": 1209100 }, { "epoch": 0.012092, "grad_norm": 0.11868133395910263, "learning_rate": 1e-05, "loss": 0.0166, "step": 1209200 }, { "epoch": 0.012093, "grad_norm": 0.1360892355442047, "learning_rate": 1e-05, "loss": 0.0169, "step": 1209300 }, { "epoch": 0.012094, "grad_norm": 0.1283402442932129, "learning_rate": 1e-05, "loss": 0.017, "step": 1209400 }, { "epoch": 0.012095, "grad_norm": 0.1072293370962143, "learning_rate": 1e-05, "loss": 0.0166, "step": 1209500 }, { "epoch": 0.012096, "grad_norm": 0.1197153627872467, "learning_rate": 1e-05, "loss": 0.0171, "step": 1209600 }, { "epoch": 0.012097, "grad_norm": 0.15337830781936646, "learning_rate": 1e-05, "loss": 0.0164, "step": 1209700 }, { "epoch": 0.012098, "grad_norm": 0.10099601745605469, "learning_rate": 1e-05, "loss": 0.0163, "step": 1209800 }, { "epoch": 0.012099, "grad_norm": 0.11982204765081406, "learning_rate": 1e-05, "loss": 0.0165, "step": 1209900 }, { "epoch": 0.0121, "grad_norm": 0.12175257503986359, "learning_rate": 1e-05, "loss": 0.0165, "step": 1210000 }, { "epoch": 0.012101, "grad_norm": 0.15186819434165955, "learning_rate": 1e-05, "loss": 0.0169, "step": 1210100 }, { "epoch": 0.012102, "grad_norm": 0.16520968079566956, "learning_rate": 1e-05, "loss": 0.0168, "step": 1210200 }, { "epoch": 0.012103, "grad_norm": 0.11475227773189545, "learning_rate": 1e-05, "loss": 0.0165, "step": 1210300 }, { "epoch": 0.012104, "grad_norm": 0.19053272902965546, "learning_rate": 1e-05, "loss": 0.0172, "step": 1210400 }, { "epoch": 0.012105, "grad_norm": 0.11372590065002441, "learning_rate": 1e-05, "loss": 0.0167, "step": 1210500 }, { "epoch": 0.012106, "grad_norm": 0.099099300801754, "learning_rate": 1e-05, "loss": 0.0168, "step": 1210600 }, { "epoch": 0.012107, "grad_norm": 0.11581891775131226, "learning_rate": 1e-05, "loss": 0.0169, "step": 1210700 }, { "epoch": 0.012108, "grad_norm": 0.10421395301818848, "learning_rate": 1e-05, "loss": 0.0168, "step": 1210800 }, { "epoch": 0.012109, "grad_norm": 0.18260137736797333, "learning_rate": 1e-05, "loss": 0.0165, "step": 1210900 }, { "epoch": 0.01211, "grad_norm": 0.12769566476345062, "learning_rate": 1e-05, "loss": 0.0165, "step": 1211000 }, { "epoch": 0.012111, "grad_norm": 0.14791205525398254, "learning_rate": 1e-05, "loss": 0.017, "step": 1211100 }, { "epoch": 0.012112, "grad_norm": 0.13087889552116394, "learning_rate": 1e-05, "loss": 0.0168, "step": 1211200 }, { "epoch": 0.012113, "grad_norm": 0.11083109676837921, "learning_rate": 1e-05, "loss": 0.0165, "step": 1211300 }, { "epoch": 0.012114, "grad_norm": 0.17175497114658356, "learning_rate": 1e-05, "loss": 0.017, "step": 1211400 }, { "epoch": 0.012115, "grad_norm": 0.10161768645048141, "learning_rate": 1e-05, "loss": 0.0162, "step": 1211500 }, { "epoch": 0.012116, "grad_norm": 0.14938247203826904, "learning_rate": 1e-05, "loss": 0.0167, "step": 1211600 }, { "epoch": 0.012117, "grad_norm": 0.12910504639148712, "learning_rate": 1e-05, "loss": 0.0169, "step": 1211700 }, { "epoch": 0.012118, "grad_norm": 0.1083836778998375, "learning_rate": 1e-05, "loss": 0.0166, "step": 1211800 }, { "epoch": 0.012119, "grad_norm": 0.11115726828575134, "learning_rate": 1e-05, "loss": 0.017, "step": 1211900 }, { "epoch": 0.01212, "grad_norm": 0.15211479365825653, "learning_rate": 1e-05, "loss": 0.0167, "step": 1212000 }, { "epoch": 0.012121, "grad_norm": 0.11752162873744965, "learning_rate": 1e-05, "loss": 0.0167, "step": 1212100 }, { "epoch": 0.012122, "grad_norm": 0.14008988440036774, "learning_rate": 1e-05, "loss": 0.0168, "step": 1212200 }, { "epoch": 0.012123, "grad_norm": 0.1255551278591156, "learning_rate": 1e-05, "loss": 0.0168, "step": 1212300 }, { "epoch": 0.012124, "grad_norm": 0.1315186470746994, "learning_rate": 1e-05, "loss": 0.0165, "step": 1212400 }, { "epoch": 0.012125, "grad_norm": 0.09834837913513184, "learning_rate": 1e-05, "loss": 0.0167, "step": 1212500 }, { "epoch": 0.012126, "grad_norm": 0.12014560401439667, "learning_rate": 1e-05, "loss": 0.0172, "step": 1212600 }, { "epoch": 0.012127, "grad_norm": 0.1522642970085144, "learning_rate": 1e-05, "loss": 0.0167, "step": 1212700 }, { "epoch": 0.012128, "grad_norm": 0.1131434217095375, "learning_rate": 1e-05, "loss": 0.0166, "step": 1212800 }, { "epoch": 0.012129, "grad_norm": 0.12043026834726334, "learning_rate": 1e-05, "loss": 0.0168, "step": 1212900 }, { "epoch": 0.01213, "grad_norm": 0.1302383542060852, "learning_rate": 1e-05, "loss": 0.0169, "step": 1213000 }, { "epoch": 0.012131, "grad_norm": 0.137814462184906, "learning_rate": 1e-05, "loss": 0.0165, "step": 1213100 }, { "epoch": 0.012132, "grad_norm": 0.12041417509317398, "learning_rate": 1e-05, "loss": 0.017, "step": 1213200 }, { "epoch": 0.012133, "grad_norm": 0.3421232998371124, "learning_rate": 1e-05, "loss": 0.0167, "step": 1213300 }, { "epoch": 0.012134, "grad_norm": 0.13274028897285461, "learning_rate": 1e-05, "loss": 0.0167, "step": 1213400 }, { "epoch": 0.012135, "grad_norm": 0.14562849700450897, "learning_rate": 1e-05, "loss": 0.0168, "step": 1213500 }, { "epoch": 0.012136, "grad_norm": 0.14058604836463928, "learning_rate": 1e-05, "loss": 0.017, "step": 1213600 }, { "epoch": 0.012137, "grad_norm": 0.12173343449831009, "learning_rate": 1e-05, "loss": 0.0172, "step": 1213700 }, { "epoch": 0.012138, "grad_norm": 0.14680598676204681, "learning_rate": 1e-05, "loss": 0.0167, "step": 1213800 }, { "epoch": 0.012139, "grad_norm": 0.12585300207138062, "learning_rate": 1e-05, "loss": 0.0167, "step": 1213900 }, { "epoch": 0.01214, "grad_norm": 0.09827792644500732, "learning_rate": 1e-05, "loss": 0.017, "step": 1214000 }, { "epoch": 0.012141, "grad_norm": 0.11982507258653641, "learning_rate": 1e-05, "loss": 0.0162, "step": 1214100 }, { "epoch": 0.012142, "grad_norm": 0.12777383625507355, "learning_rate": 1e-05, "loss": 0.0169, "step": 1214200 }, { "epoch": 0.012143, "grad_norm": 0.12758547067642212, "learning_rate": 1e-05, "loss": 0.0168, "step": 1214300 }, { "epoch": 0.012144, "grad_norm": 0.1205534115433693, "learning_rate": 1e-05, "loss": 0.017, "step": 1214400 }, { "epoch": 0.012145, "grad_norm": 0.11297633498907089, "learning_rate": 1e-05, "loss": 0.0165, "step": 1214500 }, { "epoch": 0.012146, "grad_norm": 0.1312529593706131, "learning_rate": 1e-05, "loss": 0.0165, "step": 1214600 }, { "epoch": 0.012147, "grad_norm": 0.14365728199481964, "learning_rate": 1e-05, "loss": 0.0169, "step": 1214700 }, { "epoch": 0.012148, "grad_norm": 0.1418524533510208, "learning_rate": 1e-05, "loss": 0.0167, "step": 1214800 }, { "epoch": 0.012149, "grad_norm": 0.1056717112660408, "learning_rate": 1e-05, "loss": 0.0164, "step": 1214900 }, { "epoch": 0.01215, "grad_norm": 0.1304047554731369, "learning_rate": 1e-05, "loss": 0.0166, "step": 1215000 }, { "epoch": 0.012151, "grad_norm": 0.132797509431839, "learning_rate": 1e-05, "loss": 0.0166, "step": 1215100 }, { "epoch": 0.012152, "grad_norm": 0.1392967700958252, "learning_rate": 1e-05, "loss": 0.0169, "step": 1215200 }, { "epoch": 0.012153, "grad_norm": 0.1748858392238617, "learning_rate": 1e-05, "loss": 0.0165, "step": 1215300 }, { "epoch": 0.012154, "grad_norm": 0.13337890803813934, "learning_rate": 1e-05, "loss": 0.0171, "step": 1215400 }, { "epoch": 0.012155, "grad_norm": 0.1397264301776886, "learning_rate": 1e-05, "loss": 0.0161, "step": 1215500 }, { "epoch": 0.012156, "grad_norm": 0.1255921721458435, "learning_rate": 1e-05, "loss": 0.0173, "step": 1215600 }, { "epoch": 0.012157, "grad_norm": 0.12312144786119461, "learning_rate": 1e-05, "loss": 0.0167, "step": 1215700 }, { "epoch": 0.012158, "grad_norm": 0.11309820413589478, "learning_rate": 1e-05, "loss": 0.0168, "step": 1215800 }, { "epoch": 0.012159, "grad_norm": 0.14869049191474915, "learning_rate": 1e-05, "loss": 0.0171, "step": 1215900 }, { "epoch": 0.01216, "grad_norm": 0.09053853154182434, "learning_rate": 1e-05, "loss": 0.0168, "step": 1216000 }, { "epoch": 0.012161, "grad_norm": 0.15113750100135803, "learning_rate": 1e-05, "loss": 0.0168, "step": 1216100 }, { "epoch": 0.012162, "grad_norm": 0.11148776113986969, "learning_rate": 1e-05, "loss": 0.0162, "step": 1216200 }, { "epoch": 0.012163, "grad_norm": 0.12633630633354187, "learning_rate": 1e-05, "loss": 0.0166, "step": 1216300 }, { "epoch": 0.012164, "grad_norm": 0.1645778864622116, "learning_rate": 1e-05, "loss": 0.0168, "step": 1216400 }, { "epoch": 0.012165, "grad_norm": 0.1391361504793167, "learning_rate": 1e-05, "loss": 0.0165, "step": 1216500 }, { "epoch": 0.012166, "grad_norm": 0.137956440448761, "learning_rate": 1e-05, "loss": 0.0164, "step": 1216600 }, { "epoch": 0.012167, "grad_norm": 0.135931596159935, "learning_rate": 1e-05, "loss": 0.0165, "step": 1216700 }, { "epoch": 0.012168, "grad_norm": 0.14417396485805511, "learning_rate": 1e-05, "loss": 0.0166, "step": 1216800 }, { "epoch": 0.012169, "grad_norm": 0.11697284877300262, "learning_rate": 1e-05, "loss": 0.0165, "step": 1216900 }, { "epoch": 0.01217, "grad_norm": 0.19173003733158112, "learning_rate": 1e-05, "loss": 0.0168, "step": 1217000 }, { "epoch": 0.012171, "grad_norm": 0.16003122925758362, "learning_rate": 1e-05, "loss": 0.0167, "step": 1217100 }, { "epoch": 0.012172, "grad_norm": 0.1452472060918808, "learning_rate": 1e-05, "loss": 0.0167, "step": 1217200 }, { "epoch": 0.012173, "grad_norm": 0.10722502321004868, "learning_rate": 1e-05, "loss": 0.0167, "step": 1217300 }, { "epoch": 0.012174, "grad_norm": 0.17378394305706024, "learning_rate": 1e-05, "loss": 0.0165, "step": 1217400 }, { "epoch": 0.012175, "grad_norm": 0.10588259994983673, "learning_rate": 1e-05, "loss": 0.0165, "step": 1217500 }, { "epoch": 0.012176, "grad_norm": 0.10722053796052933, "learning_rate": 1e-05, "loss": 0.0169, "step": 1217600 }, { "epoch": 0.012177, "grad_norm": 0.10096541047096252, "learning_rate": 1e-05, "loss": 0.0163, "step": 1217700 }, { "epoch": 0.012178, "grad_norm": 0.14672550559043884, "learning_rate": 1e-05, "loss": 0.0167, "step": 1217800 }, { "epoch": 0.012179, "grad_norm": 0.1007736474275589, "learning_rate": 1e-05, "loss": 0.0165, "step": 1217900 }, { "epoch": 0.01218, "grad_norm": 0.12915155291557312, "learning_rate": 1e-05, "loss": 0.0168, "step": 1218000 }, { "epoch": 0.012181, "grad_norm": 0.11095314472913742, "learning_rate": 1e-05, "loss": 0.0163, "step": 1218100 }, { "epoch": 0.012182, "grad_norm": 0.14506641030311584, "learning_rate": 1e-05, "loss": 0.0166, "step": 1218200 }, { "epoch": 0.012183, "grad_norm": 0.11816142499446869, "learning_rate": 1e-05, "loss": 0.0163, "step": 1218300 }, { "epoch": 0.012184, "grad_norm": 0.183712437748909, "learning_rate": 1e-05, "loss": 0.0167, "step": 1218400 }, { "epoch": 0.012185, "grad_norm": 0.16358590126037598, "learning_rate": 1e-05, "loss": 0.0163, "step": 1218500 }, { "epoch": 0.012186, "grad_norm": 0.18492618203163147, "learning_rate": 1e-05, "loss": 0.0167, "step": 1218600 }, { "epoch": 0.012187, "grad_norm": 0.2051723599433899, "learning_rate": 1e-05, "loss": 0.0165, "step": 1218700 }, { "epoch": 0.012188, "grad_norm": 0.1380273699760437, "learning_rate": 1e-05, "loss": 0.017, "step": 1218800 }, { "epoch": 0.012189, "grad_norm": 0.15595710277557373, "learning_rate": 1e-05, "loss": 0.0168, "step": 1218900 }, { "epoch": 0.01219, "grad_norm": 0.12483411282300949, "learning_rate": 1e-05, "loss": 0.0168, "step": 1219000 }, { "epoch": 0.012191, "grad_norm": 0.1323152482509613, "learning_rate": 1e-05, "loss": 0.0169, "step": 1219100 }, { "epoch": 0.012192, "grad_norm": 0.1147628203034401, "learning_rate": 1e-05, "loss": 0.0166, "step": 1219200 }, { "epoch": 0.012193, "grad_norm": 0.1025242730975151, "learning_rate": 1e-05, "loss": 0.0164, "step": 1219300 }, { "epoch": 0.012194, "grad_norm": 0.18249954283237457, "learning_rate": 1e-05, "loss": 0.0161, "step": 1219400 }, { "epoch": 0.012195, "grad_norm": 0.1881493628025055, "learning_rate": 1e-05, "loss": 0.0165, "step": 1219500 }, { "epoch": 0.012196, "grad_norm": 0.12005747109651566, "learning_rate": 1e-05, "loss": 0.0166, "step": 1219600 }, { "epoch": 0.012197, "grad_norm": 0.1267741173505783, "learning_rate": 1e-05, "loss": 0.0165, "step": 1219700 }, { "epoch": 0.012198, "grad_norm": 0.12955324351787567, "learning_rate": 1e-05, "loss": 0.0168, "step": 1219800 }, { "epoch": 0.012199, "grad_norm": 0.16792239248752594, "learning_rate": 1e-05, "loss": 0.0168, "step": 1219900 }, { "epoch": 0.0122, "grad_norm": 0.13794705271720886, "learning_rate": 1e-05, "loss": 0.0172, "step": 1220000 }, { "epoch": 0.0122, "eval_loss": 0.014989226125180721, "eval_runtime": 189.949, "eval_samples_per_second": 263.229, "eval_steps_per_second": 16.452, "step": 1220000 }, { "epoch": 0.012201, "grad_norm": 0.12927521765232086, "learning_rate": 1e-05, "loss": 0.017, "step": 1220100 }, { "epoch": 0.012202, "grad_norm": 0.13747094571590424, "learning_rate": 1e-05, "loss": 0.0169, "step": 1220200 }, { "epoch": 0.012203, "grad_norm": 0.13467971980571747, "learning_rate": 1e-05, "loss": 0.0165, "step": 1220300 }, { "epoch": 0.012204, "grad_norm": 0.14022061228752136, "learning_rate": 1e-05, "loss": 0.0163, "step": 1220400 }, { "epoch": 0.012205, "grad_norm": 0.13987261056900024, "learning_rate": 1e-05, "loss": 0.0163, "step": 1220500 }, { "epoch": 0.012206, "grad_norm": 0.14290989935398102, "learning_rate": 1e-05, "loss": 0.0163, "step": 1220600 }, { "epoch": 0.012207, "grad_norm": 0.14966504275798798, "learning_rate": 1e-05, "loss": 0.0165, "step": 1220700 }, { "epoch": 0.012208, "grad_norm": 0.1053268164396286, "learning_rate": 1e-05, "loss": 0.0163, "step": 1220800 }, { "epoch": 0.012209, "grad_norm": 0.14838358759880066, "learning_rate": 1e-05, "loss": 0.0171, "step": 1220900 }, { "epoch": 0.01221, "grad_norm": 0.12291150540113449, "learning_rate": 1e-05, "loss": 0.017, "step": 1221000 }, { "epoch": 0.012211, "grad_norm": 0.11135499179363251, "learning_rate": 1e-05, "loss": 0.0167, "step": 1221100 }, { "epoch": 0.012212, "grad_norm": 0.0977715402841568, "learning_rate": 1e-05, "loss": 0.0168, "step": 1221200 }, { "epoch": 0.012213, "grad_norm": 0.14506515860557556, "learning_rate": 1e-05, "loss": 0.0167, "step": 1221300 }, { "epoch": 0.012214, "grad_norm": 0.13602767884731293, "learning_rate": 1e-05, "loss": 0.017, "step": 1221400 }, { "epoch": 0.012215, "grad_norm": 0.11184143275022507, "learning_rate": 1e-05, "loss": 0.0165, "step": 1221500 }, { "epoch": 0.012216, "grad_norm": 0.24596814811229706, "learning_rate": 1e-05, "loss": 0.0168, "step": 1221600 }, { "epoch": 0.012217, "grad_norm": 0.14339177310466766, "learning_rate": 1e-05, "loss": 0.0168, "step": 1221700 }, { "epoch": 0.012218, "grad_norm": 0.14971773326396942, "learning_rate": 1e-05, "loss": 0.017, "step": 1221800 }, { "epoch": 0.012219, "grad_norm": 0.10817568749189377, "learning_rate": 1e-05, "loss": 0.0164, "step": 1221900 }, { "epoch": 0.01222, "grad_norm": 0.09504052251577377, "learning_rate": 1e-05, "loss": 0.0167, "step": 1222000 }, { "epoch": 0.012221, "grad_norm": 0.10545291006565094, "learning_rate": 1e-05, "loss": 0.0168, "step": 1222100 }, { "epoch": 0.012222, "grad_norm": 0.1238616555929184, "learning_rate": 1e-05, "loss": 0.0164, "step": 1222200 }, { "epoch": 0.012223, "grad_norm": 0.09680212289094925, "learning_rate": 1e-05, "loss": 0.0166, "step": 1222300 }, { "epoch": 0.012224, "grad_norm": 0.19927051663398743, "learning_rate": 1e-05, "loss": 0.0166, "step": 1222400 }, { "epoch": 0.012225, "grad_norm": 0.10427572578191757, "learning_rate": 1e-05, "loss": 0.0169, "step": 1222500 }, { "epoch": 0.012226, "grad_norm": 0.13661189377307892, "learning_rate": 1e-05, "loss": 0.0165, "step": 1222600 }, { "epoch": 0.012227, "grad_norm": 0.13439615070819855, "learning_rate": 1e-05, "loss": 0.0165, "step": 1222700 }, { "epoch": 0.012228, "grad_norm": 0.1128556951880455, "learning_rate": 1e-05, "loss": 0.0165, "step": 1222800 }, { "epoch": 0.012229, "grad_norm": 0.112553671002388, "learning_rate": 1e-05, "loss": 0.0168, "step": 1222900 }, { "epoch": 0.01223, "grad_norm": 0.12621545791625977, "learning_rate": 1e-05, "loss": 0.0168, "step": 1223000 }, { "epoch": 0.012231, "grad_norm": 0.0930081382393837, "learning_rate": 1e-05, "loss": 0.0166, "step": 1223100 }, { "epoch": 0.012232, "grad_norm": 0.14148961007595062, "learning_rate": 1e-05, "loss": 0.0167, "step": 1223200 }, { "epoch": 0.012233, "grad_norm": 0.14016461372375488, "learning_rate": 1e-05, "loss": 0.0164, "step": 1223300 }, { "epoch": 0.012234, "grad_norm": 0.15427567064762115, "learning_rate": 1e-05, "loss": 0.0165, "step": 1223400 }, { "epoch": 0.012235, "grad_norm": 0.12204691767692566, "learning_rate": 1e-05, "loss": 0.0166, "step": 1223500 }, { "epoch": 0.012236, "grad_norm": 0.14009778201580048, "learning_rate": 1e-05, "loss": 0.0165, "step": 1223600 }, { "epoch": 0.012237, "grad_norm": 0.10611068457365036, "learning_rate": 1e-05, "loss": 0.0168, "step": 1223700 }, { "epoch": 0.012238, "grad_norm": 0.10398045927286148, "learning_rate": 1e-05, "loss": 0.0168, "step": 1223800 }, { "epoch": 0.012239, "grad_norm": 0.10133989155292511, "learning_rate": 1e-05, "loss": 0.0169, "step": 1223900 }, { "epoch": 0.01224, "grad_norm": 0.0945114865899086, "learning_rate": 1e-05, "loss": 0.0164, "step": 1224000 }, { "epoch": 0.012241, "grad_norm": 0.1590738594532013, "learning_rate": 1e-05, "loss": 0.0167, "step": 1224100 }, { "epoch": 0.012242, "grad_norm": 0.3101329505443573, "learning_rate": 1e-05, "loss": 0.0164, "step": 1224200 }, { "epoch": 0.012243, "grad_norm": 0.1322154402732849, "learning_rate": 1e-05, "loss": 0.0169, "step": 1224300 }, { "epoch": 0.012244, "grad_norm": 0.10522852092981339, "learning_rate": 1e-05, "loss": 0.0165, "step": 1224400 }, { "epoch": 0.012245, "grad_norm": 0.12157471477985382, "learning_rate": 1e-05, "loss": 0.0165, "step": 1224500 }, { "epoch": 0.012246, "grad_norm": 0.12218520790338516, "learning_rate": 1e-05, "loss": 0.0165, "step": 1224600 }, { "epoch": 0.012247, "grad_norm": 0.14161069691181183, "learning_rate": 1e-05, "loss": 0.0168, "step": 1224700 }, { "epoch": 0.012248, "grad_norm": 0.14127179980278015, "learning_rate": 1e-05, "loss": 0.0165, "step": 1224800 }, { "epoch": 0.012249, "grad_norm": 0.13340580463409424, "learning_rate": 1e-05, "loss": 0.0168, "step": 1224900 }, { "epoch": 0.01225, "grad_norm": 0.1307450532913208, "learning_rate": 1e-05, "loss": 0.0165, "step": 1225000 }, { "epoch": 0.012251, "grad_norm": 0.12025213986635208, "learning_rate": 1e-05, "loss": 0.0164, "step": 1225100 }, { "epoch": 0.012252, "grad_norm": 0.10620395839214325, "learning_rate": 1e-05, "loss": 0.0162, "step": 1225200 }, { "epoch": 0.012253, "grad_norm": 0.13810870051383972, "learning_rate": 1e-05, "loss": 0.0166, "step": 1225300 }, { "epoch": 0.012254, "grad_norm": 0.1303487867116928, "learning_rate": 1e-05, "loss": 0.0161, "step": 1225400 }, { "epoch": 0.012255, "grad_norm": 0.11027608066797256, "learning_rate": 1e-05, "loss": 0.0166, "step": 1225500 }, { "epoch": 0.012256, "grad_norm": 0.11348742246627808, "learning_rate": 1e-05, "loss": 0.0166, "step": 1225600 }, { "epoch": 0.012257, "grad_norm": 0.11223984509706497, "learning_rate": 1e-05, "loss": 0.0162, "step": 1225700 }, { "epoch": 0.012258, "grad_norm": 0.12688615918159485, "learning_rate": 1e-05, "loss": 0.0169, "step": 1225800 }, { "epoch": 0.012259, "grad_norm": 0.11313784867525101, "learning_rate": 1e-05, "loss": 0.017, "step": 1225900 }, { "epoch": 0.01226, "grad_norm": 0.2582657039165497, "learning_rate": 1e-05, "loss": 0.017, "step": 1226000 }, { "epoch": 0.012261, "grad_norm": 0.17155946791172028, "learning_rate": 1e-05, "loss": 0.0165, "step": 1226100 }, { "epoch": 0.012262, "grad_norm": 0.18590600788593292, "learning_rate": 1e-05, "loss": 0.0168, "step": 1226200 }, { "epoch": 0.012263, "grad_norm": 0.10854554176330566, "learning_rate": 1e-05, "loss": 0.017, "step": 1226300 }, { "epoch": 0.012264, "grad_norm": 0.1038735955953598, "learning_rate": 1e-05, "loss": 0.017, "step": 1226400 }, { "epoch": 0.012265, "grad_norm": 0.12450878322124481, "learning_rate": 1e-05, "loss": 0.0166, "step": 1226500 }, { "epoch": 0.012266, "grad_norm": 0.1302800327539444, "learning_rate": 1e-05, "loss": 0.0166, "step": 1226600 }, { "epoch": 0.012267, "grad_norm": 0.17814509570598602, "learning_rate": 1e-05, "loss": 0.0168, "step": 1226700 }, { "epoch": 0.012268, "grad_norm": 0.1413719803094864, "learning_rate": 1e-05, "loss": 0.0165, "step": 1226800 }, { "epoch": 0.012269, "grad_norm": 0.11320644617080688, "learning_rate": 1e-05, "loss": 0.0168, "step": 1226900 }, { "epoch": 0.01227, "grad_norm": 0.10598450154066086, "learning_rate": 1e-05, "loss": 0.0168, "step": 1227000 }, { "epoch": 0.012271, "grad_norm": 0.12238800525665283, "learning_rate": 1e-05, "loss": 0.0168, "step": 1227100 }, { "epoch": 0.012272, "grad_norm": 0.12301570922136307, "learning_rate": 1e-05, "loss": 0.0164, "step": 1227200 }, { "epoch": 0.012273, "grad_norm": 0.13260649144649506, "learning_rate": 1e-05, "loss": 0.0163, "step": 1227300 }, { "epoch": 0.012274, "grad_norm": 0.12022124230861664, "learning_rate": 1e-05, "loss": 0.0165, "step": 1227400 }, { "epoch": 0.012275, "grad_norm": 0.12065128982067108, "learning_rate": 1e-05, "loss": 0.0167, "step": 1227500 }, { "epoch": 0.012276, "grad_norm": 0.13627223670482635, "learning_rate": 1e-05, "loss": 0.017, "step": 1227600 }, { "epoch": 0.012277, "grad_norm": 0.1534491926431656, "learning_rate": 1e-05, "loss": 0.0169, "step": 1227700 }, { "epoch": 0.012278, "grad_norm": 0.12512384355068207, "learning_rate": 1e-05, "loss": 0.0166, "step": 1227800 }, { "epoch": 0.012279, "grad_norm": 0.1346248984336853, "learning_rate": 1e-05, "loss": 0.0168, "step": 1227900 }, { "epoch": 0.01228, "grad_norm": 0.16124077141284943, "learning_rate": 1e-05, "loss": 0.0165, "step": 1228000 }, { "epoch": 0.012281, "grad_norm": 0.13808073103427887, "learning_rate": 1e-05, "loss": 0.0165, "step": 1228100 }, { "epoch": 0.012282, "grad_norm": 0.14006878435611725, "learning_rate": 1e-05, "loss": 0.0168, "step": 1228200 }, { "epoch": 0.012283, "grad_norm": 0.12316576391458511, "learning_rate": 1e-05, "loss": 0.0169, "step": 1228300 }, { "epoch": 0.012284, "grad_norm": 0.14692118763923645, "learning_rate": 1e-05, "loss": 0.0167, "step": 1228400 }, { "epoch": 0.012285, "grad_norm": 0.1363953948020935, "learning_rate": 1e-05, "loss": 0.0168, "step": 1228500 }, { "epoch": 0.012286, "grad_norm": 0.10653994977474213, "learning_rate": 1e-05, "loss": 0.0162, "step": 1228600 }, { "epoch": 0.012287, "grad_norm": 0.12909406423568726, "learning_rate": 1e-05, "loss": 0.0166, "step": 1228700 }, { "epoch": 0.012288, "grad_norm": 0.10905961692333221, "learning_rate": 1e-05, "loss": 0.0162, "step": 1228800 }, { "epoch": 0.012289, "grad_norm": 0.12822158634662628, "learning_rate": 1e-05, "loss": 0.0164, "step": 1228900 }, { "epoch": 0.01229, "grad_norm": 0.16369348764419556, "learning_rate": 1e-05, "loss": 0.0167, "step": 1229000 }, { "epoch": 0.012291, "grad_norm": 0.2059110403060913, "learning_rate": 1e-05, "loss": 0.0168, "step": 1229100 }, { "epoch": 0.012292, "grad_norm": 0.11825783550739288, "learning_rate": 1e-05, "loss": 0.0166, "step": 1229200 }, { "epoch": 0.012293, "grad_norm": 0.12902013957500458, "learning_rate": 1e-05, "loss": 0.0169, "step": 1229300 }, { "epoch": 0.012294, "grad_norm": 0.11094410717487335, "learning_rate": 1e-05, "loss": 0.0164, "step": 1229400 }, { "epoch": 0.012295, "grad_norm": 0.11624305695295334, "learning_rate": 1e-05, "loss": 0.0168, "step": 1229500 }, { "epoch": 0.012296, "grad_norm": 0.13199183344841003, "learning_rate": 1e-05, "loss": 0.0166, "step": 1229600 }, { "epoch": 0.012297, "grad_norm": 0.09708818048238754, "learning_rate": 1e-05, "loss": 0.0167, "step": 1229700 }, { "epoch": 0.012298, "grad_norm": 0.14675170183181763, "learning_rate": 1e-05, "loss": 0.0163, "step": 1229800 }, { "epoch": 0.012299, "grad_norm": 0.11988089978694916, "learning_rate": 1e-05, "loss": 0.0162, "step": 1229900 }, { "epoch": 0.0123, "grad_norm": 0.11448269337415695, "learning_rate": 1e-05, "loss": 0.0165, "step": 1230000 }, { "epoch": 0.012301, "grad_norm": 0.1760808229446411, "learning_rate": 1e-05, "loss": 0.0165, "step": 1230100 }, { "epoch": 0.012302, "grad_norm": 0.1436384916305542, "learning_rate": 1e-05, "loss": 0.0168, "step": 1230200 }, { "epoch": 0.012303, "grad_norm": 0.1286284625530243, "learning_rate": 1e-05, "loss": 0.0159, "step": 1230300 }, { "epoch": 0.012304, "grad_norm": 0.1253732591867447, "learning_rate": 1e-05, "loss": 0.0165, "step": 1230400 }, { "epoch": 0.012305, "grad_norm": 0.11589112132787704, "learning_rate": 1e-05, "loss": 0.0167, "step": 1230500 }, { "epoch": 0.012306, "grad_norm": 0.12960726022720337, "learning_rate": 1e-05, "loss": 0.0168, "step": 1230600 }, { "epoch": 0.012307, "grad_norm": 0.1524432897567749, "learning_rate": 1e-05, "loss": 0.0165, "step": 1230700 }, { "epoch": 0.012308, "grad_norm": 0.09832068532705307, "learning_rate": 1e-05, "loss": 0.0164, "step": 1230800 }, { "epoch": 0.012309, "grad_norm": 0.11431882530450821, "learning_rate": 1e-05, "loss": 0.0164, "step": 1230900 }, { "epoch": 0.01231, "grad_norm": 0.11774216592311859, "learning_rate": 1e-05, "loss": 0.0167, "step": 1231000 }, { "epoch": 0.012311, "grad_norm": 0.1657627522945404, "learning_rate": 1e-05, "loss": 0.0167, "step": 1231100 }, { "epoch": 0.012312, "grad_norm": 0.12819060683250427, "learning_rate": 1e-05, "loss": 0.0168, "step": 1231200 }, { "epoch": 0.012313, "grad_norm": 0.14471222460269928, "learning_rate": 1e-05, "loss": 0.0165, "step": 1231300 }, { "epoch": 0.012314, "grad_norm": 0.1648627668619156, "learning_rate": 1e-05, "loss": 0.0168, "step": 1231400 }, { "epoch": 0.012315, "grad_norm": 0.1494661420583725, "learning_rate": 1e-05, "loss": 0.0164, "step": 1231500 }, { "epoch": 0.012316, "grad_norm": 0.11907735466957092, "learning_rate": 1e-05, "loss": 0.0166, "step": 1231600 }, { "epoch": 0.012317, "grad_norm": 0.12536384165287018, "learning_rate": 1e-05, "loss": 0.0162, "step": 1231700 }, { "epoch": 0.012318, "grad_norm": 0.1474645435810089, "learning_rate": 1e-05, "loss": 0.0165, "step": 1231800 }, { "epoch": 0.012319, "grad_norm": 0.13377995789051056, "learning_rate": 1e-05, "loss": 0.0164, "step": 1231900 }, { "epoch": 0.01232, "grad_norm": 0.16241899132728577, "learning_rate": 1e-05, "loss": 0.016, "step": 1232000 }, { "epoch": 0.012321, "grad_norm": 0.0989866852760315, "learning_rate": 1e-05, "loss": 0.0171, "step": 1232100 }, { "epoch": 0.012322, "grad_norm": 0.12718310952186584, "learning_rate": 1e-05, "loss": 0.0167, "step": 1232200 }, { "epoch": 0.012323, "grad_norm": 0.19655458629131317, "learning_rate": 1e-05, "loss": 0.0166, "step": 1232300 }, { "epoch": 0.012324, "grad_norm": 0.13128285109996796, "learning_rate": 1e-05, "loss": 0.0166, "step": 1232400 }, { "epoch": 0.012325, "grad_norm": 0.12376218289136887, "learning_rate": 1e-05, "loss": 0.0168, "step": 1232500 }, { "epoch": 0.012326, "grad_norm": 0.10098963230848312, "learning_rate": 1e-05, "loss": 0.0167, "step": 1232600 }, { "epoch": 0.012327, "grad_norm": 0.12275123596191406, "learning_rate": 1e-05, "loss": 0.0172, "step": 1232700 }, { "epoch": 0.012328, "grad_norm": 0.16591057181358337, "learning_rate": 1e-05, "loss": 0.0168, "step": 1232800 }, { "epoch": 0.012329, "grad_norm": 0.11115192621946335, "learning_rate": 1e-05, "loss": 0.0169, "step": 1232900 }, { "epoch": 0.01233, "grad_norm": 0.13797442615032196, "learning_rate": 1e-05, "loss": 0.0163, "step": 1233000 }, { "epoch": 0.012331, "grad_norm": 0.1417749971151352, "learning_rate": 1e-05, "loss": 0.0158, "step": 1233100 }, { "epoch": 0.012332, "grad_norm": 0.11550233513116837, "learning_rate": 1e-05, "loss": 0.0164, "step": 1233200 }, { "epoch": 0.012333, "grad_norm": 0.13046440482139587, "learning_rate": 1e-05, "loss": 0.0169, "step": 1233300 }, { "epoch": 0.012334, "grad_norm": 0.1302209347486496, "learning_rate": 1e-05, "loss": 0.0161, "step": 1233400 }, { "epoch": 0.012335, "grad_norm": 0.11380504816770554, "learning_rate": 1e-05, "loss": 0.0167, "step": 1233500 }, { "epoch": 0.012336, "grad_norm": 0.1238497942686081, "learning_rate": 1e-05, "loss": 0.0165, "step": 1233600 }, { "epoch": 0.012337, "grad_norm": 0.18346305191516876, "learning_rate": 1e-05, "loss": 0.0166, "step": 1233700 }, { "epoch": 0.012338, "grad_norm": 0.11995235830545425, "learning_rate": 1e-05, "loss": 0.0166, "step": 1233800 }, { "epoch": 0.012339, "grad_norm": 0.11447805911302567, "learning_rate": 1e-05, "loss": 0.0169, "step": 1233900 }, { "epoch": 0.01234, "grad_norm": 0.13228663802146912, "learning_rate": 1e-05, "loss": 0.0167, "step": 1234000 }, { "epoch": 0.012341, "grad_norm": 0.10951986163854599, "learning_rate": 1e-05, "loss": 0.0169, "step": 1234100 }, { "epoch": 0.012342, "grad_norm": 0.13936211168766022, "learning_rate": 1e-05, "loss": 0.0161, "step": 1234200 }, { "epoch": 0.012343, "grad_norm": 0.11027520149946213, "learning_rate": 1e-05, "loss": 0.0165, "step": 1234300 }, { "epoch": 0.012344, "grad_norm": 0.1396809220314026, "learning_rate": 1e-05, "loss": 0.0163, "step": 1234400 }, { "epoch": 0.012345, "grad_norm": 0.13186337053775787, "learning_rate": 1e-05, "loss": 0.0164, "step": 1234500 }, { "epoch": 0.012346, "grad_norm": 0.1175846979022026, "learning_rate": 1e-05, "loss": 0.0168, "step": 1234600 }, { "epoch": 0.012347, "grad_norm": 0.19504930078983307, "learning_rate": 1e-05, "loss": 0.0173, "step": 1234700 }, { "epoch": 0.012348, "grad_norm": 0.12241113185882568, "learning_rate": 1e-05, "loss": 0.0171, "step": 1234800 }, { "epoch": 0.012349, "grad_norm": 0.1614532172679901, "learning_rate": 1e-05, "loss": 0.0168, "step": 1234900 }, { "epoch": 0.01235, "grad_norm": 0.20117296278476715, "learning_rate": 1e-05, "loss": 0.0167, "step": 1235000 }, { "epoch": 0.012351, "grad_norm": 0.11918551474809647, "learning_rate": 1e-05, "loss": 0.0166, "step": 1235100 }, { "epoch": 0.012352, "grad_norm": 0.12038086354732513, "learning_rate": 1e-05, "loss": 0.0166, "step": 1235200 }, { "epoch": 0.012353, "grad_norm": 0.11901906132698059, "learning_rate": 1e-05, "loss": 0.017, "step": 1235300 }, { "epoch": 0.012354, "grad_norm": 0.1572829782962799, "learning_rate": 1e-05, "loss": 0.0168, "step": 1235400 }, { "epoch": 0.012355, "grad_norm": 0.18314234912395477, "learning_rate": 1e-05, "loss": 0.0166, "step": 1235500 }, { "epoch": 0.012356, "grad_norm": 0.1285591870546341, "learning_rate": 1e-05, "loss": 0.0164, "step": 1235600 }, { "epoch": 0.012357, "grad_norm": 0.12158698588609695, "learning_rate": 1e-05, "loss": 0.0168, "step": 1235700 }, { "epoch": 0.012358, "grad_norm": 0.08883389830589294, "learning_rate": 1e-05, "loss": 0.0169, "step": 1235800 }, { "epoch": 0.012359, "grad_norm": 0.1338142454624176, "learning_rate": 1e-05, "loss": 0.0163, "step": 1235900 }, { "epoch": 0.01236, "grad_norm": 0.16686303913593292, "learning_rate": 1e-05, "loss": 0.0167, "step": 1236000 }, { "epoch": 0.012361, "grad_norm": 0.12159892916679382, "learning_rate": 1e-05, "loss": 0.0163, "step": 1236100 }, { "epoch": 0.012362, "grad_norm": 0.14744290709495544, "learning_rate": 1e-05, "loss": 0.017, "step": 1236200 }, { "epoch": 0.012363, "grad_norm": 0.11039773374795914, "learning_rate": 1e-05, "loss": 0.017, "step": 1236300 }, { "epoch": 0.012364, "grad_norm": 0.14395549893379211, "learning_rate": 1e-05, "loss": 0.0166, "step": 1236400 }, { "epoch": 0.012365, "grad_norm": 0.13654349744319916, "learning_rate": 1e-05, "loss": 0.0165, "step": 1236500 }, { "epoch": 0.012366, "grad_norm": 0.14565600454807281, "learning_rate": 1e-05, "loss": 0.0166, "step": 1236600 }, { "epoch": 0.012367, "grad_norm": 0.11654715240001678, "learning_rate": 1e-05, "loss": 0.0168, "step": 1236700 }, { "epoch": 0.012368, "grad_norm": 0.13279128074645996, "learning_rate": 1e-05, "loss": 0.0167, "step": 1236800 }, { "epoch": 0.012369, "grad_norm": 0.19284243881702423, "learning_rate": 1e-05, "loss": 0.0163, "step": 1236900 }, { "epoch": 0.01237, "grad_norm": 0.1207505613565445, "learning_rate": 1e-05, "loss": 0.0167, "step": 1237000 }, { "epoch": 0.012371, "grad_norm": 0.115017831325531, "learning_rate": 1e-05, "loss": 0.0162, "step": 1237100 }, { "epoch": 0.012372, "grad_norm": 0.14043469727039337, "learning_rate": 1e-05, "loss": 0.0167, "step": 1237200 }, { "epoch": 0.012373, "grad_norm": 0.10434022545814514, "learning_rate": 1e-05, "loss": 0.0167, "step": 1237300 }, { "epoch": 0.012374, "grad_norm": 0.11916569620370865, "learning_rate": 1e-05, "loss": 0.0165, "step": 1237400 }, { "epoch": 0.012375, "grad_norm": 0.1330840140581131, "learning_rate": 1e-05, "loss": 0.0165, "step": 1237500 }, { "epoch": 0.012376, "grad_norm": 0.1317138671875, "learning_rate": 1e-05, "loss": 0.0169, "step": 1237600 }, { "epoch": 0.012377, "grad_norm": 0.1358848214149475, "learning_rate": 1e-05, "loss": 0.0169, "step": 1237700 }, { "epoch": 0.012378, "grad_norm": 0.12448550015687943, "learning_rate": 1e-05, "loss": 0.0162, "step": 1237800 }, { "epoch": 0.012379, "grad_norm": 0.1880471408367157, "learning_rate": 1e-05, "loss": 0.0167, "step": 1237900 }, { "epoch": 0.01238, "grad_norm": 0.09032108634710312, "learning_rate": 1e-05, "loss": 0.0168, "step": 1238000 }, { "epoch": 0.012381, "grad_norm": 0.19819965958595276, "learning_rate": 1e-05, "loss": 0.017, "step": 1238100 }, { "epoch": 0.012382, "grad_norm": 0.10203223675489426, "learning_rate": 1e-05, "loss": 0.0167, "step": 1238200 }, { "epoch": 0.012383, "grad_norm": 0.1924496293067932, "learning_rate": 1e-05, "loss": 0.0165, "step": 1238300 }, { "epoch": 0.012384, "grad_norm": 0.1215878501534462, "learning_rate": 1e-05, "loss": 0.0166, "step": 1238400 }, { "epoch": 0.012385, "grad_norm": 0.1363038420677185, "learning_rate": 1e-05, "loss": 0.0169, "step": 1238500 }, { "epoch": 0.012386, "grad_norm": 0.12314099073410034, "learning_rate": 1e-05, "loss": 0.0162, "step": 1238600 }, { "epoch": 0.012387, "grad_norm": 0.15161482989788055, "learning_rate": 1e-05, "loss": 0.0165, "step": 1238700 }, { "epoch": 0.012388, "grad_norm": 0.095028355717659, "learning_rate": 1e-05, "loss": 0.0164, "step": 1238800 }, { "epoch": 0.012389, "grad_norm": 0.12796463072299957, "learning_rate": 1e-05, "loss": 0.0166, "step": 1238900 }, { "epoch": 0.01239, "grad_norm": 0.10506271570920944, "learning_rate": 1e-05, "loss": 0.0167, "step": 1239000 }, { "epoch": 0.012391, "grad_norm": 0.2306322306394577, "learning_rate": 1e-05, "loss": 0.0164, "step": 1239100 }, { "epoch": 0.012392, "grad_norm": 0.13388904929161072, "learning_rate": 1e-05, "loss": 0.0164, "step": 1239200 }, { "epoch": 0.012393, "grad_norm": 0.09605563431978226, "learning_rate": 1e-05, "loss": 0.0161, "step": 1239300 }, { "epoch": 0.012394, "grad_norm": 0.13794666528701782, "learning_rate": 1e-05, "loss": 0.0167, "step": 1239400 }, { "epoch": 0.012395, "grad_norm": 0.16840751469135284, "learning_rate": 1e-05, "loss": 0.0169, "step": 1239500 }, { "epoch": 0.012396, "grad_norm": 0.10537184774875641, "learning_rate": 1e-05, "loss": 0.0163, "step": 1239600 }, { "epoch": 0.012397, "grad_norm": 0.1970064640045166, "learning_rate": 1e-05, "loss": 0.0168, "step": 1239700 }, { "epoch": 0.012398, "grad_norm": 0.14761604368686676, "learning_rate": 1e-05, "loss": 0.0166, "step": 1239800 }, { "epoch": 0.012399, "grad_norm": 0.14687612652778625, "learning_rate": 1e-05, "loss": 0.0164, "step": 1239900 }, { "epoch": 0.0124, "grad_norm": 0.13348975777626038, "learning_rate": 1e-05, "loss": 0.0167, "step": 1240000 }, { "epoch": 0.0124, "eval_loss": 0.014523262158036232, "eval_runtime": 189.6463, "eval_samples_per_second": 263.649, "eval_steps_per_second": 16.478, "step": 1240000 }, { "epoch": 0.012401, "grad_norm": 0.11506626009941101, "learning_rate": 1e-05, "loss": 0.0165, "step": 1240100 }, { "epoch": 0.012402, "grad_norm": 0.09205177426338196, "learning_rate": 1e-05, "loss": 0.0164, "step": 1240200 }, { "epoch": 0.012403, "grad_norm": 0.141825869679451, "learning_rate": 1e-05, "loss": 0.0164, "step": 1240300 }, { "epoch": 0.012404, "grad_norm": 0.1145758181810379, "learning_rate": 1e-05, "loss": 0.0169, "step": 1240400 }, { "epoch": 0.012405, "grad_norm": 0.10758596658706665, "learning_rate": 1e-05, "loss": 0.0165, "step": 1240500 }, { "epoch": 0.012406, "grad_norm": 0.12599295377731323, "learning_rate": 1e-05, "loss": 0.0163, "step": 1240600 }, { "epoch": 0.012407, "grad_norm": 0.12615816295146942, "learning_rate": 1e-05, "loss": 0.0166, "step": 1240700 }, { "epoch": 0.012408, "grad_norm": 0.11070822924375534, "learning_rate": 1e-05, "loss": 0.0163, "step": 1240800 }, { "epoch": 0.012409, "grad_norm": 0.12564237415790558, "learning_rate": 1e-05, "loss": 0.0166, "step": 1240900 }, { "epoch": 0.01241, "grad_norm": 0.10397359728813171, "learning_rate": 1e-05, "loss": 0.017, "step": 1241000 }, { "epoch": 0.012411, "grad_norm": 0.15991340577602386, "learning_rate": 1e-05, "loss": 0.0166, "step": 1241100 }, { "epoch": 0.012412, "grad_norm": 0.12276248633861542, "learning_rate": 1e-05, "loss": 0.0161, "step": 1241200 }, { "epoch": 0.012413, "grad_norm": 0.12253876030445099, "learning_rate": 1e-05, "loss": 0.0166, "step": 1241300 }, { "epoch": 0.012414, "grad_norm": 0.1079544797539711, "learning_rate": 1e-05, "loss": 0.0167, "step": 1241400 }, { "epoch": 0.012415, "grad_norm": 0.12230636179447174, "learning_rate": 1e-05, "loss": 0.0165, "step": 1241500 }, { "epoch": 0.012416, "grad_norm": 0.17437607049942017, "learning_rate": 1e-05, "loss": 0.0167, "step": 1241600 }, { "epoch": 0.012417, "grad_norm": 0.10562512278556824, "learning_rate": 1e-05, "loss": 0.0163, "step": 1241700 }, { "epoch": 0.012418, "grad_norm": 0.14303116500377655, "learning_rate": 1e-05, "loss": 0.0164, "step": 1241800 }, { "epoch": 0.012419, "grad_norm": 0.12797170877456665, "learning_rate": 1e-05, "loss": 0.0163, "step": 1241900 }, { "epoch": 0.01242, "grad_norm": 0.11014465987682343, "learning_rate": 1e-05, "loss": 0.0166, "step": 1242000 }, { "epoch": 0.012421, "grad_norm": 0.12182492762804031, "learning_rate": 1e-05, "loss": 0.0168, "step": 1242100 }, { "epoch": 0.012422, "grad_norm": 0.1491089165210724, "learning_rate": 1e-05, "loss": 0.0164, "step": 1242200 }, { "epoch": 0.012423, "grad_norm": 0.1741204559803009, "learning_rate": 1e-05, "loss": 0.0165, "step": 1242300 }, { "epoch": 0.012424, "grad_norm": 0.12600788474082947, "learning_rate": 1e-05, "loss": 0.0165, "step": 1242400 }, { "epoch": 0.012425, "grad_norm": 0.13734714686870575, "learning_rate": 1e-05, "loss": 0.0167, "step": 1242500 }, { "epoch": 0.012426, "grad_norm": 0.1271124631166458, "learning_rate": 1e-05, "loss": 0.0165, "step": 1242600 }, { "epoch": 0.012427, "grad_norm": 0.11425124853849411, "learning_rate": 1e-05, "loss": 0.0166, "step": 1242700 }, { "epoch": 0.012428, "grad_norm": 0.18891751766204834, "learning_rate": 1e-05, "loss": 0.0169, "step": 1242800 }, { "epoch": 0.012429, "grad_norm": 0.17940378189086914, "learning_rate": 1e-05, "loss": 0.0167, "step": 1242900 }, { "epoch": 0.01243, "grad_norm": 0.1284610629081726, "learning_rate": 1e-05, "loss": 0.016, "step": 1243000 }, { "epoch": 0.012431, "grad_norm": 0.12942473590373993, "learning_rate": 1e-05, "loss": 0.0168, "step": 1243100 }, { "epoch": 0.012432, "grad_norm": 0.10657636821269989, "learning_rate": 1e-05, "loss": 0.017, "step": 1243200 }, { "epoch": 0.012433, "grad_norm": 0.15827646851539612, "learning_rate": 1e-05, "loss": 0.0168, "step": 1243300 }, { "epoch": 0.012434, "grad_norm": 0.1643623411655426, "learning_rate": 1e-05, "loss": 0.0165, "step": 1243400 }, { "epoch": 0.012435, "grad_norm": 0.12943756580352783, "learning_rate": 1e-05, "loss": 0.0162, "step": 1243500 }, { "epoch": 0.012436, "grad_norm": 0.19000916182994843, "learning_rate": 1e-05, "loss": 0.0169, "step": 1243600 }, { "epoch": 0.012437, "grad_norm": 0.12277150899171829, "learning_rate": 1e-05, "loss": 0.0164, "step": 1243700 }, { "epoch": 0.012438, "grad_norm": 0.12796945869922638, "learning_rate": 1e-05, "loss": 0.0163, "step": 1243800 }, { "epoch": 0.012439, "grad_norm": 0.13296936452388763, "learning_rate": 1e-05, "loss": 0.0165, "step": 1243900 }, { "epoch": 0.01244, "grad_norm": 0.13980048894882202, "learning_rate": 1e-05, "loss": 0.0161, "step": 1244000 }, { "epoch": 0.012441, "grad_norm": 0.12041507661342621, "learning_rate": 1e-05, "loss": 0.0163, "step": 1244100 }, { "epoch": 0.012442, "grad_norm": 0.13339367508888245, "learning_rate": 1e-05, "loss": 0.0165, "step": 1244200 }, { "epoch": 0.012443, "grad_norm": 0.1116248220205307, "learning_rate": 1e-05, "loss": 0.0162, "step": 1244300 }, { "epoch": 0.012444, "grad_norm": 0.11375829577445984, "learning_rate": 1e-05, "loss": 0.0162, "step": 1244400 }, { "epoch": 0.012445, "grad_norm": 0.12176565080881119, "learning_rate": 1e-05, "loss": 0.0166, "step": 1244500 }, { "epoch": 0.012446, "grad_norm": 0.09583595395088196, "learning_rate": 1e-05, "loss": 0.0168, "step": 1244600 }, { "epoch": 0.012447, "grad_norm": 0.22117552161216736, "learning_rate": 1e-05, "loss": 0.0162, "step": 1244700 }, { "epoch": 0.012448, "grad_norm": 0.13085564970970154, "learning_rate": 1e-05, "loss": 0.0161, "step": 1244800 }, { "epoch": 0.012449, "grad_norm": 0.2277928739786148, "learning_rate": 1e-05, "loss": 0.0164, "step": 1244900 }, { "epoch": 0.01245, "grad_norm": 0.13232196867465973, "learning_rate": 1e-05, "loss": 0.0168, "step": 1245000 }, { "epoch": 0.012451, "grad_norm": 0.08810066431760788, "learning_rate": 1e-05, "loss": 0.0167, "step": 1245100 }, { "epoch": 0.012452, "grad_norm": 0.11394161731004715, "learning_rate": 1e-05, "loss": 0.0165, "step": 1245200 }, { "epoch": 0.012453, "grad_norm": 0.11532611399888992, "learning_rate": 1e-05, "loss": 0.0164, "step": 1245300 }, { "epoch": 0.012454, "grad_norm": 0.1418343335390091, "learning_rate": 1e-05, "loss": 0.0165, "step": 1245400 }, { "epoch": 0.012455, "grad_norm": 0.10672499984502792, "learning_rate": 1e-05, "loss": 0.0167, "step": 1245500 }, { "epoch": 0.012456, "grad_norm": 0.11554903537034988, "learning_rate": 1e-05, "loss": 0.017, "step": 1245600 }, { "epoch": 0.012457, "grad_norm": 0.12093757838010788, "learning_rate": 1e-05, "loss": 0.0167, "step": 1245700 }, { "epoch": 0.012458, "grad_norm": 0.10406356304883957, "learning_rate": 1e-05, "loss": 0.0165, "step": 1245800 }, { "epoch": 0.012459, "grad_norm": 0.10068190097808838, "learning_rate": 1e-05, "loss": 0.0165, "step": 1245900 }, { "epoch": 0.01246, "grad_norm": 0.12420829385519028, "learning_rate": 1e-05, "loss": 0.0165, "step": 1246000 }, { "epoch": 0.012461, "grad_norm": 0.14990325272083282, "learning_rate": 1e-05, "loss": 0.0164, "step": 1246100 }, { "epoch": 0.012462, "grad_norm": 0.12322288006544113, "learning_rate": 1e-05, "loss": 0.0169, "step": 1246200 }, { "epoch": 0.012463, "grad_norm": 0.10209585726261139, "learning_rate": 1e-05, "loss": 0.0168, "step": 1246300 }, { "epoch": 0.012464, "grad_norm": 0.10410961508750916, "learning_rate": 1e-05, "loss": 0.0165, "step": 1246400 }, { "epoch": 0.012465, "grad_norm": 0.1576634794473648, "learning_rate": 1e-05, "loss": 0.0168, "step": 1246500 }, { "epoch": 0.012466, "grad_norm": 0.13346081972122192, "learning_rate": 1e-05, "loss": 0.0164, "step": 1246600 }, { "epoch": 0.012467, "grad_norm": 0.11642003804445267, "learning_rate": 1e-05, "loss": 0.0167, "step": 1246700 }, { "epoch": 0.012468, "grad_norm": 0.1258249580860138, "learning_rate": 1e-05, "loss": 0.0164, "step": 1246800 }, { "epoch": 0.012469, "grad_norm": 0.1233980804681778, "learning_rate": 1e-05, "loss": 0.0165, "step": 1246900 }, { "epoch": 0.01247, "grad_norm": 0.15193039178848267, "learning_rate": 1e-05, "loss": 0.0163, "step": 1247000 }, { "epoch": 0.012471, "grad_norm": 0.08600785583257675, "learning_rate": 1e-05, "loss": 0.0163, "step": 1247100 }, { "epoch": 0.012472, "grad_norm": 0.10841194540262222, "learning_rate": 1e-05, "loss": 0.0162, "step": 1247200 }, { "epoch": 0.012473, "grad_norm": 0.12135295569896698, "learning_rate": 1e-05, "loss": 0.0167, "step": 1247300 }, { "epoch": 0.012474, "grad_norm": 0.1238352507352829, "learning_rate": 1e-05, "loss": 0.0167, "step": 1247400 }, { "epoch": 0.012475, "grad_norm": 0.13397163152694702, "learning_rate": 1e-05, "loss": 0.0165, "step": 1247500 }, { "epoch": 0.012476, "grad_norm": 0.2155490666627884, "learning_rate": 1e-05, "loss": 0.0167, "step": 1247600 }, { "epoch": 0.012477, "grad_norm": 0.10198556631803513, "learning_rate": 1e-05, "loss": 0.0165, "step": 1247700 }, { "epoch": 0.012478, "grad_norm": 0.13143834471702576, "learning_rate": 1e-05, "loss": 0.0165, "step": 1247800 }, { "epoch": 0.012479, "grad_norm": 0.130955770611763, "learning_rate": 1e-05, "loss": 0.0166, "step": 1247900 }, { "epoch": 0.01248, "grad_norm": 0.12364278733730316, "learning_rate": 1e-05, "loss": 0.016, "step": 1248000 }, { "epoch": 0.012481, "grad_norm": 0.14529214799404144, "learning_rate": 1e-05, "loss": 0.0163, "step": 1248100 }, { "epoch": 0.012482, "grad_norm": 0.15882118046283722, "learning_rate": 1e-05, "loss": 0.0166, "step": 1248200 }, { "epoch": 0.012483, "grad_norm": 0.09979769587516785, "learning_rate": 1e-05, "loss": 0.0164, "step": 1248300 }, { "epoch": 0.012484, "grad_norm": 0.1469925343990326, "learning_rate": 1e-05, "loss": 0.0164, "step": 1248400 }, { "epoch": 0.012485, "grad_norm": 0.0868758037686348, "learning_rate": 1e-05, "loss": 0.0163, "step": 1248500 }, { "epoch": 0.012486, "grad_norm": 0.16901414096355438, "learning_rate": 1e-05, "loss": 0.0167, "step": 1248600 }, { "epoch": 0.012487, "grad_norm": 0.11975516378879547, "learning_rate": 1e-05, "loss": 0.0165, "step": 1248700 }, { "epoch": 0.012488, "grad_norm": 0.1110275536775589, "learning_rate": 1e-05, "loss": 0.0161, "step": 1248800 }, { "epoch": 0.012489, "grad_norm": 0.15566644072532654, "learning_rate": 1e-05, "loss": 0.0168, "step": 1248900 }, { "epoch": 0.01249, "grad_norm": 0.1232369914650917, "learning_rate": 1e-05, "loss": 0.0166, "step": 1249000 }, { "epoch": 0.012491, "grad_norm": 0.14796103537082672, "learning_rate": 1e-05, "loss": 0.0166, "step": 1249100 }, { "epoch": 0.012492, "grad_norm": 0.1440306156873703, "learning_rate": 1e-05, "loss": 0.0166, "step": 1249200 }, { "epoch": 0.012493, "grad_norm": 0.10200408846139908, "learning_rate": 1e-05, "loss": 0.0166, "step": 1249300 }, { "epoch": 0.012494, "grad_norm": 0.17058628797531128, "learning_rate": 1e-05, "loss": 0.0172, "step": 1249400 }, { "epoch": 0.012495, "grad_norm": 0.13489463925361633, "learning_rate": 1e-05, "loss": 0.0168, "step": 1249500 }, { "epoch": 0.012496, "grad_norm": 0.11585408449172974, "learning_rate": 1e-05, "loss": 0.0165, "step": 1249600 }, { "epoch": 0.012497, "grad_norm": 0.12292391806840897, "learning_rate": 1e-05, "loss": 0.0162, "step": 1249700 }, { "epoch": 0.012498, "grad_norm": 0.13587398827075958, "learning_rate": 1e-05, "loss": 0.0166, "step": 1249800 }, { "epoch": 0.012499, "grad_norm": 0.09529566764831543, "learning_rate": 1e-05, "loss": 0.0165, "step": 1249900 }, { "epoch": 0.0125, "grad_norm": 0.11829447746276855, "learning_rate": 1e-05, "loss": 0.0163, "step": 1250000 }, { "epoch": 0.012501, "grad_norm": 0.11091116815805435, "learning_rate": 1e-05, "loss": 0.0169, "step": 1250100 }, { "epoch": 0.012502, "grad_norm": 0.1126776933670044, "learning_rate": 1e-05, "loss": 0.0165, "step": 1250200 }, { "epoch": 0.012503, "grad_norm": 0.10741473734378815, "learning_rate": 1e-05, "loss": 0.0163, "step": 1250300 }, { "epoch": 0.012504, "grad_norm": 0.10573921352624893, "learning_rate": 1e-05, "loss": 0.0162, "step": 1250400 }, { "epoch": 0.012505, "grad_norm": 0.1399768888950348, "learning_rate": 1e-05, "loss": 0.0163, "step": 1250500 }, { "epoch": 0.012506, "grad_norm": 0.16256795823574066, "learning_rate": 1e-05, "loss": 0.0166, "step": 1250600 }, { "epoch": 0.012507, "grad_norm": 0.0963299572467804, "learning_rate": 1e-05, "loss": 0.0164, "step": 1250700 }, { "epoch": 0.012508, "grad_norm": 0.09898238629102707, "learning_rate": 1e-05, "loss": 0.0162, "step": 1250800 }, { "epoch": 0.012509, "grad_norm": 0.12978221476078033, "learning_rate": 1e-05, "loss": 0.0166, "step": 1250900 }, { "epoch": 0.01251, "grad_norm": 0.18606151640415192, "learning_rate": 1e-05, "loss": 0.0169, "step": 1251000 }, { "epoch": 0.012511, "grad_norm": 0.14451420307159424, "learning_rate": 1e-05, "loss": 0.0165, "step": 1251100 }, { "epoch": 0.012512, "grad_norm": 0.15227988362312317, "learning_rate": 1e-05, "loss": 0.0165, "step": 1251200 }, { "epoch": 0.012513, "grad_norm": 0.12299990653991699, "learning_rate": 1e-05, "loss": 0.0166, "step": 1251300 }, { "epoch": 0.012514, "grad_norm": 0.10332431644201279, "learning_rate": 1e-05, "loss": 0.0165, "step": 1251400 }, { "epoch": 0.012515, "grad_norm": 0.18304267525672913, "learning_rate": 1e-05, "loss": 0.0164, "step": 1251500 }, { "epoch": 0.012516, "grad_norm": 0.21894334256649017, "learning_rate": 1e-05, "loss": 0.0165, "step": 1251600 }, { "epoch": 0.012517, "grad_norm": 0.13379599153995514, "learning_rate": 1e-05, "loss": 0.0166, "step": 1251700 }, { "epoch": 0.012518, "grad_norm": 0.090218186378479, "learning_rate": 1e-05, "loss": 0.0158, "step": 1251800 }, { "epoch": 0.012519, "grad_norm": 0.1381940394639969, "learning_rate": 1e-05, "loss": 0.0163, "step": 1251900 }, { "epoch": 0.01252, "grad_norm": 0.13274773955345154, "learning_rate": 1e-05, "loss": 0.0167, "step": 1252000 }, { "epoch": 0.012521, "grad_norm": 0.18412810564041138, "learning_rate": 1e-05, "loss": 0.0163, "step": 1252100 }, { "epoch": 0.012522, "grad_norm": 0.10222122073173523, "learning_rate": 1e-05, "loss": 0.0168, "step": 1252200 }, { "epoch": 0.012523, "grad_norm": 0.12198159098625183, "learning_rate": 1e-05, "loss": 0.0165, "step": 1252300 }, { "epoch": 0.012524, "grad_norm": 0.12277067452669144, "learning_rate": 1e-05, "loss": 0.0168, "step": 1252400 }, { "epoch": 0.012525, "grad_norm": 0.1401042342185974, "learning_rate": 1e-05, "loss": 0.0167, "step": 1252500 }, { "epoch": 0.012526, "grad_norm": 0.1271807998418808, "learning_rate": 1e-05, "loss": 0.0163, "step": 1252600 }, { "epoch": 0.012527, "grad_norm": 0.15288963913917542, "learning_rate": 1e-05, "loss": 0.0162, "step": 1252700 }, { "epoch": 0.012528, "grad_norm": 0.10727456957101822, "learning_rate": 1e-05, "loss": 0.0164, "step": 1252800 }, { "epoch": 0.012529, "grad_norm": 0.10968563705682755, "learning_rate": 1e-05, "loss": 0.0165, "step": 1252900 }, { "epoch": 0.01253, "grad_norm": 0.1279468685388565, "learning_rate": 1e-05, "loss": 0.0165, "step": 1253000 }, { "epoch": 0.012531, "grad_norm": 0.13508926331996918, "learning_rate": 1e-05, "loss": 0.0162, "step": 1253100 }, { "epoch": 0.012532, "grad_norm": 0.09541157633066177, "learning_rate": 1e-05, "loss": 0.0168, "step": 1253200 }, { "epoch": 0.012533, "grad_norm": 0.1670912504196167, "learning_rate": 1e-05, "loss": 0.0165, "step": 1253300 }, { "epoch": 0.012534, "grad_norm": 0.1405564844608307, "learning_rate": 1e-05, "loss": 0.0166, "step": 1253400 }, { "epoch": 0.012535, "grad_norm": 0.1297534704208374, "learning_rate": 1e-05, "loss": 0.0166, "step": 1253500 }, { "epoch": 0.012536, "grad_norm": 0.1774311661720276, "learning_rate": 1e-05, "loss": 0.0165, "step": 1253600 }, { "epoch": 0.012537, "grad_norm": 0.16138072311878204, "learning_rate": 1e-05, "loss": 0.0168, "step": 1253700 }, { "epoch": 0.012538, "grad_norm": 0.11952675133943558, "learning_rate": 1e-05, "loss": 0.0162, "step": 1253800 }, { "epoch": 0.012539, "grad_norm": 0.11117417365312576, "learning_rate": 1e-05, "loss": 0.0168, "step": 1253900 }, { "epoch": 0.01254, "grad_norm": 0.15718917548656464, "learning_rate": 1e-05, "loss": 0.0165, "step": 1254000 }, { "epoch": 0.012541, "grad_norm": 0.1263897567987442, "learning_rate": 1e-05, "loss": 0.0166, "step": 1254100 }, { "epoch": 0.012542, "grad_norm": 0.1539185643196106, "learning_rate": 1e-05, "loss": 0.0165, "step": 1254200 }, { "epoch": 0.012543, "grad_norm": 0.10632919520139694, "learning_rate": 1e-05, "loss": 0.0165, "step": 1254300 }, { "epoch": 0.012544, "grad_norm": 0.13403496146202087, "learning_rate": 1e-05, "loss": 0.0167, "step": 1254400 }, { "epoch": 0.012545, "grad_norm": 0.249542698264122, "learning_rate": 1e-05, "loss": 0.0165, "step": 1254500 }, { "epoch": 0.012546, "grad_norm": 0.09500754624605179, "learning_rate": 1e-05, "loss": 0.0166, "step": 1254600 }, { "epoch": 0.012547, "grad_norm": 0.19805824756622314, "learning_rate": 1e-05, "loss": 0.016, "step": 1254700 }, { "epoch": 0.012548, "grad_norm": 0.11522325873374939, "learning_rate": 1e-05, "loss": 0.0162, "step": 1254800 }, { "epoch": 0.012549, "grad_norm": 0.1136920154094696, "learning_rate": 1e-05, "loss": 0.0169, "step": 1254900 }, { "epoch": 0.01255, "grad_norm": 0.19888010621070862, "learning_rate": 1e-05, "loss": 0.0162, "step": 1255000 }, { "epoch": 0.012551, "grad_norm": 0.12935498356819153, "learning_rate": 1e-05, "loss": 0.0167, "step": 1255100 }, { "epoch": 0.012552, "grad_norm": 0.10947629064321518, "learning_rate": 1e-05, "loss": 0.0167, "step": 1255200 }, { "epoch": 0.012553, "grad_norm": 0.1429247111082077, "learning_rate": 1e-05, "loss": 0.0168, "step": 1255300 }, { "epoch": 0.012554, "grad_norm": 0.14652542769908905, "learning_rate": 1e-05, "loss": 0.0166, "step": 1255400 }, { "epoch": 0.012555, "grad_norm": 0.11002659797668457, "learning_rate": 1e-05, "loss": 0.0166, "step": 1255500 }, { "epoch": 0.012556, "grad_norm": 0.08643496036529541, "learning_rate": 1e-05, "loss": 0.0168, "step": 1255600 }, { "epoch": 0.012557, "grad_norm": 0.12196072936058044, "learning_rate": 1e-05, "loss": 0.0162, "step": 1255700 }, { "epoch": 0.012558, "grad_norm": 0.1337079107761383, "learning_rate": 1e-05, "loss": 0.0162, "step": 1255800 }, { "epoch": 0.012559, "grad_norm": 0.13538403809070587, "learning_rate": 1e-05, "loss": 0.0165, "step": 1255900 }, { "epoch": 0.01256, "grad_norm": 0.10448336601257324, "learning_rate": 1e-05, "loss": 0.0163, "step": 1256000 }, { "epoch": 0.012561, "grad_norm": 0.14320601522922516, "learning_rate": 1e-05, "loss": 0.0162, "step": 1256100 }, { "epoch": 0.012562, "grad_norm": 0.11514865607023239, "learning_rate": 1e-05, "loss": 0.0163, "step": 1256200 }, { "epoch": 0.012563, "grad_norm": 0.1765168458223343, "learning_rate": 1e-05, "loss": 0.0159, "step": 1256300 }, { "epoch": 0.012564, "grad_norm": 0.12095076590776443, "learning_rate": 1e-05, "loss": 0.0166, "step": 1256400 }, { "epoch": 0.012565, "grad_norm": 0.16097933053970337, "learning_rate": 1e-05, "loss": 0.0163, "step": 1256500 }, { "epoch": 0.012566, "grad_norm": 0.12425470352172852, "learning_rate": 1e-05, "loss": 0.0166, "step": 1256600 }, { "epoch": 0.012567, "grad_norm": 0.1599469631910324, "learning_rate": 1e-05, "loss": 0.0167, "step": 1256700 }, { "epoch": 0.012568, "grad_norm": 0.0903673991560936, "learning_rate": 1e-05, "loss": 0.0162, "step": 1256800 }, { "epoch": 0.012569, "grad_norm": 0.23137085139751434, "learning_rate": 1e-05, "loss": 0.0164, "step": 1256900 }, { "epoch": 0.01257, "grad_norm": 0.09946409612894058, "learning_rate": 1e-05, "loss": 0.0169, "step": 1257000 }, { "epoch": 0.012571, "grad_norm": 0.1388683319091797, "learning_rate": 1e-05, "loss": 0.0163, "step": 1257100 }, { "epoch": 0.012572, "grad_norm": 0.0916399359703064, "learning_rate": 1e-05, "loss": 0.0165, "step": 1257200 }, { "epoch": 0.012573, "grad_norm": 0.10805005580186844, "learning_rate": 1e-05, "loss": 0.016, "step": 1257300 }, { "epoch": 0.012574, "grad_norm": 0.17532192170619965, "learning_rate": 1e-05, "loss": 0.0165, "step": 1257400 }, { "epoch": 0.012575, "grad_norm": 0.16807642579078674, "learning_rate": 1e-05, "loss": 0.0165, "step": 1257500 }, { "epoch": 0.012576, "grad_norm": 0.12769322097301483, "learning_rate": 1e-05, "loss": 0.0166, "step": 1257600 }, { "epoch": 0.012577, "grad_norm": 0.16755084693431854, "learning_rate": 1e-05, "loss": 0.0167, "step": 1257700 }, { "epoch": 0.012578, "grad_norm": 0.1076383888721466, "learning_rate": 1e-05, "loss": 0.017, "step": 1257800 }, { "epoch": 0.012579, "grad_norm": 0.14116574823856354, "learning_rate": 1e-05, "loss": 0.0167, "step": 1257900 }, { "epoch": 0.01258, "grad_norm": 0.16044096648693085, "learning_rate": 1e-05, "loss": 0.0167, "step": 1258000 }, { "epoch": 0.012581, "grad_norm": 0.1001429557800293, "learning_rate": 1e-05, "loss": 0.0168, "step": 1258100 }, { "epoch": 0.012582, "grad_norm": 0.12777270376682281, "learning_rate": 1e-05, "loss": 0.0163, "step": 1258200 }, { "epoch": 0.012583, "grad_norm": 0.14900930225849152, "learning_rate": 1e-05, "loss": 0.0162, "step": 1258300 }, { "epoch": 0.012584, "grad_norm": 0.11062465608119965, "learning_rate": 1e-05, "loss": 0.0165, "step": 1258400 }, { "epoch": 0.012585, "grad_norm": 0.12648256123065948, "learning_rate": 1e-05, "loss": 0.0161, "step": 1258500 }, { "epoch": 0.012586, "grad_norm": 0.12146135419607162, "learning_rate": 1e-05, "loss": 0.0169, "step": 1258600 }, { "epoch": 0.012587, "grad_norm": 0.09782417118549347, "learning_rate": 1e-05, "loss": 0.0162, "step": 1258700 }, { "epoch": 0.012588, "grad_norm": 0.1972355991601944, "learning_rate": 1e-05, "loss": 0.0163, "step": 1258800 }, { "epoch": 0.012589, "grad_norm": 0.19113993644714355, "learning_rate": 1e-05, "loss": 0.0166, "step": 1258900 }, { "epoch": 0.01259, "grad_norm": 0.13392864167690277, "learning_rate": 1e-05, "loss": 0.0161, "step": 1259000 }, { "epoch": 0.012591, "grad_norm": 0.19226406514644623, "learning_rate": 1e-05, "loss": 0.0162, "step": 1259100 }, { "epoch": 0.012592, "grad_norm": 0.11134419590234756, "learning_rate": 1e-05, "loss": 0.0162, "step": 1259200 }, { "epoch": 0.012593, "grad_norm": 0.12573546171188354, "learning_rate": 1e-05, "loss": 0.0158, "step": 1259300 }, { "epoch": 0.012594, "grad_norm": 0.2822321057319641, "learning_rate": 1e-05, "loss": 0.0161, "step": 1259400 }, { "epoch": 0.012595, "grad_norm": 0.1534976214170456, "learning_rate": 1e-05, "loss": 0.0165, "step": 1259500 }, { "epoch": 0.012596, "grad_norm": 0.1439906805753708, "learning_rate": 1e-05, "loss": 0.0165, "step": 1259600 }, { "epoch": 0.012597, "grad_norm": 0.12205192446708679, "learning_rate": 1e-05, "loss": 0.0166, "step": 1259700 }, { "epoch": 0.012598, "grad_norm": 0.11524111777544022, "learning_rate": 1e-05, "loss": 0.0163, "step": 1259800 }, { "epoch": 0.012599, "grad_norm": 0.1125972718000412, "learning_rate": 1e-05, "loss": 0.0162, "step": 1259900 }, { "epoch": 0.0126, "grad_norm": 0.15061809122562408, "learning_rate": 1e-05, "loss": 0.0167, "step": 1260000 }, { "epoch": 0.0126, "eval_loss": 0.014501271769404411, "eval_runtime": 174.6508, "eval_samples_per_second": 286.286, "eval_steps_per_second": 17.893, "step": 1260000 }, { "epoch": 0.012601, "grad_norm": 0.141098290681839, "learning_rate": 1e-05, "loss": 0.0167, "step": 1260100 }, { "epoch": 0.012602, "grad_norm": 0.1383398473262787, "learning_rate": 1e-05, "loss": 0.0166, "step": 1260200 }, { "epoch": 0.012603, "grad_norm": 0.1404508501291275, "learning_rate": 1e-05, "loss": 0.0163, "step": 1260300 }, { "epoch": 0.012604, "grad_norm": 0.09119415283203125, "learning_rate": 1e-05, "loss": 0.0168, "step": 1260400 }, { "epoch": 0.012605, "grad_norm": 0.14882010221481323, "learning_rate": 1e-05, "loss": 0.0167, "step": 1260500 }, { "epoch": 0.012606, "grad_norm": 0.12939228117465973, "learning_rate": 1e-05, "loss": 0.0166, "step": 1260600 }, { "epoch": 0.012607, "grad_norm": 0.12864862382411957, "learning_rate": 1e-05, "loss": 0.0165, "step": 1260700 }, { "epoch": 0.012608, "grad_norm": 0.10995795577764511, "learning_rate": 1e-05, "loss": 0.0168, "step": 1260800 }, { "epoch": 0.012609, "grad_norm": 0.1490667760372162, "learning_rate": 1e-05, "loss": 0.0163, "step": 1260900 }, { "epoch": 0.01261, "grad_norm": 0.13481858372688293, "learning_rate": 1e-05, "loss": 0.0163, "step": 1261000 }, { "epoch": 0.012611, "grad_norm": 0.12303011864423752, "learning_rate": 1e-05, "loss": 0.0163, "step": 1261100 }, { "epoch": 0.012612, "grad_norm": 0.15561160445213318, "learning_rate": 1e-05, "loss": 0.0161, "step": 1261200 }, { "epoch": 0.012613, "grad_norm": 0.14266391098499298, "learning_rate": 1e-05, "loss": 0.016, "step": 1261300 }, { "epoch": 0.012614, "grad_norm": 0.19079117476940155, "learning_rate": 1e-05, "loss": 0.0164, "step": 1261400 }, { "epoch": 0.012615, "grad_norm": 0.1270839273929596, "learning_rate": 1e-05, "loss": 0.0163, "step": 1261500 }, { "epoch": 0.012616, "grad_norm": 0.09790100157260895, "learning_rate": 1e-05, "loss": 0.0164, "step": 1261600 }, { "epoch": 0.012617, "grad_norm": 0.1192532479763031, "learning_rate": 1e-05, "loss": 0.0165, "step": 1261700 }, { "epoch": 0.012618, "grad_norm": 0.11597774922847748, "learning_rate": 1e-05, "loss": 0.0171, "step": 1261800 }, { "epoch": 0.012619, "grad_norm": 0.15459024906158447, "learning_rate": 1e-05, "loss": 0.0164, "step": 1261900 }, { "epoch": 0.01262, "grad_norm": 0.14245162904262543, "learning_rate": 1e-05, "loss": 0.0165, "step": 1262000 }, { "epoch": 0.012621, "grad_norm": 0.09699998050928116, "learning_rate": 1e-05, "loss": 0.0163, "step": 1262100 }, { "epoch": 0.012622, "grad_norm": 0.12698477506637573, "learning_rate": 1e-05, "loss": 0.0166, "step": 1262200 }, { "epoch": 0.012623, "grad_norm": 0.10814773291349411, "learning_rate": 1e-05, "loss": 0.0166, "step": 1262300 }, { "epoch": 0.012624, "grad_norm": 0.18440139293670654, "learning_rate": 1e-05, "loss": 0.016, "step": 1262400 }, { "epoch": 0.012625, "grad_norm": 0.13310649991035461, "learning_rate": 1e-05, "loss": 0.0166, "step": 1262500 }, { "epoch": 0.012626, "grad_norm": 0.11620922386646271, "learning_rate": 1e-05, "loss": 0.0162, "step": 1262600 }, { "epoch": 0.012627, "grad_norm": 0.10914286971092224, "learning_rate": 1e-05, "loss": 0.0166, "step": 1262700 }, { "epoch": 0.012628, "grad_norm": 0.14889417588710785, "learning_rate": 1e-05, "loss": 0.0167, "step": 1262800 }, { "epoch": 0.012629, "grad_norm": 0.10285869240760803, "learning_rate": 1e-05, "loss": 0.0161, "step": 1262900 }, { "epoch": 0.01263, "grad_norm": 0.11551417410373688, "learning_rate": 1e-05, "loss": 0.0169, "step": 1263000 }, { "epoch": 0.012631, "grad_norm": 0.12132091075181961, "learning_rate": 1e-05, "loss": 0.0163, "step": 1263100 }, { "epoch": 0.012632, "grad_norm": 0.12118510156869888, "learning_rate": 1e-05, "loss": 0.0167, "step": 1263200 }, { "epoch": 0.012633, "grad_norm": 0.14508026838302612, "learning_rate": 1e-05, "loss": 0.0168, "step": 1263300 }, { "epoch": 0.012634, "grad_norm": 0.15503649413585663, "learning_rate": 1e-05, "loss": 0.0164, "step": 1263400 }, { "epoch": 0.012635, "grad_norm": 0.10695096850395203, "learning_rate": 1e-05, "loss": 0.0159, "step": 1263500 }, { "epoch": 0.012636, "grad_norm": 0.09882692247629166, "learning_rate": 1e-05, "loss": 0.016, "step": 1263600 }, { "epoch": 0.012637, "grad_norm": 0.1502724289894104, "learning_rate": 1e-05, "loss": 0.0169, "step": 1263700 }, { "epoch": 0.012638, "grad_norm": 0.1281481385231018, "learning_rate": 1e-05, "loss": 0.0164, "step": 1263800 }, { "epoch": 0.012639, "grad_norm": 0.16571994125843048, "learning_rate": 1e-05, "loss": 0.0167, "step": 1263900 }, { "epoch": 0.01264, "grad_norm": 0.1190827265381813, "learning_rate": 1e-05, "loss": 0.0165, "step": 1264000 }, { "epoch": 0.012641, "grad_norm": 0.10666543245315552, "learning_rate": 1e-05, "loss": 0.0167, "step": 1264100 }, { "epoch": 0.012642, "grad_norm": 0.11398537456989288, "learning_rate": 1e-05, "loss": 0.0162, "step": 1264200 }, { "epoch": 0.012643, "grad_norm": 0.15655197203159332, "learning_rate": 1e-05, "loss": 0.0161, "step": 1264300 }, { "epoch": 0.012644, "grad_norm": 0.20839545130729675, "learning_rate": 1e-05, "loss": 0.0166, "step": 1264400 }, { "epoch": 0.012645, "grad_norm": 0.1696898639202118, "learning_rate": 1e-05, "loss": 0.0163, "step": 1264500 }, { "epoch": 0.012646, "grad_norm": 0.10723387449979782, "learning_rate": 1e-05, "loss": 0.0162, "step": 1264600 }, { "epoch": 0.012647, "grad_norm": 0.13337662816047668, "learning_rate": 1e-05, "loss": 0.0163, "step": 1264700 }, { "epoch": 0.012648, "grad_norm": 0.1120479553937912, "learning_rate": 1e-05, "loss": 0.0168, "step": 1264800 }, { "epoch": 0.012649, "grad_norm": 0.15289941430091858, "learning_rate": 1e-05, "loss": 0.0164, "step": 1264900 }, { "epoch": 0.01265, "grad_norm": 0.12231770157814026, "learning_rate": 1e-05, "loss": 0.0163, "step": 1265000 }, { "epoch": 0.012651, "grad_norm": 0.14588147401809692, "learning_rate": 1e-05, "loss": 0.0166, "step": 1265100 }, { "epoch": 0.012652, "grad_norm": 0.11176847666501999, "learning_rate": 1e-05, "loss": 0.0165, "step": 1265200 }, { "epoch": 0.012653, "grad_norm": 0.19136770069599152, "learning_rate": 1e-05, "loss": 0.0163, "step": 1265300 }, { "epoch": 0.012654, "grad_norm": 0.11831319332122803, "learning_rate": 1e-05, "loss": 0.0167, "step": 1265400 }, { "epoch": 0.012655, "grad_norm": 0.12719157338142395, "learning_rate": 1e-05, "loss": 0.0166, "step": 1265500 }, { "epoch": 0.012656, "grad_norm": 0.12119277566671371, "learning_rate": 1e-05, "loss": 0.0161, "step": 1265600 }, { "epoch": 0.012657, "grad_norm": 0.1458745300769806, "learning_rate": 1e-05, "loss": 0.016, "step": 1265700 }, { "epoch": 0.012658, "grad_norm": 0.1458827257156372, "learning_rate": 1e-05, "loss": 0.0164, "step": 1265800 }, { "epoch": 0.012659, "grad_norm": 0.11198562383651733, "learning_rate": 1e-05, "loss": 0.0168, "step": 1265900 }, { "epoch": 0.01266, "grad_norm": 0.10826896876096725, "learning_rate": 1e-05, "loss": 0.0168, "step": 1266000 }, { "epoch": 0.012661, "grad_norm": 0.12901516258716583, "learning_rate": 1e-05, "loss": 0.0162, "step": 1266100 }, { "epoch": 0.012662, "grad_norm": 0.1586405336856842, "learning_rate": 1e-05, "loss": 0.0165, "step": 1266200 }, { "epoch": 0.012663, "grad_norm": 0.1273271143436432, "learning_rate": 1e-05, "loss": 0.0165, "step": 1266300 }, { "epoch": 0.012664, "grad_norm": 0.12583684921264648, "learning_rate": 1e-05, "loss": 0.0164, "step": 1266400 }, { "epoch": 0.012665, "grad_norm": 0.10397139936685562, "learning_rate": 1e-05, "loss": 0.0162, "step": 1266500 }, { "epoch": 0.012666, "grad_norm": 0.15740592777729034, "learning_rate": 1e-05, "loss": 0.0163, "step": 1266600 }, { "epoch": 0.012667, "grad_norm": 0.15295036137104034, "learning_rate": 1e-05, "loss": 0.0168, "step": 1266700 }, { "epoch": 0.012668, "grad_norm": 0.14992660284042358, "learning_rate": 1e-05, "loss": 0.0163, "step": 1266800 }, { "epoch": 0.012669, "grad_norm": 0.1350657343864441, "learning_rate": 1e-05, "loss": 0.0166, "step": 1266900 }, { "epoch": 0.01267, "grad_norm": 0.10271216928958893, "learning_rate": 1e-05, "loss": 0.0164, "step": 1267000 }, { "epoch": 0.012671, "grad_norm": 0.151779904961586, "learning_rate": 1e-05, "loss": 0.0165, "step": 1267100 }, { "epoch": 0.012672, "grad_norm": 0.09716726839542389, "learning_rate": 1e-05, "loss": 0.0164, "step": 1267200 }, { "epoch": 0.012673, "grad_norm": 0.14804473519325256, "learning_rate": 1e-05, "loss": 0.0166, "step": 1267300 }, { "epoch": 0.012674, "grad_norm": 0.12383927404880524, "learning_rate": 1e-05, "loss": 0.0165, "step": 1267400 }, { "epoch": 0.012675, "grad_norm": 0.12196541577577591, "learning_rate": 1e-05, "loss": 0.0168, "step": 1267500 }, { "epoch": 0.012676, "grad_norm": 0.11335136741399765, "learning_rate": 1e-05, "loss": 0.0168, "step": 1267600 }, { "epoch": 0.012677, "grad_norm": 0.1465173363685608, "learning_rate": 1e-05, "loss": 0.0163, "step": 1267700 }, { "epoch": 0.012678, "grad_norm": 0.13881030678749084, "learning_rate": 1e-05, "loss": 0.0163, "step": 1267800 }, { "epoch": 0.012679, "grad_norm": 0.12148427963256836, "learning_rate": 1e-05, "loss": 0.0163, "step": 1267900 }, { "epoch": 0.01268, "grad_norm": 0.1459297090768814, "learning_rate": 1e-05, "loss": 0.0165, "step": 1268000 }, { "epoch": 0.012681, "grad_norm": 0.09799786657094955, "learning_rate": 1e-05, "loss": 0.0166, "step": 1268100 }, { "epoch": 0.012682, "grad_norm": 0.11379773914813995, "learning_rate": 1e-05, "loss": 0.0162, "step": 1268200 }, { "epoch": 0.012683, "grad_norm": 0.19959311187267303, "learning_rate": 1e-05, "loss": 0.0168, "step": 1268300 }, { "epoch": 0.012684, "grad_norm": 0.12679556012153625, "learning_rate": 1e-05, "loss": 0.0163, "step": 1268400 }, { "epoch": 0.012685, "grad_norm": 0.13856440782546997, "learning_rate": 1e-05, "loss": 0.0163, "step": 1268500 }, { "epoch": 0.012686, "grad_norm": 0.1417471021413803, "learning_rate": 1e-05, "loss": 0.0165, "step": 1268600 }, { "epoch": 0.012687, "grad_norm": 0.14107558131217957, "learning_rate": 1e-05, "loss": 0.0166, "step": 1268700 }, { "epoch": 0.012688, "grad_norm": 0.12882620096206665, "learning_rate": 1e-05, "loss": 0.0165, "step": 1268800 }, { "epoch": 0.012689, "grad_norm": 0.10067876428365707, "learning_rate": 1e-05, "loss": 0.0165, "step": 1268900 }, { "epoch": 0.01269, "grad_norm": 0.1659547984600067, "learning_rate": 1e-05, "loss": 0.0165, "step": 1269000 }, { "epoch": 0.012691, "grad_norm": 0.14518487453460693, "learning_rate": 1e-05, "loss": 0.0163, "step": 1269100 }, { "epoch": 0.012692, "grad_norm": 0.10656031966209412, "learning_rate": 1e-05, "loss": 0.0163, "step": 1269200 }, { "epoch": 0.012693, "grad_norm": 0.1167101338505745, "learning_rate": 1e-05, "loss": 0.0161, "step": 1269300 }, { "epoch": 0.012694, "grad_norm": 0.10044455528259277, "learning_rate": 1e-05, "loss": 0.0166, "step": 1269400 }, { "epoch": 0.012695, "grad_norm": 0.12333839386701584, "learning_rate": 1e-05, "loss": 0.0166, "step": 1269500 }, { "epoch": 0.012696, "grad_norm": 0.10836829245090485, "learning_rate": 1e-05, "loss": 0.0169, "step": 1269600 }, { "epoch": 0.012697, "grad_norm": 0.11172482371330261, "learning_rate": 1e-05, "loss": 0.0165, "step": 1269700 }, { "epoch": 0.012698, "grad_norm": 0.20834918320178986, "learning_rate": 1e-05, "loss": 0.0168, "step": 1269800 }, { "epoch": 0.012699, "grad_norm": 0.13849081099033356, "learning_rate": 1e-05, "loss": 0.0166, "step": 1269900 }, { "epoch": 0.0127, "grad_norm": 0.14484405517578125, "learning_rate": 1e-05, "loss": 0.0171, "step": 1270000 }, { "epoch": 0.012701, "grad_norm": 0.13439229130744934, "learning_rate": 1e-05, "loss": 0.0166, "step": 1270100 }, { "epoch": 0.012702, "grad_norm": 0.12172254920005798, "learning_rate": 1e-05, "loss": 0.0167, "step": 1270200 }, { "epoch": 0.012703, "grad_norm": 0.13051538169384003, "learning_rate": 1e-05, "loss": 0.0162, "step": 1270300 }, { "epoch": 0.012704, "grad_norm": 0.09141126275062561, "learning_rate": 1e-05, "loss": 0.0164, "step": 1270400 }, { "epoch": 0.012705, "grad_norm": 0.1588456928730011, "learning_rate": 1e-05, "loss": 0.0163, "step": 1270500 }, { "epoch": 0.012706, "grad_norm": 0.2480054795742035, "learning_rate": 1e-05, "loss": 0.0164, "step": 1270600 }, { "epoch": 0.012707, "grad_norm": 0.09903229773044586, "learning_rate": 1e-05, "loss": 0.0165, "step": 1270700 }, { "epoch": 0.012708, "grad_norm": 0.11754204332828522, "learning_rate": 1e-05, "loss": 0.0163, "step": 1270800 }, { "epoch": 0.012709, "grad_norm": 0.12328040599822998, "learning_rate": 1e-05, "loss": 0.0165, "step": 1270900 }, { "epoch": 0.01271, "grad_norm": 0.11511868238449097, "learning_rate": 1e-05, "loss": 0.016, "step": 1271000 }, { "epoch": 0.012711, "grad_norm": 0.12081712484359741, "learning_rate": 1e-05, "loss": 0.0163, "step": 1271100 }, { "epoch": 0.012712, "grad_norm": 0.12660542130470276, "learning_rate": 1e-05, "loss": 0.0166, "step": 1271200 }, { "epoch": 0.012713, "grad_norm": 0.13019806146621704, "learning_rate": 1e-05, "loss": 0.0164, "step": 1271300 }, { "epoch": 0.012714, "grad_norm": 0.196410670876503, "learning_rate": 1e-05, "loss": 0.0162, "step": 1271400 }, { "epoch": 0.012715, "grad_norm": 0.13500455021858215, "learning_rate": 1e-05, "loss": 0.0169, "step": 1271500 }, { "epoch": 0.012716, "grad_norm": 0.1267121434211731, "learning_rate": 1e-05, "loss": 0.0166, "step": 1271600 }, { "epoch": 0.012717, "grad_norm": 0.17948870360851288, "learning_rate": 1e-05, "loss": 0.016, "step": 1271700 }, { "epoch": 0.012718, "grad_norm": 0.16422177851200104, "learning_rate": 1e-05, "loss": 0.0166, "step": 1271800 }, { "epoch": 0.012719, "grad_norm": 0.13159160315990448, "learning_rate": 1e-05, "loss": 0.0165, "step": 1271900 }, { "epoch": 0.01272, "grad_norm": 0.12779028713703156, "learning_rate": 1e-05, "loss": 0.0167, "step": 1272000 }, { "epoch": 0.012721, "grad_norm": 0.15589243173599243, "learning_rate": 1e-05, "loss": 0.0164, "step": 1272100 }, { "epoch": 0.012722, "grad_norm": 0.14207720756530762, "learning_rate": 1e-05, "loss": 0.016, "step": 1272200 }, { "epoch": 0.012723, "grad_norm": 0.12482801079750061, "learning_rate": 1e-05, "loss": 0.0165, "step": 1272300 }, { "epoch": 0.012724, "grad_norm": 0.115887351334095, "learning_rate": 1e-05, "loss": 0.0161, "step": 1272400 }, { "epoch": 0.012725, "grad_norm": 0.13204804062843323, "learning_rate": 1e-05, "loss": 0.016, "step": 1272500 }, { "epoch": 0.012726, "grad_norm": 0.25719204545021057, "learning_rate": 1e-05, "loss": 0.0167, "step": 1272600 }, { "epoch": 0.012727, "grad_norm": 0.1153266653418541, "learning_rate": 1e-05, "loss": 0.0161, "step": 1272700 }, { "epoch": 0.012728, "grad_norm": 0.1291966438293457, "learning_rate": 1e-05, "loss": 0.0164, "step": 1272800 }, { "epoch": 0.012729, "grad_norm": 0.11185607314109802, "learning_rate": 1e-05, "loss": 0.0164, "step": 1272900 }, { "epoch": 0.01273, "grad_norm": 0.13092947006225586, "learning_rate": 1e-05, "loss": 0.0166, "step": 1273000 }, { "epoch": 0.012731, "grad_norm": 0.11414914578199387, "learning_rate": 1e-05, "loss": 0.0164, "step": 1273100 }, { "epoch": 0.012732, "grad_norm": 0.08768228441476822, "learning_rate": 1e-05, "loss": 0.0162, "step": 1273200 }, { "epoch": 0.012733, "grad_norm": 0.13938632607460022, "learning_rate": 1e-05, "loss": 0.0162, "step": 1273300 }, { "epoch": 0.012734, "grad_norm": 0.15292012691497803, "learning_rate": 1e-05, "loss": 0.0163, "step": 1273400 }, { "epoch": 0.012735, "grad_norm": 0.11693853884935379, "learning_rate": 1e-05, "loss": 0.0165, "step": 1273500 }, { "epoch": 0.012736, "grad_norm": 0.17576263844966888, "learning_rate": 1e-05, "loss": 0.0162, "step": 1273600 }, { "epoch": 0.012737, "grad_norm": 0.1445605456829071, "learning_rate": 1e-05, "loss": 0.0163, "step": 1273700 }, { "epoch": 0.012738, "grad_norm": 0.12206587940454483, "learning_rate": 1e-05, "loss": 0.0163, "step": 1273800 }, { "epoch": 0.012739, "grad_norm": 0.09871891140937805, "learning_rate": 1e-05, "loss": 0.0166, "step": 1273900 }, { "epoch": 0.01274, "grad_norm": 0.09666603803634644, "learning_rate": 1e-05, "loss": 0.0168, "step": 1274000 }, { "epoch": 0.012741, "grad_norm": 0.12388962507247925, "learning_rate": 1e-05, "loss": 0.0165, "step": 1274100 }, { "epoch": 0.012742, "grad_norm": 0.11191580444574356, "learning_rate": 1e-05, "loss": 0.0164, "step": 1274200 }, { "epoch": 0.012743, "grad_norm": 0.10750073939561844, "learning_rate": 1e-05, "loss": 0.017, "step": 1274300 }, { "epoch": 0.012744, "grad_norm": 0.1256164014339447, "learning_rate": 1e-05, "loss": 0.0167, "step": 1274400 }, { "epoch": 0.012745, "grad_norm": 0.13937151432037354, "learning_rate": 1e-05, "loss": 0.0168, "step": 1274500 }, { "epoch": 0.012746, "grad_norm": 0.09263138473033905, "learning_rate": 1e-05, "loss": 0.0165, "step": 1274600 }, { "epoch": 0.012747, "grad_norm": 0.12454870343208313, "learning_rate": 1e-05, "loss": 0.0162, "step": 1274700 }, { "epoch": 0.012748, "grad_norm": 0.1514689177274704, "learning_rate": 1e-05, "loss": 0.0165, "step": 1274800 }, { "epoch": 0.012749, "grad_norm": 0.18039268255233765, "learning_rate": 1e-05, "loss": 0.017, "step": 1274900 }, { "epoch": 0.01275, "grad_norm": 0.1060633584856987, "learning_rate": 1e-05, "loss": 0.0168, "step": 1275000 }, { "epoch": 0.012751, "grad_norm": 0.15193837881088257, "learning_rate": 1e-05, "loss": 0.0166, "step": 1275100 }, { "epoch": 0.012752, "grad_norm": 0.10935189574956894, "learning_rate": 1e-05, "loss": 0.0167, "step": 1275200 }, { "epoch": 0.012753, "grad_norm": 0.13687679171562195, "learning_rate": 1e-05, "loss": 0.0163, "step": 1275300 }, { "epoch": 0.012754, "grad_norm": 0.10825827717781067, "learning_rate": 1e-05, "loss": 0.0162, "step": 1275400 }, { "epoch": 0.012755, "grad_norm": 0.1330101191997528, "learning_rate": 1e-05, "loss": 0.0162, "step": 1275500 }, { "epoch": 0.012756, "grad_norm": 0.09932491928339005, "learning_rate": 1e-05, "loss": 0.0163, "step": 1275600 }, { "epoch": 0.012757, "grad_norm": 0.1256123036146164, "learning_rate": 1e-05, "loss": 0.0167, "step": 1275700 }, { "epoch": 0.012758, "grad_norm": 0.15832550823688507, "learning_rate": 1e-05, "loss": 0.016, "step": 1275800 }, { "epoch": 0.012759, "grad_norm": 0.11437258869409561, "learning_rate": 1e-05, "loss": 0.0164, "step": 1275900 }, { "epoch": 0.01276, "grad_norm": 0.08124420791864395, "learning_rate": 1e-05, "loss": 0.0164, "step": 1276000 }, { "epoch": 0.012761, "grad_norm": 0.1283300817012787, "learning_rate": 1e-05, "loss": 0.0162, "step": 1276100 }, { "epoch": 0.012762, "grad_norm": 0.1396435797214508, "learning_rate": 1e-05, "loss": 0.0168, "step": 1276200 }, { "epoch": 0.012763, "grad_norm": 0.10537270456552505, "learning_rate": 1e-05, "loss": 0.0163, "step": 1276300 }, { "epoch": 0.012764, "grad_norm": 0.13140851259231567, "learning_rate": 1e-05, "loss": 0.0164, "step": 1276400 }, { "epoch": 0.012765, "grad_norm": 0.09953591972589493, "learning_rate": 1e-05, "loss": 0.0167, "step": 1276500 }, { "epoch": 0.012766, "grad_norm": 0.1656143218278885, "learning_rate": 1e-05, "loss": 0.0167, "step": 1276600 }, { "epoch": 0.012767, "grad_norm": 0.1266968995332718, "learning_rate": 1e-05, "loss": 0.0166, "step": 1276700 }, { "epoch": 0.012768, "grad_norm": 0.10822251439094543, "learning_rate": 1e-05, "loss": 0.0166, "step": 1276800 }, { "epoch": 0.012769, "grad_norm": 0.09874938428401947, "learning_rate": 1e-05, "loss": 0.0164, "step": 1276900 }, { "epoch": 0.01277, "grad_norm": 0.11763493716716766, "learning_rate": 1e-05, "loss": 0.0162, "step": 1277000 }, { "epoch": 0.012771, "grad_norm": 0.10054826736450195, "learning_rate": 1e-05, "loss": 0.0167, "step": 1277100 }, { "epoch": 0.012772, "grad_norm": 0.13767004013061523, "learning_rate": 1e-05, "loss": 0.0166, "step": 1277200 }, { "epoch": 0.012773, "grad_norm": 0.14206935465335846, "learning_rate": 1e-05, "loss": 0.0165, "step": 1277300 }, { "epoch": 0.012774, "grad_norm": 0.12340623885393143, "learning_rate": 1e-05, "loss": 0.0167, "step": 1277400 }, { "epoch": 0.012775, "grad_norm": 0.1183277815580368, "learning_rate": 1e-05, "loss": 0.0166, "step": 1277500 }, { "epoch": 0.012776, "grad_norm": 0.16285479068756104, "learning_rate": 1e-05, "loss": 0.0165, "step": 1277600 }, { "epoch": 0.012777, "grad_norm": 0.12474758177995682, "learning_rate": 1e-05, "loss": 0.0166, "step": 1277700 }, { "epoch": 0.012778, "grad_norm": 0.17580631375312805, "learning_rate": 1e-05, "loss": 0.016, "step": 1277800 }, { "epoch": 0.012779, "grad_norm": 0.13319657742977142, "learning_rate": 1e-05, "loss": 0.0166, "step": 1277900 }, { "epoch": 0.01278, "grad_norm": 0.152767613530159, "learning_rate": 1e-05, "loss": 0.0163, "step": 1278000 }, { "epoch": 0.012781, "grad_norm": 0.12314105778932571, "learning_rate": 1e-05, "loss": 0.0164, "step": 1278100 }, { "epoch": 0.012782, "grad_norm": 0.17351628839969635, "learning_rate": 1e-05, "loss": 0.0163, "step": 1278200 }, { "epoch": 0.012783, "grad_norm": 0.1294000893831253, "learning_rate": 1e-05, "loss": 0.0163, "step": 1278300 }, { "epoch": 0.012784, "grad_norm": 0.12241539359092712, "learning_rate": 1e-05, "loss": 0.0161, "step": 1278400 }, { "epoch": 0.012785, "grad_norm": 0.10340813547372818, "learning_rate": 1e-05, "loss": 0.0161, "step": 1278500 }, { "epoch": 0.012786, "grad_norm": 0.14866098761558533, "learning_rate": 1e-05, "loss": 0.0163, "step": 1278600 }, { "epoch": 0.012787, "grad_norm": 0.14592117071151733, "learning_rate": 1e-05, "loss": 0.0165, "step": 1278700 }, { "epoch": 0.012788, "grad_norm": 0.18189071118831635, "learning_rate": 1e-05, "loss": 0.0165, "step": 1278800 }, { "epoch": 0.012789, "grad_norm": 0.12184290587902069, "learning_rate": 1e-05, "loss": 0.0166, "step": 1278900 }, { "epoch": 0.01279, "grad_norm": 0.22256076335906982, "learning_rate": 1e-05, "loss": 0.0166, "step": 1279000 }, { "epoch": 0.012791, "grad_norm": 0.12109823524951935, "learning_rate": 1e-05, "loss": 0.0163, "step": 1279100 }, { "epoch": 0.012792, "grad_norm": 0.12334799021482468, "learning_rate": 1e-05, "loss": 0.0163, "step": 1279200 }, { "epoch": 0.012793, "grad_norm": 0.1547071933746338, "learning_rate": 1e-05, "loss": 0.0161, "step": 1279300 }, { "epoch": 0.012794, "grad_norm": 0.12132173776626587, "learning_rate": 1e-05, "loss": 0.0166, "step": 1279400 }, { "epoch": 0.012795, "grad_norm": 0.11415690928697586, "learning_rate": 1e-05, "loss": 0.0159, "step": 1279500 }, { "epoch": 0.012796, "grad_norm": 0.12602339684963226, "learning_rate": 1e-05, "loss": 0.0164, "step": 1279600 }, { "epoch": 0.012797, "grad_norm": 0.14492040872573853, "learning_rate": 1e-05, "loss": 0.0165, "step": 1279700 }, { "epoch": 0.012798, "grad_norm": 0.16465310752391815, "learning_rate": 1e-05, "loss": 0.0162, "step": 1279800 }, { "epoch": 0.012799, "grad_norm": 0.14156681299209595, "learning_rate": 1e-05, "loss": 0.0163, "step": 1279900 }, { "epoch": 0.0128, "grad_norm": 0.16408325731754303, "learning_rate": 1e-05, "loss": 0.0165, "step": 1280000 }, { "epoch": 0.0128, "eval_loss": 0.014546223916113377, "eval_runtime": 171.5782, "eval_samples_per_second": 291.412, "eval_steps_per_second": 18.213, "step": 1280000 }, { "epoch": 0.012801, "grad_norm": 0.08524324744939804, "learning_rate": 1e-05, "loss": 0.0167, "step": 1280100 }, { "epoch": 0.012802, "grad_norm": 0.1039324402809143, "learning_rate": 1e-05, "loss": 0.0166, "step": 1280200 }, { "epoch": 0.012803, "grad_norm": 0.09356718510389328, "learning_rate": 1e-05, "loss": 0.0165, "step": 1280300 }, { "epoch": 0.012804, "grad_norm": 0.13187971711158752, "learning_rate": 1e-05, "loss": 0.0167, "step": 1280400 }, { "epoch": 0.012805, "grad_norm": 0.12751761078834534, "learning_rate": 1e-05, "loss": 0.0164, "step": 1280500 }, { "epoch": 0.012806, "grad_norm": 0.16318948566913605, "learning_rate": 1e-05, "loss": 0.0164, "step": 1280600 }, { "epoch": 0.012807, "grad_norm": 0.10927627235651016, "learning_rate": 1e-05, "loss": 0.0164, "step": 1280700 }, { "epoch": 0.012808, "grad_norm": 0.11386704444885254, "learning_rate": 1e-05, "loss": 0.0161, "step": 1280800 }, { "epoch": 0.012809, "grad_norm": 0.1558283269405365, "learning_rate": 1e-05, "loss": 0.0164, "step": 1280900 }, { "epoch": 0.01281, "grad_norm": 0.12671059370040894, "learning_rate": 1e-05, "loss": 0.0166, "step": 1281000 }, { "epoch": 0.012811, "grad_norm": 0.08989663422107697, "learning_rate": 1e-05, "loss": 0.0164, "step": 1281100 }, { "epoch": 0.012812, "grad_norm": 0.14068320393562317, "learning_rate": 1e-05, "loss": 0.0165, "step": 1281200 }, { "epoch": 0.012813, "grad_norm": 0.11306589841842651, "learning_rate": 1e-05, "loss": 0.0163, "step": 1281300 }, { "epoch": 0.012814, "grad_norm": 0.14431163668632507, "learning_rate": 1e-05, "loss": 0.0166, "step": 1281400 }, { "epoch": 0.012815, "grad_norm": 0.16031977534294128, "learning_rate": 1e-05, "loss": 0.0165, "step": 1281500 }, { "epoch": 0.012816, "grad_norm": 0.15119285881519318, "learning_rate": 1e-05, "loss": 0.0167, "step": 1281600 }, { "epoch": 0.012817, "grad_norm": 0.12347672879695892, "learning_rate": 1e-05, "loss": 0.0166, "step": 1281700 }, { "epoch": 0.012818, "grad_norm": 0.18621575832366943, "learning_rate": 1e-05, "loss": 0.0164, "step": 1281800 }, { "epoch": 0.012819, "grad_norm": 0.12518973648548126, "learning_rate": 1e-05, "loss": 0.0162, "step": 1281900 }, { "epoch": 0.01282, "grad_norm": 0.16399289667606354, "learning_rate": 1e-05, "loss": 0.0166, "step": 1282000 }, { "epoch": 0.012821, "grad_norm": 0.13802412152290344, "learning_rate": 1e-05, "loss": 0.0165, "step": 1282100 }, { "epoch": 0.012822, "grad_norm": 0.13094617426395416, "learning_rate": 1e-05, "loss": 0.0164, "step": 1282200 }, { "epoch": 0.012823, "grad_norm": 0.15574589371681213, "learning_rate": 1e-05, "loss": 0.0164, "step": 1282300 }, { "epoch": 0.012824, "grad_norm": 0.1523706018924713, "learning_rate": 1e-05, "loss": 0.0162, "step": 1282400 }, { "epoch": 0.012825, "grad_norm": 0.12594936788082123, "learning_rate": 1e-05, "loss": 0.0166, "step": 1282500 }, { "epoch": 0.012826, "grad_norm": 0.09781824797391891, "learning_rate": 1e-05, "loss": 0.0163, "step": 1282600 }, { "epoch": 0.012827, "grad_norm": 0.13748528063297272, "learning_rate": 1e-05, "loss": 0.0161, "step": 1282700 }, { "epoch": 0.012828, "grad_norm": 0.10142280161380768, "learning_rate": 1e-05, "loss": 0.0164, "step": 1282800 }, { "epoch": 0.012829, "grad_norm": 0.1395193189382553, "learning_rate": 1e-05, "loss": 0.0168, "step": 1282900 }, { "epoch": 0.01283, "grad_norm": 0.152446910738945, "learning_rate": 1e-05, "loss": 0.0164, "step": 1283000 }, { "epoch": 0.012831, "grad_norm": 0.14746759831905365, "learning_rate": 1e-05, "loss": 0.0161, "step": 1283100 }, { "epoch": 0.012832, "grad_norm": 0.1300455778837204, "learning_rate": 1e-05, "loss": 0.0169, "step": 1283200 }, { "epoch": 0.012833, "grad_norm": 0.12168854475021362, "learning_rate": 1e-05, "loss": 0.0164, "step": 1283300 }, { "epoch": 0.012834, "grad_norm": 0.09636258333921432, "learning_rate": 1e-05, "loss": 0.0162, "step": 1283400 }, { "epoch": 0.012835, "grad_norm": 0.13587582111358643, "learning_rate": 1e-05, "loss": 0.0163, "step": 1283500 }, { "epoch": 0.012836, "grad_norm": 0.1636393666267395, "learning_rate": 1e-05, "loss": 0.0163, "step": 1283600 }, { "epoch": 0.012837, "grad_norm": 0.11352898180484772, "learning_rate": 1e-05, "loss": 0.0161, "step": 1283700 }, { "epoch": 0.012838, "grad_norm": 0.1442829817533493, "learning_rate": 1e-05, "loss": 0.016, "step": 1283800 }, { "epoch": 0.012839, "grad_norm": 0.11134222894906998, "learning_rate": 1e-05, "loss": 0.0164, "step": 1283900 }, { "epoch": 0.01284, "grad_norm": 0.14137978851795197, "learning_rate": 1e-05, "loss": 0.0162, "step": 1284000 }, { "epoch": 0.012841, "grad_norm": 0.09849227219820023, "learning_rate": 1e-05, "loss": 0.0168, "step": 1284100 }, { "epoch": 0.012842, "grad_norm": 0.1088496670126915, "learning_rate": 1e-05, "loss": 0.0163, "step": 1284200 }, { "epoch": 0.012843, "grad_norm": 0.1094101220369339, "learning_rate": 1e-05, "loss": 0.0164, "step": 1284300 }, { "epoch": 0.012844, "grad_norm": 0.17259351909160614, "learning_rate": 1e-05, "loss": 0.0165, "step": 1284400 }, { "epoch": 0.012845, "grad_norm": 0.12048124521970749, "learning_rate": 1e-05, "loss": 0.0164, "step": 1284500 }, { "epoch": 0.012846, "grad_norm": 0.11440032720565796, "learning_rate": 1e-05, "loss": 0.0161, "step": 1284600 }, { "epoch": 0.012847, "grad_norm": 0.1449650675058365, "learning_rate": 1e-05, "loss": 0.0166, "step": 1284700 }, { "epoch": 0.012848, "grad_norm": 0.2136886566877365, "learning_rate": 1e-05, "loss": 0.0167, "step": 1284800 }, { "epoch": 0.012849, "grad_norm": 0.10657133907079697, "learning_rate": 1e-05, "loss": 0.0166, "step": 1284900 }, { "epoch": 0.01285, "grad_norm": 0.10290683060884476, "learning_rate": 1e-05, "loss": 0.0163, "step": 1285000 }, { "epoch": 0.012851, "grad_norm": 0.15032395720481873, "learning_rate": 1e-05, "loss": 0.0165, "step": 1285100 }, { "epoch": 0.012852, "grad_norm": 0.14080478250980377, "learning_rate": 1e-05, "loss": 0.0169, "step": 1285200 }, { "epoch": 0.012853, "grad_norm": 0.1227274090051651, "learning_rate": 1e-05, "loss": 0.0167, "step": 1285300 }, { "epoch": 0.012854, "grad_norm": 0.2450525164604187, "learning_rate": 1e-05, "loss": 0.0165, "step": 1285400 }, { "epoch": 0.012855, "grad_norm": 0.11257714033126831, "learning_rate": 1e-05, "loss": 0.0158, "step": 1285500 }, { "epoch": 0.012856, "grad_norm": 0.11278798431158066, "learning_rate": 1e-05, "loss": 0.0163, "step": 1285600 }, { "epoch": 0.012857, "grad_norm": 0.11163649708032608, "learning_rate": 1e-05, "loss": 0.016, "step": 1285700 }, { "epoch": 0.012858, "grad_norm": 0.16873838007450104, "learning_rate": 1e-05, "loss": 0.0161, "step": 1285800 }, { "epoch": 0.012859, "grad_norm": 0.1402616798877716, "learning_rate": 1e-05, "loss": 0.0163, "step": 1285900 }, { "epoch": 0.01286, "grad_norm": 0.13813309371471405, "learning_rate": 1e-05, "loss": 0.0164, "step": 1286000 }, { "epoch": 0.012861, "grad_norm": 0.1347469985485077, "learning_rate": 1e-05, "loss": 0.0163, "step": 1286100 }, { "epoch": 0.012862, "grad_norm": 0.12135297805070877, "learning_rate": 1e-05, "loss": 0.0163, "step": 1286200 }, { "epoch": 0.012863, "grad_norm": 0.14259210228919983, "learning_rate": 1e-05, "loss": 0.0167, "step": 1286300 }, { "epoch": 0.012864, "grad_norm": 0.11646091938018799, "learning_rate": 1e-05, "loss": 0.0162, "step": 1286400 }, { "epoch": 0.012865, "grad_norm": 0.1364946961402893, "learning_rate": 1e-05, "loss": 0.0166, "step": 1286500 }, { "epoch": 0.012866, "grad_norm": 0.14706960320472717, "learning_rate": 1e-05, "loss": 0.0162, "step": 1286600 }, { "epoch": 0.012867, "grad_norm": 0.140062615275383, "learning_rate": 1e-05, "loss": 0.0161, "step": 1286700 }, { "epoch": 0.012868, "grad_norm": 0.13003914058208466, "learning_rate": 1e-05, "loss": 0.0162, "step": 1286800 }, { "epoch": 0.012869, "grad_norm": 0.1204272210597992, "learning_rate": 1e-05, "loss": 0.0163, "step": 1286900 }, { "epoch": 0.01287, "grad_norm": 0.09237930178642273, "learning_rate": 1e-05, "loss": 0.016, "step": 1287000 }, { "epoch": 0.012871, "grad_norm": 0.11580906808376312, "learning_rate": 1e-05, "loss": 0.0165, "step": 1287100 }, { "epoch": 0.012872, "grad_norm": 0.12370546907186508, "learning_rate": 1e-05, "loss": 0.0163, "step": 1287200 }, { "epoch": 0.012873, "grad_norm": 0.11738594621419907, "learning_rate": 1e-05, "loss": 0.0162, "step": 1287300 }, { "epoch": 0.012874, "grad_norm": 0.09673809260129929, "learning_rate": 1e-05, "loss": 0.0162, "step": 1287400 }, { "epoch": 0.012875, "grad_norm": 0.17882993817329407, "learning_rate": 1e-05, "loss": 0.0161, "step": 1287500 }, { "epoch": 0.012876, "grad_norm": 0.1398627758026123, "learning_rate": 1e-05, "loss": 0.0161, "step": 1287600 }, { "epoch": 0.012877, "grad_norm": 0.1703849881887436, "learning_rate": 1e-05, "loss": 0.0164, "step": 1287700 }, { "epoch": 0.012878, "grad_norm": 0.12180347740650177, "learning_rate": 1e-05, "loss": 0.0165, "step": 1287800 }, { "epoch": 0.012879, "grad_norm": 0.10722674429416656, "learning_rate": 1e-05, "loss": 0.0162, "step": 1287900 }, { "epoch": 0.01288, "grad_norm": 0.15929146111011505, "learning_rate": 1e-05, "loss": 0.0163, "step": 1288000 }, { "epoch": 0.012881, "grad_norm": 0.13143381476402283, "learning_rate": 1e-05, "loss": 0.0164, "step": 1288100 }, { "epoch": 0.012882, "grad_norm": 0.12619364261627197, "learning_rate": 1e-05, "loss": 0.0169, "step": 1288200 }, { "epoch": 0.012883, "grad_norm": 0.08781931549310684, "learning_rate": 1e-05, "loss": 0.0162, "step": 1288300 }, { "epoch": 0.012884, "grad_norm": 0.160454660654068, "learning_rate": 1e-05, "loss": 0.0167, "step": 1288400 }, { "epoch": 0.012885, "grad_norm": 0.1673557311296463, "learning_rate": 1e-05, "loss": 0.0164, "step": 1288500 }, { "epoch": 0.012886, "grad_norm": 0.10101684182882309, "learning_rate": 1e-05, "loss": 0.0164, "step": 1288600 }, { "epoch": 0.012887, "grad_norm": 0.13349847495555878, "learning_rate": 1e-05, "loss": 0.0164, "step": 1288700 }, { "epoch": 0.012888, "grad_norm": 0.24320414662361145, "learning_rate": 1e-05, "loss": 0.0164, "step": 1288800 }, { "epoch": 0.012889, "grad_norm": 0.15710991621017456, "learning_rate": 1e-05, "loss": 0.0166, "step": 1288900 }, { "epoch": 0.01289, "grad_norm": 0.1222572773694992, "learning_rate": 1e-05, "loss": 0.0163, "step": 1289000 }, { "epoch": 0.012891, "grad_norm": 0.10737094283103943, "learning_rate": 1e-05, "loss": 0.0164, "step": 1289100 }, { "epoch": 0.012892, "grad_norm": 0.11513182520866394, "learning_rate": 1e-05, "loss": 0.0164, "step": 1289200 }, { "epoch": 0.012893, "grad_norm": 0.16453154385089874, "learning_rate": 1e-05, "loss": 0.0161, "step": 1289300 }, { "epoch": 0.012894, "grad_norm": 0.10786975920200348, "learning_rate": 1e-05, "loss": 0.016, "step": 1289400 }, { "epoch": 0.012895, "grad_norm": 0.10631848871707916, "learning_rate": 1e-05, "loss": 0.0163, "step": 1289500 }, { "epoch": 0.012896, "grad_norm": 0.12148292362689972, "learning_rate": 1e-05, "loss": 0.0162, "step": 1289600 }, { "epoch": 0.012897, "grad_norm": 0.10701166838407516, "learning_rate": 1e-05, "loss": 0.0159, "step": 1289700 }, { "epoch": 0.012898, "grad_norm": 0.13400918245315552, "learning_rate": 1e-05, "loss": 0.0166, "step": 1289800 }, { "epoch": 0.012899, "grad_norm": 0.17818261682987213, "learning_rate": 1e-05, "loss": 0.0157, "step": 1289900 }, { "epoch": 0.0129, "grad_norm": 0.09826414287090302, "learning_rate": 1e-05, "loss": 0.0164, "step": 1290000 }, { "epoch": 0.012901, "grad_norm": 0.1147121712565422, "learning_rate": 1e-05, "loss": 0.0166, "step": 1290100 }, { "epoch": 0.012902, "grad_norm": 0.11302203685045242, "learning_rate": 1e-05, "loss": 0.016, "step": 1290200 }, { "epoch": 0.012903, "grad_norm": 0.17700523138046265, "learning_rate": 1e-05, "loss": 0.0161, "step": 1290300 }, { "epoch": 0.012904, "grad_norm": 0.12441737204790115, "learning_rate": 1e-05, "loss": 0.0165, "step": 1290400 }, { "epoch": 0.012905, "grad_norm": 0.11320865899324417, "learning_rate": 1e-05, "loss": 0.0165, "step": 1290500 }, { "epoch": 0.012906, "grad_norm": 0.09733442217111588, "learning_rate": 1e-05, "loss": 0.0164, "step": 1290600 }, { "epoch": 0.012907, "grad_norm": 0.10897079855203629, "learning_rate": 1e-05, "loss": 0.0167, "step": 1290700 }, { "epoch": 0.012908, "grad_norm": 0.10261164605617523, "learning_rate": 1e-05, "loss": 0.0163, "step": 1290800 }, { "epoch": 0.012909, "grad_norm": 0.15551188588142395, "learning_rate": 1e-05, "loss": 0.0162, "step": 1290900 }, { "epoch": 0.01291, "grad_norm": 0.18155671656131744, "learning_rate": 1e-05, "loss": 0.0163, "step": 1291000 }, { "epoch": 0.012911, "grad_norm": 0.14638613164424896, "learning_rate": 1e-05, "loss": 0.0157, "step": 1291100 }, { "epoch": 0.012912, "grad_norm": 0.1288328319787979, "learning_rate": 1e-05, "loss": 0.0162, "step": 1291200 }, { "epoch": 0.012913, "grad_norm": 0.19822870194911957, "learning_rate": 1e-05, "loss": 0.0164, "step": 1291300 }, { "epoch": 0.012914, "grad_norm": 0.12919066846370697, "learning_rate": 1e-05, "loss": 0.0164, "step": 1291400 }, { "epoch": 0.012915, "grad_norm": 0.12745408713817596, "learning_rate": 1e-05, "loss": 0.0167, "step": 1291500 }, { "epoch": 0.012916, "grad_norm": 0.1306002289056778, "learning_rate": 1e-05, "loss": 0.0163, "step": 1291600 }, { "epoch": 0.012917, "grad_norm": 0.14869581162929535, "learning_rate": 1e-05, "loss": 0.0159, "step": 1291700 }, { "epoch": 0.012918, "grad_norm": 0.11877872049808502, "learning_rate": 1e-05, "loss": 0.0165, "step": 1291800 }, { "epoch": 0.012919, "grad_norm": 0.10423862934112549, "learning_rate": 1e-05, "loss": 0.0167, "step": 1291900 }, { "epoch": 0.01292, "grad_norm": 0.10814166069030762, "learning_rate": 1e-05, "loss": 0.0167, "step": 1292000 }, { "epoch": 0.012921, "grad_norm": 0.09392563253641129, "learning_rate": 1e-05, "loss": 0.0162, "step": 1292100 }, { "epoch": 0.012922, "grad_norm": 0.14944720268249512, "learning_rate": 1e-05, "loss": 0.0164, "step": 1292200 }, { "epoch": 0.012923, "grad_norm": 0.19877097010612488, "learning_rate": 1e-05, "loss": 0.0163, "step": 1292300 }, { "epoch": 0.012924, "grad_norm": 0.11344970017671585, "learning_rate": 1e-05, "loss": 0.0159, "step": 1292400 }, { "epoch": 0.012925, "grad_norm": 0.1256972998380661, "learning_rate": 1e-05, "loss": 0.0162, "step": 1292500 }, { "epoch": 0.012926, "grad_norm": 0.13010413944721222, "learning_rate": 1e-05, "loss": 0.0164, "step": 1292600 }, { "epoch": 0.012927, "grad_norm": 0.11748778820037842, "learning_rate": 1e-05, "loss": 0.0168, "step": 1292700 }, { "epoch": 0.012928, "grad_norm": 0.11317050457000732, "learning_rate": 1e-05, "loss": 0.016, "step": 1292800 }, { "epoch": 0.012929, "grad_norm": 0.12710285186767578, "learning_rate": 1e-05, "loss": 0.0164, "step": 1292900 }, { "epoch": 0.01293, "grad_norm": 0.1672777682542801, "learning_rate": 1e-05, "loss": 0.0164, "step": 1293000 }, { "epoch": 0.012931, "grad_norm": 0.12138889729976654, "learning_rate": 1e-05, "loss": 0.0163, "step": 1293100 }, { "epoch": 0.012932, "grad_norm": 0.10512185096740723, "learning_rate": 1e-05, "loss": 0.0166, "step": 1293200 }, { "epoch": 0.012933, "grad_norm": 0.12099125236272812, "learning_rate": 1e-05, "loss": 0.0165, "step": 1293300 }, { "epoch": 0.012934, "grad_norm": 0.10751255601644516, "learning_rate": 1e-05, "loss": 0.0165, "step": 1293400 }, { "epoch": 0.012935, "grad_norm": 0.09827565401792526, "learning_rate": 1e-05, "loss": 0.016, "step": 1293500 }, { "epoch": 0.012936, "grad_norm": 0.14492422342300415, "learning_rate": 1e-05, "loss": 0.0163, "step": 1293600 }, { "epoch": 0.012937, "grad_norm": 0.14698024094104767, "learning_rate": 1e-05, "loss": 0.0164, "step": 1293700 }, { "epoch": 0.012938, "grad_norm": 0.11842483282089233, "learning_rate": 1e-05, "loss": 0.016, "step": 1293800 }, { "epoch": 0.012939, "grad_norm": 0.19519279897212982, "learning_rate": 1e-05, "loss": 0.0167, "step": 1293900 }, { "epoch": 0.01294, "grad_norm": 0.11671216040849686, "learning_rate": 1e-05, "loss": 0.0164, "step": 1294000 }, { "epoch": 0.012941, "grad_norm": 0.13229066133499146, "learning_rate": 1e-05, "loss": 0.0163, "step": 1294100 }, { "epoch": 0.012942, "grad_norm": 0.12211011350154877, "learning_rate": 1e-05, "loss": 0.0166, "step": 1294200 }, { "epoch": 0.012943, "grad_norm": 0.13194428384304047, "learning_rate": 1e-05, "loss": 0.0166, "step": 1294300 }, { "epoch": 0.012944, "grad_norm": 0.12840677797794342, "learning_rate": 1e-05, "loss": 0.0164, "step": 1294400 }, { "epoch": 0.012945, "grad_norm": 0.13459819555282593, "learning_rate": 1e-05, "loss": 0.0162, "step": 1294500 }, { "epoch": 0.012946, "grad_norm": 0.17860421538352966, "learning_rate": 1e-05, "loss": 0.0163, "step": 1294600 }, { "epoch": 0.012947, "grad_norm": 0.15634343028068542, "learning_rate": 1e-05, "loss": 0.0166, "step": 1294700 }, { "epoch": 0.012948, "grad_norm": 0.09918194264173508, "learning_rate": 1e-05, "loss": 0.0163, "step": 1294800 }, { "epoch": 0.012949, "grad_norm": 0.11281438171863556, "learning_rate": 1e-05, "loss": 0.0169, "step": 1294900 }, { "epoch": 0.01295, "grad_norm": 0.10017973929643631, "learning_rate": 1e-05, "loss": 0.0162, "step": 1295000 }, { "epoch": 0.012951, "grad_norm": 0.12324177473783493, "learning_rate": 1e-05, "loss": 0.0168, "step": 1295100 }, { "epoch": 0.012952, "grad_norm": 0.11160903424024582, "learning_rate": 1e-05, "loss": 0.0165, "step": 1295200 }, { "epoch": 0.012953, "grad_norm": 0.1262902021408081, "learning_rate": 1e-05, "loss": 0.016, "step": 1295300 }, { "epoch": 0.012954, "grad_norm": 0.2025751918554306, "learning_rate": 1e-05, "loss": 0.0159, "step": 1295400 }, { "epoch": 0.012955, "grad_norm": 0.09484890848398209, "learning_rate": 1e-05, "loss": 0.0163, "step": 1295500 }, { "epoch": 0.012956, "grad_norm": 0.11392028629779816, "learning_rate": 1e-05, "loss": 0.0167, "step": 1295600 }, { "epoch": 0.012957, "grad_norm": 0.12185158580541611, "learning_rate": 1e-05, "loss": 0.0165, "step": 1295700 }, { "epoch": 0.012958, "grad_norm": 0.12184388190507889, "learning_rate": 1e-05, "loss": 0.0163, "step": 1295800 }, { "epoch": 0.012959, "grad_norm": 0.09969668090343475, "learning_rate": 1e-05, "loss": 0.0164, "step": 1295900 }, { "epoch": 0.01296, "grad_norm": 0.10861263424158096, "learning_rate": 1e-05, "loss": 0.016, "step": 1296000 }, { "epoch": 0.012961, "grad_norm": 0.11828091740608215, "learning_rate": 1e-05, "loss": 0.0166, "step": 1296100 }, { "epoch": 0.012962, "grad_norm": 0.15247562527656555, "learning_rate": 1e-05, "loss": 0.016, "step": 1296200 }, { "epoch": 0.012963, "grad_norm": 0.18011397123336792, "learning_rate": 1e-05, "loss": 0.0162, "step": 1296300 }, { "epoch": 0.012964, "grad_norm": 0.09692903608083725, "learning_rate": 1e-05, "loss": 0.0167, "step": 1296400 }, { "epoch": 0.012965, "grad_norm": 0.13137374818325043, "learning_rate": 1e-05, "loss": 0.0164, "step": 1296500 }, { "epoch": 0.012966, "grad_norm": 0.1427849978208542, "learning_rate": 1e-05, "loss": 0.0163, "step": 1296600 }, { "epoch": 0.012967, "grad_norm": 0.16932381689548492, "learning_rate": 1e-05, "loss": 0.0167, "step": 1296700 }, { "epoch": 0.012968, "grad_norm": 0.14161317050457, "learning_rate": 1e-05, "loss": 0.0161, "step": 1296800 }, { "epoch": 0.012969, "grad_norm": 0.12313557416200638, "learning_rate": 1e-05, "loss": 0.0163, "step": 1296900 }, { "epoch": 0.01297, "grad_norm": 0.16264313459396362, "learning_rate": 1e-05, "loss": 0.0167, "step": 1297000 }, { "epoch": 0.012971, "grad_norm": 0.10113117843866348, "learning_rate": 1e-05, "loss": 0.0164, "step": 1297100 }, { "epoch": 0.012972, "grad_norm": 0.11698409914970398, "learning_rate": 1e-05, "loss": 0.0159, "step": 1297200 }, { "epoch": 0.012973, "grad_norm": 0.11294545978307724, "learning_rate": 1e-05, "loss": 0.0165, "step": 1297300 }, { "epoch": 0.012974, "grad_norm": 0.12379083782434464, "learning_rate": 1e-05, "loss": 0.0161, "step": 1297400 }, { "epoch": 0.012975, "grad_norm": 0.14176608622074127, "learning_rate": 1e-05, "loss": 0.0165, "step": 1297500 }, { "epoch": 0.012976, "grad_norm": 0.14418675005435944, "learning_rate": 1e-05, "loss": 0.0164, "step": 1297600 }, { "epoch": 0.012977, "grad_norm": 0.1100178137421608, "learning_rate": 1e-05, "loss": 0.0165, "step": 1297700 }, { "epoch": 0.012978, "grad_norm": 0.12119221687316895, "learning_rate": 1e-05, "loss": 0.0162, "step": 1297800 }, { "epoch": 0.012979, "grad_norm": 0.11608950048685074, "learning_rate": 1e-05, "loss": 0.0159, "step": 1297900 }, { "epoch": 0.01298, "grad_norm": 0.10533047467470169, "learning_rate": 1e-05, "loss": 0.0162, "step": 1298000 }, { "epoch": 0.012981, "grad_norm": 0.1336243897676468, "learning_rate": 1e-05, "loss": 0.0165, "step": 1298100 }, { "epoch": 0.012982, "grad_norm": 0.14950190484523773, "learning_rate": 1e-05, "loss": 0.0165, "step": 1298200 }, { "epoch": 0.012983, "grad_norm": 0.13615384697914124, "learning_rate": 1e-05, "loss": 0.0164, "step": 1298300 }, { "epoch": 0.012984, "grad_norm": 0.14283408224582672, "learning_rate": 1e-05, "loss": 0.0161, "step": 1298400 }, { "epoch": 0.012985, "grad_norm": 0.10661213845014572, "learning_rate": 1e-05, "loss": 0.0162, "step": 1298500 }, { "epoch": 0.012986, "grad_norm": 0.1197214275598526, "learning_rate": 1e-05, "loss": 0.0168, "step": 1298600 }, { "epoch": 0.012987, "grad_norm": 0.10084999352693558, "learning_rate": 1e-05, "loss": 0.016, "step": 1298700 }, { "epoch": 0.012988, "grad_norm": 0.14281979203224182, "learning_rate": 1e-05, "loss": 0.0164, "step": 1298800 }, { "epoch": 0.012989, "grad_norm": 0.1443106085062027, "learning_rate": 1e-05, "loss": 0.0163, "step": 1298900 }, { "epoch": 0.01299, "grad_norm": 0.1235065758228302, "learning_rate": 1e-05, "loss": 0.0164, "step": 1299000 }, { "epoch": 0.012991, "grad_norm": 0.14288689196109772, "learning_rate": 1e-05, "loss": 0.016, "step": 1299100 }, { "epoch": 0.012992, "grad_norm": 0.13846567273139954, "learning_rate": 1e-05, "loss": 0.0167, "step": 1299200 }, { "epoch": 0.012993, "grad_norm": 0.11455067247152328, "learning_rate": 1e-05, "loss": 0.0165, "step": 1299300 }, { "epoch": 0.012994, "grad_norm": 0.10035289078950882, "learning_rate": 1e-05, "loss": 0.0165, "step": 1299400 }, { "epoch": 0.012995, "grad_norm": 0.15320877730846405, "learning_rate": 1e-05, "loss": 0.0163, "step": 1299500 }, { "epoch": 0.012996, "grad_norm": 0.14003179967403412, "learning_rate": 1e-05, "loss": 0.0164, "step": 1299600 }, { "epoch": 0.012997, "grad_norm": 0.1769617795944214, "learning_rate": 1e-05, "loss": 0.0164, "step": 1299700 }, { "epoch": 0.012998, "grad_norm": 0.15377619862556458, "learning_rate": 1e-05, "loss": 0.0162, "step": 1299800 }, { "epoch": 0.012999, "grad_norm": 0.11664899438619614, "learning_rate": 1e-05, "loss": 0.0164, "step": 1299900 }, { "epoch": 0.013, "grad_norm": 0.13606807589530945, "learning_rate": 1e-05, "loss": 0.0164, "step": 1300000 }, { "epoch": 0.013, "eval_loss": 0.014660406857728958, "eval_runtime": 175.0991, "eval_samples_per_second": 285.553, "eval_steps_per_second": 17.847, "step": 1300000 }, { "epoch": 0.013001, "grad_norm": 0.24830792844295502, "learning_rate": 1e-05, "loss": 0.0169, "step": 1300100 }, { "epoch": 0.013002, "grad_norm": 0.11338493227958679, "learning_rate": 1e-05, "loss": 0.0164, "step": 1300200 }, { "epoch": 0.013003, "grad_norm": 0.12066248059272766, "learning_rate": 1e-05, "loss": 0.0164, "step": 1300300 }, { "epoch": 0.013004, "grad_norm": 0.10662456601858139, "learning_rate": 1e-05, "loss": 0.0162, "step": 1300400 }, { "epoch": 0.013005, "grad_norm": 0.16255928575992584, "learning_rate": 1e-05, "loss": 0.016, "step": 1300500 }, { "epoch": 0.013006, "grad_norm": 0.11391372233629227, "learning_rate": 1e-05, "loss": 0.0161, "step": 1300600 }, { "epoch": 0.013007, "grad_norm": 0.13971567153930664, "learning_rate": 1e-05, "loss": 0.0162, "step": 1300700 }, { "epoch": 0.013008, "grad_norm": 0.12024446576833725, "learning_rate": 1e-05, "loss": 0.0166, "step": 1300800 }, { "epoch": 0.013009, "grad_norm": 0.14773915708065033, "learning_rate": 1e-05, "loss": 0.0164, "step": 1300900 }, { "epoch": 0.01301, "grad_norm": 0.12858451902866364, "learning_rate": 1e-05, "loss": 0.0165, "step": 1301000 }, { "epoch": 0.013011, "grad_norm": 0.11221491545438766, "learning_rate": 1e-05, "loss": 0.0164, "step": 1301100 }, { "epoch": 0.013012, "grad_norm": 0.11863069981336594, "learning_rate": 1e-05, "loss": 0.0163, "step": 1301200 }, { "epoch": 0.013013, "grad_norm": 0.13193117082118988, "learning_rate": 1e-05, "loss": 0.0163, "step": 1301300 }, { "epoch": 0.013014, "grad_norm": 0.1611839085817337, "learning_rate": 1e-05, "loss": 0.0166, "step": 1301400 }, { "epoch": 0.013015, "grad_norm": 0.1310199350118637, "learning_rate": 1e-05, "loss": 0.0166, "step": 1301500 }, { "epoch": 0.013016, "grad_norm": 0.14449931681156158, "learning_rate": 1e-05, "loss": 0.0161, "step": 1301600 }, { "epoch": 0.013017, "grad_norm": 0.12173353135585785, "learning_rate": 1e-05, "loss": 0.016, "step": 1301700 }, { "epoch": 0.013018, "grad_norm": 0.11278184503316879, "learning_rate": 1e-05, "loss": 0.0168, "step": 1301800 }, { "epoch": 0.013019, "grad_norm": 0.11098256707191467, "learning_rate": 1e-05, "loss": 0.0164, "step": 1301900 }, { "epoch": 0.01302, "grad_norm": 0.12357387691736221, "learning_rate": 1e-05, "loss": 0.0163, "step": 1302000 }, { "epoch": 0.013021, "grad_norm": 0.11461768299341202, "learning_rate": 1e-05, "loss": 0.0162, "step": 1302100 }, { "epoch": 0.013022, "grad_norm": 0.13321007788181305, "learning_rate": 1e-05, "loss": 0.0162, "step": 1302200 }, { "epoch": 0.013023, "grad_norm": 0.19733816385269165, "learning_rate": 1e-05, "loss": 0.0164, "step": 1302300 }, { "epoch": 0.013024, "grad_norm": 0.13673529028892517, "learning_rate": 1e-05, "loss": 0.0164, "step": 1302400 }, { "epoch": 0.013025, "grad_norm": 0.10841869562864304, "learning_rate": 1e-05, "loss": 0.0165, "step": 1302500 }, { "epoch": 0.013026, "grad_norm": 0.15578322112560272, "learning_rate": 1e-05, "loss": 0.016, "step": 1302600 }, { "epoch": 0.013027, "grad_norm": 0.11592362821102142, "learning_rate": 1e-05, "loss": 0.0162, "step": 1302700 }, { "epoch": 0.013028, "grad_norm": 0.10370542854070663, "learning_rate": 1e-05, "loss": 0.0165, "step": 1302800 }, { "epoch": 0.013029, "grad_norm": 0.13876794278621674, "learning_rate": 1e-05, "loss": 0.0164, "step": 1302900 }, { "epoch": 0.01303, "grad_norm": 0.13169240951538086, "learning_rate": 1e-05, "loss": 0.0165, "step": 1303000 }, { "epoch": 0.013031, "grad_norm": 0.11703802645206451, "learning_rate": 1e-05, "loss": 0.0171, "step": 1303100 }, { "epoch": 0.013032, "grad_norm": 0.133927583694458, "learning_rate": 1e-05, "loss": 0.0161, "step": 1303200 }, { "epoch": 0.013033, "grad_norm": 0.12179337441921234, "learning_rate": 1e-05, "loss": 0.0161, "step": 1303300 }, { "epoch": 0.013034, "grad_norm": 0.1173168271780014, "learning_rate": 1e-05, "loss": 0.016, "step": 1303400 }, { "epoch": 0.013035, "grad_norm": 0.09882555902004242, "learning_rate": 1e-05, "loss": 0.0166, "step": 1303500 }, { "epoch": 0.013036, "grad_norm": 0.11328151077032089, "learning_rate": 1e-05, "loss": 0.0163, "step": 1303600 }, { "epoch": 0.013037, "grad_norm": 0.19324494898319244, "learning_rate": 1e-05, "loss": 0.0159, "step": 1303700 }, { "epoch": 0.013038, "grad_norm": 0.10686265677213669, "learning_rate": 1e-05, "loss": 0.0162, "step": 1303800 }, { "epoch": 0.013039, "grad_norm": 0.12659524381160736, "learning_rate": 1e-05, "loss": 0.0162, "step": 1303900 }, { "epoch": 0.01304, "grad_norm": 0.1486463099718094, "learning_rate": 1e-05, "loss": 0.016, "step": 1304000 }, { "epoch": 0.013041, "grad_norm": 0.0859198346734047, "learning_rate": 1e-05, "loss": 0.0164, "step": 1304100 }, { "epoch": 0.013042, "grad_norm": 0.11831842362880707, "learning_rate": 1e-05, "loss": 0.0163, "step": 1304200 }, { "epoch": 0.013043, "grad_norm": 0.12301024794578552, "learning_rate": 1e-05, "loss": 0.0164, "step": 1304300 }, { "epoch": 0.013044, "grad_norm": 0.1548999845981598, "learning_rate": 1e-05, "loss": 0.0164, "step": 1304400 }, { "epoch": 0.013045, "grad_norm": 0.11643747985363007, "learning_rate": 1e-05, "loss": 0.016, "step": 1304500 }, { "epoch": 0.013046, "grad_norm": 0.10458841919898987, "learning_rate": 1e-05, "loss": 0.0164, "step": 1304600 }, { "epoch": 0.013047, "grad_norm": 0.215585395693779, "learning_rate": 1e-05, "loss": 0.0163, "step": 1304700 }, { "epoch": 0.013048, "grad_norm": 0.1108950600028038, "learning_rate": 1e-05, "loss": 0.0167, "step": 1304800 }, { "epoch": 0.013049, "grad_norm": 0.128265842795372, "learning_rate": 1e-05, "loss": 0.0169, "step": 1304900 }, { "epoch": 0.01305, "grad_norm": 0.1437181532382965, "learning_rate": 1e-05, "loss": 0.0163, "step": 1305000 }, { "epoch": 0.013051, "grad_norm": 0.1246330738067627, "learning_rate": 1e-05, "loss": 0.0163, "step": 1305100 }, { "epoch": 0.013052, "grad_norm": 0.0990762785077095, "learning_rate": 1e-05, "loss": 0.0163, "step": 1305200 }, { "epoch": 0.013053, "grad_norm": 0.15388500690460205, "learning_rate": 1e-05, "loss": 0.017, "step": 1305300 }, { "epoch": 0.013054, "grad_norm": 0.1300181746482849, "learning_rate": 1e-05, "loss": 0.0162, "step": 1305400 }, { "epoch": 0.013055, "grad_norm": 0.12378739565610886, "learning_rate": 1e-05, "loss": 0.0161, "step": 1305500 }, { "epoch": 0.013056, "grad_norm": 0.18594135344028473, "learning_rate": 1e-05, "loss": 0.0159, "step": 1305600 }, { "epoch": 0.013057, "grad_norm": 0.20455986261367798, "learning_rate": 1e-05, "loss": 0.0164, "step": 1305700 }, { "epoch": 0.013058, "grad_norm": 0.07545184344053268, "learning_rate": 1e-05, "loss": 0.0164, "step": 1305800 }, { "epoch": 0.013059, "grad_norm": 0.12261780351400375, "learning_rate": 1e-05, "loss": 0.0165, "step": 1305900 }, { "epoch": 0.01306, "grad_norm": 0.14318399131298065, "learning_rate": 1e-05, "loss": 0.0164, "step": 1306000 }, { "epoch": 0.013061, "grad_norm": 0.40198570489883423, "learning_rate": 1e-05, "loss": 0.0163, "step": 1306100 }, { "epoch": 0.013062, "grad_norm": 0.14688166975975037, "learning_rate": 1e-05, "loss": 0.0162, "step": 1306200 }, { "epoch": 0.013063, "grad_norm": 0.12385472655296326, "learning_rate": 1e-05, "loss": 0.0164, "step": 1306300 }, { "epoch": 0.013064, "grad_norm": 0.12278416007757187, "learning_rate": 1e-05, "loss": 0.0164, "step": 1306400 }, { "epoch": 0.013065, "grad_norm": 0.18503397703170776, "learning_rate": 1e-05, "loss": 0.016, "step": 1306500 }, { "epoch": 0.013066, "grad_norm": 0.14084890484809875, "learning_rate": 1e-05, "loss": 0.0168, "step": 1306600 }, { "epoch": 0.013067, "grad_norm": 0.1382458508014679, "learning_rate": 1e-05, "loss": 0.0166, "step": 1306700 }, { "epoch": 0.013068, "grad_norm": 0.14156465232372284, "learning_rate": 1e-05, "loss": 0.0162, "step": 1306800 }, { "epoch": 0.013069, "grad_norm": 0.1651427000761032, "learning_rate": 1e-05, "loss": 0.0163, "step": 1306900 }, { "epoch": 0.01307, "grad_norm": 0.11050745844841003, "learning_rate": 1e-05, "loss": 0.0164, "step": 1307000 }, { "epoch": 0.013071, "grad_norm": 0.13927596807479858, "learning_rate": 1e-05, "loss": 0.0159, "step": 1307100 }, { "epoch": 0.013072, "grad_norm": 0.11515457928180695, "learning_rate": 1e-05, "loss": 0.0162, "step": 1307200 }, { "epoch": 0.013073, "grad_norm": 0.10204284638166428, "learning_rate": 1e-05, "loss": 0.0162, "step": 1307300 }, { "epoch": 0.013074, "grad_norm": 0.10651961714029312, "learning_rate": 1e-05, "loss": 0.0163, "step": 1307400 }, { "epoch": 0.013075, "grad_norm": 0.14779682457447052, "learning_rate": 1e-05, "loss": 0.0162, "step": 1307500 }, { "epoch": 0.013076, "grad_norm": 0.12590038776397705, "learning_rate": 1e-05, "loss": 0.0163, "step": 1307600 }, { "epoch": 0.013077, "grad_norm": 0.1289549171924591, "learning_rate": 1e-05, "loss": 0.0162, "step": 1307700 }, { "epoch": 0.013078, "grad_norm": 0.10276322066783905, "learning_rate": 1e-05, "loss": 0.0161, "step": 1307800 }, { "epoch": 0.013079, "grad_norm": 0.17240484058856964, "learning_rate": 1e-05, "loss": 0.0166, "step": 1307900 }, { "epoch": 0.01308, "grad_norm": 0.12742945551872253, "learning_rate": 1e-05, "loss": 0.0166, "step": 1308000 }, { "epoch": 0.013081, "grad_norm": 0.12227241694927216, "learning_rate": 1e-05, "loss": 0.0163, "step": 1308100 }, { "epoch": 0.013082, "grad_norm": 0.12997616827487946, "learning_rate": 1e-05, "loss": 0.0164, "step": 1308200 }, { "epoch": 0.013083, "grad_norm": 0.15241047739982605, "learning_rate": 1e-05, "loss": 0.0164, "step": 1308300 }, { "epoch": 0.013084, "grad_norm": 0.11695141345262527, "learning_rate": 1e-05, "loss": 0.0161, "step": 1308400 }, { "epoch": 0.013085, "grad_norm": 0.14706721901893616, "learning_rate": 1e-05, "loss": 0.0165, "step": 1308500 }, { "epoch": 0.013086, "grad_norm": 0.12220200151205063, "learning_rate": 1e-05, "loss": 0.016, "step": 1308600 }, { "epoch": 0.013087, "grad_norm": 0.09294383227825165, "learning_rate": 1e-05, "loss": 0.0161, "step": 1308700 }, { "epoch": 0.013088, "grad_norm": 0.14496074616909027, "learning_rate": 1e-05, "loss": 0.0164, "step": 1308800 }, { "epoch": 0.013089, "grad_norm": 0.11295601725578308, "learning_rate": 1e-05, "loss": 0.0158, "step": 1308900 }, { "epoch": 0.01309, "grad_norm": 0.09953613579273224, "learning_rate": 1e-05, "loss": 0.0166, "step": 1309000 }, { "epoch": 0.013091, "grad_norm": 0.14398588240146637, "learning_rate": 1e-05, "loss": 0.0163, "step": 1309100 }, { "epoch": 0.013092, "grad_norm": 0.11448374390602112, "learning_rate": 1e-05, "loss": 0.0163, "step": 1309200 }, { "epoch": 0.013093, "grad_norm": 0.12994535267353058, "learning_rate": 1e-05, "loss": 0.0163, "step": 1309300 }, { "epoch": 0.013094, "grad_norm": 0.10212858766317368, "learning_rate": 1e-05, "loss": 0.0163, "step": 1309400 }, { "epoch": 0.013095, "grad_norm": 0.17227132618427277, "learning_rate": 1e-05, "loss": 0.0161, "step": 1309500 }, { "epoch": 0.013096, "grad_norm": 0.13268069922924042, "learning_rate": 1e-05, "loss": 0.0161, "step": 1309600 }, { "epoch": 0.013097, "grad_norm": 0.17357902228832245, "learning_rate": 1e-05, "loss": 0.0159, "step": 1309700 }, { "epoch": 0.013098, "grad_norm": 0.16008152067661285, "learning_rate": 1e-05, "loss": 0.0165, "step": 1309800 }, { "epoch": 0.013099, "grad_norm": 0.14106006920337677, "learning_rate": 1e-05, "loss": 0.0163, "step": 1309900 }, { "epoch": 0.0131, "grad_norm": 0.09955234825611115, "learning_rate": 1e-05, "loss": 0.0162, "step": 1310000 }, { "epoch": 0.013101, "grad_norm": 0.11654725670814514, "learning_rate": 1e-05, "loss": 0.0163, "step": 1310100 }, { "epoch": 0.013102, "grad_norm": 0.07041823118925095, "learning_rate": 1e-05, "loss": 0.0162, "step": 1310200 }, { "epoch": 0.013103, "grad_norm": 0.1078760027885437, "learning_rate": 1e-05, "loss": 0.0164, "step": 1310300 }, { "epoch": 0.013104, "grad_norm": 0.1652679145336151, "learning_rate": 1e-05, "loss": 0.0166, "step": 1310400 }, { "epoch": 0.013105, "grad_norm": 0.1469324678182602, "learning_rate": 1e-05, "loss": 0.0165, "step": 1310500 }, { "epoch": 0.013106, "grad_norm": 0.10433587431907654, "learning_rate": 1e-05, "loss": 0.0162, "step": 1310600 }, { "epoch": 0.013107, "grad_norm": 0.16562803089618683, "learning_rate": 1e-05, "loss": 0.016, "step": 1310700 }, { "epoch": 0.013108, "grad_norm": 0.16922149062156677, "learning_rate": 1e-05, "loss": 0.0166, "step": 1310800 }, { "epoch": 0.013109, "grad_norm": 0.13875386118888855, "learning_rate": 1e-05, "loss": 0.0159, "step": 1310900 }, { "epoch": 0.01311, "grad_norm": 0.14222760498523712, "learning_rate": 1e-05, "loss": 0.0161, "step": 1311000 }, { "epoch": 0.013111, "grad_norm": 0.20438863337039948, "learning_rate": 1e-05, "loss": 0.0163, "step": 1311100 }, { "epoch": 0.013112, "grad_norm": 0.14288850128650665, "learning_rate": 1e-05, "loss": 0.016, "step": 1311200 }, { "epoch": 0.013113, "grad_norm": 0.11313112080097198, "learning_rate": 1e-05, "loss": 0.0162, "step": 1311300 }, { "epoch": 0.013114, "grad_norm": 0.17490126192569733, "learning_rate": 1e-05, "loss": 0.0164, "step": 1311400 }, { "epoch": 0.013115, "grad_norm": 0.15576356649398804, "learning_rate": 1e-05, "loss": 0.0163, "step": 1311500 }, { "epoch": 0.013116, "grad_norm": 0.12121013551950455, "learning_rate": 1e-05, "loss": 0.0168, "step": 1311600 }, { "epoch": 0.013117, "grad_norm": 0.17026697099208832, "learning_rate": 1e-05, "loss": 0.0167, "step": 1311700 }, { "epoch": 0.013118, "grad_norm": 0.13505344092845917, "learning_rate": 1e-05, "loss": 0.0162, "step": 1311800 }, { "epoch": 0.013119, "grad_norm": 0.10483085364103317, "learning_rate": 1e-05, "loss": 0.0165, "step": 1311900 }, { "epoch": 0.01312, "grad_norm": 0.1483515501022339, "learning_rate": 1e-05, "loss": 0.0163, "step": 1312000 }, { "epoch": 0.013121, "grad_norm": 0.17698782682418823, "learning_rate": 1e-05, "loss": 0.0162, "step": 1312100 }, { "epoch": 0.013122, "grad_norm": 0.11140481382608414, "learning_rate": 1e-05, "loss": 0.0157, "step": 1312200 }, { "epoch": 0.013123, "grad_norm": 0.1426330804824829, "learning_rate": 1e-05, "loss": 0.0163, "step": 1312300 }, { "epoch": 0.013124, "grad_norm": 0.09745586663484573, "learning_rate": 1e-05, "loss": 0.0163, "step": 1312400 }, { "epoch": 0.013125, "grad_norm": 0.10156197100877762, "learning_rate": 1e-05, "loss": 0.0164, "step": 1312500 }, { "epoch": 0.013126, "grad_norm": 0.11071788519620895, "learning_rate": 1e-05, "loss": 0.0165, "step": 1312600 }, { "epoch": 0.013127, "grad_norm": 0.136601522564888, "learning_rate": 1e-05, "loss": 0.0163, "step": 1312700 }, { "epoch": 0.013128, "grad_norm": 0.13520178198814392, "learning_rate": 1e-05, "loss": 0.0162, "step": 1312800 }, { "epoch": 0.013129, "grad_norm": 0.18924041092395782, "learning_rate": 1e-05, "loss": 0.0161, "step": 1312900 }, { "epoch": 0.01313, "grad_norm": 0.11308811604976654, "learning_rate": 1e-05, "loss": 0.0164, "step": 1313000 }, { "epoch": 0.013131, "grad_norm": 0.19692665338516235, "learning_rate": 1e-05, "loss": 0.0167, "step": 1313100 }, { "epoch": 0.013132, "grad_norm": 0.13576501607894897, "learning_rate": 1e-05, "loss": 0.0159, "step": 1313200 }, { "epoch": 0.013133, "grad_norm": 0.08455876260995865, "learning_rate": 1e-05, "loss": 0.0163, "step": 1313300 }, { "epoch": 0.013134, "grad_norm": 0.10517849028110504, "learning_rate": 1e-05, "loss": 0.016, "step": 1313400 }, { "epoch": 0.013135, "grad_norm": 0.11040344834327698, "learning_rate": 1e-05, "loss": 0.0164, "step": 1313500 }, { "epoch": 0.013136, "grad_norm": 0.14804702997207642, "learning_rate": 1e-05, "loss": 0.0166, "step": 1313600 }, { "epoch": 0.013137, "grad_norm": 0.1103484034538269, "learning_rate": 1e-05, "loss": 0.0163, "step": 1313700 }, { "epoch": 0.013138, "grad_norm": 0.11275043338537216, "learning_rate": 1e-05, "loss": 0.0162, "step": 1313800 }, { "epoch": 0.013139, "grad_norm": 0.11119471490383148, "learning_rate": 1e-05, "loss": 0.0166, "step": 1313900 }, { "epoch": 0.01314, "grad_norm": 0.10925398021936417, "learning_rate": 1e-05, "loss": 0.0164, "step": 1314000 }, { "epoch": 0.013141, "grad_norm": 0.13016998767852783, "learning_rate": 1e-05, "loss": 0.0161, "step": 1314100 }, { "epoch": 0.013142, "grad_norm": 0.1372590959072113, "learning_rate": 1e-05, "loss": 0.0161, "step": 1314200 }, { "epoch": 0.013143, "grad_norm": 0.13279500603675842, "learning_rate": 1e-05, "loss": 0.0163, "step": 1314300 }, { "epoch": 0.013144, "grad_norm": 0.13135483860969543, "learning_rate": 1e-05, "loss": 0.0163, "step": 1314400 }, { "epoch": 0.013145, "grad_norm": 0.12004473805427551, "learning_rate": 1e-05, "loss": 0.0167, "step": 1314500 }, { "epoch": 0.013146, "grad_norm": 0.1675342172384262, "learning_rate": 1e-05, "loss": 0.0161, "step": 1314600 }, { "epoch": 0.013147, "grad_norm": 0.13405801355838776, "learning_rate": 1e-05, "loss": 0.0164, "step": 1314700 }, { "epoch": 0.013148, "grad_norm": 0.14084167778491974, "learning_rate": 1e-05, "loss": 0.0162, "step": 1314800 }, { "epoch": 0.013149, "grad_norm": 0.11859835684299469, "learning_rate": 1e-05, "loss": 0.0161, "step": 1314900 }, { "epoch": 0.01315, "grad_norm": 0.11024931818246841, "learning_rate": 1e-05, "loss": 0.0165, "step": 1315000 }, { "epoch": 0.013151, "grad_norm": 0.10745519399642944, "learning_rate": 1e-05, "loss": 0.0165, "step": 1315100 }, { "epoch": 0.013152, "grad_norm": 0.11650753021240234, "learning_rate": 1e-05, "loss": 0.0161, "step": 1315200 }, { "epoch": 0.013153, "grad_norm": 0.11921603977680206, "learning_rate": 1e-05, "loss": 0.0161, "step": 1315300 }, { "epoch": 0.013154, "grad_norm": 0.14929084479808807, "learning_rate": 1e-05, "loss": 0.0162, "step": 1315400 }, { "epoch": 0.013155, "grad_norm": 0.11579030007123947, "learning_rate": 1e-05, "loss": 0.016, "step": 1315500 }, { "epoch": 0.013156, "grad_norm": 0.10873346775770187, "learning_rate": 1e-05, "loss": 0.0165, "step": 1315600 }, { "epoch": 0.013157, "grad_norm": 0.13708019256591797, "learning_rate": 1e-05, "loss": 0.0161, "step": 1315700 }, { "epoch": 0.013158, "grad_norm": 0.11695168912410736, "learning_rate": 1e-05, "loss": 0.0163, "step": 1315800 }, { "epoch": 0.013159, "grad_norm": 0.1273346245288849, "learning_rate": 1e-05, "loss": 0.0168, "step": 1315900 }, { "epoch": 0.01316, "grad_norm": 0.09742959588766098, "learning_rate": 1e-05, "loss": 0.0165, "step": 1316000 }, { "epoch": 0.013161, "grad_norm": 0.13276614248752594, "learning_rate": 1e-05, "loss": 0.0164, "step": 1316100 }, { "epoch": 0.013162, "grad_norm": 0.13738194108009338, "learning_rate": 1e-05, "loss": 0.0164, "step": 1316200 }, { "epoch": 0.013163, "grad_norm": 0.2703345715999603, "learning_rate": 1e-05, "loss": 0.0164, "step": 1316300 }, { "epoch": 0.013164, "grad_norm": 0.18685942888259888, "learning_rate": 1e-05, "loss": 0.0166, "step": 1316400 }, { "epoch": 0.013165, "grad_norm": 0.11895620077848434, "learning_rate": 1e-05, "loss": 0.0163, "step": 1316500 }, { "epoch": 0.013166, "grad_norm": 0.12116740643978119, "learning_rate": 1e-05, "loss": 0.0163, "step": 1316600 }, { "epoch": 0.013167, "grad_norm": 0.08276379853487015, "learning_rate": 1e-05, "loss": 0.0165, "step": 1316700 }, { "epoch": 0.013168, "grad_norm": 0.09710532426834106, "learning_rate": 1e-05, "loss": 0.0163, "step": 1316800 }, { "epoch": 0.013169, "grad_norm": 0.19090770184993744, "learning_rate": 1e-05, "loss": 0.0164, "step": 1316900 }, { "epoch": 0.01317, "grad_norm": 0.13335369527339935, "learning_rate": 1e-05, "loss": 0.0162, "step": 1317000 }, { "epoch": 0.013171, "grad_norm": 0.12051743268966675, "learning_rate": 1e-05, "loss": 0.0163, "step": 1317100 }, { "epoch": 0.013172, "grad_norm": 0.1492830216884613, "learning_rate": 1e-05, "loss": 0.0161, "step": 1317200 }, { "epoch": 0.013173, "grad_norm": 0.13339175283908844, "learning_rate": 1e-05, "loss": 0.0162, "step": 1317300 }, { "epoch": 0.013174, "grad_norm": 0.14951933920383453, "learning_rate": 1e-05, "loss": 0.0166, "step": 1317400 }, { "epoch": 0.013175, "grad_norm": 0.09379350394010544, "learning_rate": 1e-05, "loss": 0.0162, "step": 1317500 }, { "epoch": 0.013176, "grad_norm": 0.1423003077507019, "learning_rate": 1e-05, "loss": 0.0162, "step": 1317600 }, { "epoch": 0.013177, "grad_norm": 0.13295017182826996, "learning_rate": 1e-05, "loss": 0.016, "step": 1317700 }, { "epoch": 0.013178, "grad_norm": 0.09171446412801743, "learning_rate": 1e-05, "loss": 0.0157, "step": 1317800 }, { "epoch": 0.013179, "grad_norm": 0.1275143176317215, "learning_rate": 1e-05, "loss": 0.0161, "step": 1317900 }, { "epoch": 0.01318, "grad_norm": 0.15358583629131317, "learning_rate": 1e-05, "loss": 0.0165, "step": 1318000 }, { "epoch": 0.013181, "grad_norm": 0.1170690581202507, "learning_rate": 1e-05, "loss": 0.0165, "step": 1318100 }, { "epoch": 0.013182, "grad_norm": 0.11175128072500229, "learning_rate": 1e-05, "loss": 0.0158, "step": 1318200 }, { "epoch": 0.013183, "grad_norm": 0.1257290095090866, "learning_rate": 1e-05, "loss": 0.0162, "step": 1318300 }, { "epoch": 0.013184, "grad_norm": 0.16317887604236603, "learning_rate": 1e-05, "loss": 0.0164, "step": 1318400 }, { "epoch": 0.013185, "grad_norm": 0.1321232169866562, "learning_rate": 1e-05, "loss": 0.0161, "step": 1318500 }, { "epoch": 0.013186, "grad_norm": 0.15237344801425934, "learning_rate": 1e-05, "loss": 0.0163, "step": 1318600 }, { "epoch": 0.013187, "grad_norm": 0.12752921879291534, "learning_rate": 1e-05, "loss": 0.016, "step": 1318700 }, { "epoch": 0.013188, "grad_norm": 0.14778846502304077, "learning_rate": 1e-05, "loss": 0.0159, "step": 1318800 }, { "epoch": 0.013189, "grad_norm": 0.13319669663906097, "learning_rate": 1e-05, "loss": 0.0166, "step": 1318900 }, { "epoch": 0.01319, "grad_norm": 0.15784852206707, "learning_rate": 1e-05, "loss": 0.0162, "step": 1319000 }, { "epoch": 0.013191, "grad_norm": 0.10920687764883041, "learning_rate": 1e-05, "loss": 0.0163, "step": 1319100 }, { "epoch": 0.013192, "grad_norm": 0.17341329157352448, "learning_rate": 1e-05, "loss": 0.0162, "step": 1319200 }, { "epoch": 0.013193, "grad_norm": 0.1039993166923523, "learning_rate": 1e-05, "loss": 0.0165, "step": 1319300 }, { "epoch": 0.013194, "grad_norm": 0.12831710278987885, "learning_rate": 1e-05, "loss": 0.016, "step": 1319400 }, { "epoch": 0.013195, "grad_norm": 0.10010214149951935, "learning_rate": 1e-05, "loss": 0.0162, "step": 1319500 }, { "epoch": 0.013196, "grad_norm": 0.13654108345508575, "learning_rate": 1e-05, "loss": 0.0161, "step": 1319600 }, { "epoch": 0.013197, "grad_norm": 0.1205848678946495, "learning_rate": 1e-05, "loss": 0.0159, "step": 1319700 }, { "epoch": 0.013198, "grad_norm": 0.10611536353826523, "learning_rate": 1e-05, "loss": 0.0163, "step": 1319800 }, { "epoch": 0.013199, "grad_norm": 0.13241694867610931, "learning_rate": 1e-05, "loss": 0.0164, "step": 1319900 }, { "epoch": 0.0132, "grad_norm": 0.11127609014511108, "learning_rate": 1e-05, "loss": 0.0158, "step": 1320000 }, { "epoch": 0.0132, "eval_loss": 0.014324733056128025, "eval_runtime": 172.0556, "eval_samples_per_second": 290.604, "eval_steps_per_second": 18.163, "step": 1320000 }, { "epoch": 0.013201, "grad_norm": 0.14475083351135254, "learning_rate": 1e-05, "loss": 0.0164, "step": 1320100 }, { "epoch": 0.013202, "grad_norm": 0.13367584347724915, "learning_rate": 1e-05, "loss": 0.0161, "step": 1320200 }, { "epoch": 0.013203, "grad_norm": 0.11954740434885025, "learning_rate": 1e-05, "loss": 0.0161, "step": 1320300 }, { "epoch": 0.013204, "grad_norm": 0.14419887959957123, "learning_rate": 1e-05, "loss": 0.0163, "step": 1320400 }, { "epoch": 0.013205, "grad_norm": 0.15202313661575317, "learning_rate": 1e-05, "loss": 0.0162, "step": 1320500 }, { "epoch": 0.013206, "grad_norm": 0.13695228099822998, "learning_rate": 1e-05, "loss": 0.0161, "step": 1320600 }, { "epoch": 0.013207, "grad_norm": 0.12661992013454437, "learning_rate": 1e-05, "loss": 0.016, "step": 1320700 }, { "epoch": 0.013208, "grad_norm": 0.10778237134218216, "learning_rate": 1e-05, "loss": 0.0162, "step": 1320800 }, { "epoch": 0.013209, "grad_norm": 0.13470777869224548, "learning_rate": 1e-05, "loss": 0.016, "step": 1320900 }, { "epoch": 0.01321, "grad_norm": 0.14400097727775574, "learning_rate": 1e-05, "loss": 0.0165, "step": 1321000 }, { "epoch": 0.013211, "grad_norm": 0.10586697608232498, "learning_rate": 1e-05, "loss": 0.0163, "step": 1321100 }, { "epoch": 0.013212, "grad_norm": 0.14245665073394775, "learning_rate": 1e-05, "loss": 0.0164, "step": 1321200 }, { "epoch": 0.013213, "grad_norm": 0.1442459374666214, "learning_rate": 1e-05, "loss": 0.0161, "step": 1321300 }, { "epoch": 0.013214, "grad_norm": 0.1351059228181839, "learning_rate": 1e-05, "loss": 0.0164, "step": 1321400 }, { "epoch": 0.013215, "grad_norm": 0.0997355729341507, "learning_rate": 1e-05, "loss": 0.0161, "step": 1321500 }, { "epoch": 0.013216, "grad_norm": 0.09327249974012375, "learning_rate": 1e-05, "loss": 0.016, "step": 1321600 }, { "epoch": 0.013217, "grad_norm": 0.1572100669145584, "learning_rate": 1e-05, "loss": 0.0165, "step": 1321700 }, { "epoch": 0.013218, "grad_norm": 0.19836114346981049, "learning_rate": 1e-05, "loss": 0.0162, "step": 1321800 }, { "epoch": 0.013219, "grad_norm": 0.13689476251602173, "learning_rate": 1e-05, "loss": 0.0164, "step": 1321900 }, { "epoch": 0.01322, "grad_norm": 0.12693540751934052, "learning_rate": 1e-05, "loss": 0.0165, "step": 1322000 }, { "epoch": 0.013221, "grad_norm": 0.1062237098813057, "learning_rate": 1e-05, "loss": 0.0162, "step": 1322100 }, { "epoch": 0.013222, "grad_norm": 0.12325059622526169, "learning_rate": 1e-05, "loss": 0.0167, "step": 1322200 }, { "epoch": 0.013223, "grad_norm": 0.10269874334335327, "learning_rate": 1e-05, "loss": 0.0162, "step": 1322300 }, { "epoch": 0.013224, "grad_norm": 0.09498820453882217, "learning_rate": 1e-05, "loss": 0.0162, "step": 1322400 }, { "epoch": 0.013225, "grad_norm": 0.12016694247722626, "learning_rate": 1e-05, "loss": 0.0161, "step": 1322500 }, { "epoch": 0.013226, "grad_norm": 0.13286536931991577, "learning_rate": 1e-05, "loss": 0.0166, "step": 1322600 }, { "epoch": 0.013227, "grad_norm": 0.1695163995027542, "learning_rate": 1e-05, "loss": 0.0166, "step": 1322700 }, { "epoch": 0.013228, "grad_norm": 0.16491618752479553, "learning_rate": 1e-05, "loss": 0.0161, "step": 1322800 }, { "epoch": 0.013229, "grad_norm": 0.12927086651325226, "learning_rate": 1e-05, "loss": 0.0163, "step": 1322900 }, { "epoch": 0.01323, "grad_norm": 0.15496927499771118, "learning_rate": 1e-05, "loss": 0.0163, "step": 1323000 }, { "epoch": 0.013231, "grad_norm": 0.06983635574579239, "learning_rate": 1e-05, "loss": 0.016, "step": 1323100 }, { "epoch": 0.013232, "grad_norm": 0.09865902364253998, "learning_rate": 1e-05, "loss": 0.0163, "step": 1323200 }, { "epoch": 0.013233, "grad_norm": 0.13549967110157013, "learning_rate": 1e-05, "loss": 0.016, "step": 1323300 }, { "epoch": 0.013234, "grad_norm": 0.1555475890636444, "learning_rate": 1e-05, "loss": 0.0162, "step": 1323400 }, { "epoch": 0.013235, "grad_norm": 0.12091593444347382, "learning_rate": 1e-05, "loss": 0.0162, "step": 1323500 }, { "epoch": 0.013236, "grad_norm": 0.112900510430336, "learning_rate": 1e-05, "loss": 0.0163, "step": 1323600 }, { "epoch": 0.013237, "grad_norm": 0.10278262197971344, "learning_rate": 1e-05, "loss": 0.0165, "step": 1323700 }, { "epoch": 0.013238, "grad_norm": 0.19439108669757843, "learning_rate": 1e-05, "loss": 0.0163, "step": 1323800 }, { "epoch": 0.013239, "grad_norm": 0.10329132527112961, "learning_rate": 1e-05, "loss": 0.0163, "step": 1323900 }, { "epoch": 0.01324, "grad_norm": 0.10932381451129913, "learning_rate": 1e-05, "loss": 0.0162, "step": 1324000 }, { "epoch": 0.013241, "grad_norm": 0.08780393749475479, "learning_rate": 1e-05, "loss": 0.0164, "step": 1324100 }, { "epoch": 0.013242, "grad_norm": 0.15515738725662231, "learning_rate": 1e-05, "loss": 0.0164, "step": 1324200 }, { "epoch": 0.013243, "grad_norm": 0.1418887972831726, "learning_rate": 1e-05, "loss": 0.0162, "step": 1324300 }, { "epoch": 0.013244, "grad_norm": 0.11338596791028976, "learning_rate": 1e-05, "loss": 0.0163, "step": 1324400 }, { "epoch": 0.013245, "grad_norm": 0.12037767469882965, "learning_rate": 1e-05, "loss": 0.0165, "step": 1324500 }, { "epoch": 0.013246, "grad_norm": 0.10544321686029434, "learning_rate": 1e-05, "loss": 0.0162, "step": 1324600 }, { "epoch": 0.013247, "grad_norm": 0.13381901383399963, "learning_rate": 1e-05, "loss": 0.0161, "step": 1324700 }, { "epoch": 0.013248, "grad_norm": 0.11951866000890732, "learning_rate": 1e-05, "loss": 0.0163, "step": 1324800 }, { "epoch": 0.013249, "grad_norm": 0.1079995334148407, "learning_rate": 1e-05, "loss": 0.0164, "step": 1324900 }, { "epoch": 0.01325, "grad_norm": 0.10015648603439331, "learning_rate": 1e-05, "loss": 0.0158, "step": 1325000 }, { "epoch": 0.013251, "grad_norm": 0.15969084203243256, "learning_rate": 1e-05, "loss": 0.0159, "step": 1325100 }, { "epoch": 0.013252, "grad_norm": 0.10675491392612457, "learning_rate": 1e-05, "loss": 0.016, "step": 1325200 }, { "epoch": 0.013253, "grad_norm": 0.17722125351428986, "learning_rate": 1e-05, "loss": 0.0162, "step": 1325300 }, { "epoch": 0.013254, "grad_norm": 0.11670245230197906, "learning_rate": 1e-05, "loss": 0.0161, "step": 1325400 }, { "epoch": 0.013255, "grad_norm": 0.1522880494594574, "learning_rate": 1e-05, "loss": 0.0162, "step": 1325500 }, { "epoch": 0.013256, "grad_norm": 0.24038323760032654, "learning_rate": 1e-05, "loss": 0.016, "step": 1325600 }, { "epoch": 0.013257, "grad_norm": 0.1379215121269226, "learning_rate": 1e-05, "loss": 0.0161, "step": 1325700 }, { "epoch": 0.013258, "grad_norm": 0.11420635879039764, "learning_rate": 1e-05, "loss": 0.0166, "step": 1325800 }, { "epoch": 0.013259, "grad_norm": 0.20740841329097748, "learning_rate": 1e-05, "loss": 0.0162, "step": 1325900 }, { "epoch": 0.01326, "grad_norm": 0.11938764154911041, "learning_rate": 1e-05, "loss": 0.016, "step": 1326000 }, { "epoch": 0.013261, "grad_norm": 0.19868282973766327, "learning_rate": 1e-05, "loss": 0.0164, "step": 1326100 }, { "epoch": 0.013262, "grad_norm": 0.15648630261421204, "learning_rate": 1e-05, "loss": 0.0163, "step": 1326200 }, { "epoch": 0.013263, "grad_norm": 0.13284778594970703, "learning_rate": 1e-05, "loss": 0.0164, "step": 1326300 }, { "epoch": 0.013264, "grad_norm": 0.09387564659118652, "learning_rate": 1e-05, "loss": 0.0163, "step": 1326400 }, { "epoch": 0.013265, "grad_norm": 0.10665692389011383, "learning_rate": 1e-05, "loss": 0.016, "step": 1326500 }, { "epoch": 0.013266, "grad_norm": 0.12427574396133423, "learning_rate": 1e-05, "loss": 0.0161, "step": 1326600 }, { "epoch": 0.013267, "grad_norm": 0.19174952805042267, "learning_rate": 1e-05, "loss": 0.0167, "step": 1326700 }, { "epoch": 0.013268, "grad_norm": 0.11125466972589493, "learning_rate": 1e-05, "loss": 0.0157, "step": 1326800 }, { "epoch": 0.013269, "grad_norm": 0.11503047496080399, "learning_rate": 1e-05, "loss": 0.0162, "step": 1326900 }, { "epoch": 0.01327, "grad_norm": 0.14239339530467987, "learning_rate": 1e-05, "loss": 0.0162, "step": 1327000 }, { "epoch": 0.013271, "grad_norm": 0.12028781324625015, "learning_rate": 1e-05, "loss": 0.0158, "step": 1327100 }, { "epoch": 0.013272, "grad_norm": 0.15598130226135254, "learning_rate": 1e-05, "loss": 0.0163, "step": 1327200 }, { "epoch": 0.013273, "grad_norm": 0.10037216544151306, "learning_rate": 1e-05, "loss": 0.0163, "step": 1327300 }, { "epoch": 0.013274, "grad_norm": 0.12287113815546036, "learning_rate": 1e-05, "loss": 0.0163, "step": 1327400 }, { "epoch": 0.013275, "grad_norm": 0.11034701019525528, "learning_rate": 1e-05, "loss": 0.0161, "step": 1327500 }, { "epoch": 0.013276, "grad_norm": 0.09183290600776672, "learning_rate": 1e-05, "loss": 0.0161, "step": 1327600 }, { "epoch": 0.013277, "grad_norm": 0.11578730493783951, "learning_rate": 1e-05, "loss": 0.0165, "step": 1327700 }, { "epoch": 0.013278, "grad_norm": 0.1037617102265358, "learning_rate": 1e-05, "loss": 0.0162, "step": 1327800 }, { "epoch": 0.013279, "grad_norm": 0.0827946662902832, "learning_rate": 1e-05, "loss": 0.0161, "step": 1327900 }, { "epoch": 0.01328, "grad_norm": 0.14958009123802185, "learning_rate": 1e-05, "loss": 0.0159, "step": 1328000 }, { "epoch": 0.013281, "grad_norm": 0.1446748673915863, "learning_rate": 1e-05, "loss": 0.0165, "step": 1328100 }, { "epoch": 0.013282, "grad_norm": 0.14919348061084747, "learning_rate": 1e-05, "loss": 0.0165, "step": 1328200 }, { "epoch": 0.013283, "grad_norm": 0.11479926109313965, "learning_rate": 1e-05, "loss": 0.0164, "step": 1328300 }, { "epoch": 0.013284, "grad_norm": 0.12295088171958923, "learning_rate": 1e-05, "loss": 0.0161, "step": 1328400 }, { "epoch": 0.013285, "grad_norm": 0.13022077083587646, "learning_rate": 1e-05, "loss": 0.0161, "step": 1328500 }, { "epoch": 0.013286, "grad_norm": 0.15584562718868256, "learning_rate": 1e-05, "loss": 0.0161, "step": 1328600 }, { "epoch": 0.013287, "grad_norm": 0.12066319584846497, "learning_rate": 1e-05, "loss": 0.0162, "step": 1328700 }, { "epoch": 0.013288, "grad_norm": 0.11106076091527939, "learning_rate": 1e-05, "loss": 0.0162, "step": 1328800 }, { "epoch": 0.013289, "grad_norm": 0.2150174230337143, "learning_rate": 1e-05, "loss": 0.0159, "step": 1328900 }, { "epoch": 0.01329, "grad_norm": 0.1339666247367859, "learning_rate": 1e-05, "loss": 0.0159, "step": 1329000 }, { "epoch": 0.013291, "grad_norm": 0.09687516838312149, "learning_rate": 1e-05, "loss": 0.0164, "step": 1329100 }, { "epoch": 0.013292, "grad_norm": 0.13464871048927307, "learning_rate": 1e-05, "loss": 0.0162, "step": 1329200 }, { "epoch": 0.013293, "grad_norm": 0.1276984065771103, "learning_rate": 1e-05, "loss": 0.0163, "step": 1329300 }, { "epoch": 0.013294, "grad_norm": 0.13316866755485535, "learning_rate": 1e-05, "loss": 0.0162, "step": 1329400 }, { "epoch": 0.013295, "grad_norm": 0.12100262939929962, "learning_rate": 1e-05, "loss": 0.016, "step": 1329500 }, { "epoch": 0.013296, "grad_norm": 0.0932469367980957, "learning_rate": 1e-05, "loss": 0.0159, "step": 1329600 }, { "epoch": 0.013297, "grad_norm": 0.18909916281700134, "learning_rate": 1e-05, "loss": 0.0161, "step": 1329700 }, { "epoch": 0.013298, "grad_norm": 0.20852626860141754, "learning_rate": 1e-05, "loss": 0.0167, "step": 1329800 }, { "epoch": 0.013299, "grad_norm": 0.13869282603263855, "learning_rate": 1e-05, "loss": 0.0161, "step": 1329900 }, { "epoch": 0.0133, "grad_norm": 0.1218959167599678, "learning_rate": 1e-05, "loss": 0.0162, "step": 1330000 }, { "epoch": 0.013301, "grad_norm": 0.12175321578979492, "learning_rate": 1e-05, "loss": 0.0162, "step": 1330100 }, { "epoch": 0.013302, "grad_norm": 0.1947467178106308, "learning_rate": 1e-05, "loss": 0.0166, "step": 1330200 }, { "epoch": 0.013303, "grad_norm": 0.13609814643859863, "learning_rate": 1e-05, "loss": 0.0163, "step": 1330300 }, { "epoch": 0.013304, "grad_norm": 0.09422723948955536, "learning_rate": 1e-05, "loss": 0.0159, "step": 1330400 }, { "epoch": 0.013305, "grad_norm": 0.11912335455417633, "learning_rate": 1e-05, "loss": 0.0161, "step": 1330500 }, { "epoch": 0.013306, "grad_norm": 0.1302841603755951, "learning_rate": 1e-05, "loss": 0.0163, "step": 1330600 }, { "epoch": 0.013307, "grad_norm": 0.1316721886396408, "learning_rate": 1e-05, "loss": 0.0168, "step": 1330700 }, { "epoch": 0.013308, "grad_norm": 0.10502658039331436, "learning_rate": 1e-05, "loss": 0.0165, "step": 1330800 }, { "epoch": 0.013309, "grad_norm": 0.14562278985977173, "learning_rate": 1e-05, "loss": 0.0161, "step": 1330900 }, { "epoch": 0.01331, "grad_norm": 0.14147748053073883, "learning_rate": 1e-05, "loss": 0.0161, "step": 1331000 }, { "epoch": 0.013311, "grad_norm": 0.15250715613365173, "learning_rate": 1e-05, "loss": 0.016, "step": 1331100 }, { "epoch": 0.013312, "grad_norm": 0.1034274697303772, "learning_rate": 1e-05, "loss": 0.016, "step": 1331200 }, { "epoch": 0.013313, "grad_norm": 0.11545298993587494, "learning_rate": 1e-05, "loss": 0.0159, "step": 1331300 }, { "epoch": 0.013314, "grad_norm": 0.12633423507213593, "learning_rate": 1e-05, "loss": 0.0162, "step": 1331400 }, { "epoch": 0.013315, "grad_norm": 0.16605746746063232, "learning_rate": 1e-05, "loss": 0.016, "step": 1331500 }, { "epoch": 0.013316, "grad_norm": 0.15132229030132294, "learning_rate": 1e-05, "loss": 0.016, "step": 1331600 }, { "epoch": 0.013317, "grad_norm": 0.1410388946533203, "learning_rate": 1e-05, "loss": 0.0162, "step": 1331700 }, { "epoch": 0.013318, "grad_norm": 0.1433151215314865, "learning_rate": 1e-05, "loss": 0.0165, "step": 1331800 }, { "epoch": 0.013319, "grad_norm": 0.1031736209988594, "learning_rate": 1e-05, "loss": 0.0159, "step": 1331900 }, { "epoch": 0.01332, "grad_norm": 0.10138449817895889, "learning_rate": 1e-05, "loss": 0.0163, "step": 1332000 }, { "epoch": 0.013321, "grad_norm": 0.09954485297203064, "learning_rate": 1e-05, "loss": 0.0164, "step": 1332100 }, { "epoch": 0.013322, "grad_norm": 0.1521698385477066, "learning_rate": 1e-05, "loss": 0.0165, "step": 1332200 }, { "epoch": 0.013323, "grad_norm": 0.1291879266500473, "learning_rate": 1e-05, "loss": 0.0166, "step": 1332300 }, { "epoch": 0.013324, "grad_norm": 0.15219886600971222, "learning_rate": 1e-05, "loss": 0.0162, "step": 1332400 }, { "epoch": 0.013325, "grad_norm": 0.20601026713848114, "learning_rate": 1e-05, "loss": 0.0162, "step": 1332500 }, { "epoch": 0.013326, "grad_norm": 0.13567252457141876, "learning_rate": 1e-05, "loss": 0.0166, "step": 1332600 }, { "epoch": 0.013327, "grad_norm": 0.14686624705791473, "learning_rate": 1e-05, "loss": 0.0161, "step": 1332700 }, { "epoch": 0.013328, "grad_norm": 0.1362001746892929, "learning_rate": 1e-05, "loss": 0.0162, "step": 1332800 }, { "epoch": 0.013329, "grad_norm": 0.11895791441202164, "learning_rate": 1e-05, "loss": 0.0163, "step": 1332900 }, { "epoch": 0.01333, "grad_norm": 0.11705677956342697, "learning_rate": 1e-05, "loss": 0.0162, "step": 1333000 }, { "epoch": 0.013331, "grad_norm": 0.124307781457901, "learning_rate": 1e-05, "loss": 0.0162, "step": 1333100 }, { "epoch": 0.013332, "grad_norm": 0.10763980448246002, "learning_rate": 1e-05, "loss": 0.0166, "step": 1333200 }, { "epoch": 0.013333, "grad_norm": 0.14075182378292084, "learning_rate": 1e-05, "loss": 0.0159, "step": 1333300 }, { "epoch": 0.013334, "grad_norm": 0.11250641196966171, "learning_rate": 1e-05, "loss": 0.0157, "step": 1333400 }, { "epoch": 0.013335, "grad_norm": 0.1061868667602539, "learning_rate": 1e-05, "loss": 0.0161, "step": 1333500 }, { "epoch": 0.013336, "grad_norm": 0.10494494438171387, "learning_rate": 1e-05, "loss": 0.0164, "step": 1333600 }, { "epoch": 0.013337, "grad_norm": 0.07874367386102676, "learning_rate": 1e-05, "loss": 0.0161, "step": 1333700 }, { "epoch": 0.013338, "grad_norm": 0.1854867786169052, "learning_rate": 1e-05, "loss": 0.0165, "step": 1333800 }, { "epoch": 0.013339, "grad_norm": 0.1438368856906891, "learning_rate": 1e-05, "loss": 0.0162, "step": 1333900 }, { "epoch": 0.01334, "grad_norm": 0.11696300655603409, "learning_rate": 1e-05, "loss": 0.0158, "step": 1334000 }, { "epoch": 0.013341, "grad_norm": 0.12186668813228607, "learning_rate": 1e-05, "loss": 0.016, "step": 1334100 }, { "epoch": 0.013342, "grad_norm": 0.1049484834074974, "learning_rate": 1e-05, "loss": 0.0162, "step": 1334200 }, { "epoch": 0.013343, "grad_norm": 0.17790794372558594, "learning_rate": 1e-05, "loss": 0.0163, "step": 1334300 }, { "epoch": 0.013344, "grad_norm": 0.09946180135011673, "learning_rate": 1e-05, "loss": 0.0165, "step": 1334400 }, { "epoch": 0.013345, "grad_norm": 0.42204055190086365, "learning_rate": 1e-05, "loss": 0.0166, "step": 1334500 }, { "epoch": 0.013346, "grad_norm": 0.1744949072599411, "learning_rate": 1e-05, "loss": 0.0163, "step": 1334600 }, { "epoch": 0.013347, "grad_norm": 0.12170768529176712, "learning_rate": 1e-05, "loss": 0.0163, "step": 1334700 }, { "epoch": 0.013348, "grad_norm": 0.1515032947063446, "learning_rate": 1e-05, "loss": 0.0162, "step": 1334800 }, { "epoch": 0.013349, "grad_norm": 0.09983762353658676, "learning_rate": 1e-05, "loss": 0.0165, "step": 1334900 }, { "epoch": 0.01335, "grad_norm": 0.114044688642025, "learning_rate": 1e-05, "loss": 0.0154, "step": 1335000 }, { "epoch": 0.013351, "grad_norm": 0.10269175469875336, "learning_rate": 1e-05, "loss": 0.0161, "step": 1335100 }, { "epoch": 0.013352, "grad_norm": 0.13245487213134766, "learning_rate": 1e-05, "loss": 0.016, "step": 1335200 }, { "epoch": 0.013353, "grad_norm": 0.14604973793029785, "learning_rate": 1e-05, "loss": 0.0162, "step": 1335300 }, { "epoch": 0.013354, "grad_norm": 0.11649958789348602, "learning_rate": 1e-05, "loss": 0.0163, "step": 1335400 }, { "epoch": 0.013355, "grad_norm": 0.16584625840187073, "learning_rate": 1e-05, "loss": 0.0163, "step": 1335500 }, { "epoch": 0.013356, "grad_norm": 0.10824941843748093, "learning_rate": 1e-05, "loss": 0.0159, "step": 1335600 }, { "epoch": 0.013357, "grad_norm": 0.12860675156116486, "learning_rate": 1e-05, "loss": 0.0159, "step": 1335700 }, { "epoch": 0.013358, "grad_norm": 0.10871832072734833, "learning_rate": 1e-05, "loss": 0.0161, "step": 1335800 }, { "epoch": 0.013359, "grad_norm": 0.13666266202926636, "learning_rate": 1e-05, "loss": 0.0161, "step": 1335900 }, { "epoch": 0.01336, "grad_norm": 0.12249309569597244, "learning_rate": 1e-05, "loss": 0.0163, "step": 1336000 }, { "epoch": 0.013361, "grad_norm": 0.09709904342889786, "learning_rate": 1e-05, "loss": 0.0164, "step": 1336100 }, { "epoch": 0.013362, "grad_norm": 0.13797612488269806, "learning_rate": 1e-05, "loss": 0.0162, "step": 1336200 }, { "epoch": 0.013363, "grad_norm": 0.13456128537654877, "learning_rate": 1e-05, "loss": 0.0162, "step": 1336300 }, { "epoch": 0.013364, "grad_norm": 0.16102559864521027, "learning_rate": 1e-05, "loss": 0.0161, "step": 1336400 }, { "epoch": 0.013365, "grad_norm": 0.10420755296945572, "learning_rate": 1e-05, "loss": 0.0161, "step": 1336500 }, { "epoch": 0.013366, "grad_norm": 0.14248746633529663, "learning_rate": 1e-05, "loss": 0.0164, "step": 1336600 }, { "epoch": 0.013367, "grad_norm": 0.10198278725147247, "learning_rate": 1e-05, "loss": 0.0167, "step": 1336700 }, { "epoch": 0.013368, "grad_norm": 0.1212625801563263, "learning_rate": 1e-05, "loss": 0.0164, "step": 1336800 }, { "epoch": 0.013369, "grad_norm": 0.1126525029540062, "learning_rate": 1e-05, "loss": 0.0166, "step": 1336900 }, { "epoch": 0.01337, "grad_norm": 0.12995527684688568, "learning_rate": 1e-05, "loss": 0.0163, "step": 1337000 }, { "epoch": 0.013371, "grad_norm": 0.11967474967241287, "learning_rate": 1e-05, "loss": 0.0162, "step": 1337100 }, { "epoch": 0.013372, "grad_norm": 0.16348446905612946, "learning_rate": 1e-05, "loss": 0.0162, "step": 1337200 }, { "epoch": 0.013373, "grad_norm": 0.1092529445886612, "learning_rate": 1e-05, "loss": 0.0156, "step": 1337300 }, { "epoch": 0.013374, "grad_norm": 0.14822177588939667, "learning_rate": 1e-05, "loss": 0.0164, "step": 1337400 }, { "epoch": 0.013375, "grad_norm": 0.14625002443790436, "learning_rate": 1e-05, "loss": 0.016, "step": 1337500 }, { "epoch": 0.013376, "grad_norm": 0.11503405123949051, "learning_rate": 1e-05, "loss": 0.0162, "step": 1337600 }, { "epoch": 0.013377, "grad_norm": 0.20265518128871918, "learning_rate": 1e-05, "loss": 0.0165, "step": 1337700 }, { "epoch": 0.013378, "grad_norm": 0.10182271152734756, "learning_rate": 1e-05, "loss": 0.0158, "step": 1337800 }, { "epoch": 0.013379, "grad_norm": 0.20562419295310974, "learning_rate": 1e-05, "loss": 0.0166, "step": 1337900 }, { "epoch": 0.01338, "grad_norm": 0.11277598887681961, "learning_rate": 1e-05, "loss": 0.0163, "step": 1338000 }, { "epoch": 0.013381, "grad_norm": 0.1368623524904251, "learning_rate": 1e-05, "loss": 0.0164, "step": 1338100 }, { "epoch": 0.013382, "grad_norm": 0.16408994793891907, "learning_rate": 1e-05, "loss": 0.0161, "step": 1338200 }, { "epoch": 0.013383, "grad_norm": 0.11099887639284134, "learning_rate": 1e-05, "loss": 0.0165, "step": 1338300 }, { "epoch": 0.013384, "grad_norm": 0.1177581325173378, "learning_rate": 1e-05, "loss": 0.0161, "step": 1338400 }, { "epoch": 0.013385, "grad_norm": 0.12943346798419952, "learning_rate": 1e-05, "loss": 0.0163, "step": 1338500 }, { "epoch": 0.013386, "grad_norm": 0.11764039099216461, "learning_rate": 1e-05, "loss": 0.0167, "step": 1338600 }, { "epoch": 0.013387, "grad_norm": 0.12330527603626251, "learning_rate": 1e-05, "loss": 0.0162, "step": 1338700 }, { "epoch": 0.013388, "grad_norm": 0.21508486568927765, "learning_rate": 1e-05, "loss": 0.0163, "step": 1338800 }, { "epoch": 0.013389, "grad_norm": 0.1260402649641037, "learning_rate": 1e-05, "loss": 0.0165, "step": 1338900 }, { "epoch": 0.01339, "grad_norm": 0.15863095223903656, "learning_rate": 1e-05, "loss": 0.0159, "step": 1339000 }, { "epoch": 0.013391, "grad_norm": 0.1334916353225708, "learning_rate": 1e-05, "loss": 0.0162, "step": 1339100 }, { "epoch": 0.013392, "grad_norm": 0.09597885608673096, "learning_rate": 1e-05, "loss": 0.0159, "step": 1339200 }, { "epoch": 0.013393, "grad_norm": 0.175143301486969, "learning_rate": 1e-05, "loss": 0.0168, "step": 1339300 }, { "epoch": 0.013394, "grad_norm": 0.12264876067638397, "learning_rate": 1e-05, "loss": 0.0161, "step": 1339400 }, { "epoch": 0.013395, "grad_norm": 0.13329659402370453, "learning_rate": 1e-05, "loss": 0.0162, "step": 1339500 }, { "epoch": 0.013396, "grad_norm": 0.10051558166742325, "learning_rate": 1e-05, "loss": 0.0159, "step": 1339600 }, { "epoch": 0.013397, "grad_norm": 0.14492587745189667, "learning_rate": 1e-05, "loss": 0.0161, "step": 1339700 }, { "epoch": 0.013398, "grad_norm": 0.11560703814029694, "learning_rate": 1e-05, "loss": 0.0163, "step": 1339800 }, { "epoch": 0.013399, "grad_norm": 0.11702567338943481, "learning_rate": 1e-05, "loss": 0.0163, "step": 1339900 }, { "epoch": 0.0134, "grad_norm": 0.1194954365491867, "learning_rate": 1e-05, "loss": 0.016, "step": 1340000 }, { "epoch": 0.0134, "eval_loss": 0.014248857274651527, "eval_runtime": 173.4791, "eval_samples_per_second": 288.219, "eval_steps_per_second": 18.014, "step": 1340000 }, { "epoch": 0.013401, "grad_norm": 0.1349240392446518, "learning_rate": 1e-05, "loss": 0.0163, "step": 1340100 }, { "epoch": 0.013402, "grad_norm": 0.14384394884109497, "learning_rate": 1e-05, "loss": 0.0161, "step": 1340200 }, { "epoch": 0.013403, "grad_norm": 0.11511562764644623, "learning_rate": 1e-05, "loss": 0.016, "step": 1340300 }, { "epoch": 0.013404, "grad_norm": 0.10244560986757278, "learning_rate": 1e-05, "loss": 0.0161, "step": 1340400 }, { "epoch": 0.013405, "grad_norm": 0.14686931669712067, "learning_rate": 1e-05, "loss": 0.0164, "step": 1340500 }, { "epoch": 0.013406, "grad_norm": 0.09742959588766098, "learning_rate": 1e-05, "loss": 0.0164, "step": 1340600 }, { "epoch": 0.013407, "grad_norm": 0.16414159536361694, "learning_rate": 1e-05, "loss": 0.016, "step": 1340700 }, { "epoch": 0.013408, "grad_norm": 0.13148660957813263, "learning_rate": 1e-05, "loss": 0.0159, "step": 1340800 }, { "epoch": 0.013409, "grad_norm": 0.12868648767471313, "learning_rate": 1e-05, "loss": 0.0165, "step": 1340900 }, { "epoch": 0.01341, "grad_norm": 0.14962218701839447, "learning_rate": 1e-05, "loss": 0.0164, "step": 1341000 }, { "epoch": 0.013411, "grad_norm": 0.1563606709241867, "learning_rate": 1e-05, "loss": 0.016, "step": 1341100 }, { "epoch": 0.013412, "grad_norm": 0.14714199304580688, "learning_rate": 1e-05, "loss": 0.0161, "step": 1341200 }, { "epoch": 0.013413, "grad_norm": 0.12014134228229523, "learning_rate": 1e-05, "loss": 0.0159, "step": 1341300 }, { "epoch": 0.013414, "grad_norm": 0.1294587403535843, "learning_rate": 1e-05, "loss": 0.0163, "step": 1341400 }, { "epoch": 0.013415, "grad_norm": 0.14960679411888123, "learning_rate": 1e-05, "loss": 0.0162, "step": 1341500 }, { "epoch": 0.013416, "grad_norm": 0.11636599153280258, "learning_rate": 1e-05, "loss": 0.0164, "step": 1341600 }, { "epoch": 0.013417, "grad_norm": 0.11883791536092758, "learning_rate": 1e-05, "loss": 0.0159, "step": 1341700 }, { "epoch": 0.013418, "grad_norm": 0.1212269589304924, "learning_rate": 1e-05, "loss": 0.0161, "step": 1341800 }, { "epoch": 0.013419, "grad_norm": 0.16742242872714996, "learning_rate": 1e-05, "loss": 0.0161, "step": 1341900 }, { "epoch": 0.01342, "grad_norm": 0.11704440414905548, "learning_rate": 1e-05, "loss": 0.0161, "step": 1342000 }, { "epoch": 0.013421, "grad_norm": 0.12432153522968292, "learning_rate": 1e-05, "loss": 0.0164, "step": 1342100 }, { "epoch": 0.013422, "grad_norm": 0.09776369482278824, "learning_rate": 1e-05, "loss": 0.0163, "step": 1342200 }, { "epoch": 0.013423, "grad_norm": 0.1473885476589203, "learning_rate": 1e-05, "loss": 0.0162, "step": 1342300 }, { "epoch": 0.013424, "grad_norm": 0.11700066179037094, "learning_rate": 1e-05, "loss": 0.0161, "step": 1342400 }, { "epoch": 0.013425, "grad_norm": 0.23876354098320007, "learning_rate": 1e-05, "loss": 0.016, "step": 1342500 }, { "epoch": 0.013426, "grad_norm": 0.12225119024515152, "learning_rate": 1e-05, "loss": 0.0159, "step": 1342600 }, { "epoch": 0.013427, "grad_norm": 0.16894668340682983, "learning_rate": 1e-05, "loss": 0.0164, "step": 1342700 }, { "epoch": 0.013428, "grad_norm": 0.078093521296978, "learning_rate": 1e-05, "loss": 0.0161, "step": 1342800 }, { "epoch": 0.013429, "grad_norm": 0.13254159688949585, "learning_rate": 1e-05, "loss": 0.0164, "step": 1342900 }, { "epoch": 0.01343, "grad_norm": 0.11705762147903442, "learning_rate": 1e-05, "loss": 0.0162, "step": 1343000 }, { "epoch": 0.013431, "grad_norm": 0.17392632365226746, "learning_rate": 1e-05, "loss": 0.0162, "step": 1343100 }, { "epoch": 0.013432, "grad_norm": 0.10567264258861542, "learning_rate": 1e-05, "loss": 0.0162, "step": 1343200 }, { "epoch": 0.013433, "grad_norm": 0.11891212314367294, "learning_rate": 1e-05, "loss": 0.0159, "step": 1343300 }, { "epoch": 0.013434, "grad_norm": 0.1002144142985344, "learning_rate": 1e-05, "loss": 0.0165, "step": 1343400 }, { "epoch": 0.013435, "grad_norm": 0.09600657224655151, "learning_rate": 1e-05, "loss": 0.0162, "step": 1343500 }, { "epoch": 0.013436, "grad_norm": 0.19006383419036865, "learning_rate": 1e-05, "loss": 0.0162, "step": 1343600 }, { "epoch": 0.013437, "grad_norm": 0.1424771100282669, "learning_rate": 1e-05, "loss": 0.016, "step": 1343700 }, { "epoch": 0.013438, "grad_norm": 0.15867629647254944, "learning_rate": 1e-05, "loss": 0.0163, "step": 1343800 }, { "epoch": 0.013439, "grad_norm": 0.1329519897699356, "learning_rate": 1e-05, "loss": 0.0161, "step": 1343900 }, { "epoch": 0.01344, "grad_norm": 0.186019629240036, "learning_rate": 1e-05, "loss": 0.0159, "step": 1344000 }, { "epoch": 0.013441, "grad_norm": 0.1053474172949791, "learning_rate": 1e-05, "loss": 0.0161, "step": 1344100 }, { "epoch": 0.013442, "grad_norm": 0.126004159450531, "learning_rate": 1e-05, "loss": 0.0162, "step": 1344200 }, { "epoch": 0.013443, "grad_norm": 0.1356130987405777, "learning_rate": 1e-05, "loss": 0.0158, "step": 1344300 }, { "epoch": 0.013444, "grad_norm": 0.13044850528240204, "learning_rate": 1e-05, "loss": 0.0163, "step": 1344400 }, { "epoch": 0.013445, "grad_norm": 0.10235224664211273, "learning_rate": 1e-05, "loss": 0.0165, "step": 1344500 }, { "epoch": 0.013446, "grad_norm": 0.16020840406417847, "learning_rate": 1e-05, "loss": 0.0164, "step": 1344600 }, { "epoch": 0.013447, "grad_norm": 0.13228966295719147, "learning_rate": 1e-05, "loss": 0.0165, "step": 1344700 }, { "epoch": 0.013448, "grad_norm": 0.11531301587820053, "learning_rate": 1e-05, "loss": 0.0161, "step": 1344800 }, { "epoch": 0.013449, "grad_norm": 0.1241195872426033, "learning_rate": 1e-05, "loss": 0.0165, "step": 1344900 }, { "epoch": 0.01345, "grad_norm": 0.13897453248500824, "learning_rate": 1e-05, "loss": 0.0163, "step": 1345000 }, { "epoch": 0.013451, "grad_norm": 0.1351250410079956, "learning_rate": 1e-05, "loss": 0.0162, "step": 1345100 }, { "epoch": 0.013452, "grad_norm": 0.1421210914850235, "learning_rate": 1e-05, "loss": 0.0161, "step": 1345200 }, { "epoch": 0.013453, "grad_norm": 0.14473161101341248, "learning_rate": 1e-05, "loss": 0.0166, "step": 1345300 }, { "epoch": 0.013454, "grad_norm": 0.1649581640958786, "learning_rate": 1e-05, "loss": 0.0158, "step": 1345400 }, { "epoch": 0.013455, "grad_norm": 0.12414558231830597, "learning_rate": 1e-05, "loss": 0.0163, "step": 1345500 }, { "epoch": 0.013456, "grad_norm": 0.11147378385066986, "learning_rate": 1e-05, "loss": 0.0162, "step": 1345600 }, { "epoch": 0.013457, "grad_norm": 0.13121430575847626, "learning_rate": 1e-05, "loss": 0.0159, "step": 1345700 }, { "epoch": 0.013458, "grad_norm": 0.1806178241968155, "learning_rate": 1e-05, "loss": 0.0162, "step": 1345800 }, { "epoch": 0.013459, "grad_norm": 0.09842171519994736, "learning_rate": 1e-05, "loss": 0.0163, "step": 1345900 }, { "epoch": 0.01346, "grad_norm": 0.1338529735803604, "learning_rate": 1e-05, "loss": 0.0161, "step": 1346000 }, { "epoch": 0.013461, "grad_norm": 0.09439342468976974, "learning_rate": 1e-05, "loss": 0.0162, "step": 1346100 }, { "epoch": 0.013462, "grad_norm": 0.12969675660133362, "learning_rate": 1e-05, "loss": 0.0162, "step": 1346200 }, { "epoch": 0.013463, "grad_norm": 0.18558809161186218, "learning_rate": 1e-05, "loss": 0.0164, "step": 1346300 }, { "epoch": 0.013464, "grad_norm": 0.07596418261528015, "learning_rate": 1e-05, "loss": 0.0161, "step": 1346400 }, { "epoch": 0.013465, "grad_norm": 0.150575652718544, "learning_rate": 1e-05, "loss": 0.0159, "step": 1346500 }, { "epoch": 0.013466, "grad_norm": 0.13432282209396362, "learning_rate": 1e-05, "loss": 0.0163, "step": 1346600 }, { "epoch": 0.013467, "grad_norm": 0.10806886106729507, "learning_rate": 1e-05, "loss": 0.0158, "step": 1346700 }, { "epoch": 0.013468, "grad_norm": 0.13938038051128387, "learning_rate": 1e-05, "loss": 0.0161, "step": 1346800 }, { "epoch": 0.013469, "grad_norm": 0.11786992847919464, "learning_rate": 1e-05, "loss": 0.0161, "step": 1346900 }, { "epoch": 0.01347, "grad_norm": 0.199789896607399, "learning_rate": 1e-05, "loss": 0.0165, "step": 1347000 }, { "epoch": 0.013471, "grad_norm": 0.13415898382663727, "learning_rate": 1e-05, "loss": 0.0161, "step": 1347100 }, { "epoch": 0.013472, "grad_norm": 0.11835692822933197, "learning_rate": 1e-05, "loss": 0.016, "step": 1347200 }, { "epoch": 0.013473, "grad_norm": 0.12051954120397568, "learning_rate": 1e-05, "loss": 0.0158, "step": 1347300 }, { "epoch": 0.013474, "grad_norm": 0.08937104046344757, "learning_rate": 1e-05, "loss": 0.0163, "step": 1347400 }, { "epoch": 0.013475, "grad_norm": 0.13956721127033234, "learning_rate": 1e-05, "loss": 0.0159, "step": 1347500 }, { "epoch": 0.013476, "grad_norm": 0.1635609269142151, "learning_rate": 1e-05, "loss": 0.0161, "step": 1347600 }, { "epoch": 0.013477, "grad_norm": 0.1447392851114273, "learning_rate": 1e-05, "loss": 0.016, "step": 1347700 }, { "epoch": 0.013478, "grad_norm": 0.1080499216914177, "learning_rate": 1e-05, "loss": 0.0159, "step": 1347800 }, { "epoch": 0.013479, "grad_norm": 0.14149542152881622, "learning_rate": 1e-05, "loss": 0.0162, "step": 1347900 }, { "epoch": 0.01348, "grad_norm": 0.13723868131637573, "learning_rate": 1e-05, "loss": 0.016, "step": 1348000 }, { "epoch": 0.013481, "grad_norm": 0.13055157661437988, "learning_rate": 1e-05, "loss": 0.0162, "step": 1348100 }, { "epoch": 0.013482, "grad_norm": 0.13344421982765198, "learning_rate": 1e-05, "loss": 0.0162, "step": 1348200 }, { "epoch": 0.013483, "grad_norm": 0.1226157620549202, "learning_rate": 1e-05, "loss": 0.0163, "step": 1348300 }, { "epoch": 0.013484, "grad_norm": 0.1044110357761383, "learning_rate": 1e-05, "loss": 0.0161, "step": 1348400 }, { "epoch": 0.013485, "grad_norm": 0.10326346009969711, "learning_rate": 1e-05, "loss": 0.0163, "step": 1348500 }, { "epoch": 0.013486, "grad_norm": 0.12607644498348236, "learning_rate": 1e-05, "loss": 0.0165, "step": 1348600 }, { "epoch": 0.013487, "grad_norm": 0.13773535192012787, "learning_rate": 1e-05, "loss": 0.0163, "step": 1348700 }, { "epoch": 0.013488, "grad_norm": 0.11861514300107956, "learning_rate": 1e-05, "loss": 0.0163, "step": 1348800 }, { "epoch": 0.013489, "grad_norm": 0.1433177888393402, "learning_rate": 1e-05, "loss": 0.0164, "step": 1348900 }, { "epoch": 0.01349, "grad_norm": 0.11010066419839859, "learning_rate": 1e-05, "loss": 0.0161, "step": 1349000 }, { "epoch": 0.013491, "grad_norm": 0.1515919268131256, "learning_rate": 1e-05, "loss": 0.0163, "step": 1349100 }, { "epoch": 0.013492, "grad_norm": 0.1111433207988739, "learning_rate": 1e-05, "loss": 0.0164, "step": 1349200 }, { "epoch": 0.013493, "grad_norm": 0.1572328805923462, "learning_rate": 1e-05, "loss": 0.0162, "step": 1349300 }, { "epoch": 0.013494, "grad_norm": 0.11761195957660675, "learning_rate": 1e-05, "loss": 0.0169, "step": 1349400 }, { "epoch": 0.013495, "grad_norm": 0.173399418592453, "learning_rate": 1e-05, "loss": 0.0161, "step": 1349500 }, { "epoch": 0.013496, "grad_norm": 0.09862564504146576, "learning_rate": 1e-05, "loss": 0.0161, "step": 1349600 }, { "epoch": 0.013497, "grad_norm": 0.11665821820497513, "learning_rate": 1e-05, "loss": 0.0161, "step": 1349700 }, { "epoch": 0.013498, "grad_norm": 0.11159809678792953, "learning_rate": 1e-05, "loss": 0.0161, "step": 1349800 }, { "epoch": 0.013499, "grad_norm": 0.142875075340271, "learning_rate": 1e-05, "loss": 0.0166, "step": 1349900 }, { "epoch": 0.0135, "grad_norm": 0.14725714921951294, "learning_rate": 1e-05, "loss": 0.0168, "step": 1350000 }, { "epoch": 0.013501, "grad_norm": 0.08478040993213654, "learning_rate": 1e-05, "loss": 0.0163, "step": 1350100 }, { "epoch": 0.013502, "grad_norm": 0.1351030021905899, "learning_rate": 1e-05, "loss": 0.016, "step": 1350200 }, { "epoch": 0.013503, "grad_norm": 0.17397566139698029, "learning_rate": 1e-05, "loss": 0.0163, "step": 1350300 }, { "epoch": 0.013504, "grad_norm": 0.19756928086280823, "learning_rate": 1e-05, "loss": 0.016, "step": 1350400 }, { "epoch": 0.013505, "grad_norm": 0.12053833156824112, "learning_rate": 1e-05, "loss": 0.0158, "step": 1350500 }, { "epoch": 0.013506, "grad_norm": 0.15231235325336456, "learning_rate": 1e-05, "loss": 0.0161, "step": 1350600 }, { "epoch": 0.013507, "grad_norm": 0.1056063175201416, "learning_rate": 1e-05, "loss": 0.0164, "step": 1350700 }, { "epoch": 0.013508, "grad_norm": 0.1427895724773407, "learning_rate": 1e-05, "loss": 0.0159, "step": 1350800 }, { "epoch": 0.013509, "grad_norm": 0.08992701023817062, "learning_rate": 1e-05, "loss": 0.0159, "step": 1350900 }, { "epoch": 0.01351, "grad_norm": 0.11683517694473267, "learning_rate": 1e-05, "loss": 0.0163, "step": 1351000 }, { "epoch": 0.013511, "grad_norm": 0.12737147510051727, "learning_rate": 1e-05, "loss": 0.0162, "step": 1351100 }, { "epoch": 0.013512, "grad_norm": 0.121894471347332, "learning_rate": 1e-05, "loss": 0.0162, "step": 1351200 }, { "epoch": 0.013513, "grad_norm": 0.23051005601882935, "learning_rate": 1e-05, "loss": 0.016, "step": 1351300 }, { "epoch": 0.013514, "grad_norm": 0.09733094274997711, "learning_rate": 1e-05, "loss": 0.016, "step": 1351400 }, { "epoch": 0.013515, "grad_norm": 0.11809241026639938, "learning_rate": 1e-05, "loss": 0.0161, "step": 1351500 }, { "epoch": 0.013516, "grad_norm": 0.12893886864185333, "learning_rate": 1e-05, "loss": 0.0161, "step": 1351600 }, { "epoch": 0.013517, "grad_norm": 0.15044911205768585, "learning_rate": 1e-05, "loss": 0.0155, "step": 1351700 }, { "epoch": 0.013518, "grad_norm": 0.13902068138122559, "learning_rate": 1e-05, "loss": 0.0162, "step": 1351800 }, { "epoch": 0.013519, "grad_norm": 0.09895825386047363, "learning_rate": 1e-05, "loss": 0.0161, "step": 1351900 }, { "epoch": 0.01352, "grad_norm": 0.12115538120269775, "learning_rate": 1e-05, "loss": 0.0158, "step": 1352000 }, { "epoch": 0.013521, "grad_norm": 0.1386166512966156, "learning_rate": 1e-05, "loss": 0.0158, "step": 1352100 }, { "epoch": 0.013522, "grad_norm": 0.10826811194419861, "learning_rate": 1e-05, "loss": 0.016, "step": 1352200 }, { "epoch": 0.013523, "grad_norm": 0.14412686228752136, "learning_rate": 1e-05, "loss": 0.0159, "step": 1352300 }, { "epoch": 0.013524, "grad_norm": 0.12216375023126602, "learning_rate": 1e-05, "loss": 0.0161, "step": 1352400 }, { "epoch": 0.013525, "grad_norm": 0.12516368925571442, "learning_rate": 1e-05, "loss": 0.016, "step": 1352500 }, { "epoch": 0.013526, "grad_norm": 0.14888666570186615, "learning_rate": 1e-05, "loss": 0.0161, "step": 1352600 }, { "epoch": 0.013527, "grad_norm": 0.09743274748325348, "learning_rate": 1e-05, "loss": 0.0163, "step": 1352700 }, { "epoch": 0.013528, "grad_norm": 0.1457269936800003, "learning_rate": 1e-05, "loss": 0.0159, "step": 1352800 }, { "epoch": 0.013529, "grad_norm": 0.11074167490005493, "learning_rate": 1e-05, "loss": 0.0163, "step": 1352900 }, { "epoch": 0.01353, "grad_norm": 0.11907835304737091, "learning_rate": 1e-05, "loss": 0.0162, "step": 1353000 }, { "epoch": 0.013531, "grad_norm": 0.11461029946804047, "learning_rate": 1e-05, "loss": 0.0161, "step": 1353100 }, { "epoch": 0.013532, "grad_norm": 0.23454612493515015, "learning_rate": 1e-05, "loss": 0.0157, "step": 1353200 }, { "epoch": 0.013533, "grad_norm": 0.11696672439575195, "learning_rate": 1e-05, "loss": 0.0161, "step": 1353300 }, { "epoch": 0.013534, "grad_norm": 0.0876341462135315, "learning_rate": 1e-05, "loss": 0.0158, "step": 1353400 }, { "epoch": 0.013535, "grad_norm": 0.119034543633461, "learning_rate": 1e-05, "loss": 0.0159, "step": 1353500 }, { "epoch": 0.013536, "grad_norm": 0.1566765457391739, "learning_rate": 1e-05, "loss": 0.0162, "step": 1353600 }, { "epoch": 0.013537, "grad_norm": 0.13741017878055573, "learning_rate": 1e-05, "loss": 0.0162, "step": 1353700 }, { "epoch": 0.013538, "grad_norm": 0.14879091084003448, "learning_rate": 1e-05, "loss": 0.0163, "step": 1353800 }, { "epoch": 0.013539, "grad_norm": 0.13046281039714813, "learning_rate": 1e-05, "loss": 0.0159, "step": 1353900 }, { "epoch": 0.01354, "grad_norm": 0.12165812402963638, "learning_rate": 1e-05, "loss": 0.0164, "step": 1354000 }, { "epoch": 0.013541, "grad_norm": 0.14396154880523682, "learning_rate": 1e-05, "loss": 0.0159, "step": 1354100 }, { "epoch": 0.013542, "grad_norm": 0.1525970846414566, "learning_rate": 1e-05, "loss": 0.0158, "step": 1354200 }, { "epoch": 0.013543, "grad_norm": 0.133986696600914, "learning_rate": 1e-05, "loss": 0.0161, "step": 1354300 }, { "epoch": 0.013544, "grad_norm": 0.08759519457817078, "learning_rate": 1e-05, "loss": 0.0156, "step": 1354400 }, { "epoch": 0.013545, "grad_norm": 0.10095717757940292, "learning_rate": 1e-05, "loss": 0.0161, "step": 1354500 }, { "epoch": 0.013546, "grad_norm": 0.1010761708021164, "learning_rate": 1e-05, "loss": 0.0159, "step": 1354600 }, { "epoch": 0.013547, "grad_norm": 0.122161366045475, "learning_rate": 1e-05, "loss": 0.0163, "step": 1354700 }, { "epoch": 0.013548, "grad_norm": 0.15017981827259064, "learning_rate": 1e-05, "loss": 0.0166, "step": 1354800 }, { "epoch": 0.013549, "grad_norm": 0.12520456314086914, "learning_rate": 1e-05, "loss": 0.0163, "step": 1354900 }, { "epoch": 0.01355, "grad_norm": 0.10833262652158737, "learning_rate": 1e-05, "loss": 0.0156, "step": 1355000 }, { "epoch": 0.013551, "grad_norm": 0.09918901324272156, "learning_rate": 1e-05, "loss": 0.0162, "step": 1355100 }, { "epoch": 0.013552, "grad_norm": 0.07204120606184006, "learning_rate": 1e-05, "loss": 0.0163, "step": 1355200 }, { "epoch": 0.013553, "grad_norm": 0.13993801176548004, "learning_rate": 1e-05, "loss": 0.0163, "step": 1355300 }, { "epoch": 0.013554, "grad_norm": 0.11797252297401428, "learning_rate": 1e-05, "loss": 0.0164, "step": 1355400 }, { "epoch": 0.013555, "grad_norm": 0.17738503217697144, "learning_rate": 1e-05, "loss": 0.0164, "step": 1355500 }, { "epoch": 0.013556, "grad_norm": 0.12691818177700043, "learning_rate": 1e-05, "loss": 0.0161, "step": 1355600 }, { "epoch": 0.013557, "grad_norm": 0.13286443054676056, "learning_rate": 1e-05, "loss": 0.016, "step": 1355700 }, { "epoch": 0.013558, "grad_norm": 0.10135279595851898, "learning_rate": 1e-05, "loss": 0.016, "step": 1355800 }, { "epoch": 0.013559, "grad_norm": 0.1420484483242035, "learning_rate": 1e-05, "loss": 0.0166, "step": 1355900 }, { "epoch": 0.01356, "grad_norm": 0.13291426002979279, "learning_rate": 1e-05, "loss": 0.0166, "step": 1356000 }, { "epoch": 0.013561, "grad_norm": 0.13082247972488403, "learning_rate": 1e-05, "loss": 0.0163, "step": 1356100 }, { "epoch": 0.013562, "grad_norm": 0.12374965846538544, "learning_rate": 1e-05, "loss": 0.0159, "step": 1356200 }, { "epoch": 0.013563, "grad_norm": 0.17262403666973114, "learning_rate": 1e-05, "loss": 0.0164, "step": 1356300 }, { "epoch": 0.013564, "grad_norm": 0.08657687157392502, "learning_rate": 1e-05, "loss": 0.0165, "step": 1356400 }, { "epoch": 0.013565, "grad_norm": 0.18884125351905823, "learning_rate": 1e-05, "loss": 0.0164, "step": 1356500 }, { "epoch": 0.013566, "grad_norm": 0.1128341481089592, "learning_rate": 1e-05, "loss": 0.0163, "step": 1356600 }, { "epoch": 0.013567, "grad_norm": 0.1278291642665863, "learning_rate": 1e-05, "loss": 0.0164, "step": 1356700 }, { "epoch": 0.013568, "grad_norm": 0.14815174043178558, "learning_rate": 1e-05, "loss": 0.0163, "step": 1356800 }, { "epoch": 0.013569, "grad_norm": 0.12066008150577545, "learning_rate": 1e-05, "loss": 0.0159, "step": 1356900 }, { "epoch": 0.01357, "grad_norm": 0.16465550661087036, "learning_rate": 1e-05, "loss": 0.0159, "step": 1357000 }, { "epoch": 0.013571, "grad_norm": 0.09802985936403275, "learning_rate": 1e-05, "loss": 0.0159, "step": 1357100 }, { "epoch": 0.013572, "grad_norm": 0.12949685752391815, "learning_rate": 1e-05, "loss": 0.016, "step": 1357200 }, { "epoch": 0.013573, "grad_norm": 0.12193460017442703, "learning_rate": 1e-05, "loss": 0.016, "step": 1357300 }, { "epoch": 0.013574, "grad_norm": 0.16742536425590515, "learning_rate": 1e-05, "loss": 0.0164, "step": 1357400 }, { "epoch": 0.013575, "grad_norm": 0.15211647748947144, "learning_rate": 1e-05, "loss": 0.0161, "step": 1357500 }, { "epoch": 0.013576, "grad_norm": 0.10691481828689575, "learning_rate": 1e-05, "loss": 0.0161, "step": 1357600 }, { "epoch": 0.013577, "grad_norm": 0.1587217152118683, "learning_rate": 1e-05, "loss": 0.0158, "step": 1357700 }, { "epoch": 0.013578, "grad_norm": 0.12687668204307556, "learning_rate": 1e-05, "loss": 0.0162, "step": 1357800 }, { "epoch": 0.013579, "grad_norm": 0.139754518866539, "learning_rate": 1e-05, "loss": 0.0161, "step": 1357900 }, { "epoch": 0.01358, "grad_norm": 0.15015172958374023, "learning_rate": 1e-05, "loss": 0.0159, "step": 1358000 }, { "epoch": 0.013581, "grad_norm": 0.11980485916137695, "learning_rate": 1e-05, "loss": 0.0163, "step": 1358100 }, { "epoch": 0.013582, "grad_norm": 0.12260890007019043, "learning_rate": 1e-05, "loss": 0.0163, "step": 1358200 }, { "epoch": 0.013583, "grad_norm": 0.21514102816581726, "learning_rate": 1e-05, "loss": 0.0162, "step": 1358300 }, { "epoch": 0.013584, "grad_norm": 0.1406431645154953, "learning_rate": 1e-05, "loss": 0.0165, "step": 1358400 }, { "epoch": 0.013585, "grad_norm": 0.15733207762241364, "learning_rate": 1e-05, "loss": 0.0159, "step": 1358500 }, { "epoch": 0.013586, "grad_norm": 0.11232259124517441, "learning_rate": 1e-05, "loss": 0.0159, "step": 1358600 }, { "epoch": 0.013587, "grad_norm": 0.10901650786399841, "learning_rate": 1e-05, "loss": 0.0158, "step": 1358700 }, { "epoch": 0.013588, "grad_norm": 0.11222003400325775, "learning_rate": 1e-05, "loss": 0.0162, "step": 1358800 }, { "epoch": 0.013589, "grad_norm": 0.09546849131584167, "learning_rate": 1e-05, "loss": 0.0161, "step": 1358900 }, { "epoch": 0.01359, "grad_norm": 0.15468095242977142, "learning_rate": 1e-05, "loss": 0.0159, "step": 1359000 }, { "epoch": 0.013591, "grad_norm": 0.10783927142620087, "learning_rate": 1e-05, "loss": 0.0162, "step": 1359100 }, { "epoch": 0.013592, "grad_norm": 0.14513474702835083, "learning_rate": 1e-05, "loss": 0.0156, "step": 1359200 }, { "epoch": 0.013593, "grad_norm": 0.1360807716846466, "learning_rate": 1e-05, "loss": 0.0163, "step": 1359300 }, { "epoch": 0.013594, "grad_norm": 0.12331721931695938, "learning_rate": 1e-05, "loss": 0.0161, "step": 1359400 }, { "epoch": 0.013595, "grad_norm": 0.12415486574172974, "learning_rate": 1e-05, "loss": 0.0156, "step": 1359500 }, { "epoch": 0.013596, "grad_norm": 0.1137884259223938, "learning_rate": 1e-05, "loss": 0.0161, "step": 1359600 }, { "epoch": 0.013597, "grad_norm": 0.10449658334255219, "learning_rate": 1e-05, "loss": 0.0161, "step": 1359700 }, { "epoch": 0.013598, "grad_norm": 0.1385415494441986, "learning_rate": 1e-05, "loss": 0.0161, "step": 1359800 }, { "epoch": 0.013599, "grad_norm": 0.12057428807020187, "learning_rate": 1e-05, "loss": 0.0159, "step": 1359900 }, { "epoch": 0.0136, "grad_norm": 0.15518781542778015, "learning_rate": 1e-05, "loss": 0.0157, "step": 1360000 }, { "epoch": 0.0136, "eval_loss": 0.014514348469674587, "eval_runtime": 172.4178, "eval_samples_per_second": 289.993, "eval_steps_per_second": 18.125, "step": 1360000 }, { "epoch": 0.013601, "grad_norm": 0.1132740005850792, "learning_rate": 1e-05, "loss": 0.0161, "step": 1360100 }, { "epoch": 0.013602, "grad_norm": 0.12870629131793976, "learning_rate": 1e-05, "loss": 0.0157, "step": 1360200 }, { "epoch": 0.013603, "grad_norm": 0.12867847084999084, "learning_rate": 1e-05, "loss": 0.0162, "step": 1360300 }, { "epoch": 0.013604, "grad_norm": 0.12577340006828308, "learning_rate": 1e-05, "loss": 0.0166, "step": 1360400 }, { "epoch": 0.013605, "grad_norm": 0.14132261276245117, "learning_rate": 1e-05, "loss": 0.0165, "step": 1360500 }, { "epoch": 0.013606, "grad_norm": 0.11221453547477722, "learning_rate": 1e-05, "loss": 0.016, "step": 1360600 }, { "epoch": 0.013607, "grad_norm": 0.11132636666297913, "learning_rate": 1e-05, "loss": 0.0158, "step": 1360700 }, { "epoch": 0.013608, "grad_norm": 0.15362760424613953, "learning_rate": 1e-05, "loss": 0.0161, "step": 1360800 }, { "epoch": 0.013609, "grad_norm": 0.12930594384670258, "learning_rate": 1e-05, "loss": 0.0159, "step": 1360900 }, { "epoch": 0.01361, "grad_norm": 0.18294037878513336, "learning_rate": 1e-05, "loss": 0.016, "step": 1361000 }, { "epoch": 0.013611, "grad_norm": 0.10593156516551971, "learning_rate": 1e-05, "loss": 0.0161, "step": 1361100 }, { "epoch": 0.013612, "grad_norm": 0.13551364839076996, "learning_rate": 1e-05, "loss": 0.0161, "step": 1361200 }, { "epoch": 0.013613, "grad_norm": 0.12052325904369354, "learning_rate": 1e-05, "loss": 0.0166, "step": 1361300 }, { "epoch": 0.013614, "grad_norm": 0.1685929298400879, "learning_rate": 1e-05, "loss": 0.0166, "step": 1361400 }, { "epoch": 0.013615, "grad_norm": 0.1422627866268158, "learning_rate": 1e-05, "loss": 0.0158, "step": 1361500 }, { "epoch": 0.013616, "grad_norm": 0.1384575217962265, "learning_rate": 1e-05, "loss": 0.0159, "step": 1361600 }, { "epoch": 0.013617, "grad_norm": 0.12381318211555481, "learning_rate": 1e-05, "loss": 0.0161, "step": 1361700 }, { "epoch": 0.013618, "grad_norm": 0.1525225192308426, "learning_rate": 1e-05, "loss": 0.0161, "step": 1361800 }, { "epoch": 0.013619, "grad_norm": 0.15670956671237946, "learning_rate": 1e-05, "loss": 0.0164, "step": 1361900 }, { "epoch": 0.01362, "grad_norm": 0.09367363899946213, "learning_rate": 1e-05, "loss": 0.0157, "step": 1362000 }, { "epoch": 0.013621, "grad_norm": 0.12845218181610107, "learning_rate": 1e-05, "loss": 0.016, "step": 1362100 }, { "epoch": 0.013622, "grad_norm": 0.1180339828133583, "learning_rate": 1e-05, "loss": 0.0161, "step": 1362200 }, { "epoch": 0.013623, "grad_norm": 0.12948991358280182, "learning_rate": 1e-05, "loss": 0.0162, "step": 1362300 }, { "epoch": 0.013624, "grad_norm": 0.1633019745349884, "learning_rate": 1e-05, "loss": 0.0165, "step": 1362400 }, { "epoch": 0.013625, "grad_norm": 0.12367766350507736, "learning_rate": 1e-05, "loss": 0.0157, "step": 1362500 }, { "epoch": 0.013626, "grad_norm": 0.15428628027439117, "learning_rate": 1e-05, "loss": 0.016, "step": 1362600 }, { "epoch": 0.013627, "grad_norm": 0.3062315583229065, "learning_rate": 1e-05, "loss": 0.0162, "step": 1362700 }, { "epoch": 0.013628, "grad_norm": 0.15459588170051575, "learning_rate": 1e-05, "loss": 0.0159, "step": 1362800 }, { "epoch": 0.013629, "grad_norm": 0.12351477891206741, "learning_rate": 1e-05, "loss": 0.0159, "step": 1362900 }, { "epoch": 0.01363, "grad_norm": 0.10199648141860962, "learning_rate": 1e-05, "loss": 0.0158, "step": 1363000 }, { "epoch": 0.013631, "grad_norm": 0.15986435115337372, "learning_rate": 1e-05, "loss": 0.0162, "step": 1363100 }, { "epoch": 0.013632, "grad_norm": 0.12135066092014313, "learning_rate": 1e-05, "loss": 0.0162, "step": 1363200 }, { "epoch": 0.013633, "grad_norm": 0.1512099951505661, "learning_rate": 1e-05, "loss": 0.0163, "step": 1363300 }, { "epoch": 0.013634, "grad_norm": 0.11645647138357162, "learning_rate": 1e-05, "loss": 0.0161, "step": 1363400 }, { "epoch": 0.013635, "grad_norm": 0.13456448912620544, "learning_rate": 1e-05, "loss": 0.0164, "step": 1363500 }, { "epoch": 0.013636, "grad_norm": 0.12000387161970139, "learning_rate": 1e-05, "loss": 0.0158, "step": 1363600 }, { "epoch": 0.013637, "grad_norm": 0.10846920311450958, "learning_rate": 1e-05, "loss": 0.0155, "step": 1363700 }, { "epoch": 0.013638, "grad_norm": 0.12041802704334259, "learning_rate": 1e-05, "loss": 0.0161, "step": 1363800 }, { "epoch": 0.013639, "grad_norm": 0.17610785365104675, "learning_rate": 1e-05, "loss": 0.0159, "step": 1363900 }, { "epoch": 0.01364, "grad_norm": 0.10645934194326401, "learning_rate": 1e-05, "loss": 0.0157, "step": 1364000 }, { "epoch": 0.013641, "grad_norm": 0.12398876249790192, "learning_rate": 1e-05, "loss": 0.0166, "step": 1364100 }, { "epoch": 0.013642, "grad_norm": 0.1284686177968979, "learning_rate": 1e-05, "loss": 0.0157, "step": 1364200 }, { "epoch": 0.013643, "grad_norm": 0.09731802344322205, "learning_rate": 1e-05, "loss": 0.0159, "step": 1364300 }, { "epoch": 0.013644, "grad_norm": 0.12553749978542328, "learning_rate": 1e-05, "loss": 0.0159, "step": 1364400 }, { "epoch": 0.013645, "grad_norm": 0.13750584423542023, "learning_rate": 1e-05, "loss": 0.0165, "step": 1364500 }, { "epoch": 0.013646, "grad_norm": 0.15719304978847504, "learning_rate": 1e-05, "loss": 0.0158, "step": 1364600 }, { "epoch": 0.013647, "grad_norm": 0.11469672620296478, "learning_rate": 1e-05, "loss": 0.016, "step": 1364700 }, { "epoch": 0.013648, "grad_norm": 0.12301254272460938, "learning_rate": 1e-05, "loss": 0.0164, "step": 1364800 }, { "epoch": 0.013649, "grad_norm": 0.18372507393360138, "learning_rate": 1e-05, "loss": 0.016, "step": 1364900 }, { "epoch": 0.01365, "grad_norm": 0.16365037858486176, "learning_rate": 1e-05, "loss": 0.0169, "step": 1365000 }, { "epoch": 0.013651, "grad_norm": 0.09903956204652786, "learning_rate": 1e-05, "loss": 0.0161, "step": 1365100 }, { "epoch": 0.013652, "grad_norm": 0.10994822531938553, "learning_rate": 1e-05, "loss": 0.016, "step": 1365200 }, { "epoch": 0.013653, "grad_norm": 0.14383432269096375, "learning_rate": 1e-05, "loss": 0.0164, "step": 1365300 }, { "epoch": 0.013654, "grad_norm": 0.15823614597320557, "learning_rate": 1e-05, "loss": 0.0159, "step": 1365400 }, { "epoch": 0.013655, "grad_norm": 0.11465776711702347, "learning_rate": 1e-05, "loss": 0.0158, "step": 1365500 }, { "epoch": 0.013656, "grad_norm": 0.15034951269626617, "learning_rate": 1e-05, "loss": 0.0163, "step": 1365600 }, { "epoch": 0.013657, "grad_norm": 0.15324723720550537, "learning_rate": 1e-05, "loss": 0.0163, "step": 1365700 }, { "epoch": 0.013658, "grad_norm": 0.13716191053390503, "learning_rate": 1e-05, "loss": 0.0162, "step": 1365800 }, { "epoch": 0.013659, "grad_norm": 0.09919816255569458, "learning_rate": 1e-05, "loss": 0.0162, "step": 1365900 }, { "epoch": 0.01366, "grad_norm": 0.12044075131416321, "learning_rate": 1e-05, "loss": 0.0159, "step": 1366000 }, { "epoch": 0.013661, "grad_norm": 0.12292826920747757, "learning_rate": 1e-05, "loss": 0.0159, "step": 1366100 }, { "epoch": 0.013662, "grad_norm": 0.10910460352897644, "learning_rate": 1e-05, "loss": 0.0161, "step": 1366200 }, { "epoch": 0.013663, "grad_norm": 0.1197357028722763, "learning_rate": 1e-05, "loss": 0.0158, "step": 1366300 }, { "epoch": 0.013664, "grad_norm": 0.10454928129911423, "learning_rate": 1e-05, "loss": 0.0162, "step": 1366400 }, { "epoch": 0.013665, "grad_norm": 0.10853870958089828, "learning_rate": 1e-05, "loss": 0.0164, "step": 1366500 }, { "epoch": 0.013666, "grad_norm": 0.11211688071489334, "learning_rate": 1e-05, "loss": 0.016, "step": 1366600 }, { "epoch": 0.013667, "grad_norm": 0.12334369122982025, "learning_rate": 1e-05, "loss": 0.0159, "step": 1366700 }, { "epoch": 0.013668, "grad_norm": 0.12170184403657913, "learning_rate": 1e-05, "loss": 0.0162, "step": 1366800 }, { "epoch": 0.013669, "grad_norm": 0.09179303795099258, "learning_rate": 1e-05, "loss": 0.0154, "step": 1366900 }, { "epoch": 0.01367, "grad_norm": 0.10908716171979904, "learning_rate": 1e-05, "loss": 0.0159, "step": 1367000 }, { "epoch": 0.013671, "grad_norm": 0.15080612897872925, "learning_rate": 1e-05, "loss": 0.0167, "step": 1367100 }, { "epoch": 0.013672, "grad_norm": 0.14695122838020325, "learning_rate": 1e-05, "loss": 0.0158, "step": 1367200 }, { "epoch": 0.013673, "grad_norm": 0.1489763855934143, "learning_rate": 1e-05, "loss": 0.0159, "step": 1367300 }, { "epoch": 0.013674, "grad_norm": 0.14462576806545258, "learning_rate": 1e-05, "loss": 0.0157, "step": 1367400 }, { "epoch": 0.013675, "grad_norm": 0.11080989241600037, "learning_rate": 1e-05, "loss": 0.0162, "step": 1367500 }, { "epoch": 0.013676, "grad_norm": 0.12049354612827301, "learning_rate": 1e-05, "loss": 0.016, "step": 1367600 }, { "epoch": 0.013677, "grad_norm": 0.10904203355312347, "learning_rate": 1e-05, "loss": 0.0163, "step": 1367700 }, { "epoch": 0.013678, "grad_norm": 0.11728449165821075, "learning_rate": 1e-05, "loss": 0.0166, "step": 1367800 }, { "epoch": 0.013679, "grad_norm": 0.10888795554637909, "learning_rate": 1e-05, "loss": 0.0162, "step": 1367900 }, { "epoch": 0.01368, "grad_norm": 0.12430769950151443, "learning_rate": 1e-05, "loss": 0.0161, "step": 1368000 }, { "epoch": 0.013681, "grad_norm": 0.1189616248011589, "learning_rate": 1e-05, "loss": 0.0165, "step": 1368100 }, { "epoch": 0.013682, "grad_norm": 0.09769096970558167, "learning_rate": 1e-05, "loss": 0.016, "step": 1368200 }, { "epoch": 0.013683, "grad_norm": 0.15271982550621033, "learning_rate": 1e-05, "loss": 0.0165, "step": 1368300 }, { "epoch": 0.013684, "grad_norm": 0.13043104112148285, "learning_rate": 1e-05, "loss": 0.0158, "step": 1368400 }, { "epoch": 0.013685, "grad_norm": 0.12322390079498291, "learning_rate": 1e-05, "loss": 0.0168, "step": 1368500 }, { "epoch": 0.013686, "grad_norm": 0.10735155642032623, "learning_rate": 1e-05, "loss": 0.016, "step": 1368600 }, { "epoch": 0.013687, "grad_norm": 0.10011187195777893, "learning_rate": 1e-05, "loss": 0.0158, "step": 1368700 }, { "epoch": 0.013688, "grad_norm": 0.10424554347991943, "learning_rate": 1e-05, "loss": 0.0165, "step": 1368800 }, { "epoch": 0.013689, "grad_norm": 0.10433629900217056, "learning_rate": 1e-05, "loss": 0.0161, "step": 1368900 }, { "epoch": 0.01369, "grad_norm": 0.13005714118480682, "learning_rate": 1e-05, "loss": 0.016, "step": 1369000 }, { "epoch": 0.013691, "grad_norm": 0.11390773952007294, "learning_rate": 1e-05, "loss": 0.0164, "step": 1369100 }, { "epoch": 0.013692, "grad_norm": 0.150357186794281, "learning_rate": 1e-05, "loss": 0.0166, "step": 1369200 }, { "epoch": 0.013693, "grad_norm": 0.10823795944452286, "learning_rate": 1e-05, "loss": 0.0161, "step": 1369300 }, { "epoch": 0.013694, "grad_norm": 0.14341847598552704, "learning_rate": 1e-05, "loss": 0.016, "step": 1369400 }, { "epoch": 0.013695, "grad_norm": 0.0890200287103653, "learning_rate": 1e-05, "loss": 0.016, "step": 1369500 }, { "epoch": 0.013696, "grad_norm": 0.1296790987253189, "learning_rate": 1e-05, "loss": 0.0159, "step": 1369600 }, { "epoch": 0.013697, "grad_norm": 0.14273366332054138, "learning_rate": 1e-05, "loss": 0.0161, "step": 1369700 }, { "epoch": 0.013698, "grad_norm": 0.11923115700483322, "learning_rate": 1e-05, "loss": 0.0161, "step": 1369800 }, { "epoch": 0.013699, "grad_norm": 0.10338013619184494, "learning_rate": 1e-05, "loss": 0.0159, "step": 1369900 }, { "epoch": 0.0137, "grad_norm": 0.10754505544900894, "learning_rate": 1e-05, "loss": 0.0161, "step": 1370000 }, { "epoch": 0.013701, "grad_norm": 0.1361694484949112, "learning_rate": 1e-05, "loss": 0.0161, "step": 1370100 }, { "epoch": 0.013702, "grad_norm": 0.1718742698431015, "learning_rate": 1e-05, "loss": 0.0162, "step": 1370200 }, { "epoch": 0.013703, "grad_norm": 0.09823288023471832, "learning_rate": 1e-05, "loss": 0.016, "step": 1370300 }, { "epoch": 0.013704, "grad_norm": 0.17074953019618988, "learning_rate": 1e-05, "loss": 0.0162, "step": 1370400 }, { "epoch": 0.013705, "grad_norm": 0.1074686348438263, "learning_rate": 1e-05, "loss": 0.0159, "step": 1370500 }, { "epoch": 0.013706, "grad_norm": 0.10450301319360733, "learning_rate": 1e-05, "loss": 0.0157, "step": 1370600 }, { "epoch": 0.013707, "grad_norm": 0.11011835932731628, "learning_rate": 1e-05, "loss": 0.0158, "step": 1370700 }, { "epoch": 0.013708, "grad_norm": 0.12121903151273727, "learning_rate": 1e-05, "loss": 0.0158, "step": 1370800 }, { "epoch": 0.013709, "grad_norm": 0.15887956321239471, "learning_rate": 1e-05, "loss": 0.016, "step": 1370900 }, { "epoch": 0.01371, "grad_norm": 0.11496194452047348, "learning_rate": 1e-05, "loss": 0.0156, "step": 1371000 }, { "epoch": 0.013711, "grad_norm": 0.11505766212940216, "learning_rate": 1e-05, "loss": 0.0157, "step": 1371100 }, { "epoch": 0.013712, "grad_norm": 0.0991431400179863, "learning_rate": 1e-05, "loss": 0.0162, "step": 1371200 }, { "epoch": 0.013713, "grad_norm": 0.12539032101631165, "learning_rate": 1e-05, "loss": 0.016, "step": 1371300 }, { "epoch": 0.013714, "grad_norm": 0.11749105900526047, "learning_rate": 1e-05, "loss": 0.0163, "step": 1371400 }, { "epoch": 0.013715, "grad_norm": 0.10562967509031296, "learning_rate": 1e-05, "loss": 0.0162, "step": 1371500 }, { "epoch": 0.013716, "grad_norm": 0.15808477997779846, "learning_rate": 1e-05, "loss": 0.0163, "step": 1371600 }, { "epoch": 0.013717, "grad_norm": 0.1507311314344406, "learning_rate": 1e-05, "loss": 0.0161, "step": 1371700 }, { "epoch": 0.013718, "grad_norm": 0.13683387637138367, "learning_rate": 1e-05, "loss": 0.0161, "step": 1371800 }, { "epoch": 0.013719, "grad_norm": 0.11565684527158737, "learning_rate": 1e-05, "loss": 0.0158, "step": 1371900 }, { "epoch": 0.01372, "grad_norm": 0.09215231984853745, "learning_rate": 1e-05, "loss": 0.0164, "step": 1372000 }, { "epoch": 0.013721, "grad_norm": 0.1283218264579773, "learning_rate": 1e-05, "loss": 0.0164, "step": 1372100 }, { "epoch": 0.013722, "grad_norm": 0.10663194954395294, "learning_rate": 1e-05, "loss": 0.0161, "step": 1372200 }, { "epoch": 0.013723, "grad_norm": 0.1342303454875946, "learning_rate": 1e-05, "loss": 0.0161, "step": 1372300 }, { "epoch": 0.013724, "grad_norm": 0.12530139088630676, "learning_rate": 1e-05, "loss": 0.0157, "step": 1372400 }, { "epoch": 0.013725, "grad_norm": 0.11629973351955414, "learning_rate": 1e-05, "loss": 0.0158, "step": 1372500 }, { "epoch": 0.013726, "grad_norm": 0.11360698193311691, "learning_rate": 1e-05, "loss": 0.0159, "step": 1372600 }, { "epoch": 0.013727, "grad_norm": 0.15098348259925842, "learning_rate": 1e-05, "loss": 0.0162, "step": 1372700 }, { "epoch": 0.013728, "grad_norm": 0.1385694295167923, "learning_rate": 1e-05, "loss": 0.0156, "step": 1372800 }, { "epoch": 0.013729, "grad_norm": 0.12507787346839905, "learning_rate": 1e-05, "loss": 0.0162, "step": 1372900 }, { "epoch": 0.01373, "grad_norm": 0.1218867301940918, "learning_rate": 1e-05, "loss": 0.0159, "step": 1373000 }, { "epoch": 0.013731, "grad_norm": 0.1617507040500641, "learning_rate": 1e-05, "loss": 0.0163, "step": 1373100 }, { "epoch": 0.013732, "grad_norm": 0.1416895091533661, "learning_rate": 1e-05, "loss": 0.0158, "step": 1373200 }, { "epoch": 0.013733, "grad_norm": 0.1388232558965683, "learning_rate": 1e-05, "loss": 0.0154, "step": 1373300 }, { "epoch": 0.013734, "grad_norm": 0.16926397383213043, "learning_rate": 1e-05, "loss": 0.0161, "step": 1373400 }, { "epoch": 0.013735, "grad_norm": 0.11437075585126877, "learning_rate": 1e-05, "loss": 0.0162, "step": 1373500 }, { "epoch": 0.013736, "grad_norm": 0.14341197907924652, "learning_rate": 1e-05, "loss": 0.0157, "step": 1373600 }, { "epoch": 0.013737, "grad_norm": 0.17095236480236053, "learning_rate": 1e-05, "loss": 0.0163, "step": 1373700 }, { "epoch": 0.013738, "grad_norm": 0.13402961194515228, "learning_rate": 1e-05, "loss": 0.0162, "step": 1373800 }, { "epoch": 0.013739, "grad_norm": 0.113771453499794, "learning_rate": 1e-05, "loss": 0.016, "step": 1373900 }, { "epoch": 0.01374, "grad_norm": 0.11452247202396393, "learning_rate": 1e-05, "loss": 0.0163, "step": 1374000 }, { "epoch": 0.013741, "grad_norm": 0.1089392676949501, "learning_rate": 1e-05, "loss": 0.0161, "step": 1374100 }, { "epoch": 0.013742, "grad_norm": 0.10375480353832245, "learning_rate": 1e-05, "loss": 0.0159, "step": 1374200 }, { "epoch": 0.013743, "grad_norm": 0.16025160253047943, "learning_rate": 1e-05, "loss": 0.0159, "step": 1374300 }, { "epoch": 0.013744, "grad_norm": 0.09150442481040955, "learning_rate": 1e-05, "loss": 0.0161, "step": 1374400 }, { "epoch": 0.013745, "grad_norm": 0.12660673260688782, "learning_rate": 1e-05, "loss": 0.0157, "step": 1374500 }, { "epoch": 0.013746, "grad_norm": 0.1428544819355011, "learning_rate": 1e-05, "loss": 0.0167, "step": 1374600 }, { "epoch": 0.013747, "grad_norm": 0.0994538813829422, "learning_rate": 1e-05, "loss": 0.0162, "step": 1374700 }, { "epoch": 0.013748, "grad_norm": 0.12014321982860565, "learning_rate": 1e-05, "loss": 0.016, "step": 1374800 }, { "epoch": 0.013749, "grad_norm": 0.12528547644615173, "learning_rate": 1e-05, "loss": 0.0163, "step": 1374900 }, { "epoch": 0.01375, "grad_norm": 0.10313377529382706, "learning_rate": 1e-05, "loss": 0.0161, "step": 1375000 }, { "epoch": 0.013751, "grad_norm": 0.13248947262763977, "learning_rate": 1e-05, "loss": 0.0162, "step": 1375100 }, { "epoch": 0.013752, "grad_norm": 0.14056093990802765, "learning_rate": 1e-05, "loss": 0.0163, "step": 1375200 }, { "epoch": 0.013753, "grad_norm": 0.10610973089933395, "learning_rate": 1e-05, "loss": 0.0159, "step": 1375300 }, { "epoch": 0.013754, "grad_norm": 0.10601384192705154, "learning_rate": 1e-05, "loss": 0.0162, "step": 1375400 }, { "epoch": 0.013755, "grad_norm": 0.13130366802215576, "learning_rate": 1e-05, "loss": 0.0161, "step": 1375500 }, { "epoch": 0.013756, "grad_norm": 0.11775937676429749, "learning_rate": 1e-05, "loss": 0.0165, "step": 1375600 }, { "epoch": 0.013757, "grad_norm": 0.10963563621044159, "learning_rate": 1e-05, "loss": 0.0159, "step": 1375700 }, { "epoch": 0.013758, "grad_norm": 0.10564207285642624, "learning_rate": 1e-05, "loss": 0.016, "step": 1375800 }, { "epoch": 0.013759, "grad_norm": 0.11714114993810654, "learning_rate": 1e-05, "loss": 0.0162, "step": 1375900 }, { "epoch": 0.01376, "grad_norm": 0.13261666893959045, "learning_rate": 1e-05, "loss": 0.0159, "step": 1376000 }, { "epoch": 0.013761, "grad_norm": 0.12215732038021088, "learning_rate": 1e-05, "loss": 0.0157, "step": 1376100 }, { "epoch": 0.013762, "grad_norm": 0.14886917173862457, "learning_rate": 1e-05, "loss": 0.016, "step": 1376200 }, { "epoch": 0.013763, "grad_norm": 0.11936166882514954, "learning_rate": 1e-05, "loss": 0.0163, "step": 1376300 }, { "epoch": 0.013764, "grad_norm": 0.12318406254053116, "learning_rate": 1e-05, "loss": 0.0159, "step": 1376400 }, { "epoch": 0.013765, "grad_norm": 0.10330426692962646, "learning_rate": 1e-05, "loss": 0.0162, "step": 1376500 }, { "epoch": 0.013766, "grad_norm": 0.17937365174293518, "learning_rate": 1e-05, "loss": 0.0159, "step": 1376600 }, { "epoch": 0.013767, "grad_norm": 0.15281693637371063, "learning_rate": 1e-05, "loss": 0.0162, "step": 1376700 }, { "epoch": 0.013768, "grad_norm": 0.10977499932050705, "learning_rate": 1e-05, "loss": 0.0158, "step": 1376800 }, { "epoch": 0.013769, "grad_norm": 0.1122654601931572, "learning_rate": 1e-05, "loss": 0.0155, "step": 1376900 }, { "epoch": 0.01377, "grad_norm": 0.13125941157341003, "learning_rate": 1e-05, "loss": 0.0162, "step": 1377000 }, { "epoch": 0.013771, "grad_norm": 0.1157495304942131, "learning_rate": 1e-05, "loss": 0.0161, "step": 1377100 }, { "epoch": 0.013772, "grad_norm": 0.13293728232383728, "learning_rate": 1e-05, "loss": 0.0163, "step": 1377200 }, { "epoch": 0.013773, "grad_norm": 0.13515929877758026, "learning_rate": 1e-05, "loss": 0.016, "step": 1377300 }, { "epoch": 0.013774, "grad_norm": 0.09928301721811295, "learning_rate": 1e-05, "loss": 0.0164, "step": 1377400 }, { "epoch": 0.013775, "grad_norm": 0.10386615246534348, "learning_rate": 1e-05, "loss": 0.0163, "step": 1377500 }, { "epoch": 0.013776, "grad_norm": 0.1453794538974762, "learning_rate": 1e-05, "loss": 0.0161, "step": 1377600 }, { "epoch": 0.013777, "grad_norm": 0.1398731768131256, "learning_rate": 1e-05, "loss": 0.0163, "step": 1377700 }, { "epoch": 0.013778, "grad_norm": 0.10831214487552643, "learning_rate": 1e-05, "loss": 0.0163, "step": 1377800 }, { "epoch": 0.013779, "grad_norm": 0.1269521415233612, "learning_rate": 1e-05, "loss": 0.0159, "step": 1377900 }, { "epoch": 0.01378, "grad_norm": 0.11862418800592422, "learning_rate": 1e-05, "loss": 0.0161, "step": 1378000 }, { "epoch": 0.013781, "grad_norm": 0.16337235271930695, "learning_rate": 1e-05, "loss": 0.0163, "step": 1378100 }, { "epoch": 0.013782, "grad_norm": 0.1090797558426857, "learning_rate": 1e-05, "loss": 0.016, "step": 1378200 }, { "epoch": 0.013783, "grad_norm": 0.11402099579572678, "learning_rate": 1e-05, "loss": 0.0159, "step": 1378300 }, { "epoch": 0.013784, "grad_norm": 0.11527585983276367, "learning_rate": 1e-05, "loss": 0.0159, "step": 1378400 }, { "epoch": 0.013785, "grad_norm": 0.12282057851552963, "learning_rate": 1e-05, "loss": 0.0166, "step": 1378500 }, { "epoch": 0.013786, "grad_norm": 0.17155061662197113, "learning_rate": 1e-05, "loss": 0.0158, "step": 1378600 }, { "epoch": 0.013787, "grad_norm": 0.10065263509750366, "learning_rate": 1e-05, "loss": 0.0155, "step": 1378700 }, { "epoch": 0.013788, "grad_norm": 0.0994124785065651, "learning_rate": 1e-05, "loss": 0.0161, "step": 1378800 }, { "epoch": 0.013789, "grad_norm": 0.12342745810747147, "learning_rate": 1e-05, "loss": 0.0158, "step": 1378900 }, { "epoch": 0.01379, "grad_norm": 0.1068306416273117, "learning_rate": 1e-05, "loss": 0.0161, "step": 1379000 }, { "epoch": 0.013791, "grad_norm": 0.1305331438779831, "learning_rate": 1e-05, "loss": 0.0162, "step": 1379100 }, { "epoch": 0.013792, "grad_norm": 0.13052856922149658, "learning_rate": 1e-05, "loss": 0.0162, "step": 1379200 }, { "epoch": 0.013793, "grad_norm": 0.1327943503856659, "learning_rate": 1e-05, "loss": 0.016, "step": 1379300 }, { "epoch": 0.013794, "grad_norm": 0.12546342611312866, "learning_rate": 1e-05, "loss": 0.0158, "step": 1379400 }, { "epoch": 0.013795, "grad_norm": 0.1076110228896141, "learning_rate": 1e-05, "loss": 0.0161, "step": 1379500 }, { "epoch": 0.013796, "grad_norm": 0.10994234681129456, "learning_rate": 1e-05, "loss": 0.0157, "step": 1379600 }, { "epoch": 0.013797, "grad_norm": 0.13698890805244446, "learning_rate": 1e-05, "loss": 0.0162, "step": 1379700 }, { "epoch": 0.013798, "grad_norm": 0.14449936151504517, "learning_rate": 1e-05, "loss": 0.0162, "step": 1379800 }, { "epoch": 0.013799, "grad_norm": 0.11190523207187653, "learning_rate": 1e-05, "loss": 0.0158, "step": 1379900 }, { "epoch": 0.0138, "grad_norm": 0.11828823387622833, "learning_rate": 1e-05, "loss": 0.016, "step": 1380000 }, { "epoch": 0.0138, "eval_loss": 0.014379137195646763, "eval_runtime": 181.794, "eval_samples_per_second": 275.037, "eval_steps_per_second": 17.19, "step": 1380000 }, { "epoch": 0.013801, "grad_norm": 0.12600548565387726, "learning_rate": 1e-05, "loss": 0.0158, "step": 1380100 }, { "epoch": 0.013802, "grad_norm": 0.1685108095407486, "learning_rate": 1e-05, "loss": 0.016, "step": 1380200 }, { "epoch": 0.013803, "grad_norm": 0.0730600580573082, "learning_rate": 1e-05, "loss": 0.0158, "step": 1380300 }, { "epoch": 0.013804, "grad_norm": 0.09467963129281998, "learning_rate": 1e-05, "loss": 0.0157, "step": 1380400 }, { "epoch": 0.013805, "grad_norm": 0.12067090719938278, "learning_rate": 1e-05, "loss": 0.0159, "step": 1380500 }, { "epoch": 0.013806, "grad_norm": 0.09001097083091736, "learning_rate": 1e-05, "loss": 0.0162, "step": 1380600 }, { "epoch": 0.013807, "grad_norm": 0.12082430720329285, "learning_rate": 1e-05, "loss": 0.0161, "step": 1380700 }, { "epoch": 0.013808, "grad_norm": 0.14899571239948273, "learning_rate": 1e-05, "loss": 0.0158, "step": 1380800 }, { "epoch": 0.013809, "grad_norm": 0.14431628584861755, "learning_rate": 1e-05, "loss": 0.0161, "step": 1380900 }, { "epoch": 0.01381, "grad_norm": 0.12305117398500443, "learning_rate": 1e-05, "loss": 0.016, "step": 1381000 }, { "epoch": 0.013811, "grad_norm": 0.12073507905006409, "learning_rate": 1e-05, "loss": 0.0156, "step": 1381100 }, { "epoch": 0.013812, "grad_norm": 0.11650898307561874, "learning_rate": 1e-05, "loss": 0.0161, "step": 1381200 }, { "epoch": 0.013813, "grad_norm": 0.09429006278514862, "learning_rate": 1e-05, "loss": 0.0166, "step": 1381300 }, { "epoch": 0.013814, "grad_norm": 0.10184783488512039, "learning_rate": 1e-05, "loss": 0.0159, "step": 1381400 }, { "epoch": 0.013815, "grad_norm": 0.1764727532863617, "learning_rate": 1e-05, "loss": 0.016, "step": 1381500 }, { "epoch": 0.013816, "grad_norm": 0.1561637818813324, "learning_rate": 1e-05, "loss": 0.0157, "step": 1381600 }, { "epoch": 0.013817, "grad_norm": 0.14003199338912964, "learning_rate": 1e-05, "loss": 0.0161, "step": 1381700 }, { "epoch": 0.013818, "grad_norm": 0.1350497454404831, "learning_rate": 1e-05, "loss": 0.0162, "step": 1381800 }, { "epoch": 0.013819, "grad_norm": 0.14918212592601776, "learning_rate": 1e-05, "loss": 0.0158, "step": 1381900 }, { "epoch": 0.01382, "grad_norm": 0.10436917841434479, "learning_rate": 1e-05, "loss": 0.0158, "step": 1382000 }, { "epoch": 0.013821, "grad_norm": 0.17350125312805176, "learning_rate": 1e-05, "loss": 0.016, "step": 1382100 }, { "epoch": 0.013822, "grad_norm": 0.1299065798521042, "learning_rate": 1e-05, "loss": 0.0157, "step": 1382200 }, { "epoch": 0.013823, "grad_norm": 0.10725600272417068, "learning_rate": 1e-05, "loss": 0.0159, "step": 1382300 }, { "epoch": 0.013824, "grad_norm": 0.1024770513176918, "learning_rate": 1e-05, "loss": 0.0162, "step": 1382400 }, { "epoch": 0.013825, "grad_norm": 0.13823270797729492, "learning_rate": 1e-05, "loss": 0.0162, "step": 1382500 }, { "epoch": 0.013826, "grad_norm": 0.10713586211204529, "learning_rate": 1e-05, "loss": 0.0163, "step": 1382600 }, { "epoch": 0.013827, "grad_norm": 0.1371476799249649, "learning_rate": 1e-05, "loss": 0.016, "step": 1382700 }, { "epoch": 0.013828, "grad_norm": 0.14235332608222961, "learning_rate": 1e-05, "loss": 0.0161, "step": 1382800 }, { "epoch": 0.013829, "grad_norm": 0.10622101277112961, "learning_rate": 1e-05, "loss": 0.0159, "step": 1382900 }, { "epoch": 0.01383, "grad_norm": 0.10308799892663956, "learning_rate": 1e-05, "loss": 0.0161, "step": 1383000 }, { "epoch": 0.013831, "grad_norm": 0.15448325872421265, "learning_rate": 1e-05, "loss": 0.0157, "step": 1383100 }, { "epoch": 0.013832, "grad_norm": 0.15574997663497925, "learning_rate": 1e-05, "loss": 0.0159, "step": 1383200 }, { "epoch": 0.013833, "grad_norm": 0.12497047334909439, "learning_rate": 1e-05, "loss": 0.0159, "step": 1383300 }, { "epoch": 0.013834, "grad_norm": 0.109726183116436, "learning_rate": 1e-05, "loss": 0.016, "step": 1383400 }, { "epoch": 0.013835, "grad_norm": 0.10607220232486725, "learning_rate": 1e-05, "loss": 0.0159, "step": 1383500 }, { "epoch": 0.013836, "grad_norm": 0.13322244584560394, "learning_rate": 1e-05, "loss": 0.0157, "step": 1383600 }, { "epoch": 0.013837, "grad_norm": 0.09437932074069977, "learning_rate": 1e-05, "loss": 0.0161, "step": 1383700 }, { "epoch": 0.013838, "grad_norm": 0.12458586692810059, "learning_rate": 1e-05, "loss": 0.0161, "step": 1383800 }, { "epoch": 0.013839, "grad_norm": 0.15879516303539276, "learning_rate": 1e-05, "loss": 0.0159, "step": 1383900 }, { "epoch": 0.01384, "grad_norm": 0.0946231409907341, "learning_rate": 1e-05, "loss": 0.0158, "step": 1384000 }, { "epoch": 0.013841, "grad_norm": 0.16435833275318146, "learning_rate": 1e-05, "loss": 0.0157, "step": 1384100 }, { "epoch": 0.013842, "grad_norm": 0.11157827079296112, "learning_rate": 1e-05, "loss": 0.0163, "step": 1384200 }, { "epoch": 0.013843, "grad_norm": 0.11176595091819763, "learning_rate": 1e-05, "loss": 0.0165, "step": 1384300 }, { "epoch": 0.013844, "grad_norm": 0.1133665218949318, "learning_rate": 1e-05, "loss": 0.016, "step": 1384400 }, { "epoch": 0.013845, "grad_norm": 0.14587725698947906, "learning_rate": 1e-05, "loss": 0.0159, "step": 1384500 }, { "epoch": 0.013846, "grad_norm": 0.11329390108585358, "learning_rate": 1e-05, "loss": 0.0163, "step": 1384600 }, { "epoch": 0.013847, "grad_norm": 0.12197856605052948, "learning_rate": 1e-05, "loss": 0.0163, "step": 1384700 }, { "epoch": 0.013848, "grad_norm": 0.09476014971733093, "learning_rate": 1e-05, "loss": 0.0161, "step": 1384800 }, { "epoch": 0.013849, "grad_norm": 0.11038997024297714, "learning_rate": 1e-05, "loss": 0.0156, "step": 1384900 }, { "epoch": 0.01385, "grad_norm": 0.1368790566921234, "learning_rate": 1e-05, "loss": 0.0162, "step": 1385000 }, { "epoch": 0.013851, "grad_norm": 0.11730264872312546, "learning_rate": 1e-05, "loss": 0.016, "step": 1385100 }, { "epoch": 0.013852, "grad_norm": 0.1257360279560089, "learning_rate": 1e-05, "loss": 0.016, "step": 1385200 }, { "epoch": 0.013853, "grad_norm": 0.13698583841323853, "learning_rate": 1e-05, "loss": 0.0156, "step": 1385300 }, { "epoch": 0.013854, "grad_norm": 0.11994138360023499, "learning_rate": 1e-05, "loss": 0.0163, "step": 1385400 }, { "epoch": 0.013855, "grad_norm": 0.10802754759788513, "learning_rate": 1e-05, "loss": 0.0158, "step": 1385500 }, { "epoch": 0.013856, "grad_norm": 0.11975641548633575, "learning_rate": 1e-05, "loss": 0.0161, "step": 1385600 }, { "epoch": 0.013857, "grad_norm": 0.1410323828458786, "learning_rate": 1e-05, "loss": 0.0162, "step": 1385700 }, { "epoch": 0.013858, "grad_norm": 0.10279117524623871, "learning_rate": 1e-05, "loss": 0.0159, "step": 1385800 }, { "epoch": 0.013859, "grad_norm": 0.11804615706205368, "learning_rate": 1e-05, "loss": 0.0159, "step": 1385900 }, { "epoch": 0.01386, "grad_norm": 0.0985248014330864, "learning_rate": 1e-05, "loss": 0.0165, "step": 1386000 }, { "epoch": 0.013861, "grad_norm": 0.1937997192144394, "learning_rate": 1e-05, "loss": 0.0162, "step": 1386100 }, { "epoch": 0.013862, "grad_norm": 0.113199882209301, "learning_rate": 1e-05, "loss": 0.016, "step": 1386200 }, { "epoch": 0.013863, "grad_norm": 0.112772636115551, "learning_rate": 1e-05, "loss": 0.0154, "step": 1386300 }, { "epoch": 0.013864, "grad_norm": 0.08404560387134552, "learning_rate": 1e-05, "loss": 0.0161, "step": 1386400 }, { "epoch": 0.013865, "grad_norm": 0.12305912375450134, "learning_rate": 1e-05, "loss": 0.0163, "step": 1386500 }, { "epoch": 0.013866, "grad_norm": 0.09376222640275955, "learning_rate": 1e-05, "loss": 0.0158, "step": 1386600 }, { "epoch": 0.013867, "grad_norm": 0.1054474487900734, "learning_rate": 1e-05, "loss": 0.0157, "step": 1386700 }, { "epoch": 0.013868, "grad_norm": 0.13912461698055267, "learning_rate": 1e-05, "loss": 0.0161, "step": 1386800 }, { "epoch": 0.013869, "grad_norm": 0.15266558527946472, "learning_rate": 1e-05, "loss": 0.0159, "step": 1386900 }, { "epoch": 0.01387, "grad_norm": 0.10539045184850693, "learning_rate": 1e-05, "loss": 0.0163, "step": 1387000 }, { "epoch": 0.013871, "grad_norm": 0.10747305303812027, "learning_rate": 1e-05, "loss": 0.0161, "step": 1387100 }, { "epoch": 0.013872, "grad_norm": 0.0915454775094986, "learning_rate": 1e-05, "loss": 0.0158, "step": 1387200 }, { "epoch": 0.013873, "grad_norm": 0.1363687664270401, "learning_rate": 1e-05, "loss": 0.0157, "step": 1387300 }, { "epoch": 0.013874, "grad_norm": 0.10487410426139832, "learning_rate": 1e-05, "loss": 0.0161, "step": 1387400 }, { "epoch": 0.013875, "grad_norm": 0.15623681247234344, "learning_rate": 1e-05, "loss": 0.0165, "step": 1387500 }, { "epoch": 0.013876, "grad_norm": 0.08724755793809891, "learning_rate": 1e-05, "loss": 0.0162, "step": 1387600 }, { "epoch": 0.013877, "grad_norm": 0.1321588009595871, "learning_rate": 1e-05, "loss": 0.0161, "step": 1387700 }, { "epoch": 0.013878, "grad_norm": 0.1386040598154068, "learning_rate": 1e-05, "loss": 0.0159, "step": 1387800 }, { "epoch": 0.013879, "grad_norm": 0.07088316231966019, "learning_rate": 1e-05, "loss": 0.0157, "step": 1387900 }, { "epoch": 0.01388, "grad_norm": 0.1092010885477066, "learning_rate": 1e-05, "loss": 0.016, "step": 1388000 }, { "epoch": 0.013881, "grad_norm": 0.12727943062782288, "learning_rate": 1e-05, "loss": 0.0156, "step": 1388100 }, { "epoch": 0.013882, "grad_norm": 0.12529002130031586, "learning_rate": 1e-05, "loss": 0.0164, "step": 1388200 }, { "epoch": 0.013883, "grad_norm": 0.19361628592014313, "learning_rate": 1e-05, "loss": 0.016, "step": 1388300 }, { "epoch": 0.013884, "grad_norm": 0.11172652244567871, "learning_rate": 1e-05, "loss": 0.016, "step": 1388400 }, { "epoch": 0.013885, "grad_norm": 0.1140981912612915, "learning_rate": 1e-05, "loss": 0.0164, "step": 1388500 }, { "epoch": 0.013886, "grad_norm": 0.09263898432254791, "learning_rate": 1e-05, "loss": 0.0159, "step": 1388600 }, { "epoch": 0.013887, "grad_norm": 0.10879357159137726, "learning_rate": 1e-05, "loss": 0.016, "step": 1388700 }, { "epoch": 0.013888, "grad_norm": 0.11168143898248672, "learning_rate": 1e-05, "loss": 0.0164, "step": 1388800 }, { "epoch": 0.013889, "grad_norm": 0.16117675602436066, "learning_rate": 1e-05, "loss": 0.0161, "step": 1388900 }, { "epoch": 0.01389, "grad_norm": 0.11238590627908707, "learning_rate": 1e-05, "loss": 0.0162, "step": 1389000 }, { "epoch": 0.013891, "grad_norm": 0.1305629014968872, "learning_rate": 1e-05, "loss": 0.0161, "step": 1389100 }, { "epoch": 0.013892, "grad_norm": 0.09031955897808075, "learning_rate": 1e-05, "loss": 0.0157, "step": 1389200 }, { "epoch": 0.013893, "grad_norm": 0.1395203322172165, "learning_rate": 1e-05, "loss": 0.0161, "step": 1389300 }, { "epoch": 0.013894, "grad_norm": 0.1264759600162506, "learning_rate": 1e-05, "loss": 0.0159, "step": 1389400 }, { "epoch": 0.013895, "grad_norm": 0.1666557639837265, "learning_rate": 1e-05, "loss": 0.016, "step": 1389500 }, { "epoch": 0.013896, "grad_norm": 0.11125408113002777, "learning_rate": 1e-05, "loss": 0.0161, "step": 1389600 }, { "epoch": 0.013897, "grad_norm": 0.11658768355846405, "learning_rate": 1e-05, "loss": 0.016, "step": 1389700 }, { "epoch": 0.013898, "grad_norm": 0.0942181870341301, "learning_rate": 1e-05, "loss": 0.0158, "step": 1389800 }, { "epoch": 0.013899, "grad_norm": 0.13381628692150116, "learning_rate": 1e-05, "loss": 0.0156, "step": 1389900 }, { "epoch": 0.0139, "grad_norm": 0.13071101903915405, "learning_rate": 1e-05, "loss": 0.016, "step": 1390000 }, { "epoch": 0.013901, "grad_norm": 0.09085896611213684, "learning_rate": 1e-05, "loss": 0.0158, "step": 1390100 }, { "epoch": 0.013902, "grad_norm": 0.11728326231241226, "learning_rate": 1e-05, "loss": 0.0164, "step": 1390200 }, { "epoch": 0.013903, "grad_norm": 0.12110377103090286, "learning_rate": 1e-05, "loss": 0.0158, "step": 1390300 }, { "epoch": 0.013904, "grad_norm": 0.1348002851009369, "learning_rate": 1e-05, "loss": 0.0163, "step": 1390400 }, { "epoch": 0.013905, "grad_norm": 0.1271951049566269, "learning_rate": 1e-05, "loss": 0.0158, "step": 1390500 }, { "epoch": 0.013906, "grad_norm": 0.16682186722755432, "learning_rate": 1e-05, "loss": 0.0163, "step": 1390600 }, { "epoch": 0.013907, "grad_norm": 0.16475450992584229, "learning_rate": 1e-05, "loss": 0.016, "step": 1390700 }, { "epoch": 0.013908, "grad_norm": 0.11202455312013626, "learning_rate": 1e-05, "loss": 0.0164, "step": 1390800 }, { "epoch": 0.013909, "grad_norm": 0.1507350653409958, "learning_rate": 1e-05, "loss": 0.0162, "step": 1390900 }, { "epoch": 0.01391, "grad_norm": 0.12206334620714188, "learning_rate": 1e-05, "loss": 0.016, "step": 1391000 }, { "epoch": 0.013911, "grad_norm": 0.08356495946645737, "learning_rate": 1e-05, "loss": 0.016, "step": 1391100 }, { "epoch": 0.013912, "grad_norm": 0.11210621893405914, "learning_rate": 1e-05, "loss": 0.0161, "step": 1391200 }, { "epoch": 0.013913, "grad_norm": 0.1515340507030487, "learning_rate": 1e-05, "loss": 0.0158, "step": 1391300 }, { "epoch": 0.013914, "grad_norm": 0.10919418931007385, "learning_rate": 1e-05, "loss": 0.0153, "step": 1391400 }, { "epoch": 0.013915, "grad_norm": 0.12100593745708466, "learning_rate": 1e-05, "loss": 0.0159, "step": 1391500 }, { "epoch": 0.013916, "grad_norm": 0.11124768108129501, "learning_rate": 1e-05, "loss": 0.016, "step": 1391600 }, { "epoch": 0.013917, "grad_norm": 0.11989603191614151, "learning_rate": 1e-05, "loss": 0.016, "step": 1391700 }, { "epoch": 0.013918, "grad_norm": 0.13761214911937714, "learning_rate": 1e-05, "loss": 0.0159, "step": 1391800 }, { "epoch": 0.013919, "grad_norm": 0.15476448833942413, "learning_rate": 1e-05, "loss": 0.0161, "step": 1391900 }, { "epoch": 0.01392, "grad_norm": 0.11731066554784775, "learning_rate": 1e-05, "loss": 0.0161, "step": 1392000 }, { "epoch": 0.013921, "grad_norm": 0.1224251464009285, "learning_rate": 1e-05, "loss": 0.016, "step": 1392100 }, { "epoch": 0.013922, "grad_norm": 0.11760392040014267, "learning_rate": 1e-05, "loss": 0.0159, "step": 1392200 }, { "epoch": 0.013923, "grad_norm": 0.15217767655849457, "learning_rate": 1e-05, "loss": 0.0162, "step": 1392300 }, { "epoch": 0.013924, "grad_norm": 0.16177628934383392, "learning_rate": 1e-05, "loss": 0.0161, "step": 1392400 }, { "epoch": 0.013925, "grad_norm": 0.127277672290802, "learning_rate": 1e-05, "loss": 0.0157, "step": 1392500 }, { "epoch": 0.013926, "grad_norm": 0.11262700706720352, "learning_rate": 1e-05, "loss": 0.0161, "step": 1392600 }, { "epoch": 0.013927, "grad_norm": 0.1332496851682663, "learning_rate": 1e-05, "loss": 0.0164, "step": 1392700 }, { "epoch": 0.013928, "grad_norm": 0.10444599390029907, "learning_rate": 1e-05, "loss": 0.0157, "step": 1392800 }, { "epoch": 0.013929, "grad_norm": 0.12969976663589478, "learning_rate": 1e-05, "loss": 0.0159, "step": 1392900 }, { "epoch": 0.01393, "grad_norm": 0.10779740661382675, "learning_rate": 1e-05, "loss": 0.0158, "step": 1393000 }, { "epoch": 0.013931, "grad_norm": 0.12614628672599792, "learning_rate": 1e-05, "loss": 0.0159, "step": 1393100 }, { "epoch": 0.013932, "grad_norm": 0.12633772194385529, "learning_rate": 1e-05, "loss": 0.0161, "step": 1393200 }, { "epoch": 0.013933, "grad_norm": 0.12658526003360748, "learning_rate": 1e-05, "loss": 0.0161, "step": 1393300 }, { "epoch": 0.013934, "grad_norm": 0.11079376935958862, "learning_rate": 1e-05, "loss": 0.0157, "step": 1393400 }, { "epoch": 0.013935, "grad_norm": 0.09668028354644775, "learning_rate": 1e-05, "loss": 0.0157, "step": 1393500 }, { "epoch": 0.013936, "grad_norm": 0.10756119340658188, "learning_rate": 1e-05, "loss": 0.016, "step": 1393600 }, { "epoch": 0.013937, "grad_norm": 0.13064377009868622, "learning_rate": 1e-05, "loss": 0.0157, "step": 1393700 }, { "epoch": 0.013938, "grad_norm": 0.09235972166061401, "learning_rate": 1e-05, "loss": 0.0164, "step": 1393800 }, { "epoch": 0.013939, "grad_norm": 0.09809034317731857, "learning_rate": 1e-05, "loss": 0.0163, "step": 1393900 }, { "epoch": 0.01394, "grad_norm": 0.15534041821956635, "learning_rate": 1e-05, "loss": 0.0161, "step": 1394000 }, { "epoch": 0.013941, "grad_norm": 0.1335708200931549, "learning_rate": 1e-05, "loss": 0.016, "step": 1394100 }, { "epoch": 0.013942, "grad_norm": 0.15792202949523926, "learning_rate": 1e-05, "loss": 0.0161, "step": 1394200 }, { "epoch": 0.013943, "grad_norm": 0.18204815685749054, "learning_rate": 1e-05, "loss": 0.016, "step": 1394300 }, { "epoch": 0.013944, "grad_norm": 0.10363347083330154, "learning_rate": 1e-05, "loss": 0.0161, "step": 1394400 }, { "epoch": 0.013945, "grad_norm": 0.1192498654127121, "learning_rate": 1e-05, "loss": 0.0162, "step": 1394500 }, { "epoch": 0.013946, "grad_norm": 0.15314675867557526, "learning_rate": 1e-05, "loss": 0.0163, "step": 1394600 }, { "epoch": 0.013947, "grad_norm": 0.11102167516946793, "learning_rate": 1e-05, "loss": 0.0158, "step": 1394700 }, { "epoch": 0.013948, "grad_norm": 0.08676799386739731, "learning_rate": 1e-05, "loss": 0.0159, "step": 1394800 }, { "epoch": 0.013949, "grad_norm": 0.13463643193244934, "learning_rate": 1e-05, "loss": 0.0161, "step": 1394900 }, { "epoch": 0.01395, "grad_norm": 0.10517618805170059, "learning_rate": 1e-05, "loss": 0.0161, "step": 1395000 }, { "epoch": 0.013951, "grad_norm": 0.1515015959739685, "learning_rate": 1e-05, "loss": 0.0161, "step": 1395100 }, { "epoch": 0.013952, "grad_norm": 0.10842452198266983, "learning_rate": 1e-05, "loss": 0.0161, "step": 1395200 }, { "epoch": 0.013953, "grad_norm": 0.139829620718956, "learning_rate": 1e-05, "loss": 0.016, "step": 1395300 }, { "epoch": 0.013954, "grad_norm": 0.10727687180042267, "learning_rate": 1e-05, "loss": 0.016, "step": 1395400 }, { "epoch": 0.013955, "grad_norm": 0.12469829618930817, "learning_rate": 1e-05, "loss": 0.0162, "step": 1395500 }, { "epoch": 0.013956, "grad_norm": 0.13671346008777618, "learning_rate": 1e-05, "loss": 0.016, "step": 1395600 }, { "epoch": 0.013957, "grad_norm": 0.1185363158583641, "learning_rate": 1e-05, "loss": 0.0161, "step": 1395700 }, { "epoch": 0.013958, "grad_norm": 0.14936140179634094, "learning_rate": 1e-05, "loss": 0.0167, "step": 1395800 }, { "epoch": 0.013959, "grad_norm": 0.09872879087924957, "learning_rate": 1e-05, "loss": 0.016, "step": 1395900 }, { "epoch": 0.01396, "grad_norm": 0.13745319843292236, "learning_rate": 1e-05, "loss": 0.0156, "step": 1396000 }, { "epoch": 0.013961, "grad_norm": 0.15357768535614014, "learning_rate": 1e-05, "loss": 0.0164, "step": 1396100 }, { "epoch": 0.013962, "grad_norm": 0.12568141520023346, "learning_rate": 1e-05, "loss": 0.0157, "step": 1396200 }, { "epoch": 0.013963, "grad_norm": 0.18693962693214417, "learning_rate": 1e-05, "loss": 0.0162, "step": 1396300 }, { "epoch": 0.013964, "grad_norm": 0.13753074407577515, "learning_rate": 1e-05, "loss": 0.0159, "step": 1396400 }, { "epoch": 0.013965, "grad_norm": 0.11587739735841751, "learning_rate": 1e-05, "loss": 0.0161, "step": 1396500 }, { "epoch": 0.013966, "grad_norm": 0.09921610355377197, "learning_rate": 1e-05, "loss": 0.0161, "step": 1396600 }, { "epoch": 0.013967, "grad_norm": 0.11999262869358063, "learning_rate": 1e-05, "loss": 0.0158, "step": 1396700 }, { "epoch": 0.013968, "grad_norm": 0.19048553705215454, "learning_rate": 1e-05, "loss": 0.0158, "step": 1396800 }, { "epoch": 0.013969, "grad_norm": 0.14119383692741394, "learning_rate": 1e-05, "loss": 0.0156, "step": 1396900 }, { "epoch": 0.01397, "grad_norm": 0.12292740494012833, "learning_rate": 1e-05, "loss": 0.0159, "step": 1397000 }, { "epoch": 0.013971, "grad_norm": 0.1107029989361763, "learning_rate": 1e-05, "loss": 0.0162, "step": 1397100 }, { "epoch": 0.013972, "grad_norm": 0.14719176292419434, "learning_rate": 1e-05, "loss": 0.0161, "step": 1397200 }, { "epoch": 0.013973, "grad_norm": 0.10749798268079758, "learning_rate": 1e-05, "loss": 0.0156, "step": 1397300 }, { "epoch": 0.013974, "grad_norm": 0.135345920920372, "learning_rate": 1e-05, "loss": 0.0161, "step": 1397400 }, { "epoch": 0.013975, "grad_norm": 0.13181856274604797, "learning_rate": 1e-05, "loss": 0.0161, "step": 1397500 }, { "epoch": 0.013976, "grad_norm": 0.13446886837482452, "learning_rate": 1e-05, "loss": 0.0159, "step": 1397600 }, { "epoch": 0.013977, "grad_norm": 0.1930995136499405, "learning_rate": 1e-05, "loss": 0.0157, "step": 1397700 }, { "epoch": 0.013978, "grad_norm": 0.12825894355773926, "learning_rate": 1e-05, "loss": 0.0158, "step": 1397800 }, { "epoch": 0.013979, "grad_norm": 0.12441286444664001, "learning_rate": 1e-05, "loss": 0.0161, "step": 1397900 }, { "epoch": 0.01398, "grad_norm": 0.10090823471546173, "learning_rate": 1e-05, "loss": 0.0161, "step": 1398000 }, { "epoch": 0.013981, "grad_norm": 0.16417670249938965, "learning_rate": 1e-05, "loss": 0.0152, "step": 1398100 }, { "epoch": 0.013982, "grad_norm": 0.13921064138412476, "learning_rate": 1e-05, "loss": 0.0161, "step": 1398200 }, { "epoch": 0.013983, "grad_norm": 0.15351760387420654, "learning_rate": 1e-05, "loss": 0.0161, "step": 1398300 }, { "epoch": 0.013984, "grad_norm": 0.12881237268447876, "learning_rate": 1e-05, "loss": 0.0165, "step": 1398400 }, { "epoch": 0.013985, "grad_norm": 0.1247476264834404, "learning_rate": 1e-05, "loss": 0.0162, "step": 1398500 }, { "epoch": 0.013986, "grad_norm": 0.13947172462940216, "learning_rate": 1e-05, "loss": 0.0164, "step": 1398600 }, { "epoch": 0.013987, "grad_norm": 0.14293025434017181, "learning_rate": 1e-05, "loss": 0.0158, "step": 1398700 }, { "epoch": 0.013988, "grad_norm": 0.10537414252758026, "learning_rate": 1e-05, "loss": 0.0161, "step": 1398800 }, { "epoch": 0.013989, "grad_norm": 0.12431295216083527, "learning_rate": 1e-05, "loss": 0.0159, "step": 1398900 }, { "epoch": 0.01399, "grad_norm": 0.1119600385427475, "learning_rate": 1e-05, "loss": 0.0162, "step": 1399000 }, { "epoch": 0.013991, "grad_norm": 0.12145771086215973, "learning_rate": 1e-05, "loss": 0.0159, "step": 1399100 }, { "epoch": 0.013992, "grad_norm": 0.10095585137605667, "learning_rate": 1e-05, "loss": 0.0164, "step": 1399200 }, { "epoch": 0.013993, "grad_norm": 0.1117657795548439, "learning_rate": 1e-05, "loss": 0.0157, "step": 1399300 }, { "epoch": 0.013994, "grad_norm": 0.12253673374652863, "learning_rate": 1e-05, "loss": 0.0162, "step": 1399400 }, { "epoch": 0.013995, "grad_norm": 0.14246134459972382, "learning_rate": 1e-05, "loss": 0.0162, "step": 1399500 }, { "epoch": 0.013996, "grad_norm": 0.12658002972602844, "learning_rate": 1e-05, "loss": 0.0161, "step": 1399600 }, { "epoch": 0.013997, "grad_norm": 0.10939554125070572, "learning_rate": 1e-05, "loss": 0.016, "step": 1399700 }, { "epoch": 0.013998, "grad_norm": 0.08968847990036011, "learning_rate": 1e-05, "loss": 0.0159, "step": 1399800 }, { "epoch": 0.013999, "grad_norm": 0.13161973655223846, "learning_rate": 1e-05, "loss": 0.0156, "step": 1399900 }, { "epoch": 0.014, "grad_norm": 0.11565739661455154, "learning_rate": 1e-05, "loss": 0.0161, "step": 1400000 }, { "epoch": 0.014, "eval_loss": 0.01400625891983509, "eval_runtime": 185.5818, "eval_samples_per_second": 269.423, "eval_steps_per_second": 16.839, "step": 1400000 }, { "epoch": 0.014001, "grad_norm": 0.1310545951128006, "learning_rate": 1e-05, "loss": 0.0155, "step": 1400100 }, { "epoch": 0.014002, "grad_norm": 0.10099498182535172, "learning_rate": 1e-05, "loss": 0.0157, "step": 1400200 }, { "epoch": 0.014003, "grad_norm": 0.15989108383655548, "learning_rate": 1e-05, "loss": 0.0158, "step": 1400300 }, { "epoch": 0.014004, "grad_norm": 0.11143362522125244, "learning_rate": 1e-05, "loss": 0.0159, "step": 1400400 }, { "epoch": 0.014005, "grad_norm": 0.1326364278793335, "learning_rate": 1e-05, "loss": 0.0156, "step": 1400500 }, { "epoch": 0.014006, "grad_norm": 0.11586855351924896, "learning_rate": 1e-05, "loss": 0.0162, "step": 1400600 }, { "epoch": 0.014007, "grad_norm": 0.08972182869911194, "learning_rate": 1e-05, "loss": 0.0159, "step": 1400700 }, { "epoch": 0.014008, "grad_norm": 0.1354723870754242, "learning_rate": 1e-05, "loss": 0.0157, "step": 1400800 }, { "epoch": 0.014009, "grad_norm": 0.14835159480571747, "learning_rate": 1e-05, "loss": 0.0161, "step": 1400900 }, { "epoch": 0.01401, "grad_norm": 0.14007854461669922, "learning_rate": 1e-05, "loss": 0.016, "step": 1401000 }, { "epoch": 0.014011, "grad_norm": 0.13665953278541565, "learning_rate": 1e-05, "loss": 0.016, "step": 1401100 }, { "epoch": 0.014012, "grad_norm": 0.1498432606458664, "learning_rate": 1e-05, "loss": 0.016, "step": 1401200 }, { "epoch": 0.014013, "grad_norm": 0.09869077801704407, "learning_rate": 1e-05, "loss": 0.0161, "step": 1401300 }, { "epoch": 0.014014, "grad_norm": 0.11697456985712051, "learning_rate": 1e-05, "loss": 0.0161, "step": 1401400 }, { "epoch": 0.014015, "grad_norm": 0.09701313078403473, "learning_rate": 1e-05, "loss": 0.0164, "step": 1401500 }, { "epoch": 0.014016, "grad_norm": 0.16636601090431213, "learning_rate": 1e-05, "loss": 0.0162, "step": 1401600 }, { "epoch": 0.014017, "grad_norm": 0.14985515177249908, "learning_rate": 1e-05, "loss": 0.0159, "step": 1401700 }, { "epoch": 0.014018, "grad_norm": 0.11691832542419434, "learning_rate": 1e-05, "loss": 0.0161, "step": 1401800 }, { "epoch": 0.014019, "grad_norm": 0.11613143235445023, "learning_rate": 1e-05, "loss": 0.0157, "step": 1401900 }, { "epoch": 0.01402, "grad_norm": 0.14914456009864807, "learning_rate": 1e-05, "loss": 0.016, "step": 1402000 }, { "epoch": 0.014021, "grad_norm": 0.07856836915016174, "learning_rate": 1e-05, "loss": 0.0156, "step": 1402100 }, { "epoch": 0.014022, "grad_norm": 0.14301429688930511, "learning_rate": 1e-05, "loss": 0.0159, "step": 1402200 }, { "epoch": 0.014023, "grad_norm": 0.12897837162017822, "learning_rate": 1e-05, "loss": 0.0161, "step": 1402300 }, { "epoch": 0.014024, "grad_norm": 0.12462498247623444, "learning_rate": 1e-05, "loss": 0.0159, "step": 1402400 }, { "epoch": 0.014025, "grad_norm": 0.16454747319221497, "learning_rate": 1e-05, "loss": 0.0164, "step": 1402500 }, { "epoch": 0.014026, "grad_norm": 0.11585050821304321, "learning_rate": 1e-05, "loss": 0.0162, "step": 1402600 }, { "epoch": 0.014027, "grad_norm": 0.10828696191310883, "learning_rate": 1e-05, "loss": 0.0159, "step": 1402700 }, { "epoch": 0.014028, "grad_norm": 0.1294192671775818, "learning_rate": 1e-05, "loss": 0.0161, "step": 1402800 }, { "epoch": 0.014029, "grad_norm": 0.1256171464920044, "learning_rate": 1e-05, "loss": 0.0158, "step": 1402900 }, { "epoch": 0.01403, "grad_norm": 0.11313598603010178, "learning_rate": 1e-05, "loss": 0.0164, "step": 1403000 }, { "epoch": 0.014031, "grad_norm": 0.0937613844871521, "learning_rate": 1e-05, "loss": 0.0154, "step": 1403100 }, { "epoch": 0.014032, "grad_norm": 0.10934606939554214, "learning_rate": 1e-05, "loss": 0.0161, "step": 1403200 }, { "epoch": 0.014033, "grad_norm": 0.13849292695522308, "learning_rate": 1e-05, "loss": 0.0162, "step": 1403300 }, { "epoch": 0.014034, "grad_norm": 0.13909438252449036, "learning_rate": 1e-05, "loss": 0.0159, "step": 1403400 }, { "epoch": 0.014035, "grad_norm": 0.10252329707145691, "learning_rate": 1e-05, "loss": 0.016, "step": 1403500 }, { "epoch": 0.014036, "grad_norm": 0.13463521003723145, "learning_rate": 1e-05, "loss": 0.0161, "step": 1403600 }, { "epoch": 0.014037, "grad_norm": 0.15730927884578705, "learning_rate": 1e-05, "loss": 0.0158, "step": 1403700 }, { "epoch": 0.014038, "grad_norm": 0.18675395846366882, "learning_rate": 1e-05, "loss": 0.0159, "step": 1403800 }, { "epoch": 0.014039, "grad_norm": 0.1066649854183197, "learning_rate": 1e-05, "loss": 0.0159, "step": 1403900 }, { "epoch": 0.01404, "grad_norm": 0.11346850544214249, "learning_rate": 1e-05, "loss": 0.0162, "step": 1404000 }, { "epoch": 0.014041, "grad_norm": 0.11445189267396927, "learning_rate": 1e-05, "loss": 0.0155, "step": 1404100 }, { "epoch": 0.014042, "grad_norm": 0.1362583339214325, "learning_rate": 1e-05, "loss": 0.0157, "step": 1404200 }, { "epoch": 0.014043, "grad_norm": 0.10122153162956238, "learning_rate": 1e-05, "loss": 0.0161, "step": 1404300 }, { "epoch": 0.014044, "grad_norm": 0.12037457525730133, "learning_rate": 1e-05, "loss": 0.0159, "step": 1404400 }, { "epoch": 0.014045, "grad_norm": 0.12430290877819061, "learning_rate": 1e-05, "loss": 0.016, "step": 1404500 }, { "epoch": 0.014046, "grad_norm": 0.12529592216014862, "learning_rate": 1e-05, "loss": 0.016, "step": 1404600 }, { "epoch": 0.014047, "grad_norm": 0.12147348374128342, "learning_rate": 1e-05, "loss": 0.016, "step": 1404700 }, { "epoch": 0.014048, "grad_norm": 0.09928569942712784, "learning_rate": 1e-05, "loss": 0.0158, "step": 1404800 }, { "epoch": 0.014049, "grad_norm": 0.1851719170808792, "learning_rate": 1e-05, "loss": 0.0161, "step": 1404900 }, { "epoch": 0.01405, "grad_norm": 0.15927773714065552, "learning_rate": 1e-05, "loss": 0.0159, "step": 1405000 }, { "epoch": 0.014051, "grad_norm": 0.17353670299053192, "learning_rate": 1e-05, "loss": 0.016, "step": 1405100 }, { "epoch": 0.014052, "grad_norm": 0.1281600296497345, "learning_rate": 1e-05, "loss": 0.0159, "step": 1405200 }, { "epoch": 0.014053, "grad_norm": 0.12049464881420135, "learning_rate": 1e-05, "loss": 0.0156, "step": 1405300 }, { "epoch": 0.014054, "grad_norm": 0.15773989260196686, "learning_rate": 1e-05, "loss": 0.016, "step": 1405400 }, { "epoch": 0.014055, "grad_norm": 0.10944665223360062, "learning_rate": 1e-05, "loss": 0.0163, "step": 1405500 }, { "epoch": 0.014056, "grad_norm": 0.11736448109149933, "learning_rate": 1e-05, "loss": 0.016, "step": 1405600 }, { "epoch": 0.014057, "grad_norm": 0.10773903131484985, "learning_rate": 1e-05, "loss": 0.0158, "step": 1405700 }, { "epoch": 0.014058, "grad_norm": 0.10630220919847488, "learning_rate": 1e-05, "loss": 0.0153, "step": 1405800 }, { "epoch": 0.014059, "grad_norm": 0.10478860884904861, "learning_rate": 1e-05, "loss": 0.0158, "step": 1405900 }, { "epoch": 0.01406, "grad_norm": 0.12203098088502884, "learning_rate": 1e-05, "loss": 0.0164, "step": 1406000 }, { "epoch": 0.014061, "grad_norm": 0.1304158717393875, "learning_rate": 1e-05, "loss": 0.016, "step": 1406100 }, { "epoch": 0.014062, "grad_norm": 0.12237317860126495, "learning_rate": 1e-05, "loss": 0.016, "step": 1406200 }, { "epoch": 0.014063, "grad_norm": 0.14139138162136078, "learning_rate": 1e-05, "loss": 0.0161, "step": 1406300 }, { "epoch": 0.014064, "grad_norm": 0.105380579829216, "learning_rate": 1e-05, "loss": 0.0153, "step": 1406400 }, { "epoch": 0.014065, "grad_norm": 0.10555396229028702, "learning_rate": 1e-05, "loss": 0.0159, "step": 1406500 }, { "epoch": 0.014066, "grad_norm": 0.14958824217319489, "learning_rate": 1e-05, "loss": 0.016, "step": 1406600 }, { "epoch": 0.014067, "grad_norm": 0.10839945077896118, "learning_rate": 1e-05, "loss": 0.0161, "step": 1406700 }, { "epoch": 0.014068, "grad_norm": 0.13849975168704987, "learning_rate": 1e-05, "loss": 0.0158, "step": 1406800 }, { "epoch": 0.014069, "grad_norm": 0.09914812445640564, "learning_rate": 1e-05, "loss": 0.0156, "step": 1406900 }, { "epoch": 0.01407, "grad_norm": 0.15352585911750793, "learning_rate": 1e-05, "loss": 0.016, "step": 1407000 }, { "epoch": 0.014071, "grad_norm": 0.11419201642274857, "learning_rate": 1e-05, "loss": 0.0161, "step": 1407100 }, { "epoch": 0.014072, "grad_norm": 0.14560306072235107, "learning_rate": 1e-05, "loss": 0.0159, "step": 1407200 }, { "epoch": 0.014073, "grad_norm": 0.2039615362882614, "learning_rate": 1e-05, "loss": 0.0164, "step": 1407300 }, { "epoch": 0.014074, "grad_norm": 0.08953961730003357, "learning_rate": 1e-05, "loss": 0.0158, "step": 1407400 }, { "epoch": 0.014075, "grad_norm": 0.15884259343147278, "learning_rate": 1e-05, "loss": 0.0157, "step": 1407500 }, { "epoch": 0.014076, "grad_norm": 0.13492818176746368, "learning_rate": 1e-05, "loss": 0.0161, "step": 1407600 }, { "epoch": 0.014077, "grad_norm": 0.128993958234787, "learning_rate": 1e-05, "loss": 0.0162, "step": 1407700 }, { "epoch": 0.014078, "grad_norm": 0.08926854282617569, "learning_rate": 1e-05, "loss": 0.0157, "step": 1407800 }, { "epoch": 0.014079, "grad_norm": 0.11958405375480652, "learning_rate": 1e-05, "loss": 0.0163, "step": 1407900 }, { "epoch": 0.01408, "grad_norm": 0.1432843655347824, "learning_rate": 1e-05, "loss": 0.0161, "step": 1408000 }, { "epoch": 0.014081, "grad_norm": 0.1394651234149933, "learning_rate": 1e-05, "loss": 0.016, "step": 1408100 }, { "epoch": 0.014082, "grad_norm": 0.1060403436422348, "learning_rate": 1e-05, "loss": 0.0163, "step": 1408200 }, { "epoch": 0.014083, "grad_norm": 0.10757028311491013, "learning_rate": 1e-05, "loss": 0.016, "step": 1408300 }, { "epoch": 0.014084, "grad_norm": 0.131935715675354, "learning_rate": 1e-05, "loss": 0.0162, "step": 1408400 }, { "epoch": 0.014085, "grad_norm": 0.31846392154693604, "learning_rate": 1e-05, "loss": 0.0159, "step": 1408500 }, { "epoch": 0.014086, "grad_norm": 0.1100592389702797, "learning_rate": 1e-05, "loss": 0.0159, "step": 1408600 }, { "epoch": 0.014087, "grad_norm": 0.16604304313659668, "learning_rate": 1e-05, "loss": 0.0159, "step": 1408700 }, { "epoch": 0.014088, "grad_norm": 0.12251132726669312, "learning_rate": 1e-05, "loss": 0.0158, "step": 1408800 }, { "epoch": 0.014089, "grad_norm": 0.09263981878757477, "learning_rate": 1e-05, "loss": 0.0161, "step": 1408900 }, { "epoch": 0.01409, "grad_norm": 0.12750375270843506, "learning_rate": 1e-05, "loss": 0.0162, "step": 1409000 }, { "epoch": 0.014091, "grad_norm": 0.16446632146835327, "learning_rate": 1e-05, "loss": 0.0161, "step": 1409100 }, { "epoch": 0.014092, "grad_norm": 0.11859370023012161, "learning_rate": 1e-05, "loss": 0.0159, "step": 1409200 }, { "epoch": 0.014093, "grad_norm": 0.13603349030017853, "learning_rate": 1e-05, "loss": 0.0162, "step": 1409300 }, { "epoch": 0.014094, "grad_norm": 0.1365770697593689, "learning_rate": 1e-05, "loss": 0.0159, "step": 1409400 }, { "epoch": 0.014095, "grad_norm": 0.14800593256950378, "learning_rate": 1e-05, "loss": 0.0159, "step": 1409500 }, { "epoch": 0.014096, "grad_norm": 0.0891667827963829, "learning_rate": 1e-05, "loss": 0.0155, "step": 1409600 }, { "epoch": 0.014097, "grad_norm": 0.12389861047267914, "learning_rate": 1e-05, "loss": 0.0163, "step": 1409700 }, { "epoch": 0.014098, "grad_norm": 0.11779168248176575, "learning_rate": 1e-05, "loss": 0.016, "step": 1409800 }, { "epoch": 0.014099, "grad_norm": 0.10603716224431992, "learning_rate": 1e-05, "loss": 0.0158, "step": 1409900 }, { "epoch": 0.0141, "grad_norm": 0.11152789741754532, "learning_rate": 1e-05, "loss": 0.0159, "step": 1410000 }, { "epoch": 0.014101, "grad_norm": 0.22850573062896729, "learning_rate": 1e-05, "loss": 0.0161, "step": 1410100 }, { "epoch": 0.014102, "grad_norm": 0.17904852330684662, "learning_rate": 1e-05, "loss": 0.016, "step": 1410200 }, { "epoch": 0.014103, "grad_norm": 0.1051916778087616, "learning_rate": 1e-05, "loss": 0.016, "step": 1410300 }, { "epoch": 0.014104, "grad_norm": 0.11373153328895569, "learning_rate": 1e-05, "loss": 0.0164, "step": 1410400 }, { "epoch": 0.014105, "grad_norm": 0.13357000052928925, "learning_rate": 1e-05, "loss": 0.0158, "step": 1410500 }, { "epoch": 0.014106, "grad_norm": 0.15546992421150208, "learning_rate": 1e-05, "loss": 0.0157, "step": 1410600 }, { "epoch": 0.014107, "grad_norm": 0.10095784813165665, "learning_rate": 1e-05, "loss": 0.0159, "step": 1410700 }, { "epoch": 0.014108, "grad_norm": 0.09579001367092133, "learning_rate": 1e-05, "loss": 0.0158, "step": 1410800 }, { "epoch": 0.014109, "grad_norm": 0.09744249284267426, "learning_rate": 1e-05, "loss": 0.0158, "step": 1410900 }, { "epoch": 0.01411, "grad_norm": 0.12693609297275543, "learning_rate": 1e-05, "loss": 0.0158, "step": 1411000 }, { "epoch": 0.014111, "grad_norm": 0.10742376744747162, "learning_rate": 1e-05, "loss": 0.016, "step": 1411100 }, { "epoch": 0.014112, "grad_norm": 0.08410684764385223, "learning_rate": 1e-05, "loss": 0.0161, "step": 1411200 }, { "epoch": 0.014113, "grad_norm": 0.19204534590244293, "learning_rate": 1e-05, "loss": 0.0161, "step": 1411300 }, { "epoch": 0.014114, "grad_norm": 0.12577402591705322, "learning_rate": 1e-05, "loss": 0.0157, "step": 1411400 }, { "epoch": 0.014115, "grad_norm": 0.1201687604188919, "learning_rate": 1e-05, "loss": 0.0161, "step": 1411500 }, { "epoch": 0.014116, "grad_norm": 0.12883971631526947, "learning_rate": 1e-05, "loss": 0.0164, "step": 1411600 }, { "epoch": 0.014117, "grad_norm": 0.1411571353673935, "learning_rate": 1e-05, "loss": 0.0161, "step": 1411700 }, { "epoch": 0.014118, "grad_norm": 0.1523291915655136, "learning_rate": 1e-05, "loss": 0.0158, "step": 1411800 }, { "epoch": 0.014119, "grad_norm": 0.11948402225971222, "learning_rate": 1e-05, "loss": 0.0162, "step": 1411900 }, { "epoch": 0.01412, "grad_norm": 0.1158452033996582, "learning_rate": 1e-05, "loss": 0.0164, "step": 1412000 }, { "epoch": 0.014121, "grad_norm": 0.12279576808214188, "learning_rate": 1e-05, "loss": 0.0162, "step": 1412100 }, { "epoch": 0.014122, "grad_norm": 0.11175145953893661, "learning_rate": 1e-05, "loss": 0.0161, "step": 1412200 }, { "epoch": 0.014123, "grad_norm": 0.11519443243741989, "learning_rate": 1e-05, "loss": 0.0161, "step": 1412300 }, { "epoch": 0.014124, "grad_norm": 0.125851571559906, "learning_rate": 1e-05, "loss": 0.0162, "step": 1412400 }, { "epoch": 0.014125, "grad_norm": 0.12549716234207153, "learning_rate": 1e-05, "loss": 0.0158, "step": 1412500 }, { "epoch": 0.014126, "grad_norm": 0.11160055547952652, "learning_rate": 1e-05, "loss": 0.016, "step": 1412600 }, { "epoch": 0.014127, "grad_norm": 0.10359099507331848, "learning_rate": 1e-05, "loss": 0.0161, "step": 1412700 }, { "epoch": 0.014128, "grad_norm": 0.13938044011592865, "learning_rate": 1e-05, "loss": 0.0163, "step": 1412800 }, { "epoch": 0.014129, "grad_norm": 0.14988334476947784, "learning_rate": 1e-05, "loss": 0.0161, "step": 1412900 }, { "epoch": 0.01413, "grad_norm": 0.11497321724891663, "learning_rate": 1e-05, "loss": 0.0156, "step": 1413000 }, { "epoch": 0.014131, "grad_norm": 0.1148579940199852, "learning_rate": 1e-05, "loss": 0.0163, "step": 1413100 }, { "epoch": 0.014132, "grad_norm": 0.17808398604393005, "learning_rate": 1e-05, "loss": 0.016, "step": 1413200 }, { "epoch": 0.014133, "grad_norm": 0.09518209099769592, "learning_rate": 1e-05, "loss": 0.0157, "step": 1413300 }, { "epoch": 0.014134, "grad_norm": 0.14035631716251373, "learning_rate": 1e-05, "loss": 0.016, "step": 1413400 }, { "epoch": 0.014135, "grad_norm": 0.10150660574436188, "learning_rate": 1e-05, "loss": 0.0157, "step": 1413500 }, { "epoch": 0.014136, "grad_norm": 0.1601075679063797, "learning_rate": 1e-05, "loss": 0.0159, "step": 1413600 }, { "epoch": 0.014137, "grad_norm": 0.1646057814359665, "learning_rate": 1e-05, "loss": 0.0157, "step": 1413700 }, { "epoch": 0.014138, "grad_norm": 0.14298208057880402, "learning_rate": 1e-05, "loss": 0.0158, "step": 1413800 }, { "epoch": 0.014139, "grad_norm": 0.09110221266746521, "learning_rate": 1e-05, "loss": 0.0158, "step": 1413900 }, { "epoch": 0.01414, "grad_norm": 0.09244250506162643, "learning_rate": 1e-05, "loss": 0.0155, "step": 1414000 }, { "epoch": 0.014141, "grad_norm": 0.12432660907506943, "learning_rate": 1e-05, "loss": 0.0162, "step": 1414100 }, { "epoch": 0.014142, "grad_norm": 0.1380050778388977, "learning_rate": 1e-05, "loss": 0.0161, "step": 1414200 }, { "epoch": 0.014143, "grad_norm": 0.1081085354089737, "learning_rate": 1e-05, "loss": 0.016, "step": 1414300 }, { "epoch": 0.014144, "grad_norm": 0.11263155937194824, "learning_rate": 1e-05, "loss": 0.016, "step": 1414400 }, { "epoch": 0.014145, "grad_norm": 0.14960797131061554, "learning_rate": 1e-05, "loss": 0.0165, "step": 1414500 }, { "epoch": 0.014146, "grad_norm": 0.11647398769855499, "learning_rate": 1e-05, "loss": 0.0163, "step": 1414600 }, { "epoch": 0.014147, "grad_norm": 0.1265500783920288, "learning_rate": 1e-05, "loss": 0.016, "step": 1414700 }, { "epoch": 0.014148, "grad_norm": 0.12284009903669357, "learning_rate": 1e-05, "loss": 0.016, "step": 1414800 }, { "epoch": 0.014149, "grad_norm": 0.219247967004776, "learning_rate": 1e-05, "loss": 0.016, "step": 1414900 }, { "epoch": 0.01415, "grad_norm": 0.11351735144853592, "learning_rate": 1e-05, "loss": 0.016, "step": 1415000 }, { "epoch": 0.014151, "grad_norm": 0.11789338290691376, "learning_rate": 1e-05, "loss": 0.0155, "step": 1415100 }, { "epoch": 0.014152, "grad_norm": 0.08208306133747101, "learning_rate": 1e-05, "loss": 0.0159, "step": 1415200 }, { "epoch": 0.014153, "grad_norm": 0.1380775272846222, "learning_rate": 1e-05, "loss": 0.0157, "step": 1415300 }, { "epoch": 0.014154, "grad_norm": 0.13982535898685455, "learning_rate": 1e-05, "loss": 0.0157, "step": 1415400 }, { "epoch": 0.014155, "grad_norm": 0.12311724573373795, "learning_rate": 1e-05, "loss": 0.016, "step": 1415500 }, { "epoch": 0.014156, "grad_norm": 0.15604239702224731, "learning_rate": 1e-05, "loss": 0.0162, "step": 1415600 }, { "epoch": 0.014157, "grad_norm": 0.16637997329235077, "learning_rate": 1e-05, "loss": 0.0159, "step": 1415700 }, { "epoch": 0.014158, "grad_norm": 0.14950846135616302, "learning_rate": 1e-05, "loss": 0.0158, "step": 1415800 }, { "epoch": 0.014159, "grad_norm": 0.10980162024497986, "learning_rate": 1e-05, "loss": 0.0157, "step": 1415900 }, { "epoch": 0.01416, "grad_norm": 0.10889626294374466, "learning_rate": 1e-05, "loss": 0.0156, "step": 1416000 }, { "epoch": 0.014161, "grad_norm": 0.13774368166923523, "learning_rate": 1e-05, "loss": 0.0159, "step": 1416100 }, { "epoch": 0.014162, "grad_norm": 0.14777536690235138, "learning_rate": 1e-05, "loss": 0.0161, "step": 1416200 }, { "epoch": 0.014163, "grad_norm": 0.11060020327568054, "learning_rate": 1e-05, "loss": 0.016, "step": 1416300 }, { "epoch": 0.014164, "grad_norm": 0.09213798493146896, "learning_rate": 1e-05, "loss": 0.016, "step": 1416400 }, { "epoch": 0.014165, "grad_norm": 0.1478041112422943, "learning_rate": 1e-05, "loss": 0.0159, "step": 1416500 }, { "epoch": 0.014166, "grad_norm": 0.09449257701635361, "learning_rate": 1e-05, "loss": 0.0159, "step": 1416600 }, { "epoch": 0.014167, "grad_norm": 0.12366210669279099, "learning_rate": 1e-05, "loss": 0.0156, "step": 1416700 }, { "epoch": 0.014168, "grad_norm": 0.09467578679323196, "learning_rate": 1e-05, "loss": 0.0164, "step": 1416800 }, { "epoch": 0.014169, "grad_norm": 0.12452935427427292, "learning_rate": 1e-05, "loss": 0.0158, "step": 1416900 }, { "epoch": 0.01417, "grad_norm": 0.113227978348732, "learning_rate": 1e-05, "loss": 0.0159, "step": 1417000 }, { "epoch": 0.014171, "grad_norm": 0.14427414536476135, "learning_rate": 1e-05, "loss": 0.0159, "step": 1417100 }, { "epoch": 0.014172, "grad_norm": 0.13295507431030273, "learning_rate": 1e-05, "loss": 0.0158, "step": 1417200 }, { "epoch": 0.014173, "grad_norm": 0.1477758288383484, "learning_rate": 1e-05, "loss": 0.0157, "step": 1417300 }, { "epoch": 0.014174, "grad_norm": 0.13115444779396057, "learning_rate": 1e-05, "loss": 0.0162, "step": 1417400 }, { "epoch": 0.014175, "grad_norm": 0.09695737808942795, "learning_rate": 1e-05, "loss": 0.0161, "step": 1417500 }, { "epoch": 0.014176, "grad_norm": 0.10733936727046967, "learning_rate": 1e-05, "loss": 0.0156, "step": 1417600 }, { "epoch": 0.014177, "grad_norm": 0.11215189844369888, "learning_rate": 1e-05, "loss": 0.0158, "step": 1417700 }, { "epoch": 0.014178, "grad_norm": 0.1227172315120697, "learning_rate": 1e-05, "loss": 0.016, "step": 1417800 }, { "epoch": 0.014179, "grad_norm": 0.10662703216075897, "learning_rate": 1e-05, "loss": 0.0155, "step": 1417900 }, { "epoch": 0.01418, "grad_norm": 0.09017784893512726, "learning_rate": 1e-05, "loss": 0.0162, "step": 1418000 }, { "epoch": 0.014181, "grad_norm": 0.10484384745359421, "learning_rate": 1e-05, "loss": 0.0157, "step": 1418100 }, { "epoch": 0.014182, "grad_norm": 0.12161215394735336, "learning_rate": 1e-05, "loss": 0.0165, "step": 1418200 }, { "epoch": 0.014183, "grad_norm": 0.13270220160484314, "learning_rate": 1e-05, "loss": 0.0158, "step": 1418300 }, { "epoch": 0.014184, "grad_norm": 0.11462744325399399, "learning_rate": 1e-05, "loss": 0.0157, "step": 1418400 }, { "epoch": 0.014185, "grad_norm": 0.16658388078212738, "learning_rate": 1e-05, "loss": 0.0157, "step": 1418500 }, { "epoch": 0.014186, "grad_norm": 0.11830633133649826, "learning_rate": 1e-05, "loss": 0.016, "step": 1418600 }, { "epoch": 0.014187, "grad_norm": 0.10518387705087662, "learning_rate": 1e-05, "loss": 0.0163, "step": 1418700 }, { "epoch": 0.014188, "grad_norm": 0.12080011516809464, "learning_rate": 1e-05, "loss": 0.0164, "step": 1418800 }, { "epoch": 0.014189, "grad_norm": 0.10522958636283875, "learning_rate": 1e-05, "loss": 0.0161, "step": 1418900 }, { "epoch": 0.01419, "grad_norm": 0.11923743039369583, "learning_rate": 1e-05, "loss": 0.0161, "step": 1419000 }, { "epoch": 0.014191, "grad_norm": 0.12421231716871262, "learning_rate": 1e-05, "loss": 0.0161, "step": 1419100 }, { "epoch": 0.014192, "grad_norm": 0.13529950380325317, "learning_rate": 1e-05, "loss": 0.0155, "step": 1419200 }, { "epoch": 0.014193, "grad_norm": 0.12699109315872192, "learning_rate": 1e-05, "loss": 0.016, "step": 1419300 }, { "epoch": 0.014194, "grad_norm": 0.14395113289356232, "learning_rate": 1e-05, "loss": 0.0161, "step": 1419400 }, { "epoch": 0.014195, "grad_norm": 0.1725299060344696, "learning_rate": 1e-05, "loss": 0.0153, "step": 1419500 }, { "epoch": 0.014196, "grad_norm": 0.12728410959243774, "learning_rate": 1e-05, "loss": 0.0156, "step": 1419600 }, { "epoch": 0.014197, "grad_norm": 0.13680051267147064, "learning_rate": 1e-05, "loss": 0.016, "step": 1419700 }, { "epoch": 0.014198, "grad_norm": 0.13692858815193176, "learning_rate": 1e-05, "loss": 0.0159, "step": 1419800 }, { "epoch": 0.014199, "grad_norm": 0.11894924193620682, "learning_rate": 1e-05, "loss": 0.0155, "step": 1419900 }, { "epoch": 0.0142, "grad_norm": 0.11284433305263519, "learning_rate": 1e-05, "loss": 0.0159, "step": 1420000 }, { "epoch": 0.0142, "eval_loss": 0.013992251828312874, "eval_runtime": 183.7225, "eval_samples_per_second": 272.15, "eval_steps_per_second": 17.009, "step": 1420000 } ], "logging_steps": 100, "max_steps": 100000000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 20000, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 200, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.374621986816e+19, "train_batch_size": 128, "trial_name": null, "trial_params": null }