{ "best_global_step": 1125, "best_metric": 0.0719488188624382, "best_model_checkpoint": "outputs_3/checkpoint-1125", "epoch": 3.630048465266559, "eval_steps": 75, "global_step": 1125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032310177705977385, "grad_norm": 0.0250613521784544, "learning_rate": 0.0, "loss": 0.1103, "step": 1 }, { "epoch": 0.006462035541195477, "grad_norm": 0.03354981914162636, "learning_rate": 1.0000000000000001e-07, "loss": 0.1336, "step": 2 }, { "epoch": 0.009693053311793215, "grad_norm": 0.024361876770853996, "learning_rate": 2.0000000000000002e-07, "loss": 0.1158, "step": 3 }, { "epoch": 0.012924071082390954, "grad_norm": 0.024172648787498474, "learning_rate": 3.0000000000000004e-07, "loss": 0.0937, "step": 4 }, { "epoch": 0.01615508885298869, "grad_norm": 0.024678289890289307, "learning_rate": 4.0000000000000003e-07, "loss": 0.1047, "step": 5 }, { "epoch": 0.01938610662358643, "grad_norm": 0.031230105087161064, "learning_rate": 5.000000000000001e-07, "loss": 0.1479, "step": 6 }, { "epoch": 0.022617124394184167, "grad_norm": 0.024539202451705933, "learning_rate": 6.000000000000001e-07, "loss": 0.126, "step": 7 }, { "epoch": 0.025848142164781908, "grad_norm": 0.02983587421476841, "learning_rate": 7.000000000000001e-07, "loss": 0.1218, "step": 8 }, { "epoch": 0.029079159935379646, "grad_norm": 0.02570008486509323, "learning_rate": 8.000000000000001e-07, "loss": 0.1153, "step": 9 }, { "epoch": 0.03231017770597738, "grad_norm": 0.024239273741841316, "learning_rate": 9.000000000000001e-07, "loss": 0.0952, "step": 10 }, { "epoch": 0.035541195476575124, "grad_norm": 0.020495450124144554, "learning_rate": 1.0000000000000002e-06, "loss": 0.0958, "step": 11 }, { "epoch": 0.03877221324717286, "grad_norm": 0.028608962893486023, "learning_rate": 1.1e-06, "loss": 0.1169, "step": 12 }, { "epoch": 0.0420032310177706, "grad_norm": 0.03878644108772278, "learning_rate": 1.2000000000000002e-06, "loss": 0.1371, "step": 13 }, { "epoch": 0.045234248788368334, "grad_norm": 0.02789674885571003, "learning_rate": 1.3e-06, "loss": 0.1266, "step": 14 }, { "epoch": 0.048465266558966075, "grad_norm": 0.03313566744327545, "learning_rate": 1.4000000000000001e-06, "loss": 0.1301, "step": 15 }, { "epoch": 0.051696284329563816, "grad_norm": 0.0248391292989254, "learning_rate": 1.5e-06, "loss": 0.1056, "step": 16 }, { "epoch": 0.05492730210016155, "grad_norm": 0.024395154789090157, "learning_rate": 1.6000000000000001e-06, "loss": 0.112, "step": 17 }, { "epoch": 0.05815831987075929, "grad_norm": 0.03043658658862114, "learning_rate": 1.7000000000000002e-06, "loss": 0.1037, "step": 18 }, { "epoch": 0.061389337641357025, "grad_norm": 0.02323235385119915, "learning_rate": 1.8000000000000001e-06, "loss": 0.0996, "step": 19 }, { "epoch": 0.06462035541195477, "grad_norm": 0.03656580671668053, "learning_rate": 1.9000000000000002e-06, "loss": 0.1327, "step": 20 }, { "epoch": 0.06785137318255251, "grad_norm": 0.02677535079419613, "learning_rate": 2.0000000000000003e-06, "loss": 0.1205, "step": 21 }, { "epoch": 0.07108239095315025, "grad_norm": 0.029969926923513412, "learning_rate": 2.1000000000000002e-06, "loss": 0.126, "step": 22 }, { "epoch": 0.07431340872374798, "grad_norm": 0.02832009270787239, "learning_rate": 2.2e-06, "loss": 0.1208, "step": 23 }, { "epoch": 0.07754442649434572, "grad_norm": 0.023000000044703484, "learning_rate": 2.3000000000000004e-06, "loss": 0.102, "step": 24 }, { "epoch": 0.08077544426494346, "grad_norm": 0.04773552715778351, "learning_rate": 2.4000000000000003e-06, "loss": 0.1403, "step": 25 }, { "epoch": 0.0840064620355412, "grad_norm": 0.03021993860602379, "learning_rate": 2.5e-06, "loss": 0.1116, "step": 26 }, { "epoch": 0.08723747980613894, "grad_norm": 0.026623567566275597, "learning_rate": 2.6e-06, "loss": 0.1052, "step": 27 }, { "epoch": 0.09046849757673667, "grad_norm": 0.02503894828259945, "learning_rate": 2.7000000000000004e-06, "loss": 0.1016, "step": 28 }, { "epoch": 0.09369951534733441, "grad_norm": 0.026578862220048904, "learning_rate": 2.8000000000000003e-06, "loss": 0.1026, "step": 29 }, { "epoch": 0.09693053311793215, "grad_norm": 0.029514916241168976, "learning_rate": 2.9e-06, "loss": 0.1047, "step": 30 }, { "epoch": 0.10016155088852989, "grad_norm": 0.025679711252450943, "learning_rate": 3e-06, "loss": 0.1228, "step": 31 }, { "epoch": 0.10339256865912763, "grad_norm": 0.023719631135463715, "learning_rate": 3.1000000000000004e-06, "loss": 0.0997, "step": 32 }, { "epoch": 0.10662358642972536, "grad_norm": 0.03468646854162216, "learning_rate": 3.2000000000000003e-06, "loss": 0.1354, "step": 33 }, { "epoch": 0.1098546042003231, "grad_norm": 0.02570706605911255, "learning_rate": 3.3000000000000006e-06, "loss": 0.1007, "step": 34 }, { "epoch": 0.11308562197092084, "grad_norm": 0.03752969205379486, "learning_rate": 3.4000000000000005e-06, "loss": 0.1512, "step": 35 }, { "epoch": 0.11631663974151858, "grad_norm": 0.034612756222486496, "learning_rate": 3.5e-06, "loss": 0.1212, "step": 36 }, { "epoch": 0.11954765751211632, "grad_norm": 0.022866638377308846, "learning_rate": 3.6000000000000003e-06, "loss": 0.0962, "step": 37 }, { "epoch": 0.12277867528271405, "grad_norm": 0.03314971551299095, "learning_rate": 3.7e-06, "loss": 0.1275, "step": 38 }, { "epoch": 0.1260096930533118, "grad_norm": 0.026049382984638214, "learning_rate": 3.8000000000000005e-06, "loss": 0.1095, "step": 39 }, { "epoch": 0.12924071082390953, "grad_norm": 0.027846891433000565, "learning_rate": 3.900000000000001e-06, "loss": 0.1208, "step": 40 }, { "epoch": 0.13247172859450726, "grad_norm": 0.03061266988515854, "learning_rate": 4.000000000000001e-06, "loss": 0.1071, "step": 41 }, { "epoch": 0.13570274636510501, "grad_norm": 0.028831277042627335, "learning_rate": 4.1e-06, "loss": 0.122, "step": 42 }, { "epoch": 0.13893376413570274, "grad_norm": 0.024587510153651237, "learning_rate": 4.2000000000000004e-06, "loss": 0.0891, "step": 43 }, { "epoch": 0.1421647819063005, "grad_norm": 0.03614000231027603, "learning_rate": 4.3e-06, "loss": 0.1443, "step": 44 }, { "epoch": 0.14539579967689822, "grad_norm": 0.0399002730846405, "learning_rate": 4.4e-06, "loss": 0.1306, "step": 45 }, { "epoch": 0.14862681744749595, "grad_norm": 0.03731178864836693, "learning_rate": 4.5e-06, "loss": 0.1552, "step": 46 }, { "epoch": 0.1518578352180937, "grad_norm": 0.03669052943587303, "learning_rate": 4.600000000000001e-06, "loss": 0.1285, "step": 47 }, { "epoch": 0.15508885298869143, "grad_norm": 0.029436811804771423, "learning_rate": 4.7e-06, "loss": 0.1128, "step": 48 }, { "epoch": 0.1583198707592892, "grad_norm": 0.03553691506385803, "learning_rate": 4.800000000000001e-06, "loss": 0.1295, "step": 49 }, { "epoch": 0.16155088852988692, "grad_norm": 0.045196086168289185, "learning_rate": 4.9000000000000005e-06, "loss": 0.1589, "step": 50 }, { "epoch": 0.16478190630048464, "grad_norm": 0.03583088517189026, "learning_rate": 5e-06, "loss": 0.1195, "step": 51 }, { "epoch": 0.1680129240710824, "grad_norm": 0.03799896687269211, "learning_rate": 5.1e-06, "loss": 0.1293, "step": 52 }, { "epoch": 0.17124394184168013, "grad_norm": 0.0341208279132843, "learning_rate": 5.2e-06, "loss": 0.1121, "step": 53 }, { "epoch": 0.17447495961227788, "grad_norm": 0.0367840901017189, "learning_rate": 5.300000000000001e-06, "loss": 0.1288, "step": 54 }, { "epoch": 0.1777059773828756, "grad_norm": 0.03497275337576866, "learning_rate": 5.400000000000001e-06, "loss": 0.1146, "step": 55 }, { "epoch": 0.18093699515347333, "grad_norm": 0.04450898617506027, "learning_rate": 5.500000000000001e-06, "loss": 0.1261, "step": 56 }, { "epoch": 0.1841680129240711, "grad_norm": 0.029873637482523918, "learning_rate": 5.600000000000001e-06, "loss": 0.0981, "step": 57 }, { "epoch": 0.18739903069466882, "grad_norm": 0.030145753175020218, "learning_rate": 5.7e-06, "loss": 0.1121, "step": 58 }, { "epoch": 0.19063004846526657, "grad_norm": 0.03658242151141167, "learning_rate": 5.8e-06, "loss": 0.1416, "step": 59 }, { "epoch": 0.1938610662358643, "grad_norm": 0.049440935254096985, "learning_rate": 5.9e-06, "loss": 0.125, "step": 60 }, { "epoch": 0.19709208400646203, "grad_norm": 0.0388176292181015, "learning_rate": 6e-06, "loss": 0.1195, "step": 61 }, { "epoch": 0.20032310177705978, "grad_norm": 0.03422081843018532, "learning_rate": 6.1e-06, "loss": 0.1191, "step": 62 }, { "epoch": 0.2035541195476575, "grad_norm": 0.047777559608221054, "learning_rate": 6.200000000000001e-06, "loss": 0.1304, "step": 63 }, { "epoch": 0.20678513731825526, "grad_norm": 0.031583212316036224, "learning_rate": 6.300000000000001e-06, "loss": 0.0996, "step": 64 }, { "epoch": 0.210016155088853, "grad_norm": 0.03744835779070854, "learning_rate": 6.4000000000000006e-06, "loss": 0.1124, "step": 65 }, { "epoch": 0.21324717285945072, "grad_norm": 0.04165760055184364, "learning_rate": 6.5000000000000004e-06, "loss": 0.1041, "step": 66 }, { "epoch": 0.21647819063004847, "grad_norm": 0.03987026587128639, "learning_rate": 6.600000000000001e-06, "loss": 0.133, "step": 67 }, { "epoch": 0.2197092084006462, "grad_norm": 0.040481116622686386, "learning_rate": 6.700000000000001e-06, "loss": 0.0944, "step": 68 }, { "epoch": 0.22294022617124395, "grad_norm": 0.03691282495856285, "learning_rate": 6.800000000000001e-06, "loss": 0.1018, "step": 69 }, { "epoch": 0.22617124394184168, "grad_norm": 0.04485676437616348, "learning_rate": 6.9e-06, "loss": 0.137, "step": 70 }, { "epoch": 0.2294022617124394, "grad_norm": 0.02854587510228157, "learning_rate": 7e-06, "loss": 0.0885, "step": 71 }, { "epoch": 0.23263327948303716, "grad_norm": 0.04462384432554245, "learning_rate": 7.100000000000001e-06, "loss": 0.1429, "step": 72 }, { "epoch": 0.2358642972536349, "grad_norm": 0.040676336735486984, "learning_rate": 7.2000000000000005e-06, "loss": 0.121, "step": 73 }, { "epoch": 0.23909531502423265, "grad_norm": 0.0420430488884449, "learning_rate": 7.3e-06, "loss": 0.1197, "step": 74 }, { "epoch": 0.24232633279483037, "grad_norm": 0.034694962203502655, "learning_rate": 7.4e-06, "loss": 0.1177, "step": 75 }, { "epoch": 0.24232633279483037, "eval_loss": 0.12372970581054688, "eval_runtime": 188.2884, "eval_samples_per_second": 1.046, "eval_steps_per_second": 1.046, "step": 75 }, { "epoch": 0.2455573505654281, "grad_norm": 0.03579672798514366, "learning_rate": 7.500000000000001e-06, "loss": 0.1058, "step": 76 }, { "epoch": 0.24878836833602586, "grad_norm": 0.029704652726650238, "learning_rate": 7.600000000000001e-06, "loss": 0.0882, "step": 77 }, { "epoch": 0.2520193861066236, "grad_norm": 0.04732828587293625, "learning_rate": 7.7e-06, "loss": 0.1277, "step": 78 }, { "epoch": 0.2552504038772213, "grad_norm": 0.027987899258732796, "learning_rate": 7.800000000000002e-06, "loss": 0.0751, "step": 79 }, { "epoch": 0.25848142164781907, "grad_norm": 0.04119185730814934, "learning_rate": 7.9e-06, "loss": 0.1043, "step": 80 }, { "epoch": 0.2617124394184168, "grad_norm": 0.04631367698311806, "learning_rate": 8.000000000000001e-06, "loss": 0.118, "step": 81 }, { "epoch": 0.2649434571890145, "grad_norm": 0.03165270760655403, "learning_rate": 8.1e-06, "loss": 0.0889, "step": 82 }, { "epoch": 0.2681744749596123, "grad_norm": 0.03328806161880493, "learning_rate": 8.2e-06, "loss": 0.097, "step": 83 }, { "epoch": 0.27140549273021003, "grad_norm": 0.05337163060903549, "learning_rate": 8.3e-06, "loss": 0.1373, "step": 84 }, { "epoch": 0.27463651050080773, "grad_norm": 0.030968431383371353, "learning_rate": 8.400000000000001e-06, "loss": 0.077, "step": 85 }, { "epoch": 0.2778675282714055, "grad_norm": 0.03477643430233002, "learning_rate": 8.5e-06, "loss": 0.1016, "step": 86 }, { "epoch": 0.28109854604200324, "grad_norm": 0.04051528871059418, "learning_rate": 8.6e-06, "loss": 0.1017, "step": 87 }, { "epoch": 0.284329563812601, "grad_norm": 0.039160750806331635, "learning_rate": 8.700000000000001e-06, "loss": 0.112, "step": 88 }, { "epoch": 0.2875605815831987, "grad_norm": 0.03572917729616165, "learning_rate": 8.8e-06, "loss": 0.083, "step": 89 }, { "epoch": 0.29079159935379645, "grad_norm": 0.05116155743598938, "learning_rate": 8.900000000000001e-06, "loss": 0.1262, "step": 90 }, { "epoch": 0.2940226171243942, "grad_norm": 0.043991196900606155, "learning_rate": 9e-06, "loss": 0.1147, "step": 91 }, { "epoch": 0.2972536348949919, "grad_norm": 0.03514918312430382, "learning_rate": 9.100000000000001e-06, "loss": 0.0874, "step": 92 }, { "epoch": 0.30048465266558966, "grad_norm": 0.03676354140043259, "learning_rate": 9.200000000000002e-06, "loss": 0.0988, "step": 93 }, { "epoch": 0.3037156704361874, "grad_norm": 0.04684548079967499, "learning_rate": 9.3e-06, "loss": 0.129, "step": 94 }, { "epoch": 0.3069466882067851, "grad_norm": 0.033971093595027924, "learning_rate": 9.4e-06, "loss": 0.09, "step": 95 }, { "epoch": 0.31017770597738287, "grad_norm": 0.0339505635201931, "learning_rate": 9.5e-06, "loss": 0.083, "step": 96 }, { "epoch": 0.3134087237479806, "grad_norm": 0.04067440703511238, "learning_rate": 9.600000000000001e-06, "loss": 0.1093, "step": 97 }, { "epoch": 0.3166397415185784, "grad_norm": 0.036671143025159836, "learning_rate": 9.7e-06, "loss": 0.0929, "step": 98 }, { "epoch": 0.3198707592891761, "grad_norm": 0.0459955595433712, "learning_rate": 9.800000000000001e-06, "loss": 0.1273, "step": 99 }, { "epoch": 0.32310177705977383, "grad_norm": 0.05120276287198067, "learning_rate": 9.9e-06, "loss": 0.1349, "step": 100 }, { "epoch": 0.3263327948303716, "grad_norm": 0.03149951994419098, "learning_rate": 1e-05, "loss": 0.074, "step": 101 }, { "epoch": 0.3295638126009693, "grad_norm": 0.2025747299194336, "learning_rate": 1.0100000000000002e-05, "loss": 0.099, "step": 102 }, { "epoch": 0.33279483037156704, "grad_norm": 0.03870416432619095, "learning_rate": 1.02e-05, "loss": 0.0984, "step": 103 }, { "epoch": 0.3360258481421648, "grad_norm": 0.05344085767865181, "learning_rate": 1.0300000000000001e-05, "loss": 0.1057, "step": 104 }, { "epoch": 0.3392568659127625, "grad_norm": 0.04757027328014374, "learning_rate": 1.04e-05, "loss": 0.124, "step": 105 }, { "epoch": 0.34248788368336025, "grad_norm": 0.03201949968934059, "learning_rate": 1.0500000000000001e-05, "loss": 0.0807, "step": 106 }, { "epoch": 0.345718901453958, "grad_norm": 0.03873045742511749, "learning_rate": 1.0600000000000002e-05, "loss": 0.1048, "step": 107 }, { "epoch": 0.34894991922455576, "grad_norm": 0.027501031756401062, "learning_rate": 1.0700000000000001e-05, "loss": 0.0613, "step": 108 }, { "epoch": 0.35218093699515346, "grad_norm": 0.03909388184547424, "learning_rate": 1.0800000000000002e-05, "loss": 0.0929, "step": 109 }, { "epoch": 0.3554119547657512, "grad_norm": 0.08006097376346588, "learning_rate": 1.0900000000000002e-05, "loss": 0.0908, "step": 110 }, { "epoch": 0.35864297253634897, "grad_norm": 0.03813672810792923, "learning_rate": 1.1000000000000001e-05, "loss": 0.0764, "step": 111 }, { "epoch": 0.36187399030694667, "grad_norm": 0.030327340587973595, "learning_rate": 1.1100000000000002e-05, "loss": 0.077, "step": 112 }, { "epoch": 0.3651050080775444, "grad_norm": 0.03776196017861366, "learning_rate": 1.1200000000000001e-05, "loss": 0.0759, "step": 113 }, { "epoch": 0.3683360258481422, "grad_norm": 0.039037927985191345, "learning_rate": 1.13e-05, "loss": 0.0926, "step": 114 }, { "epoch": 0.3715670436187399, "grad_norm": 0.0416683666408062, "learning_rate": 1.14e-05, "loss": 0.0914, "step": 115 }, { "epoch": 0.37479806138933763, "grad_norm": 0.04185537248849869, "learning_rate": 1.15e-05, "loss": 0.0893, "step": 116 }, { "epoch": 0.3780290791599354, "grad_norm": 0.04651897773146629, "learning_rate": 1.16e-05, "loss": 0.1094, "step": 117 }, { "epoch": 0.38126009693053314, "grad_norm": 0.04604775831103325, "learning_rate": 1.17e-05, "loss": 0.1068, "step": 118 }, { "epoch": 0.38449111470113084, "grad_norm": 0.02846536412835121, "learning_rate": 1.18e-05, "loss": 0.064, "step": 119 }, { "epoch": 0.3877221324717286, "grad_norm": 0.033402059227228165, "learning_rate": 1.1900000000000001e-05, "loss": 0.0725, "step": 120 }, { "epoch": 0.39095315024232635, "grad_norm": 0.044676899909973145, "learning_rate": 1.2e-05, "loss": 0.1017, "step": 121 }, { "epoch": 0.39418416801292405, "grad_norm": 0.05336389318108559, "learning_rate": 1.2100000000000001e-05, "loss": 0.1063, "step": 122 }, { "epoch": 0.3974151857835218, "grad_norm": 0.0402502678334713, "learning_rate": 1.22e-05, "loss": 0.0764, "step": 123 }, { "epoch": 0.40064620355411956, "grad_norm": 0.04342082887887955, "learning_rate": 1.23e-05, "loss": 0.1008, "step": 124 }, { "epoch": 0.40387722132471726, "grad_norm": 0.047081444412469864, "learning_rate": 1.2400000000000002e-05, "loss": 0.1029, "step": 125 }, { "epoch": 0.407108239095315, "grad_norm": 0.038031261414289474, "learning_rate": 1.25e-05, "loss": 0.0783, "step": 126 }, { "epoch": 0.41033925686591277, "grad_norm": 0.03746628388762474, "learning_rate": 1.2600000000000001e-05, "loss": 0.0764, "step": 127 }, { "epoch": 0.4135702746365105, "grad_norm": 0.04288509115576744, "learning_rate": 1.27e-05, "loss": 0.0892, "step": 128 }, { "epoch": 0.4168012924071082, "grad_norm": 0.042407114058732986, "learning_rate": 1.2800000000000001e-05, "loss": 0.0925, "step": 129 }, { "epoch": 0.420032310177706, "grad_norm": 0.03754522651433945, "learning_rate": 1.2900000000000002e-05, "loss": 0.0833, "step": 130 }, { "epoch": 0.42326332794830374, "grad_norm": 0.04337688535451889, "learning_rate": 1.3000000000000001e-05, "loss": 0.0976, "step": 131 }, { "epoch": 0.42649434571890144, "grad_norm": 0.03174331411719322, "learning_rate": 1.3100000000000002e-05, "loss": 0.0684, "step": 132 }, { "epoch": 0.4297253634894992, "grad_norm": 0.04556446522474289, "learning_rate": 1.3200000000000002e-05, "loss": 0.0979, "step": 133 }, { "epoch": 0.43295638126009695, "grad_norm": 0.04222508519887924, "learning_rate": 1.3300000000000001e-05, "loss": 0.0923, "step": 134 }, { "epoch": 0.43618739903069464, "grad_norm": 0.047948531806468964, "learning_rate": 1.3400000000000002e-05, "loss": 0.0991, "step": 135 }, { "epoch": 0.4394184168012924, "grad_norm": 0.04752594605088234, "learning_rate": 1.3500000000000001e-05, "loss": 0.0864, "step": 136 }, { "epoch": 0.44264943457189015, "grad_norm": 0.049508459866046906, "learning_rate": 1.3600000000000002e-05, "loss": 0.1035, "step": 137 }, { "epoch": 0.4458804523424879, "grad_norm": 0.05521553382277489, "learning_rate": 1.3700000000000003e-05, "loss": 0.0978, "step": 138 }, { "epoch": 0.4491114701130856, "grad_norm": 0.05352664738893509, "learning_rate": 1.38e-05, "loss": 0.1097, "step": 139 }, { "epoch": 0.45234248788368336, "grad_norm": 0.04332451522350311, "learning_rate": 1.39e-05, "loss": 0.0917, "step": 140 }, { "epoch": 0.4555735056542811, "grad_norm": 0.05932965502142906, "learning_rate": 1.4e-05, "loss": 0.0909, "step": 141 }, { "epoch": 0.4588045234248788, "grad_norm": 0.04634483903646469, "learning_rate": 1.41e-05, "loss": 0.0806, "step": 142 }, { "epoch": 0.4620355411954766, "grad_norm": 0.037898797541856766, "learning_rate": 1.4200000000000001e-05, "loss": 0.0802, "step": 143 }, { "epoch": 0.46526655896607433, "grad_norm": 0.04366337135434151, "learning_rate": 1.43e-05, "loss": 0.0835, "step": 144 }, { "epoch": 0.46849757673667203, "grad_norm": 0.03588287532329559, "learning_rate": 1.4400000000000001e-05, "loss": 0.0746, "step": 145 }, { "epoch": 0.4717285945072698, "grad_norm": 0.04979556053876877, "learning_rate": 1.45e-05, "loss": 0.0914, "step": 146 }, { "epoch": 0.47495961227786754, "grad_norm": 0.03938375040888786, "learning_rate": 1.46e-05, "loss": 0.0696, "step": 147 }, { "epoch": 0.4781906300484653, "grad_norm": 0.04531609266996384, "learning_rate": 1.4700000000000002e-05, "loss": 0.0847, "step": 148 }, { "epoch": 0.481421647819063, "grad_norm": 0.04314682260155678, "learning_rate": 1.48e-05, "loss": 0.0872, "step": 149 }, { "epoch": 0.48465266558966075, "grad_norm": 0.04925313591957092, "learning_rate": 1.4900000000000001e-05, "loss": 0.0905, "step": 150 }, { "epoch": 0.48465266558966075, "eval_loss": 0.09467896819114685, "eval_runtime": 188.3915, "eval_samples_per_second": 1.046, "eval_steps_per_second": 1.046, "step": 150 }, { "epoch": 0.4878836833602585, "grad_norm": 0.04450507089495659, "learning_rate": 1.5000000000000002e-05, "loss": 0.0704, "step": 151 }, { "epoch": 0.4911147011308562, "grad_norm": 0.03887806460261345, "learning_rate": 1.5100000000000001e-05, "loss": 0.0727, "step": 152 }, { "epoch": 0.49434571890145396, "grad_norm": 0.04298221319913864, "learning_rate": 1.5200000000000002e-05, "loss": 0.0763, "step": 153 }, { "epoch": 0.4975767366720517, "grad_norm": 0.046640265733003616, "learning_rate": 1.5300000000000003e-05, "loss": 0.0794, "step": 154 }, { "epoch": 0.5008077544426495, "grad_norm": 0.03670887276530266, "learning_rate": 1.54e-05, "loss": 0.0681, "step": 155 }, { "epoch": 0.5040387722132472, "grad_norm": 0.03891611844301224, "learning_rate": 1.55e-05, "loss": 0.0712, "step": 156 }, { "epoch": 0.5072697899838449, "grad_norm": 0.042465128004550934, "learning_rate": 1.5600000000000003e-05, "loss": 0.0828, "step": 157 }, { "epoch": 0.5105008077544426, "grad_norm": 0.054676711559295654, "learning_rate": 1.5700000000000002e-05, "loss": 0.0978, "step": 158 }, { "epoch": 0.5137318255250404, "grad_norm": 0.040950365364551544, "learning_rate": 1.58e-05, "loss": 0.0822, "step": 159 }, { "epoch": 0.5169628432956381, "grad_norm": 0.04756248742341995, "learning_rate": 1.5900000000000004e-05, "loss": 0.0914, "step": 160 }, { "epoch": 0.5201938610662359, "grad_norm": 0.046163927763700485, "learning_rate": 1.6000000000000003e-05, "loss": 0.089, "step": 161 }, { "epoch": 0.5234248788368336, "grad_norm": 0.04039200022816658, "learning_rate": 1.6100000000000002e-05, "loss": 0.0657, "step": 162 }, { "epoch": 0.5266558966074314, "grad_norm": 0.05602607503533363, "learning_rate": 1.62e-05, "loss": 0.1044, "step": 163 }, { "epoch": 0.529886914378029, "grad_norm": 0.04312260076403618, "learning_rate": 1.63e-05, "loss": 0.0821, "step": 164 }, { "epoch": 0.5331179321486268, "grad_norm": 0.047874368727207184, "learning_rate": 1.64e-05, "loss": 0.0824, "step": 165 }, { "epoch": 0.5363489499192245, "grad_norm": 0.06421804428100586, "learning_rate": 1.65e-05, "loss": 0.1126, "step": 166 }, { "epoch": 0.5395799676898223, "grad_norm": 0.05542091280221939, "learning_rate": 1.66e-05, "loss": 0.0959, "step": 167 }, { "epoch": 0.5428109854604201, "grad_norm": 0.05822491645812988, "learning_rate": 1.67e-05, "loss": 0.099, "step": 168 }, { "epoch": 0.5460420032310178, "grad_norm": 0.04154228791594505, "learning_rate": 1.6800000000000002e-05, "loss": 0.0737, "step": 169 }, { "epoch": 0.5492730210016155, "grad_norm": 0.04687827080488205, "learning_rate": 1.69e-05, "loss": 0.0725, "step": 170 }, { "epoch": 0.5525040387722132, "grad_norm": 0.056682366877794266, "learning_rate": 1.7e-05, "loss": 0.1004, "step": 171 }, { "epoch": 0.555735056542811, "grad_norm": 0.0667276531457901, "learning_rate": 1.7100000000000002e-05, "loss": 0.0917, "step": 172 }, { "epoch": 0.5589660743134087, "grad_norm": 0.044881295412778854, "learning_rate": 1.72e-05, "loss": 0.0689, "step": 173 }, { "epoch": 0.5621970920840065, "grad_norm": 0.0385456345975399, "learning_rate": 1.73e-05, "loss": 0.0592, "step": 174 }, { "epoch": 0.5654281098546042, "grad_norm": 0.05141144245862961, "learning_rate": 1.7400000000000003e-05, "loss": 0.0895, "step": 175 }, { "epoch": 0.568659127625202, "grad_norm": 0.06854357570409775, "learning_rate": 1.7500000000000002e-05, "loss": 0.0692, "step": 176 }, { "epoch": 0.5718901453957996, "grad_norm": 0.04410829395055771, "learning_rate": 1.76e-05, "loss": 0.0688, "step": 177 }, { "epoch": 0.5751211631663974, "grad_norm": 0.03727763518691063, "learning_rate": 1.77e-05, "loss": 0.0638, "step": 178 }, { "epoch": 0.5783521809369951, "grad_norm": 0.044415879994630814, "learning_rate": 1.7800000000000002e-05, "loss": 0.0682, "step": 179 }, { "epoch": 0.5815831987075929, "grad_norm": 0.06855777651071548, "learning_rate": 1.79e-05, "loss": 0.1018, "step": 180 }, { "epoch": 0.5848142164781907, "grad_norm": 0.053684502840042114, "learning_rate": 1.8e-05, "loss": 0.0862, "step": 181 }, { "epoch": 0.5880452342487884, "grad_norm": 0.0487506277859211, "learning_rate": 1.8100000000000003e-05, "loss": 0.073, "step": 182 }, { "epoch": 0.5912762520193862, "grad_norm": 0.04568934440612793, "learning_rate": 1.8200000000000002e-05, "loss": 0.0726, "step": 183 }, { "epoch": 0.5945072697899838, "grad_norm": 0.04607719928026199, "learning_rate": 1.83e-05, "loss": 0.0685, "step": 184 }, { "epoch": 0.5977382875605816, "grad_norm": 0.05040200799703598, "learning_rate": 1.8400000000000003e-05, "loss": 0.0721, "step": 185 }, { "epoch": 0.6009693053311793, "grad_norm": 0.0538799948990345, "learning_rate": 1.8500000000000002e-05, "loss": 0.0769, "step": 186 }, { "epoch": 0.6042003231017771, "grad_norm": 0.058767516165971756, "learning_rate": 1.86e-05, "loss": 0.1015, "step": 187 }, { "epoch": 0.6074313408723748, "grad_norm": 0.056379787623882294, "learning_rate": 1.8700000000000004e-05, "loss": 0.0887, "step": 188 }, { "epoch": 0.6106623586429726, "grad_norm": 0.04885280132293701, "learning_rate": 1.88e-05, "loss": 0.0767, "step": 189 }, { "epoch": 0.6138933764135702, "grad_norm": 0.04340769350528717, "learning_rate": 1.8900000000000002e-05, "loss": 0.061, "step": 190 }, { "epoch": 0.617124394184168, "grad_norm": 0.051385678350925446, "learning_rate": 1.9e-05, "loss": 0.0794, "step": 191 }, { "epoch": 0.6203554119547657, "grad_norm": 0.03737674281001091, "learning_rate": 1.91e-05, "loss": 0.0594, "step": 192 }, { "epoch": 0.6235864297253635, "grad_norm": 0.047995615750551224, "learning_rate": 1.9200000000000003e-05, "loss": 0.06, "step": 193 }, { "epoch": 0.6268174474959612, "grad_norm": 0.04322716221213341, "learning_rate": 1.93e-05, "loss": 0.0712, "step": 194 }, { "epoch": 0.630048465266559, "grad_norm": 0.044713038951158524, "learning_rate": 1.94e-05, "loss": 0.0679, "step": 195 }, { "epoch": 0.6332794830371568, "grad_norm": 0.05132253095507622, "learning_rate": 1.95e-05, "loss": 0.0723, "step": 196 }, { "epoch": 0.6365105008077544, "grad_norm": 0.039808765053749084, "learning_rate": 1.9600000000000002e-05, "loss": 0.057, "step": 197 }, { "epoch": 0.6397415185783522, "grad_norm": 0.05255698040127754, "learning_rate": 1.97e-05, "loss": 0.0774, "step": 198 }, { "epoch": 0.6429725363489499, "grad_norm": 0.05560529604554176, "learning_rate": 1.98e-05, "loss": 0.0839, "step": 199 }, { "epoch": 0.6462035541195477, "grad_norm": 0.0640430748462677, "learning_rate": 1.9900000000000003e-05, "loss": 0.0993, "step": 200 }, { "epoch": 0.6494345718901454, "grad_norm": 0.05945132300257683, "learning_rate": 2e-05, "loss": 0.0917, "step": 201 }, { "epoch": 0.6526655896607432, "grad_norm": 0.0569840632379055, "learning_rate": 1.9995490417136416e-05, "loss": 0.096, "step": 202 }, { "epoch": 0.6558966074313409, "grad_norm": 0.052612412720918655, "learning_rate": 1.999098083427283e-05, "loss": 0.079, "step": 203 }, { "epoch": 0.6591276252019386, "grad_norm": 0.05563991889357567, "learning_rate": 1.9986471251409248e-05, "loss": 0.0748, "step": 204 }, { "epoch": 0.6623586429725363, "grad_norm": 0.0527903214097023, "learning_rate": 1.9981961668545663e-05, "loss": 0.0789, "step": 205 }, { "epoch": 0.6655896607431341, "grad_norm": 0.06214462220668793, "learning_rate": 1.9977452085682077e-05, "loss": 0.0806, "step": 206 }, { "epoch": 0.6688206785137318, "grad_norm": 0.05324917659163475, "learning_rate": 1.997294250281849e-05, "loss": 0.0778, "step": 207 }, { "epoch": 0.6720516962843296, "grad_norm": 0.03270899876952171, "learning_rate": 1.9968432919954906e-05, "loss": 0.0469, "step": 208 }, { "epoch": 0.6752827140549273, "grad_norm": 0.060554295778274536, "learning_rate": 1.996392333709132e-05, "loss": 0.0926, "step": 209 }, { "epoch": 0.678513731825525, "grad_norm": 0.05076554790139198, "learning_rate": 1.9959413754227738e-05, "loss": 0.0752, "step": 210 }, { "epoch": 0.6817447495961227, "grad_norm": 0.05301008000969887, "learning_rate": 1.9954904171364152e-05, "loss": 0.0783, "step": 211 }, { "epoch": 0.6849757673667205, "grad_norm": 0.04376392439007759, "learning_rate": 1.9950394588500567e-05, "loss": 0.0603, "step": 212 }, { "epoch": 0.6882067851373183, "grad_norm": 0.049395546317100525, "learning_rate": 1.994588500563698e-05, "loss": 0.071, "step": 213 }, { "epoch": 0.691437802907916, "grad_norm": 0.061137910932302475, "learning_rate": 1.9941375422773395e-05, "loss": 0.0995, "step": 214 }, { "epoch": 0.6946688206785138, "grad_norm": 0.05603012442588806, "learning_rate": 1.993686583990981e-05, "loss": 0.0787, "step": 215 }, { "epoch": 0.6978998384491115, "grad_norm": 0.06130882352590561, "learning_rate": 1.9932356257046224e-05, "loss": 0.0745, "step": 216 }, { "epoch": 0.7011308562197092, "grad_norm": 0.06154336780309677, "learning_rate": 1.9927846674182642e-05, "loss": 0.0707, "step": 217 }, { "epoch": 0.7043618739903069, "grad_norm": 0.06344747543334961, "learning_rate": 1.9923337091319056e-05, "loss": 0.0978, "step": 218 }, { "epoch": 0.7075928917609047, "grad_norm": 0.06630375236272812, "learning_rate": 1.991882750845547e-05, "loss": 0.0721, "step": 219 }, { "epoch": 0.7108239095315024, "grad_norm": 0.04976838827133179, "learning_rate": 1.9914317925591885e-05, "loss": 0.0595, "step": 220 }, { "epoch": 0.7140549273021002, "grad_norm": 0.05968477949500084, "learning_rate": 1.99098083427283e-05, "loss": 0.0792, "step": 221 }, { "epoch": 0.7172859450726979, "grad_norm": 0.0983344092965126, "learning_rate": 1.9905298759864714e-05, "loss": 0.0872, "step": 222 }, { "epoch": 0.7205169628432956, "grad_norm": 0.08061773329973221, "learning_rate": 1.990078917700113e-05, "loss": 0.0824, "step": 223 }, { "epoch": 0.7237479806138933, "grad_norm": 0.055074963718652725, "learning_rate": 1.9896279594137543e-05, "loss": 0.0677, "step": 224 }, { "epoch": 0.7269789983844911, "grad_norm": 0.06062469258904457, "learning_rate": 1.9891770011273957e-05, "loss": 0.0658, "step": 225 }, { "epoch": 0.7269789983844911, "eval_loss": 0.08582010865211487, "eval_runtime": 188.2111, "eval_samples_per_second": 1.047, "eval_steps_per_second": 1.047, "step": 225 }, { "epoch": 0.7302100161550888, "grad_norm": 0.07606152445077896, "learning_rate": 1.988726042841037e-05, "loss": 0.1078, "step": 226 }, { "epoch": 0.7334410339256866, "grad_norm": 0.06171920895576477, "learning_rate": 1.988275084554679e-05, "loss": 0.086, "step": 227 }, { "epoch": 0.7366720516962844, "grad_norm": 0.03934045881032944, "learning_rate": 1.9878241262683204e-05, "loss": 0.0516, "step": 228 }, { "epoch": 0.7399030694668821, "grad_norm": 0.0552021786570549, "learning_rate": 1.9873731679819618e-05, "loss": 0.0761, "step": 229 }, { "epoch": 0.7431340872374798, "grad_norm": 0.05151893198490143, "learning_rate": 1.9869222096956032e-05, "loss": 0.0726, "step": 230 }, { "epoch": 0.7463651050080775, "grad_norm": 0.0533306822180748, "learning_rate": 1.9864712514092447e-05, "loss": 0.0717, "step": 231 }, { "epoch": 0.7495961227786753, "grad_norm": 0.052841685712337494, "learning_rate": 1.986020293122886e-05, "loss": 0.0755, "step": 232 }, { "epoch": 0.752827140549273, "grad_norm": 0.040998924523591995, "learning_rate": 1.9855693348365276e-05, "loss": 0.0554, "step": 233 }, { "epoch": 0.7560581583198708, "grad_norm": 0.057859815657138824, "learning_rate": 1.9851183765501693e-05, "loss": 0.0719, "step": 234 }, { "epoch": 0.7592891760904685, "grad_norm": 0.04167502373456955, "learning_rate": 1.9846674182638108e-05, "loss": 0.0625, "step": 235 }, { "epoch": 0.7625201938610663, "grad_norm": 0.058570049703121185, "learning_rate": 1.9842164599774522e-05, "loss": 0.0773, "step": 236 }, { "epoch": 0.7657512116316639, "grad_norm": 0.06181475892663002, "learning_rate": 1.9837655016910937e-05, "loss": 0.083, "step": 237 }, { "epoch": 0.7689822294022617, "grad_norm": 0.06188640370965004, "learning_rate": 1.983314543404735e-05, "loss": 0.0838, "step": 238 }, { "epoch": 0.7722132471728594, "grad_norm": 0.0784875750541687, "learning_rate": 1.9828635851183765e-05, "loss": 0.1032, "step": 239 }, { "epoch": 0.7754442649434572, "grad_norm": 0.06791771203279495, "learning_rate": 1.9824126268320183e-05, "loss": 0.0888, "step": 240 }, { "epoch": 0.778675282714055, "grad_norm": 0.06790699809789658, "learning_rate": 1.9819616685456598e-05, "loss": 0.0919, "step": 241 }, { "epoch": 0.7819063004846527, "grad_norm": 0.04812704026699066, "learning_rate": 1.9815107102593012e-05, "loss": 0.058, "step": 242 }, { "epoch": 0.7851373182552503, "grad_norm": 0.0617465041577816, "learning_rate": 1.9810597519729426e-05, "loss": 0.0852, "step": 243 }, { "epoch": 0.7883683360258481, "grad_norm": 0.049844078719615936, "learning_rate": 1.980608793686584e-05, "loss": 0.0668, "step": 244 }, { "epoch": 0.7915993537964459, "grad_norm": 0.0725836232304573, "learning_rate": 1.9801578354002255e-05, "loss": 0.0929, "step": 245 }, { "epoch": 0.7948303715670436, "grad_norm": 0.0587320439517498, "learning_rate": 1.9797068771138673e-05, "loss": 0.0764, "step": 246 }, { "epoch": 0.7980613893376414, "grad_norm": 0.04824487864971161, "learning_rate": 1.9792559188275087e-05, "loss": 0.0562, "step": 247 }, { "epoch": 0.8012924071082391, "grad_norm": 0.06668104231357574, "learning_rate": 1.97880496054115e-05, "loss": 0.0842, "step": 248 }, { "epoch": 0.8045234248788369, "grad_norm": 0.057721976190805435, "learning_rate": 1.9783540022547916e-05, "loss": 0.0838, "step": 249 }, { "epoch": 0.8077544426494345, "grad_norm": 0.07014774531126022, "learning_rate": 1.977903043968433e-05, "loss": 0.0831, "step": 250 }, { "epoch": 0.8109854604200323, "grad_norm": 0.0693356841802597, "learning_rate": 1.9774520856820745e-05, "loss": 0.0903, "step": 251 }, { "epoch": 0.81421647819063, "grad_norm": 0.05464401841163635, "learning_rate": 1.9770011273957163e-05, "loss": 0.0682, "step": 252 }, { "epoch": 0.8174474959612278, "grad_norm": 0.053677983582019806, "learning_rate": 1.9765501691093577e-05, "loss": 0.0588, "step": 253 }, { "epoch": 0.8206785137318255, "grad_norm": 0.04152385890483856, "learning_rate": 1.976099210822999e-05, "loss": 0.0564, "step": 254 }, { "epoch": 0.8239095315024233, "grad_norm": 0.06150719150900841, "learning_rate": 1.9756482525366406e-05, "loss": 0.0694, "step": 255 }, { "epoch": 0.827140549273021, "grad_norm": 0.05433864891529083, "learning_rate": 1.975197294250282e-05, "loss": 0.067, "step": 256 }, { "epoch": 0.8303715670436187, "grad_norm": 0.04325372353196144, "learning_rate": 1.9747463359639235e-05, "loss": 0.0575, "step": 257 }, { "epoch": 0.8336025848142165, "grad_norm": 0.049097690731287, "learning_rate": 1.9742953776775652e-05, "loss": 0.0571, "step": 258 }, { "epoch": 0.8368336025848142, "grad_norm": 0.06370379030704498, "learning_rate": 1.9738444193912067e-05, "loss": 0.0877, "step": 259 }, { "epoch": 0.840064620355412, "grad_norm": 0.05573710799217224, "learning_rate": 1.973393461104848e-05, "loss": 0.0759, "step": 260 }, { "epoch": 0.8432956381260097, "grad_norm": 0.06537079066038132, "learning_rate": 1.9729425028184896e-05, "loss": 0.0759, "step": 261 }, { "epoch": 0.8465266558966075, "grad_norm": 0.04301934316754341, "learning_rate": 1.972491544532131e-05, "loss": 0.0564, "step": 262 }, { "epoch": 0.8497576736672051, "grad_norm": 0.07281677424907684, "learning_rate": 1.9720405862457724e-05, "loss": 0.0685, "step": 263 }, { "epoch": 0.8529886914378029, "grad_norm": 0.06059825047850609, "learning_rate": 1.971589627959414e-05, "loss": 0.071, "step": 264 }, { "epoch": 0.8562197092084006, "grad_norm": 0.05605108663439751, "learning_rate": 1.9711386696730553e-05, "loss": 0.0634, "step": 265 }, { "epoch": 0.8594507269789984, "grad_norm": 0.07372546941041946, "learning_rate": 1.9706877113866967e-05, "loss": 0.0916, "step": 266 }, { "epoch": 0.8626817447495961, "grad_norm": 0.051352906972169876, "learning_rate": 1.9702367531003382e-05, "loss": 0.0653, "step": 267 }, { "epoch": 0.8659127625201939, "grad_norm": 0.059334397315979004, "learning_rate": 1.96978579481398e-05, "loss": 0.0717, "step": 268 }, { "epoch": 0.8691437802907916, "grad_norm": 0.06220857426524162, "learning_rate": 1.9693348365276214e-05, "loss": 0.0612, "step": 269 }, { "epoch": 0.8723747980613893, "grad_norm": 0.053203944116830826, "learning_rate": 1.968883878241263e-05, "loss": 0.0699, "step": 270 }, { "epoch": 0.875605815831987, "grad_norm": 0.06943807750940323, "learning_rate": 1.9684329199549043e-05, "loss": 0.0793, "step": 271 }, { "epoch": 0.8788368336025848, "grad_norm": 0.07023902982473373, "learning_rate": 1.9679819616685457e-05, "loss": 0.0859, "step": 272 }, { "epoch": 0.8820678513731826, "grad_norm": 0.06727661192417145, "learning_rate": 1.967531003382187e-05, "loss": 0.0796, "step": 273 }, { "epoch": 0.8852988691437803, "grad_norm": 0.08004336804151535, "learning_rate": 1.9670800450958286e-05, "loss": 0.0989, "step": 274 }, { "epoch": 0.8885298869143781, "grad_norm": 0.06687738746404648, "learning_rate": 1.96662908680947e-05, "loss": 0.0674, "step": 275 }, { "epoch": 0.8917609046849758, "grad_norm": 0.06280867755413055, "learning_rate": 1.9661781285231118e-05, "loss": 0.0735, "step": 276 }, { "epoch": 0.8949919224555735, "grad_norm": 0.06883740425109863, "learning_rate": 1.9657271702367533e-05, "loss": 0.0893, "step": 277 }, { "epoch": 0.8982229402261712, "grad_norm": 0.059292376041412354, "learning_rate": 1.9652762119503947e-05, "loss": 0.0819, "step": 278 }, { "epoch": 0.901453957996769, "grad_norm": 0.0578530877828598, "learning_rate": 1.964825253664036e-05, "loss": 0.0724, "step": 279 }, { "epoch": 0.9046849757673667, "grad_norm": 0.08932427316904068, "learning_rate": 1.9643742953776776e-05, "loss": 0.0939, "step": 280 }, { "epoch": 0.9079159935379645, "grad_norm": 0.07406419515609741, "learning_rate": 1.963923337091319e-05, "loss": 0.0868, "step": 281 }, { "epoch": 0.9111470113085622, "grad_norm": 0.05354011803865433, "learning_rate": 1.9634723788049608e-05, "loss": 0.0685, "step": 282 }, { "epoch": 0.9143780290791599, "grad_norm": 0.06414072960615158, "learning_rate": 1.9630214205186022e-05, "loss": 0.0686, "step": 283 }, { "epoch": 0.9176090468497576, "grad_norm": 0.058192793279886246, "learning_rate": 1.9625704622322437e-05, "loss": 0.0674, "step": 284 }, { "epoch": 0.9208400646203554, "grad_norm": 0.10264746099710464, "learning_rate": 1.962119503945885e-05, "loss": 0.1152, "step": 285 }, { "epoch": 0.9240710823909531, "grad_norm": 0.066757932305336, "learning_rate": 1.9616685456595265e-05, "loss": 0.0757, "step": 286 }, { "epoch": 0.9273021001615509, "grad_norm": 0.06598404794931412, "learning_rate": 1.9612175873731683e-05, "loss": 0.0768, "step": 287 }, { "epoch": 0.9305331179321487, "grad_norm": 0.07162454724311829, "learning_rate": 1.9607666290868098e-05, "loss": 0.081, "step": 288 }, { "epoch": 0.9337641357027464, "grad_norm": 0.05917588993906975, "learning_rate": 1.9603156708004512e-05, "loss": 0.0645, "step": 289 }, { "epoch": 0.9369951534733441, "grad_norm": 0.06051475182175636, "learning_rate": 1.9598647125140926e-05, "loss": 0.0656, "step": 290 }, { "epoch": 0.9402261712439418, "grad_norm": 0.06452775001525879, "learning_rate": 1.959413754227734e-05, "loss": 0.0704, "step": 291 }, { "epoch": 0.9434571890145396, "grad_norm": 0.06445769965648651, "learning_rate": 1.9589627959413755e-05, "loss": 0.0759, "step": 292 }, { "epoch": 0.9466882067851373, "grad_norm": 0.06948834657669067, "learning_rate": 1.9585118376550173e-05, "loss": 0.0776, "step": 293 }, { "epoch": 0.9499192245557351, "grad_norm": 0.05026319995522499, "learning_rate": 1.9580608793686587e-05, "loss": 0.0574, "step": 294 }, { "epoch": 0.9531502423263328, "grad_norm": 0.08733383566141129, "learning_rate": 1.9576099210823002e-05, "loss": 0.0898, "step": 295 }, { "epoch": 0.9563812600969306, "grad_norm": 0.05138668045401573, "learning_rate": 1.9571589627959416e-05, "loss": 0.0525, "step": 296 }, { "epoch": 0.9596122778675282, "grad_norm": 0.0710444375872612, "learning_rate": 1.956708004509583e-05, "loss": 0.0832, "step": 297 }, { "epoch": 0.962843295638126, "grad_norm": 0.06288463622331619, "learning_rate": 1.9562570462232245e-05, "loss": 0.0673, "step": 298 }, { "epoch": 0.9660743134087237, "grad_norm": 0.05722356587648392, "learning_rate": 1.9558060879368663e-05, "loss": 0.0702, "step": 299 }, { "epoch": 0.9693053311793215, "grad_norm": 0.07167758047580719, "learning_rate": 1.9553551296505077e-05, "loss": 0.0809, "step": 300 }, { "epoch": 0.9693053311793215, "eval_loss": 0.08212888240814209, "eval_runtime": 188.3411, "eval_samples_per_second": 1.046, "eval_steps_per_second": 1.046, "step": 300 }, { "epoch": 0.9725363489499192, "grad_norm": 0.04536513611674309, "learning_rate": 1.954904171364149e-05, "loss": 0.0485, "step": 301 }, { "epoch": 0.975767366720517, "grad_norm": 0.07035136222839355, "learning_rate": 1.9544532130777906e-05, "loss": 0.0763, "step": 302 }, { "epoch": 0.9789983844911146, "grad_norm": 0.06417107582092285, "learning_rate": 1.954002254791432e-05, "loss": 0.0735, "step": 303 }, { "epoch": 0.9822294022617124, "grad_norm": 0.06369137018918991, "learning_rate": 1.9535512965050735e-05, "loss": 0.07, "step": 304 }, { "epoch": 0.9854604200323102, "grad_norm": 0.053664304316043854, "learning_rate": 1.953100338218715e-05, "loss": 0.058, "step": 305 }, { "epoch": 0.9886914378029079, "grad_norm": 0.07393426448106766, "learning_rate": 1.9526493799323563e-05, "loss": 0.0783, "step": 306 }, { "epoch": 0.9919224555735057, "grad_norm": 0.05141662806272507, "learning_rate": 1.9521984216459978e-05, "loss": 0.0556, "step": 307 }, { "epoch": 0.9951534733441034, "grad_norm": 0.06411275267601013, "learning_rate": 1.9517474633596392e-05, "loss": 0.0687, "step": 308 }, { "epoch": 0.9983844911147012, "grad_norm": 0.08877477794885635, "learning_rate": 1.951296505073281e-05, "loss": 0.0846, "step": 309 }, { "epoch": 1.0, "grad_norm": 0.08283062279224396, "learning_rate": 1.9508455467869224e-05, "loss": 0.0541, "step": 310 }, { "epoch": 1.0032310177705976, "grad_norm": 0.06769707798957825, "learning_rate": 1.950394588500564e-05, "loss": 0.0752, "step": 311 }, { "epoch": 1.0064620355411955, "grad_norm": 0.06125921383500099, "learning_rate": 1.9499436302142053e-05, "loss": 0.0719, "step": 312 }, { "epoch": 1.0096930533117932, "grad_norm": 0.03994071111083031, "learning_rate": 1.9494926719278468e-05, "loss": 0.0421, "step": 313 }, { "epoch": 1.012924071082391, "grad_norm": 0.05433064326643944, "learning_rate": 1.9490417136414882e-05, "loss": 0.066, "step": 314 }, { "epoch": 1.0161550888529887, "grad_norm": 0.06107380986213684, "learning_rate": 1.9485907553551296e-05, "loss": 0.0724, "step": 315 }, { "epoch": 1.0193861066235865, "grad_norm": 0.0669042244553566, "learning_rate": 1.948139797068771e-05, "loss": 0.0772, "step": 316 }, { "epoch": 1.0226171243941842, "grad_norm": 0.0474565327167511, "learning_rate": 1.947688838782413e-05, "loss": 0.0491, "step": 317 }, { "epoch": 1.0258481421647818, "grad_norm": 0.054098691791296005, "learning_rate": 1.9472378804960543e-05, "loss": 0.0618, "step": 318 }, { "epoch": 1.0290791599353797, "grad_norm": 0.06151336431503296, "learning_rate": 1.9467869222096957e-05, "loss": 0.0604, "step": 319 }, { "epoch": 1.0323101777059773, "grad_norm": 0.051618464291095734, "learning_rate": 1.946335963923337e-05, "loss": 0.0551, "step": 320 }, { "epoch": 1.0355411954765752, "grad_norm": 0.08121399581432343, "learning_rate": 1.9458850056369786e-05, "loss": 0.0939, "step": 321 }, { "epoch": 1.0387722132471728, "grad_norm": 0.05889379233121872, "learning_rate": 1.94543404735062e-05, "loss": 0.0687, "step": 322 }, { "epoch": 1.0420032310177707, "grad_norm": 0.06208242103457451, "learning_rate": 1.9449830890642618e-05, "loss": 0.0744, "step": 323 }, { "epoch": 1.0452342487883683, "grad_norm": 0.06454786658287048, "learning_rate": 1.9445321307779033e-05, "loss": 0.0684, "step": 324 }, { "epoch": 1.048465266558966, "grad_norm": 0.07085470855236053, "learning_rate": 1.9440811724915447e-05, "loss": 0.0727, "step": 325 }, { "epoch": 1.0516962843295639, "grad_norm": 0.07236117869615555, "learning_rate": 1.943630214205186e-05, "loss": 0.0892, "step": 326 }, { "epoch": 1.0549273021001615, "grad_norm": 0.054056137800216675, "learning_rate": 1.9431792559188276e-05, "loss": 0.0634, "step": 327 }, { "epoch": 1.0581583198707594, "grad_norm": 0.05462612211704254, "learning_rate": 1.942728297632469e-05, "loss": 0.0631, "step": 328 }, { "epoch": 1.061389337641357, "grad_norm": 0.0674949586391449, "learning_rate": 1.9422773393461108e-05, "loss": 0.0641, "step": 329 }, { "epoch": 1.0646203554119547, "grad_norm": 0.07532529532909393, "learning_rate": 1.9418263810597522e-05, "loss": 0.0765, "step": 330 }, { "epoch": 1.0678513731825525, "grad_norm": 0.06264142692089081, "learning_rate": 1.9413754227733937e-05, "loss": 0.0704, "step": 331 }, { "epoch": 1.0710823909531502, "grad_norm": 0.06789285689592361, "learning_rate": 1.940924464487035e-05, "loss": 0.0667, "step": 332 }, { "epoch": 1.074313408723748, "grad_norm": 0.06181450933218002, "learning_rate": 1.9404735062006765e-05, "loss": 0.0648, "step": 333 }, { "epoch": 1.0775444264943457, "grad_norm": 0.07014179229736328, "learning_rate": 1.940022547914318e-05, "loss": 0.076, "step": 334 }, { "epoch": 1.0807754442649435, "grad_norm": 0.07433414459228516, "learning_rate": 1.9395715896279598e-05, "loss": 0.0757, "step": 335 }, { "epoch": 1.0840064620355412, "grad_norm": 0.04758503660559654, "learning_rate": 1.9391206313416012e-05, "loss": 0.0492, "step": 336 }, { "epoch": 1.0872374798061388, "grad_norm": 0.06751306354999542, "learning_rate": 1.9386696730552426e-05, "loss": 0.0682, "step": 337 }, { "epoch": 1.0904684975767367, "grad_norm": 0.06028216332197189, "learning_rate": 1.938218714768884e-05, "loss": 0.059, "step": 338 }, { "epoch": 1.0936995153473343, "grad_norm": 0.060358040034770966, "learning_rate": 1.9377677564825255e-05, "loss": 0.0659, "step": 339 }, { "epoch": 1.0969305331179322, "grad_norm": 0.06687436252832413, "learning_rate": 1.937316798196167e-05, "loss": 0.0517, "step": 340 }, { "epoch": 1.1001615508885298, "grad_norm": 0.07463373243808746, "learning_rate": 1.9368658399098087e-05, "loss": 0.0674, "step": 341 }, { "epoch": 1.1033925686591277, "grad_norm": 0.06248531863093376, "learning_rate": 1.9364148816234502e-05, "loss": 0.0681, "step": 342 }, { "epoch": 1.1066235864297254, "grad_norm": 0.06864578276872635, "learning_rate": 1.9359639233370916e-05, "loss": 0.0703, "step": 343 }, { "epoch": 1.109854604200323, "grad_norm": 0.0693066269159317, "learning_rate": 1.935512965050733e-05, "loss": 0.0599, "step": 344 }, { "epoch": 1.1130856219709209, "grad_norm": 0.13610310852527618, "learning_rate": 1.9350620067643745e-05, "loss": 0.0789, "step": 345 }, { "epoch": 1.1163166397415185, "grad_norm": 0.09487364441156387, "learning_rate": 1.934611048478016e-05, "loss": 0.094, "step": 346 }, { "epoch": 1.1195476575121164, "grad_norm": 0.0767926499247551, "learning_rate": 1.9341600901916574e-05, "loss": 0.0751, "step": 347 }, { "epoch": 1.122778675282714, "grad_norm": 0.1105605959892273, "learning_rate": 1.9337091319052988e-05, "loss": 0.0908, "step": 348 }, { "epoch": 1.1260096930533119, "grad_norm": 0.06821838766336441, "learning_rate": 1.9332581736189403e-05, "loss": 0.0702, "step": 349 }, { "epoch": 1.1292407108239095, "grad_norm": 0.07123742997646332, "learning_rate": 1.932807215332582e-05, "loss": 0.0637, "step": 350 }, { "epoch": 1.1324717285945072, "grad_norm": 0.08340942859649658, "learning_rate": 1.9323562570462235e-05, "loss": 0.0803, "step": 351 }, { "epoch": 1.135702746365105, "grad_norm": 0.06730187684297562, "learning_rate": 1.931905298759865e-05, "loss": 0.0644, "step": 352 }, { "epoch": 1.1389337641357027, "grad_norm": 0.06728731095790863, "learning_rate": 1.9314543404735063e-05, "loss": 0.0633, "step": 353 }, { "epoch": 1.1421647819063006, "grad_norm": 0.07192697376012802, "learning_rate": 1.9310033821871478e-05, "loss": 0.0664, "step": 354 }, { "epoch": 1.1453957996768982, "grad_norm": 0.07150010764598846, "learning_rate": 1.9305524239007892e-05, "loss": 0.0745, "step": 355 }, { "epoch": 1.148626817447496, "grad_norm": 0.05815986543893814, "learning_rate": 1.9301014656144307e-05, "loss": 0.0501, "step": 356 }, { "epoch": 1.1518578352180937, "grad_norm": 0.063558429479599, "learning_rate": 1.929650507328072e-05, "loss": 0.064, "step": 357 }, { "epoch": 1.1550888529886914, "grad_norm": 0.08062389492988586, "learning_rate": 1.9291995490417135e-05, "loss": 0.0792, "step": 358 }, { "epoch": 1.1583198707592892, "grad_norm": 0.06872212886810303, "learning_rate": 1.9287485907553553e-05, "loss": 0.0756, "step": 359 }, { "epoch": 1.1615508885298869, "grad_norm": 0.06003013253211975, "learning_rate": 1.9282976324689968e-05, "loss": 0.0577, "step": 360 }, { "epoch": 1.1647819063004847, "grad_norm": 0.07533125579357147, "learning_rate": 1.9278466741826382e-05, "loss": 0.0745, "step": 361 }, { "epoch": 1.1680129240710824, "grad_norm": 0.0708516389131546, "learning_rate": 1.9273957158962796e-05, "loss": 0.0671, "step": 362 }, { "epoch": 1.1712439418416802, "grad_norm": 0.10226985812187195, "learning_rate": 1.926944757609921e-05, "loss": 0.1126, "step": 363 }, { "epoch": 1.1744749596122779, "grad_norm": 0.067733995616436, "learning_rate": 1.9264937993235625e-05, "loss": 0.0554, "step": 364 }, { "epoch": 1.1777059773828755, "grad_norm": 0.08708222955465317, "learning_rate": 1.9260428410372043e-05, "loss": 0.0806, "step": 365 }, { "epoch": 1.1809369951534734, "grad_norm": 0.06153462454676628, "learning_rate": 1.9255918827508457e-05, "loss": 0.0532, "step": 366 }, { "epoch": 1.184168012924071, "grad_norm": 0.051941219717264175, "learning_rate": 1.9251409244644872e-05, "loss": 0.0503, "step": 367 }, { "epoch": 1.187399030694669, "grad_norm": 0.09817774593830109, "learning_rate": 1.9246899661781286e-05, "loss": 0.0801, "step": 368 }, { "epoch": 1.1906300484652665, "grad_norm": 0.08504205197095871, "learning_rate": 1.92423900789177e-05, "loss": 0.08, "step": 369 }, { "epoch": 1.1938610662358644, "grad_norm": 0.0611301064491272, "learning_rate": 1.9237880496054118e-05, "loss": 0.061, "step": 370 }, { "epoch": 1.197092084006462, "grad_norm": 0.06038827449083328, "learning_rate": 1.9233370913190533e-05, "loss": 0.0512, "step": 371 }, { "epoch": 1.2003231017770597, "grad_norm": 0.08283468335866928, "learning_rate": 1.9228861330326947e-05, "loss": 0.0753, "step": 372 }, { "epoch": 1.2035541195476576, "grad_norm": 0.09547346830368042, "learning_rate": 1.922435174746336e-05, "loss": 0.0809, "step": 373 }, { "epoch": 1.2067851373182552, "grad_norm": 0.058611780405044556, "learning_rate": 1.9219842164599776e-05, "loss": 0.0563, "step": 374 }, { "epoch": 1.210016155088853, "grad_norm": 0.08549389988183975, "learning_rate": 1.921533258173619e-05, "loss": 0.0733, "step": 375 }, { "epoch": 1.210016155088853, "eval_loss": 0.0792868584394455, "eval_runtime": 188.2838, "eval_samples_per_second": 1.046, "eval_steps_per_second": 1.046, "step": 375 }, { "epoch": 1.2132471728594507, "grad_norm": 0.08417635411024094, "learning_rate": 1.9210822998872608e-05, "loss": 0.0841, "step": 376 }, { "epoch": 1.2164781906300486, "grad_norm": 0.08157463371753693, "learning_rate": 1.9206313416009022e-05, "loss": 0.0714, "step": 377 }, { "epoch": 1.2197092084006462, "grad_norm": 0.05649822950363159, "learning_rate": 1.9201803833145437e-05, "loss": 0.0503, "step": 378 }, { "epoch": 1.2229402261712439, "grad_norm": 0.07617928087711334, "learning_rate": 1.919729425028185e-05, "loss": 0.0727, "step": 379 }, { "epoch": 1.2261712439418417, "grad_norm": 0.0574098639190197, "learning_rate": 1.9192784667418266e-05, "loss": 0.0506, "step": 380 }, { "epoch": 1.2294022617124394, "grad_norm": 0.07354257255792618, "learning_rate": 1.918827508455468e-05, "loss": 0.0728, "step": 381 }, { "epoch": 1.2326332794830372, "grad_norm": 0.07268121838569641, "learning_rate": 1.9183765501691098e-05, "loss": 0.0679, "step": 382 }, { "epoch": 1.235864297253635, "grad_norm": 0.07641527056694031, "learning_rate": 1.9179255918827512e-05, "loss": 0.0663, "step": 383 }, { "epoch": 1.2390953150242328, "grad_norm": 0.059996772557497025, "learning_rate": 1.9174746335963926e-05, "loss": 0.0523, "step": 384 }, { "epoch": 1.2423263327948304, "grad_norm": 0.07397306710481644, "learning_rate": 1.917023675310034e-05, "loss": 0.0662, "step": 385 }, { "epoch": 1.245557350565428, "grad_norm": 0.09324625134468079, "learning_rate": 1.9165727170236755e-05, "loss": 0.083, "step": 386 }, { "epoch": 1.248788368336026, "grad_norm": 0.08019818365573883, "learning_rate": 1.916121758737317e-05, "loss": 0.0682, "step": 387 }, { "epoch": 1.2520193861066236, "grad_norm": 0.08203406631946564, "learning_rate": 1.9156708004509584e-05, "loss": 0.0788, "step": 388 }, { "epoch": 1.2552504038772212, "grad_norm": 0.07293461263179779, "learning_rate": 1.9152198421646e-05, "loss": 0.0583, "step": 389 }, { "epoch": 1.258481421647819, "grad_norm": 0.07020010054111481, "learning_rate": 1.9147688838782413e-05, "loss": 0.0546, "step": 390 }, { "epoch": 1.261712439418417, "grad_norm": 0.0655217245221138, "learning_rate": 1.914317925591883e-05, "loss": 0.0561, "step": 391 }, { "epoch": 1.2649434571890146, "grad_norm": 0.0773930773139, "learning_rate": 1.9138669673055245e-05, "loss": 0.0805, "step": 392 }, { "epoch": 1.2681744749596122, "grad_norm": 0.06243716925382614, "learning_rate": 1.913416009019166e-05, "loss": 0.0573, "step": 393 }, { "epoch": 1.27140549273021, "grad_norm": 0.07922864705324173, "learning_rate": 1.9129650507328074e-05, "loss": 0.0679, "step": 394 }, { "epoch": 1.2746365105008077, "grad_norm": 0.10133316367864609, "learning_rate": 1.9125140924464488e-05, "loss": 0.1135, "step": 395 }, { "epoch": 1.2778675282714054, "grad_norm": 0.0727897360920906, "learning_rate": 1.9120631341600903e-05, "loss": 0.0658, "step": 396 }, { "epoch": 1.2810985460420032, "grad_norm": 0.0690392330288887, "learning_rate": 1.9116121758737317e-05, "loss": 0.0656, "step": 397 }, { "epoch": 1.284329563812601, "grad_norm": 0.062050607055425644, "learning_rate": 1.911161217587373e-05, "loss": 0.0542, "step": 398 }, { "epoch": 1.2875605815831987, "grad_norm": 0.0690266340970993, "learning_rate": 1.9107102593010146e-05, "loss": 0.065, "step": 399 }, { "epoch": 1.2907915993537964, "grad_norm": 0.07588627934455872, "learning_rate": 1.9102593010146564e-05, "loss": 0.0761, "step": 400 }, { "epoch": 1.2940226171243943, "grad_norm": 0.07368933409452438, "learning_rate": 1.9098083427282978e-05, "loss": 0.0637, "step": 401 }, { "epoch": 1.297253634894992, "grad_norm": 0.0670572817325592, "learning_rate": 1.9093573844419392e-05, "loss": 0.0656, "step": 402 }, { "epoch": 1.3004846526655895, "grad_norm": 0.06778164952993393, "learning_rate": 1.9089064261555807e-05, "loss": 0.0577, "step": 403 }, { "epoch": 1.3037156704361874, "grad_norm": 0.10589181631803513, "learning_rate": 1.908455467869222e-05, "loss": 0.0849, "step": 404 }, { "epoch": 1.306946688206785, "grad_norm": 0.060739271342754364, "learning_rate": 1.9080045095828635e-05, "loss": 0.0563, "step": 405 }, { "epoch": 1.310177705977383, "grad_norm": 0.062488917261362076, "learning_rate": 1.9075535512965053e-05, "loss": 0.0555, "step": 406 }, { "epoch": 1.3134087237479806, "grad_norm": 0.08088962733745575, "learning_rate": 1.9071025930101468e-05, "loss": 0.0683, "step": 407 }, { "epoch": 1.3166397415185784, "grad_norm": 0.07679299265146255, "learning_rate": 1.9066516347237882e-05, "loss": 0.0713, "step": 408 }, { "epoch": 1.319870759289176, "grad_norm": 0.09731165319681168, "learning_rate": 1.9062006764374296e-05, "loss": 0.094, "step": 409 }, { "epoch": 1.3231017770597737, "grad_norm": 0.09488274902105331, "learning_rate": 1.905749718151071e-05, "loss": 0.0855, "step": 410 }, { "epoch": 1.3263327948303716, "grad_norm": 0.08556380867958069, "learning_rate": 1.9052987598647125e-05, "loss": 0.0872, "step": 411 }, { "epoch": 1.3295638126009692, "grad_norm": 0.0821579322218895, "learning_rate": 1.9048478015783543e-05, "loss": 0.0802, "step": 412 }, { "epoch": 1.332794830371567, "grad_norm": 0.08073097467422485, "learning_rate": 1.9043968432919957e-05, "loss": 0.0764, "step": 413 }, { "epoch": 1.3360258481421647, "grad_norm": 0.07449216395616531, "learning_rate": 1.9039458850056372e-05, "loss": 0.0681, "step": 414 }, { "epoch": 1.3392568659127626, "grad_norm": 0.05690048635005951, "learning_rate": 1.9034949267192786e-05, "loss": 0.0479, "step": 415 }, { "epoch": 1.3424878836833603, "grad_norm": 0.08106525242328644, "learning_rate": 1.90304396843292e-05, "loss": 0.0751, "step": 416 }, { "epoch": 1.345718901453958, "grad_norm": 0.06518511474132538, "learning_rate": 1.9025930101465615e-05, "loss": 0.0594, "step": 417 }, { "epoch": 1.3489499192245558, "grad_norm": 0.08395849913358688, "learning_rate": 1.9021420518602033e-05, "loss": 0.0756, "step": 418 }, { "epoch": 1.3521809369951534, "grad_norm": 0.05958770960569382, "learning_rate": 1.9016910935738447e-05, "loss": 0.0491, "step": 419 }, { "epoch": 1.3554119547657513, "grad_norm": 0.07311136275529861, "learning_rate": 1.901240135287486e-05, "loss": 0.0719, "step": 420 }, { "epoch": 1.358642972536349, "grad_norm": 0.09682740271091461, "learning_rate": 1.9007891770011276e-05, "loss": 0.0895, "step": 421 }, { "epoch": 1.3618739903069468, "grad_norm": 0.06295045465230942, "learning_rate": 1.900338218714769e-05, "loss": 0.0589, "step": 422 }, { "epoch": 1.3651050080775444, "grad_norm": 0.0831819698214531, "learning_rate": 1.8998872604284105e-05, "loss": 0.0727, "step": 423 }, { "epoch": 1.368336025848142, "grad_norm": 0.06702585518360138, "learning_rate": 1.8994363021420522e-05, "loss": 0.0617, "step": 424 }, { "epoch": 1.37156704361874, "grad_norm": 0.06618952006101608, "learning_rate": 1.8989853438556937e-05, "loss": 0.0517, "step": 425 }, { "epoch": 1.3747980613893376, "grad_norm": 0.07830128818750381, "learning_rate": 1.898534385569335e-05, "loss": 0.0747, "step": 426 }, { "epoch": 1.3780290791599354, "grad_norm": 0.07554402947425842, "learning_rate": 1.8980834272829766e-05, "loss": 0.078, "step": 427 }, { "epoch": 1.381260096930533, "grad_norm": 0.07517927139997482, "learning_rate": 1.897632468996618e-05, "loss": 0.0715, "step": 428 }, { "epoch": 1.384491114701131, "grad_norm": 0.05810945853590965, "learning_rate": 1.8971815107102594e-05, "loss": 0.0524, "step": 429 }, { "epoch": 1.3877221324717286, "grad_norm": 0.1092490404844284, "learning_rate": 1.896730552423901e-05, "loss": 0.0987, "step": 430 }, { "epoch": 1.3909531502423262, "grad_norm": 0.08325308561325073, "learning_rate": 1.8962795941375423e-05, "loss": 0.0746, "step": 431 }, { "epoch": 1.394184168012924, "grad_norm": 0.08017408847808838, "learning_rate": 1.895828635851184e-05, "loss": 0.0676, "step": 432 }, { "epoch": 1.3974151857835218, "grad_norm": 0.09756331145763397, "learning_rate": 1.8953776775648255e-05, "loss": 0.0788, "step": 433 }, { "epoch": 1.4006462035541196, "grad_norm": 0.0654483512043953, "learning_rate": 1.894926719278467e-05, "loss": 0.0552, "step": 434 }, { "epoch": 1.4038772213247173, "grad_norm": 0.07338982075452805, "learning_rate": 1.8944757609921084e-05, "loss": 0.0597, "step": 435 }, { "epoch": 1.4071082390953151, "grad_norm": 0.06292750686407089, "learning_rate": 1.89402480270575e-05, "loss": 0.0482, "step": 436 }, { "epoch": 1.4103392568659128, "grad_norm": 0.09405938535928726, "learning_rate": 1.8935738444193913e-05, "loss": 0.0795, "step": 437 }, { "epoch": 1.4135702746365104, "grad_norm": 0.09486392885446548, "learning_rate": 1.8931228861330327e-05, "loss": 0.0811, "step": 438 }, { "epoch": 1.4168012924071083, "grad_norm": 0.0729052945971489, "learning_rate": 1.892671927846674e-05, "loss": 0.0682, "step": 439 }, { "epoch": 1.420032310177706, "grad_norm": 0.06790515035390854, "learning_rate": 1.8922209695603156e-05, "loss": 0.0541, "step": 440 }, { "epoch": 1.4232633279483038, "grad_norm": 0.08173596858978271, "learning_rate": 1.891770011273957e-05, "loss": 0.0697, "step": 441 }, { "epoch": 1.4264943457189014, "grad_norm": 0.0874050036072731, "learning_rate": 1.8913190529875988e-05, "loss": 0.0689, "step": 442 }, { "epoch": 1.4297253634894993, "grad_norm": 0.07508452981710434, "learning_rate": 1.8908680947012403e-05, "loss": 0.069, "step": 443 }, { "epoch": 1.432956381260097, "grad_norm": 0.09134234488010406, "learning_rate": 1.8904171364148817e-05, "loss": 0.072, "step": 444 }, { "epoch": 1.4361873990306946, "grad_norm": 0.0830577090382576, "learning_rate": 1.889966178128523e-05, "loss": 0.0681, "step": 445 }, { "epoch": 1.4394184168012925, "grad_norm": 0.0741642490029335, "learning_rate": 1.8895152198421646e-05, "loss": 0.0642, "step": 446 }, { "epoch": 1.44264943457189, "grad_norm": 0.07305614650249481, "learning_rate": 1.889064261555806e-05, "loss": 0.0569, "step": 447 }, { "epoch": 1.445880452342488, "grad_norm": 0.05348379164934158, "learning_rate": 1.8886133032694478e-05, "loss": 0.0434, "step": 448 }, { "epoch": 1.4491114701130856, "grad_norm": 0.09780937433242798, "learning_rate": 1.8881623449830892e-05, "loss": 0.0856, "step": 449 }, { "epoch": 1.4523424878836835, "grad_norm": 0.081721231341362, "learning_rate": 1.8877113866967307e-05, "loss": 0.0655, "step": 450 }, { "epoch": 1.4523424878836835, "eval_loss": 0.0769171267747879, "eval_runtime": 188.2463, "eval_samples_per_second": 1.047, "eval_steps_per_second": 1.047, "step": 450 }, { "epoch": 1.4555735056542811, "grad_norm": 0.06182597577571869, "learning_rate": 1.887260428410372e-05, "loss": 0.0549, "step": 451 }, { "epoch": 1.4588045234248788, "grad_norm": 0.0831274464726448, "learning_rate": 1.8868094701240136e-05, "loss": 0.0664, "step": 452 }, { "epoch": 1.4620355411954766, "grad_norm": 0.07277555763721466, "learning_rate": 1.8863585118376553e-05, "loss": 0.0619, "step": 453 }, { "epoch": 1.4652665589660743, "grad_norm": 0.09069440513849258, "learning_rate": 1.8859075535512968e-05, "loss": 0.0717, "step": 454 }, { "epoch": 1.468497576736672, "grad_norm": 0.08567981421947479, "learning_rate": 1.8854565952649382e-05, "loss": 0.0761, "step": 455 }, { "epoch": 1.4717285945072698, "grad_norm": 0.08353572338819504, "learning_rate": 1.8850056369785796e-05, "loss": 0.0621, "step": 456 }, { "epoch": 1.4749596122778676, "grad_norm": 0.06493799388408661, "learning_rate": 1.884554678692221e-05, "loss": 0.0549, "step": 457 }, { "epoch": 1.4781906300484653, "grad_norm": 0.07239842414855957, "learning_rate": 1.8841037204058625e-05, "loss": 0.0574, "step": 458 }, { "epoch": 1.481421647819063, "grad_norm": 0.1062210276722908, "learning_rate": 1.8836527621195043e-05, "loss": 0.0831, "step": 459 }, { "epoch": 1.4846526655896608, "grad_norm": 0.06695660948753357, "learning_rate": 1.8832018038331457e-05, "loss": 0.0488, "step": 460 }, { "epoch": 1.4878836833602584, "grad_norm": 0.08332875370979309, "learning_rate": 1.8827508455467872e-05, "loss": 0.0752, "step": 461 }, { "epoch": 1.491114701130856, "grad_norm": 0.09285688400268555, "learning_rate": 1.8822998872604286e-05, "loss": 0.0811, "step": 462 }, { "epoch": 1.494345718901454, "grad_norm": 0.07672538608312607, "learning_rate": 1.88184892897407e-05, "loss": 0.0565, "step": 463 }, { "epoch": 1.4975767366720518, "grad_norm": 0.07295355945825577, "learning_rate": 1.8813979706877115e-05, "loss": 0.0615, "step": 464 }, { "epoch": 1.5008077544426495, "grad_norm": 0.05997586250305176, "learning_rate": 1.8809470124013533e-05, "loss": 0.0494, "step": 465 }, { "epoch": 1.504038772213247, "grad_norm": 0.08460883051156998, "learning_rate": 1.8804960541149947e-05, "loss": 0.0715, "step": 466 }, { "epoch": 1.507269789983845, "grad_norm": 0.08083106577396393, "learning_rate": 1.880045095828636e-05, "loss": 0.0615, "step": 467 }, { "epoch": 1.5105008077544426, "grad_norm": 0.09291260689496994, "learning_rate": 1.8795941375422776e-05, "loss": 0.0767, "step": 468 }, { "epoch": 1.5137318255250403, "grad_norm": 0.0817233994603157, "learning_rate": 1.879143179255919e-05, "loss": 0.0728, "step": 469 }, { "epoch": 1.5169628432956381, "grad_norm": 0.07894831895828247, "learning_rate": 1.8786922209695605e-05, "loss": 0.0697, "step": 470 }, { "epoch": 1.520193861066236, "grad_norm": 0.05180181935429573, "learning_rate": 1.878241262683202e-05, "loss": 0.0407, "step": 471 }, { "epoch": 1.5234248788368336, "grad_norm": 0.08214667439460754, "learning_rate": 1.8777903043968433e-05, "loss": 0.0673, "step": 472 }, { "epoch": 1.5266558966074313, "grad_norm": 0.06972946226596832, "learning_rate": 1.877339346110485e-05, "loss": 0.0501, "step": 473 }, { "epoch": 1.5298869143780292, "grad_norm": 0.08416459709405899, "learning_rate": 1.8768883878241266e-05, "loss": 0.0666, "step": 474 }, { "epoch": 1.5331179321486268, "grad_norm": 0.07642164081335068, "learning_rate": 1.876437429537768e-05, "loss": 0.0592, "step": 475 }, { "epoch": 1.5363489499192244, "grad_norm": 0.0762806385755539, "learning_rate": 1.8759864712514094e-05, "loss": 0.0573, "step": 476 }, { "epoch": 1.5395799676898223, "grad_norm": 0.06152572110295296, "learning_rate": 1.875535512965051e-05, "loss": 0.0509, "step": 477 }, { "epoch": 1.5428109854604202, "grad_norm": 0.08461987972259521, "learning_rate": 1.8750845546786923e-05, "loss": 0.0693, "step": 478 }, { "epoch": 1.5460420032310178, "grad_norm": 0.06401054561138153, "learning_rate": 1.8746335963923338e-05, "loss": 0.0523, "step": 479 }, { "epoch": 1.5492730210016155, "grad_norm": 0.07567861676216125, "learning_rate": 1.8741826381059752e-05, "loss": 0.0632, "step": 480 }, { "epoch": 1.5525040387722133, "grad_norm": 0.07169700413942337, "learning_rate": 1.8737316798196166e-05, "loss": 0.0541, "step": 481 }, { "epoch": 1.555735056542811, "grad_norm": 0.067410409450531, "learning_rate": 1.873280721533258e-05, "loss": 0.0576, "step": 482 }, { "epoch": 1.5589660743134086, "grad_norm": 0.0794718787074089, "learning_rate": 1.8728297632469e-05, "loss": 0.0602, "step": 483 }, { "epoch": 1.5621970920840065, "grad_norm": 0.09098870307207108, "learning_rate": 1.8723788049605413e-05, "loss": 0.0741, "step": 484 }, { "epoch": 1.5654281098546043, "grad_norm": 0.07266968488693237, "learning_rate": 1.8719278466741827e-05, "loss": 0.0535, "step": 485 }, { "epoch": 1.568659127625202, "grad_norm": 0.07994985580444336, "learning_rate": 1.8714768883878242e-05, "loss": 0.0642, "step": 486 }, { "epoch": 1.5718901453957996, "grad_norm": 0.09563203901052475, "learning_rate": 1.8710259301014656e-05, "loss": 0.0738, "step": 487 }, { "epoch": 1.5751211631663975, "grad_norm": 0.07337169349193573, "learning_rate": 1.870574971815107e-05, "loss": 0.0615, "step": 488 }, { "epoch": 1.5783521809369951, "grad_norm": 0.08605758100748062, "learning_rate": 1.8701240135287488e-05, "loss": 0.0737, "step": 489 }, { "epoch": 1.5815831987075928, "grad_norm": 0.08178628236055374, "learning_rate": 1.8696730552423903e-05, "loss": 0.0562, "step": 490 }, { "epoch": 1.5848142164781907, "grad_norm": 0.08635883033275604, "learning_rate": 1.8692220969560317e-05, "loss": 0.0655, "step": 491 }, { "epoch": 1.5880452342487885, "grad_norm": 0.10575321316719055, "learning_rate": 1.868771138669673e-05, "loss": 0.0857, "step": 492 }, { "epoch": 1.5912762520193862, "grad_norm": 0.10067257285118103, "learning_rate": 1.8683201803833146e-05, "loss": 0.0817, "step": 493 }, { "epoch": 1.5945072697899838, "grad_norm": 0.07644681632518768, "learning_rate": 1.867869222096956e-05, "loss": 0.0625, "step": 494 }, { "epoch": 1.5977382875605817, "grad_norm": 0.07164619863033295, "learning_rate": 1.8674182638105978e-05, "loss": 0.0597, "step": 495 }, { "epoch": 1.6009693053311793, "grad_norm": 0.07293085008859634, "learning_rate": 1.8669673055242392e-05, "loss": 0.0604, "step": 496 }, { "epoch": 1.604200323101777, "grad_norm": 0.09480689465999603, "learning_rate": 1.8665163472378807e-05, "loss": 0.0737, "step": 497 }, { "epoch": 1.6074313408723748, "grad_norm": 0.09798948466777802, "learning_rate": 1.866065388951522e-05, "loss": 0.0706, "step": 498 }, { "epoch": 1.6106623586429727, "grad_norm": 0.08216292411088943, "learning_rate": 1.8656144306651636e-05, "loss": 0.0585, "step": 499 }, { "epoch": 1.6138933764135701, "grad_norm": 0.10146701335906982, "learning_rate": 1.865163472378805e-05, "loss": 0.0631, "step": 500 }, { "epoch": 1.617124394184168, "grad_norm": 0.07699297368526459, "learning_rate": 1.8647125140924468e-05, "loss": 0.0592, "step": 501 }, { "epoch": 1.6203554119547658, "grad_norm": 0.07803017646074295, "learning_rate": 1.8642615558060882e-05, "loss": 0.063, "step": 502 }, { "epoch": 1.6235864297253635, "grad_norm": 0.08820293843746185, "learning_rate": 1.8638105975197297e-05, "loss": 0.0733, "step": 503 }, { "epoch": 1.6268174474959611, "grad_norm": 0.10102511942386627, "learning_rate": 1.863359639233371e-05, "loss": 0.0735, "step": 504 }, { "epoch": 1.630048465266559, "grad_norm": 0.08669153600931168, "learning_rate": 1.8629086809470125e-05, "loss": 0.0757, "step": 505 }, { "epoch": 1.6332794830371569, "grad_norm": 0.08120600879192352, "learning_rate": 1.862457722660654e-05, "loss": 0.0586, "step": 506 }, { "epoch": 1.6365105008077543, "grad_norm": 0.06960420310497284, "learning_rate": 1.8620067643742957e-05, "loss": 0.0519, "step": 507 }, { "epoch": 1.6397415185783522, "grad_norm": 0.08567452430725098, "learning_rate": 1.8615558060879372e-05, "loss": 0.0703, "step": 508 }, { "epoch": 1.64297253634895, "grad_norm": 0.08288481831550598, "learning_rate": 1.8611048478015786e-05, "loss": 0.0624, "step": 509 }, { "epoch": 1.6462035541195477, "grad_norm": 0.10185632109642029, "learning_rate": 1.86065388951522e-05, "loss": 0.072, "step": 510 }, { "epoch": 1.6494345718901453, "grad_norm": 0.0914456769824028, "learning_rate": 1.8602029312288615e-05, "loss": 0.0683, "step": 511 }, { "epoch": 1.6526655896607432, "grad_norm": 0.08204774558544159, "learning_rate": 1.859751972942503e-05, "loss": 0.0657, "step": 512 }, { "epoch": 1.655896607431341, "grad_norm": 0.11823786050081253, "learning_rate": 1.8593010146561444e-05, "loss": 0.0872, "step": 513 }, { "epoch": 1.6591276252019385, "grad_norm": 0.13115671277046204, "learning_rate": 1.858850056369786e-05, "loss": 0.1004, "step": 514 }, { "epoch": 1.6623586429725363, "grad_norm": 0.09443841129541397, "learning_rate": 1.8583990980834276e-05, "loss": 0.0722, "step": 515 }, { "epoch": 1.6655896607431342, "grad_norm": 0.06847009062767029, "learning_rate": 1.857948139797069e-05, "loss": 0.054, "step": 516 }, { "epoch": 1.6688206785137318, "grad_norm": 0.07960178703069687, "learning_rate": 1.8574971815107105e-05, "loss": 0.065, "step": 517 }, { "epoch": 1.6720516962843295, "grad_norm": 0.07255195826292038, "learning_rate": 1.857046223224352e-05, "loss": 0.0523, "step": 518 }, { "epoch": 1.6752827140549273, "grad_norm": 0.08610787242650986, "learning_rate": 1.8565952649379934e-05, "loss": 0.0667, "step": 519 }, { "epoch": 1.678513731825525, "grad_norm": 0.09422770887613297, "learning_rate": 1.8561443066516348e-05, "loss": 0.0772, "step": 520 }, { "epoch": 1.6817447495961226, "grad_norm": 0.10716807097196579, "learning_rate": 1.8556933483652762e-05, "loss": 0.0825, "step": 521 }, { "epoch": 1.6849757673667205, "grad_norm": 0.09894333779811859, "learning_rate": 1.8552423900789177e-05, "loss": 0.084, "step": 522 }, { "epoch": 1.6882067851373184, "grad_norm": 0.08127731829881668, "learning_rate": 1.854791431792559e-05, "loss": 0.063, "step": 523 }, { "epoch": 1.691437802907916, "grad_norm": 0.08127739280462265, "learning_rate": 1.854340473506201e-05, "loss": 0.0602, "step": 524 }, { "epoch": 1.6946688206785137, "grad_norm": 0.08109954744577408, "learning_rate": 1.8538895152198423e-05, "loss": 0.0618, "step": 525 }, { "epoch": 1.6946688206785137, "eval_loss": 0.07574764639139175, "eval_runtime": 188.1431, "eval_samples_per_second": 1.047, "eval_steps_per_second": 1.047, "step": 525 }, { "epoch": 1.6978998384491115, "grad_norm": 0.07915576547384262, "learning_rate": 1.8534385569334838e-05, "loss": 0.0566, "step": 526 }, { "epoch": 1.7011308562197092, "grad_norm": 0.09259936213493347, "learning_rate": 1.8529875986471252e-05, "loss": 0.0821, "step": 527 }, { "epoch": 1.7043618739903068, "grad_norm": 0.06958405673503876, "learning_rate": 1.8525366403607666e-05, "loss": 0.0513, "step": 528 }, { "epoch": 1.7075928917609047, "grad_norm": 0.11260278522968292, "learning_rate": 1.852085682074408e-05, "loss": 0.0791, "step": 529 }, { "epoch": 1.7108239095315025, "grad_norm": 0.08565714955329895, "learning_rate": 1.8516347237880495e-05, "loss": 0.0704, "step": 530 }, { "epoch": 1.7140549273021002, "grad_norm": 0.0977453961968422, "learning_rate": 1.8511837655016913e-05, "loss": 0.0666, "step": 531 }, { "epoch": 1.7172859450726978, "grad_norm": 0.09589142352342606, "learning_rate": 1.8507328072153327e-05, "loss": 0.0678, "step": 532 }, { "epoch": 1.7205169628432957, "grad_norm": 0.10372763872146606, "learning_rate": 1.8502818489289742e-05, "loss": 0.0755, "step": 533 }, { "epoch": 1.7237479806138933, "grad_norm": 0.09707041829824448, "learning_rate": 1.8498308906426156e-05, "loss": 0.0759, "step": 534 }, { "epoch": 1.726978998384491, "grad_norm": 0.07280156016349792, "learning_rate": 1.849379932356257e-05, "loss": 0.058, "step": 535 }, { "epoch": 1.7302100161550888, "grad_norm": 0.08810850977897644, "learning_rate": 1.848928974069899e-05, "loss": 0.0691, "step": 536 }, { "epoch": 1.7334410339256867, "grad_norm": 0.09844056516885757, "learning_rate": 1.8484780157835403e-05, "loss": 0.0682, "step": 537 }, { "epoch": 1.7366720516962844, "grad_norm": 0.06963982433080673, "learning_rate": 1.8480270574971817e-05, "loss": 0.0513, "step": 538 }, { "epoch": 1.739903069466882, "grad_norm": 0.08248520642518997, "learning_rate": 1.847576099210823e-05, "loss": 0.0611, "step": 539 }, { "epoch": 1.7431340872374799, "grad_norm": 0.09553173929452896, "learning_rate": 1.8471251409244646e-05, "loss": 0.0754, "step": 540 }, { "epoch": 1.7463651050080775, "grad_norm": 0.06919584423303604, "learning_rate": 1.846674182638106e-05, "loss": 0.0508, "step": 541 }, { "epoch": 1.7495961227786752, "grad_norm": 0.07004183530807495, "learning_rate": 1.8462232243517478e-05, "loss": 0.0498, "step": 542 }, { "epoch": 1.752827140549273, "grad_norm": 0.08570928126573563, "learning_rate": 1.8457722660653892e-05, "loss": 0.0631, "step": 543 }, { "epoch": 1.7560581583198709, "grad_norm": 0.0732467994093895, "learning_rate": 1.8453213077790307e-05, "loss": 0.0557, "step": 544 }, { "epoch": 1.7592891760904685, "grad_norm": 0.07687011361122131, "learning_rate": 1.844870349492672e-05, "loss": 0.0573, "step": 545 }, { "epoch": 1.7625201938610662, "grad_norm": 0.07186026871204376, "learning_rate": 1.8444193912063136e-05, "loss": 0.0502, "step": 546 }, { "epoch": 1.765751211631664, "grad_norm": 0.07176259905099869, "learning_rate": 1.843968432919955e-05, "loss": 0.0529, "step": 547 }, { "epoch": 1.7689822294022617, "grad_norm": 0.0842595249414444, "learning_rate": 1.8435174746335968e-05, "loss": 0.0674, "step": 548 }, { "epoch": 1.7722132471728593, "grad_norm": 0.07965710759162903, "learning_rate": 1.8430665163472382e-05, "loss": 0.0619, "step": 549 }, { "epoch": 1.7754442649434572, "grad_norm": 0.08953316509723663, "learning_rate": 1.8426155580608797e-05, "loss": 0.0643, "step": 550 }, { "epoch": 1.778675282714055, "grad_norm": 0.0928904190659523, "learning_rate": 1.842164599774521e-05, "loss": 0.0712, "step": 551 }, { "epoch": 1.7819063004846527, "grad_norm": 0.08743231743574142, "learning_rate": 1.8417136414881625e-05, "loss": 0.0657, "step": 552 }, { "epoch": 1.7851373182552503, "grad_norm": 0.07706678658723831, "learning_rate": 1.841262683201804e-05, "loss": 0.0568, "step": 553 }, { "epoch": 1.7883683360258482, "grad_norm": 0.0831725150346756, "learning_rate": 1.8408117249154454e-05, "loss": 0.0578, "step": 554 }, { "epoch": 1.7915993537964459, "grad_norm": 0.09395398199558258, "learning_rate": 1.8403607666290872e-05, "loss": 0.0715, "step": 555 }, { "epoch": 1.7948303715670435, "grad_norm": 0.0942830964922905, "learning_rate": 1.8399098083427286e-05, "loss": 0.0634, "step": 556 }, { "epoch": 1.7980613893376414, "grad_norm": 0.0980205312371254, "learning_rate": 1.83945885005637e-05, "loss": 0.0694, "step": 557 }, { "epoch": 1.8012924071082392, "grad_norm": 0.10699216276407242, "learning_rate": 1.8390078917700115e-05, "loss": 0.0871, "step": 558 }, { "epoch": 1.8045234248788369, "grad_norm": 0.09851755946874619, "learning_rate": 1.838556933483653e-05, "loss": 0.0644, "step": 559 }, { "epoch": 1.8077544426494345, "grad_norm": 0.09926044940948486, "learning_rate": 1.8381059751972944e-05, "loss": 0.0629, "step": 560 }, { "epoch": 1.8109854604200324, "grad_norm": 0.09520839154720306, "learning_rate": 1.8376550169109358e-05, "loss": 0.0607, "step": 561 }, { "epoch": 1.81421647819063, "grad_norm": 0.06896607577800751, "learning_rate": 1.8372040586245773e-05, "loss": 0.0559, "step": 562 }, { "epoch": 1.8174474959612277, "grad_norm": 0.09539300203323364, "learning_rate": 1.8367531003382187e-05, "loss": 0.0683, "step": 563 }, { "epoch": 1.8206785137318255, "grad_norm": 0.10596197098493576, "learning_rate": 1.83630214205186e-05, "loss": 0.075, "step": 564 }, { "epoch": 1.8239095315024234, "grad_norm": 0.13212427496910095, "learning_rate": 1.835851183765502e-05, "loss": 0.0755, "step": 565 }, { "epoch": 1.827140549273021, "grad_norm": 0.12438125163316727, "learning_rate": 1.8354002254791434e-05, "loss": 0.0853, "step": 566 }, { "epoch": 1.8303715670436187, "grad_norm": 0.06944366544485092, "learning_rate": 1.8349492671927848e-05, "loss": 0.0434, "step": 567 }, { "epoch": 1.8336025848142166, "grad_norm": 0.10360438376665115, "learning_rate": 1.8344983089064262e-05, "loss": 0.0778, "step": 568 }, { "epoch": 1.8368336025848142, "grad_norm": 0.1002860888838768, "learning_rate": 1.8340473506200677e-05, "loss": 0.0733, "step": 569 }, { "epoch": 1.8400646203554119, "grad_norm": 0.10875017940998077, "learning_rate": 1.833596392333709e-05, "loss": 0.0781, "step": 570 }, { "epoch": 1.8432956381260097, "grad_norm": 0.08669572323560715, "learning_rate": 1.8331454340473506e-05, "loss": 0.0587, "step": 571 }, { "epoch": 1.8465266558966076, "grad_norm": 0.09304548799991608, "learning_rate": 1.8326944757609923e-05, "loss": 0.0675, "step": 572 }, { "epoch": 1.849757673667205, "grad_norm": 0.09815046936273575, "learning_rate": 1.8322435174746338e-05, "loss": 0.0752, "step": 573 }, { "epoch": 1.8529886914378029, "grad_norm": 0.08040884137153625, "learning_rate": 1.8317925591882752e-05, "loss": 0.06, "step": 574 }, { "epoch": 1.8562197092084007, "grad_norm": 0.08228793740272522, "learning_rate": 1.8313416009019166e-05, "loss": 0.0547, "step": 575 }, { "epoch": 1.8594507269789984, "grad_norm": 0.07945281267166138, "learning_rate": 1.830890642615558e-05, "loss": 0.0467, "step": 576 }, { "epoch": 1.862681744749596, "grad_norm": 0.07659505307674408, "learning_rate": 1.8304396843291995e-05, "loss": 0.0571, "step": 577 }, { "epoch": 1.865912762520194, "grad_norm": 0.07296533137559891, "learning_rate": 1.8299887260428413e-05, "loss": 0.0567, "step": 578 }, { "epoch": 1.8691437802907918, "grad_norm": 0.10132135450839996, "learning_rate": 1.8295377677564827e-05, "loss": 0.0589, "step": 579 }, { "epoch": 1.8723747980613892, "grad_norm": 0.0985584482550621, "learning_rate": 1.8290868094701242e-05, "loss": 0.0707, "step": 580 }, { "epoch": 1.875605815831987, "grad_norm": 0.09575635194778442, "learning_rate": 1.8286358511837656e-05, "loss": 0.0687, "step": 581 }, { "epoch": 1.878836833602585, "grad_norm": 0.10488908737897873, "learning_rate": 1.828184892897407e-05, "loss": 0.0752, "step": 582 }, { "epoch": 1.8820678513731826, "grad_norm": 0.0739881619811058, "learning_rate": 1.8277339346110485e-05, "loss": 0.0537, "step": 583 }, { "epoch": 1.8852988691437802, "grad_norm": 0.06086435914039612, "learning_rate": 1.8272829763246903e-05, "loss": 0.051, "step": 584 }, { "epoch": 1.888529886914378, "grad_norm": 0.09947849065065384, "learning_rate": 1.8268320180383317e-05, "loss": 0.0705, "step": 585 }, { "epoch": 1.891760904684976, "grad_norm": 0.09509633481502533, "learning_rate": 1.826381059751973e-05, "loss": 0.0665, "step": 586 }, { "epoch": 1.8949919224555734, "grad_norm": 0.12639068067073822, "learning_rate": 1.8259301014656146e-05, "loss": 0.0538, "step": 587 }, { "epoch": 1.8982229402261712, "grad_norm": 0.09957147389650345, "learning_rate": 1.825479143179256e-05, "loss": 0.0649, "step": 588 }, { "epoch": 1.901453957996769, "grad_norm": 0.10096530616283417, "learning_rate": 1.8250281848928975e-05, "loss": 0.0607, "step": 589 }, { "epoch": 1.9046849757673667, "grad_norm": 0.10449621081352234, "learning_rate": 1.8245772266065392e-05, "loss": 0.0742, "step": 590 }, { "epoch": 1.9079159935379644, "grad_norm": 0.09589482843875885, "learning_rate": 1.8241262683201807e-05, "loss": 0.069, "step": 591 }, { "epoch": 1.9111470113085622, "grad_norm": 0.10444579273462296, "learning_rate": 1.823675310033822e-05, "loss": 0.0726, "step": 592 }, { "epoch": 1.9143780290791599, "grad_norm": 0.09007811546325684, "learning_rate": 1.8232243517474636e-05, "loss": 0.0668, "step": 593 }, { "epoch": 1.9176090468497575, "grad_norm": 0.093568354845047, "learning_rate": 1.822773393461105e-05, "loss": 0.0658, "step": 594 }, { "epoch": 1.9208400646203554, "grad_norm": 0.0825546383857727, "learning_rate": 1.8223224351747464e-05, "loss": 0.0586, "step": 595 }, { "epoch": 1.9240710823909533, "grad_norm": 0.08507188409566879, "learning_rate": 1.8218714768883882e-05, "loss": 0.0617, "step": 596 }, { "epoch": 1.927302100161551, "grad_norm": 0.0858079269528389, "learning_rate": 1.8214205186020297e-05, "loss": 0.0601, "step": 597 }, { "epoch": 1.9305331179321485, "grad_norm": 0.1205410435795784, "learning_rate": 1.820969560315671e-05, "loss": 0.0883, "step": 598 }, { "epoch": 1.9337641357027464, "grad_norm": 0.1289929449558258, "learning_rate": 1.8205186020293125e-05, "loss": 0.076, "step": 599 }, { "epoch": 1.936995153473344, "grad_norm": 0.11139614135026932, "learning_rate": 1.820067643742954e-05, "loss": 0.0846, "step": 600 }, { "epoch": 1.936995153473344, "eval_loss": 0.07442964613437653, "eval_runtime": 188.1343, "eval_samples_per_second": 1.047, "eval_steps_per_second": 1.047, "step": 600 }, { "epoch": 1.9402261712439417, "grad_norm": 0.0923788920044899, "learning_rate": 1.8196166854565954e-05, "loss": 0.0605, "step": 601 }, { "epoch": 1.9434571890145396, "grad_norm": 0.1251497119665146, "learning_rate": 1.819165727170237e-05, "loss": 0.0894, "step": 602 }, { "epoch": 1.9466882067851374, "grad_norm": 0.11357556283473969, "learning_rate": 1.8187147688838783e-05, "loss": 0.0819, "step": 603 }, { "epoch": 1.949919224555735, "grad_norm": 0.09567239135503769, "learning_rate": 1.8182638105975197e-05, "loss": 0.0624, "step": 604 }, { "epoch": 1.9531502423263327, "grad_norm": 0.09191922098398209, "learning_rate": 1.8178128523111612e-05, "loss": 0.0578, "step": 605 }, { "epoch": 1.9563812600969306, "grad_norm": 0.09503104537725449, "learning_rate": 1.817361894024803e-05, "loss": 0.0641, "step": 606 }, { "epoch": 1.9596122778675282, "grad_norm": 0.11036618053913116, "learning_rate": 1.8169109357384444e-05, "loss": 0.0666, "step": 607 }, { "epoch": 1.9628432956381259, "grad_norm": 0.09104762971401215, "learning_rate": 1.8164599774520858e-05, "loss": 0.0649, "step": 608 }, { "epoch": 1.9660743134087237, "grad_norm": 0.10882871598005295, "learning_rate": 1.8160090191657273e-05, "loss": 0.0824, "step": 609 }, { "epoch": 1.9693053311793216, "grad_norm": 0.09632111340761185, "learning_rate": 1.8155580608793687e-05, "loss": 0.069, "step": 610 }, { "epoch": 1.9725363489499192, "grad_norm": 0.07966237515211105, "learning_rate": 1.81510710259301e-05, "loss": 0.0557, "step": 611 }, { "epoch": 1.975767366720517, "grad_norm": 0.10335849225521088, "learning_rate": 1.8146561443066516e-05, "loss": 0.0787, "step": 612 }, { "epoch": 1.9789983844911148, "grad_norm": 0.12096443772315979, "learning_rate": 1.814205186020293e-05, "loss": 0.0731, "step": 613 }, { "epoch": 1.9822294022617124, "grad_norm": 0.09432677179574966, "learning_rate": 1.8137542277339348e-05, "loss": 0.0635, "step": 614 }, { "epoch": 1.98546042003231, "grad_norm": 0.11708611994981766, "learning_rate": 1.8133032694475762e-05, "loss": 0.0874, "step": 615 }, { "epoch": 1.988691437802908, "grad_norm": 0.1113506332039833, "learning_rate": 1.8128523111612177e-05, "loss": 0.0678, "step": 616 }, { "epoch": 1.9919224555735058, "grad_norm": 0.09246299415826797, "learning_rate": 1.812401352874859e-05, "loss": 0.0672, "step": 617 }, { "epoch": 1.9951534733441034, "grad_norm": 0.1115182563662529, "learning_rate": 1.8119503945885006e-05, "loss": 0.0758, "step": 618 }, { "epoch": 1.998384491114701, "grad_norm": 0.10478319972753525, "learning_rate": 1.811499436302142e-05, "loss": 0.0648, "step": 619 }, { "epoch": 2.0, "grad_norm": 0.13386856019496918, "learning_rate": 1.8110484780157838e-05, "loss": 0.0624, "step": 620 }, { "epoch": 2.003231017770598, "grad_norm": 0.09262198954820633, "learning_rate": 1.8105975197294252e-05, "loss": 0.0608, "step": 621 }, { "epoch": 2.0064620355411953, "grad_norm": 0.07091473788022995, "learning_rate": 1.8101465614430667e-05, "loss": 0.0434, "step": 622 }, { "epoch": 2.009693053311793, "grad_norm": 0.10324624925851822, "learning_rate": 1.809695603156708e-05, "loss": 0.0764, "step": 623 }, { "epoch": 2.012924071082391, "grad_norm": 0.10515467822551727, "learning_rate": 1.8092446448703495e-05, "loss": 0.0609, "step": 624 }, { "epoch": 2.016155088852989, "grad_norm": 0.10561127960681915, "learning_rate": 1.8087936865839913e-05, "loss": 0.0719, "step": 625 }, { "epoch": 2.0193861066235863, "grad_norm": 0.1146024838089943, "learning_rate": 1.8083427282976327e-05, "loss": 0.0726, "step": 626 }, { "epoch": 2.022617124394184, "grad_norm": 0.07930684089660645, "learning_rate": 1.8078917700112742e-05, "loss": 0.0481, "step": 627 }, { "epoch": 2.025848142164782, "grad_norm": 0.09927454590797424, "learning_rate": 1.8074408117249156e-05, "loss": 0.0608, "step": 628 }, { "epoch": 2.0290791599353795, "grad_norm": 0.08592136949300766, "learning_rate": 1.806989853438557e-05, "loss": 0.0577, "step": 629 }, { "epoch": 2.0323101777059773, "grad_norm": 0.09232696890830994, "learning_rate": 1.8065388951521985e-05, "loss": 0.0528, "step": 630 }, { "epoch": 2.035541195476575, "grad_norm": 0.08930118381977081, "learning_rate": 1.8060879368658403e-05, "loss": 0.0633, "step": 631 }, { "epoch": 2.038772213247173, "grad_norm": 0.09835111349821091, "learning_rate": 1.8056369785794817e-05, "loss": 0.0648, "step": 632 }, { "epoch": 2.0420032310177705, "grad_norm": 0.10789217799901962, "learning_rate": 1.805186020293123e-05, "loss": 0.0695, "step": 633 }, { "epoch": 2.0452342487883683, "grad_norm": 0.10506349802017212, "learning_rate": 1.8047350620067646e-05, "loss": 0.0635, "step": 634 }, { "epoch": 2.048465266558966, "grad_norm": 0.13068322837352753, "learning_rate": 1.804284103720406e-05, "loss": 0.0802, "step": 635 }, { "epoch": 2.0516962843295636, "grad_norm": 0.09663469344377518, "learning_rate": 1.8038331454340475e-05, "loss": 0.062, "step": 636 }, { "epoch": 2.0549273021001615, "grad_norm": 0.07054325938224792, "learning_rate": 1.8033821871476893e-05, "loss": 0.0453, "step": 637 }, { "epoch": 2.0581583198707594, "grad_norm": 0.07739470899105072, "learning_rate": 1.8029312288613307e-05, "loss": 0.0463, "step": 638 }, { "epoch": 2.0613893376413572, "grad_norm": 0.09022580832242966, "learning_rate": 1.802480270574972e-05, "loss": 0.0587, "step": 639 }, { "epoch": 2.0646203554119547, "grad_norm": 0.09953221678733826, "learning_rate": 1.8020293122886136e-05, "loss": 0.0585, "step": 640 }, { "epoch": 2.0678513731825525, "grad_norm": 0.10069511830806732, "learning_rate": 1.801578354002255e-05, "loss": 0.0587, "step": 641 }, { "epoch": 2.0710823909531504, "grad_norm": 0.09177737683057785, "learning_rate": 1.8011273957158964e-05, "loss": 0.0523, "step": 642 }, { "epoch": 2.074313408723748, "grad_norm": 0.1010020524263382, "learning_rate": 1.800676437429538e-05, "loss": 0.0552, "step": 643 }, { "epoch": 2.0775444264943457, "grad_norm": 0.0997423455119133, "learning_rate": 1.8002254791431793e-05, "loss": 0.0662, "step": 644 }, { "epoch": 2.0807754442649435, "grad_norm": 0.09295801818370819, "learning_rate": 1.7997745208568208e-05, "loss": 0.0599, "step": 645 }, { "epoch": 2.0840064620355414, "grad_norm": 0.1053297147154808, "learning_rate": 1.7993235625704622e-05, "loss": 0.064, "step": 646 }, { "epoch": 2.087237479806139, "grad_norm": 0.11978495121002197, "learning_rate": 1.798872604284104e-05, "loss": 0.0727, "step": 647 }, { "epoch": 2.0904684975767367, "grad_norm": 0.07878235727548599, "learning_rate": 1.7984216459977454e-05, "loss": 0.0486, "step": 648 }, { "epoch": 2.0936995153473346, "grad_norm": 0.14993903040885925, "learning_rate": 1.797970687711387e-05, "loss": 0.0458, "step": 649 }, { "epoch": 2.096930533117932, "grad_norm": 0.0925765186548233, "learning_rate": 1.7975197294250283e-05, "loss": 0.0546, "step": 650 }, { "epoch": 2.10016155088853, "grad_norm": 0.09530377388000488, "learning_rate": 1.7970687711386697e-05, "loss": 0.0514, "step": 651 }, { "epoch": 2.1033925686591277, "grad_norm": 0.0945788025856018, "learning_rate": 1.7966178128523112e-05, "loss": 0.0604, "step": 652 }, { "epoch": 2.106623586429725, "grad_norm": 0.11486334353685379, "learning_rate": 1.7961668545659526e-05, "loss": 0.0633, "step": 653 }, { "epoch": 2.109854604200323, "grad_norm": 0.1077791377902031, "learning_rate": 1.795715896279594e-05, "loss": 0.0602, "step": 654 }, { "epoch": 2.113085621970921, "grad_norm": 0.10789015889167786, "learning_rate": 1.795264937993236e-05, "loss": 0.0541, "step": 655 }, { "epoch": 2.1163166397415187, "grad_norm": 0.10327862948179245, "learning_rate": 1.7948139797068773e-05, "loss": 0.0652, "step": 656 }, { "epoch": 2.119547657512116, "grad_norm": 0.10486488789319992, "learning_rate": 1.7943630214205187e-05, "loss": 0.0617, "step": 657 }, { "epoch": 2.122778675282714, "grad_norm": 0.0882355272769928, "learning_rate": 1.79391206313416e-05, "loss": 0.0553, "step": 658 }, { "epoch": 2.126009693053312, "grad_norm": 0.08177275210618973, "learning_rate": 1.7934611048478016e-05, "loss": 0.0522, "step": 659 }, { "epoch": 2.1292407108239093, "grad_norm": 0.1455976665019989, "learning_rate": 1.793010146561443e-05, "loss": 0.0909, "step": 660 }, { "epoch": 2.132471728594507, "grad_norm": 0.1145886555314064, "learning_rate": 1.7925591882750848e-05, "loss": 0.0733, "step": 661 }, { "epoch": 2.135702746365105, "grad_norm": 0.1092807874083519, "learning_rate": 1.7921082299887262e-05, "loss": 0.0581, "step": 662 }, { "epoch": 2.138933764135703, "grad_norm": 0.07647505402565002, "learning_rate": 1.7916572717023677e-05, "loss": 0.0461, "step": 663 }, { "epoch": 2.1421647819063003, "grad_norm": 0.09198980778455734, "learning_rate": 1.791206313416009e-05, "loss": 0.0549, "step": 664 }, { "epoch": 2.145395799676898, "grad_norm": 0.10971511900424957, "learning_rate": 1.7907553551296506e-05, "loss": 0.0643, "step": 665 }, { "epoch": 2.148626817447496, "grad_norm": 0.11374619603157043, "learning_rate": 1.790304396843292e-05, "loss": 0.0754, "step": 666 }, { "epoch": 2.1518578352180935, "grad_norm": 0.09252484142780304, "learning_rate": 1.7898534385569338e-05, "loss": 0.0502, "step": 667 }, { "epoch": 2.1550888529886914, "grad_norm": 0.09586004912853241, "learning_rate": 1.7894024802705752e-05, "loss": 0.0529, "step": 668 }, { "epoch": 2.158319870759289, "grad_norm": 0.10206209868192673, "learning_rate": 1.7889515219842167e-05, "loss": 0.0605, "step": 669 }, { "epoch": 2.161550888529887, "grad_norm": 0.15015992522239685, "learning_rate": 1.788500563697858e-05, "loss": 0.0926, "step": 670 }, { "epoch": 2.1647819063004845, "grad_norm": 0.10588102042675018, "learning_rate": 1.7880496054114995e-05, "loss": 0.0623, "step": 671 }, { "epoch": 2.1680129240710824, "grad_norm": 0.09418896585702896, "learning_rate": 1.787598647125141e-05, "loss": 0.0564, "step": 672 }, { "epoch": 2.1712439418416802, "grad_norm": 0.08213125914335251, "learning_rate": 1.7871476888387828e-05, "loss": 0.0464, "step": 673 }, { "epoch": 2.1744749596122777, "grad_norm": 0.09321248531341553, "learning_rate": 1.7866967305524242e-05, "loss": 0.0531, "step": 674 }, { "epoch": 2.1777059773828755, "grad_norm": 0.10642002522945404, "learning_rate": 1.7862457722660656e-05, "loss": 0.0669, "step": 675 }, { "epoch": 2.1777059773828755, "eval_loss": 0.0747215747833252, "eval_runtime": 188.1708, "eval_samples_per_second": 1.047, "eval_steps_per_second": 1.047, "step": 675 }, { "epoch": 2.1809369951534734, "grad_norm": 0.12946507334709167, "learning_rate": 1.785794813979707e-05, "loss": 0.072, "step": 676 }, { "epoch": 2.1841680129240713, "grad_norm": 0.10074260830879211, "learning_rate": 1.7853438556933485e-05, "loss": 0.0521, "step": 677 }, { "epoch": 2.1873990306946687, "grad_norm": 0.12798738479614258, "learning_rate": 1.78489289740699e-05, "loss": 0.0758, "step": 678 }, { "epoch": 2.1906300484652665, "grad_norm": 0.10193175077438354, "learning_rate": 1.7844419391206317e-05, "loss": 0.0632, "step": 679 }, { "epoch": 2.1938610662358644, "grad_norm": 0.12570485472679138, "learning_rate": 1.783990980834273e-05, "loss": 0.0724, "step": 680 }, { "epoch": 2.197092084006462, "grad_norm": 0.10074017196893692, "learning_rate": 1.7835400225479146e-05, "loss": 0.0563, "step": 681 }, { "epoch": 2.2003231017770597, "grad_norm": 0.08727949112653732, "learning_rate": 1.783089064261556e-05, "loss": 0.0524, "step": 682 }, { "epoch": 2.2035541195476576, "grad_norm": 0.11030570417642593, "learning_rate": 1.7826381059751975e-05, "loss": 0.0668, "step": 683 }, { "epoch": 2.2067851373182554, "grad_norm": 0.10606499761343002, "learning_rate": 1.782187147688839e-05, "loss": 0.0606, "step": 684 }, { "epoch": 2.210016155088853, "grad_norm": 0.11735937744379044, "learning_rate": 1.7817361894024804e-05, "loss": 0.0648, "step": 685 }, { "epoch": 2.2132471728594507, "grad_norm": 0.106626495718956, "learning_rate": 1.7812852311161218e-05, "loss": 0.0649, "step": 686 }, { "epoch": 2.2164781906300486, "grad_norm": 0.12231657654047012, "learning_rate": 1.7808342728297632e-05, "loss": 0.0702, "step": 687 }, { "epoch": 2.219709208400646, "grad_norm": 0.08800094574689865, "learning_rate": 1.780383314543405e-05, "loss": 0.0515, "step": 688 }, { "epoch": 2.222940226171244, "grad_norm": 0.08806774020195007, "learning_rate": 1.7799323562570465e-05, "loss": 0.0493, "step": 689 }, { "epoch": 2.2261712439418417, "grad_norm": 0.10804681479930878, "learning_rate": 1.779481397970688e-05, "loss": 0.0604, "step": 690 }, { "epoch": 2.2294022617124396, "grad_norm": 0.11405564099550247, "learning_rate": 1.7790304396843293e-05, "loss": 0.0597, "step": 691 }, { "epoch": 2.232633279483037, "grad_norm": 0.11010053753852844, "learning_rate": 1.7785794813979708e-05, "loss": 0.0634, "step": 692 }, { "epoch": 2.235864297253635, "grad_norm": 0.10657312721014023, "learning_rate": 1.7781285231116122e-05, "loss": 0.0539, "step": 693 }, { "epoch": 2.2390953150242328, "grad_norm": 0.08584710210561752, "learning_rate": 1.7776775648252536e-05, "loss": 0.0571, "step": 694 }, { "epoch": 2.24232633279483, "grad_norm": 0.10155533254146576, "learning_rate": 1.777226606538895e-05, "loss": 0.0597, "step": 695 }, { "epoch": 2.245557350565428, "grad_norm": 0.11395770311355591, "learning_rate": 1.7767756482525365e-05, "loss": 0.0675, "step": 696 }, { "epoch": 2.248788368336026, "grad_norm": 0.11109079420566559, "learning_rate": 1.7763246899661783e-05, "loss": 0.062, "step": 697 }, { "epoch": 2.2520193861066238, "grad_norm": 0.13479241728782654, "learning_rate": 1.7758737316798197e-05, "loss": 0.0782, "step": 698 }, { "epoch": 2.255250403877221, "grad_norm": 0.12003345042467117, "learning_rate": 1.7754227733934612e-05, "loss": 0.0683, "step": 699 }, { "epoch": 2.258481421647819, "grad_norm": 0.13395312428474426, "learning_rate": 1.7749718151071026e-05, "loss": 0.0764, "step": 700 }, { "epoch": 2.261712439418417, "grad_norm": 0.10561169683933258, "learning_rate": 1.774520856820744e-05, "loss": 0.0552, "step": 701 }, { "epoch": 2.2649434571890144, "grad_norm": 0.1412249207496643, "learning_rate": 1.7740698985343855e-05, "loss": 0.0812, "step": 702 }, { "epoch": 2.268174474959612, "grad_norm": 0.11307451128959656, "learning_rate": 1.7736189402480273e-05, "loss": 0.0671, "step": 703 }, { "epoch": 2.27140549273021, "grad_norm": 0.10989584773778915, "learning_rate": 1.7731679819616687e-05, "loss": 0.059, "step": 704 }, { "epoch": 2.274636510500808, "grad_norm": 0.0964912474155426, "learning_rate": 1.77271702367531e-05, "loss": 0.0515, "step": 705 }, { "epoch": 2.2778675282714054, "grad_norm": 0.09640849381685257, "learning_rate": 1.7722660653889516e-05, "loss": 0.0548, "step": 706 }, { "epoch": 2.2810985460420032, "grad_norm": 0.08393755555152893, "learning_rate": 1.771815107102593e-05, "loss": 0.0477, "step": 707 }, { "epoch": 2.284329563812601, "grad_norm": 0.08865144103765488, "learning_rate": 1.7713641488162348e-05, "loss": 0.0527, "step": 708 }, { "epoch": 2.2875605815831985, "grad_norm": 0.10840681195259094, "learning_rate": 1.7709131905298762e-05, "loss": 0.061, "step": 709 }, { "epoch": 2.2907915993537964, "grad_norm": 0.1336364448070526, "learning_rate": 1.7704622322435177e-05, "loss": 0.0655, "step": 710 }, { "epoch": 2.2940226171243943, "grad_norm": 0.10070191323757172, "learning_rate": 1.770011273957159e-05, "loss": 0.0599, "step": 711 }, { "epoch": 2.297253634894992, "grad_norm": 0.12378398329019547, "learning_rate": 1.7695603156708006e-05, "loss": 0.0712, "step": 712 }, { "epoch": 2.3004846526655895, "grad_norm": 0.08809908479452133, "learning_rate": 1.769109357384442e-05, "loss": 0.0419, "step": 713 }, { "epoch": 2.3037156704361874, "grad_norm": 0.10536797344684601, "learning_rate": 1.7686583990980838e-05, "loss": 0.062, "step": 714 }, { "epoch": 2.3069466882067853, "grad_norm": 0.09373629838228226, "learning_rate": 1.7682074408117252e-05, "loss": 0.0455, "step": 715 }, { "epoch": 2.3101777059773827, "grad_norm": 0.08422086387872696, "learning_rate": 1.7677564825253667e-05, "loss": 0.0481, "step": 716 }, { "epoch": 2.3134087237479806, "grad_norm": 0.12226711213588715, "learning_rate": 1.767305524239008e-05, "loss": 0.0713, "step": 717 }, { "epoch": 2.3166397415185784, "grad_norm": 0.11593876034021378, "learning_rate": 1.7668545659526495e-05, "loss": 0.065, "step": 718 }, { "epoch": 2.3198707592891763, "grad_norm": 0.10055369138717651, "learning_rate": 1.766403607666291e-05, "loss": 0.0515, "step": 719 }, { "epoch": 2.3231017770597737, "grad_norm": 0.1200050637125969, "learning_rate": 1.7659526493799328e-05, "loss": 0.0664, "step": 720 }, { "epoch": 2.3263327948303716, "grad_norm": 0.10001233220100403, "learning_rate": 1.7655016910935742e-05, "loss": 0.0578, "step": 721 }, { "epoch": 2.3295638126009695, "grad_norm": 0.08621415495872498, "learning_rate": 1.7650507328072156e-05, "loss": 0.0426, "step": 722 }, { "epoch": 2.332794830371567, "grad_norm": 0.08662088960409164, "learning_rate": 1.764599774520857e-05, "loss": 0.048, "step": 723 }, { "epoch": 2.3360258481421647, "grad_norm": 0.09761569648981094, "learning_rate": 1.7641488162344985e-05, "loss": 0.0585, "step": 724 }, { "epoch": 2.3392568659127626, "grad_norm": 0.1272287666797638, "learning_rate": 1.76369785794814e-05, "loss": 0.0722, "step": 725 }, { "epoch": 2.3424878836833605, "grad_norm": 0.0827430784702301, "learning_rate": 1.7632468996617814e-05, "loss": 0.0517, "step": 726 }, { "epoch": 2.345718901453958, "grad_norm": 0.08261015266180038, "learning_rate": 1.7627959413754228e-05, "loss": 0.0464, "step": 727 }, { "epoch": 2.3489499192245558, "grad_norm": 0.10019004344940186, "learning_rate": 1.7623449830890643e-05, "loss": 0.0539, "step": 728 }, { "epoch": 2.3521809369951536, "grad_norm": 0.11860109120607376, "learning_rate": 1.761894024802706e-05, "loss": 0.0604, "step": 729 }, { "epoch": 2.355411954765751, "grad_norm": 0.13498210906982422, "learning_rate": 1.7614430665163475e-05, "loss": 0.0497, "step": 730 }, { "epoch": 2.358642972536349, "grad_norm": 0.09867555648088455, "learning_rate": 1.760992108229989e-05, "loss": 0.0572, "step": 731 }, { "epoch": 2.361873990306947, "grad_norm": 0.10520780086517334, "learning_rate": 1.7605411499436304e-05, "loss": 0.0613, "step": 732 }, { "epoch": 2.3651050080775446, "grad_norm": 0.1396726369857788, "learning_rate": 1.7600901916572718e-05, "loss": 0.0808, "step": 733 }, { "epoch": 2.368336025848142, "grad_norm": 0.09852424263954163, "learning_rate": 1.7596392333709132e-05, "loss": 0.0602, "step": 734 }, { "epoch": 2.37156704361874, "grad_norm": 0.08897744864225388, "learning_rate": 1.7591882750845547e-05, "loss": 0.0463, "step": 735 }, { "epoch": 2.374798061389338, "grad_norm": 0.12664619088172913, "learning_rate": 1.758737316798196e-05, "loss": 0.0731, "step": 736 }, { "epoch": 2.378029079159935, "grad_norm": 0.0975411906838417, "learning_rate": 1.7582863585118376e-05, "loss": 0.0541, "step": 737 }, { "epoch": 2.381260096930533, "grad_norm": 0.10056427121162415, "learning_rate": 1.7578354002254793e-05, "loss": 0.0564, "step": 738 }, { "epoch": 2.384491114701131, "grad_norm": 0.0751773938536644, "learning_rate": 1.7573844419391208e-05, "loss": 0.043, "step": 739 }, { "epoch": 2.387722132471729, "grad_norm": 0.12571515142917633, "learning_rate": 1.7569334836527622e-05, "loss": 0.0715, "step": 740 }, { "epoch": 2.3909531502423262, "grad_norm": 0.09152042865753174, "learning_rate": 1.7564825253664037e-05, "loss": 0.0513, "step": 741 }, { "epoch": 2.394184168012924, "grad_norm": 0.16221173107624054, "learning_rate": 1.756031567080045e-05, "loss": 0.1003, "step": 742 }, { "epoch": 2.397415185783522, "grad_norm": 0.09910274296998978, "learning_rate": 1.7555806087936865e-05, "loss": 0.0543, "step": 743 }, { "epoch": 2.4006462035541194, "grad_norm": 0.10756971687078476, "learning_rate": 1.7551296505073283e-05, "loss": 0.0573, "step": 744 }, { "epoch": 2.4038772213247173, "grad_norm": 0.08702822029590607, "learning_rate": 1.7546786922209697e-05, "loss": 0.047, "step": 745 }, { "epoch": 2.407108239095315, "grad_norm": 0.14440016448497772, "learning_rate": 1.7542277339346112e-05, "loss": 0.0685, "step": 746 }, { "epoch": 2.410339256865913, "grad_norm": 0.09141986817121506, "learning_rate": 1.7537767756482526e-05, "loss": 0.0521, "step": 747 }, { "epoch": 2.4135702746365104, "grad_norm": 0.12515199184417725, "learning_rate": 1.753325817361894e-05, "loss": 0.0605, "step": 748 }, { "epoch": 2.4168012924071083, "grad_norm": 0.12448819726705551, "learning_rate": 1.7528748590755355e-05, "loss": 0.0723, "step": 749 }, { "epoch": 2.420032310177706, "grad_norm": 0.13118943572044373, "learning_rate": 1.7524239007891773e-05, "loss": 0.0654, "step": 750 }, { "epoch": 2.420032310177706, "eval_loss": 0.07308099418878555, "eval_runtime": 188.3306, "eval_samples_per_second": 1.046, "eval_steps_per_second": 1.046, "step": 750 }, { "epoch": 2.4232633279483036, "grad_norm": 0.1215345561504364, "learning_rate": 1.7519729425028187e-05, "loss": 0.0644, "step": 751 }, { "epoch": 2.4264943457189014, "grad_norm": 0.12855304777622223, "learning_rate": 1.75152198421646e-05, "loss": 0.0666, "step": 752 }, { "epoch": 2.4297253634894993, "grad_norm": 0.11538267880678177, "learning_rate": 1.7510710259301016e-05, "loss": 0.0545, "step": 753 }, { "epoch": 2.432956381260097, "grad_norm": 0.10273373872041702, "learning_rate": 1.750620067643743e-05, "loss": 0.0594, "step": 754 }, { "epoch": 2.4361873990306946, "grad_norm": 0.10953179746866226, "learning_rate": 1.7501691093573845e-05, "loss": 0.0587, "step": 755 }, { "epoch": 2.4394184168012925, "grad_norm": 0.09215240180492401, "learning_rate": 1.7497181510710263e-05, "loss": 0.0501, "step": 756 }, { "epoch": 2.4426494345718903, "grad_norm": 0.11669941246509552, "learning_rate": 1.7492671927846677e-05, "loss": 0.0585, "step": 757 }, { "epoch": 2.4458804523424877, "grad_norm": 0.11698901653289795, "learning_rate": 1.748816234498309e-05, "loss": 0.0551, "step": 758 }, { "epoch": 2.4491114701130856, "grad_norm": 0.1258348822593689, "learning_rate": 1.7483652762119506e-05, "loss": 0.0629, "step": 759 }, { "epoch": 2.4523424878836835, "grad_norm": 0.12607377767562866, "learning_rate": 1.747914317925592e-05, "loss": 0.0728, "step": 760 }, { "epoch": 2.4555735056542813, "grad_norm": 0.0982760339975357, "learning_rate": 1.7474633596392334e-05, "loss": 0.051, "step": 761 }, { "epoch": 2.4588045234248788, "grad_norm": 0.15601739287376404, "learning_rate": 1.7470124013528752e-05, "loss": 0.076, "step": 762 }, { "epoch": 2.4620355411954766, "grad_norm": 0.13090789318084717, "learning_rate": 1.7465614430665167e-05, "loss": 0.0739, "step": 763 }, { "epoch": 2.4652665589660745, "grad_norm": 0.10627159476280212, "learning_rate": 1.746110484780158e-05, "loss": 0.0509, "step": 764 }, { "epoch": 2.468497576736672, "grad_norm": 0.07108946144580841, "learning_rate": 1.7456595264937995e-05, "loss": 0.039, "step": 765 }, { "epoch": 2.47172859450727, "grad_norm": 0.14733023941516876, "learning_rate": 1.745208568207441e-05, "loss": 0.0788, "step": 766 }, { "epoch": 2.4749596122778676, "grad_norm": 0.10821715742349625, "learning_rate": 1.7447576099210824e-05, "loss": 0.0524, "step": 767 }, { "epoch": 2.4781906300484655, "grad_norm": 0.11846361309289932, "learning_rate": 1.744306651634724e-05, "loss": 0.0579, "step": 768 }, { "epoch": 2.481421647819063, "grad_norm": 0.10738200694322586, "learning_rate": 1.7438556933483653e-05, "loss": 0.0589, "step": 769 }, { "epoch": 2.484652665589661, "grad_norm": 0.1159975603222847, "learning_rate": 1.743404735062007e-05, "loss": 0.0565, "step": 770 }, { "epoch": 2.4878836833602587, "grad_norm": 0.10056610405445099, "learning_rate": 1.7429537767756485e-05, "loss": 0.0538, "step": 771 }, { "epoch": 2.491114701130856, "grad_norm": 0.10329104959964752, "learning_rate": 1.74250281848929e-05, "loss": 0.0566, "step": 772 }, { "epoch": 2.494345718901454, "grad_norm": 0.1422542929649353, "learning_rate": 1.7420518602029314e-05, "loss": 0.0747, "step": 773 }, { "epoch": 2.497576736672052, "grad_norm": 0.12898680567741394, "learning_rate": 1.741600901916573e-05, "loss": 0.0735, "step": 774 }, { "epoch": 2.5008077544426497, "grad_norm": 0.13066206872463226, "learning_rate": 1.7411499436302143e-05, "loss": 0.062, "step": 775 }, { "epoch": 2.504038772213247, "grad_norm": 0.12379497289657593, "learning_rate": 1.7406989853438557e-05, "loss": 0.0631, "step": 776 }, { "epoch": 2.507269789983845, "grad_norm": 0.1296347826719284, "learning_rate": 1.740248027057497e-05, "loss": 0.0634, "step": 777 }, { "epoch": 2.5105008077544424, "grad_norm": 0.10818596184253693, "learning_rate": 1.7397970687711386e-05, "loss": 0.0606, "step": 778 }, { "epoch": 2.5137318255250403, "grad_norm": 0.12639783322811127, "learning_rate": 1.73934611048478e-05, "loss": 0.0459, "step": 779 }, { "epoch": 2.516962843295638, "grad_norm": 0.1167321428656578, "learning_rate": 1.7388951521984218e-05, "loss": 0.0667, "step": 780 }, { "epoch": 2.520193861066236, "grad_norm": 0.10153870284557343, "learning_rate": 1.7384441939120632e-05, "loss": 0.0589, "step": 781 }, { "epoch": 2.523424878836834, "grad_norm": 0.09986142814159393, "learning_rate": 1.7379932356257047e-05, "loss": 0.045, "step": 782 }, { "epoch": 2.5266558966074313, "grad_norm": 0.10662157833576202, "learning_rate": 1.737542277339346e-05, "loss": 0.0586, "step": 783 }, { "epoch": 2.529886914378029, "grad_norm": 0.11709077656269073, "learning_rate": 1.7370913190529876e-05, "loss": 0.0613, "step": 784 }, { "epoch": 2.5331179321486266, "grad_norm": 0.13120310008525848, "learning_rate": 1.736640360766629e-05, "loss": 0.0664, "step": 785 }, { "epoch": 2.5363489499192244, "grad_norm": 0.13849826157093048, "learning_rate": 1.7361894024802708e-05, "loss": 0.0673, "step": 786 }, { "epoch": 2.5395799676898223, "grad_norm": 0.08833606541156769, "learning_rate": 1.7357384441939122e-05, "loss": 0.0459, "step": 787 }, { "epoch": 2.54281098546042, "grad_norm": 0.09421700984239578, "learning_rate": 1.7352874859075537e-05, "loss": 0.0481, "step": 788 }, { "epoch": 2.546042003231018, "grad_norm": 0.1201411634683609, "learning_rate": 1.734836527621195e-05, "loss": 0.0608, "step": 789 }, { "epoch": 2.5492730210016155, "grad_norm": 0.09896653145551682, "learning_rate": 1.7343855693348365e-05, "loss": 0.0465, "step": 790 }, { "epoch": 2.5525040387722133, "grad_norm": 0.12088964134454727, "learning_rate": 1.7339346110484783e-05, "loss": 0.0614, "step": 791 }, { "epoch": 2.5557350565428107, "grad_norm": 0.11183801293373108, "learning_rate": 1.7334836527621198e-05, "loss": 0.0545, "step": 792 }, { "epoch": 2.5589660743134086, "grad_norm": 0.11126703768968582, "learning_rate": 1.7330326944757612e-05, "loss": 0.0509, "step": 793 }, { "epoch": 2.5621970920840065, "grad_norm": 0.1374976634979248, "learning_rate": 1.7325817361894026e-05, "loss": 0.0664, "step": 794 }, { "epoch": 2.5654281098546043, "grad_norm": 0.16783633828163147, "learning_rate": 1.732130777903044e-05, "loss": 0.0868, "step": 795 }, { "epoch": 2.568659127625202, "grad_norm": 0.11534145474433899, "learning_rate": 1.7316798196166855e-05, "loss": 0.053, "step": 796 }, { "epoch": 2.5718901453957996, "grad_norm": 0.13769778609275818, "learning_rate": 1.7312288613303273e-05, "loss": 0.0756, "step": 797 }, { "epoch": 2.5751211631663975, "grad_norm": 0.09577422589063644, "learning_rate": 1.7307779030439687e-05, "loss": 0.0441, "step": 798 }, { "epoch": 2.578352180936995, "grad_norm": 0.11375096440315247, "learning_rate": 1.73032694475761e-05, "loss": 0.0524, "step": 799 }, { "epoch": 2.581583198707593, "grad_norm": 0.11465324461460114, "learning_rate": 1.7298759864712516e-05, "loss": 0.0526, "step": 800 }, { "epoch": 2.5848142164781907, "grad_norm": 0.11597500741481781, "learning_rate": 1.729425028184893e-05, "loss": 0.0591, "step": 801 }, { "epoch": 2.5880452342487885, "grad_norm": 0.09817709028720856, "learning_rate": 1.7289740698985345e-05, "loss": 0.0503, "step": 802 }, { "epoch": 2.5912762520193864, "grad_norm": 0.10352802276611328, "learning_rate": 1.7285231116121763e-05, "loss": 0.0461, "step": 803 }, { "epoch": 2.594507269789984, "grad_norm": 0.12035888433456421, "learning_rate": 1.7280721533258177e-05, "loss": 0.0576, "step": 804 }, { "epoch": 2.5977382875605817, "grad_norm": 0.12561960518360138, "learning_rate": 1.727621195039459e-05, "loss": 0.0599, "step": 805 }, { "epoch": 2.600969305331179, "grad_norm": 0.11348681896924973, "learning_rate": 1.7271702367531006e-05, "loss": 0.0503, "step": 806 }, { "epoch": 2.604200323101777, "grad_norm": 0.09772437810897827, "learning_rate": 1.726719278466742e-05, "loss": 0.0471, "step": 807 }, { "epoch": 2.607431340872375, "grad_norm": 0.10316241532564163, "learning_rate": 1.7262683201803835e-05, "loss": 0.0514, "step": 808 }, { "epoch": 2.6106623586429727, "grad_norm": 0.11204390227794647, "learning_rate": 1.725817361894025e-05, "loss": 0.0578, "step": 809 }, { "epoch": 2.61389337641357, "grad_norm": 0.10899617522954941, "learning_rate": 1.7253664036076663e-05, "loss": 0.0495, "step": 810 }, { "epoch": 2.617124394184168, "grad_norm": 0.1386026293039322, "learning_rate": 1.724915445321308e-05, "loss": 0.0542, "step": 811 }, { "epoch": 2.620355411954766, "grad_norm": 0.13927608728408813, "learning_rate": 1.7244644870349495e-05, "loss": 0.0628, "step": 812 }, { "epoch": 2.6235864297253633, "grad_norm": 0.099461629986763, "learning_rate": 1.724013528748591e-05, "loss": 0.0501, "step": 813 }, { "epoch": 2.626817447495961, "grad_norm": 0.09142296761274338, "learning_rate": 1.7235625704622324e-05, "loss": 0.0475, "step": 814 }, { "epoch": 2.630048465266559, "grad_norm": 0.12531687319278717, "learning_rate": 1.723111612175874e-05, "loss": 0.059, "step": 815 }, { "epoch": 2.633279483037157, "grad_norm": 0.1252615749835968, "learning_rate": 1.7226606538895153e-05, "loss": 0.0589, "step": 816 }, { "epoch": 2.6365105008077543, "grad_norm": 0.12725740671157837, "learning_rate": 1.7222096956031567e-05, "loss": 0.0522, "step": 817 }, { "epoch": 2.639741518578352, "grad_norm": 0.12746059894561768, "learning_rate": 1.7217587373167982e-05, "loss": 0.0661, "step": 818 }, { "epoch": 2.64297253634895, "grad_norm": 0.2133682370185852, "learning_rate": 1.7213077790304396e-05, "loss": 0.0639, "step": 819 }, { "epoch": 2.6462035541195474, "grad_norm": 0.11452341079711914, "learning_rate": 1.720856820744081e-05, "loss": 0.0512, "step": 820 }, { "epoch": 2.6494345718901453, "grad_norm": 0.12344635277986526, "learning_rate": 1.720405862457723e-05, "loss": 0.0503, "step": 821 }, { "epoch": 2.652665589660743, "grad_norm": 0.12654437124729156, "learning_rate": 1.7199549041713643e-05, "loss": 0.0543, "step": 822 }, { "epoch": 2.655896607431341, "grad_norm": 0.12805619835853577, "learning_rate": 1.7195039458850057e-05, "loss": 0.0646, "step": 823 }, { "epoch": 2.6591276252019385, "grad_norm": 0.11218256503343582, "learning_rate": 1.719052987598647e-05, "loss": 0.0575, "step": 824 }, { "epoch": 2.6623586429725363, "grad_norm": 0.12950399518013, "learning_rate": 1.7186020293122886e-05, "loss": 0.0673, "step": 825 }, { "epoch": 2.6623586429725363, "eval_loss": 0.07266557961702347, "eval_runtime": 187.9396, "eval_samples_per_second": 1.048, "eval_steps_per_second": 1.048, "step": 825 }, { "epoch": 2.665589660743134, "grad_norm": 0.11642561107873917, "learning_rate": 1.71815107102593e-05, "loss": 0.0664, "step": 826 }, { "epoch": 2.6688206785137316, "grad_norm": 0.09707733243703842, "learning_rate": 1.7177001127395718e-05, "loss": 0.0503, "step": 827 }, { "epoch": 2.6720516962843295, "grad_norm": 0.07535319775342941, "learning_rate": 1.7172491544532133e-05, "loss": 0.038, "step": 828 }, { "epoch": 2.6752827140549273, "grad_norm": 0.12683290243148804, "learning_rate": 1.7167981961668547e-05, "loss": 0.0614, "step": 829 }, { "epoch": 2.678513731825525, "grad_norm": 0.1531742513179779, "learning_rate": 1.716347237880496e-05, "loss": 0.069, "step": 830 }, { "epoch": 2.6817447495961226, "grad_norm": 0.13030219078063965, "learning_rate": 1.7158962795941376e-05, "loss": 0.0593, "step": 831 }, { "epoch": 2.6849757673667205, "grad_norm": 0.13288383185863495, "learning_rate": 1.715445321307779e-05, "loss": 0.0607, "step": 832 }, { "epoch": 2.6882067851373184, "grad_norm": 0.1245107427239418, "learning_rate": 1.7149943630214208e-05, "loss": 0.059, "step": 833 }, { "epoch": 2.691437802907916, "grad_norm": 0.10677826404571533, "learning_rate": 1.7145434047350622e-05, "loss": 0.0532, "step": 834 }, { "epoch": 2.6946688206785137, "grad_norm": 0.11118808388710022, "learning_rate": 1.7140924464487037e-05, "loss": 0.055, "step": 835 }, { "epoch": 2.6978998384491115, "grad_norm": 0.11494432389736176, "learning_rate": 1.713641488162345e-05, "loss": 0.0556, "step": 836 }, { "epoch": 2.7011308562197094, "grad_norm": 0.14139457046985626, "learning_rate": 1.7131905298759865e-05, "loss": 0.0687, "step": 837 }, { "epoch": 2.704361873990307, "grad_norm": 0.12973995506763458, "learning_rate": 1.712739571589628e-05, "loss": 0.0639, "step": 838 }, { "epoch": 2.7075928917609047, "grad_norm": 0.12217195332050323, "learning_rate": 1.7122886133032698e-05, "loss": 0.0636, "step": 839 }, { "epoch": 2.7108239095315025, "grad_norm": 0.08900095522403717, "learning_rate": 1.7118376550169112e-05, "loss": 0.0491, "step": 840 }, { "epoch": 2.7140549273021, "grad_norm": 0.12005368620157242, "learning_rate": 1.7113866967305526e-05, "loss": 0.0605, "step": 841 }, { "epoch": 2.717285945072698, "grad_norm": 0.1201101690530777, "learning_rate": 1.710935738444194e-05, "loss": 0.0597, "step": 842 }, { "epoch": 2.7205169628432957, "grad_norm": 0.12422793358564377, "learning_rate": 1.7104847801578355e-05, "loss": 0.0604, "step": 843 }, { "epoch": 2.7237479806138936, "grad_norm": 0.11504203826189041, "learning_rate": 1.710033821871477e-05, "loss": 0.0567, "step": 844 }, { "epoch": 2.726978998384491, "grad_norm": 0.13158410787582397, "learning_rate": 1.7095828635851187e-05, "loss": 0.0751, "step": 845 }, { "epoch": 2.730210016155089, "grad_norm": 0.13026569783687592, "learning_rate": 1.70913190529876e-05, "loss": 0.057, "step": 846 }, { "epoch": 2.7334410339256867, "grad_norm": 0.5227922201156616, "learning_rate": 1.7086809470124016e-05, "loss": 0.0567, "step": 847 }, { "epoch": 2.736672051696284, "grad_norm": 0.08213207870721817, "learning_rate": 1.708229988726043e-05, "loss": 0.0385, "step": 848 }, { "epoch": 2.739903069466882, "grad_norm": 0.14717501401901245, "learning_rate": 1.7077790304396845e-05, "loss": 0.0746, "step": 849 }, { "epoch": 2.74313408723748, "grad_norm": 0.11484480649232864, "learning_rate": 1.707328072153326e-05, "loss": 0.0513, "step": 850 }, { "epoch": 2.7463651050080777, "grad_norm": 0.13454794883728027, "learning_rate": 1.7068771138669674e-05, "loss": 0.0668, "step": 851 }, { "epoch": 2.749596122778675, "grad_norm": 0.16599783301353455, "learning_rate": 1.706426155580609e-05, "loss": 0.0659, "step": 852 }, { "epoch": 2.752827140549273, "grad_norm": 0.11365890502929688, "learning_rate": 1.7059751972942506e-05, "loss": 0.0591, "step": 853 }, { "epoch": 2.756058158319871, "grad_norm": 0.12964101135730743, "learning_rate": 1.705524239007892e-05, "loss": 0.0598, "step": 854 }, { "epoch": 2.7592891760904683, "grad_norm": 0.10415180772542953, "learning_rate": 1.7050732807215335e-05, "loss": 0.0499, "step": 855 }, { "epoch": 2.762520193861066, "grad_norm": 0.1433461755514145, "learning_rate": 1.704622322435175e-05, "loss": 0.0668, "step": 856 }, { "epoch": 2.765751211631664, "grad_norm": 0.12921610474586487, "learning_rate": 1.7041713641488163e-05, "loss": 0.0525, "step": 857 }, { "epoch": 2.768982229402262, "grad_norm": 0.11878325045108795, "learning_rate": 1.7037204058624578e-05, "loss": 0.06, "step": 858 }, { "epoch": 2.7722132471728593, "grad_norm": 0.09403812140226364, "learning_rate": 1.7032694475760992e-05, "loss": 0.0472, "step": 859 }, { "epoch": 2.775444264943457, "grad_norm": 0.09613174945116043, "learning_rate": 1.7028184892897407e-05, "loss": 0.0508, "step": 860 }, { "epoch": 2.778675282714055, "grad_norm": 0.11891157180070877, "learning_rate": 1.702367531003382e-05, "loss": 0.0589, "step": 861 }, { "epoch": 2.7819063004846525, "grad_norm": 0.1563875377178192, "learning_rate": 1.701916572717024e-05, "loss": 0.0794, "step": 862 }, { "epoch": 2.7851373182552503, "grad_norm": 0.13382849097251892, "learning_rate": 1.7014656144306653e-05, "loss": 0.0683, "step": 863 }, { "epoch": 2.788368336025848, "grad_norm": 0.15156099200248718, "learning_rate": 1.7010146561443067e-05, "loss": 0.0694, "step": 864 }, { "epoch": 2.791599353796446, "grad_norm": 0.12621727585792542, "learning_rate": 1.7005636978579482e-05, "loss": 0.0567, "step": 865 }, { "epoch": 2.7948303715670435, "grad_norm": 0.15388263761997223, "learning_rate": 1.7001127395715896e-05, "loss": 0.0738, "step": 866 }, { "epoch": 2.7980613893376414, "grad_norm": 0.1349688321352005, "learning_rate": 1.699661781285231e-05, "loss": 0.0546, "step": 867 }, { "epoch": 2.8012924071082392, "grad_norm": 0.11894084513187408, "learning_rate": 1.6992108229988725e-05, "loss": 0.0538, "step": 868 }, { "epoch": 2.8045234248788367, "grad_norm": 0.21414369344711304, "learning_rate": 1.6987598647125143e-05, "loss": 0.0511, "step": 869 }, { "epoch": 2.8077544426494345, "grad_norm": 0.1107967421412468, "learning_rate": 1.6983089064261557e-05, "loss": 0.0551, "step": 870 }, { "epoch": 2.8109854604200324, "grad_norm": 0.16710782051086426, "learning_rate": 1.697857948139797e-05, "loss": 0.0805, "step": 871 }, { "epoch": 2.8142164781906303, "grad_norm": 0.12345987558364868, "learning_rate": 1.6974069898534386e-05, "loss": 0.0576, "step": 872 }, { "epoch": 2.8174474959612277, "grad_norm": 0.11037538200616837, "learning_rate": 1.69695603156708e-05, "loss": 0.0486, "step": 873 }, { "epoch": 2.8206785137318255, "grad_norm": 0.10175740718841553, "learning_rate": 1.6965050732807218e-05, "loss": 0.0529, "step": 874 }, { "epoch": 2.8239095315024234, "grad_norm": 0.1126103326678276, "learning_rate": 1.6960541149943633e-05, "loss": 0.0546, "step": 875 }, { "epoch": 2.827140549273021, "grad_norm": 0.09911254793405533, "learning_rate": 1.6956031567080047e-05, "loss": 0.0455, "step": 876 }, { "epoch": 2.8303715670436187, "grad_norm": 0.1377602368593216, "learning_rate": 1.695152198421646e-05, "loss": 0.0599, "step": 877 }, { "epoch": 2.8336025848142166, "grad_norm": 0.11292906850576401, "learning_rate": 1.6947012401352876e-05, "loss": 0.053, "step": 878 }, { "epoch": 2.8368336025848144, "grad_norm": 0.13102898001670837, "learning_rate": 1.694250281848929e-05, "loss": 0.0629, "step": 879 }, { "epoch": 2.840064620355412, "grad_norm": 0.12573762238025665, "learning_rate": 1.6937993235625708e-05, "loss": 0.052, "step": 880 }, { "epoch": 2.8432956381260097, "grad_norm": 0.11562048643827438, "learning_rate": 1.6933483652762122e-05, "loss": 0.0594, "step": 881 }, { "epoch": 2.8465266558966076, "grad_norm": 0.15344659984111786, "learning_rate": 1.6928974069898537e-05, "loss": 0.0636, "step": 882 }, { "epoch": 2.849757673667205, "grad_norm": 0.11969706416130066, "learning_rate": 1.692446448703495e-05, "loss": 0.0549, "step": 883 }, { "epoch": 2.852988691437803, "grad_norm": 0.0930706337094307, "learning_rate": 1.6919954904171365e-05, "loss": 0.0434, "step": 884 }, { "epoch": 2.8562197092084007, "grad_norm": 0.1458914428949356, "learning_rate": 1.691544532130778e-05, "loss": 0.0707, "step": 885 }, { "epoch": 2.8594507269789986, "grad_norm": 0.11928731948137283, "learning_rate": 1.6910935738444198e-05, "loss": 0.0584, "step": 886 }, { "epoch": 2.862681744749596, "grad_norm": 0.1225530207157135, "learning_rate": 1.6906426155580612e-05, "loss": 0.0619, "step": 887 }, { "epoch": 2.865912762520194, "grad_norm": 0.13734394311904907, "learning_rate": 1.6901916572717026e-05, "loss": 0.065, "step": 888 }, { "epoch": 2.8691437802907918, "grad_norm": 0.13331110775470734, "learning_rate": 1.689740698985344e-05, "loss": 0.0652, "step": 889 }, { "epoch": 2.872374798061389, "grad_norm": 0.1149471327662468, "learning_rate": 1.6892897406989855e-05, "loss": 0.0495, "step": 890 }, { "epoch": 2.875605815831987, "grad_norm": 0.14575156569480896, "learning_rate": 1.688838782412627e-05, "loss": 0.069, "step": 891 }, { "epoch": 2.878836833602585, "grad_norm": 0.1205376535654068, "learning_rate": 1.6883878241262684e-05, "loss": 0.0595, "step": 892 }, { "epoch": 2.8820678513731828, "grad_norm": 0.13029593229293823, "learning_rate": 1.6879368658399102e-05, "loss": 0.0652, "step": 893 }, { "epoch": 2.88529886914378, "grad_norm": 0.1242680773139, "learning_rate": 1.6874859075535516e-05, "loss": 0.0628, "step": 894 }, { "epoch": 2.888529886914378, "grad_norm": 0.1066925972700119, "learning_rate": 1.687034949267193e-05, "loss": 0.0571, "step": 895 }, { "epoch": 2.891760904684976, "grad_norm": 0.09622512012720108, "learning_rate": 1.6865839909808345e-05, "loss": 0.0447, "step": 896 }, { "epoch": 2.8949919224555734, "grad_norm": 0.14432470500469208, "learning_rate": 1.686133032694476e-05, "loss": 0.0658, "step": 897 }, { "epoch": 2.898222940226171, "grad_norm": 0.1262982338666916, "learning_rate": 1.6856820744081174e-05, "loss": 0.057, "step": 898 }, { "epoch": 2.901453957996769, "grad_norm": 0.12278001755475998, "learning_rate": 1.6852311161217588e-05, "loss": 0.0533, "step": 899 }, { "epoch": 2.904684975767367, "grad_norm": 0.13526810705661774, "learning_rate": 1.6847801578354002e-05, "loss": 0.0536, "step": 900 }, { "epoch": 2.904684975767367, "eval_loss": 0.07217078655958176, "eval_runtime": 188.4791, "eval_samples_per_second": 1.045, "eval_steps_per_second": 1.045, "step": 900 }, { "epoch": 2.9079159935379644, "grad_norm": 0.07958260923624039, "learning_rate": 1.6843291995490417e-05, "loss": 0.0395, "step": 901 }, { "epoch": 2.9111470113085622, "grad_norm": 0.10193248093128204, "learning_rate": 1.683878241262683e-05, "loss": 0.0495, "step": 902 }, { "epoch": 2.9143780290791597, "grad_norm": 0.09985180199146271, "learning_rate": 1.683427282976325e-05, "loss": 0.0496, "step": 903 }, { "epoch": 2.9176090468497575, "grad_norm": 0.15160292387008667, "learning_rate": 1.6829763246899663e-05, "loss": 0.0668, "step": 904 }, { "epoch": 2.9208400646203554, "grad_norm": 0.13049964606761932, "learning_rate": 1.6825253664036078e-05, "loss": 0.067, "step": 905 }, { "epoch": 2.9240710823909533, "grad_norm": 0.13118034601211548, "learning_rate": 1.6820744081172492e-05, "loss": 0.06, "step": 906 }, { "epoch": 2.927302100161551, "grad_norm": 0.11038261651992798, "learning_rate": 1.6816234498308907e-05, "loss": 0.0542, "step": 907 }, { "epoch": 2.9305331179321485, "grad_norm": 0.11064022779464722, "learning_rate": 1.681172491544532e-05, "loss": 0.0535, "step": 908 }, { "epoch": 2.9337641357027464, "grad_norm": 0.10448973625898361, "learning_rate": 1.6807215332581735e-05, "loss": 0.0444, "step": 909 }, { "epoch": 2.936995153473344, "grad_norm": 0.09960347414016724, "learning_rate": 1.6802705749718153e-05, "loss": 0.0477, "step": 910 }, { "epoch": 2.9402261712439417, "grad_norm": 0.10175690799951553, "learning_rate": 1.6798196166854568e-05, "loss": 0.0521, "step": 911 }, { "epoch": 2.9434571890145396, "grad_norm": 0.11306945234537125, "learning_rate": 1.6793686583990982e-05, "loss": 0.0483, "step": 912 }, { "epoch": 2.9466882067851374, "grad_norm": 0.12629052996635437, "learning_rate": 1.6789177001127396e-05, "loss": 0.0594, "step": 913 }, { "epoch": 2.9499192245557353, "grad_norm": 0.16772274672985077, "learning_rate": 1.678466741826381e-05, "loss": 0.0808, "step": 914 }, { "epoch": 2.9531502423263327, "grad_norm": 0.14857324957847595, "learning_rate": 1.6780157835400225e-05, "loss": 0.0626, "step": 915 }, { "epoch": 2.9563812600969306, "grad_norm": 0.12077292054891586, "learning_rate": 1.6775648252536643e-05, "loss": 0.0547, "step": 916 }, { "epoch": 2.959612277867528, "grad_norm": 0.08560369163751602, "learning_rate": 1.6771138669673057e-05, "loss": 0.0402, "step": 917 }, { "epoch": 2.962843295638126, "grad_norm": 0.1328180432319641, "learning_rate": 1.676662908680947e-05, "loss": 0.0603, "step": 918 }, { "epoch": 2.9660743134087237, "grad_norm": 0.13140498101711273, "learning_rate": 1.6762119503945886e-05, "loss": 0.0541, "step": 919 }, { "epoch": 2.9693053311793216, "grad_norm": 0.13546602427959442, "learning_rate": 1.67576099210823e-05, "loss": 0.0644, "step": 920 }, { "epoch": 2.9725363489499195, "grad_norm": 0.13099107146263123, "learning_rate": 1.6753100338218715e-05, "loss": 0.0544, "step": 921 }, { "epoch": 2.975767366720517, "grad_norm": 0.12933450937271118, "learning_rate": 1.6748590755355133e-05, "loss": 0.0632, "step": 922 }, { "epoch": 2.9789983844911148, "grad_norm": 0.12769202888011932, "learning_rate": 1.6744081172491547e-05, "loss": 0.0591, "step": 923 }, { "epoch": 2.982229402261712, "grad_norm": 0.12964068353176117, "learning_rate": 1.673957158962796e-05, "loss": 0.0602, "step": 924 }, { "epoch": 2.98546042003231, "grad_norm": 0.1714252084493637, "learning_rate": 1.6735062006764376e-05, "loss": 0.076, "step": 925 }, { "epoch": 2.988691437802908, "grad_norm": 0.15382935106754303, "learning_rate": 1.673055242390079e-05, "loss": 0.0624, "step": 926 }, { "epoch": 2.991922455573506, "grad_norm": 0.15337888896465302, "learning_rate": 1.6726042841037205e-05, "loss": 0.0652, "step": 927 }, { "epoch": 2.9951534733441036, "grad_norm": 0.1587558090686798, "learning_rate": 1.6721533258173622e-05, "loss": 0.0754, "step": 928 }, { "epoch": 2.998384491114701, "grad_norm": 0.09836894273757935, "learning_rate": 1.6717023675310037e-05, "loss": 0.046, "step": 929 }, { "epoch": 3.0, "grad_norm": 0.20606525242328644, "learning_rate": 1.671251409244645e-05, "loss": 0.0692, "step": 930 }, { "epoch": 3.003231017770598, "grad_norm": 0.12933456897735596, "learning_rate": 1.6708004509582866e-05, "loss": 0.0601, "step": 931 }, { "epoch": 3.0064620355411953, "grad_norm": 0.15069305896759033, "learning_rate": 1.670349492671928e-05, "loss": 0.0691, "step": 932 }, { "epoch": 3.009693053311793, "grad_norm": 0.12518665194511414, "learning_rate": 1.6698985343855694e-05, "loss": 0.0594, "step": 933 }, { "epoch": 3.012924071082391, "grad_norm": 0.13509726524353027, "learning_rate": 1.6694475760992112e-05, "loss": 0.0553, "step": 934 }, { "epoch": 3.016155088852989, "grad_norm": 0.18207424879074097, "learning_rate": 1.6689966178128526e-05, "loss": 0.0691, "step": 935 }, { "epoch": 3.0193861066235863, "grad_norm": 0.10155311226844788, "learning_rate": 1.668545659526494e-05, "loss": 0.0442, "step": 936 }, { "epoch": 3.022617124394184, "grad_norm": 0.08462440967559814, "learning_rate": 1.6680947012401355e-05, "loss": 0.0397, "step": 937 }, { "epoch": 3.025848142164782, "grad_norm": 0.1074729785323143, "learning_rate": 1.667643742953777e-05, "loss": 0.0476, "step": 938 }, { "epoch": 3.0290791599353795, "grad_norm": 0.10128747671842575, "learning_rate": 1.6671927846674184e-05, "loss": 0.044, "step": 939 }, { "epoch": 3.0323101777059773, "grad_norm": 0.10703253746032715, "learning_rate": 1.66674182638106e-05, "loss": 0.0434, "step": 940 }, { "epoch": 3.035541195476575, "grad_norm": 0.16827581822872162, "learning_rate": 1.6662908680947013e-05, "loss": 0.058, "step": 941 }, { "epoch": 3.038772213247173, "grad_norm": 0.12423544377088547, "learning_rate": 1.6658399098083427e-05, "loss": 0.0529, "step": 942 }, { "epoch": 3.0420032310177705, "grad_norm": 0.11421461403369904, "learning_rate": 1.665388951521984e-05, "loss": 0.0462, "step": 943 }, { "epoch": 3.0452342487883683, "grad_norm": 0.1504746377468109, "learning_rate": 1.664937993235626e-05, "loss": 0.0607, "step": 944 }, { "epoch": 3.048465266558966, "grad_norm": 0.1171237900853157, "learning_rate": 1.6644870349492674e-05, "loss": 0.0488, "step": 945 }, { "epoch": 3.0516962843295636, "grad_norm": 0.12751275300979614, "learning_rate": 1.6640360766629088e-05, "loss": 0.0566, "step": 946 }, { "epoch": 3.0549273021001615, "grad_norm": 0.10137461870908737, "learning_rate": 1.6635851183765503e-05, "loss": 0.042, "step": 947 }, { "epoch": 3.0581583198707594, "grad_norm": 0.10805993527173996, "learning_rate": 1.6631341600901917e-05, "loss": 0.0436, "step": 948 }, { "epoch": 3.0613893376413572, "grad_norm": 0.15429779887199402, "learning_rate": 1.662683201803833e-05, "loss": 0.0611, "step": 949 }, { "epoch": 3.0646203554119547, "grad_norm": 0.15192106366157532, "learning_rate": 1.6622322435174746e-05, "loss": 0.0558, "step": 950 }, { "epoch": 3.0678513731825525, "grad_norm": 0.14291639626026154, "learning_rate": 1.661781285231116e-05, "loss": 0.0582, "step": 951 }, { "epoch": 3.0710823909531504, "grad_norm": 0.11516083776950836, "learning_rate": 1.6613303269447578e-05, "loss": 0.0455, "step": 952 }, { "epoch": 3.074313408723748, "grad_norm": 0.11716248095035553, "learning_rate": 1.6608793686583992e-05, "loss": 0.0419, "step": 953 }, { "epoch": 3.0775444264943457, "grad_norm": 0.13777975738048553, "learning_rate": 1.6604284103720407e-05, "loss": 0.0587, "step": 954 }, { "epoch": 3.0807754442649435, "grad_norm": 0.15481697022914886, "learning_rate": 1.659977452085682e-05, "loss": 0.058, "step": 955 }, { "epoch": 3.0840064620355414, "grad_norm": 0.11290151625871658, "learning_rate": 1.6595264937993235e-05, "loss": 0.0428, "step": 956 }, { "epoch": 3.087237479806139, "grad_norm": 0.11138515174388885, "learning_rate": 1.6590755355129653e-05, "loss": 0.0445, "step": 957 }, { "epoch": 3.0904684975767367, "grad_norm": 0.13892598450183868, "learning_rate": 1.6586245772266068e-05, "loss": 0.0532, "step": 958 }, { "epoch": 3.0936995153473346, "grad_norm": 0.14099125564098358, "learning_rate": 1.6581736189402482e-05, "loss": 0.0532, "step": 959 }, { "epoch": 3.096930533117932, "grad_norm": 0.1620667278766632, "learning_rate": 1.6577226606538896e-05, "loss": 0.0716, "step": 960 }, { "epoch": 3.10016155088853, "grad_norm": 0.1435079723596573, "learning_rate": 1.657271702367531e-05, "loss": 0.0587, "step": 961 }, { "epoch": 3.1033925686591277, "grad_norm": 0.1412099003791809, "learning_rate": 1.6568207440811725e-05, "loss": 0.0599, "step": 962 }, { "epoch": 3.106623586429725, "grad_norm": 0.16996391117572784, "learning_rate": 1.6563697857948143e-05, "loss": 0.0577, "step": 963 }, { "epoch": 3.109854604200323, "grad_norm": 0.14544463157653809, "learning_rate": 1.6559188275084557e-05, "loss": 0.0595, "step": 964 }, { "epoch": 3.113085621970921, "grad_norm": 0.12646666169166565, "learning_rate": 1.6554678692220972e-05, "loss": 0.0496, "step": 965 }, { "epoch": 3.1163166397415187, "grad_norm": 0.16260091960430145, "learning_rate": 1.6550169109357386e-05, "loss": 0.0588, "step": 966 }, { "epoch": 3.119547657512116, "grad_norm": 0.14531579613685608, "learning_rate": 1.65456595264938e-05, "loss": 0.0654, "step": 967 }, { "epoch": 3.122778675282714, "grad_norm": 0.13838277757167816, "learning_rate": 1.6541149943630215e-05, "loss": 0.058, "step": 968 }, { "epoch": 3.126009693053312, "grad_norm": 0.10179346054792404, "learning_rate": 1.6536640360766633e-05, "loss": 0.0394, "step": 969 }, { "epoch": 3.1292407108239093, "grad_norm": 0.14759835600852966, "learning_rate": 1.6532130777903047e-05, "loss": 0.0616, "step": 970 }, { "epoch": 3.132471728594507, "grad_norm": 0.12317320704460144, "learning_rate": 1.652762119503946e-05, "loss": 0.0457, "step": 971 }, { "epoch": 3.135702746365105, "grad_norm": 0.14770029485225677, "learning_rate": 1.6523111612175876e-05, "loss": 0.0606, "step": 972 }, { "epoch": 3.138933764135703, "grad_norm": 0.14644749462604523, "learning_rate": 1.651860202931229e-05, "loss": 0.0576, "step": 973 }, { "epoch": 3.1421647819063003, "grad_norm": 0.15745016932487488, "learning_rate": 1.6514092446448705e-05, "loss": 0.0658, "step": 974 }, { "epoch": 3.145395799676898, "grad_norm": 0.15281431376934052, "learning_rate": 1.6509582863585122e-05, "loss": 0.0579, "step": 975 }, { "epoch": 3.145395799676898, "eval_loss": 0.07240297645330429, "eval_runtime": 188.3995, "eval_samples_per_second": 1.046, "eval_steps_per_second": 1.046, "step": 975 }, { "epoch": 3.148626817447496, "grad_norm": 0.186857670545578, "learning_rate": 1.6505073280721537e-05, "loss": 0.0712, "step": 976 }, { "epoch": 3.1518578352180935, "grad_norm": 0.11668923497200012, "learning_rate": 1.650056369785795e-05, "loss": 0.0432, "step": 977 }, { "epoch": 3.1550888529886914, "grad_norm": 0.1078757792711258, "learning_rate": 1.6496054114994366e-05, "loss": 0.0438, "step": 978 }, { "epoch": 3.158319870759289, "grad_norm": 0.10889827460050583, "learning_rate": 1.649154453213078e-05, "loss": 0.0404, "step": 979 }, { "epoch": 3.161550888529887, "grad_norm": 0.11770477145910263, "learning_rate": 1.6487034949267194e-05, "loss": 0.0379, "step": 980 }, { "epoch": 3.1647819063004845, "grad_norm": 0.1730085015296936, "learning_rate": 1.648252536640361e-05, "loss": 0.0527, "step": 981 }, { "epoch": 3.1680129240710824, "grad_norm": 0.17005400359630585, "learning_rate": 1.6478015783540023e-05, "loss": 0.0608, "step": 982 }, { "epoch": 3.1712439418416802, "grad_norm": 0.15987516939640045, "learning_rate": 1.6473506200676438e-05, "loss": 0.0657, "step": 983 }, { "epoch": 3.1744749596122777, "grad_norm": 0.12467172741889954, "learning_rate": 1.6468996617812852e-05, "loss": 0.0532, "step": 984 }, { "epoch": 3.1777059773828755, "grad_norm": 0.13436008989810944, "learning_rate": 1.646448703494927e-05, "loss": 0.0464, "step": 985 }, { "epoch": 3.1809369951534734, "grad_norm": 0.15260566771030426, "learning_rate": 1.6459977452085684e-05, "loss": 0.0585, "step": 986 }, { "epoch": 3.1841680129240713, "grad_norm": 0.1228412613272667, "learning_rate": 1.64554678692221e-05, "loss": 0.042, "step": 987 }, { "epoch": 3.1873990306946687, "grad_norm": 0.1621600091457367, "learning_rate": 1.6450958286358513e-05, "loss": 0.0705, "step": 988 }, { "epoch": 3.1906300484652665, "grad_norm": 0.14798057079315186, "learning_rate": 1.6446448703494927e-05, "loss": 0.0536, "step": 989 }, { "epoch": 3.1938610662358644, "grad_norm": 0.17002591490745544, "learning_rate": 1.644193912063134e-05, "loss": 0.0624, "step": 990 }, { "epoch": 3.197092084006462, "grad_norm": 0.11882289499044418, "learning_rate": 1.6437429537767756e-05, "loss": 0.0386, "step": 991 }, { "epoch": 3.2003231017770597, "grad_norm": 0.177546426653862, "learning_rate": 1.643291995490417e-05, "loss": 0.0684, "step": 992 }, { "epoch": 3.2035541195476576, "grad_norm": 0.1515907645225525, "learning_rate": 1.6428410372040588e-05, "loss": 0.0588, "step": 993 }, { "epoch": 3.2067851373182554, "grad_norm": 0.13172346353530884, "learning_rate": 1.6423900789177003e-05, "loss": 0.0502, "step": 994 }, { "epoch": 3.210016155088853, "grad_norm": 0.1430046707391739, "learning_rate": 1.6419391206313417e-05, "loss": 0.0538, "step": 995 }, { "epoch": 3.2132471728594507, "grad_norm": 0.10192380100488663, "learning_rate": 1.641488162344983e-05, "loss": 0.0414, "step": 996 }, { "epoch": 3.2164781906300486, "grad_norm": 0.12296223640441895, "learning_rate": 1.6410372040586246e-05, "loss": 0.0466, "step": 997 }, { "epoch": 3.219709208400646, "grad_norm": 0.1641893982887268, "learning_rate": 1.640586245772266e-05, "loss": 0.0606, "step": 998 }, { "epoch": 3.222940226171244, "grad_norm": 0.109470434486866, "learning_rate": 1.6401352874859078e-05, "loss": 0.0424, "step": 999 }, { "epoch": 3.2261712439418417, "grad_norm": 0.10068835318088531, "learning_rate": 1.6396843291995492e-05, "loss": 0.0364, "step": 1000 }, { "epoch": 3.2294022617124396, "grad_norm": 0.09672326594591141, "learning_rate": 1.6392333709131907e-05, "loss": 0.0403, "step": 1001 }, { "epoch": 3.232633279483037, "grad_norm": 0.21638123691082, "learning_rate": 1.638782412626832e-05, "loss": 0.0785, "step": 1002 }, { "epoch": 3.235864297253635, "grad_norm": 0.15015633404254913, "learning_rate": 1.6383314543404735e-05, "loss": 0.0513, "step": 1003 }, { "epoch": 3.2390953150242328, "grad_norm": 0.11658553779125214, "learning_rate": 1.637880496054115e-05, "loss": 0.0444, "step": 1004 }, { "epoch": 3.24232633279483, "grad_norm": 0.142287939786911, "learning_rate": 1.6374295377677568e-05, "loss": 0.0576, "step": 1005 }, { "epoch": 3.245557350565428, "grad_norm": 0.11885146051645279, "learning_rate": 1.6369785794813982e-05, "loss": 0.0442, "step": 1006 }, { "epoch": 3.248788368336026, "grad_norm": 0.1423695832490921, "learning_rate": 1.6365276211950396e-05, "loss": 0.0539, "step": 1007 }, { "epoch": 3.2520193861066238, "grad_norm": 0.14337588846683502, "learning_rate": 1.636076662908681e-05, "loss": 0.056, "step": 1008 }, { "epoch": 3.255250403877221, "grad_norm": 0.15875791013240814, "learning_rate": 1.6356257046223225e-05, "loss": 0.0672, "step": 1009 }, { "epoch": 3.258481421647819, "grad_norm": 0.1158808171749115, "learning_rate": 1.635174746335964e-05, "loss": 0.0464, "step": 1010 }, { "epoch": 3.261712439418417, "grad_norm": 0.12882259488105774, "learning_rate": 1.6347237880496057e-05, "loss": 0.0497, "step": 1011 }, { "epoch": 3.2649434571890144, "grad_norm": 0.15846951305866241, "learning_rate": 1.6342728297632472e-05, "loss": 0.0656, "step": 1012 }, { "epoch": 3.268174474959612, "grad_norm": 0.16542094945907593, "learning_rate": 1.6338218714768886e-05, "loss": 0.06, "step": 1013 }, { "epoch": 3.27140549273021, "grad_norm": 0.1318148821592331, "learning_rate": 1.63337091319053e-05, "loss": 0.0475, "step": 1014 }, { "epoch": 3.274636510500808, "grad_norm": 0.110074482858181, "learning_rate": 1.6329199549041715e-05, "loss": 0.0411, "step": 1015 }, { "epoch": 3.2778675282714054, "grad_norm": 0.14870020747184753, "learning_rate": 1.632468996617813e-05, "loss": 0.0491, "step": 1016 }, { "epoch": 3.2810985460420032, "grad_norm": 0.16925957798957825, "learning_rate": 1.6320180383314547e-05, "loss": 0.0648, "step": 1017 }, { "epoch": 3.284329563812601, "grad_norm": 0.11573273688554764, "learning_rate": 1.631567080045096e-05, "loss": 0.0457, "step": 1018 }, { "epoch": 3.2875605815831985, "grad_norm": 0.16456320881843567, "learning_rate": 1.6311161217587376e-05, "loss": 0.0608, "step": 1019 }, { "epoch": 3.2907915993537964, "grad_norm": 0.14626148343086243, "learning_rate": 1.630665163472379e-05, "loss": 0.0543, "step": 1020 }, { "epoch": 3.2940226171243943, "grad_norm": 0.15516629815101624, "learning_rate": 1.6302142051860205e-05, "loss": 0.0628, "step": 1021 }, { "epoch": 3.297253634894992, "grad_norm": 0.18173731863498688, "learning_rate": 1.629763246899662e-05, "loss": 0.0677, "step": 1022 }, { "epoch": 3.3004846526655895, "grad_norm": 0.146285742521286, "learning_rate": 1.6293122886133033e-05, "loss": 0.052, "step": 1023 }, { "epoch": 3.3037156704361874, "grad_norm": 0.19001832604408264, "learning_rate": 1.6288613303269448e-05, "loss": 0.0693, "step": 1024 }, { "epoch": 3.3069466882067853, "grad_norm": 0.09959913045167923, "learning_rate": 1.6284103720405862e-05, "loss": 0.0339, "step": 1025 }, { "epoch": 3.3101777059773827, "grad_norm": 0.16015265882015228, "learning_rate": 1.627959413754228e-05, "loss": 0.0615, "step": 1026 }, { "epoch": 3.3134087237479806, "grad_norm": 0.13552674651145935, "learning_rate": 1.6275084554678694e-05, "loss": 0.0475, "step": 1027 }, { "epoch": 3.3166397415185784, "grad_norm": 0.16153255105018616, "learning_rate": 1.627057497181511e-05, "loss": 0.0601, "step": 1028 }, { "epoch": 3.3198707592891763, "grad_norm": 0.14683452248573303, "learning_rate": 1.6266065388951523e-05, "loss": 0.0526, "step": 1029 }, { "epoch": 3.3231017770597737, "grad_norm": 0.11259462684392929, "learning_rate": 1.6261555806087938e-05, "loss": 0.0429, "step": 1030 }, { "epoch": 3.3263327948303716, "grad_norm": 0.1688949465751648, "learning_rate": 1.6257046223224352e-05, "loss": 0.0624, "step": 1031 }, { "epoch": 3.3295638126009695, "grad_norm": 0.10584679245948792, "learning_rate": 1.6252536640360766e-05, "loss": 0.0376, "step": 1032 }, { "epoch": 3.332794830371567, "grad_norm": 0.16506820917129517, "learning_rate": 1.624802705749718e-05, "loss": 0.0585, "step": 1033 }, { "epoch": 3.3360258481421647, "grad_norm": 0.11264611780643463, "learning_rate": 1.6243517474633595e-05, "loss": 0.0432, "step": 1034 }, { "epoch": 3.3392568659127626, "grad_norm": 0.17402660846710205, "learning_rate": 1.6239007891770013e-05, "loss": 0.0566, "step": 1035 }, { "epoch": 3.3424878836833605, "grad_norm": 0.15407973527908325, "learning_rate": 1.6234498308906427e-05, "loss": 0.0519, "step": 1036 }, { "epoch": 3.345718901453958, "grad_norm": 0.1423128843307495, "learning_rate": 1.622998872604284e-05, "loss": 0.0511, "step": 1037 }, { "epoch": 3.3489499192245558, "grad_norm": 0.11291830986738205, "learning_rate": 1.6225479143179256e-05, "loss": 0.0413, "step": 1038 }, { "epoch": 3.3521809369951536, "grad_norm": 0.17067734897136688, "learning_rate": 1.622096956031567e-05, "loss": 0.0587, "step": 1039 }, { "epoch": 3.355411954765751, "grad_norm": 0.17072725296020508, "learning_rate": 1.6216459977452088e-05, "loss": 0.0606, "step": 1040 }, { "epoch": 3.358642972536349, "grad_norm": 0.13390378654003143, "learning_rate": 1.6211950394588503e-05, "loss": 0.0411, "step": 1041 }, { "epoch": 3.361873990306947, "grad_norm": 0.10424879193305969, "learning_rate": 1.6207440811724917e-05, "loss": 0.0356, "step": 1042 }, { "epoch": 3.3651050080775446, "grad_norm": 0.26167649030685425, "learning_rate": 1.620293122886133e-05, "loss": 0.0569, "step": 1043 }, { "epoch": 3.368336025848142, "grad_norm": 0.1589985489845276, "learning_rate": 1.6198421645997746e-05, "loss": 0.0539, "step": 1044 }, { "epoch": 3.37156704361874, "grad_norm": 0.14946100115776062, "learning_rate": 1.619391206313416e-05, "loss": 0.0531, "step": 1045 }, { "epoch": 3.374798061389338, "grad_norm": 0.1740565448999405, "learning_rate": 1.6189402480270578e-05, "loss": 0.0626, "step": 1046 }, { "epoch": 3.378029079159935, "grad_norm": 0.16527672111988068, "learning_rate": 1.6184892897406992e-05, "loss": 0.061, "step": 1047 }, { "epoch": 3.381260096930533, "grad_norm": 0.11640432476997375, "learning_rate": 1.6180383314543407e-05, "loss": 0.0433, "step": 1048 }, { "epoch": 3.384491114701131, "grad_norm": 0.11857109516859055, "learning_rate": 1.617587373167982e-05, "loss": 0.0374, "step": 1049 }, { "epoch": 3.387722132471729, "grad_norm": 0.15555702149868011, "learning_rate": 1.6171364148816236e-05, "loss": 0.0505, "step": 1050 }, { "epoch": 3.387722132471729, "eval_loss": 0.07340462505817413, "eval_runtime": 188.3173, "eval_samples_per_second": 1.046, "eval_steps_per_second": 1.046, "step": 1050 }, { "epoch": 3.3909531502423262, "grad_norm": 0.1614767611026764, "learning_rate": 1.616685456595265e-05, "loss": 0.0616, "step": 1051 }, { "epoch": 3.394184168012924, "grad_norm": 0.13632138073444366, "learning_rate": 1.6162344983089068e-05, "loss": 0.0495, "step": 1052 }, { "epoch": 3.397415185783522, "grad_norm": 0.1372632533311844, "learning_rate": 1.6157835400225482e-05, "loss": 0.0433, "step": 1053 }, { "epoch": 3.4006462035541194, "grad_norm": 0.17442293465137482, "learning_rate": 1.6153325817361896e-05, "loss": 0.0578, "step": 1054 }, { "epoch": 3.4038772213247173, "grad_norm": 0.18292829394340515, "learning_rate": 1.614881623449831e-05, "loss": 0.0648, "step": 1055 }, { "epoch": 3.407108239095315, "grad_norm": 0.15629050135612488, "learning_rate": 1.6144306651634725e-05, "loss": 0.0504, "step": 1056 }, { "epoch": 3.410339256865913, "grad_norm": 0.1325637549161911, "learning_rate": 1.613979706877114e-05, "loss": 0.039, "step": 1057 }, { "epoch": 3.4135702746365104, "grad_norm": 0.17488038539886475, "learning_rate": 1.6135287485907557e-05, "loss": 0.0557, "step": 1058 }, { "epoch": 3.4168012924071083, "grad_norm": 0.1492920219898224, "learning_rate": 1.6130777903043972e-05, "loss": 0.0537, "step": 1059 }, { "epoch": 3.420032310177706, "grad_norm": 0.14212766289710999, "learning_rate": 1.6126268320180386e-05, "loss": 0.0422, "step": 1060 }, { "epoch": 3.4232633279483036, "grad_norm": 0.1526009440422058, "learning_rate": 1.61217587373168e-05, "loss": 0.0572, "step": 1061 }, { "epoch": 3.4264943457189014, "grad_norm": 0.11353015899658203, "learning_rate": 1.6117249154453215e-05, "loss": 0.0383, "step": 1062 }, { "epoch": 3.4297253634894993, "grad_norm": 0.18253804743289948, "learning_rate": 1.611273957158963e-05, "loss": 0.0657, "step": 1063 }, { "epoch": 3.432956381260097, "grad_norm": 0.1422210931777954, "learning_rate": 1.6108229988726044e-05, "loss": 0.0528, "step": 1064 }, { "epoch": 3.4361873990306946, "grad_norm": 0.15058425068855286, "learning_rate": 1.6103720405862458e-05, "loss": 0.052, "step": 1065 }, { "epoch": 3.4394184168012925, "grad_norm": 0.13702097535133362, "learning_rate": 1.6099210822998873e-05, "loss": 0.0478, "step": 1066 }, { "epoch": 3.4426494345718903, "grad_norm": 0.19859325885772705, "learning_rate": 1.609470124013529e-05, "loss": 0.0512, "step": 1067 }, { "epoch": 3.4458804523424877, "grad_norm": 0.1942027509212494, "learning_rate": 1.6090191657271705e-05, "loss": 0.0711, "step": 1068 }, { "epoch": 3.4491114701130856, "grad_norm": 0.1615460216999054, "learning_rate": 1.608568207440812e-05, "loss": 0.0532, "step": 1069 }, { "epoch": 3.4523424878836835, "grad_norm": 0.15197354555130005, "learning_rate": 1.6081172491544533e-05, "loss": 0.0566, "step": 1070 }, { "epoch": 3.4555735056542813, "grad_norm": 0.131342813372612, "learning_rate": 1.6076662908680948e-05, "loss": 0.0448, "step": 1071 }, { "epoch": 3.4588045234248788, "grad_norm": 0.1431513875722885, "learning_rate": 1.6072153325817362e-05, "loss": 0.0502, "step": 1072 }, { "epoch": 3.4620355411954766, "grad_norm": 0.1926133781671524, "learning_rate": 1.6067643742953777e-05, "loss": 0.0625, "step": 1073 }, { "epoch": 3.4652665589660745, "grad_norm": 0.13775743544101715, "learning_rate": 1.606313416009019e-05, "loss": 0.0529, "step": 1074 }, { "epoch": 3.468497576736672, "grad_norm": 0.1370486617088318, "learning_rate": 1.6058624577226605e-05, "loss": 0.0489, "step": 1075 }, { "epoch": 3.47172859450727, "grad_norm": 0.18667002022266388, "learning_rate": 1.6054114994363023e-05, "loss": 0.0734, "step": 1076 }, { "epoch": 3.4749596122778676, "grad_norm": 0.16826723515987396, "learning_rate": 1.6049605411499438e-05, "loss": 0.0568, "step": 1077 }, { "epoch": 3.4781906300484655, "grad_norm": 0.1706121861934662, "learning_rate": 1.6045095828635852e-05, "loss": 0.064, "step": 1078 }, { "epoch": 3.481421647819063, "grad_norm": 0.12642326951026917, "learning_rate": 1.6040586245772266e-05, "loss": 0.0441, "step": 1079 }, { "epoch": 3.484652665589661, "grad_norm": 0.14685548841953278, "learning_rate": 1.603607666290868e-05, "loss": 0.0484, "step": 1080 }, { "epoch": 3.4878836833602587, "grad_norm": 0.13969610631465912, "learning_rate": 1.6031567080045095e-05, "loss": 0.0467, "step": 1081 }, { "epoch": 3.491114701130856, "grad_norm": 0.18751631677150726, "learning_rate": 1.6027057497181513e-05, "loss": 0.0653, "step": 1082 }, { "epoch": 3.494345718901454, "grad_norm": 0.14187730848789215, "learning_rate": 1.6022547914317927e-05, "loss": 0.0485, "step": 1083 }, { "epoch": 3.497576736672052, "grad_norm": 0.13812421262264252, "learning_rate": 1.6018038331454342e-05, "loss": 0.0494, "step": 1084 }, { "epoch": 3.5008077544426497, "grad_norm": 0.13007357716560364, "learning_rate": 1.6013528748590756e-05, "loss": 0.0454, "step": 1085 }, { "epoch": 3.504038772213247, "grad_norm": 0.16555847227573395, "learning_rate": 1.600901916572717e-05, "loss": 0.0474, "step": 1086 }, { "epoch": 3.507269789983845, "grad_norm": 0.11033131182193756, "learning_rate": 1.6004509582863585e-05, "loss": 0.0367, "step": 1087 }, { "epoch": 3.5105008077544424, "grad_norm": 0.14931395649909973, "learning_rate": 1.6000000000000003e-05, "loss": 0.0489, "step": 1088 }, { "epoch": 3.5137318255250403, "grad_norm": 0.16848234832286835, "learning_rate": 1.5995490417136417e-05, "loss": 0.0578, "step": 1089 }, { "epoch": 3.516962843295638, "grad_norm": 0.15877306461334229, "learning_rate": 1.599098083427283e-05, "loss": 0.0524, "step": 1090 }, { "epoch": 3.520193861066236, "grad_norm": 0.16530410945415497, "learning_rate": 1.5986471251409246e-05, "loss": 0.0619, "step": 1091 }, { "epoch": 3.523424878836834, "grad_norm": 0.14331963658332825, "learning_rate": 1.598196166854566e-05, "loss": 0.0486, "step": 1092 }, { "epoch": 3.5266558966074313, "grad_norm": 0.15027795732021332, "learning_rate": 1.5977452085682075e-05, "loss": 0.048, "step": 1093 }, { "epoch": 3.529886914378029, "grad_norm": 0.15376311540603638, "learning_rate": 1.5972942502818492e-05, "loss": 0.0559, "step": 1094 }, { "epoch": 3.5331179321486266, "grad_norm": 0.1315869837999344, "learning_rate": 1.5968432919954907e-05, "loss": 0.0451, "step": 1095 }, { "epoch": 3.5363489499192244, "grad_norm": 0.13606947660446167, "learning_rate": 1.596392333709132e-05, "loss": 0.0433, "step": 1096 }, { "epoch": 3.5395799676898223, "grad_norm": 0.20028483867645264, "learning_rate": 1.5959413754227736e-05, "loss": 0.0593, "step": 1097 }, { "epoch": 3.54281098546042, "grad_norm": 0.15004722774028778, "learning_rate": 1.595490417136415e-05, "loss": 0.0501, "step": 1098 }, { "epoch": 3.546042003231018, "grad_norm": 0.14318561553955078, "learning_rate": 1.5950394588500564e-05, "loss": 0.0463, "step": 1099 }, { "epoch": 3.5492730210016155, "grad_norm": 0.11907773464918137, "learning_rate": 1.5945885005636982e-05, "loss": 0.0367, "step": 1100 }, { "epoch": 3.5525040387722133, "grad_norm": 0.19116652011871338, "learning_rate": 1.5941375422773397e-05, "loss": 0.0741, "step": 1101 }, { "epoch": 3.5557350565428107, "grad_norm": 0.14904284477233887, "learning_rate": 1.593686583990981e-05, "loss": 0.0493, "step": 1102 }, { "epoch": 3.5589660743134086, "grad_norm": 0.134224995970726, "learning_rate": 1.5932356257046225e-05, "loss": 0.0519, "step": 1103 }, { "epoch": 3.5621970920840065, "grad_norm": 0.16584910452365875, "learning_rate": 1.592784667418264e-05, "loss": 0.0604, "step": 1104 }, { "epoch": 3.5654281098546043, "grad_norm": 0.19957157969474792, "learning_rate": 1.5923337091319054e-05, "loss": 0.0874, "step": 1105 }, { "epoch": 3.568659127625202, "grad_norm": 0.16732187569141388, "learning_rate": 1.591882750845547e-05, "loss": 0.0521, "step": 1106 }, { "epoch": 3.5718901453957996, "grad_norm": 0.10181103646755219, "learning_rate": 1.5914317925591883e-05, "loss": 0.0356, "step": 1107 }, { "epoch": 3.5751211631663975, "grad_norm": 0.1692725121974945, "learning_rate": 1.59098083427283e-05, "loss": 0.0546, "step": 1108 }, { "epoch": 3.578352180936995, "grad_norm": 0.15010833740234375, "learning_rate": 1.5905298759864715e-05, "loss": 0.0525, "step": 1109 }, { "epoch": 3.581583198707593, "grad_norm": 0.14599505066871643, "learning_rate": 1.590078917700113e-05, "loss": 0.0503, "step": 1110 }, { "epoch": 3.5848142164781907, "grad_norm": 0.1455962210893631, "learning_rate": 1.5896279594137544e-05, "loss": 0.0538, "step": 1111 }, { "epoch": 3.5880452342487885, "grad_norm": 0.16955074667930603, "learning_rate": 1.5891770011273958e-05, "loss": 0.0562, "step": 1112 }, { "epoch": 3.5912762520193864, "grad_norm": 0.12441671639680862, "learning_rate": 1.5887260428410373e-05, "loss": 0.0423, "step": 1113 }, { "epoch": 3.594507269789984, "grad_norm": 0.1585661768913269, "learning_rate": 1.5882750845546787e-05, "loss": 0.0585, "step": 1114 }, { "epoch": 3.5977382875605817, "grad_norm": 0.1167236939072609, "learning_rate": 1.58782412626832e-05, "loss": 0.0396, "step": 1115 }, { "epoch": 3.600969305331179, "grad_norm": 0.11918371170759201, "learning_rate": 1.5873731679819616e-05, "loss": 0.0443, "step": 1116 }, { "epoch": 3.604200323101777, "grad_norm": 0.1533348709344864, "learning_rate": 1.586922209695603e-05, "loss": 0.0554, "step": 1117 }, { "epoch": 3.607431340872375, "grad_norm": 0.14366954565048218, "learning_rate": 1.5864712514092448e-05, "loss": 0.046, "step": 1118 }, { "epoch": 3.6106623586429727, "grad_norm": 0.17497192323207855, "learning_rate": 1.5860202931228862e-05, "loss": 0.0572, "step": 1119 }, { "epoch": 3.61389337641357, "grad_norm": 0.13877834379673004, "learning_rate": 1.5855693348365277e-05, "loss": 0.0493, "step": 1120 }, { "epoch": 3.617124394184168, "grad_norm": 0.19728821516036987, "learning_rate": 1.585118376550169e-05, "loss": 0.0712, "step": 1121 }, { "epoch": 3.620355411954766, "grad_norm": 0.11815082281827927, "learning_rate": 1.5846674182638105e-05, "loss": 0.0431, "step": 1122 }, { "epoch": 3.6235864297253633, "grad_norm": 0.1730748862028122, "learning_rate": 1.5842164599774523e-05, "loss": 0.0604, "step": 1123 }, { "epoch": 3.626817447495961, "grad_norm": 0.10923890024423599, "learning_rate": 1.5837655016910938e-05, "loss": 0.0355, "step": 1124 }, { "epoch": 3.630048465266559, "grad_norm": 0.12295415252447128, "learning_rate": 1.5833145434047352e-05, "loss": 0.0412, "step": 1125 }, { "epoch": 3.630048465266559, "eval_loss": 0.0719488188624382, "eval_runtime": 188.3414, "eval_samples_per_second": 1.046, "eval_steps_per_second": 1.046, "step": 1125 } ], "logging_steps": 1, "max_steps": 4635, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 75, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3264284514179097e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }