| { |
| "best_global_step": 1125, |
| "best_metric": 0.0719488188624382, |
| "best_model_checkpoint": "outputs_3/checkpoint-1125", |
| "epoch": 6.290791599353796, |
| "eval_steps": 75, |
| "global_step": 1950, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0032310177705977385, |
| "grad_norm": 0.0250613521784544, |
| "learning_rate": 0.0, |
| "loss": 0.1103, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.006462035541195477, |
| "grad_norm": 0.03354981914162636, |
| "learning_rate": 1.0000000000000001e-07, |
| "loss": 0.1336, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.009693053311793215, |
| "grad_norm": 0.024361876770853996, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 0.1158, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.012924071082390954, |
| "grad_norm": 0.024172648787498474, |
| "learning_rate": 3.0000000000000004e-07, |
| "loss": 0.0937, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.01615508885298869, |
| "grad_norm": 0.024678289890289307, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 0.1047, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.01938610662358643, |
| "grad_norm": 0.031230105087161064, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 0.1479, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.022617124394184167, |
| "grad_norm": 0.024539202451705933, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 0.126, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.025848142164781908, |
| "grad_norm": 0.02983587421476841, |
| "learning_rate": 7.000000000000001e-07, |
| "loss": 0.1218, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.029079159935379646, |
| "grad_norm": 0.02570008486509323, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 0.1153, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.03231017770597738, |
| "grad_norm": 0.024239273741841316, |
| "learning_rate": 9.000000000000001e-07, |
| "loss": 0.0952, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.035541195476575124, |
| "grad_norm": 0.020495450124144554, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0958, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.03877221324717286, |
| "grad_norm": 0.028608962893486023, |
| "learning_rate": 1.1e-06, |
| "loss": 0.1169, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0420032310177706, |
| "grad_norm": 0.03878644108772278, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 0.1371, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.045234248788368334, |
| "grad_norm": 0.02789674885571003, |
| "learning_rate": 1.3e-06, |
| "loss": 0.1266, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.048465266558966075, |
| "grad_norm": 0.03313566744327545, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 0.1301, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.051696284329563816, |
| "grad_norm": 0.0248391292989254, |
| "learning_rate": 1.5e-06, |
| "loss": 0.1056, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.05492730210016155, |
| "grad_norm": 0.024395154789090157, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 0.112, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.05815831987075929, |
| "grad_norm": 0.03043658658862114, |
| "learning_rate": 1.7000000000000002e-06, |
| "loss": 0.1037, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.061389337641357025, |
| "grad_norm": 0.02323235385119915, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.0996, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.06462035541195477, |
| "grad_norm": 0.03656580671668053, |
| "learning_rate": 1.9000000000000002e-06, |
| "loss": 0.1327, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.06785137318255251, |
| "grad_norm": 0.02677535079419613, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.1205, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.07108239095315025, |
| "grad_norm": 0.029969926923513412, |
| "learning_rate": 2.1000000000000002e-06, |
| "loss": 0.126, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.07431340872374798, |
| "grad_norm": 0.02832009270787239, |
| "learning_rate": 2.2e-06, |
| "loss": 0.1208, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.07754442649434572, |
| "grad_norm": 0.023000000044703484, |
| "learning_rate": 2.3000000000000004e-06, |
| "loss": 0.102, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.08077544426494346, |
| "grad_norm": 0.04773552715778351, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 0.1403, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.0840064620355412, |
| "grad_norm": 0.03021993860602379, |
| "learning_rate": 2.5e-06, |
| "loss": 0.1116, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.08723747980613894, |
| "grad_norm": 0.026623567566275597, |
| "learning_rate": 2.6e-06, |
| "loss": 0.1052, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.09046849757673667, |
| "grad_norm": 0.02503894828259945, |
| "learning_rate": 2.7000000000000004e-06, |
| "loss": 0.1016, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.09369951534733441, |
| "grad_norm": 0.026578862220048904, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 0.1026, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.09693053311793215, |
| "grad_norm": 0.029514916241168976, |
| "learning_rate": 2.9e-06, |
| "loss": 0.1047, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.10016155088852989, |
| "grad_norm": 0.025679711252450943, |
| "learning_rate": 3e-06, |
| "loss": 0.1228, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.10339256865912763, |
| "grad_norm": 0.023719631135463715, |
| "learning_rate": 3.1000000000000004e-06, |
| "loss": 0.0997, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.10662358642972536, |
| "grad_norm": 0.03468646854162216, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.1354, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1098546042003231, |
| "grad_norm": 0.02570706605911255, |
| "learning_rate": 3.3000000000000006e-06, |
| "loss": 0.1007, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.11308562197092084, |
| "grad_norm": 0.03752969205379486, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 0.1512, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.11631663974151858, |
| "grad_norm": 0.034612756222486496, |
| "learning_rate": 3.5e-06, |
| "loss": 0.1212, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.11954765751211632, |
| "grad_norm": 0.022866638377308846, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 0.0962, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.12277867528271405, |
| "grad_norm": 0.03314971551299095, |
| "learning_rate": 3.7e-06, |
| "loss": 0.1275, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.1260096930533118, |
| "grad_norm": 0.026049382984638214, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.1095, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.12924071082390953, |
| "grad_norm": 0.027846891433000565, |
| "learning_rate": 3.900000000000001e-06, |
| "loss": 0.1208, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.13247172859450726, |
| "grad_norm": 0.03061266988515854, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.1071, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.13570274636510501, |
| "grad_norm": 0.028831277042627335, |
| "learning_rate": 4.1e-06, |
| "loss": 0.122, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.13893376413570274, |
| "grad_norm": 0.024587510153651237, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 0.0891, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.1421647819063005, |
| "grad_norm": 0.03614000231027603, |
| "learning_rate": 4.3e-06, |
| "loss": 0.1443, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.14539579967689822, |
| "grad_norm": 0.0399002730846405, |
| "learning_rate": 4.4e-06, |
| "loss": 0.1306, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.14862681744749595, |
| "grad_norm": 0.03731178864836693, |
| "learning_rate": 4.5e-06, |
| "loss": 0.1552, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.1518578352180937, |
| "grad_norm": 0.03669052943587303, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 0.1285, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.15508885298869143, |
| "grad_norm": 0.029436811804771423, |
| "learning_rate": 4.7e-06, |
| "loss": 0.1128, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.1583198707592892, |
| "grad_norm": 0.03553691506385803, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 0.1295, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.16155088852988692, |
| "grad_norm": 0.045196086168289185, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 0.1589, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.16478190630048464, |
| "grad_norm": 0.03583088517189026, |
| "learning_rate": 5e-06, |
| "loss": 0.1195, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.1680129240710824, |
| "grad_norm": 0.03799896687269211, |
| "learning_rate": 5.1e-06, |
| "loss": 0.1293, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.17124394184168013, |
| "grad_norm": 0.0341208279132843, |
| "learning_rate": 5.2e-06, |
| "loss": 0.1121, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.17447495961227788, |
| "grad_norm": 0.0367840901017189, |
| "learning_rate": 5.300000000000001e-06, |
| "loss": 0.1288, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.1777059773828756, |
| "grad_norm": 0.03497275337576866, |
| "learning_rate": 5.400000000000001e-06, |
| "loss": 0.1146, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.18093699515347333, |
| "grad_norm": 0.04450898617506027, |
| "learning_rate": 5.500000000000001e-06, |
| "loss": 0.1261, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.1841680129240711, |
| "grad_norm": 0.029873637482523918, |
| "learning_rate": 5.600000000000001e-06, |
| "loss": 0.0981, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.18739903069466882, |
| "grad_norm": 0.030145753175020218, |
| "learning_rate": 5.7e-06, |
| "loss": 0.1121, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.19063004846526657, |
| "grad_norm": 0.03658242151141167, |
| "learning_rate": 5.8e-06, |
| "loss": 0.1416, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.1938610662358643, |
| "grad_norm": 0.049440935254096985, |
| "learning_rate": 5.9e-06, |
| "loss": 0.125, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.19709208400646203, |
| "grad_norm": 0.0388176292181015, |
| "learning_rate": 6e-06, |
| "loss": 0.1195, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.20032310177705978, |
| "grad_norm": 0.03422081843018532, |
| "learning_rate": 6.1e-06, |
| "loss": 0.1191, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.2035541195476575, |
| "grad_norm": 0.047777559608221054, |
| "learning_rate": 6.200000000000001e-06, |
| "loss": 0.1304, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.20678513731825526, |
| "grad_norm": 0.031583212316036224, |
| "learning_rate": 6.300000000000001e-06, |
| "loss": 0.0996, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.210016155088853, |
| "grad_norm": 0.03744835779070854, |
| "learning_rate": 6.4000000000000006e-06, |
| "loss": 0.1124, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.21324717285945072, |
| "grad_norm": 0.04165760055184364, |
| "learning_rate": 6.5000000000000004e-06, |
| "loss": 0.1041, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.21647819063004847, |
| "grad_norm": 0.03987026587128639, |
| "learning_rate": 6.600000000000001e-06, |
| "loss": 0.133, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.2197092084006462, |
| "grad_norm": 0.040481116622686386, |
| "learning_rate": 6.700000000000001e-06, |
| "loss": 0.0944, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.22294022617124395, |
| "grad_norm": 0.03691282495856285, |
| "learning_rate": 6.800000000000001e-06, |
| "loss": 0.1018, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.22617124394184168, |
| "grad_norm": 0.04485676437616348, |
| "learning_rate": 6.9e-06, |
| "loss": 0.137, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2294022617124394, |
| "grad_norm": 0.02854587510228157, |
| "learning_rate": 7e-06, |
| "loss": 0.0885, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.23263327948303716, |
| "grad_norm": 0.04462384432554245, |
| "learning_rate": 7.100000000000001e-06, |
| "loss": 0.1429, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.2358642972536349, |
| "grad_norm": 0.040676336735486984, |
| "learning_rate": 7.2000000000000005e-06, |
| "loss": 0.121, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.23909531502423265, |
| "grad_norm": 0.0420430488884449, |
| "learning_rate": 7.3e-06, |
| "loss": 0.1197, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.24232633279483037, |
| "grad_norm": 0.034694962203502655, |
| "learning_rate": 7.4e-06, |
| "loss": 0.1177, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.24232633279483037, |
| "eval_loss": 0.12372970581054688, |
| "eval_runtime": 188.2884, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.2455573505654281, |
| "grad_norm": 0.03579672798514366, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.1058, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.24878836833602586, |
| "grad_norm": 0.029704652726650238, |
| "learning_rate": 7.600000000000001e-06, |
| "loss": 0.0882, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.2520193861066236, |
| "grad_norm": 0.04732828587293625, |
| "learning_rate": 7.7e-06, |
| "loss": 0.1277, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.2552504038772213, |
| "grad_norm": 0.027987899258732796, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 0.0751, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.25848142164781907, |
| "grad_norm": 0.04119185730814934, |
| "learning_rate": 7.9e-06, |
| "loss": 0.1043, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2617124394184168, |
| "grad_norm": 0.04631367698311806, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.118, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.2649434571890145, |
| "grad_norm": 0.03165270760655403, |
| "learning_rate": 8.1e-06, |
| "loss": 0.0889, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.2681744749596123, |
| "grad_norm": 0.03328806161880493, |
| "learning_rate": 8.2e-06, |
| "loss": 0.097, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.27140549273021003, |
| "grad_norm": 0.05337163060903549, |
| "learning_rate": 8.3e-06, |
| "loss": 0.1373, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.27463651050080773, |
| "grad_norm": 0.030968431383371353, |
| "learning_rate": 8.400000000000001e-06, |
| "loss": 0.077, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.2778675282714055, |
| "grad_norm": 0.03477643430233002, |
| "learning_rate": 8.5e-06, |
| "loss": 0.1016, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.28109854604200324, |
| "grad_norm": 0.04051528871059418, |
| "learning_rate": 8.6e-06, |
| "loss": 0.1017, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.284329563812601, |
| "grad_norm": 0.039160750806331635, |
| "learning_rate": 8.700000000000001e-06, |
| "loss": 0.112, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.2875605815831987, |
| "grad_norm": 0.03572917729616165, |
| "learning_rate": 8.8e-06, |
| "loss": 0.083, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.29079159935379645, |
| "grad_norm": 0.05116155743598938, |
| "learning_rate": 8.900000000000001e-06, |
| "loss": 0.1262, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2940226171243942, |
| "grad_norm": 0.043991196900606155, |
| "learning_rate": 9e-06, |
| "loss": 0.1147, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.2972536348949919, |
| "grad_norm": 0.03514918312430382, |
| "learning_rate": 9.100000000000001e-06, |
| "loss": 0.0874, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.30048465266558966, |
| "grad_norm": 0.03676354140043259, |
| "learning_rate": 9.200000000000002e-06, |
| "loss": 0.0988, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.3037156704361874, |
| "grad_norm": 0.04684548079967499, |
| "learning_rate": 9.3e-06, |
| "loss": 0.129, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.3069466882067851, |
| "grad_norm": 0.033971093595027924, |
| "learning_rate": 9.4e-06, |
| "loss": 0.09, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.31017770597738287, |
| "grad_norm": 0.0339505635201931, |
| "learning_rate": 9.5e-06, |
| "loss": 0.083, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.3134087237479806, |
| "grad_norm": 0.04067440703511238, |
| "learning_rate": 9.600000000000001e-06, |
| "loss": 0.1093, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.3166397415185784, |
| "grad_norm": 0.036671143025159836, |
| "learning_rate": 9.7e-06, |
| "loss": 0.0929, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.3198707592891761, |
| "grad_norm": 0.0459955595433712, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 0.1273, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.32310177705977383, |
| "grad_norm": 0.05120276287198067, |
| "learning_rate": 9.9e-06, |
| "loss": 0.1349, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3263327948303716, |
| "grad_norm": 0.03149951994419098, |
| "learning_rate": 1e-05, |
| "loss": 0.074, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.3295638126009693, |
| "grad_norm": 0.2025747299194336, |
| "learning_rate": 1.0100000000000002e-05, |
| "loss": 0.099, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.33279483037156704, |
| "grad_norm": 0.03870416432619095, |
| "learning_rate": 1.02e-05, |
| "loss": 0.0984, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.3360258481421648, |
| "grad_norm": 0.05344085767865181, |
| "learning_rate": 1.0300000000000001e-05, |
| "loss": 0.1057, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.3392568659127625, |
| "grad_norm": 0.04757027328014374, |
| "learning_rate": 1.04e-05, |
| "loss": 0.124, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.34248788368336025, |
| "grad_norm": 0.03201949968934059, |
| "learning_rate": 1.0500000000000001e-05, |
| "loss": 0.0807, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.345718901453958, |
| "grad_norm": 0.03873045742511749, |
| "learning_rate": 1.0600000000000002e-05, |
| "loss": 0.1048, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.34894991922455576, |
| "grad_norm": 0.027501031756401062, |
| "learning_rate": 1.0700000000000001e-05, |
| "loss": 0.0613, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.35218093699515346, |
| "grad_norm": 0.03909388184547424, |
| "learning_rate": 1.0800000000000002e-05, |
| "loss": 0.0929, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.3554119547657512, |
| "grad_norm": 0.08006097376346588, |
| "learning_rate": 1.0900000000000002e-05, |
| "loss": 0.0908, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.35864297253634897, |
| "grad_norm": 0.03813672810792923, |
| "learning_rate": 1.1000000000000001e-05, |
| "loss": 0.0764, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.36187399030694667, |
| "grad_norm": 0.030327340587973595, |
| "learning_rate": 1.1100000000000002e-05, |
| "loss": 0.077, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.3651050080775444, |
| "grad_norm": 0.03776196017861366, |
| "learning_rate": 1.1200000000000001e-05, |
| "loss": 0.0759, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.3683360258481422, |
| "grad_norm": 0.039037927985191345, |
| "learning_rate": 1.13e-05, |
| "loss": 0.0926, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.3715670436187399, |
| "grad_norm": 0.0416683666408062, |
| "learning_rate": 1.14e-05, |
| "loss": 0.0914, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.37479806138933763, |
| "grad_norm": 0.04185537248849869, |
| "learning_rate": 1.15e-05, |
| "loss": 0.0893, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.3780290791599354, |
| "grad_norm": 0.04651897773146629, |
| "learning_rate": 1.16e-05, |
| "loss": 0.1094, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.38126009693053314, |
| "grad_norm": 0.04604775831103325, |
| "learning_rate": 1.17e-05, |
| "loss": 0.1068, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.38449111470113084, |
| "grad_norm": 0.02846536412835121, |
| "learning_rate": 1.18e-05, |
| "loss": 0.064, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.3877221324717286, |
| "grad_norm": 0.033402059227228165, |
| "learning_rate": 1.1900000000000001e-05, |
| "loss": 0.0725, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.39095315024232635, |
| "grad_norm": 0.044676899909973145, |
| "learning_rate": 1.2e-05, |
| "loss": 0.1017, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.39418416801292405, |
| "grad_norm": 0.05336389318108559, |
| "learning_rate": 1.2100000000000001e-05, |
| "loss": 0.1063, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.3974151857835218, |
| "grad_norm": 0.0402502678334713, |
| "learning_rate": 1.22e-05, |
| "loss": 0.0764, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.40064620355411956, |
| "grad_norm": 0.04342082887887955, |
| "learning_rate": 1.23e-05, |
| "loss": 0.1008, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.40387722132471726, |
| "grad_norm": 0.047081444412469864, |
| "learning_rate": 1.2400000000000002e-05, |
| "loss": 0.1029, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.407108239095315, |
| "grad_norm": 0.038031261414289474, |
| "learning_rate": 1.25e-05, |
| "loss": 0.0783, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.41033925686591277, |
| "grad_norm": 0.03746628388762474, |
| "learning_rate": 1.2600000000000001e-05, |
| "loss": 0.0764, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.4135702746365105, |
| "grad_norm": 0.04288509115576744, |
| "learning_rate": 1.27e-05, |
| "loss": 0.0892, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.4168012924071082, |
| "grad_norm": 0.042407114058732986, |
| "learning_rate": 1.2800000000000001e-05, |
| "loss": 0.0925, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.420032310177706, |
| "grad_norm": 0.03754522651433945, |
| "learning_rate": 1.2900000000000002e-05, |
| "loss": 0.0833, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.42326332794830374, |
| "grad_norm": 0.04337688535451889, |
| "learning_rate": 1.3000000000000001e-05, |
| "loss": 0.0976, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.42649434571890144, |
| "grad_norm": 0.03174331411719322, |
| "learning_rate": 1.3100000000000002e-05, |
| "loss": 0.0684, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.4297253634894992, |
| "grad_norm": 0.04556446522474289, |
| "learning_rate": 1.3200000000000002e-05, |
| "loss": 0.0979, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.43295638126009695, |
| "grad_norm": 0.04222508519887924, |
| "learning_rate": 1.3300000000000001e-05, |
| "loss": 0.0923, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.43618739903069464, |
| "grad_norm": 0.047948531806468964, |
| "learning_rate": 1.3400000000000002e-05, |
| "loss": 0.0991, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.4394184168012924, |
| "grad_norm": 0.04752594605088234, |
| "learning_rate": 1.3500000000000001e-05, |
| "loss": 0.0864, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.44264943457189015, |
| "grad_norm": 0.049508459866046906, |
| "learning_rate": 1.3600000000000002e-05, |
| "loss": 0.1035, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.4458804523424879, |
| "grad_norm": 0.05521553382277489, |
| "learning_rate": 1.3700000000000003e-05, |
| "loss": 0.0978, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.4491114701130856, |
| "grad_norm": 0.05352664738893509, |
| "learning_rate": 1.38e-05, |
| "loss": 0.1097, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.45234248788368336, |
| "grad_norm": 0.04332451522350311, |
| "learning_rate": 1.39e-05, |
| "loss": 0.0917, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4555735056542811, |
| "grad_norm": 0.05932965502142906, |
| "learning_rate": 1.4e-05, |
| "loss": 0.0909, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.4588045234248788, |
| "grad_norm": 0.04634483903646469, |
| "learning_rate": 1.41e-05, |
| "loss": 0.0806, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.4620355411954766, |
| "grad_norm": 0.037898797541856766, |
| "learning_rate": 1.4200000000000001e-05, |
| "loss": 0.0802, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.46526655896607433, |
| "grad_norm": 0.04366337135434151, |
| "learning_rate": 1.43e-05, |
| "loss": 0.0835, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.46849757673667203, |
| "grad_norm": 0.03588287532329559, |
| "learning_rate": 1.4400000000000001e-05, |
| "loss": 0.0746, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.4717285945072698, |
| "grad_norm": 0.04979556053876877, |
| "learning_rate": 1.45e-05, |
| "loss": 0.0914, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.47495961227786754, |
| "grad_norm": 0.03938375040888786, |
| "learning_rate": 1.46e-05, |
| "loss": 0.0696, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.4781906300484653, |
| "grad_norm": 0.04531609266996384, |
| "learning_rate": 1.4700000000000002e-05, |
| "loss": 0.0847, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.481421647819063, |
| "grad_norm": 0.04314682260155678, |
| "learning_rate": 1.48e-05, |
| "loss": 0.0872, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.48465266558966075, |
| "grad_norm": 0.04925313591957092, |
| "learning_rate": 1.4900000000000001e-05, |
| "loss": 0.0905, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.48465266558966075, |
| "eval_loss": 0.09467896819114685, |
| "eval_runtime": 188.3915, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4878836833602585, |
| "grad_norm": 0.04450507089495659, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.0704, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.4911147011308562, |
| "grad_norm": 0.03887806460261345, |
| "learning_rate": 1.5100000000000001e-05, |
| "loss": 0.0727, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.49434571890145396, |
| "grad_norm": 0.04298221319913864, |
| "learning_rate": 1.5200000000000002e-05, |
| "loss": 0.0763, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.4975767366720517, |
| "grad_norm": 0.046640265733003616, |
| "learning_rate": 1.5300000000000003e-05, |
| "loss": 0.0794, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.5008077544426495, |
| "grad_norm": 0.03670887276530266, |
| "learning_rate": 1.54e-05, |
| "loss": 0.0681, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.5040387722132472, |
| "grad_norm": 0.03891611844301224, |
| "learning_rate": 1.55e-05, |
| "loss": 0.0712, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.5072697899838449, |
| "grad_norm": 0.042465128004550934, |
| "learning_rate": 1.5600000000000003e-05, |
| "loss": 0.0828, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.5105008077544426, |
| "grad_norm": 0.054676711559295654, |
| "learning_rate": 1.5700000000000002e-05, |
| "loss": 0.0978, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.5137318255250404, |
| "grad_norm": 0.040950365364551544, |
| "learning_rate": 1.58e-05, |
| "loss": 0.0822, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.5169628432956381, |
| "grad_norm": 0.04756248742341995, |
| "learning_rate": 1.5900000000000004e-05, |
| "loss": 0.0914, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5201938610662359, |
| "grad_norm": 0.046163927763700485, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.089, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.5234248788368336, |
| "grad_norm": 0.04039200022816658, |
| "learning_rate": 1.6100000000000002e-05, |
| "loss": 0.0657, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.5266558966074314, |
| "grad_norm": 0.05602607503533363, |
| "learning_rate": 1.62e-05, |
| "loss": 0.1044, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.529886914378029, |
| "grad_norm": 0.04312260076403618, |
| "learning_rate": 1.63e-05, |
| "loss": 0.0821, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.5331179321486268, |
| "grad_norm": 0.047874368727207184, |
| "learning_rate": 1.64e-05, |
| "loss": 0.0824, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5363489499192245, |
| "grad_norm": 0.06421804428100586, |
| "learning_rate": 1.65e-05, |
| "loss": 0.1126, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.5395799676898223, |
| "grad_norm": 0.05542091280221939, |
| "learning_rate": 1.66e-05, |
| "loss": 0.0959, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.5428109854604201, |
| "grad_norm": 0.05822491645812988, |
| "learning_rate": 1.67e-05, |
| "loss": 0.099, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.5460420032310178, |
| "grad_norm": 0.04154228791594505, |
| "learning_rate": 1.6800000000000002e-05, |
| "loss": 0.0737, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.5492730210016155, |
| "grad_norm": 0.04687827080488205, |
| "learning_rate": 1.69e-05, |
| "loss": 0.0725, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5525040387722132, |
| "grad_norm": 0.056682366877794266, |
| "learning_rate": 1.7e-05, |
| "loss": 0.1004, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.555735056542811, |
| "grad_norm": 0.0667276531457901, |
| "learning_rate": 1.7100000000000002e-05, |
| "loss": 0.0917, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.5589660743134087, |
| "grad_norm": 0.044881295412778854, |
| "learning_rate": 1.72e-05, |
| "loss": 0.0689, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.5621970920840065, |
| "grad_norm": 0.0385456345975399, |
| "learning_rate": 1.73e-05, |
| "loss": 0.0592, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.5654281098546042, |
| "grad_norm": 0.05141144245862961, |
| "learning_rate": 1.7400000000000003e-05, |
| "loss": 0.0895, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.568659127625202, |
| "grad_norm": 0.06854357570409775, |
| "learning_rate": 1.7500000000000002e-05, |
| "loss": 0.0692, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.5718901453957996, |
| "grad_norm": 0.04410829395055771, |
| "learning_rate": 1.76e-05, |
| "loss": 0.0688, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.5751211631663974, |
| "grad_norm": 0.03727763518691063, |
| "learning_rate": 1.77e-05, |
| "loss": 0.0638, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.5783521809369951, |
| "grad_norm": 0.044415879994630814, |
| "learning_rate": 1.7800000000000002e-05, |
| "loss": 0.0682, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.5815831987075929, |
| "grad_norm": 0.06855777651071548, |
| "learning_rate": 1.79e-05, |
| "loss": 0.1018, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5848142164781907, |
| "grad_norm": 0.053684502840042114, |
| "learning_rate": 1.8e-05, |
| "loss": 0.0862, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.5880452342487884, |
| "grad_norm": 0.0487506277859211, |
| "learning_rate": 1.8100000000000003e-05, |
| "loss": 0.073, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.5912762520193862, |
| "grad_norm": 0.04568934440612793, |
| "learning_rate": 1.8200000000000002e-05, |
| "loss": 0.0726, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.5945072697899838, |
| "grad_norm": 0.04607719928026199, |
| "learning_rate": 1.83e-05, |
| "loss": 0.0685, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.5977382875605816, |
| "grad_norm": 0.05040200799703598, |
| "learning_rate": 1.8400000000000003e-05, |
| "loss": 0.0721, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.6009693053311793, |
| "grad_norm": 0.0538799948990345, |
| "learning_rate": 1.8500000000000002e-05, |
| "loss": 0.0769, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.6042003231017771, |
| "grad_norm": 0.058767516165971756, |
| "learning_rate": 1.86e-05, |
| "loss": 0.1015, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.6074313408723748, |
| "grad_norm": 0.056379787623882294, |
| "learning_rate": 1.8700000000000004e-05, |
| "loss": 0.0887, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.6106623586429726, |
| "grad_norm": 0.04885280132293701, |
| "learning_rate": 1.88e-05, |
| "loss": 0.0767, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.6138933764135702, |
| "grad_norm": 0.04340769350528717, |
| "learning_rate": 1.8900000000000002e-05, |
| "loss": 0.061, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.617124394184168, |
| "grad_norm": 0.051385678350925446, |
| "learning_rate": 1.9e-05, |
| "loss": 0.0794, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.6203554119547657, |
| "grad_norm": 0.03737674281001091, |
| "learning_rate": 1.91e-05, |
| "loss": 0.0594, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.6235864297253635, |
| "grad_norm": 0.047995615750551224, |
| "learning_rate": 1.9200000000000003e-05, |
| "loss": 0.06, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.6268174474959612, |
| "grad_norm": 0.04322716221213341, |
| "learning_rate": 1.93e-05, |
| "loss": 0.0712, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.630048465266559, |
| "grad_norm": 0.044713038951158524, |
| "learning_rate": 1.94e-05, |
| "loss": 0.0679, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.6332794830371568, |
| "grad_norm": 0.05132253095507622, |
| "learning_rate": 1.95e-05, |
| "loss": 0.0723, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.6365105008077544, |
| "grad_norm": 0.039808765053749084, |
| "learning_rate": 1.9600000000000002e-05, |
| "loss": 0.057, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.6397415185783522, |
| "grad_norm": 0.05255698040127754, |
| "learning_rate": 1.97e-05, |
| "loss": 0.0774, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.6429725363489499, |
| "grad_norm": 0.05560529604554176, |
| "learning_rate": 1.98e-05, |
| "loss": 0.0839, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.6462035541195477, |
| "grad_norm": 0.0640430748462677, |
| "learning_rate": 1.9900000000000003e-05, |
| "loss": 0.0993, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6494345718901454, |
| "grad_norm": 0.05945132300257683, |
| "learning_rate": 2e-05, |
| "loss": 0.0917, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.6526655896607432, |
| "grad_norm": 0.0569840632379055, |
| "learning_rate": 1.9995490417136416e-05, |
| "loss": 0.096, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.6558966074313409, |
| "grad_norm": 0.052612412720918655, |
| "learning_rate": 1.999098083427283e-05, |
| "loss": 0.079, |
| "step": 203 |
| }, |
| { |
| "epoch": 0.6591276252019386, |
| "grad_norm": 0.05563991889357567, |
| "learning_rate": 1.9986471251409248e-05, |
| "loss": 0.0748, |
| "step": 204 |
| }, |
| { |
| "epoch": 0.6623586429725363, |
| "grad_norm": 0.0527903214097023, |
| "learning_rate": 1.9981961668545663e-05, |
| "loss": 0.0789, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.6655896607431341, |
| "grad_norm": 0.06214462220668793, |
| "learning_rate": 1.9977452085682077e-05, |
| "loss": 0.0806, |
| "step": 206 |
| }, |
| { |
| "epoch": 0.6688206785137318, |
| "grad_norm": 0.05324917659163475, |
| "learning_rate": 1.997294250281849e-05, |
| "loss": 0.0778, |
| "step": 207 |
| }, |
| { |
| "epoch": 0.6720516962843296, |
| "grad_norm": 0.03270899876952171, |
| "learning_rate": 1.9968432919954906e-05, |
| "loss": 0.0469, |
| "step": 208 |
| }, |
| { |
| "epoch": 0.6752827140549273, |
| "grad_norm": 0.060554295778274536, |
| "learning_rate": 1.996392333709132e-05, |
| "loss": 0.0926, |
| "step": 209 |
| }, |
| { |
| "epoch": 0.678513731825525, |
| "grad_norm": 0.05076554790139198, |
| "learning_rate": 1.9959413754227738e-05, |
| "loss": 0.0752, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6817447495961227, |
| "grad_norm": 0.05301008000969887, |
| "learning_rate": 1.9954904171364152e-05, |
| "loss": 0.0783, |
| "step": 211 |
| }, |
| { |
| "epoch": 0.6849757673667205, |
| "grad_norm": 0.04376392439007759, |
| "learning_rate": 1.9950394588500567e-05, |
| "loss": 0.0603, |
| "step": 212 |
| }, |
| { |
| "epoch": 0.6882067851373183, |
| "grad_norm": 0.049395546317100525, |
| "learning_rate": 1.994588500563698e-05, |
| "loss": 0.071, |
| "step": 213 |
| }, |
| { |
| "epoch": 0.691437802907916, |
| "grad_norm": 0.061137910932302475, |
| "learning_rate": 1.9941375422773395e-05, |
| "loss": 0.0995, |
| "step": 214 |
| }, |
| { |
| "epoch": 0.6946688206785138, |
| "grad_norm": 0.05603012442588806, |
| "learning_rate": 1.993686583990981e-05, |
| "loss": 0.0787, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.6978998384491115, |
| "grad_norm": 0.06130882352590561, |
| "learning_rate": 1.9932356257046224e-05, |
| "loss": 0.0745, |
| "step": 216 |
| }, |
| { |
| "epoch": 0.7011308562197092, |
| "grad_norm": 0.06154336780309677, |
| "learning_rate": 1.9927846674182642e-05, |
| "loss": 0.0707, |
| "step": 217 |
| }, |
| { |
| "epoch": 0.7043618739903069, |
| "grad_norm": 0.06344747543334961, |
| "learning_rate": 1.9923337091319056e-05, |
| "loss": 0.0978, |
| "step": 218 |
| }, |
| { |
| "epoch": 0.7075928917609047, |
| "grad_norm": 0.06630375236272812, |
| "learning_rate": 1.991882750845547e-05, |
| "loss": 0.0721, |
| "step": 219 |
| }, |
| { |
| "epoch": 0.7108239095315024, |
| "grad_norm": 0.04976838827133179, |
| "learning_rate": 1.9914317925591885e-05, |
| "loss": 0.0595, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.7140549273021002, |
| "grad_norm": 0.05968477949500084, |
| "learning_rate": 1.99098083427283e-05, |
| "loss": 0.0792, |
| "step": 221 |
| }, |
| { |
| "epoch": 0.7172859450726979, |
| "grad_norm": 0.0983344092965126, |
| "learning_rate": 1.9905298759864714e-05, |
| "loss": 0.0872, |
| "step": 222 |
| }, |
| { |
| "epoch": 0.7205169628432956, |
| "grad_norm": 0.08061773329973221, |
| "learning_rate": 1.990078917700113e-05, |
| "loss": 0.0824, |
| "step": 223 |
| }, |
| { |
| "epoch": 0.7237479806138933, |
| "grad_norm": 0.055074963718652725, |
| "learning_rate": 1.9896279594137543e-05, |
| "loss": 0.0677, |
| "step": 224 |
| }, |
| { |
| "epoch": 0.7269789983844911, |
| "grad_norm": 0.06062469258904457, |
| "learning_rate": 1.9891770011273957e-05, |
| "loss": 0.0658, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7269789983844911, |
| "eval_loss": 0.08582010865211487, |
| "eval_runtime": 188.2111, |
| "eval_samples_per_second": 1.047, |
| "eval_steps_per_second": 1.047, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.7302100161550888, |
| "grad_norm": 0.07606152445077896, |
| "learning_rate": 1.988726042841037e-05, |
| "loss": 0.1078, |
| "step": 226 |
| }, |
| { |
| "epoch": 0.7334410339256866, |
| "grad_norm": 0.06171920895576477, |
| "learning_rate": 1.988275084554679e-05, |
| "loss": 0.086, |
| "step": 227 |
| }, |
| { |
| "epoch": 0.7366720516962844, |
| "grad_norm": 0.03934045881032944, |
| "learning_rate": 1.9878241262683204e-05, |
| "loss": 0.0516, |
| "step": 228 |
| }, |
| { |
| "epoch": 0.7399030694668821, |
| "grad_norm": 0.0552021786570549, |
| "learning_rate": 1.9873731679819618e-05, |
| "loss": 0.0761, |
| "step": 229 |
| }, |
| { |
| "epoch": 0.7431340872374798, |
| "grad_norm": 0.05151893198490143, |
| "learning_rate": 1.9869222096956032e-05, |
| "loss": 0.0726, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7463651050080775, |
| "grad_norm": 0.0533306822180748, |
| "learning_rate": 1.9864712514092447e-05, |
| "loss": 0.0717, |
| "step": 231 |
| }, |
| { |
| "epoch": 0.7495961227786753, |
| "grad_norm": 0.052841685712337494, |
| "learning_rate": 1.986020293122886e-05, |
| "loss": 0.0755, |
| "step": 232 |
| }, |
| { |
| "epoch": 0.752827140549273, |
| "grad_norm": 0.040998924523591995, |
| "learning_rate": 1.9855693348365276e-05, |
| "loss": 0.0554, |
| "step": 233 |
| }, |
| { |
| "epoch": 0.7560581583198708, |
| "grad_norm": 0.057859815657138824, |
| "learning_rate": 1.9851183765501693e-05, |
| "loss": 0.0719, |
| "step": 234 |
| }, |
| { |
| "epoch": 0.7592891760904685, |
| "grad_norm": 0.04167502373456955, |
| "learning_rate": 1.9846674182638108e-05, |
| "loss": 0.0625, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.7625201938610663, |
| "grad_norm": 0.058570049703121185, |
| "learning_rate": 1.9842164599774522e-05, |
| "loss": 0.0773, |
| "step": 236 |
| }, |
| { |
| "epoch": 0.7657512116316639, |
| "grad_norm": 0.06181475892663002, |
| "learning_rate": 1.9837655016910937e-05, |
| "loss": 0.083, |
| "step": 237 |
| }, |
| { |
| "epoch": 0.7689822294022617, |
| "grad_norm": 0.06188640370965004, |
| "learning_rate": 1.983314543404735e-05, |
| "loss": 0.0838, |
| "step": 238 |
| }, |
| { |
| "epoch": 0.7722132471728594, |
| "grad_norm": 0.0784875750541687, |
| "learning_rate": 1.9828635851183765e-05, |
| "loss": 0.1032, |
| "step": 239 |
| }, |
| { |
| "epoch": 0.7754442649434572, |
| "grad_norm": 0.06791771203279495, |
| "learning_rate": 1.9824126268320183e-05, |
| "loss": 0.0888, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.778675282714055, |
| "grad_norm": 0.06790699809789658, |
| "learning_rate": 1.9819616685456598e-05, |
| "loss": 0.0919, |
| "step": 241 |
| }, |
| { |
| "epoch": 0.7819063004846527, |
| "grad_norm": 0.04812704026699066, |
| "learning_rate": 1.9815107102593012e-05, |
| "loss": 0.058, |
| "step": 242 |
| }, |
| { |
| "epoch": 0.7851373182552503, |
| "grad_norm": 0.0617465041577816, |
| "learning_rate": 1.9810597519729426e-05, |
| "loss": 0.0852, |
| "step": 243 |
| }, |
| { |
| "epoch": 0.7883683360258481, |
| "grad_norm": 0.049844078719615936, |
| "learning_rate": 1.980608793686584e-05, |
| "loss": 0.0668, |
| "step": 244 |
| }, |
| { |
| "epoch": 0.7915993537964459, |
| "grad_norm": 0.0725836232304573, |
| "learning_rate": 1.9801578354002255e-05, |
| "loss": 0.0929, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.7948303715670436, |
| "grad_norm": 0.0587320439517498, |
| "learning_rate": 1.9797068771138673e-05, |
| "loss": 0.0764, |
| "step": 246 |
| }, |
| { |
| "epoch": 0.7980613893376414, |
| "grad_norm": 0.04824487864971161, |
| "learning_rate": 1.9792559188275087e-05, |
| "loss": 0.0562, |
| "step": 247 |
| }, |
| { |
| "epoch": 0.8012924071082391, |
| "grad_norm": 0.06668104231357574, |
| "learning_rate": 1.97880496054115e-05, |
| "loss": 0.0842, |
| "step": 248 |
| }, |
| { |
| "epoch": 0.8045234248788369, |
| "grad_norm": 0.057721976190805435, |
| "learning_rate": 1.9783540022547916e-05, |
| "loss": 0.0838, |
| "step": 249 |
| }, |
| { |
| "epoch": 0.8077544426494345, |
| "grad_norm": 0.07014774531126022, |
| "learning_rate": 1.977903043968433e-05, |
| "loss": 0.0831, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8109854604200323, |
| "grad_norm": 0.0693356841802597, |
| "learning_rate": 1.9774520856820745e-05, |
| "loss": 0.0903, |
| "step": 251 |
| }, |
| { |
| "epoch": 0.81421647819063, |
| "grad_norm": 0.05464401841163635, |
| "learning_rate": 1.9770011273957163e-05, |
| "loss": 0.0682, |
| "step": 252 |
| }, |
| { |
| "epoch": 0.8174474959612278, |
| "grad_norm": 0.053677983582019806, |
| "learning_rate": 1.9765501691093577e-05, |
| "loss": 0.0588, |
| "step": 253 |
| }, |
| { |
| "epoch": 0.8206785137318255, |
| "grad_norm": 0.04152385890483856, |
| "learning_rate": 1.976099210822999e-05, |
| "loss": 0.0564, |
| "step": 254 |
| }, |
| { |
| "epoch": 0.8239095315024233, |
| "grad_norm": 0.06150719150900841, |
| "learning_rate": 1.9756482525366406e-05, |
| "loss": 0.0694, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.827140549273021, |
| "grad_norm": 0.05433864891529083, |
| "learning_rate": 1.975197294250282e-05, |
| "loss": 0.067, |
| "step": 256 |
| }, |
| { |
| "epoch": 0.8303715670436187, |
| "grad_norm": 0.04325372353196144, |
| "learning_rate": 1.9747463359639235e-05, |
| "loss": 0.0575, |
| "step": 257 |
| }, |
| { |
| "epoch": 0.8336025848142165, |
| "grad_norm": 0.049097690731287, |
| "learning_rate": 1.9742953776775652e-05, |
| "loss": 0.0571, |
| "step": 258 |
| }, |
| { |
| "epoch": 0.8368336025848142, |
| "grad_norm": 0.06370379030704498, |
| "learning_rate": 1.9738444193912067e-05, |
| "loss": 0.0877, |
| "step": 259 |
| }, |
| { |
| "epoch": 0.840064620355412, |
| "grad_norm": 0.05573710799217224, |
| "learning_rate": 1.973393461104848e-05, |
| "loss": 0.0759, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8432956381260097, |
| "grad_norm": 0.06537079066038132, |
| "learning_rate": 1.9729425028184896e-05, |
| "loss": 0.0759, |
| "step": 261 |
| }, |
| { |
| "epoch": 0.8465266558966075, |
| "grad_norm": 0.04301934316754341, |
| "learning_rate": 1.972491544532131e-05, |
| "loss": 0.0564, |
| "step": 262 |
| }, |
| { |
| "epoch": 0.8497576736672051, |
| "grad_norm": 0.07281677424907684, |
| "learning_rate": 1.9720405862457724e-05, |
| "loss": 0.0685, |
| "step": 263 |
| }, |
| { |
| "epoch": 0.8529886914378029, |
| "grad_norm": 0.06059825047850609, |
| "learning_rate": 1.971589627959414e-05, |
| "loss": 0.071, |
| "step": 264 |
| }, |
| { |
| "epoch": 0.8562197092084006, |
| "grad_norm": 0.05605108663439751, |
| "learning_rate": 1.9711386696730553e-05, |
| "loss": 0.0634, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.8594507269789984, |
| "grad_norm": 0.07372546941041946, |
| "learning_rate": 1.9706877113866967e-05, |
| "loss": 0.0916, |
| "step": 266 |
| }, |
| { |
| "epoch": 0.8626817447495961, |
| "grad_norm": 0.051352906972169876, |
| "learning_rate": 1.9702367531003382e-05, |
| "loss": 0.0653, |
| "step": 267 |
| }, |
| { |
| "epoch": 0.8659127625201939, |
| "grad_norm": 0.059334397315979004, |
| "learning_rate": 1.96978579481398e-05, |
| "loss": 0.0717, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.8691437802907916, |
| "grad_norm": 0.06220857426524162, |
| "learning_rate": 1.9693348365276214e-05, |
| "loss": 0.0612, |
| "step": 269 |
| }, |
| { |
| "epoch": 0.8723747980613893, |
| "grad_norm": 0.053203944116830826, |
| "learning_rate": 1.968883878241263e-05, |
| "loss": 0.0699, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.875605815831987, |
| "grad_norm": 0.06943807750940323, |
| "learning_rate": 1.9684329199549043e-05, |
| "loss": 0.0793, |
| "step": 271 |
| }, |
| { |
| "epoch": 0.8788368336025848, |
| "grad_norm": 0.07023902982473373, |
| "learning_rate": 1.9679819616685457e-05, |
| "loss": 0.0859, |
| "step": 272 |
| }, |
| { |
| "epoch": 0.8820678513731826, |
| "grad_norm": 0.06727661192417145, |
| "learning_rate": 1.967531003382187e-05, |
| "loss": 0.0796, |
| "step": 273 |
| }, |
| { |
| "epoch": 0.8852988691437803, |
| "grad_norm": 0.08004336804151535, |
| "learning_rate": 1.9670800450958286e-05, |
| "loss": 0.0989, |
| "step": 274 |
| }, |
| { |
| "epoch": 0.8885298869143781, |
| "grad_norm": 0.06687738746404648, |
| "learning_rate": 1.96662908680947e-05, |
| "loss": 0.0674, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.8917609046849758, |
| "grad_norm": 0.06280867755413055, |
| "learning_rate": 1.9661781285231118e-05, |
| "loss": 0.0735, |
| "step": 276 |
| }, |
| { |
| "epoch": 0.8949919224555735, |
| "grad_norm": 0.06883740425109863, |
| "learning_rate": 1.9657271702367533e-05, |
| "loss": 0.0893, |
| "step": 277 |
| }, |
| { |
| "epoch": 0.8982229402261712, |
| "grad_norm": 0.059292376041412354, |
| "learning_rate": 1.9652762119503947e-05, |
| "loss": 0.0819, |
| "step": 278 |
| }, |
| { |
| "epoch": 0.901453957996769, |
| "grad_norm": 0.0578530877828598, |
| "learning_rate": 1.964825253664036e-05, |
| "loss": 0.0724, |
| "step": 279 |
| }, |
| { |
| "epoch": 0.9046849757673667, |
| "grad_norm": 0.08932427316904068, |
| "learning_rate": 1.9643742953776776e-05, |
| "loss": 0.0939, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9079159935379645, |
| "grad_norm": 0.07406419515609741, |
| "learning_rate": 1.963923337091319e-05, |
| "loss": 0.0868, |
| "step": 281 |
| }, |
| { |
| "epoch": 0.9111470113085622, |
| "grad_norm": 0.05354011803865433, |
| "learning_rate": 1.9634723788049608e-05, |
| "loss": 0.0685, |
| "step": 282 |
| }, |
| { |
| "epoch": 0.9143780290791599, |
| "grad_norm": 0.06414072960615158, |
| "learning_rate": 1.9630214205186022e-05, |
| "loss": 0.0686, |
| "step": 283 |
| }, |
| { |
| "epoch": 0.9176090468497576, |
| "grad_norm": 0.058192793279886246, |
| "learning_rate": 1.9625704622322437e-05, |
| "loss": 0.0674, |
| "step": 284 |
| }, |
| { |
| "epoch": 0.9208400646203554, |
| "grad_norm": 0.10264746099710464, |
| "learning_rate": 1.962119503945885e-05, |
| "loss": 0.1152, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.9240710823909531, |
| "grad_norm": 0.066757932305336, |
| "learning_rate": 1.9616685456595265e-05, |
| "loss": 0.0757, |
| "step": 286 |
| }, |
| { |
| "epoch": 0.9273021001615509, |
| "grad_norm": 0.06598404794931412, |
| "learning_rate": 1.9612175873731683e-05, |
| "loss": 0.0768, |
| "step": 287 |
| }, |
| { |
| "epoch": 0.9305331179321487, |
| "grad_norm": 0.07162454724311829, |
| "learning_rate": 1.9607666290868098e-05, |
| "loss": 0.081, |
| "step": 288 |
| }, |
| { |
| "epoch": 0.9337641357027464, |
| "grad_norm": 0.05917588993906975, |
| "learning_rate": 1.9603156708004512e-05, |
| "loss": 0.0645, |
| "step": 289 |
| }, |
| { |
| "epoch": 0.9369951534733441, |
| "grad_norm": 0.06051475182175636, |
| "learning_rate": 1.9598647125140926e-05, |
| "loss": 0.0656, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9402261712439418, |
| "grad_norm": 0.06452775001525879, |
| "learning_rate": 1.959413754227734e-05, |
| "loss": 0.0704, |
| "step": 291 |
| }, |
| { |
| "epoch": 0.9434571890145396, |
| "grad_norm": 0.06445769965648651, |
| "learning_rate": 1.9589627959413755e-05, |
| "loss": 0.0759, |
| "step": 292 |
| }, |
| { |
| "epoch": 0.9466882067851373, |
| "grad_norm": 0.06948834657669067, |
| "learning_rate": 1.9585118376550173e-05, |
| "loss": 0.0776, |
| "step": 293 |
| }, |
| { |
| "epoch": 0.9499192245557351, |
| "grad_norm": 0.05026319995522499, |
| "learning_rate": 1.9580608793686587e-05, |
| "loss": 0.0574, |
| "step": 294 |
| }, |
| { |
| "epoch": 0.9531502423263328, |
| "grad_norm": 0.08733383566141129, |
| "learning_rate": 1.9576099210823002e-05, |
| "loss": 0.0898, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.9563812600969306, |
| "grad_norm": 0.05138668045401573, |
| "learning_rate": 1.9571589627959416e-05, |
| "loss": 0.0525, |
| "step": 296 |
| }, |
| { |
| "epoch": 0.9596122778675282, |
| "grad_norm": 0.0710444375872612, |
| "learning_rate": 1.956708004509583e-05, |
| "loss": 0.0832, |
| "step": 297 |
| }, |
| { |
| "epoch": 0.962843295638126, |
| "grad_norm": 0.06288463622331619, |
| "learning_rate": 1.9562570462232245e-05, |
| "loss": 0.0673, |
| "step": 298 |
| }, |
| { |
| "epoch": 0.9660743134087237, |
| "grad_norm": 0.05722356587648392, |
| "learning_rate": 1.9558060879368663e-05, |
| "loss": 0.0702, |
| "step": 299 |
| }, |
| { |
| "epoch": 0.9693053311793215, |
| "grad_norm": 0.07167758047580719, |
| "learning_rate": 1.9553551296505077e-05, |
| "loss": 0.0809, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9693053311793215, |
| "eval_loss": 0.08212888240814209, |
| "eval_runtime": 188.3411, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9725363489499192, |
| "grad_norm": 0.04536513611674309, |
| "learning_rate": 1.954904171364149e-05, |
| "loss": 0.0485, |
| "step": 301 |
| }, |
| { |
| "epoch": 0.975767366720517, |
| "grad_norm": 0.07035136222839355, |
| "learning_rate": 1.9544532130777906e-05, |
| "loss": 0.0763, |
| "step": 302 |
| }, |
| { |
| "epoch": 0.9789983844911146, |
| "grad_norm": 0.06417107582092285, |
| "learning_rate": 1.954002254791432e-05, |
| "loss": 0.0735, |
| "step": 303 |
| }, |
| { |
| "epoch": 0.9822294022617124, |
| "grad_norm": 0.06369137018918991, |
| "learning_rate": 1.9535512965050735e-05, |
| "loss": 0.07, |
| "step": 304 |
| }, |
| { |
| "epoch": 0.9854604200323102, |
| "grad_norm": 0.053664304316043854, |
| "learning_rate": 1.953100338218715e-05, |
| "loss": 0.058, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.9886914378029079, |
| "grad_norm": 0.07393426448106766, |
| "learning_rate": 1.9526493799323563e-05, |
| "loss": 0.0783, |
| "step": 306 |
| }, |
| { |
| "epoch": 0.9919224555735057, |
| "grad_norm": 0.05141662806272507, |
| "learning_rate": 1.9521984216459978e-05, |
| "loss": 0.0556, |
| "step": 307 |
| }, |
| { |
| "epoch": 0.9951534733441034, |
| "grad_norm": 0.06411275267601013, |
| "learning_rate": 1.9517474633596392e-05, |
| "loss": 0.0687, |
| "step": 308 |
| }, |
| { |
| "epoch": 0.9983844911147012, |
| "grad_norm": 0.08877477794885635, |
| "learning_rate": 1.951296505073281e-05, |
| "loss": 0.0846, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.08283062279224396, |
| "learning_rate": 1.9508455467869224e-05, |
| "loss": 0.0541, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0032310177705976, |
| "grad_norm": 0.06769707798957825, |
| "learning_rate": 1.950394588500564e-05, |
| "loss": 0.0752, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.0064620355411955, |
| "grad_norm": 0.06125921383500099, |
| "learning_rate": 1.9499436302142053e-05, |
| "loss": 0.0719, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.0096930533117932, |
| "grad_norm": 0.03994071111083031, |
| "learning_rate": 1.9494926719278468e-05, |
| "loss": 0.0421, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.012924071082391, |
| "grad_norm": 0.05433064326643944, |
| "learning_rate": 1.9490417136414882e-05, |
| "loss": 0.066, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.0161550888529887, |
| "grad_norm": 0.06107380986213684, |
| "learning_rate": 1.9485907553551296e-05, |
| "loss": 0.0724, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.0193861066235865, |
| "grad_norm": 0.0669042244553566, |
| "learning_rate": 1.948139797068771e-05, |
| "loss": 0.0772, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.0226171243941842, |
| "grad_norm": 0.0474565327167511, |
| "learning_rate": 1.947688838782413e-05, |
| "loss": 0.0491, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.0258481421647818, |
| "grad_norm": 0.054098691791296005, |
| "learning_rate": 1.9472378804960543e-05, |
| "loss": 0.0618, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.0290791599353797, |
| "grad_norm": 0.06151336431503296, |
| "learning_rate": 1.9467869222096957e-05, |
| "loss": 0.0604, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.0323101777059773, |
| "grad_norm": 0.051618464291095734, |
| "learning_rate": 1.946335963923337e-05, |
| "loss": 0.0551, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.0355411954765752, |
| "grad_norm": 0.08121399581432343, |
| "learning_rate": 1.9458850056369786e-05, |
| "loss": 0.0939, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.0387722132471728, |
| "grad_norm": 0.05889379233121872, |
| "learning_rate": 1.94543404735062e-05, |
| "loss": 0.0687, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.0420032310177707, |
| "grad_norm": 0.06208242103457451, |
| "learning_rate": 1.9449830890642618e-05, |
| "loss": 0.0744, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.0452342487883683, |
| "grad_norm": 0.06454786658287048, |
| "learning_rate": 1.9445321307779033e-05, |
| "loss": 0.0684, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.048465266558966, |
| "grad_norm": 0.07085470855236053, |
| "learning_rate": 1.9440811724915447e-05, |
| "loss": 0.0727, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.0516962843295639, |
| "grad_norm": 0.07236117869615555, |
| "learning_rate": 1.943630214205186e-05, |
| "loss": 0.0892, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.0549273021001615, |
| "grad_norm": 0.054056137800216675, |
| "learning_rate": 1.9431792559188276e-05, |
| "loss": 0.0634, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.0581583198707594, |
| "grad_norm": 0.05462612211704254, |
| "learning_rate": 1.942728297632469e-05, |
| "loss": 0.0631, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.061389337641357, |
| "grad_norm": 0.0674949586391449, |
| "learning_rate": 1.9422773393461108e-05, |
| "loss": 0.0641, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.0646203554119547, |
| "grad_norm": 0.07532529532909393, |
| "learning_rate": 1.9418263810597522e-05, |
| "loss": 0.0765, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.0678513731825525, |
| "grad_norm": 0.06264142692089081, |
| "learning_rate": 1.9413754227733937e-05, |
| "loss": 0.0704, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.0710823909531502, |
| "grad_norm": 0.06789285689592361, |
| "learning_rate": 1.940924464487035e-05, |
| "loss": 0.0667, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.074313408723748, |
| "grad_norm": 0.06181450933218002, |
| "learning_rate": 1.9404735062006765e-05, |
| "loss": 0.0648, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.0775444264943457, |
| "grad_norm": 0.07014179229736328, |
| "learning_rate": 1.940022547914318e-05, |
| "loss": 0.076, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.0807754442649435, |
| "grad_norm": 0.07433414459228516, |
| "learning_rate": 1.9395715896279598e-05, |
| "loss": 0.0757, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.0840064620355412, |
| "grad_norm": 0.04758503660559654, |
| "learning_rate": 1.9391206313416012e-05, |
| "loss": 0.0492, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.0872374798061388, |
| "grad_norm": 0.06751306354999542, |
| "learning_rate": 1.9386696730552426e-05, |
| "loss": 0.0682, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.0904684975767367, |
| "grad_norm": 0.06028216332197189, |
| "learning_rate": 1.938218714768884e-05, |
| "loss": 0.059, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.0936995153473343, |
| "grad_norm": 0.060358040034770966, |
| "learning_rate": 1.9377677564825255e-05, |
| "loss": 0.0659, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.0969305331179322, |
| "grad_norm": 0.06687436252832413, |
| "learning_rate": 1.937316798196167e-05, |
| "loss": 0.0517, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.1001615508885298, |
| "grad_norm": 0.07463373243808746, |
| "learning_rate": 1.9368658399098087e-05, |
| "loss": 0.0674, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.1033925686591277, |
| "grad_norm": 0.06248531863093376, |
| "learning_rate": 1.9364148816234502e-05, |
| "loss": 0.0681, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.1066235864297254, |
| "grad_norm": 0.06864578276872635, |
| "learning_rate": 1.9359639233370916e-05, |
| "loss": 0.0703, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.109854604200323, |
| "grad_norm": 0.0693066269159317, |
| "learning_rate": 1.935512965050733e-05, |
| "loss": 0.0599, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.1130856219709209, |
| "grad_norm": 0.13610310852527618, |
| "learning_rate": 1.9350620067643745e-05, |
| "loss": 0.0789, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.1163166397415185, |
| "grad_norm": 0.09487364441156387, |
| "learning_rate": 1.934611048478016e-05, |
| "loss": 0.094, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.1195476575121164, |
| "grad_norm": 0.0767926499247551, |
| "learning_rate": 1.9341600901916574e-05, |
| "loss": 0.0751, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.122778675282714, |
| "grad_norm": 0.1105605959892273, |
| "learning_rate": 1.9337091319052988e-05, |
| "loss": 0.0908, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.1260096930533119, |
| "grad_norm": 0.06821838766336441, |
| "learning_rate": 1.9332581736189403e-05, |
| "loss": 0.0702, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.1292407108239095, |
| "grad_norm": 0.07123742997646332, |
| "learning_rate": 1.932807215332582e-05, |
| "loss": 0.0637, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.1324717285945072, |
| "grad_norm": 0.08340942859649658, |
| "learning_rate": 1.9323562570462235e-05, |
| "loss": 0.0803, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.135702746365105, |
| "grad_norm": 0.06730187684297562, |
| "learning_rate": 1.931905298759865e-05, |
| "loss": 0.0644, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.1389337641357027, |
| "grad_norm": 0.06728731095790863, |
| "learning_rate": 1.9314543404735063e-05, |
| "loss": 0.0633, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.1421647819063006, |
| "grad_norm": 0.07192697376012802, |
| "learning_rate": 1.9310033821871478e-05, |
| "loss": 0.0664, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.1453957996768982, |
| "grad_norm": 0.07150010764598846, |
| "learning_rate": 1.9305524239007892e-05, |
| "loss": 0.0745, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.148626817447496, |
| "grad_norm": 0.05815986543893814, |
| "learning_rate": 1.9301014656144307e-05, |
| "loss": 0.0501, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.1518578352180937, |
| "grad_norm": 0.063558429479599, |
| "learning_rate": 1.929650507328072e-05, |
| "loss": 0.064, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.1550888529886914, |
| "grad_norm": 0.08062389492988586, |
| "learning_rate": 1.9291995490417135e-05, |
| "loss": 0.0792, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.1583198707592892, |
| "grad_norm": 0.06872212886810303, |
| "learning_rate": 1.9287485907553553e-05, |
| "loss": 0.0756, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.1615508885298869, |
| "grad_norm": 0.06003013253211975, |
| "learning_rate": 1.9282976324689968e-05, |
| "loss": 0.0577, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.1647819063004847, |
| "grad_norm": 0.07533125579357147, |
| "learning_rate": 1.9278466741826382e-05, |
| "loss": 0.0745, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.1680129240710824, |
| "grad_norm": 0.0708516389131546, |
| "learning_rate": 1.9273957158962796e-05, |
| "loss": 0.0671, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.1712439418416802, |
| "grad_norm": 0.10226985812187195, |
| "learning_rate": 1.926944757609921e-05, |
| "loss": 0.1126, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.1744749596122779, |
| "grad_norm": 0.067733995616436, |
| "learning_rate": 1.9264937993235625e-05, |
| "loss": 0.0554, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.1777059773828755, |
| "grad_norm": 0.08708222955465317, |
| "learning_rate": 1.9260428410372043e-05, |
| "loss": 0.0806, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.1809369951534734, |
| "grad_norm": 0.06153462454676628, |
| "learning_rate": 1.9255918827508457e-05, |
| "loss": 0.0532, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.184168012924071, |
| "grad_norm": 0.051941219717264175, |
| "learning_rate": 1.9251409244644872e-05, |
| "loss": 0.0503, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.187399030694669, |
| "grad_norm": 0.09817774593830109, |
| "learning_rate": 1.9246899661781286e-05, |
| "loss": 0.0801, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.1906300484652665, |
| "grad_norm": 0.08504205197095871, |
| "learning_rate": 1.92423900789177e-05, |
| "loss": 0.08, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.1938610662358644, |
| "grad_norm": 0.0611301064491272, |
| "learning_rate": 1.9237880496054118e-05, |
| "loss": 0.061, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.197092084006462, |
| "grad_norm": 0.06038827449083328, |
| "learning_rate": 1.9233370913190533e-05, |
| "loss": 0.0512, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.2003231017770597, |
| "grad_norm": 0.08283468335866928, |
| "learning_rate": 1.9228861330326947e-05, |
| "loss": 0.0753, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.2035541195476576, |
| "grad_norm": 0.09547346830368042, |
| "learning_rate": 1.922435174746336e-05, |
| "loss": 0.0809, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.2067851373182552, |
| "grad_norm": 0.058611780405044556, |
| "learning_rate": 1.9219842164599776e-05, |
| "loss": 0.0563, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.210016155088853, |
| "grad_norm": 0.08549389988183975, |
| "learning_rate": 1.921533258173619e-05, |
| "loss": 0.0733, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.210016155088853, |
| "eval_loss": 0.0792868584394455, |
| "eval_runtime": 188.2838, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.2132471728594507, |
| "grad_norm": 0.08417635411024094, |
| "learning_rate": 1.9210822998872608e-05, |
| "loss": 0.0841, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.2164781906300486, |
| "grad_norm": 0.08157463371753693, |
| "learning_rate": 1.9206313416009022e-05, |
| "loss": 0.0714, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.2197092084006462, |
| "grad_norm": 0.05649822950363159, |
| "learning_rate": 1.9201803833145437e-05, |
| "loss": 0.0503, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.2229402261712439, |
| "grad_norm": 0.07617928087711334, |
| "learning_rate": 1.919729425028185e-05, |
| "loss": 0.0727, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.2261712439418417, |
| "grad_norm": 0.0574098639190197, |
| "learning_rate": 1.9192784667418266e-05, |
| "loss": 0.0506, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.2294022617124394, |
| "grad_norm": 0.07354257255792618, |
| "learning_rate": 1.918827508455468e-05, |
| "loss": 0.0728, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.2326332794830372, |
| "grad_norm": 0.07268121838569641, |
| "learning_rate": 1.9183765501691098e-05, |
| "loss": 0.0679, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.235864297253635, |
| "grad_norm": 0.07641527056694031, |
| "learning_rate": 1.9179255918827512e-05, |
| "loss": 0.0663, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.2390953150242328, |
| "grad_norm": 0.059996772557497025, |
| "learning_rate": 1.9174746335963926e-05, |
| "loss": 0.0523, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.2423263327948304, |
| "grad_norm": 0.07397306710481644, |
| "learning_rate": 1.917023675310034e-05, |
| "loss": 0.0662, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.245557350565428, |
| "grad_norm": 0.09324625134468079, |
| "learning_rate": 1.9165727170236755e-05, |
| "loss": 0.083, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.248788368336026, |
| "grad_norm": 0.08019818365573883, |
| "learning_rate": 1.916121758737317e-05, |
| "loss": 0.0682, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.2520193861066236, |
| "grad_norm": 0.08203406631946564, |
| "learning_rate": 1.9156708004509584e-05, |
| "loss": 0.0788, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.2552504038772212, |
| "grad_norm": 0.07293461263179779, |
| "learning_rate": 1.9152198421646e-05, |
| "loss": 0.0583, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.258481421647819, |
| "grad_norm": 0.07020010054111481, |
| "learning_rate": 1.9147688838782413e-05, |
| "loss": 0.0546, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.261712439418417, |
| "grad_norm": 0.0655217245221138, |
| "learning_rate": 1.914317925591883e-05, |
| "loss": 0.0561, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.2649434571890146, |
| "grad_norm": 0.0773930773139, |
| "learning_rate": 1.9138669673055245e-05, |
| "loss": 0.0805, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.2681744749596122, |
| "grad_norm": 0.06243716925382614, |
| "learning_rate": 1.913416009019166e-05, |
| "loss": 0.0573, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.27140549273021, |
| "grad_norm": 0.07922864705324173, |
| "learning_rate": 1.9129650507328074e-05, |
| "loss": 0.0679, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.2746365105008077, |
| "grad_norm": 0.10133316367864609, |
| "learning_rate": 1.9125140924464488e-05, |
| "loss": 0.1135, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.2778675282714054, |
| "grad_norm": 0.0727897360920906, |
| "learning_rate": 1.9120631341600903e-05, |
| "loss": 0.0658, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.2810985460420032, |
| "grad_norm": 0.0690392330288887, |
| "learning_rate": 1.9116121758737317e-05, |
| "loss": 0.0656, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.284329563812601, |
| "grad_norm": 0.062050607055425644, |
| "learning_rate": 1.911161217587373e-05, |
| "loss": 0.0542, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.2875605815831987, |
| "grad_norm": 0.0690266340970993, |
| "learning_rate": 1.9107102593010146e-05, |
| "loss": 0.065, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.2907915993537964, |
| "grad_norm": 0.07588627934455872, |
| "learning_rate": 1.9102593010146564e-05, |
| "loss": 0.0761, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.2940226171243943, |
| "grad_norm": 0.07368933409452438, |
| "learning_rate": 1.9098083427282978e-05, |
| "loss": 0.0637, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.297253634894992, |
| "grad_norm": 0.0670572817325592, |
| "learning_rate": 1.9093573844419392e-05, |
| "loss": 0.0656, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.3004846526655895, |
| "grad_norm": 0.06778164952993393, |
| "learning_rate": 1.9089064261555807e-05, |
| "loss": 0.0577, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.3037156704361874, |
| "grad_norm": 0.10589181631803513, |
| "learning_rate": 1.908455467869222e-05, |
| "loss": 0.0849, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.306946688206785, |
| "grad_norm": 0.060739271342754364, |
| "learning_rate": 1.9080045095828635e-05, |
| "loss": 0.0563, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.310177705977383, |
| "grad_norm": 0.062488917261362076, |
| "learning_rate": 1.9075535512965053e-05, |
| "loss": 0.0555, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.3134087237479806, |
| "grad_norm": 0.08088962733745575, |
| "learning_rate": 1.9071025930101468e-05, |
| "loss": 0.0683, |
| "step": 407 |
| }, |
| { |
| "epoch": 1.3166397415185784, |
| "grad_norm": 0.07679299265146255, |
| "learning_rate": 1.9066516347237882e-05, |
| "loss": 0.0713, |
| "step": 408 |
| }, |
| { |
| "epoch": 1.319870759289176, |
| "grad_norm": 0.09731165319681168, |
| "learning_rate": 1.9062006764374296e-05, |
| "loss": 0.094, |
| "step": 409 |
| }, |
| { |
| "epoch": 1.3231017770597737, |
| "grad_norm": 0.09488274902105331, |
| "learning_rate": 1.905749718151071e-05, |
| "loss": 0.0855, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.3263327948303716, |
| "grad_norm": 0.08556380867958069, |
| "learning_rate": 1.9052987598647125e-05, |
| "loss": 0.0872, |
| "step": 411 |
| }, |
| { |
| "epoch": 1.3295638126009692, |
| "grad_norm": 0.0821579322218895, |
| "learning_rate": 1.9048478015783543e-05, |
| "loss": 0.0802, |
| "step": 412 |
| }, |
| { |
| "epoch": 1.332794830371567, |
| "grad_norm": 0.08073097467422485, |
| "learning_rate": 1.9043968432919957e-05, |
| "loss": 0.0764, |
| "step": 413 |
| }, |
| { |
| "epoch": 1.3360258481421647, |
| "grad_norm": 0.07449216395616531, |
| "learning_rate": 1.9039458850056372e-05, |
| "loss": 0.0681, |
| "step": 414 |
| }, |
| { |
| "epoch": 1.3392568659127626, |
| "grad_norm": 0.05690048635005951, |
| "learning_rate": 1.9034949267192786e-05, |
| "loss": 0.0479, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.3424878836833603, |
| "grad_norm": 0.08106525242328644, |
| "learning_rate": 1.90304396843292e-05, |
| "loss": 0.0751, |
| "step": 416 |
| }, |
| { |
| "epoch": 1.345718901453958, |
| "grad_norm": 0.06518511474132538, |
| "learning_rate": 1.9025930101465615e-05, |
| "loss": 0.0594, |
| "step": 417 |
| }, |
| { |
| "epoch": 1.3489499192245558, |
| "grad_norm": 0.08395849913358688, |
| "learning_rate": 1.9021420518602033e-05, |
| "loss": 0.0756, |
| "step": 418 |
| }, |
| { |
| "epoch": 1.3521809369951534, |
| "grad_norm": 0.05958770960569382, |
| "learning_rate": 1.9016910935738447e-05, |
| "loss": 0.0491, |
| "step": 419 |
| }, |
| { |
| "epoch": 1.3554119547657513, |
| "grad_norm": 0.07311136275529861, |
| "learning_rate": 1.901240135287486e-05, |
| "loss": 0.0719, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.358642972536349, |
| "grad_norm": 0.09682740271091461, |
| "learning_rate": 1.9007891770011276e-05, |
| "loss": 0.0895, |
| "step": 421 |
| }, |
| { |
| "epoch": 1.3618739903069468, |
| "grad_norm": 0.06295045465230942, |
| "learning_rate": 1.900338218714769e-05, |
| "loss": 0.0589, |
| "step": 422 |
| }, |
| { |
| "epoch": 1.3651050080775444, |
| "grad_norm": 0.0831819698214531, |
| "learning_rate": 1.8998872604284105e-05, |
| "loss": 0.0727, |
| "step": 423 |
| }, |
| { |
| "epoch": 1.368336025848142, |
| "grad_norm": 0.06702585518360138, |
| "learning_rate": 1.8994363021420522e-05, |
| "loss": 0.0617, |
| "step": 424 |
| }, |
| { |
| "epoch": 1.37156704361874, |
| "grad_norm": 0.06618952006101608, |
| "learning_rate": 1.8989853438556937e-05, |
| "loss": 0.0517, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.3747980613893376, |
| "grad_norm": 0.07830128818750381, |
| "learning_rate": 1.898534385569335e-05, |
| "loss": 0.0747, |
| "step": 426 |
| }, |
| { |
| "epoch": 1.3780290791599354, |
| "grad_norm": 0.07554402947425842, |
| "learning_rate": 1.8980834272829766e-05, |
| "loss": 0.078, |
| "step": 427 |
| }, |
| { |
| "epoch": 1.381260096930533, |
| "grad_norm": 0.07517927139997482, |
| "learning_rate": 1.897632468996618e-05, |
| "loss": 0.0715, |
| "step": 428 |
| }, |
| { |
| "epoch": 1.384491114701131, |
| "grad_norm": 0.05810945853590965, |
| "learning_rate": 1.8971815107102594e-05, |
| "loss": 0.0524, |
| "step": 429 |
| }, |
| { |
| "epoch": 1.3877221324717286, |
| "grad_norm": 0.1092490404844284, |
| "learning_rate": 1.896730552423901e-05, |
| "loss": 0.0987, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.3909531502423262, |
| "grad_norm": 0.08325308561325073, |
| "learning_rate": 1.8962795941375423e-05, |
| "loss": 0.0746, |
| "step": 431 |
| }, |
| { |
| "epoch": 1.394184168012924, |
| "grad_norm": 0.08017408847808838, |
| "learning_rate": 1.895828635851184e-05, |
| "loss": 0.0676, |
| "step": 432 |
| }, |
| { |
| "epoch": 1.3974151857835218, |
| "grad_norm": 0.09756331145763397, |
| "learning_rate": 1.8953776775648255e-05, |
| "loss": 0.0788, |
| "step": 433 |
| }, |
| { |
| "epoch": 1.4006462035541196, |
| "grad_norm": 0.0654483512043953, |
| "learning_rate": 1.894926719278467e-05, |
| "loss": 0.0552, |
| "step": 434 |
| }, |
| { |
| "epoch": 1.4038772213247173, |
| "grad_norm": 0.07338982075452805, |
| "learning_rate": 1.8944757609921084e-05, |
| "loss": 0.0597, |
| "step": 435 |
| }, |
| { |
| "epoch": 1.4071082390953151, |
| "grad_norm": 0.06292750686407089, |
| "learning_rate": 1.89402480270575e-05, |
| "loss": 0.0482, |
| "step": 436 |
| }, |
| { |
| "epoch": 1.4103392568659128, |
| "grad_norm": 0.09405938535928726, |
| "learning_rate": 1.8935738444193913e-05, |
| "loss": 0.0795, |
| "step": 437 |
| }, |
| { |
| "epoch": 1.4135702746365104, |
| "grad_norm": 0.09486392885446548, |
| "learning_rate": 1.8931228861330327e-05, |
| "loss": 0.0811, |
| "step": 438 |
| }, |
| { |
| "epoch": 1.4168012924071083, |
| "grad_norm": 0.0729052945971489, |
| "learning_rate": 1.892671927846674e-05, |
| "loss": 0.0682, |
| "step": 439 |
| }, |
| { |
| "epoch": 1.420032310177706, |
| "grad_norm": 0.06790515035390854, |
| "learning_rate": 1.8922209695603156e-05, |
| "loss": 0.0541, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.4232633279483038, |
| "grad_norm": 0.08173596858978271, |
| "learning_rate": 1.891770011273957e-05, |
| "loss": 0.0697, |
| "step": 441 |
| }, |
| { |
| "epoch": 1.4264943457189014, |
| "grad_norm": 0.0874050036072731, |
| "learning_rate": 1.8913190529875988e-05, |
| "loss": 0.0689, |
| "step": 442 |
| }, |
| { |
| "epoch": 1.4297253634894993, |
| "grad_norm": 0.07508452981710434, |
| "learning_rate": 1.8908680947012403e-05, |
| "loss": 0.069, |
| "step": 443 |
| }, |
| { |
| "epoch": 1.432956381260097, |
| "grad_norm": 0.09134234488010406, |
| "learning_rate": 1.8904171364148817e-05, |
| "loss": 0.072, |
| "step": 444 |
| }, |
| { |
| "epoch": 1.4361873990306946, |
| "grad_norm": 0.0830577090382576, |
| "learning_rate": 1.889966178128523e-05, |
| "loss": 0.0681, |
| "step": 445 |
| }, |
| { |
| "epoch": 1.4394184168012925, |
| "grad_norm": 0.0741642490029335, |
| "learning_rate": 1.8895152198421646e-05, |
| "loss": 0.0642, |
| "step": 446 |
| }, |
| { |
| "epoch": 1.44264943457189, |
| "grad_norm": 0.07305614650249481, |
| "learning_rate": 1.889064261555806e-05, |
| "loss": 0.0569, |
| "step": 447 |
| }, |
| { |
| "epoch": 1.445880452342488, |
| "grad_norm": 0.05348379164934158, |
| "learning_rate": 1.8886133032694478e-05, |
| "loss": 0.0434, |
| "step": 448 |
| }, |
| { |
| "epoch": 1.4491114701130856, |
| "grad_norm": 0.09780937433242798, |
| "learning_rate": 1.8881623449830892e-05, |
| "loss": 0.0856, |
| "step": 449 |
| }, |
| { |
| "epoch": 1.4523424878836835, |
| "grad_norm": 0.081721231341362, |
| "learning_rate": 1.8877113866967307e-05, |
| "loss": 0.0655, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.4523424878836835, |
| "eval_loss": 0.0769171267747879, |
| "eval_runtime": 188.2463, |
| "eval_samples_per_second": 1.047, |
| "eval_steps_per_second": 1.047, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.4555735056542811, |
| "grad_norm": 0.06182597577571869, |
| "learning_rate": 1.887260428410372e-05, |
| "loss": 0.0549, |
| "step": 451 |
| }, |
| { |
| "epoch": 1.4588045234248788, |
| "grad_norm": 0.0831274464726448, |
| "learning_rate": 1.8868094701240136e-05, |
| "loss": 0.0664, |
| "step": 452 |
| }, |
| { |
| "epoch": 1.4620355411954766, |
| "grad_norm": 0.07277555763721466, |
| "learning_rate": 1.8863585118376553e-05, |
| "loss": 0.0619, |
| "step": 453 |
| }, |
| { |
| "epoch": 1.4652665589660743, |
| "grad_norm": 0.09069440513849258, |
| "learning_rate": 1.8859075535512968e-05, |
| "loss": 0.0717, |
| "step": 454 |
| }, |
| { |
| "epoch": 1.468497576736672, |
| "grad_norm": 0.08567981421947479, |
| "learning_rate": 1.8854565952649382e-05, |
| "loss": 0.0761, |
| "step": 455 |
| }, |
| { |
| "epoch": 1.4717285945072698, |
| "grad_norm": 0.08353572338819504, |
| "learning_rate": 1.8850056369785796e-05, |
| "loss": 0.0621, |
| "step": 456 |
| }, |
| { |
| "epoch": 1.4749596122778676, |
| "grad_norm": 0.06493799388408661, |
| "learning_rate": 1.884554678692221e-05, |
| "loss": 0.0549, |
| "step": 457 |
| }, |
| { |
| "epoch": 1.4781906300484653, |
| "grad_norm": 0.07239842414855957, |
| "learning_rate": 1.8841037204058625e-05, |
| "loss": 0.0574, |
| "step": 458 |
| }, |
| { |
| "epoch": 1.481421647819063, |
| "grad_norm": 0.1062210276722908, |
| "learning_rate": 1.8836527621195043e-05, |
| "loss": 0.0831, |
| "step": 459 |
| }, |
| { |
| "epoch": 1.4846526655896608, |
| "grad_norm": 0.06695660948753357, |
| "learning_rate": 1.8832018038331457e-05, |
| "loss": 0.0488, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.4878836833602584, |
| "grad_norm": 0.08332875370979309, |
| "learning_rate": 1.8827508455467872e-05, |
| "loss": 0.0752, |
| "step": 461 |
| }, |
| { |
| "epoch": 1.491114701130856, |
| "grad_norm": 0.09285688400268555, |
| "learning_rate": 1.8822998872604286e-05, |
| "loss": 0.0811, |
| "step": 462 |
| }, |
| { |
| "epoch": 1.494345718901454, |
| "grad_norm": 0.07672538608312607, |
| "learning_rate": 1.88184892897407e-05, |
| "loss": 0.0565, |
| "step": 463 |
| }, |
| { |
| "epoch": 1.4975767366720518, |
| "grad_norm": 0.07295355945825577, |
| "learning_rate": 1.8813979706877115e-05, |
| "loss": 0.0615, |
| "step": 464 |
| }, |
| { |
| "epoch": 1.5008077544426495, |
| "grad_norm": 0.05997586250305176, |
| "learning_rate": 1.8809470124013533e-05, |
| "loss": 0.0494, |
| "step": 465 |
| }, |
| { |
| "epoch": 1.504038772213247, |
| "grad_norm": 0.08460883051156998, |
| "learning_rate": 1.8804960541149947e-05, |
| "loss": 0.0715, |
| "step": 466 |
| }, |
| { |
| "epoch": 1.507269789983845, |
| "grad_norm": 0.08083106577396393, |
| "learning_rate": 1.880045095828636e-05, |
| "loss": 0.0615, |
| "step": 467 |
| }, |
| { |
| "epoch": 1.5105008077544426, |
| "grad_norm": 0.09291260689496994, |
| "learning_rate": 1.8795941375422776e-05, |
| "loss": 0.0767, |
| "step": 468 |
| }, |
| { |
| "epoch": 1.5137318255250403, |
| "grad_norm": 0.0817233994603157, |
| "learning_rate": 1.879143179255919e-05, |
| "loss": 0.0728, |
| "step": 469 |
| }, |
| { |
| "epoch": 1.5169628432956381, |
| "grad_norm": 0.07894831895828247, |
| "learning_rate": 1.8786922209695605e-05, |
| "loss": 0.0697, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.520193861066236, |
| "grad_norm": 0.05180181935429573, |
| "learning_rate": 1.878241262683202e-05, |
| "loss": 0.0407, |
| "step": 471 |
| }, |
| { |
| "epoch": 1.5234248788368336, |
| "grad_norm": 0.08214667439460754, |
| "learning_rate": 1.8777903043968433e-05, |
| "loss": 0.0673, |
| "step": 472 |
| }, |
| { |
| "epoch": 1.5266558966074313, |
| "grad_norm": 0.06972946226596832, |
| "learning_rate": 1.877339346110485e-05, |
| "loss": 0.0501, |
| "step": 473 |
| }, |
| { |
| "epoch": 1.5298869143780292, |
| "grad_norm": 0.08416459709405899, |
| "learning_rate": 1.8768883878241266e-05, |
| "loss": 0.0666, |
| "step": 474 |
| }, |
| { |
| "epoch": 1.5331179321486268, |
| "grad_norm": 0.07642164081335068, |
| "learning_rate": 1.876437429537768e-05, |
| "loss": 0.0592, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.5363489499192244, |
| "grad_norm": 0.0762806385755539, |
| "learning_rate": 1.8759864712514094e-05, |
| "loss": 0.0573, |
| "step": 476 |
| }, |
| { |
| "epoch": 1.5395799676898223, |
| "grad_norm": 0.06152572110295296, |
| "learning_rate": 1.875535512965051e-05, |
| "loss": 0.0509, |
| "step": 477 |
| }, |
| { |
| "epoch": 1.5428109854604202, |
| "grad_norm": 0.08461987972259521, |
| "learning_rate": 1.8750845546786923e-05, |
| "loss": 0.0693, |
| "step": 478 |
| }, |
| { |
| "epoch": 1.5460420032310178, |
| "grad_norm": 0.06401054561138153, |
| "learning_rate": 1.8746335963923338e-05, |
| "loss": 0.0523, |
| "step": 479 |
| }, |
| { |
| "epoch": 1.5492730210016155, |
| "grad_norm": 0.07567861676216125, |
| "learning_rate": 1.8741826381059752e-05, |
| "loss": 0.0632, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.5525040387722133, |
| "grad_norm": 0.07169700413942337, |
| "learning_rate": 1.8737316798196166e-05, |
| "loss": 0.0541, |
| "step": 481 |
| }, |
| { |
| "epoch": 1.555735056542811, |
| "grad_norm": 0.067410409450531, |
| "learning_rate": 1.873280721533258e-05, |
| "loss": 0.0576, |
| "step": 482 |
| }, |
| { |
| "epoch": 1.5589660743134086, |
| "grad_norm": 0.0794718787074089, |
| "learning_rate": 1.8728297632469e-05, |
| "loss": 0.0602, |
| "step": 483 |
| }, |
| { |
| "epoch": 1.5621970920840065, |
| "grad_norm": 0.09098870307207108, |
| "learning_rate": 1.8723788049605413e-05, |
| "loss": 0.0741, |
| "step": 484 |
| }, |
| { |
| "epoch": 1.5654281098546043, |
| "grad_norm": 0.07266968488693237, |
| "learning_rate": 1.8719278466741827e-05, |
| "loss": 0.0535, |
| "step": 485 |
| }, |
| { |
| "epoch": 1.568659127625202, |
| "grad_norm": 0.07994985580444336, |
| "learning_rate": 1.8714768883878242e-05, |
| "loss": 0.0642, |
| "step": 486 |
| }, |
| { |
| "epoch": 1.5718901453957996, |
| "grad_norm": 0.09563203901052475, |
| "learning_rate": 1.8710259301014656e-05, |
| "loss": 0.0738, |
| "step": 487 |
| }, |
| { |
| "epoch": 1.5751211631663975, |
| "grad_norm": 0.07337169349193573, |
| "learning_rate": 1.870574971815107e-05, |
| "loss": 0.0615, |
| "step": 488 |
| }, |
| { |
| "epoch": 1.5783521809369951, |
| "grad_norm": 0.08605758100748062, |
| "learning_rate": 1.8701240135287488e-05, |
| "loss": 0.0737, |
| "step": 489 |
| }, |
| { |
| "epoch": 1.5815831987075928, |
| "grad_norm": 0.08178628236055374, |
| "learning_rate": 1.8696730552423903e-05, |
| "loss": 0.0562, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.5848142164781907, |
| "grad_norm": 0.08635883033275604, |
| "learning_rate": 1.8692220969560317e-05, |
| "loss": 0.0655, |
| "step": 491 |
| }, |
| { |
| "epoch": 1.5880452342487885, |
| "grad_norm": 0.10575321316719055, |
| "learning_rate": 1.868771138669673e-05, |
| "loss": 0.0857, |
| "step": 492 |
| }, |
| { |
| "epoch": 1.5912762520193862, |
| "grad_norm": 0.10067257285118103, |
| "learning_rate": 1.8683201803833146e-05, |
| "loss": 0.0817, |
| "step": 493 |
| }, |
| { |
| "epoch": 1.5945072697899838, |
| "grad_norm": 0.07644681632518768, |
| "learning_rate": 1.867869222096956e-05, |
| "loss": 0.0625, |
| "step": 494 |
| }, |
| { |
| "epoch": 1.5977382875605817, |
| "grad_norm": 0.07164619863033295, |
| "learning_rate": 1.8674182638105978e-05, |
| "loss": 0.0597, |
| "step": 495 |
| }, |
| { |
| "epoch": 1.6009693053311793, |
| "grad_norm": 0.07293085008859634, |
| "learning_rate": 1.8669673055242392e-05, |
| "loss": 0.0604, |
| "step": 496 |
| }, |
| { |
| "epoch": 1.604200323101777, |
| "grad_norm": 0.09480689465999603, |
| "learning_rate": 1.8665163472378807e-05, |
| "loss": 0.0737, |
| "step": 497 |
| }, |
| { |
| "epoch": 1.6074313408723748, |
| "grad_norm": 0.09798948466777802, |
| "learning_rate": 1.866065388951522e-05, |
| "loss": 0.0706, |
| "step": 498 |
| }, |
| { |
| "epoch": 1.6106623586429727, |
| "grad_norm": 0.08216292411088943, |
| "learning_rate": 1.8656144306651636e-05, |
| "loss": 0.0585, |
| "step": 499 |
| }, |
| { |
| "epoch": 1.6138933764135701, |
| "grad_norm": 0.10146701335906982, |
| "learning_rate": 1.865163472378805e-05, |
| "loss": 0.0631, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.617124394184168, |
| "grad_norm": 0.07699297368526459, |
| "learning_rate": 1.8647125140924468e-05, |
| "loss": 0.0592, |
| "step": 501 |
| }, |
| { |
| "epoch": 1.6203554119547658, |
| "grad_norm": 0.07803017646074295, |
| "learning_rate": 1.8642615558060882e-05, |
| "loss": 0.063, |
| "step": 502 |
| }, |
| { |
| "epoch": 1.6235864297253635, |
| "grad_norm": 0.08820293843746185, |
| "learning_rate": 1.8638105975197297e-05, |
| "loss": 0.0733, |
| "step": 503 |
| }, |
| { |
| "epoch": 1.6268174474959611, |
| "grad_norm": 0.10102511942386627, |
| "learning_rate": 1.863359639233371e-05, |
| "loss": 0.0735, |
| "step": 504 |
| }, |
| { |
| "epoch": 1.630048465266559, |
| "grad_norm": 0.08669153600931168, |
| "learning_rate": 1.8629086809470125e-05, |
| "loss": 0.0757, |
| "step": 505 |
| }, |
| { |
| "epoch": 1.6332794830371569, |
| "grad_norm": 0.08120600879192352, |
| "learning_rate": 1.862457722660654e-05, |
| "loss": 0.0586, |
| "step": 506 |
| }, |
| { |
| "epoch": 1.6365105008077543, |
| "grad_norm": 0.06960420310497284, |
| "learning_rate": 1.8620067643742957e-05, |
| "loss": 0.0519, |
| "step": 507 |
| }, |
| { |
| "epoch": 1.6397415185783522, |
| "grad_norm": 0.08567452430725098, |
| "learning_rate": 1.8615558060879372e-05, |
| "loss": 0.0703, |
| "step": 508 |
| }, |
| { |
| "epoch": 1.64297253634895, |
| "grad_norm": 0.08288481831550598, |
| "learning_rate": 1.8611048478015786e-05, |
| "loss": 0.0624, |
| "step": 509 |
| }, |
| { |
| "epoch": 1.6462035541195477, |
| "grad_norm": 0.10185632109642029, |
| "learning_rate": 1.86065388951522e-05, |
| "loss": 0.072, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.6494345718901453, |
| "grad_norm": 0.0914456769824028, |
| "learning_rate": 1.8602029312288615e-05, |
| "loss": 0.0683, |
| "step": 511 |
| }, |
| { |
| "epoch": 1.6526655896607432, |
| "grad_norm": 0.08204774558544159, |
| "learning_rate": 1.859751972942503e-05, |
| "loss": 0.0657, |
| "step": 512 |
| }, |
| { |
| "epoch": 1.655896607431341, |
| "grad_norm": 0.11823786050081253, |
| "learning_rate": 1.8593010146561444e-05, |
| "loss": 0.0872, |
| "step": 513 |
| }, |
| { |
| "epoch": 1.6591276252019385, |
| "grad_norm": 0.13115671277046204, |
| "learning_rate": 1.858850056369786e-05, |
| "loss": 0.1004, |
| "step": 514 |
| }, |
| { |
| "epoch": 1.6623586429725363, |
| "grad_norm": 0.09443841129541397, |
| "learning_rate": 1.8583990980834276e-05, |
| "loss": 0.0722, |
| "step": 515 |
| }, |
| { |
| "epoch": 1.6655896607431342, |
| "grad_norm": 0.06847009062767029, |
| "learning_rate": 1.857948139797069e-05, |
| "loss": 0.054, |
| "step": 516 |
| }, |
| { |
| "epoch": 1.6688206785137318, |
| "grad_norm": 0.07960178703069687, |
| "learning_rate": 1.8574971815107105e-05, |
| "loss": 0.065, |
| "step": 517 |
| }, |
| { |
| "epoch": 1.6720516962843295, |
| "grad_norm": 0.07255195826292038, |
| "learning_rate": 1.857046223224352e-05, |
| "loss": 0.0523, |
| "step": 518 |
| }, |
| { |
| "epoch": 1.6752827140549273, |
| "grad_norm": 0.08610787242650986, |
| "learning_rate": 1.8565952649379934e-05, |
| "loss": 0.0667, |
| "step": 519 |
| }, |
| { |
| "epoch": 1.678513731825525, |
| "grad_norm": 0.09422770887613297, |
| "learning_rate": 1.8561443066516348e-05, |
| "loss": 0.0772, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.6817447495961226, |
| "grad_norm": 0.10716807097196579, |
| "learning_rate": 1.8556933483652762e-05, |
| "loss": 0.0825, |
| "step": 521 |
| }, |
| { |
| "epoch": 1.6849757673667205, |
| "grad_norm": 0.09894333779811859, |
| "learning_rate": 1.8552423900789177e-05, |
| "loss": 0.084, |
| "step": 522 |
| }, |
| { |
| "epoch": 1.6882067851373184, |
| "grad_norm": 0.08127731829881668, |
| "learning_rate": 1.854791431792559e-05, |
| "loss": 0.063, |
| "step": 523 |
| }, |
| { |
| "epoch": 1.691437802907916, |
| "grad_norm": 0.08127739280462265, |
| "learning_rate": 1.854340473506201e-05, |
| "loss": 0.0602, |
| "step": 524 |
| }, |
| { |
| "epoch": 1.6946688206785137, |
| "grad_norm": 0.08109954744577408, |
| "learning_rate": 1.8538895152198423e-05, |
| "loss": 0.0618, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.6946688206785137, |
| "eval_loss": 0.07574764639139175, |
| "eval_runtime": 188.1431, |
| "eval_samples_per_second": 1.047, |
| "eval_steps_per_second": 1.047, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.6978998384491115, |
| "grad_norm": 0.07915576547384262, |
| "learning_rate": 1.8534385569334838e-05, |
| "loss": 0.0566, |
| "step": 526 |
| }, |
| { |
| "epoch": 1.7011308562197092, |
| "grad_norm": 0.09259936213493347, |
| "learning_rate": 1.8529875986471252e-05, |
| "loss": 0.0821, |
| "step": 527 |
| }, |
| { |
| "epoch": 1.7043618739903068, |
| "grad_norm": 0.06958405673503876, |
| "learning_rate": 1.8525366403607666e-05, |
| "loss": 0.0513, |
| "step": 528 |
| }, |
| { |
| "epoch": 1.7075928917609047, |
| "grad_norm": 0.11260278522968292, |
| "learning_rate": 1.852085682074408e-05, |
| "loss": 0.0791, |
| "step": 529 |
| }, |
| { |
| "epoch": 1.7108239095315025, |
| "grad_norm": 0.08565714955329895, |
| "learning_rate": 1.8516347237880495e-05, |
| "loss": 0.0704, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.7140549273021002, |
| "grad_norm": 0.0977453961968422, |
| "learning_rate": 1.8511837655016913e-05, |
| "loss": 0.0666, |
| "step": 531 |
| }, |
| { |
| "epoch": 1.7172859450726978, |
| "grad_norm": 0.09589142352342606, |
| "learning_rate": 1.8507328072153327e-05, |
| "loss": 0.0678, |
| "step": 532 |
| }, |
| { |
| "epoch": 1.7205169628432957, |
| "grad_norm": 0.10372763872146606, |
| "learning_rate": 1.8502818489289742e-05, |
| "loss": 0.0755, |
| "step": 533 |
| }, |
| { |
| "epoch": 1.7237479806138933, |
| "grad_norm": 0.09707041829824448, |
| "learning_rate": 1.8498308906426156e-05, |
| "loss": 0.0759, |
| "step": 534 |
| }, |
| { |
| "epoch": 1.726978998384491, |
| "grad_norm": 0.07280156016349792, |
| "learning_rate": 1.849379932356257e-05, |
| "loss": 0.058, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.7302100161550888, |
| "grad_norm": 0.08810850977897644, |
| "learning_rate": 1.848928974069899e-05, |
| "loss": 0.0691, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.7334410339256867, |
| "grad_norm": 0.09844056516885757, |
| "learning_rate": 1.8484780157835403e-05, |
| "loss": 0.0682, |
| "step": 537 |
| }, |
| { |
| "epoch": 1.7366720516962844, |
| "grad_norm": 0.06963982433080673, |
| "learning_rate": 1.8480270574971817e-05, |
| "loss": 0.0513, |
| "step": 538 |
| }, |
| { |
| "epoch": 1.739903069466882, |
| "grad_norm": 0.08248520642518997, |
| "learning_rate": 1.847576099210823e-05, |
| "loss": 0.0611, |
| "step": 539 |
| }, |
| { |
| "epoch": 1.7431340872374799, |
| "grad_norm": 0.09553173929452896, |
| "learning_rate": 1.8471251409244646e-05, |
| "loss": 0.0754, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.7463651050080775, |
| "grad_norm": 0.06919584423303604, |
| "learning_rate": 1.846674182638106e-05, |
| "loss": 0.0508, |
| "step": 541 |
| }, |
| { |
| "epoch": 1.7495961227786752, |
| "grad_norm": 0.07004183530807495, |
| "learning_rate": 1.8462232243517478e-05, |
| "loss": 0.0498, |
| "step": 542 |
| }, |
| { |
| "epoch": 1.752827140549273, |
| "grad_norm": 0.08570928126573563, |
| "learning_rate": 1.8457722660653892e-05, |
| "loss": 0.0631, |
| "step": 543 |
| }, |
| { |
| "epoch": 1.7560581583198709, |
| "grad_norm": 0.0732467994093895, |
| "learning_rate": 1.8453213077790307e-05, |
| "loss": 0.0557, |
| "step": 544 |
| }, |
| { |
| "epoch": 1.7592891760904685, |
| "grad_norm": 0.07687011361122131, |
| "learning_rate": 1.844870349492672e-05, |
| "loss": 0.0573, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.7625201938610662, |
| "grad_norm": 0.07186026871204376, |
| "learning_rate": 1.8444193912063136e-05, |
| "loss": 0.0502, |
| "step": 546 |
| }, |
| { |
| "epoch": 1.765751211631664, |
| "grad_norm": 0.07176259905099869, |
| "learning_rate": 1.843968432919955e-05, |
| "loss": 0.0529, |
| "step": 547 |
| }, |
| { |
| "epoch": 1.7689822294022617, |
| "grad_norm": 0.0842595249414444, |
| "learning_rate": 1.8435174746335968e-05, |
| "loss": 0.0674, |
| "step": 548 |
| }, |
| { |
| "epoch": 1.7722132471728593, |
| "grad_norm": 0.07965710759162903, |
| "learning_rate": 1.8430665163472382e-05, |
| "loss": 0.0619, |
| "step": 549 |
| }, |
| { |
| "epoch": 1.7754442649434572, |
| "grad_norm": 0.08953316509723663, |
| "learning_rate": 1.8426155580608797e-05, |
| "loss": 0.0643, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.778675282714055, |
| "grad_norm": 0.0928904190659523, |
| "learning_rate": 1.842164599774521e-05, |
| "loss": 0.0712, |
| "step": 551 |
| }, |
| { |
| "epoch": 1.7819063004846527, |
| "grad_norm": 0.08743231743574142, |
| "learning_rate": 1.8417136414881625e-05, |
| "loss": 0.0657, |
| "step": 552 |
| }, |
| { |
| "epoch": 1.7851373182552503, |
| "grad_norm": 0.07706678658723831, |
| "learning_rate": 1.841262683201804e-05, |
| "loss": 0.0568, |
| "step": 553 |
| }, |
| { |
| "epoch": 1.7883683360258482, |
| "grad_norm": 0.0831725150346756, |
| "learning_rate": 1.8408117249154454e-05, |
| "loss": 0.0578, |
| "step": 554 |
| }, |
| { |
| "epoch": 1.7915993537964459, |
| "grad_norm": 0.09395398199558258, |
| "learning_rate": 1.8403607666290872e-05, |
| "loss": 0.0715, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.7948303715670435, |
| "grad_norm": 0.0942830964922905, |
| "learning_rate": 1.8399098083427286e-05, |
| "loss": 0.0634, |
| "step": 556 |
| }, |
| { |
| "epoch": 1.7980613893376414, |
| "grad_norm": 0.0980205312371254, |
| "learning_rate": 1.83945885005637e-05, |
| "loss": 0.0694, |
| "step": 557 |
| }, |
| { |
| "epoch": 1.8012924071082392, |
| "grad_norm": 0.10699216276407242, |
| "learning_rate": 1.8390078917700115e-05, |
| "loss": 0.0871, |
| "step": 558 |
| }, |
| { |
| "epoch": 1.8045234248788369, |
| "grad_norm": 0.09851755946874619, |
| "learning_rate": 1.838556933483653e-05, |
| "loss": 0.0644, |
| "step": 559 |
| }, |
| { |
| "epoch": 1.8077544426494345, |
| "grad_norm": 0.09926044940948486, |
| "learning_rate": 1.8381059751972944e-05, |
| "loss": 0.0629, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.8109854604200324, |
| "grad_norm": 0.09520839154720306, |
| "learning_rate": 1.8376550169109358e-05, |
| "loss": 0.0607, |
| "step": 561 |
| }, |
| { |
| "epoch": 1.81421647819063, |
| "grad_norm": 0.06896607577800751, |
| "learning_rate": 1.8372040586245773e-05, |
| "loss": 0.0559, |
| "step": 562 |
| }, |
| { |
| "epoch": 1.8174474959612277, |
| "grad_norm": 0.09539300203323364, |
| "learning_rate": 1.8367531003382187e-05, |
| "loss": 0.0683, |
| "step": 563 |
| }, |
| { |
| "epoch": 1.8206785137318255, |
| "grad_norm": 0.10596197098493576, |
| "learning_rate": 1.83630214205186e-05, |
| "loss": 0.075, |
| "step": 564 |
| }, |
| { |
| "epoch": 1.8239095315024234, |
| "grad_norm": 0.13212427496910095, |
| "learning_rate": 1.835851183765502e-05, |
| "loss": 0.0755, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.827140549273021, |
| "grad_norm": 0.12438125163316727, |
| "learning_rate": 1.8354002254791434e-05, |
| "loss": 0.0853, |
| "step": 566 |
| }, |
| { |
| "epoch": 1.8303715670436187, |
| "grad_norm": 0.06944366544485092, |
| "learning_rate": 1.8349492671927848e-05, |
| "loss": 0.0434, |
| "step": 567 |
| }, |
| { |
| "epoch": 1.8336025848142166, |
| "grad_norm": 0.10360438376665115, |
| "learning_rate": 1.8344983089064262e-05, |
| "loss": 0.0778, |
| "step": 568 |
| }, |
| { |
| "epoch": 1.8368336025848142, |
| "grad_norm": 0.1002860888838768, |
| "learning_rate": 1.8340473506200677e-05, |
| "loss": 0.0733, |
| "step": 569 |
| }, |
| { |
| "epoch": 1.8400646203554119, |
| "grad_norm": 0.10875017940998077, |
| "learning_rate": 1.833596392333709e-05, |
| "loss": 0.0781, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.8432956381260097, |
| "grad_norm": 0.08669572323560715, |
| "learning_rate": 1.8331454340473506e-05, |
| "loss": 0.0587, |
| "step": 571 |
| }, |
| { |
| "epoch": 1.8465266558966076, |
| "grad_norm": 0.09304548799991608, |
| "learning_rate": 1.8326944757609923e-05, |
| "loss": 0.0675, |
| "step": 572 |
| }, |
| { |
| "epoch": 1.849757673667205, |
| "grad_norm": 0.09815046936273575, |
| "learning_rate": 1.8322435174746338e-05, |
| "loss": 0.0752, |
| "step": 573 |
| }, |
| { |
| "epoch": 1.8529886914378029, |
| "grad_norm": 0.08040884137153625, |
| "learning_rate": 1.8317925591882752e-05, |
| "loss": 0.06, |
| "step": 574 |
| }, |
| { |
| "epoch": 1.8562197092084007, |
| "grad_norm": 0.08228793740272522, |
| "learning_rate": 1.8313416009019166e-05, |
| "loss": 0.0547, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.8594507269789984, |
| "grad_norm": 0.07945281267166138, |
| "learning_rate": 1.830890642615558e-05, |
| "loss": 0.0467, |
| "step": 576 |
| }, |
| { |
| "epoch": 1.862681744749596, |
| "grad_norm": 0.07659505307674408, |
| "learning_rate": 1.8304396843291995e-05, |
| "loss": 0.0571, |
| "step": 577 |
| }, |
| { |
| "epoch": 1.865912762520194, |
| "grad_norm": 0.07296533137559891, |
| "learning_rate": 1.8299887260428413e-05, |
| "loss": 0.0567, |
| "step": 578 |
| }, |
| { |
| "epoch": 1.8691437802907918, |
| "grad_norm": 0.10132135450839996, |
| "learning_rate": 1.8295377677564827e-05, |
| "loss": 0.0589, |
| "step": 579 |
| }, |
| { |
| "epoch": 1.8723747980613892, |
| "grad_norm": 0.0985584482550621, |
| "learning_rate": 1.8290868094701242e-05, |
| "loss": 0.0707, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.875605815831987, |
| "grad_norm": 0.09575635194778442, |
| "learning_rate": 1.8286358511837656e-05, |
| "loss": 0.0687, |
| "step": 581 |
| }, |
| { |
| "epoch": 1.878836833602585, |
| "grad_norm": 0.10488908737897873, |
| "learning_rate": 1.828184892897407e-05, |
| "loss": 0.0752, |
| "step": 582 |
| }, |
| { |
| "epoch": 1.8820678513731826, |
| "grad_norm": 0.0739881619811058, |
| "learning_rate": 1.8277339346110485e-05, |
| "loss": 0.0537, |
| "step": 583 |
| }, |
| { |
| "epoch": 1.8852988691437802, |
| "grad_norm": 0.06086435914039612, |
| "learning_rate": 1.8272829763246903e-05, |
| "loss": 0.051, |
| "step": 584 |
| }, |
| { |
| "epoch": 1.888529886914378, |
| "grad_norm": 0.09947849065065384, |
| "learning_rate": 1.8268320180383317e-05, |
| "loss": 0.0705, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.891760904684976, |
| "grad_norm": 0.09509633481502533, |
| "learning_rate": 1.826381059751973e-05, |
| "loss": 0.0665, |
| "step": 586 |
| }, |
| { |
| "epoch": 1.8949919224555734, |
| "grad_norm": 0.12639068067073822, |
| "learning_rate": 1.8259301014656146e-05, |
| "loss": 0.0538, |
| "step": 587 |
| }, |
| { |
| "epoch": 1.8982229402261712, |
| "grad_norm": 0.09957147389650345, |
| "learning_rate": 1.825479143179256e-05, |
| "loss": 0.0649, |
| "step": 588 |
| }, |
| { |
| "epoch": 1.901453957996769, |
| "grad_norm": 0.10096530616283417, |
| "learning_rate": 1.8250281848928975e-05, |
| "loss": 0.0607, |
| "step": 589 |
| }, |
| { |
| "epoch": 1.9046849757673667, |
| "grad_norm": 0.10449621081352234, |
| "learning_rate": 1.8245772266065392e-05, |
| "loss": 0.0742, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.9079159935379644, |
| "grad_norm": 0.09589482843875885, |
| "learning_rate": 1.8241262683201807e-05, |
| "loss": 0.069, |
| "step": 591 |
| }, |
| { |
| "epoch": 1.9111470113085622, |
| "grad_norm": 0.10444579273462296, |
| "learning_rate": 1.823675310033822e-05, |
| "loss": 0.0726, |
| "step": 592 |
| }, |
| { |
| "epoch": 1.9143780290791599, |
| "grad_norm": 0.09007811546325684, |
| "learning_rate": 1.8232243517474636e-05, |
| "loss": 0.0668, |
| "step": 593 |
| }, |
| { |
| "epoch": 1.9176090468497575, |
| "grad_norm": 0.093568354845047, |
| "learning_rate": 1.822773393461105e-05, |
| "loss": 0.0658, |
| "step": 594 |
| }, |
| { |
| "epoch": 1.9208400646203554, |
| "grad_norm": 0.0825546383857727, |
| "learning_rate": 1.8223224351747464e-05, |
| "loss": 0.0586, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.9240710823909533, |
| "grad_norm": 0.08507188409566879, |
| "learning_rate": 1.8218714768883882e-05, |
| "loss": 0.0617, |
| "step": 596 |
| }, |
| { |
| "epoch": 1.927302100161551, |
| "grad_norm": 0.0858079269528389, |
| "learning_rate": 1.8214205186020297e-05, |
| "loss": 0.0601, |
| "step": 597 |
| }, |
| { |
| "epoch": 1.9305331179321485, |
| "grad_norm": 0.1205410435795784, |
| "learning_rate": 1.820969560315671e-05, |
| "loss": 0.0883, |
| "step": 598 |
| }, |
| { |
| "epoch": 1.9337641357027464, |
| "grad_norm": 0.1289929449558258, |
| "learning_rate": 1.8205186020293125e-05, |
| "loss": 0.076, |
| "step": 599 |
| }, |
| { |
| "epoch": 1.936995153473344, |
| "grad_norm": 0.11139614135026932, |
| "learning_rate": 1.820067643742954e-05, |
| "loss": 0.0846, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.936995153473344, |
| "eval_loss": 0.07442964613437653, |
| "eval_runtime": 188.1343, |
| "eval_samples_per_second": 1.047, |
| "eval_steps_per_second": 1.047, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.9402261712439417, |
| "grad_norm": 0.0923788920044899, |
| "learning_rate": 1.8196166854565954e-05, |
| "loss": 0.0605, |
| "step": 601 |
| }, |
| { |
| "epoch": 1.9434571890145396, |
| "grad_norm": 0.1251497119665146, |
| "learning_rate": 1.819165727170237e-05, |
| "loss": 0.0894, |
| "step": 602 |
| }, |
| { |
| "epoch": 1.9466882067851374, |
| "grad_norm": 0.11357556283473969, |
| "learning_rate": 1.8187147688838783e-05, |
| "loss": 0.0819, |
| "step": 603 |
| }, |
| { |
| "epoch": 1.949919224555735, |
| "grad_norm": 0.09567239135503769, |
| "learning_rate": 1.8182638105975197e-05, |
| "loss": 0.0624, |
| "step": 604 |
| }, |
| { |
| "epoch": 1.9531502423263327, |
| "grad_norm": 0.09191922098398209, |
| "learning_rate": 1.8178128523111612e-05, |
| "loss": 0.0578, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.9563812600969306, |
| "grad_norm": 0.09503104537725449, |
| "learning_rate": 1.817361894024803e-05, |
| "loss": 0.0641, |
| "step": 606 |
| }, |
| { |
| "epoch": 1.9596122778675282, |
| "grad_norm": 0.11036618053913116, |
| "learning_rate": 1.8169109357384444e-05, |
| "loss": 0.0666, |
| "step": 607 |
| }, |
| { |
| "epoch": 1.9628432956381259, |
| "grad_norm": 0.09104762971401215, |
| "learning_rate": 1.8164599774520858e-05, |
| "loss": 0.0649, |
| "step": 608 |
| }, |
| { |
| "epoch": 1.9660743134087237, |
| "grad_norm": 0.10882871598005295, |
| "learning_rate": 1.8160090191657273e-05, |
| "loss": 0.0824, |
| "step": 609 |
| }, |
| { |
| "epoch": 1.9693053311793216, |
| "grad_norm": 0.09632111340761185, |
| "learning_rate": 1.8155580608793687e-05, |
| "loss": 0.069, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.9725363489499192, |
| "grad_norm": 0.07966237515211105, |
| "learning_rate": 1.81510710259301e-05, |
| "loss": 0.0557, |
| "step": 611 |
| }, |
| { |
| "epoch": 1.975767366720517, |
| "grad_norm": 0.10335849225521088, |
| "learning_rate": 1.8146561443066516e-05, |
| "loss": 0.0787, |
| "step": 612 |
| }, |
| { |
| "epoch": 1.9789983844911148, |
| "grad_norm": 0.12096443772315979, |
| "learning_rate": 1.814205186020293e-05, |
| "loss": 0.0731, |
| "step": 613 |
| }, |
| { |
| "epoch": 1.9822294022617124, |
| "grad_norm": 0.09432677179574966, |
| "learning_rate": 1.8137542277339348e-05, |
| "loss": 0.0635, |
| "step": 614 |
| }, |
| { |
| "epoch": 1.98546042003231, |
| "grad_norm": 0.11708611994981766, |
| "learning_rate": 1.8133032694475762e-05, |
| "loss": 0.0874, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.988691437802908, |
| "grad_norm": 0.1113506332039833, |
| "learning_rate": 1.8128523111612177e-05, |
| "loss": 0.0678, |
| "step": 616 |
| }, |
| { |
| "epoch": 1.9919224555735058, |
| "grad_norm": 0.09246299415826797, |
| "learning_rate": 1.812401352874859e-05, |
| "loss": 0.0672, |
| "step": 617 |
| }, |
| { |
| "epoch": 1.9951534733441034, |
| "grad_norm": 0.1115182563662529, |
| "learning_rate": 1.8119503945885006e-05, |
| "loss": 0.0758, |
| "step": 618 |
| }, |
| { |
| "epoch": 1.998384491114701, |
| "grad_norm": 0.10478319972753525, |
| "learning_rate": 1.811499436302142e-05, |
| "loss": 0.0648, |
| "step": 619 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.13386856019496918, |
| "learning_rate": 1.8110484780157838e-05, |
| "loss": 0.0624, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.003231017770598, |
| "grad_norm": 0.09262198954820633, |
| "learning_rate": 1.8105975197294252e-05, |
| "loss": 0.0608, |
| "step": 621 |
| }, |
| { |
| "epoch": 2.0064620355411953, |
| "grad_norm": 0.07091473788022995, |
| "learning_rate": 1.8101465614430667e-05, |
| "loss": 0.0434, |
| "step": 622 |
| }, |
| { |
| "epoch": 2.009693053311793, |
| "grad_norm": 0.10324624925851822, |
| "learning_rate": 1.809695603156708e-05, |
| "loss": 0.0764, |
| "step": 623 |
| }, |
| { |
| "epoch": 2.012924071082391, |
| "grad_norm": 0.10515467822551727, |
| "learning_rate": 1.8092446448703495e-05, |
| "loss": 0.0609, |
| "step": 624 |
| }, |
| { |
| "epoch": 2.016155088852989, |
| "grad_norm": 0.10561127960681915, |
| "learning_rate": 1.8087936865839913e-05, |
| "loss": 0.0719, |
| "step": 625 |
| }, |
| { |
| "epoch": 2.0193861066235863, |
| "grad_norm": 0.1146024838089943, |
| "learning_rate": 1.8083427282976327e-05, |
| "loss": 0.0726, |
| "step": 626 |
| }, |
| { |
| "epoch": 2.022617124394184, |
| "grad_norm": 0.07930684089660645, |
| "learning_rate": 1.8078917700112742e-05, |
| "loss": 0.0481, |
| "step": 627 |
| }, |
| { |
| "epoch": 2.025848142164782, |
| "grad_norm": 0.09927454590797424, |
| "learning_rate": 1.8074408117249156e-05, |
| "loss": 0.0608, |
| "step": 628 |
| }, |
| { |
| "epoch": 2.0290791599353795, |
| "grad_norm": 0.08592136949300766, |
| "learning_rate": 1.806989853438557e-05, |
| "loss": 0.0577, |
| "step": 629 |
| }, |
| { |
| "epoch": 2.0323101777059773, |
| "grad_norm": 0.09232696890830994, |
| "learning_rate": 1.8065388951521985e-05, |
| "loss": 0.0528, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.035541195476575, |
| "grad_norm": 0.08930118381977081, |
| "learning_rate": 1.8060879368658403e-05, |
| "loss": 0.0633, |
| "step": 631 |
| }, |
| { |
| "epoch": 2.038772213247173, |
| "grad_norm": 0.09835111349821091, |
| "learning_rate": 1.8056369785794817e-05, |
| "loss": 0.0648, |
| "step": 632 |
| }, |
| { |
| "epoch": 2.0420032310177705, |
| "grad_norm": 0.10789217799901962, |
| "learning_rate": 1.805186020293123e-05, |
| "loss": 0.0695, |
| "step": 633 |
| }, |
| { |
| "epoch": 2.0452342487883683, |
| "grad_norm": 0.10506349802017212, |
| "learning_rate": 1.8047350620067646e-05, |
| "loss": 0.0635, |
| "step": 634 |
| }, |
| { |
| "epoch": 2.048465266558966, |
| "grad_norm": 0.13068322837352753, |
| "learning_rate": 1.804284103720406e-05, |
| "loss": 0.0802, |
| "step": 635 |
| }, |
| { |
| "epoch": 2.0516962843295636, |
| "grad_norm": 0.09663469344377518, |
| "learning_rate": 1.8038331454340475e-05, |
| "loss": 0.062, |
| "step": 636 |
| }, |
| { |
| "epoch": 2.0549273021001615, |
| "grad_norm": 0.07054325938224792, |
| "learning_rate": 1.8033821871476893e-05, |
| "loss": 0.0453, |
| "step": 637 |
| }, |
| { |
| "epoch": 2.0581583198707594, |
| "grad_norm": 0.07739470899105072, |
| "learning_rate": 1.8029312288613307e-05, |
| "loss": 0.0463, |
| "step": 638 |
| }, |
| { |
| "epoch": 2.0613893376413572, |
| "grad_norm": 0.09022580832242966, |
| "learning_rate": 1.802480270574972e-05, |
| "loss": 0.0587, |
| "step": 639 |
| }, |
| { |
| "epoch": 2.0646203554119547, |
| "grad_norm": 0.09953221678733826, |
| "learning_rate": 1.8020293122886136e-05, |
| "loss": 0.0585, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.0678513731825525, |
| "grad_norm": 0.10069511830806732, |
| "learning_rate": 1.801578354002255e-05, |
| "loss": 0.0587, |
| "step": 641 |
| }, |
| { |
| "epoch": 2.0710823909531504, |
| "grad_norm": 0.09177737683057785, |
| "learning_rate": 1.8011273957158964e-05, |
| "loss": 0.0523, |
| "step": 642 |
| }, |
| { |
| "epoch": 2.074313408723748, |
| "grad_norm": 0.1010020524263382, |
| "learning_rate": 1.800676437429538e-05, |
| "loss": 0.0552, |
| "step": 643 |
| }, |
| { |
| "epoch": 2.0775444264943457, |
| "grad_norm": 0.0997423455119133, |
| "learning_rate": 1.8002254791431793e-05, |
| "loss": 0.0662, |
| "step": 644 |
| }, |
| { |
| "epoch": 2.0807754442649435, |
| "grad_norm": 0.09295801818370819, |
| "learning_rate": 1.7997745208568208e-05, |
| "loss": 0.0599, |
| "step": 645 |
| }, |
| { |
| "epoch": 2.0840064620355414, |
| "grad_norm": 0.1053297147154808, |
| "learning_rate": 1.7993235625704622e-05, |
| "loss": 0.064, |
| "step": 646 |
| }, |
| { |
| "epoch": 2.087237479806139, |
| "grad_norm": 0.11978495121002197, |
| "learning_rate": 1.798872604284104e-05, |
| "loss": 0.0727, |
| "step": 647 |
| }, |
| { |
| "epoch": 2.0904684975767367, |
| "grad_norm": 0.07878235727548599, |
| "learning_rate": 1.7984216459977454e-05, |
| "loss": 0.0486, |
| "step": 648 |
| }, |
| { |
| "epoch": 2.0936995153473346, |
| "grad_norm": 0.14993903040885925, |
| "learning_rate": 1.797970687711387e-05, |
| "loss": 0.0458, |
| "step": 649 |
| }, |
| { |
| "epoch": 2.096930533117932, |
| "grad_norm": 0.0925765186548233, |
| "learning_rate": 1.7975197294250283e-05, |
| "loss": 0.0546, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.10016155088853, |
| "grad_norm": 0.09530377388000488, |
| "learning_rate": 1.7970687711386697e-05, |
| "loss": 0.0514, |
| "step": 651 |
| }, |
| { |
| "epoch": 2.1033925686591277, |
| "grad_norm": 0.0945788025856018, |
| "learning_rate": 1.7966178128523112e-05, |
| "loss": 0.0604, |
| "step": 652 |
| }, |
| { |
| "epoch": 2.106623586429725, |
| "grad_norm": 0.11486334353685379, |
| "learning_rate": 1.7961668545659526e-05, |
| "loss": 0.0633, |
| "step": 653 |
| }, |
| { |
| "epoch": 2.109854604200323, |
| "grad_norm": 0.1077791377902031, |
| "learning_rate": 1.795715896279594e-05, |
| "loss": 0.0602, |
| "step": 654 |
| }, |
| { |
| "epoch": 2.113085621970921, |
| "grad_norm": 0.10789015889167786, |
| "learning_rate": 1.795264937993236e-05, |
| "loss": 0.0541, |
| "step": 655 |
| }, |
| { |
| "epoch": 2.1163166397415187, |
| "grad_norm": 0.10327862948179245, |
| "learning_rate": 1.7948139797068773e-05, |
| "loss": 0.0652, |
| "step": 656 |
| }, |
| { |
| "epoch": 2.119547657512116, |
| "grad_norm": 0.10486488789319992, |
| "learning_rate": 1.7943630214205187e-05, |
| "loss": 0.0617, |
| "step": 657 |
| }, |
| { |
| "epoch": 2.122778675282714, |
| "grad_norm": 0.0882355272769928, |
| "learning_rate": 1.79391206313416e-05, |
| "loss": 0.0553, |
| "step": 658 |
| }, |
| { |
| "epoch": 2.126009693053312, |
| "grad_norm": 0.08177275210618973, |
| "learning_rate": 1.7934611048478016e-05, |
| "loss": 0.0522, |
| "step": 659 |
| }, |
| { |
| "epoch": 2.1292407108239093, |
| "grad_norm": 0.1455976665019989, |
| "learning_rate": 1.793010146561443e-05, |
| "loss": 0.0909, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.132471728594507, |
| "grad_norm": 0.1145886555314064, |
| "learning_rate": 1.7925591882750848e-05, |
| "loss": 0.0733, |
| "step": 661 |
| }, |
| { |
| "epoch": 2.135702746365105, |
| "grad_norm": 0.1092807874083519, |
| "learning_rate": 1.7921082299887262e-05, |
| "loss": 0.0581, |
| "step": 662 |
| }, |
| { |
| "epoch": 2.138933764135703, |
| "grad_norm": 0.07647505402565002, |
| "learning_rate": 1.7916572717023677e-05, |
| "loss": 0.0461, |
| "step": 663 |
| }, |
| { |
| "epoch": 2.1421647819063003, |
| "grad_norm": 0.09198980778455734, |
| "learning_rate": 1.791206313416009e-05, |
| "loss": 0.0549, |
| "step": 664 |
| }, |
| { |
| "epoch": 2.145395799676898, |
| "grad_norm": 0.10971511900424957, |
| "learning_rate": 1.7907553551296506e-05, |
| "loss": 0.0643, |
| "step": 665 |
| }, |
| { |
| "epoch": 2.148626817447496, |
| "grad_norm": 0.11374619603157043, |
| "learning_rate": 1.790304396843292e-05, |
| "loss": 0.0754, |
| "step": 666 |
| }, |
| { |
| "epoch": 2.1518578352180935, |
| "grad_norm": 0.09252484142780304, |
| "learning_rate": 1.7898534385569338e-05, |
| "loss": 0.0502, |
| "step": 667 |
| }, |
| { |
| "epoch": 2.1550888529886914, |
| "grad_norm": 0.09586004912853241, |
| "learning_rate": 1.7894024802705752e-05, |
| "loss": 0.0529, |
| "step": 668 |
| }, |
| { |
| "epoch": 2.158319870759289, |
| "grad_norm": 0.10206209868192673, |
| "learning_rate": 1.7889515219842167e-05, |
| "loss": 0.0605, |
| "step": 669 |
| }, |
| { |
| "epoch": 2.161550888529887, |
| "grad_norm": 0.15015992522239685, |
| "learning_rate": 1.788500563697858e-05, |
| "loss": 0.0926, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.1647819063004845, |
| "grad_norm": 0.10588102042675018, |
| "learning_rate": 1.7880496054114995e-05, |
| "loss": 0.0623, |
| "step": 671 |
| }, |
| { |
| "epoch": 2.1680129240710824, |
| "grad_norm": 0.09418896585702896, |
| "learning_rate": 1.787598647125141e-05, |
| "loss": 0.0564, |
| "step": 672 |
| }, |
| { |
| "epoch": 2.1712439418416802, |
| "grad_norm": 0.08213125914335251, |
| "learning_rate": 1.7871476888387828e-05, |
| "loss": 0.0464, |
| "step": 673 |
| }, |
| { |
| "epoch": 2.1744749596122777, |
| "grad_norm": 0.09321248531341553, |
| "learning_rate": 1.7866967305524242e-05, |
| "loss": 0.0531, |
| "step": 674 |
| }, |
| { |
| "epoch": 2.1777059773828755, |
| "grad_norm": 0.10642002522945404, |
| "learning_rate": 1.7862457722660656e-05, |
| "loss": 0.0669, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.1777059773828755, |
| "eval_loss": 0.0747215747833252, |
| "eval_runtime": 188.1708, |
| "eval_samples_per_second": 1.047, |
| "eval_steps_per_second": 1.047, |
| "step": 675 |
| }, |
| { |
| "epoch": 2.1809369951534734, |
| "grad_norm": 0.12946507334709167, |
| "learning_rate": 1.785794813979707e-05, |
| "loss": 0.072, |
| "step": 676 |
| }, |
| { |
| "epoch": 2.1841680129240713, |
| "grad_norm": 0.10074260830879211, |
| "learning_rate": 1.7853438556933485e-05, |
| "loss": 0.0521, |
| "step": 677 |
| }, |
| { |
| "epoch": 2.1873990306946687, |
| "grad_norm": 0.12798738479614258, |
| "learning_rate": 1.78489289740699e-05, |
| "loss": 0.0758, |
| "step": 678 |
| }, |
| { |
| "epoch": 2.1906300484652665, |
| "grad_norm": 0.10193175077438354, |
| "learning_rate": 1.7844419391206317e-05, |
| "loss": 0.0632, |
| "step": 679 |
| }, |
| { |
| "epoch": 2.1938610662358644, |
| "grad_norm": 0.12570485472679138, |
| "learning_rate": 1.783990980834273e-05, |
| "loss": 0.0724, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.197092084006462, |
| "grad_norm": 0.10074017196893692, |
| "learning_rate": 1.7835400225479146e-05, |
| "loss": 0.0563, |
| "step": 681 |
| }, |
| { |
| "epoch": 2.2003231017770597, |
| "grad_norm": 0.08727949112653732, |
| "learning_rate": 1.783089064261556e-05, |
| "loss": 0.0524, |
| "step": 682 |
| }, |
| { |
| "epoch": 2.2035541195476576, |
| "grad_norm": 0.11030570417642593, |
| "learning_rate": 1.7826381059751975e-05, |
| "loss": 0.0668, |
| "step": 683 |
| }, |
| { |
| "epoch": 2.2067851373182554, |
| "grad_norm": 0.10606499761343002, |
| "learning_rate": 1.782187147688839e-05, |
| "loss": 0.0606, |
| "step": 684 |
| }, |
| { |
| "epoch": 2.210016155088853, |
| "grad_norm": 0.11735937744379044, |
| "learning_rate": 1.7817361894024804e-05, |
| "loss": 0.0648, |
| "step": 685 |
| }, |
| { |
| "epoch": 2.2132471728594507, |
| "grad_norm": 0.106626495718956, |
| "learning_rate": 1.7812852311161218e-05, |
| "loss": 0.0649, |
| "step": 686 |
| }, |
| { |
| "epoch": 2.2164781906300486, |
| "grad_norm": 0.12231657654047012, |
| "learning_rate": 1.7808342728297632e-05, |
| "loss": 0.0702, |
| "step": 687 |
| }, |
| { |
| "epoch": 2.219709208400646, |
| "grad_norm": 0.08800094574689865, |
| "learning_rate": 1.780383314543405e-05, |
| "loss": 0.0515, |
| "step": 688 |
| }, |
| { |
| "epoch": 2.222940226171244, |
| "grad_norm": 0.08806774020195007, |
| "learning_rate": 1.7799323562570465e-05, |
| "loss": 0.0493, |
| "step": 689 |
| }, |
| { |
| "epoch": 2.2261712439418417, |
| "grad_norm": 0.10804681479930878, |
| "learning_rate": 1.779481397970688e-05, |
| "loss": 0.0604, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.2294022617124396, |
| "grad_norm": 0.11405564099550247, |
| "learning_rate": 1.7790304396843293e-05, |
| "loss": 0.0597, |
| "step": 691 |
| }, |
| { |
| "epoch": 2.232633279483037, |
| "grad_norm": 0.11010053753852844, |
| "learning_rate": 1.7785794813979708e-05, |
| "loss": 0.0634, |
| "step": 692 |
| }, |
| { |
| "epoch": 2.235864297253635, |
| "grad_norm": 0.10657312721014023, |
| "learning_rate": 1.7781285231116122e-05, |
| "loss": 0.0539, |
| "step": 693 |
| }, |
| { |
| "epoch": 2.2390953150242328, |
| "grad_norm": 0.08584710210561752, |
| "learning_rate": 1.7776775648252536e-05, |
| "loss": 0.0571, |
| "step": 694 |
| }, |
| { |
| "epoch": 2.24232633279483, |
| "grad_norm": 0.10155533254146576, |
| "learning_rate": 1.777226606538895e-05, |
| "loss": 0.0597, |
| "step": 695 |
| }, |
| { |
| "epoch": 2.245557350565428, |
| "grad_norm": 0.11395770311355591, |
| "learning_rate": 1.7767756482525365e-05, |
| "loss": 0.0675, |
| "step": 696 |
| }, |
| { |
| "epoch": 2.248788368336026, |
| "grad_norm": 0.11109079420566559, |
| "learning_rate": 1.7763246899661783e-05, |
| "loss": 0.062, |
| "step": 697 |
| }, |
| { |
| "epoch": 2.2520193861066238, |
| "grad_norm": 0.13479241728782654, |
| "learning_rate": 1.7758737316798197e-05, |
| "loss": 0.0782, |
| "step": 698 |
| }, |
| { |
| "epoch": 2.255250403877221, |
| "grad_norm": 0.12003345042467117, |
| "learning_rate": 1.7754227733934612e-05, |
| "loss": 0.0683, |
| "step": 699 |
| }, |
| { |
| "epoch": 2.258481421647819, |
| "grad_norm": 0.13395312428474426, |
| "learning_rate": 1.7749718151071026e-05, |
| "loss": 0.0764, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.261712439418417, |
| "grad_norm": 0.10561169683933258, |
| "learning_rate": 1.774520856820744e-05, |
| "loss": 0.0552, |
| "step": 701 |
| }, |
| { |
| "epoch": 2.2649434571890144, |
| "grad_norm": 0.1412249207496643, |
| "learning_rate": 1.7740698985343855e-05, |
| "loss": 0.0812, |
| "step": 702 |
| }, |
| { |
| "epoch": 2.268174474959612, |
| "grad_norm": 0.11307451128959656, |
| "learning_rate": 1.7736189402480273e-05, |
| "loss": 0.0671, |
| "step": 703 |
| }, |
| { |
| "epoch": 2.27140549273021, |
| "grad_norm": 0.10989584773778915, |
| "learning_rate": 1.7731679819616687e-05, |
| "loss": 0.059, |
| "step": 704 |
| }, |
| { |
| "epoch": 2.274636510500808, |
| "grad_norm": 0.0964912474155426, |
| "learning_rate": 1.77271702367531e-05, |
| "loss": 0.0515, |
| "step": 705 |
| }, |
| { |
| "epoch": 2.2778675282714054, |
| "grad_norm": 0.09640849381685257, |
| "learning_rate": 1.7722660653889516e-05, |
| "loss": 0.0548, |
| "step": 706 |
| }, |
| { |
| "epoch": 2.2810985460420032, |
| "grad_norm": 0.08393755555152893, |
| "learning_rate": 1.771815107102593e-05, |
| "loss": 0.0477, |
| "step": 707 |
| }, |
| { |
| "epoch": 2.284329563812601, |
| "grad_norm": 0.08865144103765488, |
| "learning_rate": 1.7713641488162348e-05, |
| "loss": 0.0527, |
| "step": 708 |
| }, |
| { |
| "epoch": 2.2875605815831985, |
| "grad_norm": 0.10840681195259094, |
| "learning_rate": 1.7709131905298762e-05, |
| "loss": 0.061, |
| "step": 709 |
| }, |
| { |
| "epoch": 2.2907915993537964, |
| "grad_norm": 0.1336364448070526, |
| "learning_rate": 1.7704622322435177e-05, |
| "loss": 0.0655, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.2940226171243943, |
| "grad_norm": 0.10070191323757172, |
| "learning_rate": 1.770011273957159e-05, |
| "loss": 0.0599, |
| "step": 711 |
| }, |
| { |
| "epoch": 2.297253634894992, |
| "grad_norm": 0.12378398329019547, |
| "learning_rate": 1.7695603156708006e-05, |
| "loss": 0.0712, |
| "step": 712 |
| }, |
| { |
| "epoch": 2.3004846526655895, |
| "grad_norm": 0.08809908479452133, |
| "learning_rate": 1.769109357384442e-05, |
| "loss": 0.0419, |
| "step": 713 |
| }, |
| { |
| "epoch": 2.3037156704361874, |
| "grad_norm": 0.10536797344684601, |
| "learning_rate": 1.7686583990980838e-05, |
| "loss": 0.062, |
| "step": 714 |
| }, |
| { |
| "epoch": 2.3069466882067853, |
| "grad_norm": 0.09373629838228226, |
| "learning_rate": 1.7682074408117252e-05, |
| "loss": 0.0455, |
| "step": 715 |
| }, |
| { |
| "epoch": 2.3101777059773827, |
| "grad_norm": 0.08422086387872696, |
| "learning_rate": 1.7677564825253667e-05, |
| "loss": 0.0481, |
| "step": 716 |
| }, |
| { |
| "epoch": 2.3134087237479806, |
| "grad_norm": 0.12226711213588715, |
| "learning_rate": 1.767305524239008e-05, |
| "loss": 0.0713, |
| "step": 717 |
| }, |
| { |
| "epoch": 2.3166397415185784, |
| "grad_norm": 0.11593876034021378, |
| "learning_rate": 1.7668545659526495e-05, |
| "loss": 0.065, |
| "step": 718 |
| }, |
| { |
| "epoch": 2.3198707592891763, |
| "grad_norm": 0.10055369138717651, |
| "learning_rate": 1.766403607666291e-05, |
| "loss": 0.0515, |
| "step": 719 |
| }, |
| { |
| "epoch": 2.3231017770597737, |
| "grad_norm": 0.1200050637125969, |
| "learning_rate": 1.7659526493799328e-05, |
| "loss": 0.0664, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.3263327948303716, |
| "grad_norm": 0.10001233220100403, |
| "learning_rate": 1.7655016910935742e-05, |
| "loss": 0.0578, |
| "step": 721 |
| }, |
| { |
| "epoch": 2.3295638126009695, |
| "grad_norm": 0.08621415495872498, |
| "learning_rate": 1.7650507328072156e-05, |
| "loss": 0.0426, |
| "step": 722 |
| }, |
| { |
| "epoch": 2.332794830371567, |
| "grad_norm": 0.08662088960409164, |
| "learning_rate": 1.764599774520857e-05, |
| "loss": 0.048, |
| "step": 723 |
| }, |
| { |
| "epoch": 2.3360258481421647, |
| "grad_norm": 0.09761569648981094, |
| "learning_rate": 1.7641488162344985e-05, |
| "loss": 0.0585, |
| "step": 724 |
| }, |
| { |
| "epoch": 2.3392568659127626, |
| "grad_norm": 0.1272287666797638, |
| "learning_rate": 1.76369785794814e-05, |
| "loss": 0.0722, |
| "step": 725 |
| }, |
| { |
| "epoch": 2.3424878836833605, |
| "grad_norm": 0.0827430784702301, |
| "learning_rate": 1.7632468996617814e-05, |
| "loss": 0.0517, |
| "step": 726 |
| }, |
| { |
| "epoch": 2.345718901453958, |
| "grad_norm": 0.08261015266180038, |
| "learning_rate": 1.7627959413754228e-05, |
| "loss": 0.0464, |
| "step": 727 |
| }, |
| { |
| "epoch": 2.3489499192245558, |
| "grad_norm": 0.10019004344940186, |
| "learning_rate": 1.7623449830890643e-05, |
| "loss": 0.0539, |
| "step": 728 |
| }, |
| { |
| "epoch": 2.3521809369951536, |
| "grad_norm": 0.11860109120607376, |
| "learning_rate": 1.761894024802706e-05, |
| "loss": 0.0604, |
| "step": 729 |
| }, |
| { |
| "epoch": 2.355411954765751, |
| "grad_norm": 0.13498210906982422, |
| "learning_rate": 1.7614430665163475e-05, |
| "loss": 0.0497, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.358642972536349, |
| "grad_norm": 0.09867555648088455, |
| "learning_rate": 1.760992108229989e-05, |
| "loss": 0.0572, |
| "step": 731 |
| }, |
| { |
| "epoch": 2.361873990306947, |
| "grad_norm": 0.10520780086517334, |
| "learning_rate": 1.7605411499436304e-05, |
| "loss": 0.0613, |
| "step": 732 |
| }, |
| { |
| "epoch": 2.3651050080775446, |
| "grad_norm": 0.1396726369857788, |
| "learning_rate": 1.7600901916572718e-05, |
| "loss": 0.0808, |
| "step": 733 |
| }, |
| { |
| "epoch": 2.368336025848142, |
| "grad_norm": 0.09852424263954163, |
| "learning_rate": 1.7596392333709132e-05, |
| "loss": 0.0602, |
| "step": 734 |
| }, |
| { |
| "epoch": 2.37156704361874, |
| "grad_norm": 0.08897744864225388, |
| "learning_rate": 1.7591882750845547e-05, |
| "loss": 0.0463, |
| "step": 735 |
| }, |
| { |
| "epoch": 2.374798061389338, |
| "grad_norm": 0.12664619088172913, |
| "learning_rate": 1.758737316798196e-05, |
| "loss": 0.0731, |
| "step": 736 |
| }, |
| { |
| "epoch": 2.378029079159935, |
| "grad_norm": 0.0975411906838417, |
| "learning_rate": 1.7582863585118376e-05, |
| "loss": 0.0541, |
| "step": 737 |
| }, |
| { |
| "epoch": 2.381260096930533, |
| "grad_norm": 0.10056427121162415, |
| "learning_rate": 1.7578354002254793e-05, |
| "loss": 0.0564, |
| "step": 738 |
| }, |
| { |
| "epoch": 2.384491114701131, |
| "grad_norm": 0.0751773938536644, |
| "learning_rate": 1.7573844419391208e-05, |
| "loss": 0.043, |
| "step": 739 |
| }, |
| { |
| "epoch": 2.387722132471729, |
| "grad_norm": 0.12571515142917633, |
| "learning_rate": 1.7569334836527622e-05, |
| "loss": 0.0715, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.3909531502423262, |
| "grad_norm": 0.09152042865753174, |
| "learning_rate": 1.7564825253664037e-05, |
| "loss": 0.0513, |
| "step": 741 |
| }, |
| { |
| "epoch": 2.394184168012924, |
| "grad_norm": 0.16221173107624054, |
| "learning_rate": 1.756031567080045e-05, |
| "loss": 0.1003, |
| "step": 742 |
| }, |
| { |
| "epoch": 2.397415185783522, |
| "grad_norm": 0.09910274296998978, |
| "learning_rate": 1.7555806087936865e-05, |
| "loss": 0.0543, |
| "step": 743 |
| }, |
| { |
| "epoch": 2.4006462035541194, |
| "grad_norm": 0.10756971687078476, |
| "learning_rate": 1.7551296505073283e-05, |
| "loss": 0.0573, |
| "step": 744 |
| }, |
| { |
| "epoch": 2.4038772213247173, |
| "grad_norm": 0.08702822029590607, |
| "learning_rate": 1.7546786922209697e-05, |
| "loss": 0.047, |
| "step": 745 |
| }, |
| { |
| "epoch": 2.407108239095315, |
| "grad_norm": 0.14440016448497772, |
| "learning_rate": 1.7542277339346112e-05, |
| "loss": 0.0685, |
| "step": 746 |
| }, |
| { |
| "epoch": 2.410339256865913, |
| "grad_norm": 0.09141986817121506, |
| "learning_rate": 1.7537767756482526e-05, |
| "loss": 0.0521, |
| "step": 747 |
| }, |
| { |
| "epoch": 2.4135702746365104, |
| "grad_norm": 0.12515199184417725, |
| "learning_rate": 1.753325817361894e-05, |
| "loss": 0.0605, |
| "step": 748 |
| }, |
| { |
| "epoch": 2.4168012924071083, |
| "grad_norm": 0.12448819726705551, |
| "learning_rate": 1.7528748590755355e-05, |
| "loss": 0.0723, |
| "step": 749 |
| }, |
| { |
| "epoch": 2.420032310177706, |
| "grad_norm": 0.13118943572044373, |
| "learning_rate": 1.7524239007891773e-05, |
| "loss": 0.0654, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.420032310177706, |
| "eval_loss": 0.07308099418878555, |
| "eval_runtime": 188.3306, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.4232633279483036, |
| "grad_norm": 0.1215345561504364, |
| "learning_rate": 1.7519729425028187e-05, |
| "loss": 0.0644, |
| "step": 751 |
| }, |
| { |
| "epoch": 2.4264943457189014, |
| "grad_norm": 0.12855304777622223, |
| "learning_rate": 1.75152198421646e-05, |
| "loss": 0.0666, |
| "step": 752 |
| }, |
| { |
| "epoch": 2.4297253634894993, |
| "grad_norm": 0.11538267880678177, |
| "learning_rate": 1.7510710259301016e-05, |
| "loss": 0.0545, |
| "step": 753 |
| }, |
| { |
| "epoch": 2.432956381260097, |
| "grad_norm": 0.10273373872041702, |
| "learning_rate": 1.750620067643743e-05, |
| "loss": 0.0594, |
| "step": 754 |
| }, |
| { |
| "epoch": 2.4361873990306946, |
| "grad_norm": 0.10953179746866226, |
| "learning_rate": 1.7501691093573845e-05, |
| "loss": 0.0587, |
| "step": 755 |
| }, |
| { |
| "epoch": 2.4394184168012925, |
| "grad_norm": 0.09215240180492401, |
| "learning_rate": 1.7497181510710263e-05, |
| "loss": 0.0501, |
| "step": 756 |
| }, |
| { |
| "epoch": 2.4426494345718903, |
| "grad_norm": 0.11669941246509552, |
| "learning_rate": 1.7492671927846677e-05, |
| "loss": 0.0585, |
| "step": 757 |
| }, |
| { |
| "epoch": 2.4458804523424877, |
| "grad_norm": 0.11698901653289795, |
| "learning_rate": 1.748816234498309e-05, |
| "loss": 0.0551, |
| "step": 758 |
| }, |
| { |
| "epoch": 2.4491114701130856, |
| "grad_norm": 0.1258348822593689, |
| "learning_rate": 1.7483652762119506e-05, |
| "loss": 0.0629, |
| "step": 759 |
| }, |
| { |
| "epoch": 2.4523424878836835, |
| "grad_norm": 0.12607377767562866, |
| "learning_rate": 1.747914317925592e-05, |
| "loss": 0.0728, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.4555735056542813, |
| "grad_norm": 0.0982760339975357, |
| "learning_rate": 1.7474633596392334e-05, |
| "loss": 0.051, |
| "step": 761 |
| }, |
| { |
| "epoch": 2.4588045234248788, |
| "grad_norm": 0.15601739287376404, |
| "learning_rate": 1.7470124013528752e-05, |
| "loss": 0.076, |
| "step": 762 |
| }, |
| { |
| "epoch": 2.4620355411954766, |
| "grad_norm": 0.13090789318084717, |
| "learning_rate": 1.7465614430665167e-05, |
| "loss": 0.0739, |
| "step": 763 |
| }, |
| { |
| "epoch": 2.4652665589660745, |
| "grad_norm": 0.10627159476280212, |
| "learning_rate": 1.746110484780158e-05, |
| "loss": 0.0509, |
| "step": 764 |
| }, |
| { |
| "epoch": 2.468497576736672, |
| "grad_norm": 0.07108946144580841, |
| "learning_rate": 1.7456595264937995e-05, |
| "loss": 0.039, |
| "step": 765 |
| }, |
| { |
| "epoch": 2.47172859450727, |
| "grad_norm": 0.14733023941516876, |
| "learning_rate": 1.745208568207441e-05, |
| "loss": 0.0788, |
| "step": 766 |
| }, |
| { |
| "epoch": 2.4749596122778676, |
| "grad_norm": 0.10821715742349625, |
| "learning_rate": 1.7447576099210824e-05, |
| "loss": 0.0524, |
| "step": 767 |
| }, |
| { |
| "epoch": 2.4781906300484655, |
| "grad_norm": 0.11846361309289932, |
| "learning_rate": 1.744306651634724e-05, |
| "loss": 0.0579, |
| "step": 768 |
| }, |
| { |
| "epoch": 2.481421647819063, |
| "grad_norm": 0.10738200694322586, |
| "learning_rate": 1.7438556933483653e-05, |
| "loss": 0.0589, |
| "step": 769 |
| }, |
| { |
| "epoch": 2.484652665589661, |
| "grad_norm": 0.1159975603222847, |
| "learning_rate": 1.743404735062007e-05, |
| "loss": 0.0565, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.4878836833602587, |
| "grad_norm": 0.10056610405445099, |
| "learning_rate": 1.7429537767756485e-05, |
| "loss": 0.0538, |
| "step": 771 |
| }, |
| { |
| "epoch": 2.491114701130856, |
| "grad_norm": 0.10329104959964752, |
| "learning_rate": 1.74250281848929e-05, |
| "loss": 0.0566, |
| "step": 772 |
| }, |
| { |
| "epoch": 2.494345718901454, |
| "grad_norm": 0.1422542929649353, |
| "learning_rate": 1.7420518602029314e-05, |
| "loss": 0.0747, |
| "step": 773 |
| }, |
| { |
| "epoch": 2.497576736672052, |
| "grad_norm": 0.12898680567741394, |
| "learning_rate": 1.741600901916573e-05, |
| "loss": 0.0735, |
| "step": 774 |
| }, |
| { |
| "epoch": 2.5008077544426497, |
| "grad_norm": 0.13066206872463226, |
| "learning_rate": 1.7411499436302143e-05, |
| "loss": 0.062, |
| "step": 775 |
| }, |
| { |
| "epoch": 2.504038772213247, |
| "grad_norm": 0.12379497289657593, |
| "learning_rate": 1.7406989853438557e-05, |
| "loss": 0.0631, |
| "step": 776 |
| }, |
| { |
| "epoch": 2.507269789983845, |
| "grad_norm": 0.1296347826719284, |
| "learning_rate": 1.740248027057497e-05, |
| "loss": 0.0634, |
| "step": 777 |
| }, |
| { |
| "epoch": 2.5105008077544424, |
| "grad_norm": 0.10818596184253693, |
| "learning_rate": 1.7397970687711386e-05, |
| "loss": 0.0606, |
| "step": 778 |
| }, |
| { |
| "epoch": 2.5137318255250403, |
| "grad_norm": 0.12639783322811127, |
| "learning_rate": 1.73934611048478e-05, |
| "loss": 0.0459, |
| "step": 779 |
| }, |
| { |
| "epoch": 2.516962843295638, |
| "grad_norm": 0.1167321428656578, |
| "learning_rate": 1.7388951521984218e-05, |
| "loss": 0.0667, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.520193861066236, |
| "grad_norm": 0.10153870284557343, |
| "learning_rate": 1.7384441939120632e-05, |
| "loss": 0.0589, |
| "step": 781 |
| }, |
| { |
| "epoch": 2.523424878836834, |
| "grad_norm": 0.09986142814159393, |
| "learning_rate": 1.7379932356257047e-05, |
| "loss": 0.045, |
| "step": 782 |
| }, |
| { |
| "epoch": 2.5266558966074313, |
| "grad_norm": 0.10662157833576202, |
| "learning_rate": 1.737542277339346e-05, |
| "loss": 0.0586, |
| "step": 783 |
| }, |
| { |
| "epoch": 2.529886914378029, |
| "grad_norm": 0.11709077656269073, |
| "learning_rate": 1.7370913190529876e-05, |
| "loss": 0.0613, |
| "step": 784 |
| }, |
| { |
| "epoch": 2.5331179321486266, |
| "grad_norm": 0.13120310008525848, |
| "learning_rate": 1.736640360766629e-05, |
| "loss": 0.0664, |
| "step": 785 |
| }, |
| { |
| "epoch": 2.5363489499192244, |
| "grad_norm": 0.13849826157093048, |
| "learning_rate": 1.7361894024802708e-05, |
| "loss": 0.0673, |
| "step": 786 |
| }, |
| { |
| "epoch": 2.5395799676898223, |
| "grad_norm": 0.08833606541156769, |
| "learning_rate": 1.7357384441939122e-05, |
| "loss": 0.0459, |
| "step": 787 |
| }, |
| { |
| "epoch": 2.54281098546042, |
| "grad_norm": 0.09421700984239578, |
| "learning_rate": 1.7352874859075537e-05, |
| "loss": 0.0481, |
| "step": 788 |
| }, |
| { |
| "epoch": 2.546042003231018, |
| "grad_norm": 0.1201411634683609, |
| "learning_rate": 1.734836527621195e-05, |
| "loss": 0.0608, |
| "step": 789 |
| }, |
| { |
| "epoch": 2.5492730210016155, |
| "grad_norm": 0.09896653145551682, |
| "learning_rate": 1.7343855693348365e-05, |
| "loss": 0.0465, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.5525040387722133, |
| "grad_norm": 0.12088964134454727, |
| "learning_rate": 1.7339346110484783e-05, |
| "loss": 0.0614, |
| "step": 791 |
| }, |
| { |
| "epoch": 2.5557350565428107, |
| "grad_norm": 0.11183801293373108, |
| "learning_rate": 1.7334836527621198e-05, |
| "loss": 0.0545, |
| "step": 792 |
| }, |
| { |
| "epoch": 2.5589660743134086, |
| "grad_norm": 0.11126703768968582, |
| "learning_rate": 1.7330326944757612e-05, |
| "loss": 0.0509, |
| "step": 793 |
| }, |
| { |
| "epoch": 2.5621970920840065, |
| "grad_norm": 0.1374976634979248, |
| "learning_rate": 1.7325817361894026e-05, |
| "loss": 0.0664, |
| "step": 794 |
| }, |
| { |
| "epoch": 2.5654281098546043, |
| "grad_norm": 0.16783633828163147, |
| "learning_rate": 1.732130777903044e-05, |
| "loss": 0.0868, |
| "step": 795 |
| }, |
| { |
| "epoch": 2.568659127625202, |
| "grad_norm": 0.11534145474433899, |
| "learning_rate": 1.7316798196166855e-05, |
| "loss": 0.053, |
| "step": 796 |
| }, |
| { |
| "epoch": 2.5718901453957996, |
| "grad_norm": 0.13769778609275818, |
| "learning_rate": 1.7312288613303273e-05, |
| "loss": 0.0756, |
| "step": 797 |
| }, |
| { |
| "epoch": 2.5751211631663975, |
| "grad_norm": 0.09577422589063644, |
| "learning_rate": 1.7307779030439687e-05, |
| "loss": 0.0441, |
| "step": 798 |
| }, |
| { |
| "epoch": 2.578352180936995, |
| "grad_norm": 0.11375096440315247, |
| "learning_rate": 1.73032694475761e-05, |
| "loss": 0.0524, |
| "step": 799 |
| }, |
| { |
| "epoch": 2.581583198707593, |
| "grad_norm": 0.11465324461460114, |
| "learning_rate": 1.7298759864712516e-05, |
| "loss": 0.0526, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.5848142164781907, |
| "grad_norm": 0.11597500741481781, |
| "learning_rate": 1.729425028184893e-05, |
| "loss": 0.0591, |
| "step": 801 |
| }, |
| { |
| "epoch": 2.5880452342487885, |
| "grad_norm": 0.09817709028720856, |
| "learning_rate": 1.7289740698985345e-05, |
| "loss": 0.0503, |
| "step": 802 |
| }, |
| { |
| "epoch": 2.5912762520193864, |
| "grad_norm": 0.10352802276611328, |
| "learning_rate": 1.7285231116121763e-05, |
| "loss": 0.0461, |
| "step": 803 |
| }, |
| { |
| "epoch": 2.594507269789984, |
| "grad_norm": 0.12035888433456421, |
| "learning_rate": 1.7280721533258177e-05, |
| "loss": 0.0576, |
| "step": 804 |
| }, |
| { |
| "epoch": 2.5977382875605817, |
| "grad_norm": 0.12561960518360138, |
| "learning_rate": 1.727621195039459e-05, |
| "loss": 0.0599, |
| "step": 805 |
| }, |
| { |
| "epoch": 2.600969305331179, |
| "grad_norm": 0.11348681896924973, |
| "learning_rate": 1.7271702367531006e-05, |
| "loss": 0.0503, |
| "step": 806 |
| }, |
| { |
| "epoch": 2.604200323101777, |
| "grad_norm": 0.09772437810897827, |
| "learning_rate": 1.726719278466742e-05, |
| "loss": 0.0471, |
| "step": 807 |
| }, |
| { |
| "epoch": 2.607431340872375, |
| "grad_norm": 0.10316241532564163, |
| "learning_rate": 1.7262683201803835e-05, |
| "loss": 0.0514, |
| "step": 808 |
| }, |
| { |
| "epoch": 2.6106623586429727, |
| "grad_norm": 0.11204390227794647, |
| "learning_rate": 1.725817361894025e-05, |
| "loss": 0.0578, |
| "step": 809 |
| }, |
| { |
| "epoch": 2.61389337641357, |
| "grad_norm": 0.10899617522954941, |
| "learning_rate": 1.7253664036076663e-05, |
| "loss": 0.0495, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.617124394184168, |
| "grad_norm": 0.1386026293039322, |
| "learning_rate": 1.724915445321308e-05, |
| "loss": 0.0542, |
| "step": 811 |
| }, |
| { |
| "epoch": 2.620355411954766, |
| "grad_norm": 0.13927608728408813, |
| "learning_rate": 1.7244644870349495e-05, |
| "loss": 0.0628, |
| "step": 812 |
| }, |
| { |
| "epoch": 2.6235864297253633, |
| "grad_norm": 0.099461629986763, |
| "learning_rate": 1.724013528748591e-05, |
| "loss": 0.0501, |
| "step": 813 |
| }, |
| { |
| "epoch": 2.626817447495961, |
| "grad_norm": 0.09142296761274338, |
| "learning_rate": 1.7235625704622324e-05, |
| "loss": 0.0475, |
| "step": 814 |
| }, |
| { |
| "epoch": 2.630048465266559, |
| "grad_norm": 0.12531687319278717, |
| "learning_rate": 1.723111612175874e-05, |
| "loss": 0.059, |
| "step": 815 |
| }, |
| { |
| "epoch": 2.633279483037157, |
| "grad_norm": 0.1252615749835968, |
| "learning_rate": 1.7226606538895153e-05, |
| "loss": 0.0589, |
| "step": 816 |
| }, |
| { |
| "epoch": 2.6365105008077543, |
| "grad_norm": 0.12725740671157837, |
| "learning_rate": 1.7222096956031567e-05, |
| "loss": 0.0522, |
| "step": 817 |
| }, |
| { |
| "epoch": 2.639741518578352, |
| "grad_norm": 0.12746059894561768, |
| "learning_rate": 1.7217587373167982e-05, |
| "loss": 0.0661, |
| "step": 818 |
| }, |
| { |
| "epoch": 2.64297253634895, |
| "grad_norm": 0.2133682370185852, |
| "learning_rate": 1.7213077790304396e-05, |
| "loss": 0.0639, |
| "step": 819 |
| }, |
| { |
| "epoch": 2.6462035541195474, |
| "grad_norm": 0.11452341079711914, |
| "learning_rate": 1.720856820744081e-05, |
| "loss": 0.0512, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.6494345718901453, |
| "grad_norm": 0.12344635277986526, |
| "learning_rate": 1.720405862457723e-05, |
| "loss": 0.0503, |
| "step": 821 |
| }, |
| { |
| "epoch": 2.652665589660743, |
| "grad_norm": 0.12654437124729156, |
| "learning_rate": 1.7199549041713643e-05, |
| "loss": 0.0543, |
| "step": 822 |
| }, |
| { |
| "epoch": 2.655896607431341, |
| "grad_norm": 0.12805619835853577, |
| "learning_rate": 1.7195039458850057e-05, |
| "loss": 0.0646, |
| "step": 823 |
| }, |
| { |
| "epoch": 2.6591276252019385, |
| "grad_norm": 0.11218256503343582, |
| "learning_rate": 1.719052987598647e-05, |
| "loss": 0.0575, |
| "step": 824 |
| }, |
| { |
| "epoch": 2.6623586429725363, |
| "grad_norm": 0.12950399518013, |
| "learning_rate": 1.7186020293122886e-05, |
| "loss": 0.0673, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.6623586429725363, |
| "eval_loss": 0.07266557961702347, |
| "eval_runtime": 187.9396, |
| "eval_samples_per_second": 1.048, |
| "eval_steps_per_second": 1.048, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.665589660743134, |
| "grad_norm": 0.11642561107873917, |
| "learning_rate": 1.71815107102593e-05, |
| "loss": 0.0664, |
| "step": 826 |
| }, |
| { |
| "epoch": 2.6688206785137316, |
| "grad_norm": 0.09707733243703842, |
| "learning_rate": 1.7177001127395718e-05, |
| "loss": 0.0503, |
| "step": 827 |
| }, |
| { |
| "epoch": 2.6720516962843295, |
| "grad_norm": 0.07535319775342941, |
| "learning_rate": 1.7172491544532133e-05, |
| "loss": 0.038, |
| "step": 828 |
| }, |
| { |
| "epoch": 2.6752827140549273, |
| "grad_norm": 0.12683290243148804, |
| "learning_rate": 1.7167981961668547e-05, |
| "loss": 0.0614, |
| "step": 829 |
| }, |
| { |
| "epoch": 2.678513731825525, |
| "grad_norm": 0.1531742513179779, |
| "learning_rate": 1.716347237880496e-05, |
| "loss": 0.069, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.6817447495961226, |
| "grad_norm": 0.13030219078063965, |
| "learning_rate": 1.7158962795941376e-05, |
| "loss": 0.0593, |
| "step": 831 |
| }, |
| { |
| "epoch": 2.6849757673667205, |
| "grad_norm": 0.13288383185863495, |
| "learning_rate": 1.715445321307779e-05, |
| "loss": 0.0607, |
| "step": 832 |
| }, |
| { |
| "epoch": 2.6882067851373184, |
| "grad_norm": 0.1245107427239418, |
| "learning_rate": 1.7149943630214208e-05, |
| "loss": 0.059, |
| "step": 833 |
| }, |
| { |
| "epoch": 2.691437802907916, |
| "grad_norm": 0.10677826404571533, |
| "learning_rate": 1.7145434047350622e-05, |
| "loss": 0.0532, |
| "step": 834 |
| }, |
| { |
| "epoch": 2.6946688206785137, |
| "grad_norm": 0.11118808388710022, |
| "learning_rate": 1.7140924464487037e-05, |
| "loss": 0.055, |
| "step": 835 |
| }, |
| { |
| "epoch": 2.6978998384491115, |
| "grad_norm": 0.11494432389736176, |
| "learning_rate": 1.713641488162345e-05, |
| "loss": 0.0556, |
| "step": 836 |
| }, |
| { |
| "epoch": 2.7011308562197094, |
| "grad_norm": 0.14139457046985626, |
| "learning_rate": 1.7131905298759865e-05, |
| "loss": 0.0687, |
| "step": 837 |
| }, |
| { |
| "epoch": 2.704361873990307, |
| "grad_norm": 0.12973995506763458, |
| "learning_rate": 1.712739571589628e-05, |
| "loss": 0.0639, |
| "step": 838 |
| }, |
| { |
| "epoch": 2.7075928917609047, |
| "grad_norm": 0.12217195332050323, |
| "learning_rate": 1.7122886133032698e-05, |
| "loss": 0.0636, |
| "step": 839 |
| }, |
| { |
| "epoch": 2.7108239095315025, |
| "grad_norm": 0.08900095522403717, |
| "learning_rate": 1.7118376550169112e-05, |
| "loss": 0.0491, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.7140549273021, |
| "grad_norm": 0.12005368620157242, |
| "learning_rate": 1.7113866967305526e-05, |
| "loss": 0.0605, |
| "step": 841 |
| }, |
| { |
| "epoch": 2.717285945072698, |
| "grad_norm": 0.1201101690530777, |
| "learning_rate": 1.710935738444194e-05, |
| "loss": 0.0597, |
| "step": 842 |
| }, |
| { |
| "epoch": 2.7205169628432957, |
| "grad_norm": 0.12422793358564377, |
| "learning_rate": 1.7104847801578355e-05, |
| "loss": 0.0604, |
| "step": 843 |
| }, |
| { |
| "epoch": 2.7237479806138936, |
| "grad_norm": 0.11504203826189041, |
| "learning_rate": 1.710033821871477e-05, |
| "loss": 0.0567, |
| "step": 844 |
| }, |
| { |
| "epoch": 2.726978998384491, |
| "grad_norm": 0.13158410787582397, |
| "learning_rate": 1.7095828635851187e-05, |
| "loss": 0.0751, |
| "step": 845 |
| }, |
| { |
| "epoch": 2.730210016155089, |
| "grad_norm": 0.13026569783687592, |
| "learning_rate": 1.70913190529876e-05, |
| "loss": 0.057, |
| "step": 846 |
| }, |
| { |
| "epoch": 2.7334410339256867, |
| "grad_norm": 0.5227922201156616, |
| "learning_rate": 1.7086809470124016e-05, |
| "loss": 0.0567, |
| "step": 847 |
| }, |
| { |
| "epoch": 2.736672051696284, |
| "grad_norm": 0.08213207870721817, |
| "learning_rate": 1.708229988726043e-05, |
| "loss": 0.0385, |
| "step": 848 |
| }, |
| { |
| "epoch": 2.739903069466882, |
| "grad_norm": 0.14717501401901245, |
| "learning_rate": 1.7077790304396845e-05, |
| "loss": 0.0746, |
| "step": 849 |
| }, |
| { |
| "epoch": 2.74313408723748, |
| "grad_norm": 0.11484480649232864, |
| "learning_rate": 1.707328072153326e-05, |
| "loss": 0.0513, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.7463651050080777, |
| "grad_norm": 0.13454794883728027, |
| "learning_rate": 1.7068771138669674e-05, |
| "loss": 0.0668, |
| "step": 851 |
| }, |
| { |
| "epoch": 2.749596122778675, |
| "grad_norm": 0.16599783301353455, |
| "learning_rate": 1.706426155580609e-05, |
| "loss": 0.0659, |
| "step": 852 |
| }, |
| { |
| "epoch": 2.752827140549273, |
| "grad_norm": 0.11365890502929688, |
| "learning_rate": 1.7059751972942506e-05, |
| "loss": 0.0591, |
| "step": 853 |
| }, |
| { |
| "epoch": 2.756058158319871, |
| "grad_norm": 0.12964101135730743, |
| "learning_rate": 1.705524239007892e-05, |
| "loss": 0.0598, |
| "step": 854 |
| }, |
| { |
| "epoch": 2.7592891760904683, |
| "grad_norm": 0.10415180772542953, |
| "learning_rate": 1.7050732807215335e-05, |
| "loss": 0.0499, |
| "step": 855 |
| }, |
| { |
| "epoch": 2.762520193861066, |
| "grad_norm": 0.1433461755514145, |
| "learning_rate": 1.704622322435175e-05, |
| "loss": 0.0668, |
| "step": 856 |
| }, |
| { |
| "epoch": 2.765751211631664, |
| "grad_norm": 0.12921610474586487, |
| "learning_rate": 1.7041713641488163e-05, |
| "loss": 0.0525, |
| "step": 857 |
| }, |
| { |
| "epoch": 2.768982229402262, |
| "grad_norm": 0.11878325045108795, |
| "learning_rate": 1.7037204058624578e-05, |
| "loss": 0.06, |
| "step": 858 |
| }, |
| { |
| "epoch": 2.7722132471728593, |
| "grad_norm": 0.09403812140226364, |
| "learning_rate": 1.7032694475760992e-05, |
| "loss": 0.0472, |
| "step": 859 |
| }, |
| { |
| "epoch": 2.775444264943457, |
| "grad_norm": 0.09613174945116043, |
| "learning_rate": 1.7028184892897407e-05, |
| "loss": 0.0508, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.778675282714055, |
| "grad_norm": 0.11891157180070877, |
| "learning_rate": 1.702367531003382e-05, |
| "loss": 0.0589, |
| "step": 861 |
| }, |
| { |
| "epoch": 2.7819063004846525, |
| "grad_norm": 0.1563875377178192, |
| "learning_rate": 1.701916572717024e-05, |
| "loss": 0.0794, |
| "step": 862 |
| }, |
| { |
| "epoch": 2.7851373182552503, |
| "grad_norm": 0.13382849097251892, |
| "learning_rate": 1.7014656144306653e-05, |
| "loss": 0.0683, |
| "step": 863 |
| }, |
| { |
| "epoch": 2.788368336025848, |
| "grad_norm": 0.15156099200248718, |
| "learning_rate": 1.7010146561443067e-05, |
| "loss": 0.0694, |
| "step": 864 |
| }, |
| { |
| "epoch": 2.791599353796446, |
| "grad_norm": 0.12621727585792542, |
| "learning_rate": 1.7005636978579482e-05, |
| "loss": 0.0567, |
| "step": 865 |
| }, |
| { |
| "epoch": 2.7948303715670435, |
| "grad_norm": 0.15388263761997223, |
| "learning_rate": 1.7001127395715896e-05, |
| "loss": 0.0738, |
| "step": 866 |
| }, |
| { |
| "epoch": 2.7980613893376414, |
| "grad_norm": 0.1349688321352005, |
| "learning_rate": 1.699661781285231e-05, |
| "loss": 0.0546, |
| "step": 867 |
| }, |
| { |
| "epoch": 2.8012924071082392, |
| "grad_norm": 0.11894084513187408, |
| "learning_rate": 1.6992108229988725e-05, |
| "loss": 0.0538, |
| "step": 868 |
| }, |
| { |
| "epoch": 2.8045234248788367, |
| "grad_norm": 0.21414369344711304, |
| "learning_rate": 1.6987598647125143e-05, |
| "loss": 0.0511, |
| "step": 869 |
| }, |
| { |
| "epoch": 2.8077544426494345, |
| "grad_norm": 0.1107967421412468, |
| "learning_rate": 1.6983089064261557e-05, |
| "loss": 0.0551, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.8109854604200324, |
| "grad_norm": 0.16710782051086426, |
| "learning_rate": 1.697857948139797e-05, |
| "loss": 0.0805, |
| "step": 871 |
| }, |
| { |
| "epoch": 2.8142164781906303, |
| "grad_norm": 0.12345987558364868, |
| "learning_rate": 1.6974069898534386e-05, |
| "loss": 0.0576, |
| "step": 872 |
| }, |
| { |
| "epoch": 2.8174474959612277, |
| "grad_norm": 0.11037538200616837, |
| "learning_rate": 1.69695603156708e-05, |
| "loss": 0.0486, |
| "step": 873 |
| }, |
| { |
| "epoch": 2.8206785137318255, |
| "grad_norm": 0.10175740718841553, |
| "learning_rate": 1.6965050732807218e-05, |
| "loss": 0.0529, |
| "step": 874 |
| }, |
| { |
| "epoch": 2.8239095315024234, |
| "grad_norm": 0.1126103326678276, |
| "learning_rate": 1.6960541149943633e-05, |
| "loss": 0.0546, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.827140549273021, |
| "grad_norm": 0.09911254793405533, |
| "learning_rate": 1.6956031567080047e-05, |
| "loss": 0.0455, |
| "step": 876 |
| }, |
| { |
| "epoch": 2.8303715670436187, |
| "grad_norm": 0.1377602368593216, |
| "learning_rate": 1.695152198421646e-05, |
| "loss": 0.0599, |
| "step": 877 |
| }, |
| { |
| "epoch": 2.8336025848142166, |
| "grad_norm": 0.11292906850576401, |
| "learning_rate": 1.6947012401352876e-05, |
| "loss": 0.053, |
| "step": 878 |
| }, |
| { |
| "epoch": 2.8368336025848144, |
| "grad_norm": 0.13102898001670837, |
| "learning_rate": 1.694250281848929e-05, |
| "loss": 0.0629, |
| "step": 879 |
| }, |
| { |
| "epoch": 2.840064620355412, |
| "grad_norm": 0.12573762238025665, |
| "learning_rate": 1.6937993235625708e-05, |
| "loss": 0.052, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.8432956381260097, |
| "grad_norm": 0.11562048643827438, |
| "learning_rate": 1.6933483652762122e-05, |
| "loss": 0.0594, |
| "step": 881 |
| }, |
| { |
| "epoch": 2.8465266558966076, |
| "grad_norm": 0.15344659984111786, |
| "learning_rate": 1.6928974069898537e-05, |
| "loss": 0.0636, |
| "step": 882 |
| }, |
| { |
| "epoch": 2.849757673667205, |
| "grad_norm": 0.11969706416130066, |
| "learning_rate": 1.692446448703495e-05, |
| "loss": 0.0549, |
| "step": 883 |
| }, |
| { |
| "epoch": 2.852988691437803, |
| "grad_norm": 0.0930706337094307, |
| "learning_rate": 1.6919954904171365e-05, |
| "loss": 0.0434, |
| "step": 884 |
| }, |
| { |
| "epoch": 2.8562197092084007, |
| "grad_norm": 0.1458914428949356, |
| "learning_rate": 1.691544532130778e-05, |
| "loss": 0.0707, |
| "step": 885 |
| }, |
| { |
| "epoch": 2.8594507269789986, |
| "grad_norm": 0.11928731948137283, |
| "learning_rate": 1.6910935738444198e-05, |
| "loss": 0.0584, |
| "step": 886 |
| }, |
| { |
| "epoch": 2.862681744749596, |
| "grad_norm": 0.1225530207157135, |
| "learning_rate": 1.6906426155580612e-05, |
| "loss": 0.0619, |
| "step": 887 |
| }, |
| { |
| "epoch": 2.865912762520194, |
| "grad_norm": 0.13734394311904907, |
| "learning_rate": 1.6901916572717026e-05, |
| "loss": 0.065, |
| "step": 888 |
| }, |
| { |
| "epoch": 2.8691437802907918, |
| "grad_norm": 0.13331110775470734, |
| "learning_rate": 1.689740698985344e-05, |
| "loss": 0.0652, |
| "step": 889 |
| }, |
| { |
| "epoch": 2.872374798061389, |
| "grad_norm": 0.1149471327662468, |
| "learning_rate": 1.6892897406989855e-05, |
| "loss": 0.0495, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.875605815831987, |
| "grad_norm": 0.14575156569480896, |
| "learning_rate": 1.688838782412627e-05, |
| "loss": 0.069, |
| "step": 891 |
| }, |
| { |
| "epoch": 2.878836833602585, |
| "grad_norm": 0.1205376535654068, |
| "learning_rate": 1.6883878241262684e-05, |
| "loss": 0.0595, |
| "step": 892 |
| }, |
| { |
| "epoch": 2.8820678513731828, |
| "grad_norm": 0.13029593229293823, |
| "learning_rate": 1.6879368658399102e-05, |
| "loss": 0.0652, |
| "step": 893 |
| }, |
| { |
| "epoch": 2.88529886914378, |
| "grad_norm": 0.1242680773139, |
| "learning_rate": 1.6874859075535516e-05, |
| "loss": 0.0628, |
| "step": 894 |
| }, |
| { |
| "epoch": 2.888529886914378, |
| "grad_norm": 0.1066925972700119, |
| "learning_rate": 1.687034949267193e-05, |
| "loss": 0.0571, |
| "step": 895 |
| }, |
| { |
| "epoch": 2.891760904684976, |
| "grad_norm": 0.09622512012720108, |
| "learning_rate": 1.6865839909808345e-05, |
| "loss": 0.0447, |
| "step": 896 |
| }, |
| { |
| "epoch": 2.8949919224555734, |
| "grad_norm": 0.14432470500469208, |
| "learning_rate": 1.686133032694476e-05, |
| "loss": 0.0658, |
| "step": 897 |
| }, |
| { |
| "epoch": 2.898222940226171, |
| "grad_norm": 0.1262982338666916, |
| "learning_rate": 1.6856820744081174e-05, |
| "loss": 0.057, |
| "step": 898 |
| }, |
| { |
| "epoch": 2.901453957996769, |
| "grad_norm": 0.12278001755475998, |
| "learning_rate": 1.6852311161217588e-05, |
| "loss": 0.0533, |
| "step": 899 |
| }, |
| { |
| "epoch": 2.904684975767367, |
| "grad_norm": 0.13526810705661774, |
| "learning_rate": 1.6847801578354002e-05, |
| "loss": 0.0536, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.904684975767367, |
| "eval_loss": 0.07217078655958176, |
| "eval_runtime": 188.4791, |
| "eval_samples_per_second": 1.045, |
| "eval_steps_per_second": 1.045, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.9079159935379644, |
| "grad_norm": 0.07958260923624039, |
| "learning_rate": 1.6843291995490417e-05, |
| "loss": 0.0395, |
| "step": 901 |
| }, |
| { |
| "epoch": 2.9111470113085622, |
| "grad_norm": 0.10193248093128204, |
| "learning_rate": 1.683878241262683e-05, |
| "loss": 0.0495, |
| "step": 902 |
| }, |
| { |
| "epoch": 2.9143780290791597, |
| "grad_norm": 0.09985180199146271, |
| "learning_rate": 1.683427282976325e-05, |
| "loss": 0.0496, |
| "step": 903 |
| }, |
| { |
| "epoch": 2.9176090468497575, |
| "grad_norm": 0.15160292387008667, |
| "learning_rate": 1.6829763246899663e-05, |
| "loss": 0.0668, |
| "step": 904 |
| }, |
| { |
| "epoch": 2.9208400646203554, |
| "grad_norm": 0.13049964606761932, |
| "learning_rate": 1.6825253664036078e-05, |
| "loss": 0.067, |
| "step": 905 |
| }, |
| { |
| "epoch": 2.9240710823909533, |
| "grad_norm": 0.13118034601211548, |
| "learning_rate": 1.6820744081172492e-05, |
| "loss": 0.06, |
| "step": 906 |
| }, |
| { |
| "epoch": 2.927302100161551, |
| "grad_norm": 0.11038261651992798, |
| "learning_rate": 1.6816234498308907e-05, |
| "loss": 0.0542, |
| "step": 907 |
| }, |
| { |
| "epoch": 2.9305331179321485, |
| "grad_norm": 0.11064022779464722, |
| "learning_rate": 1.681172491544532e-05, |
| "loss": 0.0535, |
| "step": 908 |
| }, |
| { |
| "epoch": 2.9337641357027464, |
| "grad_norm": 0.10448973625898361, |
| "learning_rate": 1.6807215332581735e-05, |
| "loss": 0.0444, |
| "step": 909 |
| }, |
| { |
| "epoch": 2.936995153473344, |
| "grad_norm": 0.09960347414016724, |
| "learning_rate": 1.6802705749718153e-05, |
| "loss": 0.0477, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.9402261712439417, |
| "grad_norm": 0.10175690799951553, |
| "learning_rate": 1.6798196166854568e-05, |
| "loss": 0.0521, |
| "step": 911 |
| }, |
| { |
| "epoch": 2.9434571890145396, |
| "grad_norm": 0.11306945234537125, |
| "learning_rate": 1.6793686583990982e-05, |
| "loss": 0.0483, |
| "step": 912 |
| }, |
| { |
| "epoch": 2.9466882067851374, |
| "grad_norm": 0.12629052996635437, |
| "learning_rate": 1.6789177001127396e-05, |
| "loss": 0.0594, |
| "step": 913 |
| }, |
| { |
| "epoch": 2.9499192245557353, |
| "grad_norm": 0.16772274672985077, |
| "learning_rate": 1.678466741826381e-05, |
| "loss": 0.0808, |
| "step": 914 |
| }, |
| { |
| "epoch": 2.9531502423263327, |
| "grad_norm": 0.14857324957847595, |
| "learning_rate": 1.6780157835400225e-05, |
| "loss": 0.0626, |
| "step": 915 |
| }, |
| { |
| "epoch": 2.9563812600969306, |
| "grad_norm": 0.12077292054891586, |
| "learning_rate": 1.6775648252536643e-05, |
| "loss": 0.0547, |
| "step": 916 |
| }, |
| { |
| "epoch": 2.959612277867528, |
| "grad_norm": 0.08560369163751602, |
| "learning_rate": 1.6771138669673057e-05, |
| "loss": 0.0402, |
| "step": 917 |
| }, |
| { |
| "epoch": 2.962843295638126, |
| "grad_norm": 0.1328180432319641, |
| "learning_rate": 1.676662908680947e-05, |
| "loss": 0.0603, |
| "step": 918 |
| }, |
| { |
| "epoch": 2.9660743134087237, |
| "grad_norm": 0.13140498101711273, |
| "learning_rate": 1.6762119503945886e-05, |
| "loss": 0.0541, |
| "step": 919 |
| }, |
| { |
| "epoch": 2.9693053311793216, |
| "grad_norm": 0.13546602427959442, |
| "learning_rate": 1.67576099210823e-05, |
| "loss": 0.0644, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.9725363489499195, |
| "grad_norm": 0.13099107146263123, |
| "learning_rate": 1.6753100338218715e-05, |
| "loss": 0.0544, |
| "step": 921 |
| }, |
| { |
| "epoch": 2.975767366720517, |
| "grad_norm": 0.12933450937271118, |
| "learning_rate": 1.6748590755355133e-05, |
| "loss": 0.0632, |
| "step": 922 |
| }, |
| { |
| "epoch": 2.9789983844911148, |
| "grad_norm": 0.12769202888011932, |
| "learning_rate": 1.6744081172491547e-05, |
| "loss": 0.0591, |
| "step": 923 |
| }, |
| { |
| "epoch": 2.982229402261712, |
| "grad_norm": 0.12964068353176117, |
| "learning_rate": 1.673957158962796e-05, |
| "loss": 0.0602, |
| "step": 924 |
| }, |
| { |
| "epoch": 2.98546042003231, |
| "grad_norm": 0.1714252084493637, |
| "learning_rate": 1.6735062006764376e-05, |
| "loss": 0.076, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.988691437802908, |
| "grad_norm": 0.15382935106754303, |
| "learning_rate": 1.673055242390079e-05, |
| "loss": 0.0624, |
| "step": 926 |
| }, |
| { |
| "epoch": 2.991922455573506, |
| "grad_norm": 0.15337888896465302, |
| "learning_rate": 1.6726042841037205e-05, |
| "loss": 0.0652, |
| "step": 927 |
| }, |
| { |
| "epoch": 2.9951534733441036, |
| "grad_norm": 0.1587558090686798, |
| "learning_rate": 1.6721533258173622e-05, |
| "loss": 0.0754, |
| "step": 928 |
| }, |
| { |
| "epoch": 2.998384491114701, |
| "grad_norm": 0.09836894273757935, |
| "learning_rate": 1.6717023675310037e-05, |
| "loss": 0.046, |
| "step": 929 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 0.20606525242328644, |
| "learning_rate": 1.671251409244645e-05, |
| "loss": 0.0692, |
| "step": 930 |
| }, |
| { |
| "epoch": 3.003231017770598, |
| "grad_norm": 0.12933456897735596, |
| "learning_rate": 1.6708004509582866e-05, |
| "loss": 0.0601, |
| "step": 931 |
| }, |
| { |
| "epoch": 3.0064620355411953, |
| "grad_norm": 0.15069305896759033, |
| "learning_rate": 1.670349492671928e-05, |
| "loss": 0.0691, |
| "step": 932 |
| }, |
| { |
| "epoch": 3.009693053311793, |
| "grad_norm": 0.12518665194511414, |
| "learning_rate": 1.6698985343855694e-05, |
| "loss": 0.0594, |
| "step": 933 |
| }, |
| { |
| "epoch": 3.012924071082391, |
| "grad_norm": 0.13509726524353027, |
| "learning_rate": 1.6694475760992112e-05, |
| "loss": 0.0553, |
| "step": 934 |
| }, |
| { |
| "epoch": 3.016155088852989, |
| "grad_norm": 0.18207424879074097, |
| "learning_rate": 1.6689966178128526e-05, |
| "loss": 0.0691, |
| "step": 935 |
| }, |
| { |
| "epoch": 3.0193861066235863, |
| "grad_norm": 0.10155311226844788, |
| "learning_rate": 1.668545659526494e-05, |
| "loss": 0.0442, |
| "step": 936 |
| }, |
| { |
| "epoch": 3.022617124394184, |
| "grad_norm": 0.08462440967559814, |
| "learning_rate": 1.6680947012401355e-05, |
| "loss": 0.0397, |
| "step": 937 |
| }, |
| { |
| "epoch": 3.025848142164782, |
| "grad_norm": 0.1074729785323143, |
| "learning_rate": 1.667643742953777e-05, |
| "loss": 0.0476, |
| "step": 938 |
| }, |
| { |
| "epoch": 3.0290791599353795, |
| "grad_norm": 0.10128747671842575, |
| "learning_rate": 1.6671927846674184e-05, |
| "loss": 0.044, |
| "step": 939 |
| }, |
| { |
| "epoch": 3.0323101777059773, |
| "grad_norm": 0.10703253746032715, |
| "learning_rate": 1.66674182638106e-05, |
| "loss": 0.0434, |
| "step": 940 |
| }, |
| { |
| "epoch": 3.035541195476575, |
| "grad_norm": 0.16827581822872162, |
| "learning_rate": 1.6662908680947013e-05, |
| "loss": 0.058, |
| "step": 941 |
| }, |
| { |
| "epoch": 3.038772213247173, |
| "grad_norm": 0.12423544377088547, |
| "learning_rate": 1.6658399098083427e-05, |
| "loss": 0.0529, |
| "step": 942 |
| }, |
| { |
| "epoch": 3.0420032310177705, |
| "grad_norm": 0.11421461403369904, |
| "learning_rate": 1.665388951521984e-05, |
| "loss": 0.0462, |
| "step": 943 |
| }, |
| { |
| "epoch": 3.0452342487883683, |
| "grad_norm": 0.1504746377468109, |
| "learning_rate": 1.664937993235626e-05, |
| "loss": 0.0607, |
| "step": 944 |
| }, |
| { |
| "epoch": 3.048465266558966, |
| "grad_norm": 0.1171237900853157, |
| "learning_rate": 1.6644870349492674e-05, |
| "loss": 0.0488, |
| "step": 945 |
| }, |
| { |
| "epoch": 3.0516962843295636, |
| "grad_norm": 0.12751275300979614, |
| "learning_rate": 1.6640360766629088e-05, |
| "loss": 0.0566, |
| "step": 946 |
| }, |
| { |
| "epoch": 3.0549273021001615, |
| "grad_norm": 0.10137461870908737, |
| "learning_rate": 1.6635851183765503e-05, |
| "loss": 0.042, |
| "step": 947 |
| }, |
| { |
| "epoch": 3.0581583198707594, |
| "grad_norm": 0.10805993527173996, |
| "learning_rate": 1.6631341600901917e-05, |
| "loss": 0.0436, |
| "step": 948 |
| }, |
| { |
| "epoch": 3.0613893376413572, |
| "grad_norm": 0.15429779887199402, |
| "learning_rate": 1.662683201803833e-05, |
| "loss": 0.0611, |
| "step": 949 |
| }, |
| { |
| "epoch": 3.0646203554119547, |
| "grad_norm": 0.15192106366157532, |
| "learning_rate": 1.6622322435174746e-05, |
| "loss": 0.0558, |
| "step": 950 |
| }, |
| { |
| "epoch": 3.0678513731825525, |
| "grad_norm": 0.14291639626026154, |
| "learning_rate": 1.661781285231116e-05, |
| "loss": 0.0582, |
| "step": 951 |
| }, |
| { |
| "epoch": 3.0710823909531504, |
| "grad_norm": 0.11516083776950836, |
| "learning_rate": 1.6613303269447578e-05, |
| "loss": 0.0455, |
| "step": 952 |
| }, |
| { |
| "epoch": 3.074313408723748, |
| "grad_norm": 0.11716248095035553, |
| "learning_rate": 1.6608793686583992e-05, |
| "loss": 0.0419, |
| "step": 953 |
| }, |
| { |
| "epoch": 3.0775444264943457, |
| "grad_norm": 0.13777975738048553, |
| "learning_rate": 1.6604284103720407e-05, |
| "loss": 0.0587, |
| "step": 954 |
| }, |
| { |
| "epoch": 3.0807754442649435, |
| "grad_norm": 0.15481697022914886, |
| "learning_rate": 1.659977452085682e-05, |
| "loss": 0.058, |
| "step": 955 |
| }, |
| { |
| "epoch": 3.0840064620355414, |
| "grad_norm": 0.11290151625871658, |
| "learning_rate": 1.6595264937993235e-05, |
| "loss": 0.0428, |
| "step": 956 |
| }, |
| { |
| "epoch": 3.087237479806139, |
| "grad_norm": 0.11138515174388885, |
| "learning_rate": 1.6590755355129653e-05, |
| "loss": 0.0445, |
| "step": 957 |
| }, |
| { |
| "epoch": 3.0904684975767367, |
| "grad_norm": 0.13892598450183868, |
| "learning_rate": 1.6586245772266068e-05, |
| "loss": 0.0532, |
| "step": 958 |
| }, |
| { |
| "epoch": 3.0936995153473346, |
| "grad_norm": 0.14099125564098358, |
| "learning_rate": 1.6581736189402482e-05, |
| "loss": 0.0532, |
| "step": 959 |
| }, |
| { |
| "epoch": 3.096930533117932, |
| "grad_norm": 0.1620667278766632, |
| "learning_rate": 1.6577226606538896e-05, |
| "loss": 0.0716, |
| "step": 960 |
| }, |
| { |
| "epoch": 3.10016155088853, |
| "grad_norm": 0.1435079723596573, |
| "learning_rate": 1.657271702367531e-05, |
| "loss": 0.0587, |
| "step": 961 |
| }, |
| { |
| "epoch": 3.1033925686591277, |
| "grad_norm": 0.1412099003791809, |
| "learning_rate": 1.6568207440811725e-05, |
| "loss": 0.0599, |
| "step": 962 |
| }, |
| { |
| "epoch": 3.106623586429725, |
| "grad_norm": 0.16996391117572784, |
| "learning_rate": 1.6563697857948143e-05, |
| "loss": 0.0577, |
| "step": 963 |
| }, |
| { |
| "epoch": 3.109854604200323, |
| "grad_norm": 0.14544463157653809, |
| "learning_rate": 1.6559188275084557e-05, |
| "loss": 0.0595, |
| "step": 964 |
| }, |
| { |
| "epoch": 3.113085621970921, |
| "grad_norm": 0.12646666169166565, |
| "learning_rate": 1.6554678692220972e-05, |
| "loss": 0.0496, |
| "step": 965 |
| }, |
| { |
| "epoch": 3.1163166397415187, |
| "grad_norm": 0.16260091960430145, |
| "learning_rate": 1.6550169109357386e-05, |
| "loss": 0.0588, |
| "step": 966 |
| }, |
| { |
| "epoch": 3.119547657512116, |
| "grad_norm": 0.14531579613685608, |
| "learning_rate": 1.65456595264938e-05, |
| "loss": 0.0654, |
| "step": 967 |
| }, |
| { |
| "epoch": 3.122778675282714, |
| "grad_norm": 0.13838277757167816, |
| "learning_rate": 1.6541149943630215e-05, |
| "loss": 0.058, |
| "step": 968 |
| }, |
| { |
| "epoch": 3.126009693053312, |
| "grad_norm": 0.10179346054792404, |
| "learning_rate": 1.6536640360766633e-05, |
| "loss": 0.0394, |
| "step": 969 |
| }, |
| { |
| "epoch": 3.1292407108239093, |
| "grad_norm": 0.14759835600852966, |
| "learning_rate": 1.6532130777903047e-05, |
| "loss": 0.0616, |
| "step": 970 |
| }, |
| { |
| "epoch": 3.132471728594507, |
| "grad_norm": 0.12317320704460144, |
| "learning_rate": 1.652762119503946e-05, |
| "loss": 0.0457, |
| "step": 971 |
| }, |
| { |
| "epoch": 3.135702746365105, |
| "grad_norm": 0.14770029485225677, |
| "learning_rate": 1.6523111612175876e-05, |
| "loss": 0.0606, |
| "step": 972 |
| }, |
| { |
| "epoch": 3.138933764135703, |
| "grad_norm": 0.14644749462604523, |
| "learning_rate": 1.651860202931229e-05, |
| "loss": 0.0576, |
| "step": 973 |
| }, |
| { |
| "epoch": 3.1421647819063003, |
| "grad_norm": 0.15745016932487488, |
| "learning_rate": 1.6514092446448705e-05, |
| "loss": 0.0658, |
| "step": 974 |
| }, |
| { |
| "epoch": 3.145395799676898, |
| "grad_norm": 0.15281431376934052, |
| "learning_rate": 1.6509582863585122e-05, |
| "loss": 0.0579, |
| "step": 975 |
| }, |
| { |
| "epoch": 3.145395799676898, |
| "eval_loss": 0.07240297645330429, |
| "eval_runtime": 188.3995, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 975 |
| }, |
| { |
| "epoch": 3.148626817447496, |
| "grad_norm": 0.186857670545578, |
| "learning_rate": 1.6505073280721537e-05, |
| "loss": 0.0712, |
| "step": 976 |
| }, |
| { |
| "epoch": 3.1518578352180935, |
| "grad_norm": 0.11668923497200012, |
| "learning_rate": 1.650056369785795e-05, |
| "loss": 0.0432, |
| "step": 977 |
| }, |
| { |
| "epoch": 3.1550888529886914, |
| "grad_norm": 0.1078757792711258, |
| "learning_rate": 1.6496054114994366e-05, |
| "loss": 0.0438, |
| "step": 978 |
| }, |
| { |
| "epoch": 3.158319870759289, |
| "grad_norm": 0.10889827460050583, |
| "learning_rate": 1.649154453213078e-05, |
| "loss": 0.0404, |
| "step": 979 |
| }, |
| { |
| "epoch": 3.161550888529887, |
| "grad_norm": 0.11770477145910263, |
| "learning_rate": 1.6487034949267194e-05, |
| "loss": 0.0379, |
| "step": 980 |
| }, |
| { |
| "epoch": 3.1647819063004845, |
| "grad_norm": 0.1730085015296936, |
| "learning_rate": 1.648252536640361e-05, |
| "loss": 0.0527, |
| "step": 981 |
| }, |
| { |
| "epoch": 3.1680129240710824, |
| "grad_norm": 0.17005400359630585, |
| "learning_rate": 1.6478015783540023e-05, |
| "loss": 0.0608, |
| "step": 982 |
| }, |
| { |
| "epoch": 3.1712439418416802, |
| "grad_norm": 0.15987516939640045, |
| "learning_rate": 1.6473506200676438e-05, |
| "loss": 0.0657, |
| "step": 983 |
| }, |
| { |
| "epoch": 3.1744749596122777, |
| "grad_norm": 0.12467172741889954, |
| "learning_rate": 1.6468996617812852e-05, |
| "loss": 0.0532, |
| "step": 984 |
| }, |
| { |
| "epoch": 3.1777059773828755, |
| "grad_norm": 0.13436008989810944, |
| "learning_rate": 1.646448703494927e-05, |
| "loss": 0.0464, |
| "step": 985 |
| }, |
| { |
| "epoch": 3.1809369951534734, |
| "grad_norm": 0.15260566771030426, |
| "learning_rate": 1.6459977452085684e-05, |
| "loss": 0.0585, |
| "step": 986 |
| }, |
| { |
| "epoch": 3.1841680129240713, |
| "grad_norm": 0.1228412613272667, |
| "learning_rate": 1.64554678692221e-05, |
| "loss": 0.042, |
| "step": 987 |
| }, |
| { |
| "epoch": 3.1873990306946687, |
| "grad_norm": 0.1621600091457367, |
| "learning_rate": 1.6450958286358513e-05, |
| "loss": 0.0705, |
| "step": 988 |
| }, |
| { |
| "epoch": 3.1906300484652665, |
| "grad_norm": 0.14798057079315186, |
| "learning_rate": 1.6446448703494927e-05, |
| "loss": 0.0536, |
| "step": 989 |
| }, |
| { |
| "epoch": 3.1938610662358644, |
| "grad_norm": 0.17002591490745544, |
| "learning_rate": 1.644193912063134e-05, |
| "loss": 0.0624, |
| "step": 990 |
| }, |
| { |
| "epoch": 3.197092084006462, |
| "grad_norm": 0.11882289499044418, |
| "learning_rate": 1.6437429537767756e-05, |
| "loss": 0.0386, |
| "step": 991 |
| }, |
| { |
| "epoch": 3.2003231017770597, |
| "grad_norm": 0.177546426653862, |
| "learning_rate": 1.643291995490417e-05, |
| "loss": 0.0684, |
| "step": 992 |
| }, |
| { |
| "epoch": 3.2035541195476576, |
| "grad_norm": 0.1515907645225525, |
| "learning_rate": 1.6428410372040588e-05, |
| "loss": 0.0588, |
| "step": 993 |
| }, |
| { |
| "epoch": 3.2067851373182554, |
| "grad_norm": 0.13172346353530884, |
| "learning_rate": 1.6423900789177003e-05, |
| "loss": 0.0502, |
| "step": 994 |
| }, |
| { |
| "epoch": 3.210016155088853, |
| "grad_norm": 0.1430046707391739, |
| "learning_rate": 1.6419391206313417e-05, |
| "loss": 0.0538, |
| "step": 995 |
| }, |
| { |
| "epoch": 3.2132471728594507, |
| "grad_norm": 0.10192380100488663, |
| "learning_rate": 1.641488162344983e-05, |
| "loss": 0.0414, |
| "step": 996 |
| }, |
| { |
| "epoch": 3.2164781906300486, |
| "grad_norm": 0.12296223640441895, |
| "learning_rate": 1.6410372040586246e-05, |
| "loss": 0.0466, |
| "step": 997 |
| }, |
| { |
| "epoch": 3.219709208400646, |
| "grad_norm": 0.1641893982887268, |
| "learning_rate": 1.640586245772266e-05, |
| "loss": 0.0606, |
| "step": 998 |
| }, |
| { |
| "epoch": 3.222940226171244, |
| "grad_norm": 0.109470434486866, |
| "learning_rate": 1.6401352874859078e-05, |
| "loss": 0.0424, |
| "step": 999 |
| }, |
| { |
| "epoch": 3.2261712439418417, |
| "grad_norm": 0.10068835318088531, |
| "learning_rate": 1.6396843291995492e-05, |
| "loss": 0.0364, |
| "step": 1000 |
| }, |
| { |
| "epoch": 3.2294022617124396, |
| "grad_norm": 0.09672326594591141, |
| "learning_rate": 1.6392333709131907e-05, |
| "loss": 0.0403, |
| "step": 1001 |
| }, |
| { |
| "epoch": 3.232633279483037, |
| "grad_norm": 0.21638123691082, |
| "learning_rate": 1.638782412626832e-05, |
| "loss": 0.0785, |
| "step": 1002 |
| }, |
| { |
| "epoch": 3.235864297253635, |
| "grad_norm": 0.15015633404254913, |
| "learning_rate": 1.6383314543404735e-05, |
| "loss": 0.0513, |
| "step": 1003 |
| }, |
| { |
| "epoch": 3.2390953150242328, |
| "grad_norm": 0.11658553779125214, |
| "learning_rate": 1.637880496054115e-05, |
| "loss": 0.0444, |
| "step": 1004 |
| }, |
| { |
| "epoch": 3.24232633279483, |
| "grad_norm": 0.142287939786911, |
| "learning_rate": 1.6374295377677568e-05, |
| "loss": 0.0576, |
| "step": 1005 |
| }, |
| { |
| "epoch": 3.245557350565428, |
| "grad_norm": 0.11885146051645279, |
| "learning_rate": 1.6369785794813982e-05, |
| "loss": 0.0442, |
| "step": 1006 |
| }, |
| { |
| "epoch": 3.248788368336026, |
| "grad_norm": 0.1423695832490921, |
| "learning_rate": 1.6365276211950396e-05, |
| "loss": 0.0539, |
| "step": 1007 |
| }, |
| { |
| "epoch": 3.2520193861066238, |
| "grad_norm": 0.14337588846683502, |
| "learning_rate": 1.636076662908681e-05, |
| "loss": 0.056, |
| "step": 1008 |
| }, |
| { |
| "epoch": 3.255250403877221, |
| "grad_norm": 0.15875791013240814, |
| "learning_rate": 1.6356257046223225e-05, |
| "loss": 0.0672, |
| "step": 1009 |
| }, |
| { |
| "epoch": 3.258481421647819, |
| "grad_norm": 0.1158808171749115, |
| "learning_rate": 1.635174746335964e-05, |
| "loss": 0.0464, |
| "step": 1010 |
| }, |
| { |
| "epoch": 3.261712439418417, |
| "grad_norm": 0.12882259488105774, |
| "learning_rate": 1.6347237880496057e-05, |
| "loss": 0.0497, |
| "step": 1011 |
| }, |
| { |
| "epoch": 3.2649434571890144, |
| "grad_norm": 0.15846951305866241, |
| "learning_rate": 1.6342728297632472e-05, |
| "loss": 0.0656, |
| "step": 1012 |
| }, |
| { |
| "epoch": 3.268174474959612, |
| "grad_norm": 0.16542094945907593, |
| "learning_rate": 1.6338218714768886e-05, |
| "loss": 0.06, |
| "step": 1013 |
| }, |
| { |
| "epoch": 3.27140549273021, |
| "grad_norm": 0.1318148821592331, |
| "learning_rate": 1.63337091319053e-05, |
| "loss": 0.0475, |
| "step": 1014 |
| }, |
| { |
| "epoch": 3.274636510500808, |
| "grad_norm": 0.110074482858181, |
| "learning_rate": 1.6329199549041715e-05, |
| "loss": 0.0411, |
| "step": 1015 |
| }, |
| { |
| "epoch": 3.2778675282714054, |
| "grad_norm": 0.14870020747184753, |
| "learning_rate": 1.632468996617813e-05, |
| "loss": 0.0491, |
| "step": 1016 |
| }, |
| { |
| "epoch": 3.2810985460420032, |
| "grad_norm": 0.16925957798957825, |
| "learning_rate": 1.6320180383314547e-05, |
| "loss": 0.0648, |
| "step": 1017 |
| }, |
| { |
| "epoch": 3.284329563812601, |
| "grad_norm": 0.11573273688554764, |
| "learning_rate": 1.631567080045096e-05, |
| "loss": 0.0457, |
| "step": 1018 |
| }, |
| { |
| "epoch": 3.2875605815831985, |
| "grad_norm": 0.16456320881843567, |
| "learning_rate": 1.6311161217587376e-05, |
| "loss": 0.0608, |
| "step": 1019 |
| }, |
| { |
| "epoch": 3.2907915993537964, |
| "grad_norm": 0.14626148343086243, |
| "learning_rate": 1.630665163472379e-05, |
| "loss": 0.0543, |
| "step": 1020 |
| }, |
| { |
| "epoch": 3.2940226171243943, |
| "grad_norm": 0.15516629815101624, |
| "learning_rate": 1.6302142051860205e-05, |
| "loss": 0.0628, |
| "step": 1021 |
| }, |
| { |
| "epoch": 3.297253634894992, |
| "grad_norm": 0.18173731863498688, |
| "learning_rate": 1.629763246899662e-05, |
| "loss": 0.0677, |
| "step": 1022 |
| }, |
| { |
| "epoch": 3.3004846526655895, |
| "grad_norm": 0.146285742521286, |
| "learning_rate": 1.6293122886133033e-05, |
| "loss": 0.052, |
| "step": 1023 |
| }, |
| { |
| "epoch": 3.3037156704361874, |
| "grad_norm": 0.19001832604408264, |
| "learning_rate": 1.6288613303269448e-05, |
| "loss": 0.0693, |
| "step": 1024 |
| }, |
| { |
| "epoch": 3.3069466882067853, |
| "grad_norm": 0.09959913045167923, |
| "learning_rate": 1.6284103720405862e-05, |
| "loss": 0.0339, |
| "step": 1025 |
| }, |
| { |
| "epoch": 3.3101777059773827, |
| "grad_norm": 0.16015265882015228, |
| "learning_rate": 1.627959413754228e-05, |
| "loss": 0.0615, |
| "step": 1026 |
| }, |
| { |
| "epoch": 3.3134087237479806, |
| "grad_norm": 0.13552674651145935, |
| "learning_rate": 1.6275084554678694e-05, |
| "loss": 0.0475, |
| "step": 1027 |
| }, |
| { |
| "epoch": 3.3166397415185784, |
| "grad_norm": 0.16153255105018616, |
| "learning_rate": 1.627057497181511e-05, |
| "loss": 0.0601, |
| "step": 1028 |
| }, |
| { |
| "epoch": 3.3198707592891763, |
| "grad_norm": 0.14683452248573303, |
| "learning_rate": 1.6266065388951523e-05, |
| "loss": 0.0526, |
| "step": 1029 |
| }, |
| { |
| "epoch": 3.3231017770597737, |
| "grad_norm": 0.11259462684392929, |
| "learning_rate": 1.6261555806087938e-05, |
| "loss": 0.0429, |
| "step": 1030 |
| }, |
| { |
| "epoch": 3.3263327948303716, |
| "grad_norm": 0.1688949465751648, |
| "learning_rate": 1.6257046223224352e-05, |
| "loss": 0.0624, |
| "step": 1031 |
| }, |
| { |
| "epoch": 3.3295638126009695, |
| "grad_norm": 0.10584679245948792, |
| "learning_rate": 1.6252536640360766e-05, |
| "loss": 0.0376, |
| "step": 1032 |
| }, |
| { |
| "epoch": 3.332794830371567, |
| "grad_norm": 0.16506820917129517, |
| "learning_rate": 1.624802705749718e-05, |
| "loss": 0.0585, |
| "step": 1033 |
| }, |
| { |
| "epoch": 3.3360258481421647, |
| "grad_norm": 0.11264611780643463, |
| "learning_rate": 1.6243517474633595e-05, |
| "loss": 0.0432, |
| "step": 1034 |
| }, |
| { |
| "epoch": 3.3392568659127626, |
| "grad_norm": 0.17402660846710205, |
| "learning_rate": 1.6239007891770013e-05, |
| "loss": 0.0566, |
| "step": 1035 |
| }, |
| { |
| "epoch": 3.3424878836833605, |
| "grad_norm": 0.15407973527908325, |
| "learning_rate": 1.6234498308906427e-05, |
| "loss": 0.0519, |
| "step": 1036 |
| }, |
| { |
| "epoch": 3.345718901453958, |
| "grad_norm": 0.1423128843307495, |
| "learning_rate": 1.622998872604284e-05, |
| "loss": 0.0511, |
| "step": 1037 |
| }, |
| { |
| "epoch": 3.3489499192245558, |
| "grad_norm": 0.11291830986738205, |
| "learning_rate": 1.6225479143179256e-05, |
| "loss": 0.0413, |
| "step": 1038 |
| }, |
| { |
| "epoch": 3.3521809369951536, |
| "grad_norm": 0.17067734897136688, |
| "learning_rate": 1.622096956031567e-05, |
| "loss": 0.0587, |
| "step": 1039 |
| }, |
| { |
| "epoch": 3.355411954765751, |
| "grad_norm": 0.17072725296020508, |
| "learning_rate": 1.6216459977452088e-05, |
| "loss": 0.0606, |
| "step": 1040 |
| }, |
| { |
| "epoch": 3.358642972536349, |
| "grad_norm": 0.13390378654003143, |
| "learning_rate": 1.6211950394588503e-05, |
| "loss": 0.0411, |
| "step": 1041 |
| }, |
| { |
| "epoch": 3.361873990306947, |
| "grad_norm": 0.10424879193305969, |
| "learning_rate": 1.6207440811724917e-05, |
| "loss": 0.0356, |
| "step": 1042 |
| }, |
| { |
| "epoch": 3.3651050080775446, |
| "grad_norm": 0.26167649030685425, |
| "learning_rate": 1.620293122886133e-05, |
| "loss": 0.0569, |
| "step": 1043 |
| }, |
| { |
| "epoch": 3.368336025848142, |
| "grad_norm": 0.1589985489845276, |
| "learning_rate": 1.6198421645997746e-05, |
| "loss": 0.0539, |
| "step": 1044 |
| }, |
| { |
| "epoch": 3.37156704361874, |
| "grad_norm": 0.14946100115776062, |
| "learning_rate": 1.619391206313416e-05, |
| "loss": 0.0531, |
| "step": 1045 |
| }, |
| { |
| "epoch": 3.374798061389338, |
| "grad_norm": 0.1740565448999405, |
| "learning_rate": 1.6189402480270578e-05, |
| "loss": 0.0626, |
| "step": 1046 |
| }, |
| { |
| "epoch": 3.378029079159935, |
| "grad_norm": 0.16527672111988068, |
| "learning_rate": 1.6184892897406992e-05, |
| "loss": 0.061, |
| "step": 1047 |
| }, |
| { |
| "epoch": 3.381260096930533, |
| "grad_norm": 0.11640432476997375, |
| "learning_rate": 1.6180383314543407e-05, |
| "loss": 0.0433, |
| "step": 1048 |
| }, |
| { |
| "epoch": 3.384491114701131, |
| "grad_norm": 0.11857109516859055, |
| "learning_rate": 1.617587373167982e-05, |
| "loss": 0.0374, |
| "step": 1049 |
| }, |
| { |
| "epoch": 3.387722132471729, |
| "grad_norm": 0.15555702149868011, |
| "learning_rate": 1.6171364148816236e-05, |
| "loss": 0.0505, |
| "step": 1050 |
| }, |
| { |
| "epoch": 3.387722132471729, |
| "eval_loss": 0.07340462505817413, |
| "eval_runtime": 188.3173, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 1050 |
| }, |
| { |
| "epoch": 3.3909531502423262, |
| "grad_norm": 0.1614767611026764, |
| "learning_rate": 1.616685456595265e-05, |
| "loss": 0.0616, |
| "step": 1051 |
| }, |
| { |
| "epoch": 3.394184168012924, |
| "grad_norm": 0.13632138073444366, |
| "learning_rate": 1.6162344983089068e-05, |
| "loss": 0.0495, |
| "step": 1052 |
| }, |
| { |
| "epoch": 3.397415185783522, |
| "grad_norm": 0.1372632533311844, |
| "learning_rate": 1.6157835400225482e-05, |
| "loss": 0.0433, |
| "step": 1053 |
| }, |
| { |
| "epoch": 3.4006462035541194, |
| "grad_norm": 0.17442293465137482, |
| "learning_rate": 1.6153325817361896e-05, |
| "loss": 0.0578, |
| "step": 1054 |
| }, |
| { |
| "epoch": 3.4038772213247173, |
| "grad_norm": 0.18292829394340515, |
| "learning_rate": 1.614881623449831e-05, |
| "loss": 0.0648, |
| "step": 1055 |
| }, |
| { |
| "epoch": 3.407108239095315, |
| "grad_norm": 0.15629050135612488, |
| "learning_rate": 1.6144306651634725e-05, |
| "loss": 0.0504, |
| "step": 1056 |
| }, |
| { |
| "epoch": 3.410339256865913, |
| "grad_norm": 0.1325637549161911, |
| "learning_rate": 1.613979706877114e-05, |
| "loss": 0.039, |
| "step": 1057 |
| }, |
| { |
| "epoch": 3.4135702746365104, |
| "grad_norm": 0.17488038539886475, |
| "learning_rate": 1.6135287485907557e-05, |
| "loss": 0.0557, |
| "step": 1058 |
| }, |
| { |
| "epoch": 3.4168012924071083, |
| "grad_norm": 0.1492920219898224, |
| "learning_rate": 1.6130777903043972e-05, |
| "loss": 0.0537, |
| "step": 1059 |
| }, |
| { |
| "epoch": 3.420032310177706, |
| "grad_norm": 0.14212766289710999, |
| "learning_rate": 1.6126268320180386e-05, |
| "loss": 0.0422, |
| "step": 1060 |
| }, |
| { |
| "epoch": 3.4232633279483036, |
| "grad_norm": 0.1526009440422058, |
| "learning_rate": 1.61217587373168e-05, |
| "loss": 0.0572, |
| "step": 1061 |
| }, |
| { |
| "epoch": 3.4264943457189014, |
| "grad_norm": 0.11353015899658203, |
| "learning_rate": 1.6117249154453215e-05, |
| "loss": 0.0383, |
| "step": 1062 |
| }, |
| { |
| "epoch": 3.4297253634894993, |
| "grad_norm": 0.18253804743289948, |
| "learning_rate": 1.611273957158963e-05, |
| "loss": 0.0657, |
| "step": 1063 |
| }, |
| { |
| "epoch": 3.432956381260097, |
| "grad_norm": 0.1422210931777954, |
| "learning_rate": 1.6108229988726044e-05, |
| "loss": 0.0528, |
| "step": 1064 |
| }, |
| { |
| "epoch": 3.4361873990306946, |
| "grad_norm": 0.15058425068855286, |
| "learning_rate": 1.6103720405862458e-05, |
| "loss": 0.052, |
| "step": 1065 |
| }, |
| { |
| "epoch": 3.4394184168012925, |
| "grad_norm": 0.13702097535133362, |
| "learning_rate": 1.6099210822998873e-05, |
| "loss": 0.0478, |
| "step": 1066 |
| }, |
| { |
| "epoch": 3.4426494345718903, |
| "grad_norm": 0.19859325885772705, |
| "learning_rate": 1.609470124013529e-05, |
| "loss": 0.0512, |
| "step": 1067 |
| }, |
| { |
| "epoch": 3.4458804523424877, |
| "grad_norm": 0.1942027509212494, |
| "learning_rate": 1.6090191657271705e-05, |
| "loss": 0.0711, |
| "step": 1068 |
| }, |
| { |
| "epoch": 3.4491114701130856, |
| "grad_norm": 0.1615460216999054, |
| "learning_rate": 1.608568207440812e-05, |
| "loss": 0.0532, |
| "step": 1069 |
| }, |
| { |
| "epoch": 3.4523424878836835, |
| "grad_norm": 0.15197354555130005, |
| "learning_rate": 1.6081172491544533e-05, |
| "loss": 0.0566, |
| "step": 1070 |
| }, |
| { |
| "epoch": 3.4555735056542813, |
| "grad_norm": 0.131342813372612, |
| "learning_rate": 1.6076662908680948e-05, |
| "loss": 0.0448, |
| "step": 1071 |
| }, |
| { |
| "epoch": 3.4588045234248788, |
| "grad_norm": 0.1431513875722885, |
| "learning_rate": 1.6072153325817362e-05, |
| "loss": 0.0502, |
| "step": 1072 |
| }, |
| { |
| "epoch": 3.4620355411954766, |
| "grad_norm": 0.1926133781671524, |
| "learning_rate": 1.6067643742953777e-05, |
| "loss": 0.0625, |
| "step": 1073 |
| }, |
| { |
| "epoch": 3.4652665589660745, |
| "grad_norm": 0.13775743544101715, |
| "learning_rate": 1.606313416009019e-05, |
| "loss": 0.0529, |
| "step": 1074 |
| }, |
| { |
| "epoch": 3.468497576736672, |
| "grad_norm": 0.1370486617088318, |
| "learning_rate": 1.6058624577226605e-05, |
| "loss": 0.0489, |
| "step": 1075 |
| }, |
| { |
| "epoch": 3.47172859450727, |
| "grad_norm": 0.18667002022266388, |
| "learning_rate": 1.6054114994363023e-05, |
| "loss": 0.0734, |
| "step": 1076 |
| }, |
| { |
| "epoch": 3.4749596122778676, |
| "grad_norm": 0.16826723515987396, |
| "learning_rate": 1.6049605411499438e-05, |
| "loss": 0.0568, |
| "step": 1077 |
| }, |
| { |
| "epoch": 3.4781906300484655, |
| "grad_norm": 0.1706121861934662, |
| "learning_rate": 1.6045095828635852e-05, |
| "loss": 0.064, |
| "step": 1078 |
| }, |
| { |
| "epoch": 3.481421647819063, |
| "grad_norm": 0.12642326951026917, |
| "learning_rate": 1.6040586245772266e-05, |
| "loss": 0.0441, |
| "step": 1079 |
| }, |
| { |
| "epoch": 3.484652665589661, |
| "grad_norm": 0.14685548841953278, |
| "learning_rate": 1.603607666290868e-05, |
| "loss": 0.0484, |
| "step": 1080 |
| }, |
| { |
| "epoch": 3.4878836833602587, |
| "grad_norm": 0.13969610631465912, |
| "learning_rate": 1.6031567080045095e-05, |
| "loss": 0.0467, |
| "step": 1081 |
| }, |
| { |
| "epoch": 3.491114701130856, |
| "grad_norm": 0.18751631677150726, |
| "learning_rate": 1.6027057497181513e-05, |
| "loss": 0.0653, |
| "step": 1082 |
| }, |
| { |
| "epoch": 3.494345718901454, |
| "grad_norm": 0.14187730848789215, |
| "learning_rate": 1.6022547914317927e-05, |
| "loss": 0.0485, |
| "step": 1083 |
| }, |
| { |
| "epoch": 3.497576736672052, |
| "grad_norm": 0.13812421262264252, |
| "learning_rate": 1.6018038331454342e-05, |
| "loss": 0.0494, |
| "step": 1084 |
| }, |
| { |
| "epoch": 3.5008077544426497, |
| "grad_norm": 0.13007357716560364, |
| "learning_rate": 1.6013528748590756e-05, |
| "loss": 0.0454, |
| "step": 1085 |
| }, |
| { |
| "epoch": 3.504038772213247, |
| "grad_norm": 0.16555847227573395, |
| "learning_rate": 1.600901916572717e-05, |
| "loss": 0.0474, |
| "step": 1086 |
| }, |
| { |
| "epoch": 3.507269789983845, |
| "grad_norm": 0.11033131182193756, |
| "learning_rate": 1.6004509582863585e-05, |
| "loss": 0.0367, |
| "step": 1087 |
| }, |
| { |
| "epoch": 3.5105008077544424, |
| "grad_norm": 0.14931395649909973, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 0.0489, |
| "step": 1088 |
| }, |
| { |
| "epoch": 3.5137318255250403, |
| "grad_norm": 0.16848234832286835, |
| "learning_rate": 1.5995490417136417e-05, |
| "loss": 0.0578, |
| "step": 1089 |
| }, |
| { |
| "epoch": 3.516962843295638, |
| "grad_norm": 0.15877306461334229, |
| "learning_rate": 1.599098083427283e-05, |
| "loss": 0.0524, |
| "step": 1090 |
| }, |
| { |
| "epoch": 3.520193861066236, |
| "grad_norm": 0.16530410945415497, |
| "learning_rate": 1.5986471251409246e-05, |
| "loss": 0.0619, |
| "step": 1091 |
| }, |
| { |
| "epoch": 3.523424878836834, |
| "grad_norm": 0.14331963658332825, |
| "learning_rate": 1.598196166854566e-05, |
| "loss": 0.0486, |
| "step": 1092 |
| }, |
| { |
| "epoch": 3.5266558966074313, |
| "grad_norm": 0.15027795732021332, |
| "learning_rate": 1.5977452085682075e-05, |
| "loss": 0.048, |
| "step": 1093 |
| }, |
| { |
| "epoch": 3.529886914378029, |
| "grad_norm": 0.15376311540603638, |
| "learning_rate": 1.5972942502818492e-05, |
| "loss": 0.0559, |
| "step": 1094 |
| }, |
| { |
| "epoch": 3.5331179321486266, |
| "grad_norm": 0.1315869837999344, |
| "learning_rate": 1.5968432919954907e-05, |
| "loss": 0.0451, |
| "step": 1095 |
| }, |
| { |
| "epoch": 3.5363489499192244, |
| "grad_norm": 0.13606947660446167, |
| "learning_rate": 1.596392333709132e-05, |
| "loss": 0.0433, |
| "step": 1096 |
| }, |
| { |
| "epoch": 3.5395799676898223, |
| "grad_norm": 0.20028483867645264, |
| "learning_rate": 1.5959413754227736e-05, |
| "loss": 0.0593, |
| "step": 1097 |
| }, |
| { |
| "epoch": 3.54281098546042, |
| "grad_norm": 0.15004722774028778, |
| "learning_rate": 1.595490417136415e-05, |
| "loss": 0.0501, |
| "step": 1098 |
| }, |
| { |
| "epoch": 3.546042003231018, |
| "grad_norm": 0.14318561553955078, |
| "learning_rate": 1.5950394588500564e-05, |
| "loss": 0.0463, |
| "step": 1099 |
| }, |
| { |
| "epoch": 3.5492730210016155, |
| "grad_norm": 0.11907773464918137, |
| "learning_rate": 1.5945885005636982e-05, |
| "loss": 0.0367, |
| "step": 1100 |
| }, |
| { |
| "epoch": 3.5525040387722133, |
| "grad_norm": 0.19116652011871338, |
| "learning_rate": 1.5941375422773397e-05, |
| "loss": 0.0741, |
| "step": 1101 |
| }, |
| { |
| "epoch": 3.5557350565428107, |
| "grad_norm": 0.14904284477233887, |
| "learning_rate": 1.593686583990981e-05, |
| "loss": 0.0493, |
| "step": 1102 |
| }, |
| { |
| "epoch": 3.5589660743134086, |
| "grad_norm": 0.134224995970726, |
| "learning_rate": 1.5932356257046225e-05, |
| "loss": 0.0519, |
| "step": 1103 |
| }, |
| { |
| "epoch": 3.5621970920840065, |
| "grad_norm": 0.16584910452365875, |
| "learning_rate": 1.592784667418264e-05, |
| "loss": 0.0604, |
| "step": 1104 |
| }, |
| { |
| "epoch": 3.5654281098546043, |
| "grad_norm": 0.19957157969474792, |
| "learning_rate": 1.5923337091319054e-05, |
| "loss": 0.0874, |
| "step": 1105 |
| }, |
| { |
| "epoch": 3.568659127625202, |
| "grad_norm": 0.16732187569141388, |
| "learning_rate": 1.591882750845547e-05, |
| "loss": 0.0521, |
| "step": 1106 |
| }, |
| { |
| "epoch": 3.5718901453957996, |
| "grad_norm": 0.10181103646755219, |
| "learning_rate": 1.5914317925591883e-05, |
| "loss": 0.0356, |
| "step": 1107 |
| }, |
| { |
| "epoch": 3.5751211631663975, |
| "grad_norm": 0.1692725121974945, |
| "learning_rate": 1.59098083427283e-05, |
| "loss": 0.0546, |
| "step": 1108 |
| }, |
| { |
| "epoch": 3.578352180936995, |
| "grad_norm": 0.15010833740234375, |
| "learning_rate": 1.5905298759864715e-05, |
| "loss": 0.0525, |
| "step": 1109 |
| }, |
| { |
| "epoch": 3.581583198707593, |
| "grad_norm": 0.14599505066871643, |
| "learning_rate": 1.590078917700113e-05, |
| "loss": 0.0503, |
| "step": 1110 |
| }, |
| { |
| "epoch": 3.5848142164781907, |
| "grad_norm": 0.1455962210893631, |
| "learning_rate": 1.5896279594137544e-05, |
| "loss": 0.0538, |
| "step": 1111 |
| }, |
| { |
| "epoch": 3.5880452342487885, |
| "grad_norm": 0.16955074667930603, |
| "learning_rate": 1.5891770011273958e-05, |
| "loss": 0.0562, |
| "step": 1112 |
| }, |
| { |
| "epoch": 3.5912762520193864, |
| "grad_norm": 0.12441671639680862, |
| "learning_rate": 1.5887260428410373e-05, |
| "loss": 0.0423, |
| "step": 1113 |
| }, |
| { |
| "epoch": 3.594507269789984, |
| "grad_norm": 0.1585661768913269, |
| "learning_rate": 1.5882750845546787e-05, |
| "loss": 0.0585, |
| "step": 1114 |
| }, |
| { |
| "epoch": 3.5977382875605817, |
| "grad_norm": 0.1167236939072609, |
| "learning_rate": 1.58782412626832e-05, |
| "loss": 0.0396, |
| "step": 1115 |
| }, |
| { |
| "epoch": 3.600969305331179, |
| "grad_norm": 0.11918371170759201, |
| "learning_rate": 1.5873731679819616e-05, |
| "loss": 0.0443, |
| "step": 1116 |
| }, |
| { |
| "epoch": 3.604200323101777, |
| "grad_norm": 0.1533348709344864, |
| "learning_rate": 1.586922209695603e-05, |
| "loss": 0.0554, |
| "step": 1117 |
| }, |
| { |
| "epoch": 3.607431340872375, |
| "grad_norm": 0.14366954565048218, |
| "learning_rate": 1.5864712514092448e-05, |
| "loss": 0.046, |
| "step": 1118 |
| }, |
| { |
| "epoch": 3.6106623586429727, |
| "grad_norm": 0.17497192323207855, |
| "learning_rate": 1.5860202931228862e-05, |
| "loss": 0.0572, |
| "step": 1119 |
| }, |
| { |
| "epoch": 3.61389337641357, |
| "grad_norm": 0.13877834379673004, |
| "learning_rate": 1.5855693348365277e-05, |
| "loss": 0.0493, |
| "step": 1120 |
| }, |
| { |
| "epoch": 3.617124394184168, |
| "grad_norm": 0.19728821516036987, |
| "learning_rate": 1.585118376550169e-05, |
| "loss": 0.0712, |
| "step": 1121 |
| }, |
| { |
| "epoch": 3.620355411954766, |
| "grad_norm": 0.11815082281827927, |
| "learning_rate": 1.5846674182638105e-05, |
| "loss": 0.0431, |
| "step": 1122 |
| }, |
| { |
| "epoch": 3.6235864297253633, |
| "grad_norm": 0.1730748862028122, |
| "learning_rate": 1.5842164599774523e-05, |
| "loss": 0.0604, |
| "step": 1123 |
| }, |
| { |
| "epoch": 3.626817447495961, |
| "grad_norm": 0.10923890024423599, |
| "learning_rate": 1.5837655016910938e-05, |
| "loss": 0.0355, |
| "step": 1124 |
| }, |
| { |
| "epoch": 3.630048465266559, |
| "grad_norm": 0.12295415252447128, |
| "learning_rate": 1.5833145434047352e-05, |
| "loss": 0.0412, |
| "step": 1125 |
| }, |
| { |
| "epoch": 3.630048465266559, |
| "eval_loss": 0.0719488188624382, |
| "eval_runtime": 188.3414, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 1125 |
| }, |
| { |
| "epoch": 3.633279483037157, |
| "grad_norm": 0.20836062729358673, |
| "learning_rate": 1.5828635851183766e-05, |
| "loss": 0.0624, |
| "step": 1126 |
| }, |
| { |
| "epoch": 3.6365105008077543, |
| "grad_norm": 0.11430151760578156, |
| "learning_rate": 1.582412626832018e-05, |
| "loss": 0.0398, |
| "step": 1127 |
| }, |
| { |
| "epoch": 3.639741518578352, |
| "grad_norm": 0.24180755019187927, |
| "learning_rate": 1.5819616685456595e-05, |
| "loss": 0.0508, |
| "step": 1128 |
| }, |
| { |
| "epoch": 3.64297253634895, |
| "grad_norm": 0.15328733623027802, |
| "learning_rate": 1.5815107102593013e-05, |
| "loss": 0.0497, |
| "step": 1129 |
| }, |
| { |
| "epoch": 3.6462035541195474, |
| "grad_norm": 0.147452712059021, |
| "learning_rate": 1.5810597519729427e-05, |
| "loss": 0.0502, |
| "step": 1130 |
| }, |
| { |
| "epoch": 3.6494345718901453, |
| "grad_norm": 0.18881501257419586, |
| "learning_rate": 1.5806087936865842e-05, |
| "loss": 0.0544, |
| "step": 1131 |
| }, |
| { |
| "epoch": 3.652665589660743, |
| "grad_norm": 0.18115286529064178, |
| "learning_rate": 1.5801578354002256e-05, |
| "loss": 0.0579, |
| "step": 1132 |
| }, |
| { |
| "epoch": 3.655896607431341, |
| "grad_norm": 0.1260828673839569, |
| "learning_rate": 1.579706877113867e-05, |
| "loss": 0.0382, |
| "step": 1133 |
| }, |
| { |
| "epoch": 3.6591276252019385, |
| "grad_norm": 0.17443282902240753, |
| "learning_rate": 1.5792559188275085e-05, |
| "loss": 0.0589, |
| "step": 1134 |
| }, |
| { |
| "epoch": 3.6623586429725363, |
| "grad_norm": 0.18841315805912018, |
| "learning_rate": 1.5788049605411503e-05, |
| "loss": 0.0577, |
| "step": 1135 |
| }, |
| { |
| "epoch": 3.665589660743134, |
| "grad_norm": 0.12547925114631653, |
| "learning_rate": 1.5783540022547917e-05, |
| "loss": 0.0421, |
| "step": 1136 |
| }, |
| { |
| "epoch": 3.6688206785137316, |
| "grad_norm": 0.13334450125694275, |
| "learning_rate": 1.577903043968433e-05, |
| "loss": 0.0401, |
| "step": 1137 |
| }, |
| { |
| "epoch": 3.6720516962843295, |
| "grad_norm": 0.15941473841667175, |
| "learning_rate": 1.5774520856820746e-05, |
| "loss": 0.0508, |
| "step": 1138 |
| }, |
| { |
| "epoch": 3.6752827140549273, |
| "grad_norm": 0.14976635575294495, |
| "learning_rate": 1.577001127395716e-05, |
| "loss": 0.051, |
| "step": 1139 |
| }, |
| { |
| "epoch": 3.678513731825525, |
| "grad_norm": 0.16594912111759186, |
| "learning_rate": 1.5765501691093575e-05, |
| "loss": 0.0522, |
| "step": 1140 |
| }, |
| { |
| "epoch": 3.6817447495961226, |
| "grad_norm": 0.1369272917509079, |
| "learning_rate": 1.5760992108229992e-05, |
| "loss": 0.0453, |
| "step": 1141 |
| }, |
| { |
| "epoch": 3.6849757673667205, |
| "grad_norm": 0.18500114977359772, |
| "learning_rate": 1.5756482525366407e-05, |
| "loss": 0.0603, |
| "step": 1142 |
| }, |
| { |
| "epoch": 3.6882067851373184, |
| "grad_norm": 0.1417061984539032, |
| "learning_rate": 1.575197294250282e-05, |
| "loss": 0.0473, |
| "step": 1143 |
| }, |
| { |
| "epoch": 3.691437802907916, |
| "grad_norm": 0.13675205409526825, |
| "learning_rate": 1.5747463359639236e-05, |
| "loss": 0.047, |
| "step": 1144 |
| }, |
| { |
| "epoch": 3.6946688206785137, |
| "grad_norm": 0.20681920647621155, |
| "learning_rate": 1.574295377677565e-05, |
| "loss": 0.0596, |
| "step": 1145 |
| }, |
| { |
| "epoch": 3.6978998384491115, |
| "grad_norm": 0.1576913595199585, |
| "learning_rate": 1.5738444193912064e-05, |
| "loss": 0.0468, |
| "step": 1146 |
| }, |
| { |
| "epoch": 3.7011308562197094, |
| "grad_norm": 0.20271863043308258, |
| "learning_rate": 1.573393461104848e-05, |
| "loss": 0.0652, |
| "step": 1147 |
| }, |
| { |
| "epoch": 3.704361873990307, |
| "grad_norm": 0.14045056700706482, |
| "learning_rate": 1.5729425028184893e-05, |
| "loss": 0.0476, |
| "step": 1148 |
| }, |
| { |
| "epoch": 3.7075928917609047, |
| "grad_norm": 0.16174402832984924, |
| "learning_rate": 1.572491544532131e-05, |
| "loss": 0.05, |
| "step": 1149 |
| }, |
| { |
| "epoch": 3.7108239095315025, |
| "grad_norm": 0.17413346469402313, |
| "learning_rate": 1.5720405862457725e-05, |
| "loss": 0.0602, |
| "step": 1150 |
| }, |
| { |
| "epoch": 3.7140549273021, |
| "grad_norm": 0.16131189465522766, |
| "learning_rate": 1.571589627959414e-05, |
| "loss": 0.0522, |
| "step": 1151 |
| }, |
| { |
| "epoch": 3.717285945072698, |
| "grad_norm": 0.2157316356897354, |
| "learning_rate": 1.5711386696730554e-05, |
| "loss": 0.0667, |
| "step": 1152 |
| }, |
| { |
| "epoch": 3.7205169628432957, |
| "grad_norm": 0.11224117875099182, |
| "learning_rate": 1.570687711386697e-05, |
| "loss": 0.0394, |
| "step": 1153 |
| }, |
| { |
| "epoch": 3.7237479806138936, |
| "grad_norm": 0.14941376447677612, |
| "learning_rate": 1.5702367531003383e-05, |
| "loss": 0.0465, |
| "step": 1154 |
| }, |
| { |
| "epoch": 3.726978998384491, |
| "grad_norm": 0.15842793881893158, |
| "learning_rate": 1.5697857948139797e-05, |
| "loss": 0.0554, |
| "step": 1155 |
| }, |
| { |
| "epoch": 3.730210016155089, |
| "grad_norm": 0.1107991486787796, |
| "learning_rate": 1.569334836527621e-05, |
| "loss": 0.0364, |
| "step": 1156 |
| }, |
| { |
| "epoch": 3.7334410339256867, |
| "grad_norm": 0.14628984034061432, |
| "learning_rate": 1.5688838782412626e-05, |
| "loss": 0.0519, |
| "step": 1157 |
| }, |
| { |
| "epoch": 3.736672051696284, |
| "grad_norm": 0.20470330119132996, |
| "learning_rate": 1.568432919954904e-05, |
| "loss": 0.0716, |
| "step": 1158 |
| }, |
| { |
| "epoch": 3.739903069466882, |
| "grad_norm": 0.2911032438278198, |
| "learning_rate": 1.5679819616685458e-05, |
| "loss": 0.0552, |
| "step": 1159 |
| }, |
| { |
| "epoch": 3.74313408723748, |
| "grad_norm": 0.11797036975622177, |
| "learning_rate": 1.5675310033821873e-05, |
| "loss": 0.0437, |
| "step": 1160 |
| }, |
| { |
| "epoch": 3.7463651050080777, |
| "grad_norm": 0.15150827169418335, |
| "learning_rate": 1.5670800450958287e-05, |
| "loss": 0.0482, |
| "step": 1161 |
| }, |
| { |
| "epoch": 3.749596122778675, |
| "grad_norm": 0.1999519169330597, |
| "learning_rate": 1.56662908680947e-05, |
| "loss": 0.0679, |
| "step": 1162 |
| }, |
| { |
| "epoch": 3.752827140549273, |
| "grad_norm": 0.1607101857662201, |
| "learning_rate": 1.5661781285231116e-05, |
| "loss": 0.0538, |
| "step": 1163 |
| }, |
| { |
| "epoch": 3.756058158319871, |
| "grad_norm": 0.11877261102199554, |
| "learning_rate": 1.565727170236753e-05, |
| "loss": 0.0365, |
| "step": 1164 |
| }, |
| { |
| "epoch": 3.7592891760904683, |
| "grad_norm": 0.15243922173976898, |
| "learning_rate": 1.5652762119503948e-05, |
| "loss": 0.0556, |
| "step": 1165 |
| }, |
| { |
| "epoch": 3.762520193861066, |
| "grad_norm": 0.16719117760658264, |
| "learning_rate": 1.5648252536640362e-05, |
| "loss": 0.0529, |
| "step": 1166 |
| }, |
| { |
| "epoch": 3.765751211631664, |
| "grad_norm": 0.19827044010162354, |
| "learning_rate": 1.5643742953776777e-05, |
| "loss": 0.063, |
| "step": 1167 |
| }, |
| { |
| "epoch": 3.768982229402262, |
| "grad_norm": 0.1877213567495346, |
| "learning_rate": 1.563923337091319e-05, |
| "loss": 0.0663, |
| "step": 1168 |
| }, |
| { |
| "epoch": 3.7722132471728593, |
| "grad_norm": 0.14807568490505219, |
| "learning_rate": 1.5634723788049606e-05, |
| "loss": 0.0483, |
| "step": 1169 |
| }, |
| { |
| "epoch": 3.775444264943457, |
| "grad_norm": 0.12423454970121384, |
| "learning_rate": 1.563021420518602e-05, |
| "loss": 0.0428, |
| "step": 1170 |
| }, |
| { |
| "epoch": 3.778675282714055, |
| "grad_norm": 0.17218518257141113, |
| "learning_rate": 1.5625704622322438e-05, |
| "loss": 0.0532, |
| "step": 1171 |
| }, |
| { |
| "epoch": 3.7819063004846525, |
| "grad_norm": 0.13151338696479797, |
| "learning_rate": 1.5621195039458852e-05, |
| "loss": 0.0443, |
| "step": 1172 |
| }, |
| { |
| "epoch": 3.7851373182552503, |
| "grad_norm": 0.16320225596427917, |
| "learning_rate": 1.5616685456595266e-05, |
| "loss": 0.0532, |
| "step": 1173 |
| }, |
| { |
| "epoch": 3.788368336025848, |
| "grad_norm": 0.1505471020936966, |
| "learning_rate": 1.561217587373168e-05, |
| "loss": 0.0491, |
| "step": 1174 |
| }, |
| { |
| "epoch": 3.791599353796446, |
| "grad_norm": 0.22144146263599396, |
| "learning_rate": 1.5607666290868095e-05, |
| "loss": 0.0697, |
| "step": 1175 |
| }, |
| { |
| "epoch": 3.7948303715670435, |
| "grad_norm": 0.1517932265996933, |
| "learning_rate": 1.560315670800451e-05, |
| "loss": 0.0548, |
| "step": 1176 |
| }, |
| { |
| "epoch": 3.7980613893376414, |
| "grad_norm": 0.2082444280385971, |
| "learning_rate": 1.5598647125140927e-05, |
| "loss": 0.0763, |
| "step": 1177 |
| }, |
| { |
| "epoch": 3.8012924071082392, |
| "grad_norm": 0.15881206095218658, |
| "learning_rate": 1.5594137542277342e-05, |
| "loss": 0.0572, |
| "step": 1178 |
| }, |
| { |
| "epoch": 3.8045234248788367, |
| "grad_norm": 0.1757776439189911, |
| "learning_rate": 1.5589627959413756e-05, |
| "loss": 0.0572, |
| "step": 1179 |
| }, |
| { |
| "epoch": 3.8077544426494345, |
| "grad_norm": 0.14664851129055023, |
| "learning_rate": 1.558511837655017e-05, |
| "loss": 0.0476, |
| "step": 1180 |
| }, |
| { |
| "epoch": 3.8109854604200324, |
| "grad_norm": 0.1364152878522873, |
| "learning_rate": 1.5580608793686585e-05, |
| "loss": 0.0425, |
| "step": 1181 |
| }, |
| { |
| "epoch": 3.8142164781906303, |
| "grad_norm": 0.16134034097194672, |
| "learning_rate": 1.5576099210823e-05, |
| "loss": 0.0523, |
| "step": 1182 |
| }, |
| { |
| "epoch": 3.8174474959612277, |
| "grad_norm": 0.20310769975185394, |
| "learning_rate": 1.5571589627959417e-05, |
| "loss": 0.0672, |
| "step": 1183 |
| }, |
| { |
| "epoch": 3.8206785137318255, |
| "grad_norm": 0.12990249693393707, |
| "learning_rate": 1.556708004509583e-05, |
| "loss": 0.0414, |
| "step": 1184 |
| }, |
| { |
| "epoch": 3.8239095315024234, |
| "grad_norm": 0.18581929802894592, |
| "learning_rate": 1.5562570462232246e-05, |
| "loss": 0.0585, |
| "step": 1185 |
| }, |
| { |
| "epoch": 3.827140549273021, |
| "grad_norm": 0.14280153810977936, |
| "learning_rate": 1.555806087936866e-05, |
| "loss": 0.0456, |
| "step": 1186 |
| }, |
| { |
| "epoch": 3.8303715670436187, |
| "grad_norm": 0.1313421130180359, |
| "learning_rate": 1.5553551296505075e-05, |
| "loss": 0.0453, |
| "step": 1187 |
| }, |
| { |
| "epoch": 3.8336025848142166, |
| "grad_norm": 0.14653180539608002, |
| "learning_rate": 1.554904171364149e-05, |
| "loss": 0.0494, |
| "step": 1188 |
| }, |
| { |
| "epoch": 3.8368336025848144, |
| "grad_norm": 0.169707253575325, |
| "learning_rate": 1.5544532130777903e-05, |
| "loss": 0.0549, |
| "step": 1189 |
| }, |
| { |
| "epoch": 3.840064620355412, |
| "grad_norm": 0.15947915613651276, |
| "learning_rate": 1.554002254791432e-05, |
| "loss": 0.0481, |
| "step": 1190 |
| }, |
| { |
| "epoch": 3.8432956381260097, |
| "grad_norm": 0.16549836099147797, |
| "learning_rate": 1.5535512965050736e-05, |
| "loss": 0.0556, |
| "step": 1191 |
| }, |
| { |
| "epoch": 3.8465266558966076, |
| "grad_norm": 0.1207936704158783, |
| "learning_rate": 1.553100338218715e-05, |
| "loss": 0.0389, |
| "step": 1192 |
| }, |
| { |
| "epoch": 3.849757673667205, |
| "grad_norm": 0.14455802738666534, |
| "learning_rate": 1.5526493799323564e-05, |
| "loss": 0.0513, |
| "step": 1193 |
| }, |
| { |
| "epoch": 3.852988691437803, |
| "grad_norm": 0.18907438218593597, |
| "learning_rate": 1.552198421645998e-05, |
| "loss": 0.065, |
| "step": 1194 |
| }, |
| { |
| "epoch": 3.8562197092084007, |
| "grad_norm": 0.1829371452331543, |
| "learning_rate": 1.5517474633596393e-05, |
| "loss": 0.0582, |
| "step": 1195 |
| }, |
| { |
| "epoch": 3.8594507269789986, |
| "grad_norm": 0.1572795808315277, |
| "learning_rate": 1.5512965050732808e-05, |
| "loss": 0.0477, |
| "step": 1196 |
| }, |
| { |
| "epoch": 3.862681744749596, |
| "grad_norm": 0.18964293599128723, |
| "learning_rate": 1.5508455467869222e-05, |
| "loss": 0.0625, |
| "step": 1197 |
| }, |
| { |
| "epoch": 3.865912762520194, |
| "grad_norm": 0.11856849491596222, |
| "learning_rate": 1.5503945885005636e-05, |
| "loss": 0.0398, |
| "step": 1198 |
| }, |
| { |
| "epoch": 3.8691437802907918, |
| "grad_norm": 0.15630431473255157, |
| "learning_rate": 1.549943630214205e-05, |
| "loss": 0.0514, |
| "step": 1199 |
| }, |
| { |
| "epoch": 3.872374798061389, |
| "grad_norm": 0.15305393934249878, |
| "learning_rate": 1.549492671927847e-05, |
| "loss": 0.0459, |
| "step": 1200 |
| }, |
| { |
| "epoch": 3.872374798061389, |
| "eval_loss": 0.07196119427680969, |
| "eval_runtime": 188.357, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 1200 |
| }, |
| { |
| "epoch": 3.875605815831987, |
| "grad_norm": 0.1756509244441986, |
| "learning_rate": 1.5490417136414883e-05, |
| "loss": 0.0559, |
| "step": 1201 |
| }, |
| { |
| "epoch": 3.878836833602585, |
| "grad_norm": 0.16133564710617065, |
| "learning_rate": 1.5485907553551297e-05, |
| "loss": 0.049, |
| "step": 1202 |
| }, |
| { |
| "epoch": 3.8820678513731828, |
| "grad_norm": 0.14332996308803558, |
| "learning_rate": 1.5481397970687712e-05, |
| "loss": 0.0498, |
| "step": 1203 |
| }, |
| { |
| "epoch": 3.88529886914378, |
| "grad_norm": 0.16257117688655853, |
| "learning_rate": 1.5476888387824126e-05, |
| "loss": 0.057, |
| "step": 1204 |
| }, |
| { |
| "epoch": 3.888529886914378, |
| "grad_norm": 0.18083199858665466, |
| "learning_rate": 1.547237880496054e-05, |
| "loss": 0.0559, |
| "step": 1205 |
| }, |
| { |
| "epoch": 3.891760904684976, |
| "grad_norm": 0.6128630042076111, |
| "learning_rate": 1.5467869222096958e-05, |
| "loss": 0.0485, |
| "step": 1206 |
| }, |
| { |
| "epoch": 3.8949919224555734, |
| "grad_norm": 0.15541522204875946, |
| "learning_rate": 1.5463359639233373e-05, |
| "loss": 0.0501, |
| "step": 1207 |
| }, |
| { |
| "epoch": 3.898222940226171, |
| "grad_norm": 0.17722243070602417, |
| "learning_rate": 1.5458850056369787e-05, |
| "loss": 0.0631, |
| "step": 1208 |
| }, |
| { |
| "epoch": 3.901453957996769, |
| "grad_norm": 0.18535539507865906, |
| "learning_rate": 1.54543404735062e-05, |
| "loss": 0.0594, |
| "step": 1209 |
| }, |
| { |
| "epoch": 3.904684975767367, |
| "grad_norm": 0.11115753650665283, |
| "learning_rate": 1.5449830890642616e-05, |
| "loss": 0.0364, |
| "step": 1210 |
| }, |
| { |
| "epoch": 3.9079159935379644, |
| "grad_norm": 0.15761958062648773, |
| "learning_rate": 1.544532130777903e-05, |
| "loss": 0.0521, |
| "step": 1211 |
| }, |
| { |
| "epoch": 3.9111470113085622, |
| "grad_norm": 0.1633194386959076, |
| "learning_rate": 1.5440811724915448e-05, |
| "loss": 0.0488, |
| "step": 1212 |
| }, |
| { |
| "epoch": 3.9143780290791597, |
| "grad_norm": 0.17016156017780304, |
| "learning_rate": 1.5436302142051862e-05, |
| "loss": 0.0547, |
| "step": 1213 |
| }, |
| { |
| "epoch": 3.9176090468497575, |
| "grad_norm": 0.14119581878185272, |
| "learning_rate": 1.5431792559188277e-05, |
| "loss": 0.0437, |
| "step": 1214 |
| }, |
| { |
| "epoch": 3.9208400646203554, |
| "grad_norm": 0.14919079840183258, |
| "learning_rate": 1.542728297632469e-05, |
| "loss": 0.0478, |
| "step": 1215 |
| }, |
| { |
| "epoch": 3.9240710823909533, |
| "grad_norm": 0.13685505092144012, |
| "learning_rate": 1.5422773393461106e-05, |
| "loss": 0.0443, |
| "step": 1216 |
| }, |
| { |
| "epoch": 3.927302100161551, |
| "grad_norm": 0.13098403811454773, |
| "learning_rate": 1.541826381059752e-05, |
| "loss": 0.0406, |
| "step": 1217 |
| }, |
| { |
| "epoch": 3.9305331179321485, |
| "grad_norm": 0.17652539908885956, |
| "learning_rate": 1.5413754227733938e-05, |
| "loss": 0.0563, |
| "step": 1218 |
| }, |
| { |
| "epoch": 3.9337641357027464, |
| "grad_norm": 0.18699124455451965, |
| "learning_rate": 1.5409244644870352e-05, |
| "loss": 0.0556, |
| "step": 1219 |
| }, |
| { |
| "epoch": 3.936995153473344, |
| "grad_norm": 0.19722655415534973, |
| "learning_rate": 1.5404735062006767e-05, |
| "loss": 0.0687, |
| "step": 1220 |
| }, |
| { |
| "epoch": 3.9402261712439417, |
| "grad_norm": 0.15618382394313812, |
| "learning_rate": 1.540022547914318e-05, |
| "loss": 0.0475, |
| "step": 1221 |
| }, |
| { |
| "epoch": 3.9434571890145396, |
| "grad_norm": 0.15241998434066772, |
| "learning_rate": 1.5395715896279595e-05, |
| "loss": 0.0433, |
| "step": 1222 |
| }, |
| { |
| "epoch": 3.9466882067851374, |
| "grad_norm": 0.18263420462608337, |
| "learning_rate": 1.539120631341601e-05, |
| "loss": 0.0579, |
| "step": 1223 |
| }, |
| { |
| "epoch": 3.9499192245557353, |
| "grad_norm": 0.14967773854732513, |
| "learning_rate": 1.5386696730552427e-05, |
| "loss": 0.0425, |
| "step": 1224 |
| }, |
| { |
| "epoch": 3.9531502423263327, |
| "grad_norm": 0.16370491683483124, |
| "learning_rate": 1.5382187147688842e-05, |
| "loss": 0.0558, |
| "step": 1225 |
| }, |
| { |
| "epoch": 3.9563812600969306, |
| "grad_norm": 0.17876583337783813, |
| "learning_rate": 1.5377677564825256e-05, |
| "loss": 0.0583, |
| "step": 1226 |
| }, |
| { |
| "epoch": 3.959612277867528, |
| "grad_norm": 0.14059610664844513, |
| "learning_rate": 1.537316798196167e-05, |
| "loss": 0.0399, |
| "step": 1227 |
| }, |
| { |
| "epoch": 3.962843295638126, |
| "grad_norm": 0.16172105073928833, |
| "learning_rate": 1.5368658399098085e-05, |
| "loss": 0.0481, |
| "step": 1228 |
| }, |
| { |
| "epoch": 3.9660743134087237, |
| "grad_norm": 0.1390855610370636, |
| "learning_rate": 1.53641488162345e-05, |
| "loss": 0.0478, |
| "step": 1229 |
| }, |
| { |
| "epoch": 3.9693053311793216, |
| "grad_norm": 0.15376152098178864, |
| "learning_rate": 1.5359639233370914e-05, |
| "loss": 0.0487, |
| "step": 1230 |
| }, |
| { |
| "epoch": 3.9725363489499195, |
| "grad_norm": 0.15467970073223114, |
| "learning_rate": 1.535512965050733e-05, |
| "loss": 0.0497, |
| "step": 1231 |
| }, |
| { |
| "epoch": 3.975767366720517, |
| "grad_norm": 0.16866347193717957, |
| "learning_rate": 1.5350620067643746e-05, |
| "loss": 0.056, |
| "step": 1232 |
| }, |
| { |
| "epoch": 3.9789983844911148, |
| "grad_norm": 0.22895418107509613, |
| "learning_rate": 1.534611048478016e-05, |
| "loss": 0.0539, |
| "step": 1233 |
| }, |
| { |
| "epoch": 3.982229402261712, |
| "grad_norm": 0.19517672061920166, |
| "learning_rate": 1.5341600901916575e-05, |
| "loss": 0.0583, |
| "step": 1234 |
| }, |
| { |
| "epoch": 3.98546042003231, |
| "grad_norm": 0.223899245262146, |
| "learning_rate": 1.533709131905299e-05, |
| "loss": 0.069, |
| "step": 1235 |
| }, |
| { |
| "epoch": 3.988691437802908, |
| "grad_norm": 0.15200112760066986, |
| "learning_rate": 1.5332581736189404e-05, |
| "loss": 0.044, |
| "step": 1236 |
| }, |
| { |
| "epoch": 3.991922455573506, |
| "grad_norm": 0.17538610100746155, |
| "learning_rate": 1.5328072153325818e-05, |
| "loss": 0.0584, |
| "step": 1237 |
| }, |
| { |
| "epoch": 3.9951534733441036, |
| "grad_norm": 0.23581402003765106, |
| "learning_rate": 1.5323562570462232e-05, |
| "loss": 0.0806, |
| "step": 1238 |
| }, |
| { |
| "epoch": 3.998384491114701, |
| "grad_norm": 0.15156853199005127, |
| "learning_rate": 1.5319052987598647e-05, |
| "loss": 0.05, |
| "step": 1239 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.24722698330879211, |
| "learning_rate": 1.531454340473506e-05, |
| "loss": 0.0507, |
| "step": 1240 |
| }, |
| { |
| "epoch": 4.003231017770598, |
| "grad_norm": 0.16531053185462952, |
| "learning_rate": 1.531003382187148e-05, |
| "loss": 0.0527, |
| "step": 1241 |
| }, |
| { |
| "epoch": 4.006462035541196, |
| "grad_norm": 0.19849298894405365, |
| "learning_rate": 1.5305524239007893e-05, |
| "loss": 0.0623, |
| "step": 1242 |
| }, |
| { |
| "epoch": 4.009693053311794, |
| "grad_norm": 0.1546104997396469, |
| "learning_rate": 1.5301014656144308e-05, |
| "loss": 0.0433, |
| "step": 1243 |
| }, |
| { |
| "epoch": 4.012924071082391, |
| "grad_norm": 0.21063487231731415, |
| "learning_rate": 1.5296505073280722e-05, |
| "loss": 0.0445, |
| "step": 1244 |
| }, |
| { |
| "epoch": 4.016155088852988, |
| "grad_norm": 0.1584954857826233, |
| "learning_rate": 1.5291995490417136e-05, |
| "loss": 0.0458, |
| "step": 1245 |
| }, |
| { |
| "epoch": 4.019386106623586, |
| "grad_norm": 0.151262566447258, |
| "learning_rate": 1.528748590755355e-05, |
| "loss": 0.0394, |
| "step": 1246 |
| }, |
| { |
| "epoch": 4.022617124394184, |
| "grad_norm": 0.14736542105674744, |
| "learning_rate": 1.5282976324689965e-05, |
| "loss": 0.0442, |
| "step": 1247 |
| }, |
| { |
| "epoch": 4.025848142164782, |
| "grad_norm": 0.2575235068798065, |
| "learning_rate": 1.5278466741826383e-05, |
| "loss": 0.0611, |
| "step": 1248 |
| }, |
| { |
| "epoch": 4.02907915993538, |
| "grad_norm": 0.16461840271949768, |
| "learning_rate": 1.5273957158962797e-05, |
| "loss": 0.0443, |
| "step": 1249 |
| }, |
| { |
| "epoch": 4.032310177705978, |
| "grad_norm": 0.16255277395248413, |
| "learning_rate": 1.5269447576099212e-05, |
| "loss": 0.0502, |
| "step": 1250 |
| }, |
| { |
| "epoch": 4.035541195476575, |
| "grad_norm": 0.15445704758167267, |
| "learning_rate": 1.5264937993235626e-05, |
| "loss": 0.0404, |
| "step": 1251 |
| }, |
| { |
| "epoch": 4.038772213247173, |
| "grad_norm": 0.1593495011329651, |
| "learning_rate": 1.526042841037204e-05, |
| "loss": 0.0434, |
| "step": 1252 |
| }, |
| { |
| "epoch": 4.0420032310177705, |
| "grad_norm": 0.17099085450172424, |
| "learning_rate": 1.5255918827508455e-05, |
| "loss": 0.0454, |
| "step": 1253 |
| }, |
| { |
| "epoch": 4.045234248788368, |
| "grad_norm": 0.19374652206897736, |
| "learning_rate": 1.5251409244644873e-05, |
| "loss": 0.0521, |
| "step": 1254 |
| }, |
| { |
| "epoch": 4.048465266558966, |
| "grad_norm": 0.11825862526893616, |
| "learning_rate": 1.5246899661781287e-05, |
| "loss": 0.0333, |
| "step": 1255 |
| }, |
| { |
| "epoch": 4.051696284329564, |
| "grad_norm": 0.1704898327589035, |
| "learning_rate": 1.5242390078917702e-05, |
| "loss": 0.0471, |
| "step": 1256 |
| }, |
| { |
| "epoch": 4.054927302100162, |
| "grad_norm": 0.14955446124076843, |
| "learning_rate": 1.5237880496054116e-05, |
| "loss": 0.0375, |
| "step": 1257 |
| }, |
| { |
| "epoch": 4.058158319870759, |
| "grad_norm": 0.17337749898433685, |
| "learning_rate": 1.523337091319053e-05, |
| "loss": 0.046, |
| "step": 1258 |
| }, |
| { |
| "epoch": 4.061389337641357, |
| "grad_norm": 0.22652941942214966, |
| "learning_rate": 1.5228861330326945e-05, |
| "loss": 0.0411, |
| "step": 1259 |
| }, |
| { |
| "epoch": 4.064620355411955, |
| "grad_norm": 0.1761440634727478, |
| "learning_rate": 1.5224351747463362e-05, |
| "loss": 0.0474, |
| "step": 1260 |
| }, |
| { |
| "epoch": 4.0678513731825525, |
| "grad_norm": 0.1542208343744278, |
| "learning_rate": 1.5219842164599777e-05, |
| "loss": 0.0451, |
| "step": 1261 |
| }, |
| { |
| "epoch": 4.07108239095315, |
| "grad_norm": 0.15553566813468933, |
| "learning_rate": 1.5215332581736191e-05, |
| "loss": 0.0427, |
| "step": 1262 |
| }, |
| { |
| "epoch": 4.074313408723748, |
| "grad_norm": 0.2138213962316513, |
| "learning_rate": 1.5210822998872606e-05, |
| "loss": 0.0548, |
| "step": 1263 |
| }, |
| { |
| "epoch": 4.077544426494346, |
| "grad_norm": 0.1478452980518341, |
| "learning_rate": 1.520631341600902e-05, |
| "loss": 0.038, |
| "step": 1264 |
| }, |
| { |
| "epoch": 4.080775444264943, |
| "grad_norm": 0.175162672996521, |
| "learning_rate": 1.5201803833145434e-05, |
| "loss": 0.0463, |
| "step": 1265 |
| }, |
| { |
| "epoch": 4.084006462035541, |
| "grad_norm": 0.20436161756515503, |
| "learning_rate": 1.519729425028185e-05, |
| "loss": 0.0513, |
| "step": 1266 |
| }, |
| { |
| "epoch": 4.087237479806139, |
| "grad_norm": 0.14855553209781647, |
| "learning_rate": 1.5192784667418265e-05, |
| "loss": 0.0374, |
| "step": 1267 |
| }, |
| { |
| "epoch": 4.090468497576737, |
| "grad_norm": 0.14089789986610413, |
| "learning_rate": 1.518827508455468e-05, |
| "loss": 0.0377, |
| "step": 1268 |
| }, |
| { |
| "epoch": 4.093699515347335, |
| "grad_norm": 0.15938547253608704, |
| "learning_rate": 1.5183765501691095e-05, |
| "loss": 0.0408, |
| "step": 1269 |
| }, |
| { |
| "epoch": 4.096930533117932, |
| "grad_norm": 0.26034435629844666, |
| "learning_rate": 1.517925591882751e-05, |
| "loss": 0.0496, |
| "step": 1270 |
| }, |
| { |
| "epoch": 4.10016155088853, |
| "grad_norm": 0.14401596784591675, |
| "learning_rate": 1.5174746335963924e-05, |
| "loss": 0.0359, |
| "step": 1271 |
| }, |
| { |
| "epoch": 4.103392568659127, |
| "grad_norm": 0.2040480524301529, |
| "learning_rate": 1.517023675310034e-05, |
| "loss": 0.0572, |
| "step": 1272 |
| }, |
| { |
| "epoch": 4.106623586429725, |
| "grad_norm": 0.17441923916339874, |
| "learning_rate": 1.5165727170236755e-05, |
| "loss": 0.0496, |
| "step": 1273 |
| }, |
| { |
| "epoch": 4.109854604200323, |
| "grad_norm": 0.18695297837257385, |
| "learning_rate": 1.5161217587373169e-05, |
| "loss": 0.0489, |
| "step": 1274 |
| }, |
| { |
| "epoch": 4.113085621970921, |
| "grad_norm": 0.15302547812461853, |
| "learning_rate": 1.5156708004509583e-05, |
| "loss": 0.043, |
| "step": 1275 |
| }, |
| { |
| "epoch": 4.113085621970921, |
| "eval_loss": 0.07389214634895325, |
| "eval_runtime": 188.1369, |
| "eval_samples_per_second": 1.047, |
| "eval_steps_per_second": 1.047, |
| "step": 1275 |
| }, |
| { |
| "epoch": 4.116316639741519, |
| "grad_norm": 0.15141095221042633, |
| "learning_rate": 1.5152198421645998e-05, |
| "loss": 0.0416, |
| "step": 1276 |
| }, |
| { |
| "epoch": 4.119547657512117, |
| "grad_norm": 0.14847366511821747, |
| "learning_rate": 1.5147688838782412e-05, |
| "loss": 0.0384, |
| "step": 1277 |
| }, |
| { |
| "epoch": 4.1227786752827145, |
| "grad_norm": 0.12617304921150208, |
| "learning_rate": 1.514317925591883e-05, |
| "loss": 0.0342, |
| "step": 1278 |
| }, |
| { |
| "epoch": 4.1260096930533114, |
| "grad_norm": 0.18524858355522156, |
| "learning_rate": 1.5138669673055244e-05, |
| "loss": 0.0538, |
| "step": 1279 |
| }, |
| { |
| "epoch": 4.129240710823909, |
| "grad_norm": 0.11797762662172318, |
| "learning_rate": 1.5134160090191659e-05, |
| "loss": 0.033, |
| "step": 1280 |
| }, |
| { |
| "epoch": 4.132471728594507, |
| "grad_norm": 0.24958020448684692, |
| "learning_rate": 1.5129650507328073e-05, |
| "loss": 0.0617, |
| "step": 1281 |
| }, |
| { |
| "epoch": 4.135702746365105, |
| "grad_norm": 0.2150479257106781, |
| "learning_rate": 1.5125140924464488e-05, |
| "loss": 0.0536, |
| "step": 1282 |
| }, |
| { |
| "epoch": 4.138933764135703, |
| "grad_norm": 0.19638743996620178, |
| "learning_rate": 1.5120631341600902e-05, |
| "loss": 0.0531, |
| "step": 1283 |
| }, |
| { |
| "epoch": 4.142164781906301, |
| "grad_norm": 0.1459854543209076, |
| "learning_rate": 1.5116121758737318e-05, |
| "loss": 0.0343, |
| "step": 1284 |
| }, |
| { |
| "epoch": 4.145395799676899, |
| "grad_norm": 0.13433115184307098, |
| "learning_rate": 1.5111612175873734e-05, |
| "loss": 0.0375, |
| "step": 1285 |
| }, |
| { |
| "epoch": 4.148626817447496, |
| "grad_norm": 0.16149543225765228, |
| "learning_rate": 1.5107102593010148e-05, |
| "loss": 0.0398, |
| "step": 1286 |
| }, |
| { |
| "epoch": 4.1518578352180935, |
| "grad_norm": 0.19074063003063202, |
| "learning_rate": 1.5102593010146563e-05, |
| "loss": 0.0452, |
| "step": 1287 |
| }, |
| { |
| "epoch": 4.155088852988691, |
| "grad_norm": 0.13406312465667725, |
| "learning_rate": 1.5098083427282977e-05, |
| "loss": 0.0376, |
| "step": 1288 |
| }, |
| { |
| "epoch": 4.158319870759289, |
| "grad_norm": 0.19820663332939148, |
| "learning_rate": 1.5093573844419393e-05, |
| "loss": 0.0503, |
| "step": 1289 |
| }, |
| { |
| "epoch": 4.161550888529887, |
| "grad_norm": 0.2043464332818985, |
| "learning_rate": 1.5089064261555808e-05, |
| "loss": 0.0515, |
| "step": 1290 |
| }, |
| { |
| "epoch": 4.164781906300485, |
| "grad_norm": 0.17180056869983673, |
| "learning_rate": 1.5084554678692222e-05, |
| "loss": 0.0425, |
| "step": 1291 |
| }, |
| { |
| "epoch": 4.168012924071083, |
| "grad_norm": 0.1668267697095871, |
| "learning_rate": 1.5080045095828636e-05, |
| "loss": 0.0446, |
| "step": 1292 |
| }, |
| { |
| "epoch": 4.17124394184168, |
| "grad_norm": 0.17857880890369415, |
| "learning_rate": 1.5075535512965051e-05, |
| "loss": 0.049, |
| "step": 1293 |
| }, |
| { |
| "epoch": 4.174474959612278, |
| "grad_norm": 0.22587771713733673, |
| "learning_rate": 1.5071025930101465e-05, |
| "loss": 0.059, |
| "step": 1294 |
| }, |
| { |
| "epoch": 4.1777059773828755, |
| "grad_norm": 0.14877091348171234, |
| "learning_rate": 1.5066516347237883e-05, |
| "loss": 0.0387, |
| "step": 1295 |
| }, |
| { |
| "epoch": 4.180936995153473, |
| "grad_norm": 0.16033658385276794, |
| "learning_rate": 1.5062006764374297e-05, |
| "loss": 0.0407, |
| "step": 1296 |
| }, |
| { |
| "epoch": 4.184168012924071, |
| "grad_norm": 0.18755468726158142, |
| "learning_rate": 1.5057497181510712e-05, |
| "loss": 0.0489, |
| "step": 1297 |
| }, |
| { |
| "epoch": 4.187399030694669, |
| "grad_norm": 0.19523419439792633, |
| "learning_rate": 1.5052987598647126e-05, |
| "loss": 0.0459, |
| "step": 1298 |
| }, |
| { |
| "epoch": 4.190630048465267, |
| "grad_norm": 0.15284906327724457, |
| "learning_rate": 1.504847801578354e-05, |
| "loss": 0.0429, |
| "step": 1299 |
| }, |
| { |
| "epoch": 4.193861066235864, |
| "grad_norm": 0.2246389240026474, |
| "learning_rate": 1.5043968432919955e-05, |
| "loss": 0.0588, |
| "step": 1300 |
| }, |
| { |
| "epoch": 4.197092084006462, |
| "grad_norm": 0.17831605672836304, |
| "learning_rate": 1.5039458850056373e-05, |
| "loss": 0.0494, |
| "step": 1301 |
| }, |
| { |
| "epoch": 4.20032310177706, |
| "grad_norm": 0.19361066818237305, |
| "learning_rate": 1.5034949267192787e-05, |
| "loss": 0.0482, |
| "step": 1302 |
| }, |
| { |
| "epoch": 4.203554119547658, |
| "grad_norm": 0.1435505896806717, |
| "learning_rate": 1.5030439684329202e-05, |
| "loss": 0.0394, |
| "step": 1303 |
| }, |
| { |
| "epoch": 4.206785137318255, |
| "grad_norm": 0.23577755689620972, |
| "learning_rate": 1.5025930101465616e-05, |
| "loss": 0.0594, |
| "step": 1304 |
| }, |
| { |
| "epoch": 4.210016155088853, |
| "grad_norm": 0.1408923715353012, |
| "learning_rate": 1.502142051860203e-05, |
| "loss": 0.0352, |
| "step": 1305 |
| }, |
| { |
| "epoch": 4.21324717285945, |
| "grad_norm": 0.18764124810695648, |
| "learning_rate": 1.5016910935738445e-05, |
| "loss": 0.0462, |
| "step": 1306 |
| }, |
| { |
| "epoch": 4.216478190630048, |
| "grad_norm": 0.19298413395881653, |
| "learning_rate": 1.501240135287486e-05, |
| "loss": 0.0479, |
| "step": 1307 |
| }, |
| { |
| "epoch": 4.219709208400646, |
| "grad_norm": 0.16441990435123444, |
| "learning_rate": 1.5007891770011275e-05, |
| "loss": 0.0398, |
| "step": 1308 |
| }, |
| { |
| "epoch": 4.222940226171244, |
| "grad_norm": 0.234172523021698, |
| "learning_rate": 1.500338218714769e-05, |
| "loss": 0.0499, |
| "step": 1309 |
| }, |
| { |
| "epoch": 4.226171243941842, |
| "grad_norm": 0.1614762842655182, |
| "learning_rate": 1.4998872604284106e-05, |
| "loss": 0.0416, |
| "step": 1310 |
| }, |
| { |
| "epoch": 4.22940226171244, |
| "grad_norm": 0.2377498745918274, |
| "learning_rate": 1.499436302142052e-05, |
| "loss": 0.0538, |
| "step": 1311 |
| }, |
| { |
| "epoch": 4.2326332794830375, |
| "grad_norm": 0.22301968932151794, |
| "learning_rate": 1.4989853438556934e-05, |
| "loss": 0.0546, |
| "step": 1312 |
| }, |
| { |
| "epoch": 4.2358642972536344, |
| "grad_norm": 0.2068958878517151, |
| "learning_rate": 1.498534385569335e-05, |
| "loss": 0.0485, |
| "step": 1313 |
| }, |
| { |
| "epoch": 4.239095315024232, |
| "grad_norm": 0.13461337983608246, |
| "learning_rate": 1.4980834272829765e-05, |
| "loss": 0.0337, |
| "step": 1314 |
| }, |
| { |
| "epoch": 4.24232633279483, |
| "grad_norm": 0.22949622571468353, |
| "learning_rate": 1.497632468996618e-05, |
| "loss": 0.0584, |
| "step": 1315 |
| }, |
| { |
| "epoch": 4.245557350565428, |
| "grad_norm": 0.13926228880882263, |
| "learning_rate": 1.4971815107102594e-05, |
| "loss": 0.0358, |
| "step": 1316 |
| }, |
| { |
| "epoch": 4.248788368336026, |
| "grad_norm": 0.19149082899093628, |
| "learning_rate": 1.4967305524239008e-05, |
| "loss": 0.0431, |
| "step": 1317 |
| }, |
| { |
| "epoch": 4.252019386106624, |
| "grad_norm": 0.19263583421707153, |
| "learning_rate": 1.4962795941375422e-05, |
| "loss": 0.0499, |
| "step": 1318 |
| }, |
| { |
| "epoch": 4.255250403877222, |
| "grad_norm": 0.11835051327943802, |
| "learning_rate": 1.495828635851184e-05, |
| "loss": 0.0316, |
| "step": 1319 |
| }, |
| { |
| "epoch": 4.258481421647819, |
| "grad_norm": 0.16367188096046448, |
| "learning_rate": 1.4953776775648255e-05, |
| "loss": 0.0417, |
| "step": 1320 |
| }, |
| { |
| "epoch": 4.2617124394184165, |
| "grad_norm": 0.2104729264974594, |
| "learning_rate": 1.4949267192784669e-05, |
| "loss": 0.056, |
| "step": 1321 |
| }, |
| { |
| "epoch": 4.264943457189014, |
| "grad_norm": 0.20622481405735016, |
| "learning_rate": 1.4944757609921083e-05, |
| "loss": 0.0495, |
| "step": 1322 |
| }, |
| { |
| "epoch": 4.268174474959612, |
| "grad_norm": 0.15231038630008698, |
| "learning_rate": 1.4940248027057498e-05, |
| "loss": 0.0431, |
| "step": 1323 |
| }, |
| { |
| "epoch": 4.27140549273021, |
| "grad_norm": 0.1890675574541092, |
| "learning_rate": 1.4935738444193912e-05, |
| "loss": 0.0447, |
| "step": 1324 |
| }, |
| { |
| "epoch": 4.274636510500808, |
| "grad_norm": 0.2067018300294876, |
| "learning_rate": 1.4931228861330328e-05, |
| "loss": 0.0532, |
| "step": 1325 |
| }, |
| { |
| "epoch": 4.277867528271406, |
| "grad_norm": 0.21407710015773773, |
| "learning_rate": 1.4926719278466744e-05, |
| "loss": 0.0601, |
| "step": 1326 |
| }, |
| { |
| "epoch": 4.281098546042003, |
| "grad_norm": 0.25905516743659973, |
| "learning_rate": 1.4922209695603159e-05, |
| "loss": 0.0457, |
| "step": 1327 |
| }, |
| { |
| "epoch": 4.284329563812601, |
| "grad_norm": 0.1636149138212204, |
| "learning_rate": 1.4917700112739573e-05, |
| "loss": 0.04, |
| "step": 1328 |
| }, |
| { |
| "epoch": 4.2875605815831985, |
| "grad_norm": 0.1303829401731491, |
| "learning_rate": 1.4913190529875988e-05, |
| "loss": 0.0366, |
| "step": 1329 |
| }, |
| { |
| "epoch": 4.290791599353796, |
| "grad_norm": 0.21324171125888824, |
| "learning_rate": 1.4908680947012402e-05, |
| "loss": 0.0542, |
| "step": 1330 |
| }, |
| { |
| "epoch": 4.294022617124394, |
| "grad_norm": 0.15251924097537994, |
| "learning_rate": 1.4904171364148818e-05, |
| "loss": 0.0358, |
| "step": 1331 |
| }, |
| { |
| "epoch": 4.297253634894992, |
| "grad_norm": 0.24509133398532867, |
| "learning_rate": 1.4899661781285232e-05, |
| "loss": 0.0621, |
| "step": 1332 |
| }, |
| { |
| "epoch": 4.30048465266559, |
| "grad_norm": 0.22666756808757782, |
| "learning_rate": 1.4895152198421647e-05, |
| "loss": 0.0528, |
| "step": 1333 |
| }, |
| { |
| "epoch": 4.303715670436187, |
| "grad_norm": 0.1979990154504776, |
| "learning_rate": 1.4890642615558061e-05, |
| "loss": 0.0478, |
| "step": 1334 |
| }, |
| { |
| "epoch": 4.306946688206785, |
| "grad_norm": 0.2013210654258728, |
| "learning_rate": 1.4886133032694476e-05, |
| "loss": 0.0467, |
| "step": 1335 |
| }, |
| { |
| "epoch": 4.310177705977383, |
| "grad_norm": 0.15871821343898773, |
| "learning_rate": 1.4881623449830892e-05, |
| "loss": 0.0357, |
| "step": 1336 |
| }, |
| { |
| "epoch": 4.313408723747981, |
| "grad_norm": 0.1882667988538742, |
| "learning_rate": 1.4877113866967308e-05, |
| "loss": 0.047, |
| "step": 1337 |
| }, |
| { |
| "epoch": 4.316639741518578, |
| "grad_norm": 0.2094133049249649, |
| "learning_rate": 1.4872604284103722e-05, |
| "loss": 0.0429, |
| "step": 1338 |
| }, |
| { |
| "epoch": 4.319870759289176, |
| "grad_norm": 0.21321342885494232, |
| "learning_rate": 1.4868094701240137e-05, |
| "loss": 0.0553, |
| "step": 1339 |
| }, |
| { |
| "epoch": 4.323101777059774, |
| "grad_norm": 0.22374090552330017, |
| "learning_rate": 1.4863585118376551e-05, |
| "loss": 0.0586, |
| "step": 1340 |
| }, |
| { |
| "epoch": 4.326332794830371, |
| "grad_norm": 0.17087411880493164, |
| "learning_rate": 1.4859075535512965e-05, |
| "loss": 0.0407, |
| "step": 1341 |
| }, |
| { |
| "epoch": 4.329563812600969, |
| "grad_norm": 0.2719607353210449, |
| "learning_rate": 1.485456595264938e-05, |
| "loss": 0.0679, |
| "step": 1342 |
| }, |
| { |
| "epoch": 4.332794830371567, |
| "grad_norm": 0.2514837384223938, |
| "learning_rate": 1.4850056369785797e-05, |
| "loss": 0.0582, |
| "step": 1343 |
| }, |
| { |
| "epoch": 4.336025848142165, |
| "grad_norm": 0.18868017196655273, |
| "learning_rate": 1.4845546786922212e-05, |
| "loss": 0.046, |
| "step": 1344 |
| }, |
| { |
| "epoch": 4.339256865912763, |
| "grad_norm": 0.19118477404117584, |
| "learning_rate": 1.4841037204058626e-05, |
| "loss": 0.0459, |
| "step": 1345 |
| }, |
| { |
| "epoch": 4.3424878836833605, |
| "grad_norm": 0.18494348227977753, |
| "learning_rate": 1.483652762119504e-05, |
| "loss": 0.0404, |
| "step": 1346 |
| }, |
| { |
| "epoch": 4.345718901453958, |
| "grad_norm": 0.20081184804439545, |
| "learning_rate": 1.4832018038331455e-05, |
| "loss": 0.0476, |
| "step": 1347 |
| }, |
| { |
| "epoch": 4.348949919224555, |
| "grad_norm": 0.23053082823753357, |
| "learning_rate": 1.482750845546787e-05, |
| "loss": 0.0546, |
| "step": 1348 |
| }, |
| { |
| "epoch": 4.352180936995153, |
| "grad_norm": 0.26016828417778015, |
| "learning_rate": 1.4822998872604286e-05, |
| "loss": 0.0614, |
| "step": 1349 |
| }, |
| { |
| "epoch": 4.355411954765751, |
| "grad_norm": 0.21929657459259033, |
| "learning_rate": 1.48184892897407e-05, |
| "loss": 0.0588, |
| "step": 1350 |
| }, |
| { |
| "epoch": 4.355411954765751, |
| "eval_loss": 0.0738663524389267, |
| "eval_runtime": 188.2629, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 1350 |
| }, |
| { |
| "epoch": 4.358642972536349, |
| "grad_norm": 0.16189903020858765, |
| "learning_rate": 1.4813979706877116e-05, |
| "loss": 0.0416, |
| "step": 1351 |
| }, |
| { |
| "epoch": 4.361873990306947, |
| "grad_norm": 0.17752930521965027, |
| "learning_rate": 1.480947012401353e-05, |
| "loss": 0.048, |
| "step": 1352 |
| }, |
| { |
| "epoch": 4.365105008077545, |
| "grad_norm": 0.1832403540611267, |
| "learning_rate": 1.4804960541149945e-05, |
| "loss": 0.0452, |
| "step": 1353 |
| }, |
| { |
| "epoch": 4.3683360258481425, |
| "grad_norm": 0.16476021707057953, |
| "learning_rate": 1.480045095828636e-05, |
| "loss": 0.042, |
| "step": 1354 |
| }, |
| { |
| "epoch": 4.3715670436187395, |
| "grad_norm": 0.18926653265953064, |
| "learning_rate": 1.4795941375422775e-05, |
| "loss": 0.0426, |
| "step": 1355 |
| }, |
| { |
| "epoch": 4.374798061389337, |
| "grad_norm": 0.1436876803636551, |
| "learning_rate": 1.479143179255919e-05, |
| "loss": 0.0356, |
| "step": 1356 |
| }, |
| { |
| "epoch": 4.378029079159935, |
| "grad_norm": 0.18440058827400208, |
| "learning_rate": 1.4786922209695604e-05, |
| "loss": 0.0389, |
| "step": 1357 |
| }, |
| { |
| "epoch": 4.381260096930533, |
| "grad_norm": 0.19692404568195343, |
| "learning_rate": 1.4782412626832018e-05, |
| "loss": 0.0479, |
| "step": 1358 |
| }, |
| { |
| "epoch": 4.384491114701131, |
| "grad_norm": 0.24039334058761597, |
| "learning_rate": 1.4777903043968433e-05, |
| "loss": 0.0545, |
| "step": 1359 |
| }, |
| { |
| "epoch": 4.387722132471729, |
| "grad_norm": 0.2061791568994522, |
| "learning_rate": 1.4773393461104847e-05, |
| "loss": 0.053, |
| "step": 1360 |
| }, |
| { |
| "epoch": 4.390953150242327, |
| "grad_norm": 0.18984144926071167, |
| "learning_rate": 1.4768883878241265e-05, |
| "loss": 0.0487, |
| "step": 1361 |
| }, |
| { |
| "epoch": 4.394184168012924, |
| "grad_norm": 0.2028520703315735, |
| "learning_rate": 1.476437429537768e-05, |
| "loss": 0.0532, |
| "step": 1362 |
| }, |
| { |
| "epoch": 4.3974151857835215, |
| "grad_norm": 0.1802004873752594, |
| "learning_rate": 1.4759864712514094e-05, |
| "loss": 0.0422, |
| "step": 1363 |
| }, |
| { |
| "epoch": 4.400646203554119, |
| "grad_norm": 0.18436428904533386, |
| "learning_rate": 1.4755355129650508e-05, |
| "loss": 0.0448, |
| "step": 1364 |
| }, |
| { |
| "epoch": 4.403877221324717, |
| "grad_norm": 0.19160835444927216, |
| "learning_rate": 1.4750845546786923e-05, |
| "loss": 0.0444, |
| "step": 1365 |
| }, |
| { |
| "epoch": 4.407108239095315, |
| "grad_norm": 0.19131603837013245, |
| "learning_rate": 1.4746335963923337e-05, |
| "loss": 0.0484, |
| "step": 1366 |
| }, |
| { |
| "epoch": 4.410339256865913, |
| "grad_norm": 0.19180463254451752, |
| "learning_rate": 1.4741826381059755e-05, |
| "loss": 0.0419, |
| "step": 1367 |
| }, |
| { |
| "epoch": 4.413570274636511, |
| "grad_norm": 0.23459561169147491, |
| "learning_rate": 1.4737316798196169e-05, |
| "loss": 0.0592, |
| "step": 1368 |
| }, |
| { |
| "epoch": 4.416801292407108, |
| "grad_norm": 0.22431418299674988, |
| "learning_rate": 1.4732807215332583e-05, |
| "loss": 0.052, |
| "step": 1369 |
| }, |
| { |
| "epoch": 4.420032310177706, |
| "grad_norm": 0.25840187072753906, |
| "learning_rate": 1.4728297632468998e-05, |
| "loss": 0.0625, |
| "step": 1370 |
| }, |
| { |
| "epoch": 4.423263327948304, |
| "grad_norm": 0.19012144207954407, |
| "learning_rate": 1.4723788049605412e-05, |
| "loss": 0.0392, |
| "step": 1371 |
| }, |
| { |
| "epoch": 4.426494345718901, |
| "grad_norm": 0.22117850184440613, |
| "learning_rate": 1.4719278466741827e-05, |
| "loss": 0.0495, |
| "step": 1372 |
| }, |
| { |
| "epoch": 4.429725363489499, |
| "grad_norm": 0.2188146561384201, |
| "learning_rate": 1.4714768883878243e-05, |
| "loss": 0.0474, |
| "step": 1373 |
| }, |
| { |
| "epoch": 4.432956381260097, |
| "grad_norm": 0.22705675661563873, |
| "learning_rate": 1.4710259301014657e-05, |
| "loss": 0.0455, |
| "step": 1374 |
| }, |
| { |
| "epoch": 4.436187399030695, |
| "grad_norm": 0.18744595348834991, |
| "learning_rate": 1.4705749718151072e-05, |
| "loss": 0.046, |
| "step": 1375 |
| }, |
| { |
| "epoch": 4.439418416801292, |
| "grad_norm": 0.1990160346031189, |
| "learning_rate": 1.4701240135287486e-05, |
| "loss": 0.0422, |
| "step": 1376 |
| }, |
| { |
| "epoch": 4.44264943457189, |
| "grad_norm": 0.1452781856060028, |
| "learning_rate": 1.4696730552423902e-05, |
| "loss": 0.0349, |
| "step": 1377 |
| }, |
| { |
| "epoch": 4.445880452342488, |
| "grad_norm": 0.18087869882583618, |
| "learning_rate": 1.4692220969560318e-05, |
| "loss": 0.0361, |
| "step": 1378 |
| }, |
| { |
| "epoch": 4.449111470113086, |
| "grad_norm": 0.22286781668663025, |
| "learning_rate": 1.4687711386696732e-05, |
| "loss": 0.0494, |
| "step": 1379 |
| }, |
| { |
| "epoch": 4.4523424878836835, |
| "grad_norm": 0.1993950456380844, |
| "learning_rate": 1.4683201803833147e-05, |
| "loss": 0.0451, |
| "step": 1380 |
| }, |
| { |
| "epoch": 4.455573505654281, |
| "grad_norm": 0.1597587913274765, |
| "learning_rate": 1.4678692220969561e-05, |
| "loss": 0.036, |
| "step": 1381 |
| }, |
| { |
| "epoch": 4.458804523424879, |
| "grad_norm": 0.22262707352638245, |
| "learning_rate": 1.4674182638105976e-05, |
| "loss": 0.0489, |
| "step": 1382 |
| }, |
| { |
| "epoch": 4.462035541195476, |
| "grad_norm": 0.23679843544960022, |
| "learning_rate": 1.466967305524239e-05, |
| "loss": 0.0566, |
| "step": 1383 |
| }, |
| { |
| "epoch": 4.465266558966074, |
| "grad_norm": 0.24386358261108398, |
| "learning_rate": 1.4665163472378808e-05, |
| "loss": 0.0542, |
| "step": 1384 |
| }, |
| { |
| "epoch": 4.468497576736672, |
| "grad_norm": 0.23281097412109375, |
| "learning_rate": 1.4660653889515222e-05, |
| "loss": 0.0527, |
| "step": 1385 |
| }, |
| { |
| "epoch": 4.47172859450727, |
| "grad_norm": 0.22125712037086487, |
| "learning_rate": 1.4656144306651637e-05, |
| "loss": 0.0513, |
| "step": 1386 |
| }, |
| { |
| "epoch": 4.474959612277868, |
| "grad_norm": 0.18801574409008026, |
| "learning_rate": 1.4651634723788051e-05, |
| "loss": 0.0439, |
| "step": 1387 |
| }, |
| { |
| "epoch": 4.4781906300484655, |
| "grad_norm": 0.17072679102420807, |
| "learning_rate": 1.4647125140924465e-05, |
| "loss": 0.0431, |
| "step": 1388 |
| }, |
| { |
| "epoch": 4.481421647819063, |
| "grad_norm": 0.16123917698860168, |
| "learning_rate": 1.464261555806088e-05, |
| "loss": 0.0384, |
| "step": 1389 |
| }, |
| { |
| "epoch": 4.48465266558966, |
| "grad_norm": 0.22254766523838043, |
| "learning_rate": 1.4638105975197296e-05, |
| "loss": 0.0526, |
| "step": 1390 |
| }, |
| { |
| "epoch": 4.487883683360258, |
| "grad_norm": 0.14589062333106995, |
| "learning_rate": 1.463359639233371e-05, |
| "loss": 0.0363, |
| "step": 1391 |
| }, |
| { |
| "epoch": 4.491114701130856, |
| "grad_norm": 0.2187325805425644, |
| "learning_rate": 1.4629086809470126e-05, |
| "loss": 0.0527, |
| "step": 1392 |
| }, |
| { |
| "epoch": 4.494345718901454, |
| "grad_norm": 0.21446366608142853, |
| "learning_rate": 1.462457722660654e-05, |
| "loss": 0.053, |
| "step": 1393 |
| }, |
| { |
| "epoch": 4.497576736672052, |
| "grad_norm": 0.16722171008586884, |
| "learning_rate": 1.4620067643742955e-05, |
| "loss": 0.0363, |
| "step": 1394 |
| }, |
| { |
| "epoch": 4.50080775444265, |
| "grad_norm": 0.20108149945735931, |
| "learning_rate": 1.461555806087937e-05, |
| "loss": 0.0473, |
| "step": 1395 |
| }, |
| { |
| "epoch": 4.5040387722132476, |
| "grad_norm": 0.1628379225730896, |
| "learning_rate": 1.4611048478015786e-05, |
| "loss": 0.0357, |
| "step": 1396 |
| }, |
| { |
| "epoch": 4.5072697899838445, |
| "grad_norm": 0.255012571811676, |
| "learning_rate": 1.46065388951522e-05, |
| "loss": 0.064, |
| "step": 1397 |
| }, |
| { |
| "epoch": 4.510500807754442, |
| "grad_norm": 0.21136048436164856, |
| "learning_rate": 1.4602029312288614e-05, |
| "loss": 0.0439, |
| "step": 1398 |
| }, |
| { |
| "epoch": 4.51373182552504, |
| "grad_norm": 0.1613055020570755, |
| "learning_rate": 1.4597519729425029e-05, |
| "loss": 0.0372, |
| "step": 1399 |
| }, |
| { |
| "epoch": 4.516962843295638, |
| "grad_norm": 0.2173350751399994, |
| "learning_rate": 1.4593010146561443e-05, |
| "loss": 0.0507, |
| "step": 1400 |
| }, |
| { |
| "epoch": 4.520193861066236, |
| "grad_norm": 0.12804266810417175, |
| "learning_rate": 1.4588500563697858e-05, |
| "loss": 0.0297, |
| "step": 1401 |
| }, |
| { |
| "epoch": 4.523424878836834, |
| "grad_norm": 0.22981034219264984, |
| "learning_rate": 1.4583990980834275e-05, |
| "loss": 0.0504, |
| "step": 1402 |
| }, |
| { |
| "epoch": 4.526655896607432, |
| "grad_norm": 0.216976135969162, |
| "learning_rate": 1.457948139797069e-05, |
| "loss": 0.0491, |
| "step": 1403 |
| }, |
| { |
| "epoch": 4.529886914378029, |
| "grad_norm": 0.16182787716388702, |
| "learning_rate": 1.4574971815107104e-05, |
| "loss": 0.0423, |
| "step": 1404 |
| }, |
| { |
| "epoch": 4.533117932148627, |
| "grad_norm": 0.24003592133522034, |
| "learning_rate": 1.4570462232243518e-05, |
| "loss": 0.0554, |
| "step": 1405 |
| }, |
| { |
| "epoch": 4.536348949919224, |
| "grad_norm": 0.17051248252391815, |
| "learning_rate": 1.4565952649379933e-05, |
| "loss": 0.0368, |
| "step": 1406 |
| }, |
| { |
| "epoch": 4.539579967689822, |
| "grad_norm": 0.15054872632026672, |
| "learning_rate": 1.4561443066516347e-05, |
| "loss": 0.0341, |
| "step": 1407 |
| }, |
| { |
| "epoch": 4.54281098546042, |
| "grad_norm": 0.196214959025383, |
| "learning_rate": 1.4556933483652765e-05, |
| "loss": 0.0467, |
| "step": 1408 |
| }, |
| { |
| "epoch": 4.546042003231018, |
| "grad_norm": 0.27159518003463745, |
| "learning_rate": 1.455242390078918e-05, |
| "loss": 0.0621, |
| "step": 1409 |
| }, |
| { |
| "epoch": 4.549273021001616, |
| "grad_norm": 0.21749594807624817, |
| "learning_rate": 1.4547914317925594e-05, |
| "loss": 0.0442, |
| "step": 1410 |
| }, |
| { |
| "epoch": 4.552504038772213, |
| "grad_norm": 0.19181078672409058, |
| "learning_rate": 1.4543404735062008e-05, |
| "loss": 0.0425, |
| "step": 1411 |
| }, |
| { |
| "epoch": 4.555735056542811, |
| "grad_norm": 0.173475980758667, |
| "learning_rate": 1.4538895152198423e-05, |
| "loss": 0.0422, |
| "step": 1412 |
| }, |
| { |
| "epoch": 4.558966074313409, |
| "grad_norm": 0.21586476266384125, |
| "learning_rate": 1.4534385569334837e-05, |
| "loss": 0.0504, |
| "step": 1413 |
| }, |
| { |
| "epoch": 4.5621970920840065, |
| "grad_norm": 0.24450407922267914, |
| "learning_rate": 1.4529875986471253e-05, |
| "loss": 0.0519, |
| "step": 1414 |
| }, |
| { |
| "epoch": 4.565428109854604, |
| "grad_norm": 0.22287097573280334, |
| "learning_rate": 1.4525366403607667e-05, |
| "loss": 0.054, |
| "step": 1415 |
| }, |
| { |
| "epoch": 4.568659127625202, |
| "grad_norm": 0.24564214050769806, |
| "learning_rate": 1.4520856820744082e-05, |
| "loss": 0.0551, |
| "step": 1416 |
| }, |
| { |
| "epoch": 4.5718901453958, |
| "grad_norm": 0.2434011995792389, |
| "learning_rate": 1.4516347237880496e-05, |
| "loss": 0.056, |
| "step": 1417 |
| }, |
| { |
| "epoch": 4.575121163166397, |
| "grad_norm": 0.20210736989974976, |
| "learning_rate": 1.4511837655016912e-05, |
| "loss": 0.0396, |
| "step": 1418 |
| }, |
| { |
| "epoch": 4.578352180936995, |
| "grad_norm": 0.18828843533992767, |
| "learning_rate": 1.4507328072153327e-05, |
| "loss": 0.0445, |
| "step": 1419 |
| }, |
| { |
| "epoch": 4.581583198707593, |
| "grad_norm": 0.1726129651069641, |
| "learning_rate": 1.4502818489289743e-05, |
| "loss": 0.0385, |
| "step": 1420 |
| }, |
| { |
| "epoch": 4.584814216478191, |
| "grad_norm": 0.20119218528270721, |
| "learning_rate": 1.4498308906426157e-05, |
| "loss": 0.0502, |
| "step": 1421 |
| }, |
| { |
| "epoch": 4.5880452342487885, |
| "grad_norm": 0.22063620388507843, |
| "learning_rate": 1.4493799323562572e-05, |
| "loss": 0.0509, |
| "step": 1422 |
| }, |
| { |
| "epoch": 4.591276252019386, |
| "grad_norm": 0.21817119419574738, |
| "learning_rate": 1.4489289740698986e-05, |
| "loss": 0.0505, |
| "step": 1423 |
| }, |
| { |
| "epoch": 4.594507269789984, |
| "grad_norm": 0.19823548197746277, |
| "learning_rate": 1.44847801578354e-05, |
| "loss": 0.0436, |
| "step": 1424 |
| }, |
| { |
| "epoch": 4.597738287560581, |
| "grad_norm": 0.21510590612888336, |
| "learning_rate": 1.4480270574971815e-05, |
| "loss": 0.0555, |
| "step": 1425 |
| }, |
| { |
| "epoch": 4.597738287560581, |
| "eval_loss": 0.0736919641494751, |
| "eval_runtime": 188.3275, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 1425 |
| }, |
| { |
| "epoch": 4.600969305331179, |
| "grad_norm": 0.229070246219635, |
| "learning_rate": 1.4475760992108233e-05, |
| "loss": 0.0496, |
| "step": 1426 |
| }, |
| { |
| "epoch": 4.604200323101777, |
| "grad_norm": 0.18254131078720093, |
| "learning_rate": 1.4471251409244647e-05, |
| "loss": 0.0475, |
| "step": 1427 |
| }, |
| { |
| "epoch": 4.607431340872375, |
| "grad_norm": 0.19957689940929413, |
| "learning_rate": 1.4466741826381061e-05, |
| "loss": 0.0443, |
| "step": 1428 |
| }, |
| { |
| "epoch": 4.610662358642973, |
| "grad_norm": 0.23266592621803284, |
| "learning_rate": 1.4462232243517476e-05, |
| "loss": 0.0526, |
| "step": 1429 |
| }, |
| { |
| "epoch": 4.613893376413571, |
| "grad_norm": 0.1473768949508667, |
| "learning_rate": 1.445772266065389e-05, |
| "loss": 0.0342, |
| "step": 1430 |
| }, |
| { |
| "epoch": 4.617124394184168, |
| "grad_norm": 0.1650787740945816, |
| "learning_rate": 1.4453213077790304e-05, |
| "loss": 0.0408, |
| "step": 1431 |
| }, |
| { |
| "epoch": 4.620355411954765, |
| "grad_norm": 0.3259841799736023, |
| "learning_rate": 1.444870349492672e-05, |
| "loss": 0.0476, |
| "step": 1432 |
| }, |
| { |
| "epoch": 4.623586429725363, |
| "grad_norm": 0.1688832938671112, |
| "learning_rate": 1.4444193912063137e-05, |
| "loss": 0.037, |
| "step": 1433 |
| }, |
| { |
| "epoch": 4.626817447495961, |
| "grad_norm": 0.19549787044525146, |
| "learning_rate": 1.4439684329199551e-05, |
| "loss": 0.0537, |
| "step": 1434 |
| }, |
| { |
| "epoch": 4.630048465266559, |
| "grad_norm": 0.2425767183303833, |
| "learning_rate": 1.4435174746335965e-05, |
| "loss": 0.0621, |
| "step": 1435 |
| }, |
| { |
| "epoch": 4.633279483037157, |
| "grad_norm": 0.13551421463489532, |
| "learning_rate": 1.443066516347238e-05, |
| "loss": 0.0307, |
| "step": 1436 |
| }, |
| { |
| "epoch": 4.636510500807755, |
| "grad_norm": 0.21688656508922577, |
| "learning_rate": 1.4426155580608794e-05, |
| "loss": 0.0486, |
| "step": 1437 |
| }, |
| { |
| "epoch": 4.639741518578353, |
| "grad_norm": 0.2006089985370636, |
| "learning_rate": 1.442164599774521e-05, |
| "loss": 0.0449, |
| "step": 1438 |
| }, |
| { |
| "epoch": 4.64297253634895, |
| "grad_norm": 0.1951628178358078, |
| "learning_rate": 1.4417136414881625e-05, |
| "loss": 0.0483, |
| "step": 1439 |
| }, |
| { |
| "epoch": 4.646203554119547, |
| "grad_norm": 0.1714273989200592, |
| "learning_rate": 1.4412626832018039e-05, |
| "loss": 0.0407, |
| "step": 1440 |
| }, |
| { |
| "epoch": 4.649434571890145, |
| "grad_norm": 0.17770501971244812, |
| "learning_rate": 1.4408117249154453e-05, |
| "loss": 0.042, |
| "step": 1441 |
| }, |
| { |
| "epoch": 4.652665589660743, |
| "grad_norm": 0.17861397564411163, |
| "learning_rate": 1.4403607666290868e-05, |
| "loss": 0.0431, |
| "step": 1442 |
| }, |
| { |
| "epoch": 4.655896607431341, |
| "grad_norm": 0.1716032773256302, |
| "learning_rate": 1.4399098083427284e-05, |
| "loss": 0.0394, |
| "step": 1443 |
| }, |
| { |
| "epoch": 4.659127625201939, |
| "grad_norm": 0.17270030081272125, |
| "learning_rate": 1.43945885005637e-05, |
| "loss": 0.0433, |
| "step": 1444 |
| }, |
| { |
| "epoch": 4.662358642972537, |
| "grad_norm": 0.16006852686405182, |
| "learning_rate": 1.4390078917700114e-05, |
| "loss": 0.0362, |
| "step": 1445 |
| }, |
| { |
| "epoch": 4.665589660743134, |
| "grad_norm": 0.16748811304569244, |
| "learning_rate": 1.4385569334836529e-05, |
| "loss": 0.0371, |
| "step": 1446 |
| }, |
| { |
| "epoch": 4.668820678513732, |
| "grad_norm": 0.1422804743051529, |
| "learning_rate": 1.4381059751972943e-05, |
| "loss": 0.0328, |
| "step": 1447 |
| }, |
| { |
| "epoch": 4.6720516962843295, |
| "grad_norm": 0.16988115012645721, |
| "learning_rate": 1.4376550169109358e-05, |
| "loss": 0.0391, |
| "step": 1448 |
| }, |
| { |
| "epoch": 4.675282714054927, |
| "grad_norm": 0.19109609723091125, |
| "learning_rate": 1.4372040586245772e-05, |
| "loss": 0.0449, |
| "step": 1449 |
| }, |
| { |
| "epoch": 4.678513731825525, |
| "grad_norm": 0.19361594319343567, |
| "learning_rate": 1.436753100338219e-05, |
| "loss": 0.0466, |
| "step": 1450 |
| }, |
| { |
| "epoch": 4.681744749596123, |
| "grad_norm": 0.16704949736595154, |
| "learning_rate": 1.4363021420518604e-05, |
| "loss": 0.0374, |
| "step": 1451 |
| }, |
| { |
| "epoch": 4.684975767366721, |
| "grad_norm": 0.3024942874908447, |
| "learning_rate": 1.4358511837655019e-05, |
| "loss": 0.0415, |
| "step": 1452 |
| }, |
| { |
| "epoch": 4.688206785137318, |
| "grad_norm": 0.20888644456863403, |
| "learning_rate": 1.4354002254791433e-05, |
| "loss": 0.0436, |
| "step": 1453 |
| }, |
| { |
| "epoch": 4.691437802907916, |
| "grad_norm": 0.20239083468914032, |
| "learning_rate": 1.4349492671927847e-05, |
| "loss": 0.0427, |
| "step": 1454 |
| }, |
| { |
| "epoch": 4.694668820678514, |
| "grad_norm": 0.18035829067230225, |
| "learning_rate": 1.4344983089064262e-05, |
| "loss": 0.0416, |
| "step": 1455 |
| }, |
| { |
| "epoch": 4.6978998384491115, |
| "grad_norm": 0.20017199218273163, |
| "learning_rate": 1.4340473506200678e-05, |
| "loss": 0.0532, |
| "step": 1456 |
| }, |
| { |
| "epoch": 4.701130856219709, |
| "grad_norm": 0.1918788105249405, |
| "learning_rate": 1.4335963923337092e-05, |
| "loss": 0.0445, |
| "step": 1457 |
| }, |
| { |
| "epoch": 4.704361873990307, |
| "grad_norm": 0.2361784428358078, |
| "learning_rate": 1.4331454340473507e-05, |
| "loss": 0.0637, |
| "step": 1458 |
| }, |
| { |
| "epoch": 4.707592891760905, |
| "grad_norm": 0.2567603588104248, |
| "learning_rate": 1.4326944757609923e-05, |
| "loss": 0.0445, |
| "step": 1459 |
| }, |
| { |
| "epoch": 4.710823909531502, |
| "grad_norm": 0.2610633969306946, |
| "learning_rate": 1.4322435174746337e-05, |
| "loss": 0.0591, |
| "step": 1460 |
| }, |
| { |
| "epoch": 4.7140549273021, |
| "grad_norm": 0.17344635725021362, |
| "learning_rate": 1.4317925591882753e-05, |
| "loss": 0.0355, |
| "step": 1461 |
| }, |
| { |
| "epoch": 4.717285945072698, |
| "grad_norm": 0.18580083549022675, |
| "learning_rate": 1.4313416009019167e-05, |
| "loss": 0.0445, |
| "step": 1462 |
| }, |
| { |
| "epoch": 4.720516962843296, |
| "grad_norm": 0.13654324412345886, |
| "learning_rate": 1.4308906426155582e-05, |
| "loss": 0.0345, |
| "step": 1463 |
| }, |
| { |
| "epoch": 4.723747980613894, |
| "grad_norm": 0.20222207903862, |
| "learning_rate": 1.4304396843291996e-05, |
| "loss": 0.0475, |
| "step": 1464 |
| }, |
| { |
| "epoch": 4.726978998384491, |
| "grad_norm": 0.15386740863323212, |
| "learning_rate": 1.429988726042841e-05, |
| "loss": 0.0375, |
| "step": 1465 |
| }, |
| { |
| "epoch": 4.730210016155089, |
| "grad_norm": 0.15888769924640656, |
| "learning_rate": 1.4295377677564825e-05, |
| "loss": 0.0354, |
| "step": 1466 |
| }, |
| { |
| "epoch": 4.733441033925686, |
| "grad_norm": 0.15721705555915833, |
| "learning_rate": 1.4290868094701243e-05, |
| "loss": 0.0395, |
| "step": 1467 |
| }, |
| { |
| "epoch": 4.736672051696284, |
| "grad_norm": 0.22503730654716492, |
| "learning_rate": 1.4286358511837657e-05, |
| "loss": 0.0522, |
| "step": 1468 |
| }, |
| { |
| "epoch": 4.739903069466882, |
| "grad_norm": 0.1799791008234024, |
| "learning_rate": 1.4281848928974072e-05, |
| "loss": 0.0417, |
| "step": 1469 |
| }, |
| { |
| "epoch": 4.74313408723748, |
| "grad_norm": 0.18177564442157745, |
| "learning_rate": 1.4277339346110486e-05, |
| "loss": 0.0428, |
| "step": 1470 |
| }, |
| { |
| "epoch": 4.746365105008078, |
| "grad_norm": 0.17222186923027039, |
| "learning_rate": 1.42728297632469e-05, |
| "loss": 0.0372, |
| "step": 1471 |
| }, |
| { |
| "epoch": 4.749596122778676, |
| "grad_norm": 0.1819365918636322, |
| "learning_rate": 1.4268320180383315e-05, |
| "loss": 0.0411, |
| "step": 1472 |
| }, |
| { |
| "epoch": 4.7528271405492735, |
| "grad_norm": 0.2186632603406906, |
| "learning_rate": 1.4263810597519731e-05, |
| "loss": 0.0495, |
| "step": 1473 |
| }, |
| { |
| "epoch": 4.75605815831987, |
| "grad_norm": 0.20079995691776276, |
| "learning_rate": 1.4259301014656147e-05, |
| "loss": 0.0442, |
| "step": 1474 |
| }, |
| { |
| "epoch": 4.759289176090468, |
| "grad_norm": 0.17578662931919098, |
| "learning_rate": 1.4254791431792561e-05, |
| "loss": 0.0446, |
| "step": 1475 |
| }, |
| { |
| "epoch": 4.762520193861066, |
| "grad_norm": 0.17604367434978485, |
| "learning_rate": 1.4250281848928976e-05, |
| "loss": 0.0425, |
| "step": 1476 |
| }, |
| { |
| "epoch": 4.765751211631664, |
| "grad_norm": 0.17798492312431335, |
| "learning_rate": 1.424577226606539e-05, |
| "loss": 0.0398, |
| "step": 1477 |
| }, |
| { |
| "epoch": 4.768982229402262, |
| "grad_norm": 0.2289198935031891, |
| "learning_rate": 1.4241262683201805e-05, |
| "loss": 0.0531, |
| "step": 1478 |
| }, |
| { |
| "epoch": 4.77221324717286, |
| "grad_norm": 0.2402426302433014, |
| "learning_rate": 1.423675310033822e-05, |
| "loss": 0.0563, |
| "step": 1479 |
| }, |
| { |
| "epoch": 4.775444264943458, |
| "grad_norm": 0.21412646770477295, |
| "learning_rate": 1.4232243517474635e-05, |
| "loss": 0.049, |
| "step": 1480 |
| }, |
| { |
| "epoch": 4.778675282714055, |
| "grad_norm": 0.16775807738304138, |
| "learning_rate": 1.422773393461105e-05, |
| "loss": 0.0378, |
| "step": 1481 |
| }, |
| { |
| "epoch": 4.7819063004846525, |
| "grad_norm": 0.23936647176742554, |
| "learning_rate": 1.4223224351747464e-05, |
| "loss": 0.056, |
| "step": 1482 |
| }, |
| { |
| "epoch": 4.78513731825525, |
| "grad_norm": 0.19123949110507965, |
| "learning_rate": 1.4218714768883878e-05, |
| "loss": 0.0448, |
| "step": 1483 |
| }, |
| { |
| "epoch": 4.788368336025848, |
| "grad_norm": 0.1810356080532074, |
| "learning_rate": 1.4214205186020294e-05, |
| "loss": 0.0475, |
| "step": 1484 |
| }, |
| { |
| "epoch": 4.791599353796446, |
| "grad_norm": 0.13476230204105377, |
| "learning_rate": 1.420969560315671e-05, |
| "loss": 0.0332, |
| "step": 1485 |
| }, |
| { |
| "epoch": 4.794830371567044, |
| "grad_norm": 0.14813260734081268, |
| "learning_rate": 1.4205186020293125e-05, |
| "loss": 0.0333, |
| "step": 1486 |
| }, |
| { |
| "epoch": 4.798061389337642, |
| "grad_norm": 0.20241865515708923, |
| "learning_rate": 1.4200676437429539e-05, |
| "loss": 0.0467, |
| "step": 1487 |
| }, |
| { |
| "epoch": 4.801292407108239, |
| "grad_norm": 0.161521315574646, |
| "learning_rate": 1.4196166854565953e-05, |
| "loss": 0.0376, |
| "step": 1488 |
| }, |
| { |
| "epoch": 4.804523424878837, |
| "grad_norm": 0.22518181800842285, |
| "learning_rate": 1.4191657271702368e-05, |
| "loss": 0.0525, |
| "step": 1489 |
| }, |
| { |
| "epoch": 4.8077544426494345, |
| "grad_norm": 0.18322886526584625, |
| "learning_rate": 1.4187147688838782e-05, |
| "loss": 0.0457, |
| "step": 1490 |
| }, |
| { |
| "epoch": 4.810985460420032, |
| "grad_norm": 0.1902959942817688, |
| "learning_rate": 1.41826381059752e-05, |
| "loss": 0.0418, |
| "step": 1491 |
| }, |
| { |
| "epoch": 4.81421647819063, |
| "grad_norm": 0.228154256939888, |
| "learning_rate": 1.4178128523111614e-05, |
| "loss": 0.0527, |
| "step": 1492 |
| }, |
| { |
| "epoch": 4.817447495961228, |
| "grad_norm": 0.17104311287403107, |
| "learning_rate": 1.4173618940248029e-05, |
| "loss": 0.0404, |
| "step": 1493 |
| }, |
| { |
| "epoch": 4.820678513731826, |
| "grad_norm": 0.21469609439373016, |
| "learning_rate": 1.4169109357384443e-05, |
| "loss": 0.0453, |
| "step": 1494 |
| }, |
| { |
| "epoch": 4.823909531502423, |
| "grad_norm": 0.1559404581785202, |
| "learning_rate": 1.4164599774520858e-05, |
| "loss": 0.0368, |
| "step": 1495 |
| }, |
| { |
| "epoch": 4.827140549273021, |
| "grad_norm": 0.2028602957725525, |
| "learning_rate": 1.4160090191657272e-05, |
| "loss": 0.0433, |
| "step": 1496 |
| }, |
| { |
| "epoch": 4.830371567043619, |
| "grad_norm": 0.20709873735904694, |
| "learning_rate": 1.4155580608793688e-05, |
| "loss": 0.0495, |
| "step": 1497 |
| }, |
| { |
| "epoch": 4.833602584814217, |
| "grad_norm": 0.2308061271905899, |
| "learning_rate": 1.4151071025930102e-05, |
| "loss": 0.0575, |
| "step": 1498 |
| }, |
| { |
| "epoch": 4.836833602584814, |
| "grad_norm": 0.20048439502716064, |
| "learning_rate": 1.4146561443066517e-05, |
| "loss": 0.0464, |
| "step": 1499 |
| }, |
| { |
| "epoch": 4.840064620355412, |
| "grad_norm": 0.20464977622032166, |
| "learning_rate": 1.4142051860202933e-05, |
| "loss": 0.0488, |
| "step": 1500 |
| }, |
| { |
| "epoch": 4.840064620355412, |
| "eval_loss": 0.07371273636817932, |
| "eval_runtime": 188.2872, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 1500 |
| }, |
| { |
| "epoch": 4.84329563812601, |
| "grad_norm": 0.22736908495426178, |
| "learning_rate": 1.4137542277339347e-05, |
| "loss": 0.0509, |
| "step": 1501 |
| }, |
| { |
| "epoch": 4.846526655896607, |
| "grad_norm": 0.19624264538288116, |
| "learning_rate": 1.4133032694475762e-05, |
| "loss": 0.0446, |
| "step": 1502 |
| }, |
| { |
| "epoch": 4.849757673667205, |
| "grad_norm": 0.1996881365776062, |
| "learning_rate": 1.4128523111612178e-05, |
| "loss": 0.0505, |
| "step": 1503 |
| }, |
| { |
| "epoch": 4.852988691437803, |
| "grad_norm": 0.19069544970989227, |
| "learning_rate": 1.4124013528748592e-05, |
| "loss": 0.0427, |
| "step": 1504 |
| }, |
| { |
| "epoch": 4.856219709208401, |
| "grad_norm": 0.24699148535728455, |
| "learning_rate": 1.4119503945885007e-05, |
| "loss": 0.0478, |
| "step": 1505 |
| }, |
| { |
| "epoch": 4.859450726978999, |
| "grad_norm": 0.25318363308906555, |
| "learning_rate": 1.4114994363021421e-05, |
| "loss": 0.0585, |
| "step": 1506 |
| }, |
| { |
| "epoch": 4.8626817447495965, |
| "grad_norm": 0.1774936318397522, |
| "learning_rate": 1.4110484780157835e-05, |
| "loss": 0.043, |
| "step": 1507 |
| }, |
| { |
| "epoch": 4.865912762520194, |
| "grad_norm": 0.27946534752845764, |
| "learning_rate": 1.410597519729425e-05, |
| "loss": 0.0562, |
| "step": 1508 |
| }, |
| { |
| "epoch": 4.869143780290791, |
| "grad_norm": 0.19256210327148438, |
| "learning_rate": 1.4101465614430668e-05, |
| "loss": 0.0444, |
| "step": 1509 |
| }, |
| { |
| "epoch": 4.872374798061389, |
| "grad_norm": 0.1564561426639557, |
| "learning_rate": 1.4096956031567082e-05, |
| "loss": 0.0363, |
| "step": 1510 |
| }, |
| { |
| "epoch": 4.875605815831987, |
| "grad_norm": 0.24548828601837158, |
| "learning_rate": 1.4092446448703496e-05, |
| "loss": 0.0558, |
| "step": 1511 |
| }, |
| { |
| "epoch": 4.878836833602585, |
| "grad_norm": 0.24670550227165222, |
| "learning_rate": 1.408793686583991e-05, |
| "loss": 0.0551, |
| "step": 1512 |
| }, |
| { |
| "epoch": 4.882067851373183, |
| "grad_norm": 0.2852325141429901, |
| "learning_rate": 1.4083427282976325e-05, |
| "loss": 0.0656, |
| "step": 1513 |
| }, |
| { |
| "epoch": 4.885298869143781, |
| "grad_norm": 0.18148425221443176, |
| "learning_rate": 1.407891770011274e-05, |
| "loss": 0.0405, |
| "step": 1514 |
| }, |
| { |
| "epoch": 4.8885298869143785, |
| "grad_norm": 0.26084426045417786, |
| "learning_rate": 1.4074408117249157e-05, |
| "loss": 0.0514, |
| "step": 1515 |
| }, |
| { |
| "epoch": 4.8917609046849755, |
| "grad_norm": 0.14365969598293304, |
| "learning_rate": 1.4069898534385572e-05, |
| "loss": 0.0332, |
| "step": 1516 |
| }, |
| { |
| "epoch": 4.894991922455573, |
| "grad_norm": 0.2067614197731018, |
| "learning_rate": 1.4065388951521986e-05, |
| "loss": 0.0422, |
| "step": 1517 |
| }, |
| { |
| "epoch": 4.898222940226171, |
| "grad_norm": 0.24304218590259552, |
| "learning_rate": 1.40608793686584e-05, |
| "loss": 0.0546, |
| "step": 1518 |
| }, |
| { |
| "epoch": 4.901453957996769, |
| "grad_norm": 0.19038884341716766, |
| "learning_rate": 1.4056369785794815e-05, |
| "loss": 0.0411, |
| "step": 1519 |
| }, |
| { |
| "epoch": 4.904684975767367, |
| "grad_norm": 0.18037162721157074, |
| "learning_rate": 1.405186020293123e-05, |
| "loss": 0.0385, |
| "step": 1520 |
| }, |
| { |
| "epoch": 4.907915993537965, |
| "grad_norm": 0.21943694353103638, |
| "learning_rate": 1.4047350620067645e-05, |
| "loss": 0.047, |
| "step": 1521 |
| }, |
| { |
| "epoch": 4.911147011308563, |
| "grad_norm": 0.22996261715888977, |
| "learning_rate": 1.404284103720406e-05, |
| "loss": 0.0489, |
| "step": 1522 |
| }, |
| { |
| "epoch": 4.91437802907916, |
| "grad_norm": 0.28095847368240356, |
| "learning_rate": 1.4038331454340474e-05, |
| "loss": 0.0552, |
| "step": 1523 |
| }, |
| { |
| "epoch": 4.9176090468497575, |
| "grad_norm": 0.2073894888162613, |
| "learning_rate": 1.4033821871476888e-05, |
| "loss": 0.0431, |
| "step": 1524 |
| }, |
| { |
| "epoch": 4.920840064620355, |
| "grad_norm": 0.17864856123924255, |
| "learning_rate": 1.4029312288613305e-05, |
| "loss": 0.0384, |
| "step": 1525 |
| }, |
| { |
| "epoch": 4.924071082390953, |
| "grad_norm": 0.22148238122463226, |
| "learning_rate": 1.4024802705749719e-05, |
| "loss": 0.0463, |
| "step": 1526 |
| }, |
| { |
| "epoch": 4.927302100161551, |
| "grad_norm": 0.2151981145143509, |
| "learning_rate": 1.4020293122886135e-05, |
| "loss": 0.0503, |
| "step": 1527 |
| }, |
| { |
| "epoch": 4.930533117932149, |
| "grad_norm": 0.22114239633083344, |
| "learning_rate": 1.401578354002255e-05, |
| "loss": 0.0513, |
| "step": 1528 |
| }, |
| { |
| "epoch": 4.933764135702747, |
| "grad_norm": 0.1930968314409256, |
| "learning_rate": 1.4011273957158964e-05, |
| "loss": 0.0457, |
| "step": 1529 |
| }, |
| { |
| "epoch": 4.936995153473344, |
| "grad_norm": 0.18273063004016876, |
| "learning_rate": 1.4006764374295378e-05, |
| "loss": 0.0359, |
| "step": 1530 |
| }, |
| { |
| "epoch": 4.940226171243942, |
| "grad_norm": 0.22627362608909607, |
| "learning_rate": 1.4002254791431793e-05, |
| "loss": 0.0504, |
| "step": 1531 |
| }, |
| { |
| "epoch": 4.94345718901454, |
| "grad_norm": 0.23651792109012604, |
| "learning_rate": 1.3997745208568207e-05, |
| "loss": 0.0553, |
| "step": 1532 |
| }, |
| { |
| "epoch": 4.946688206785137, |
| "grad_norm": 0.26761332154273987, |
| "learning_rate": 1.3993235625704625e-05, |
| "loss": 0.0484, |
| "step": 1533 |
| }, |
| { |
| "epoch": 4.949919224555735, |
| "grad_norm": 0.20963138341903687, |
| "learning_rate": 1.398872604284104e-05, |
| "loss": 0.048, |
| "step": 1534 |
| }, |
| { |
| "epoch": 4.953150242326333, |
| "grad_norm": 0.24256834387779236, |
| "learning_rate": 1.3984216459977454e-05, |
| "loss": 0.0499, |
| "step": 1535 |
| }, |
| { |
| "epoch": 4.956381260096931, |
| "grad_norm": 0.20121239125728607, |
| "learning_rate": 1.3979706877113868e-05, |
| "loss": 0.0436, |
| "step": 1536 |
| }, |
| { |
| "epoch": 4.959612277867528, |
| "grad_norm": 0.1495743989944458, |
| "learning_rate": 1.3975197294250282e-05, |
| "loss": 0.0311, |
| "step": 1537 |
| }, |
| { |
| "epoch": 4.962843295638126, |
| "grad_norm": 0.21080492436885834, |
| "learning_rate": 1.3970687711386697e-05, |
| "loss": 0.0406, |
| "step": 1538 |
| }, |
| { |
| "epoch": 4.966074313408724, |
| "grad_norm": 0.1968107521533966, |
| "learning_rate": 1.3966178128523113e-05, |
| "loss": 0.0394, |
| "step": 1539 |
| }, |
| { |
| "epoch": 4.969305331179322, |
| "grad_norm": 0.18444286286830902, |
| "learning_rate": 1.3961668545659527e-05, |
| "loss": 0.0394, |
| "step": 1540 |
| }, |
| { |
| "epoch": 4.9725363489499195, |
| "grad_norm": 0.24130775034427643, |
| "learning_rate": 1.3957158962795943e-05, |
| "loss": 0.0428, |
| "step": 1541 |
| }, |
| { |
| "epoch": 4.975767366720517, |
| "grad_norm": 0.26179346442222595, |
| "learning_rate": 1.3952649379932358e-05, |
| "loss": 0.056, |
| "step": 1542 |
| }, |
| { |
| "epoch": 4.978998384491114, |
| "grad_norm": 0.21392664313316345, |
| "learning_rate": 1.3948139797068772e-05, |
| "loss": 0.049, |
| "step": 1543 |
| }, |
| { |
| "epoch": 4.982229402261712, |
| "grad_norm": 0.19963374733924866, |
| "learning_rate": 1.3943630214205188e-05, |
| "loss": 0.0462, |
| "step": 1544 |
| }, |
| { |
| "epoch": 4.98546042003231, |
| "grad_norm": 0.18436281383037567, |
| "learning_rate": 1.3939120631341603e-05, |
| "loss": 0.0352, |
| "step": 1545 |
| }, |
| { |
| "epoch": 4.988691437802908, |
| "grad_norm": 0.16987344622612, |
| "learning_rate": 1.3934611048478017e-05, |
| "loss": 0.0376, |
| "step": 1546 |
| }, |
| { |
| "epoch": 4.991922455573506, |
| "grad_norm": 0.24175439774990082, |
| "learning_rate": 1.3930101465614431e-05, |
| "loss": 0.0528, |
| "step": 1547 |
| }, |
| { |
| "epoch": 4.995153473344104, |
| "grad_norm": 0.21086211502552032, |
| "learning_rate": 1.3925591882750846e-05, |
| "loss": 0.0406, |
| "step": 1548 |
| }, |
| { |
| "epoch": 4.9983844911147015, |
| "grad_norm": 0.2292601764202118, |
| "learning_rate": 1.392108229988726e-05, |
| "loss": 0.0488, |
| "step": 1549 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.21926479041576385, |
| "learning_rate": 1.3916572717023678e-05, |
| "loss": 0.0296, |
| "step": 1550 |
| }, |
| { |
| "epoch": 5.003231017770598, |
| "grad_norm": 0.20973849296569824, |
| "learning_rate": 1.3912063134160092e-05, |
| "loss": 0.0449, |
| "step": 1551 |
| }, |
| { |
| "epoch": 5.006462035541196, |
| "grad_norm": 0.1923273652791977, |
| "learning_rate": 1.3907553551296507e-05, |
| "loss": 0.0375, |
| "step": 1552 |
| }, |
| { |
| "epoch": 5.009693053311794, |
| "grad_norm": 0.22117039561271667, |
| "learning_rate": 1.3903043968432921e-05, |
| "loss": 0.0456, |
| "step": 1553 |
| }, |
| { |
| "epoch": 5.012924071082391, |
| "grad_norm": 0.18669717013835907, |
| "learning_rate": 1.3898534385569335e-05, |
| "loss": 0.0403, |
| "step": 1554 |
| }, |
| { |
| "epoch": 5.016155088852988, |
| "grad_norm": 0.1996999830007553, |
| "learning_rate": 1.389402480270575e-05, |
| "loss": 0.0434, |
| "step": 1555 |
| }, |
| { |
| "epoch": 5.019386106623586, |
| "grad_norm": 0.1715063899755478, |
| "learning_rate": 1.3889515219842168e-05, |
| "loss": 0.0344, |
| "step": 1556 |
| }, |
| { |
| "epoch": 5.022617124394184, |
| "grad_norm": 0.20772938430309296, |
| "learning_rate": 1.3885005636978582e-05, |
| "loss": 0.0389, |
| "step": 1557 |
| }, |
| { |
| "epoch": 5.025848142164782, |
| "grad_norm": 0.23174318671226501, |
| "learning_rate": 1.3880496054114996e-05, |
| "loss": 0.0394, |
| "step": 1558 |
| }, |
| { |
| "epoch": 5.02907915993538, |
| "grad_norm": 0.1777268350124359, |
| "learning_rate": 1.387598647125141e-05, |
| "loss": 0.0316, |
| "step": 1559 |
| }, |
| { |
| "epoch": 5.032310177705978, |
| "grad_norm": 0.24886149168014526, |
| "learning_rate": 1.3871476888387825e-05, |
| "loss": 0.0471, |
| "step": 1560 |
| }, |
| { |
| "epoch": 5.035541195476575, |
| "grad_norm": 0.18290841579437256, |
| "learning_rate": 1.386696730552424e-05, |
| "loss": 0.0377, |
| "step": 1561 |
| }, |
| { |
| "epoch": 5.038772213247173, |
| "grad_norm": 0.18155437707901, |
| "learning_rate": 1.3862457722660656e-05, |
| "loss": 0.0343, |
| "step": 1562 |
| }, |
| { |
| "epoch": 5.0420032310177705, |
| "grad_norm": 0.19238632917404175, |
| "learning_rate": 1.385794813979707e-05, |
| "loss": 0.0326, |
| "step": 1563 |
| }, |
| { |
| "epoch": 5.045234248788368, |
| "grad_norm": 0.23981189727783203, |
| "learning_rate": 1.3853438556933484e-05, |
| "loss": 0.0416, |
| "step": 1564 |
| }, |
| { |
| "epoch": 5.048465266558966, |
| "grad_norm": 0.20015032589435577, |
| "learning_rate": 1.3848928974069899e-05, |
| "loss": 0.0375, |
| "step": 1565 |
| }, |
| { |
| "epoch": 5.051696284329564, |
| "grad_norm": 0.21805520355701447, |
| "learning_rate": 1.3844419391206315e-05, |
| "loss": 0.0389, |
| "step": 1566 |
| }, |
| { |
| "epoch": 5.054927302100162, |
| "grad_norm": 0.2618580758571625, |
| "learning_rate": 1.383990980834273e-05, |
| "loss": 0.0447, |
| "step": 1567 |
| }, |
| { |
| "epoch": 5.058158319870759, |
| "grad_norm": 0.21499024331569672, |
| "learning_rate": 1.3835400225479145e-05, |
| "loss": 0.0321, |
| "step": 1568 |
| }, |
| { |
| "epoch": 5.061389337641357, |
| "grad_norm": 0.23732180893421173, |
| "learning_rate": 1.383089064261556e-05, |
| "loss": 0.0408, |
| "step": 1569 |
| }, |
| { |
| "epoch": 5.064620355411955, |
| "grad_norm": 0.3351098597049713, |
| "learning_rate": 1.3826381059751974e-05, |
| "loss": 0.0563, |
| "step": 1570 |
| }, |
| { |
| "epoch": 5.0678513731825525, |
| "grad_norm": 0.2283506691455841, |
| "learning_rate": 1.3821871476888389e-05, |
| "loss": 0.0361, |
| "step": 1571 |
| }, |
| { |
| "epoch": 5.07108239095315, |
| "grad_norm": 0.2626391649246216, |
| "learning_rate": 1.3817361894024803e-05, |
| "loss": 0.043, |
| "step": 1572 |
| }, |
| { |
| "epoch": 5.074313408723748, |
| "grad_norm": 0.2048622965812683, |
| "learning_rate": 1.3812852311161217e-05, |
| "loss": 0.0335, |
| "step": 1573 |
| }, |
| { |
| "epoch": 5.077544426494346, |
| "grad_norm": 0.21693314611911774, |
| "learning_rate": 1.3808342728297635e-05, |
| "loss": 0.0358, |
| "step": 1574 |
| }, |
| { |
| "epoch": 5.080775444264943, |
| "grad_norm": 0.253025621175766, |
| "learning_rate": 1.380383314543405e-05, |
| "loss": 0.0424, |
| "step": 1575 |
| }, |
| { |
| "epoch": 5.080775444264943, |
| "eval_loss": 0.07800468057394028, |
| "eval_runtime": 188.2256, |
| "eval_samples_per_second": 1.047, |
| "eval_steps_per_second": 1.047, |
| "step": 1575 |
| }, |
| { |
| "epoch": 5.084006462035541, |
| "grad_norm": 0.21952030062675476, |
| "learning_rate": 1.3799323562570464e-05, |
| "loss": 0.037, |
| "step": 1576 |
| }, |
| { |
| "epoch": 5.087237479806139, |
| "grad_norm": 0.22614319622516632, |
| "learning_rate": 1.3794813979706878e-05, |
| "loss": 0.0361, |
| "step": 1577 |
| }, |
| { |
| "epoch": 5.090468497576737, |
| "grad_norm": 0.27146396040916443, |
| "learning_rate": 1.3790304396843293e-05, |
| "loss": 0.0476, |
| "step": 1578 |
| }, |
| { |
| "epoch": 5.093699515347335, |
| "grad_norm": 0.18099626898765564, |
| "learning_rate": 1.3785794813979707e-05, |
| "loss": 0.0296, |
| "step": 1579 |
| }, |
| { |
| "epoch": 5.096930533117932, |
| "grad_norm": 0.2244144231081009, |
| "learning_rate": 1.3781285231116123e-05, |
| "loss": 0.0408, |
| "step": 1580 |
| }, |
| { |
| "epoch": 5.10016155088853, |
| "grad_norm": 0.17479568719863892, |
| "learning_rate": 1.3776775648252538e-05, |
| "loss": 0.0283, |
| "step": 1581 |
| }, |
| { |
| "epoch": 5.103392568659127, |
| "grad_norm": 0.27667051553726196, |
| "learning_rate": 1.3772266065388954e-05, |
| "loss": 0.0489, |
| "step": 1582 |
| }, |
| { |
| "epoch": 5.106623586429725, |
| "grad_norm": 0.2122027575969696, |
| "learning_rate": 1.3767756482525368e-05, |
| "loss": 0.0371, |
| "step": 1583 |
| }, |
| { |
| "epoch": 5.109854604200323, |
| "grad_norm": 0.21383260190486908, |
| "learning_rate": 1.3763246899661782e-05, |
| "loss": 0.0355, |
| "step": 1584 |
| }, |
| { |
| "epoch": 5.113085621970921, |
| "grad_norm": 0.20885974168777466, |
| "learning_rate": 1.3758737316798197e-05, |
| "loss": 0.035, |
| "step": 1585 |
| }, |
| { |
| "epoch": 5.116316639741519, |
| "grad_norm": 0.20157365500926971, |
| "learning_rate": 1.3754227733934613e-05, |
| "loss": 0.0324, |
| "step": 1586 |
| }, |
| { |
| "epoch": 5.119547657512117, |
| "grad_norm": 0.22047610580921173, |
| "learning_rate": 1.3749718151071027e-05, |
| "loss": 0.0363, |
| "step": 1587 |
| }, |
| { |
| "epoch": 5.1227786752827145, |
| "grad_norm": 0.23512761294841766, |
| "learning_rate": 1.3745208568207442e-05, |
| "loss": 0.0375, |
| "step": 1588 |
| }, |
| { |
| "epoch": 5.1260096930533114, |
| "grad_norm": 0.20714382827281952, |
| "learning_rate": 1.3740698985343856e-05, |
| "loss": 0.0389, |
| "step": 1589 |
| }, |
| { |
| "epoch": 5.129240710823909, |
| "grad_norm": 0.25658270716667175, |
| "learning_rate": 1.373618940248027e-05, |
| "loss": 0.0438, |
| "step": 1590 |
| }, |
| { |
| "epoch": 5.132471728594507, |
| "grad_norm": 0.24508136510849, |
| "learning_rate": 1.3731679819616685e-05, |
| "loss": 0.0391, |
| "step": 1591 |
| }, |
| { |
| "epoch": 5.135702746365105, |
| "grad_norm": 0.21929700672626495, |
| "learning_rate": 1.3727170236753103e-05, |
| "loss": 0.0404, |
| "step": 1592 |
| }, |
| { |
| "epoch": 5.138933764135703, |
| "grad_norm": 0.22846420109272003, |
| "learning_rate": 1.3722660653889517e-05, |
| "loss": 0.0387, |
| "step": 1593 |
| }, |
| { |
| "epoch": 5.142164781906301, |
| "grad_norm": 0.2193041890859604, |
| "learning_rate": 1.3718151071025931e-05, |
| "loss": 0.0406, |
| "step": 1594 |
| }, |
| { |
| "epoch": 5.145395799676899, |
| "grad_norm": 0.2604195475578308, |
| "learning_rate": 1.3713641488162346e-05, |
| "loss": 0.0458, |
| "step": 1595 |
| }, |
| { |
| "epoch": 5.148626817447496, |
| "grad_norm": 0.2732478976249695, |
| "learning_rate": 1.370913190529876e-05, |
| "loss": 0.0432, |
| "step": 1596 |
| }, |
| { |
| "epoch": 5.1518578352180935, |
| "grad_norm": 0.19719639420509338, |
| "learning_rate": 1.3704622322435175e-05, |
| "loss": 0.0319, |
| "step": 1597 |
| }, |
| { |
| "epoch": 5.155088852988691, |
| "grad_norm": 0.2675303816795349, |
| "learning_rate": 1.3700112739571592e-05, |
| "loss": 0.0423, |
| "step": 1598 |
| }, |
| { |
| "epoch": 5.158319870759289, |
| "grad_norm": 0.29542332887649536, |
| "learning_rate": 1.3695603156708007e-05, |
| "loss": 0.0503, |
| "step": 1599 |
| }, |
| { |
| "epoch": 5.161550888529887, |
| "grad_norm": 0.2875734865665436, |
| "learning_rate": 1.3691093573844421e-05, |
| "loss": 0.05, |
| "step": 1600 |
| }, |
| { |
| "epoch": 5.164781906300485, |
| "grad_norm": 0.21129800379276276, |
| "learning_rate": 1.3686583990980835e-05, |
| "loss": 0.0357, |
| "step": 1601 |
| }, |
| { |
| "epoch": 5.168012924071083, |
| "grad_norm": 0.20211546123027802, |
| "learning_rate": 1.368207440811725e-05, |
| "loss": 0.0361, |
| "step": 1602 |
| }, |
| { |
| "epoch": 5.17124394184168, |
| "grad_norm": 0.2640823423862457, |
| "learning_rate": 1.3677564825253664e-05, |
| "loss": 0.0443, |
| "step": 1603 |
| }, |
| { |
| "epoch": 5.174474959612278, |
| "grad_norm": 0.18760652840137482, |
| "learning_rate": 1.367305524239008e-05, |
| "loss": 0.0344, |
| "step": 1604 |
| }, |
| { |
| "epoch": 5.1777059773828755, |
| "grad_norm": 0.21234560012817383, |
| "learning_rate": 1.3668545659526495e-05, |
| "loss": 0.0357, |
| "step": 1605 |
| }, |
| { |
| "epoch": 5.180936995153473, |
| "grad_norm": 0.2700323760509491, |
| "learning_rate": 1.3664036076662909e-05, |
| "loss": 0.043, |
| "step": 1606 |
| }, |
| { |
| "epoch": 5.184168012924071, |
| "grad_norm": 0.23098574578762054, |
| "learning_rate": 1.3659526493799325e-05, |
| "loss": 0.0393, |
| "step": 1607 |
| }, |
| { |
| "epoch": 5.187399030694669, |
| "grad_norm": 0.256625771522522, |
| "learning_rate": 1.365501691093574e-05, |
| "loss": 0.0454, |
| "step": 1608 |
| }, |
| { |
| "epoch": 5.190630048465267, |
| "grad_norm": 0.21986429393291473, |
| "learning_rate": 1.3650507328072154e-05, |
| "loss": 0.0368, |
| "step": 1609 |
| }, |
| { |
| "epoch": 5.193861066235864, |
| "grad_norm": 0.23181846737861633, |
| "learning_rate": 1.364599774520857e-05, |
| "loss": 0.0364, |
| "step": 1610 |
| }, |
| { |
| "epoch": 5.197092084006462, |
| "grad_norm": 0.21644949913024902, |
| "learning_rate": 1.3641488162344984e-05, |
| "loss": 0.0357, |
| "step": 1611 |
| }, |
| { |
| "epoch": 5.20032310177706, |
| "grad_norm": 0.18156535923480988, |
| "learning_rate": 1.3636978579481399e-05, |
| "loss": 0.0319, |
| "step": 1612 |
| }, |
| { |
| "epoch": 5.203554119547658, |
| "grad_norm": 0.21721328794956207, |
| "learning_rate": 1.3632468996617813e-05, |
| "loss": 0.0382, |
| "step": 1613 |
| }, |
| { |
| "epoch": 5.206785137318255, |
| "grad_norm": 0.1720905750989914, |
| "learning_rate": 1.3627959413754228e-05, |
| "loss": 0.0292, |
| "step": 1614 |
| }, |
| { |
| "epoch": 5.210016155088853, |
| "grad_norm": 0.20595358312129974, |
| "learning_rate": 1.3623449830890642e-05, |
| "loss": 0.0377, |
| "step": 1615 |
| }, |
| { |
| "epoch": 5.21324717285945, |
| "grad_norm": 0.20645761489868164, |
| "learning_rate": 1.361894024802706e-05, |
| "loss": 0.034, |
| "step": 1616 |
| }, |
| { |
| "epoch": 5.216478190630048, |
| "grad_norm": 0.20638407766819, |
| "learning_rate": 1.3614430665163474e-05, |
| "loss": 0.0355, |
| "step": 1617 |
| }, |
| { |
| "epoch": 5.219709208400646, |
| "grad_norm": 0.1973501741886139, |
| "learning_rate": 1.3609921082299889e-05, |
| "loss": 0.0343, |
| "step": 1618 |
| }, |
| { |
| "epoch": 5.222940226171244, |
| "grad_norm": 0.2543327212333679, |
| "learning_rate": 1.3605411499436303e-05, |
| "loss": 0.046, |
| "step": 1619 |
| }, |
| { |
| "epoch": 5.226171243941842, |
| "grad_norm": 0.23155446350574493, |
| "learning_rate": 1.3600901916572717e-05, |
| "loss": 0.0379, |
| "step": 1620 |
| }, |
| { |
| "epoch": 5.22940226171244, |
| "grad_norm": 0.20217300951480865, |
| "learning_rate": 1.3596392333709132e-05, |
| "loss": 0.0375, |
| "step": 1621 |
| }, |
| { |
| "epoch": 5.2326332794830375, |
| "grad_norm": 0.2119446098804474, |
| "learning_rate": 1.3591882750845548e-05, |
| "loss": 0.0317, |
| "step": 1622 |
| }, |
| { |
| "epoch": 5.2358642972536344, |
| "grad_norm": 0.23808224499225616, |
| "learning_rate": 1.3587373167981964e-05, |
| "loss": 0.0426, |
| "step": 1623 |
| }, |
| { |
| "epoch": 5.239095315024232, |
| "grad_norm": 0.20590466260910034, |
| "learning_rate": 1.3582863585118378e-05, |
| "loss": 0.0338, |
| "step": 1624 |
| }, |
| { |
| "epoch": 5.24232633279483, |
| "grad_norm": 0.18591848015785217, |
| "learning_rate": 1.3578354002254793e-05, |
| "loss": 0.0314, |
| "step": 1625 |
| }, |
| { |
| "epoch": 5.245557350565428, |
| "grad_norm": 0.20926275849342346, |
| "learning_rate": 1.3573844419391207e-05, |
| "loss": 0.0356, |
| "step": 1626 |
| }, |
| { |
| "epoch": 5.248788368336026, |
| "grad_norm": 0.264517605304718, |
| "learning_rate": 1.3569334836527623e-05, |
| "loss": 0.0417, |
| "step": 1627 |
| }, |
| { |
| "epoch": 5.252019386106624, |
| "grad_norm": 0.28868338465690613, |
| "learning_rate": 1.3564825253664038e-05, |
| "loss": 0.0451, |
| "step": 1628 |
| }, |
| { |
| "epoch": 5.255250403877222, |
| "grad_norm": 0.2451377958059311, |
| "learning_rate": 1.3560315670800452e-05, |
| "loss": 0.0392, |
| "step": 1629 |
| }, |
| { |
| "epoch": 5.258481421647819, |
| "grad_norm": 0.27181971073150635, |
| "learning_rate": 1.3555806087936866e-05, |
| "loss": 0.0429, |
| "step": 1630 |
| }, |
| { |
| "epoch": 5.2617124394184165, |
| "grad_norm": 0.20383009314537048, |
| "learning_rate": 1.355129650507328e-05, |
| "loss": 0.0358, |
| "step": 1631 |
| }, |
| { |
| "epoch": 5.264943457189014, |
| "grad_norm": 0.18881487846374512, |
| "learning_rate": 1.3546786922209695e-05, |
| "loss": 0.0323, |
| "step": 1632 |
| }, |
| { |
| "epoch": 5.268174474959612, |
| "grad_norm": 0.2717234790325165, |
| "learning_rate": 1.3542277339346113e-05, |
| "loss": 0.0414, |
| "step": 1633 |
| }, |
| { |
| "epoch": 5.27140549273021, |
| "grad_norm": 0.23408828675746918, |
| "learning_rate": 1.3537767756482527e-05, |
| "loss": 0.0362, |
| "step": 1634 |
| }, |
| { |
| "epoch": 5.274636510500808, |
| "grad_norm": 0.21253463625907898, |
| "learning_rate": 1.3533258173618942e-05, |
| "loss": 0.0355, |
| "step": 1635 |
| }, |
| { |
| "epoch": 5.277867528271406, |
| "grad_norm": 0.20734728872776031, |
| "learning_rate": 1.3528748590755356e-05, |
| "loss": 0.0337, |
| "step": 1636 |
| }, |
| { |
| "epoch": 5.281098546042003, |
| "grad_norm": 0.20818033814430237, |
| "learning_rate": 1.352423900789177e-05, |
| "loss": 0.0344, |
| "step": 1637 |
| }, |
| { |
| "epoch": 5.284329563812601, |
| "grad_norm": 0.17857208847999573, |
| "learning_rate": 1.3519729425028185e-05, |
| "loss": 0.0294, |
| "step": 1638 |
| }, |
| { |
| "epoch": 5.2875605815831985, |
| "grad_norm": 0.22621624171733856, |
| "learning_rate": 1.3515219842164603e-05, |
| "loss": 0.0357, |
| "step": 1639 |
| }, |
| { |
| "epoch": 5.290791599353796, |
| "grad_norm": 0.31244394183158875, |
| "learning_rate": 1.3510710259301017e-05, |
| "loss": 0.0505, |
| "step": 1640 |
| }, |
| { |
| "epoch": 5.294022617124394, |
| "grad_norm": 0.24572983384132385, |
| "learning_rate": 1.3506200676437431e-05, |
| "loss": 0.0397, |
| "step": 1641 |
| }, |
| { |
| "epoch": 5.297253634894992, |
| "grad_norm": 0.24053926765918732, |
| "learning_rate": 1.3501691093573846e-05, |
| "loss": 0.0387, |
| "step": 1642 |
| }, |
| { |
| "epoch": 5.30048465266559, |
| "grad_norm": 0.2668500542640686, |
| "learning_rate": 1.349718151071026e-05, |
| "loss": 0.0426, |
| "step": 1643 |
| }, |
| { |
| "epoch": 5.303715670436187, |
| "grad_norm": 0.20142191648483276, |
| "learning_rate": 1.3492671927846675e-05, |
| "loss": 0.0319, |
| "step": 1644 |
| }, |
| { |
| "epoch": 5.306946688206785, |
| "grad_norm": 0.20248070359230042, |
| "learning_rate": 1.348816234498309e-05, |
| "loss": 0.0317, |
| "step": 1645 |
| }, |
| { |
| "epoch": 5.310177705977383, |
| "grad_norm": 0.2531336843967438, |
| "learning_rate": 1.3483652762119505e-05, |
| "loss": 0.0459, |
| "step": 1646 |
| }, |
| { |
| "epoch": 5.313408723747981, |
| "grad_norm": 0.2778225243091583, |
| "learning_rate": 1.347914317925592e-05, |
| "loss": 0.0423, |
| "step": 1647 |
| }, |
| { |
| "epoch": 5.316639741518578, |
| "grad_norm": 0.2862214744091034, |
| "learning_rate": 1.3474633596392336e-05, |
| "loss": 0.0478, |
| "step": 1648 |
| }, |
| { |
| "epoch": 5.319870759289176, |
| "grad_norm": 0.2593616247177124, |
| "learning_rate": 1.347012401352875e-05, |
| "loss": 0.0415, |
| "step": 1649 |
| }, |
| { |
| "epoch": 5.323101777059774, |
| "grad_norm": 0.23617789149284363, |
| "learning_rate": 1.3465614430665164e-05, |
| "loss": 0.0434, |
| "step": 1650 |
| }, |
| { |
| "epoch": 5.323101777059774, |
| "eval_loss": 0.07877334207296371, |
| "eval_runtime": 188.2656, |
| "eval_samples_per_second": 1.046, |
| "eval_steps_per_second": 1.046, |
| "step": 1650 |
| }, |
| { |
| "epoch": 5.326332794830371, |
| "grad_norm": 0.2762896716594696, |
| "learning_rate": 1.346110484780158e-05, |
| "loss": 0.0433, |
| "step": 1651 |
| }, |
| { |
| "epoch": 5.329563812600969, |
| "grad_norm": 0.35530713200569153, |
| "learning_rate": 1.3456595264937995e-05, |
| "loss": 0.0631, |
| "step": 1652 |
| }, |
| { |
| "epoch": 5.332794830371567, |
| "grad_norm": 0.15821903944015503, |
| "learning_rate": 1.345208568207441e-05, |
| "loss": 0.0258, |
| "step": 1653 |
| }, |
| { |
| "epoch": 5.336025848142165, |
| "grad_norm": 0.2661025822162628, |
| "learning_rate": 1.3447576099210824e-05, |
| "loss": 0.0382, |
| "step": 1654 |
| }, |
| { |
| "epoch": 5.339256865912763, |
| "grad_norm": 0.21156413853168488, |
| "learning_rate": 1.3443066516347238e-05, |
| "loss": 0.0338, |
| "step": 1655 |
| }, |
| { |
| "epoch": 5.3424878836833605, |
| "grad_norm": 0.23887784779071808, |
| "learning_rate": 1.3438556933483652e-05, |
| "loss": 0.0364, |
| "step": 1656 |
| }, |
| { |
| "epoch": 5.345718901453958, |
| "grad_norm": 0.23699849843978882, |
| "learning_rate": 1.343404735062007e-05, |
| "loss": 0.0305, |
| "step": 1657 |
| }, |
| { |
| "epoch": 5.348949919224555, |
| "grad_norm": 0.26554080843925476, |
| "learning_rate": 1.3429537767756485e-05, |
| "loss": 0.0412, |
| "step": 1658 |
| }, |
| { |
| "epoch": 5.352180936995153, |
| "grad_norm": 0.3125143051147461, |
| "learning_rate": 1.3425028184892899e-05, |
| "loss": 0.0452, |
| "step": 1659 |
| }, |
| { |
| "epoch": 5.355411954765751, |
| "grad_norm": 0.24472004175186157, |
| "learning_rate": 1.3420518602029313e-05, |
| "loss": 0.0375, |
| "step": 1660 |
| }, |
| { |
| "epoch": 5.358642972536349, |
| "grad_norm": 0.23264792561531067, |
| "learning_rate": 1.3416009019165728e-05, |
| "loss": 0.0375, |
| "step": 1661 |
| }, |
| { |
| "epoch": 5.361873990306947, |
| "grad_norm": 0.29110607504844666, |
| "learning_rate": 1.3411499436302142e-05, |
| "loss": 0.0494, |
| "step": 1662 |
| }, |
| { |
| "epoch": 5.365105008077545, |
| "grad_norm": 0.20962031185626984, |
| "learning_rate": 1.3406989853438558e-05, |
| "loss": 0.0321, |
| "step": 1663 |
| }, |
| { |
| "epoch": 5.3683360258481425, |
| "grad_norm": 0.2336059957742691, |
| "learning_rate": 1.3402480270574974e-05, |
| "loss": 0.0402, |
| "step": 1664 |
| }, |
| { |
| "epoch": 5.3715670436187395, |
| "grad_norm": 0.21446309983730316, |
| "learning_rate": 1.3397970687711389e-05, |
| "loss": 0.034, |
| "step": 1665 |
| }, |
| { |
| "epoch": 5.374798061389337, |
| "grad_norm": 0.22718128561973572, |
| "learning_rate": 1.3393461104847803e-05, |
| "loss": 0.036, |
| "step": 1666 |
| }, |
| { |
| "epoch": 5.378029079159935, |
| "grad_norm": 0.16423138976097107, |
| "learning_rate": 1.3388951521984217e-05, |
| "loss": 0.0267, |
| "step": 1667 |
| }, |
| { |
| "epoch": 5.381260096930533, |
| "grad_norm": 0.1919136494398117, |
| "learning_rate": 1.3384441939120632e-05, |
| "loss": 0.0324, |
| "step": 1668 |
| }, |
| { |
| "epoch": 5.384491114701131, |
| "grad_norm": 0.26571834087371826, |
| "learning_rate": 1.3379932356257048e-05, |
| "loss": 0.0347, |
| "step": 1669 |
| }, |
| { |
| "epoch": 5.387722132471729, |
| "grad_norm": 0.2542237937450409, |
| "learning_rate": 1.3375422773393462e-05, |
| "loss": 0.0421, |
| "step": 1670 |
| }, |
| { |
| "epoch": 5.390953150242327, |
| "grad_norm": 0.2651587724685669, |
| "learning_rate": 1.3370913190529877e-05, |
| "loss": 0.0458, |
| "step": 1671 |
| }, |
| { |
| "epoch": 5.394184168012924, |
| "grad_norm": 0.2586331069469452, |
| "learning_rate": 1.3366403607666291e-05, |
| "loss": 0.038, |
| "step": 1672 |
| }, |
| { |
| "epoch": 5.3974151857835215, |
| "grad_norm": 0.22627052664756775, |
| "learning_rate": 1.3361894024802705e-05, |
| "loss": 0.0367, |
| "step": 1673 |
| }, |
| { |
| "epoch": 5.400646203554119, |
| "grad_norm": 0.2537485659122467, |
| "learning_rate": 1.3357384441939122e-05, |
| "loss": 0.0427, |
| "step": 1674 |
| }, |
| { |
| "epoch": 5.403877221324717, |
| "grad_norm": 0.17138533294200897, |
| "learning_rate": 1.3352874859075538e-05, |
| "loss": 0.0273, |
| "step": 1675 |
| }, |
| { |
| "epoch": 5.407108239095315, |
| "grad_norm": 0.20749658346176147, |
| "learning_rate": 1.3348365276211952e-05, |
| "loss": 0.0347, |
| "step": 1676 |
| }, |
| { |
| "epoch": 5.410339256865913, |
| "grad_norm": 0.2693939507007599, |
| "learning_rate": 1.3343855693348366e-05, |
| "loss": 0.0429, |
| "step": 1677 |
| }, |
| { |
| "epoch": 5.413570274636511, |
| "grad_norm": 0.2280496209859848, |
| "learning_rate": 1.333934611048478e-05, |
| "loss": 0.0362, |
| "step": 1678 |
| }, |
| { |
| "epoch": 5.416801292407108, |
| "grad_norm": 0.23109251260757446, |
| "learning_rate": 1.3334836527621195e-05, |
| "loss": 0.038, |
| "step": 1679 |
| }, |
| { |
| "epoch": 5.420032310177706, |
| "grad_norm": 0.2629033923149109, |
| "learning_rate": 1.333032694475761e-05, |
| "loss": 0.0428, |
| "step": 1680 |
| }, |
| { |
| "epoch": 5.423263327948304, |
| "grad_norm": 0.22949784994125366, |
| "learning_rate": 1.3325817361894027e-05, |
| "loss": 0.0377, |
| "step": 1681 |
| }, |
| { |
| "epoch": 5.426494345718901, |
| "grad_norm": 0.25165918469429016, |
| "learning_rate": 1.3321307779030442e-05, |
| "loss": 0.0382, |
| "step": 1682 |
| }, |
| { |
| "epoch": 5.429725363489499, |
| "grad_norm": 0.2764151394367218, |
| "learning_rate": 1.3316798196166856e-05, |
| "loss": 0.0456, |
| "step": 1683 |
| }, |
| { |
| "epoch": 5.432956381260097, |
| "grad_norm": 0.337446391582489, |
| "learning_rate": 1.331228861330327e-05, |
| "loss": 0.0548, |
| "step": 1684 |
| }, |
| { |
| "epoch": 5.436187399030695, |
| "grad_norm": 0.2689753472805023, |
| "learning_rate": 1.3307779030439685e-05, |
| "loss": 0.0291, |
| "step": 1685 |
| }, |
| { |
| "epoch": 5.439418416801292, |
| "grad_norm": 0.19691252708435059, |
| "learning_rate": 1.33032694475761e-05, |
| "loss": 0.0294, |
| "step": 1686 |
| }, |
| { |
| "epoch": 5.44264943457189, |
| "grad_norm": 0.1909419298171997, |
| "learning_rate": 1.3298759864712515e-05, |
| "loss": 0.0338, |
| "step": 1687 |
| }, |
| { |
| "epoch": 5.445880452342488, |
| "grad_norm": 0.2926692068576813, |
| "learning_rate": 1.329425028184893e-05, |
| "loss": 0.043, |
| "step": 1688 |
| }, |
| { |
| "epoch": 5.449111470113086, |
| "grad_norm": 0.24427159130573273, |
| "learning_rate": 1.3289740698985346e-05, |
| "loss": 0.0364, |
| "step": 1689 |
| }, |
| { |
| "epoch": 5.4523424878836835, |
| "grad_norm": 0.30178409814834595, |
| "learning_rate": 1.328523111612176e-05, |
| "loss": 0.0445, |
| "step": 1690 |
| }, |
| { |
| "epoch": 5.455573505654281, |
| "grad_norm": 0.2400818020105362, |
| "learning_rate": 1.3280721533258175e-05, |
| "loss": 0.0369, |
| "step": 1691 |
| }, |
| { |
| "epoch": 5.458804523424879, |
| "grad_norm": 0.2377413511276245, |
| "learning_rate": 1.3276211950394589e-05, |
| "loss": 0.0339, |
| "step": 1692 |
| }, |
| { |
| "epoch": 5.462035541195476, |
| "grad_norm": 0.2984737157821655, |
| "learning_rate": 1.3271702367531005e-05, |
| "loss": 0.0521, |
| "step": 1693 |
| }, |
| { |
| "epoch": 5.465266558966074, |
| "grad_norm": 0.241432785987854, |
| "learning_rate": 1.326719278466742e-05, |
| "loss": 0.0395, |
| "step": 1694 |
| }, |
| { |
| "epoch": 5.468497576736672, |
| "grad_norm": 0.2652961015701294, |
| "learning_rate": 1.3262683201803834e-05, |
| "loss": 0.0457, |
| "step": 1695 |
| }, |
| { |
| "epoch": 5.47172859450727, |
| "grad_norm": 0.2415868490934372, |
| "learning_rate": 1.3258173618940248e-05, |
| "loss": 0.0376, |
| "step": 1696 |
| }, |
| { |
| "epoch": 5.474959612277868, |
| "grad_norm": 0.25058040022850037, |
| "learning_rate": 1.3253664036076663e-05, |
| "loss": 0.0388, |
| "step": 1697 |
| }, |
| { |
| "epoch": 5.4781906300484655, |
| "grad_norm": 0.1926383078098297, |
| "learning_rate": 1.3249154453213077e-05, |
| "loss": 0.032, |
| "step": 1698 |
| }, |
| { |
| "epoch": 5.481421647819063, |
| "grad_norm": 0.27298471331596375, |
| "learning_rate": 1.3244644870349495e-05, |
| "loss": 0.0424, |
| "step": 1699 |
| }, |
| { |
| "epoch": 5.48465266558966, |
| "grad_norm": 0.27226123213768005, |
| "learning_rate": 1.324013528748591e-05, |
| "loss": 0.0466, |
| "step": 1700 |
| }, |
| { |
| "epoch": 5.487883683360258, |
| "grad_norm": 0.24531492590904236, |
| "learning_rate": 1.3235625704622324e-05, |
| "loss": 0.0463, |
| "step": 1701 |
| }, |
| { |
| "epoch": 5.491114701130856, |
| "grad_norm": 0.26881077885627747, |
| "learning_rate": 1.3231116121758738e-05, |
| "loss": 0.0411, |
| "step": 1702 |
| }, |
| { |
| "epoch": 5.494345718901454, |
| "grad_norm": 0.29036054015159607, |
| "learning_rate": 1.3226606538895152e-05, |
| "loss": 0.0495, |
| "step": 1703 |
| }, |
| { |
| "epoch": 5.497576736672052, |
| "grad_norm": 0.2871154844760895, |
| "learning_rate": 1.3222096956031567e-05, |
| "loss": 0.0401, |
| "step": 1704 |
| }, |
| { |
| "epoch": 5.50080775444265, |
| "grad_norm": 0.2216850370168686, |
| "learning_rate": 1.3217587373167985e-05, |
| "loss": 0.0295, |
| "step": 1705 |
| }, |
| { |
| "epoch": 5.5040387722132476, |
| "grad_norm": 0.17366330325603485, |
| "learning_rate": 1.3213077790304399e-05, |
| "loss": 0.0255, |
| "step": 1706 |
| }, |
| { |
| "epoch": 5.5072697899838445, |
| "grad_norm": 0.24311934411525726, |
| "learning_rate": 1.3208568207440813e-05, |
| "loss": 0.04, |
| "step": 1707 |
| }, |
| { |
| "epoch": 5.510500807754442, |
| "grad_norm": 0.29902347922325134, |
| "learning_rate": 1.3204058624577228e-05, |
| "loss": 0.0428, |
| "step": 1708 |
| }, |
| { |
| "epoch": 5.51373182552504, |
| "grad_norm": 0.2978745102882385, |
| "learning_rate": 1.3199549041713642e-05, |
| "loss": 0.0448, |
| "step": 1709 |
| }, |
| { |
| "epoch": 5.516962843295638, |
| "grad_norm": 0.26285281777381897, |
| "learning_rate": 1.3195039458850058e-05, |
| "loss": 0.0449, |
| "step": 1710 |
| }, |
| { |
| "epoch": 5.520193861066236, |
| "grad_norm": 0.2854604423046112, |
| "learning_rate": 1.3190529875986473e-05, |
| "loss": 0.0447, |
| "step": 1711 |
| }, |
| { |
| "epoch": 5.523424878836834, |
| "grad_norm": 0.28057965636253357, |
| "learning_rate": 1.3186020293122887e-05, |
| "loss": 0.0424, |
| "step": 1712 |
| }, |
| { |
| "epoch": 5.526655896607432, |
| "grad_norm": 0.21214771270751953, |
| "learning_rate": 1.3181510710259301e-05, |
| "loss": 0.0372, |
| "step": 1713 |
| }, |
| { |
| "epoch": 5.529886914378029, |
| "grad_norm": 0.30201148986816406, |
| "learning_rate": 1.3177001127395716e-05, |
| "loss": 0.048, |
| "step": 1714 |
| }, |
| { |
| "epoch": 5.533117932148627, |
| "grad_norm": 0.24792835116386414, |
| "learning_rate": 1.3172491544532132e-05, |
| "loss": 0.0355, |
| "step": 1715 |
| }, |
| { |
| "epoch": 5.536348949919224, |
| "grad_norm": 0.2251952588558197, |
| "learning_rate": 1.3167981961668548e-05, |
| "loss": 0.0332, |
| "step": 1716 |
| }, |
| { |
| "epoch": 5.539579967689822, |
| "grad_norm": 0.2598719000816345, |
| "learning_rate": 1.3163472378804962e-05, |
| "loss": 0.0441, |
| "step": 1717 |
| }, |
| { |
| "epoch": 5.54281098546042, |
| "grad_norm": 0.27209776639938354, |
| "learning_rate": 1.3158962795941377e-05, |
| "loss": 0.0399, |
| "step": 1718 |
| }, |
| { |
| "epoch": 5.546042003231018, |
| "grad_norm": 0.25826388597488403, |
| "learning_rate": 1.3154453213077791e-05, |
| "loss": 0.0372, |
| "step": 1719 |
| }, |
| { |
| "epoch": 5.549273021001616, |
| "grad_norm": 0.21490974724292755, |
| "learning_rate": 1.3149943630214205e-05, |
| "loss": 0.0342, |
| "step": 1720 |
| }, |
| { |
| "epoch": 5.552504038772213, |
| "grad_norm": 0.2208276242017746, |
| "learning_rate": 1.314543404735062e-05, |
| "loss": 0.0341, |
| "step": 1721 |
| }, |
| { |
| "epoch": 5.555735056542811, |
| "grad_norm": 0.24169307947158813, |
| "learning_rate": 1.3140924464487038e-05, |
| "loss": 0.0373, |
| "step": 1722 |
| }, |
| { |
| "epoch": 5.558966074313409, |
| "grad_norm": 0.2553875148296356, |
| "learning_rate": 1.3136414881623452e-05, |
| "loss": 0.0452, |
| "step": 1723 |
| }, |
| { |
| "epoch": 5.5621970920840065, |
| "grad_norm": 0.23106032609939575, |
| "learning_rate": 1.3131905298759866e-05, |
| "loss": 0.0383, |
| "step": 1724 |
| }, |
| { |
| "epoch": 5.565428109854604, |
| "grad_norm": 0.2005496621131897, |
| "learning_rate": 1.312739571589628e-05, |
| "loss": 0.0285, |
| "step": 1725 |
| }, |
| { |
| "epoch": 5.565428109854604, |
| "eval_loss": 0.07855533808469772, |
| "eval_runtime": 187.8564, |
| "eval_samples_per_second": 1.049, |
| "eval_steps_per_second": 1.049, |
| "step": 1725 |
| }, |
| { |
| "epoch": 5.568659127625202, |
| "grad_norm": 0.25991955399513245, |
| "learning_rate": 1.3122886133032695e-05, |
| "loss": 0.0421, |
| "step": 1726 |
| }, |
| { |
| "epoch": 5.5718901453958, |
| "grad_norm": 0.2951747477054596, |
| "learning_rate": 1.311837655016911e-05, |
| "loss": 0.0441, |
| "step": 1727 |
| }, |
| { |
| "epoch": 5.575121163166397, |
| "grad_norm": 0.1985751986503601, |
| "learning_rate": 1.3113866967305526e-05, |
| "loss": 0.032, |
| "step": 1728 |
| }, |
| { |
| "epoch": 5.578352180936995, |
| "grad_norm": 0.179523304104805, |
| "learning_rate": 1.310935738444194e-05, |
| "loss": 0.0312, |
| "step": 1729 |
| }, |
| { |
| "epoch": 5.581583198707593, |
| "grad_norm": 0.2811926007270813, |
| "learning_rate": 1.3104847801578356e-05, |
| "loss": 0.0389, |
| "step": 1730 |
| }, |
| { |
| "epoch": 5.584814216478191, |
| "grad_norm": 0.19230878353118896, |
| "learning_rate": 1.310033821871477e-05, |
| "loss": 0.0347, |
| "step": 1731 |
| }, |
| { |
| "epoch": 5.5880452342487885, |
| "grad_norm": 0.2499038726091385, |
| "learning_rate": 1.3095828635851185e-05, |
| "loss": 0.0358, |
| "step": 1732 |
| }, |
| { |
| "epoch": 5.591276252019386, |
| "grad_norm": 0.21598705649375916, |
| "learning_rate": 1.30913190529876e-05, |
| "loss": 0.0364, |
| "step": 1733 |
| }, |
| { |
| "epoch": 5.594507269789984, |
| "grad_norm": 0.2517814040184021, |
| "learning_rate": 1.3086809470124015e-05, |
| "loss": 0.0401, |
| "step": 1734 |
| }, |
| { |
| "epoch": 5.597738287560581, |
| "grad_norm": 0.2599017918109894, |
| "learning_rate": 1.308229988726043e-05, |
| "loss": 0.0408, |
| "step": 1735 |
| }, |
| { |
| "epoch": 5.600969305331179, |
| "grad_norm": 0.19673794507980347, |
| "learning_rate": 1.3077790304396844e-05, |
| "loss": 0.0277, |
| "step": 1736 |
| }, |
| { |
| "epoch": 5.604200323101777, |
| "grad_norm": 0.21040499210357666, |
| "learning_rate": 1.3073280721533259e-05, |
| "loss": 0.0361, |
| "step": 1737 |
| }, |
| { |
| "epoch": 5.607431340872375, |
| "grad_norm": 0.26807281374931335, |
| "learning_rate": 1.3068771138669673e-05, |
| "loss": 0.0416, |
| "step": 1738 |
| }, |
| { |
| "epoch": 5.610662358642973, |
| "grad_norm": 0.22095413506031036, |
| "learning_rate": 1.3064261555806087e-05, |
| "loss": 0.0366, |
| "step": 1739 |
| }, |
| { |
| "epoch": 5.613893376413571, |
| "grad_norm": 0.2717856764793396, |
| "learning_rate": 1.3059751972942505e-05, |
| "loss": 0.042, |
| "step": 1740 |
| }, |
| { |
| "epoch": 5.617124394184168, |
| "grad_norm": 0.2600835859775543, |
| "learning_rate": 1.305524239007892e-05, |
| "loss": 0.0422, |
| "step": 1741 |
| }, |
| { |
| "epoch": 5.620355411954765, |
| "grad_norm": 0.2875543236732483, |
| "learning_rate": 1.3050732807215334e-05, |
| "loss": 0.0464, |
| "step": 1742 |
| }, |
| { |
| "epoch": 5.623586429725363, |
| "grad_norm": 0.2859259843826294, |
| "learning_rate": 1.3046223224351748e-05, |
| "loss": 0.0419, |
| "step": 1743 |
| }, |
| { |
| "epoch": 5.626817447495961, |
| "grad_norm": 0.25900939106941223, |
| "learning_rate": 1.3041713641488163e-05, |
| "loss": 0.0389, |
| "step": 1744 |
| }, |
| { |
| "epoch": 5.630048465266559, |
| "grad_norm": 0.274265319108963, |
| "learning_rate": 1.3037204058624577e-05, |
| "loss": 0.0446, |
| "step": 1745 |
| }, |
| { |
| "epoch": 5.633279483037157, |
| "grad_norm": 0.2910583019256592, |
| "learning_rate": 1.3032694475760995e-05, |
| "loss": 0.0452, |
| "step": 1746 |
| }, |
| { |
| "epoch": 5.636510500807755, |
| "grad_norm": 0.20005176961421967, |
| "learning_rate": 1.302818489289741e-05, |
| "loss": 0.0347, |
| "step": 1747 |
| }, |
| { |
| "epoch": 5.639741518578353, |
| "grad_norm": 0.2649039328098297, |
| "learning_rate": 1.3023675310033824e-05, |
| "loss": 0.0413, |
| "step": 1748 |
| }, |
| { |
| "epoch": 5.64297253634895, |
| "grad_norm": 0.250976026058197, |
| "learning_rate": 1.3019165727170238e-05, |
| "loss": 0.0422, |
| "step": 1749 |
| }, |
| { |
| "epoch": 5.646203554119547, |
| "grad_norm": 0.25350484251976013, |
| "learning_rate": 1.3014656144306652e-05, |
| "loss": 0.0402, |
| "step": 1750 |
| }, |
| { |
| "epoch": 5.649434571890145, |
| "grad_norm": 0.19594550132751465, |
| "learning_rate": 1.3010146561443067e-05, |
| "loss": 0.0306, |
| "step": 1751 |
| }, |
| { |
| "epoch": 5.652665589660743, |
| "grad_norm": 0.24489079415798187, |
| "learning_rate": 1.3005636978579483e-05, |
| "loss": 0.0391, |
| "step": 1752 |
| }, |
| { |
| "epoch": 5.655896607431341, |
| "grad_norm": 0.27085795998573303, |
| "learning_rate": 1.3001127395715897e-05, |
| "loss": 0.0399, |
| "step": 1753 |
| }, |
| { |
| "epoch": 5.659127625201939, |
| "grad_norm": 0.18318359553813934, |
| "learning_rate": 1.2996617812852312e-05, |
| "loss": 0.0289, |
| "step": 1754 |
| }, |
| { |
| "epoch": 5.662358642972537, |
| "grad_norm": 0.22117024660110474, |
| "learning_rate": 1.2992108229988726e-05, |
| "loss": 0.033, |
| "step": 1755 |
| }, |
| { |
| "epoch": 5.665589660743134, |
| "grad_norm": 0.20710182189941406, |
| "learning_rate": 1.2987598647125142e-05, |
| "loss": 0.0293, |
| "step": 1756 |
| }, |
| { |
| "epoch": 5.668820678513732, |
| "grad_norm": 0.31142568588256836, |
| "learning_rate": 1.2983089064261557e-05, |
| "loss": 0.0415, |
| "step": 1757 |
| }, |
| { |
| "epoch": 5.6720516962843295, |
| "grad_norm": 0.20175360143184662, |
| "learning_rate": 1.2978579481397973e-05, |
| "loss": 0.031, |
| "step": 1758 |
| }, |
| { |
| "epoch": 5.675282714054927, |
| "grad_norm": 0.30963796377182007, |
| "learning_rate": 1.2974069898534387e-05, |
| "loss": 0.04, |
| "step": 1759 |
| }, |
| { |
| "epoch": 5.678513731825525, |
| "grad_norm": 0.2571542263031006, |
| "learning_rate": 1.2969560315670801e-05, |
| "loss": 0.0357, |
| "step": 1760 |
| }, |
| { |
| "epoch": 5.681744749596123, |
| "grad_norm": 0.2782554030418396, |
| "learning_rate": 1.2965050732807216e-05, |
| "loss": 0.0395, |
| "step": 1761 |
| }, |
| { |
| "epoch": 5.684975767366721, |
| "grad_norm": 0.1827014833688736, |
| "learning_rate": 1.296054114994363e-05, |
| "loss": 0.0285, |
| "step": 1762 |
| }, |
| { |
| "epoch": 5.688206785137318, |
| "grad_norm": 0.2578563392162323, |
| "learning_rate": 1.2956031567080045e-05, |
| "loss": 0.0454, |
| "step": 1763 |
| }, |
| { |
| "epoch": 5.691437802907916, |
| "grad_norm": 0.24724380671977997, |
| "learning_rate": 1.2951521984216462e-05, |
| "loss": 0.0379, |
| "step": 1764 |
| }, |
| { |
| "epoch": 5.694668820678514, |
| "grad_norm": 0.2129620909690857, |
| "learning_rate": 1.2947012401352877e-05, |
| "loss": 0.0333, |
| "step": 1765 |
| }, |
| { |
| "epoch": 5.6978998384491115, |
| "grad_norm": 0.3049379289150238, |
| "learning_rate": 1.2942502818489291e-05, |
| "loss": 0.0479, |
| "step": 1766 |
| }, |
| { |
| "epoch": 5.701130856219709, |
| "grad_norm": 0.20850899815559387, |
| "learning_rate": 1.2937993235625706e-05, |
| "loss": 0.0316, |
| "step": 1767 |
| }, |
| { |
| "epoch": 5.704361873990307, |
| "grad_norm": 0.38301345705986023, |
| "learning_rate": 1.293348365276212e-05, |
| "loss": 0.0603, |
| "step": 1768 |
| }, |
| { |
| "epoch": 5.707592891760905, |
| "grad_norm": 0.29469698667526245, |
| "learning_rate": 1.2928974069898534e-05, |
| "loss": 0.0403, |
| "step": 1769 |
| }, |
| { |
| "epoch": 5.710823909531502, |
| "grad_norm": 0.32916879653930664, |
| "learning_rate": 1.292446448703495e-05, |
| "loss": 0.0488, |
| "step": 1770 |
| }, |
| { |
| "epoch": 5.7140549273021, |
| "grad_norm": 0.26417621970176697, |
| "learning_rate": 1.2919954904171366e-05, |
| "loss": 0.0443, |
| "step": 1771 |
| }, |
| { |
| "epoch": 5.717285945072698, |
| "grad_norm": 0.2839740514755249, |
| "learning_rate": 1.2915445321307781e-05, |
| "loss": 0.0446, |
| "step": 1772 |
| }, |
| { |
| "epoch": 5.720516962843296, |
| "grad_norm": 0.2270142138004303, |
| "learning_rate": 1.2910935738444195e-05, |
| "loss": 0.0384, |
| "step": 1773 |
| }, |
| { |
| "epoch": 5.723747980613894, |
| "grad_norm": 0.22519277036190033, |
| "learning_rate": 1.290642615558061e-05, |
| "loss": 0.0397, |
| "step": 1774 |
| }, |
| { |
| "epoch": 5.726978998384491, |
| "grad_norm": 0.28247109055519104, |
| "learning_rate": 1.2901916572717024e-05, |
| "loss": 0.0449, |
| "step": 1775 |
| }, |
| { |
| "epoch": 5.730210016155089, |
| "grad_norm": 0.2766035795211792, |
| "learning_rate": 1.289740698985344e-05, |
| "loss": 0.0434, |
| "step": 1776 |
| }, |
| { |
| "epoch": 5.733441033925686, |
| "grad_norm": 0.2730564773082733, |
| "learning_rate": 1.2892897406989855e-05, |
| "loss": 0.0472, |
| "step": 1777 |
| }, |
| { |
| "epoch": 5.736672051696284, |
| "grad_norm": 0.32382893562316895, |
| "learning_rate": 1.2888387824126269e-05, |
| "loss": 0.0497, |
| "step": 1778 |
| }, |
| { |
| "epoch": 5.739903069466882, |
| "grad_norm": 0.2379397749900818, |
| "learning_rate": 1.2883878241262683e-05, |
| "loss": 0.037, |
| "step": 1779 |
| }, |
| { |
| "epoch": 5.74313408723748, |
| "grad_norm": 0.27886688709259033, |
| "learning_rate": 1.2879368658399098e-05, |
| "loss": 0.046, |
| "step": 1780 |
| }, |
| { |
| "epoch": 5.746365105008078, |
| "grad_norm": 0.3302753269672394, |
| "learning_rate": 1.2874859075535514e-05, |
| "loss": 0.0474, |
| "step": 1781 |
| }, |
| { |
| "epoch": 5.749596122778676, |
| "grad_norm": 0.2237986922264099, |
| "learning_rate": 1.287034949267193e-05, |
| "loss": 0.0347, |
| "step": 1782 |
| }, |
| { |
| "epoch": 5.7528271405492735, |
| "grad_norm": 0.23131723701953888, |
| "learning_rate": 1.2865839909808344e-05, |
| "loss": 0.0365, |
| "step": 1783 |
| }, |
| { |
| "epoch": 5.75605815831987, |
| "grad_norm": 0.29312634468078613, |
| "learning_rate": 1.2861330326944759e-05, |
| "loss": 0.0519, |
| "step": 1784 |
| }, |
| { |
| "epoch": 5.759289176090468, |
| "grad_norm": 0.24728724360466003, |
| "learning_rate": 1.2856820744081173e-05, |
| "loss": 0.0397, |
| "step": 1785 |
| }, |
| { |
| "epoch": 5.762520193861066, |
| "grad_norm": 0.2196582555770874, |
| "learning_rate": 1.2852311161217587e-05, |
| "loss": 0.0386, |
| "step": 1786 |
| }, |
| { |
| "epoch": 5.765751211631664, |
| "grad_norm": 0.253260999917984, |
| "learning_rate": 1.2847801578354002e-05, |
| "loss": 0.0383, |
| "step": 1787 |
| }, |
| { |
| "epoch": 5.768982229402262, |
| "grad_norm": 0.2176506668329239, |
| "learning_rate": 1.284329199549042e-05, |
| "loss": 0.0344, |
| "step": 1788 |
| }, |
| { |
| "epoch": 5.77221324717286, |
| "grad_norm": 0.20385535061359406, |
| "learning_rate": 1.2838782412626834e-05, |
| "loss": 0.0356, |
| "step": 1789 |
| }, |
| { |
| "epoch": 5.775444264943458, |
| "grad_norm": 0.2268013060092926, |
| "learning_rate": 1.2834272829763248e-05, |
| "loss": 0.0365, |
| "step": 1790 |
| }, |
| { |
| "epoch": 5.778675282714055, |
| "grad_norm": 0.24011680483818054, |
| "learning_rate": 1.2829763246899663e-05, |
| "loss": 0.0426, |
| "step": 1791 |
| }, |
| { |
| "epoch": 5.7819063004846525, |
| "grad_norm": 0.2475063055753708, |
| "learning_rate": 1.2825253664036077e-05, |
| "loss": 0.0376, |
| "step": 1792 |
| }, |
| { |
| "epoch": 5.78513731825525, |
| "grad_norm": 0.2665673494338989, |
| "learning_rate": 1.2820744081172493e-05, |
| "loss": 0.0406, |
| "step": 1793 |
| }, |
| { |
| "epoch": 5.788368336025848, |
| "grad_norm": 0.251534104347229, |
| "learning_rate": 1.2816234498308908e-05, |
| "loss": 0.0427, |
| "step": 1794 |
| }, |
| { |
| "epoch": 5.791599353796446, |
| "grad_norm": 0.22792288661003113, |
| "learning_rate": 1.2811724915445322e-05, |
| "loss": 0.0387, |
| "step": 1795 |
| }, |
| { |
| "epoch": 5.794830371567044, |
| "grad_norm": 0.23877018690109253, |
| "learning_rate": 1.2807215332581736e-05, |
| "loss": 0.0382, |
| "step": 1796 |
| }, |
| { |
| "epoch": 5.798061389337642, |
| "grad_norm": 0.1875104308128357, |
| "learning_rate": 1.2802705749718152e-05, |
| "loss": 0.0286, |
| "step": 1797 |
| }, |
| { |
| "epoch": 5.801292407108239, |
| "grad_norm": 0.25169655680656433, |
| "learning_rate": 1.2798196166854567e-05, |
| "loss": 0.0407, |
| "step": 1798 |
| }, |
| { |
| "epoch": 5.804523424878837, |
| "grad_norm": 0.3368035852909088, |
| "learning_rate": 1.2793686583990983e-05, |
| "loss": 0.0458, |
| "step": 1799 |
| }, |
| { |
| "epoch": 5.8077544426494345, |
| "grad_norm": 0.22736723721027374, |
| "learning_rate": 1.2789177001127397e-05, |
| "loss": 0.0332, |
| "step": 1800 |
| }, |
| { |
| "epoch": 5.8077544426494345, |
| "eval_loss": 0.07852450758218765, |
| "eval_runtime": 187.822, |
| "eval_samples_per_second": 1.049, |
| "eval_steps_per_second": 1.049, |
| "step": 1800 |
| }, |
| { |
| "epoch": 5.810985460420032, |
| "grad_norm": 0.3504641652107239, |
| "learning_rate": 1.2784667418263812e-05, |
| "loss": 0.048, |
| "step": 1801 |
| }, |
| { |
| "epoch": 5.81421647819063, |
| "grad_norm": 0.3084200322628021, |
| "learning_rate": 1.2780157835400226e-05, |
| "loss": 0.0429, |
| "step": 1802 |
| }, |
| { |
| "epoch": 5.817447495961228, |
| "grad_norm": 0.24561993777751923, |
| "learning_rate": 1.277564825253664e-05, |
| "loss": 0.0367, |
| "step": 1803 |
| }, |
| { |
| "epoch": 5.820678513731826, |
| "grad_norm": 0.2805800437927246, |
| "learning_rate": 1.2771138669673055e-05, |
| "loss": 0.0413, |
| "step": 1804 |
| }, |
| { |
| "epoch": 5.823909531502423, |
| "grad_norm": 0.3110480308532715, |
| "learning_rate": 1.2766629086809473e-05, |
| "loss": 0.0457, |
| "step": 1805 |
| }, |
| { |
| "epoch": 5.827140549273021, |
| "grad_norm": 0.23643557727336884, |
| "learning_rate": 1.2762119503945887e-05, |
| "loss": 0.0403, |
| "step": 1806 |
| }, |
| { |
| "epoch": 5.830371567043619, |
| "grad_norm": 0.26031965017318726, |
| "learning_rate": 1.2757609921082301e-05, |
| "loss": 0.0397, |
| "step": 1807 |
| }, |
| { |
| "epoch": 5.833602584814217, |
| "grad_norm": 0.2314675748348236, |
| "learning_rate": 1.2753100338218716e-05, |
| "loss": 0.0344, |
| "step": 1808 |
| }, |
| { |
| "epoch": 5.836833602584814, |
| "grad_norm": 0.1894425004720688, |
| "learning_rate": 1.274859075535513e-05, |
| "loss": 0.0286, |
| "step": 1809 |
| }, |
| { |
| "epoch": 5.840064620355412, |
| "grad_norm": 0.20770658552646637, |
| "learning_rate": 1.2744081172491545e-05, |
| "loss": 0.0315, |
| "step": 1810 |
| }, |
| { |
| "epoch": 5.84329563812601, |
| "grad_norm": 0.24325595796108246, |
| "learning_rate": 1.273957158962796e-05, |
| "loss": 0.0368, |
| "step": 1811 |
| }, |
| { |
| "epoch": 5.846526655896607, |
| "grad_norm": 0.23935092985630035, |
| "learning_rate": 1.2735062006764377e-05, |
| "loss": 0.034, |
| "step": 1812 |
| }, |
| { |
| "epoch": 5.849757673667205, |
| "grad_norm": 0.2605186402797699, |
| "learning_rate": 1.2730552423900791e-05, |
| "loss": 0.0424, |
| "step": 1813 |
| }, |
| { |
| "epoch": 5.852988691437803, |
| "grad_norm": 0.20698106288909912, |
| "learning_rate": 1.2726042841037206e-05, |
| "loss": 0.0309, |
| "step": 1814 |
| }, |
| { |
| "epoch": 5.856219709208401, |
| "grad_norm": 0.3079001009464264, |
| "learning_rate": 1.272153325817362e-05, |
| "loss": 0.0475, |
| "step": 1815 |
| }, |
| { |
| "epoch": 5.859450726978999, |
| "grad_norm": 0.22524310648441315, |
| "learning_rate": 1.2717023675310034e-05, |
| "loss": 0.0332, |
| "step": 1816 |
| }, |
| { |
| "epoch": 5.8626817447495965, |
| "grad_norm": 0.2669833302497864, |
| "learning_rate": 1.271251409244645e-05, |
| "loss": 0.0383, |
| "step": 1817 |
| }, |
| { |
| "epoch": 5.865912762520194, |
| "grad_norm": 0.2779400646686554, |
| "learning_rate": 1.2708004509582865e-05, |
| "loss": 0.0362, |
| "step": 1818 |
| }, |
| { |
| "epoch": 5.869143780290791, |
| "grad_norm": 0.23108670115470886, |
| "learning_rate": 1.270349492671928e-05, |
| "loss": 0.0397, |
| "step": 1819 |
| }, |
| { |
| "epoch": 5.872374798061389, |
| "grad_norm": 0.27856990694999695, |
| "learning_rate": 1.2698985343855694e-05, |
| "loss": 0.0424, |
| "step": 1820 |
| }, |
| { |
| "epoch": 5.875605815831987, |
| "grad_norm": 0.265458881855011, |
| "learning_rate": 1.2694475760992108e-05, |
| "loss": 0.0416, |
| "step": 1821 |
| }, |
| { |
| "epoch": 5.878836833602585, |
| "grad_norm": 0.2702803909778595, |
| "learning_rate": 1.2689966178128524e-05, |
| "loss": 0.0377, |
| "step": 1822 |
| }, |
| { |
| "epoch": 5.882067851373183, |
| "grad_norm": 0.2573676109313965, |
| "learning_rate": 1.268545659526494e-05, |
| "loss": 0.0445, |
| "step": 1823 |
| }, |
| { |
| "epoch": 5.885298869143781, |
| "grad_norm": 0.28648799657821655, |
| "learning_rate": 1.2680947012401355e-05, |
| "loss": 0.0471, |
| "step": 1824 |
| }, |
| { |
| "epoch": 5.8885298869143785, |
| "grad_norm": 0.27315300703048706, |
| "learning_rate": 1.2676437429537769e-05, |
| "loss": 0.0407, |
| "step": 1825 |
| }, |
| { |
| "epoch": 5.8917609046849755, |
| "grad_norm": 0.27942955493927, |
| "learning_rate": 1.2671927846674183e-05, |
| "loss": 0.037, |
| "step": 1826 |
| }, |
| { |
| "epoch": 5.894991922455573, |
| "grad_norm": 0.3186817169189453, |
| "learning_rate": 1.2667418263810598e-05, |
| "loss": 0.0515, |
| "step": 1827 |
| }, |
| { |
| "epoch": 5.898222940226171, |
| "grad_norm": 0.18832504749298096, |
| "learning_rate": 1.2662908680947012e-05, |
| "loss": 0.0265, |
| "step": 1828 |
| }, |
| { |
| "epoch": 5.901453957996769, |
| "grad_norm": 0.30080464482307434, |
| "learning_rate": 1.265839909808343e-05, |
| "loss": 0.0424, |
| "step": 1829 |
| }, |
| { |
| "epoch": 5.904684975767367, |
| "grad_norm": 0.22818855941295624, |
| "learning_rate": 1.2653889515219844e-05, |
| "loss": 0.0337, |
| "step": 1830 |
| }, |
| { |
| "epoch": 5.907915993537965, |
| "grad_norm": 0.2548898756504059, |
| "learning_rate": 1.2649379932356259e-05, |
| "loss": 0.039, |
| "step": 1831 |
| }, |
| { |
| "epoch": 5.911147011308563, |
| "grad_norm": 0.3003893792629242, |
| "learning_rate": 1.2644870349492673e-05, |
| "loss": 0.0459, |
| "step": 1832 |
| }, |
| { |
| "epoch": 5.91437802907916, |
| "grad_norm": 0.23449090123176575, |
| "learning_rate": 1.2640360766629087e-05, |
| "loss": 0.0341, |
| "step": 1833 |
| }, |
| { |
| "epoch": 5.9176090468497575, |
| "grad_norm": 0.26345452666282654, |
| "learning_rate": 1.2635851183765502e-05, |
| "loss": 0.0391, |
| "step": 1834 |
| }, |
| { |
| "epoch": 5.920840064620355, |
| "grad_norm": 0.20951145887374878, |
| "learning_rate": 1.2631341600901918e-05, |
| "loss": 0.032, |
| "step": 1835 |
| }, |
| { |
| "epoch": 5.924071082390953, |
| "grad_norm": 0.2048126459121704, |
| "learning_rate": 1.2626832018038332e-05, |
| "loss": 0.0294, |
| "step": 1836 |
| }, |
| { |
| "epoch": 5.927302100161551, |
| "grad_norm": 0.3434942662715912, |
| "learning_rate": 1.2622322435174747e-05, |
| "loss": 0.0492, |
| "step": 1837 |
| }, |
| { |
| "epoch": 5.930533117932149, |
| "grad_norm": 0.23535341024398804, |
| "learning_rate": 1.2617812852311163e-05, |
| "loss": 0.0391, |
| "step": 1838 |
| }, |
| { |
| "epoch": 5.933764135702747, |
| "grad_norm": 0.2758124768733978, |
| "learning_rate": 1.2613303269447577e-05, |
| "loss": 0.0405, |
| "step": 1839 |
| }, |
| { |
| "epoch": 5.936995153473344, |
| "grad_norm": 0.22967149317264557, |
| "learning_rate": 1.2608793686583992e-05, |
| "loss": 0.0329, |
| "step": 1840 |
| }, |
| { |
| "epoch": 5.940226171243942, |
| "grad_norm": 0.2670023739337921, |
| "learning_rate": 1.2604284103720408e-05, |
| "loss": 0.0395, |
| "step": 1841 |
| }, |
| { |
| "epoch": 5.94345718901454, |
| "grad_norm": 0.21399269998073578, |
| "learning_rate": 1.2599774520856822e-05, |
| "loss": 0.0297, |
| "step": 1842 |
| }, |
| { |
| "epoch": 5.946688206785137, |
| "grad_norm": 0.27811315655708313, |
| "learning_rate": 1.2595264937993236e-05, |
| "loss": 0.0447, |
| "step": 1843 |
| }, |
| { |
| "epoch": 5.949919224555735, |
| "grad_norm": 0.2337990254163742, |
| "learning_rate": 1.259075535512965e-05, |
| "loss": 0.0394, |
| "step": 1844 |
| }, |
| { |
| "epoch": 5.953150242326333, |
| "grad_norm": 0.2887449264526367, |
| "learning_rate": 1.2586245772266065e-05, |
| "loss": 0.0446, |
| "step": 1845 |
| }, |
| { |
| "epoch": 5.956381260096931, |
| "grad_norm": 0.2928435802459717, |
| "learning_rate": 1.258173618940248e-05, |
| "loss": 0.0461, |
| "step": 1846 |
| }, |
| { |
| "epoch": 5.959612277867528, |
| "grad_norm": 0.16092540323734283, |
| "learning_rate": 1.2577226606538897e-05, |
| "loss": 0.0271, |
| "step": 1847 |
| }, |
| { |
| "epoch": 5.962843295638126, |
| "grad_norm": 0.21880003809928894, |
| "learning_rate": 1.2572717023675312e-05, |
| "loss": 0.0287, |
| "step": 1848 |
| }, |
| { |
| "epoch": 5.966074313408724, |
| "grad_norm": 0.3079856038093567, |
| "learning_rate": 1.2568207440811726e-05, |
| "loss": 0.0431, |
| "step": 1849 |
| }, |
| { |
| "epoch": 5.969305331179322, |
| "grad_norm": 0.3066396713256836, |
| "learning_rate": 1.256369785794814e-05, |
| "loss": 0.0388, |
| "step": 1850 |
| }, |
| { |
| "epoch": 5.9725363489499195, |
| "grad_norm": 0.2918650209903717, |
| "learning_rate": 1.2559188275084555e-05, |
| "loss": 0.0443, |
| "step": 1851 |
| }, |
| { |
| "epoch": 5.975767366720517, |
| "grad_norm": 0.299337238073349, |
| "learning_rate": 1.255467869222097e-05, |
| "loss": 0.0424, |
| "step": 1852 |
| }, |
| { |
| "epoch": 5.978998384491114, |
| "grad_norm": 0.1878298670053482, |
| "learning_rate": 1.2550169109357387e-05, |
| "loss": 0.0321, |
| "step": 1853 |
| }, |
| { |
| "epoch": 5.982229402261712, |
| "grad_norm": 0.24723421037197113, |
| "learning_rate": 1.2545659526493802e-05, |
| "loss": 0.0394, |
| "step": 1854 |
| }, |
| { |
| "epoch": 5.98546042003231, |
| "grad_norm": 0.3190695643424988, |
| "learning_rate": 1.2541149943630216e-05, |
| "loss": 0.0488, |
| "step": 1855 |
| }, |
| { |
| "epoch": 5.988691437802908, |
| "grad_norm": 0.21342645585536957, |
| "learning_rate": 1.253664036076663e-05, |
| "loss": 0.0357, |
| "step": 1856 |
| }, |
| { |
| "epoch": 5.991922455573506, |
| "grad_norm": 0.2866486608982086, |
| "learning_rate": 1.2532130777903045e-05, |
| "loss": 0.0443, |
| "step": 1857 |
| }, |
| { |
| "epoch": 5.995153473344104, |
| "grad_norm": 0.24589291214942932, |
| "learning_rate": 1.2527621195039459e-05, |
| "loss": 0.0366, |
| "step": 1858 |
| }, |
| { |
| "epoch": 5.9983844911147015, |
| "grad_norm": 0.319561243057251, |
| "learning_rate": 1.2523111612175875e-05, |
| "loss": 0.0445, |
| "step": 1859 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 0.5308091044425964, |
| "learning_rate": 1.251860202931229e-05, |
| "loss": 0.0523, |
| "step": 1860 |
| }, |
| { |
| "epoch": 6.003231017770598, |
| "grad_norm": 0.2225033938884735, |
| "learning_rate": 1.2514092446448704e-05, |
| "loss": 0.03, |
| "step": 1861 |
| }, |
| { |
| "epoch": 6.006462035541196, |
| "grad_norm": 0.24722102284431458, |
| "learning_rate": 1.2509582863585118e-05, |
| "loss": 0.0316, |
| "step": 1862 |
| }, |
| { |
| "epoch": 6.009693053311794, |
| "grad_norm": 0.31931015849113464, |
| "learning_rate": 1.2505073280721534e-05, |
| "loss": 0.0423, |
| "step": 1863 |
| }, |
| { |
| "epoch": 6.012924071082391, |
| "grad_norm": 0.2228505164384842, |
| "learning_rate": 1.2500563697857949e-05, |
| "loss": 0.03, |
| "step": 1864 |
| }, |
| { |
| "epoch": 6.016155088852988, |
| "grad_norm": 0.24084047973155975, |
| "learning_rate": 1.2496054114994365e-05, |
| "loss": 0.0353, |
| "step": 1865 |
| }, |
| { |
| "epoch": 6.019386106623586, |
| "grad_norm": 0.24939881265163422, |
| "learning_rate": 1.249154453213078e-05, |
| "loss": 0.0351, |
| "step": 1866 |
| }, |
| { |
| "epoch": 6.022617124394184, |
| "grad_norm": 0.20657989382743835, |
| "learning_rate": 1.2487034949267194e-05, |
| "loss": 0.0294, |
| "step": 1867 |
| }, |
| { |
| "epoch": 6.025848142164782, |
| "grad_norm": 0.20235468447208405, |
| "learning_rate": 1.2482525366403608e-05, |
| "loss": 0.0268, |
| "step": 1868 |
| }, |
| { |
| "epoch": 6.02907915993538, |
| "grad_norm": 0.19747896492481232, |
| "learning_rate": 1.2478015783540022e-05, |
| "loss": 0.0254, |
| "step": 1869 |
| }, |
| { |
| "epoch": 6.032310177705978, |
| "grad_norm": 0.3207830488681793, |
| "learning_rate": 1.2473506200676437e-05, |
| "loss": 0.0375, |
| "step": 1870 |
| }, |
| { |
| "epoch": 6.035541195476575, |
| "grad_norm": 0.2929365336894989, |
| "learning_rate": 1.2468996617812855e-05, |
| "loss": 0.0379, |
| "step": 1871 |
| }, |
| { |
| "epoch": 6.038772213247173, |
| "grad_norm": 0.25542163848876953, |
| "learning_rate": 1.2464487034949269e-05, |
| "loss": 0.031, |
| "step": 1872 |
| }, |
| { |
| "epoch": 6.0420032310177705, |
| "grad_norm": 0.29802820086479187, |
| "learning_rate": 1.2459977452085683e-05, |
| "loss": 0.0365, |
| "step": 1873 |
| }, |
| { |
| "epoch": 6.045234248788368, |
| "grad_norm": 0.2891809940338135, |
| "learning_rate": 1.2455467869222098e-05, |
| "loss": 0.0344, |
| "step": 1874 |
| }, |
| { |
| "epoch": 6.048465266558966, |
| "grad_norm": 0.36243242025375366, |
| "learning_rate": 1.2450958286358512e-05, |
| "loss": 0.0422, |
| "step": 1875 |
| }, |
| { |
| "epoch": 6.048465266558966, |
| "eval_loss": 0.08410021662712097, |
| "eval_runtime": 188.521, |
| "eval_samples_per_second": 1.045, |
| "eval_steps_per_second": 1.045, |
| "step": 1875 |
| }, |
| { |
| "epoch": 6.051696284329564, |
| "grad_norm": 0.3777644634246826, |
| "learning_rate": 1.2446448703494928e-05, |
| "loss": 0.0289, |
| "step": 1876 |
| }, |
| { |
| "epoch": 6.054927302100162, |
| "grad_norm": 0.26423031091690063, |
| "learning_rate": 1.2441939120631343e-05, |
| "loss": 0.0317, |
| "step": 1877 |
| }, |
| { |
| "epoch": 6.058158319870759, |
| "grad_norm": 0.31099212169647217, |
| "learning_rate": 1.2437429537767757e-05, |
| "loss": 0.035, |
| "step": 1878 |
| }, |
| { |
| "epoch": 6.061389337641357, |
| "grad_norm": 0.2649920582771301, |
| "learning_rate": 1.2432919954904173e-05, |
| "loss": 0.0293, |
| "step": 1879 |
| }, |
| { |
| "epoch": 6.064620355411955, |
| "grad_norm": 0.3153195381164551, |
| "learning_rate": 1.2428410372040588e-05, |
| "loss": 0.0391, |
| "step": 1880 |
| }, |
| { |
| "epoch": 6.0678513731825525, |
| "grad_norm": 0.23356491327285767, |
| "learning_rate": 1.2423900789177002e-05, |
| "loss": 0.0282, |
| "step": 1881 |
| }, |
| { |
| "epoch": 6.07108239095315, |
| "grad_norm": 0.288780152797699, |
| "learning_rate": 1.2419391206313418e-05, |
| "loss": 0.0321, |
| "step": 1882 |
| }, |
| { |
| "epoch": 6.074313408723748, |
| "grad_norm": 0.3271370530128479, |
| "learning_rate": 1.2414881623449832e-05, |
| "loss": 0.0378, |
| "step": 1883 |
| }, |
| { |
| "epoch": 6.077544426494346, |
| "grad_norm": 0.31619715690612793, |
| "learning_rate": 1.2410372040586247e-05, |
| "loss": 0.0331, |
| "step": 1884 |
| }, |
| { |
| "epoch": 6.080775444264943, |
| "grad_norm": 0.32781535387039185, |
| "learning_rate": 1.2405862457722661e-05, |
| "loss": 0.0374, |
| "step": 1885 |
| }, |
| { |
| "epoch": 6.084006462035541, |
| "grad_norm": 0.29508909583091736, |
| "learning_rate": 1.2401352874859076e-05, |
| "loss": 0.0294, |
| "step": 1886 |
| }, |
| { |
| "epoch": 6.087237479806139, |
| "grad_norm": 0.2053205370903015, |
| "learning_rate": 1.239684329199549e-05, |
| "loss": 0.0269, |
| "step": 1887 |
| }, |
| { |
| "epoch": 6.090468497576737, |
| "grad_norm": 0.24436092376708984, |
| "learning_rate": 1.2392333709131908e-05, |
| "loss": 0.025, |
| "step": 1888 |
| }, |
| { |
| "epoch": 6.093699515347335, |
| "grad_norm": 0.2535931468009949, |
| "learning_rate": 1.2387824126268322e-05, |
| "loss": 0.029, |
| "step": 1889 |
| }, |
| { |
| "epoch": 6.096930533117932, |
| "grad_norm": 0.2552325427532196, |
| "learning_rate": 1.2383314543404736e-05, |
| "loss": 0.0291, |
| "step": 1890 |
| }, |
| { |
| "epoch": 6.10016155088853, |
| "grad_norm": 0.2708950936794281, |
| "learning_rate": 1.2378804960541151e-05, |
| "loss": 0.0311, |
| "step": 1891 |
| }, |
| { |
| "epoch": 6.103392568659127, |
| "grad_norm": 0.318724125623703, |
| "learning_rate": 1.2374295377677565e-05, |
| "loss": 0.0368, |
| "step": 1892 |
| }, |
| { |
| "epoch": 6.106623586429725, |
| "grad_norm": 0.2791127562522888, |
| "learning_rate": 1.236978579481398e-05, |
| "loss": 0.0331, |
| "step": 1893 |
| }, |
| { |
| "epoch": 6.109854604200323, |
| "grad_norm": 0.3681200444698334, |
| "learning_rate": 1.2365276211950397e-05, |
| "loss": 0.0431, |
| "step": 1894 |
| }, |
| { |
| "epoch": 6.113085621970921, |
| "grad_norm": 0.2525460422039032, |
| "learning_rate": 1.2360766629086812e-05, |
| "loss": 0.0297, |
| "step": 1895 |
| }, |
| { |
| "epoch": 6.116316639741519, |
| "grad_norm": 0.30390217900276184, |
| "learning_rate": 1.2356257046223226e-05, |
| "loss": 0.0388, |
| "step": 1896 |
| }, |
| { |
| "epoch": 6.119547657512117, |
| "grad_norm": 0.2600991725921631, |
| "learning_rate": 1.235174746335964e-05, |
| "loss": 0.0277, |
| "step": 1897 |
| }, |
| { |
| "epoch": 6.1227786752827145, |
| "grad_norm": 0.25999915599823, |
| "learning_rate": 1.2347237880496055e-05, |
| "loss": 0.0297, |
| "step": 1898 |
| }, |
| { |
| "epoch": 6.1260096930533114, |
| "grad_norm": 0.24269790947437286, |
| "learning_rate": 1.234272829763247e-05, |
| "loss": 0.0283, |
| "step": 1899 |
| }, |
| { |
| "epoch": 6.129240710823909, |
| "grad_norm": 0.2289639413356781, |
| "learning_rate": 1.2338218714768885e-05, |
| "loss": 0.0289, |
| "step": 1900 |
| }, |
| { |
| "epoch": 6.132471728594507, |
| "grad_norm": 0.24846558272838593, |
| "learning_rate": 1.23337091319053e-05, |
| "loss": 0.0335, |
| "step": 1901 |
| }, |
| { |
| "epoch": 6.135702746365105, |
| "grad_norm": 0.3645813465118408, |
| "learning_rate": 1.2329199549041714e-05, |
| "loss": 0.0406, |
| "step": 1902 |
| }, |
| { |
| "epoch": 6.138933764135703, |
| "grad_norm": 0.2280002236366272, |
| "learning_rate": 1.2324689966178129e-05, |
| "loss": 0.0254, |
| "step": 1903 |
| }, |
| { |
| "epoch": 6.142164781906301, |
| "grad_norm": 0.3087107241153717, |
| "learning_rate": 1.2320180383314545e-05, |
| "loss": 0.0383, |
| "step": 1904 |
| }, |
| { |
| "epoch": 6.145395799676899, |
| "grad_norm": 0.23273569345474243, |
| "learning_rate": 1.2315670800450959e-05, |
| "loss": 0.0263, |
| "step": 1905 |
| }, |
| { |
| "epoch": 6.148626817447496, |
| "grad_norm": 0.2532700300216675, |
| "learning_rate": 1.2311161217587375e-05, |
| "loss": 0.03, |
| "step": 1906 |
| }, |
| { |
| "epoch": 6.1518578352180935, |
| "grad_norm": 0.277876615524292, |
| "learning_rate": 1.230665163472379e-05, |
| "loss": 0.0295, |
| "step": 1907 |
| }, |
| { |
| "epoch": 6.155088852988691, |
| "grad_norm": 0.29409468173980713, |
| "learning_rate": 1.2302142051860204e-05, |
| "loss": 0.0336, |
| "step": 1908 |
| }, |
| { |
| "epoch": 6.158319870759289, |
| "grad_norm": 0.3030576705932617, |
| "learning_rate": 1.2297632468996618e-05, |
| "loss": 0.037, |
| "step": 1909 |
| }, |
| { |
| "epoch": 6.161550888529887, |
| "grad_norm": 0.20891910791397095, |
| "learning_rate": 1.2293122886133033e-05, |
| "loss": 0.0248, |
| "step": 1910 |
| }, |
| { |
| "epoch": 6.164781906300485, |
| "grad_norm": 0.2936623692512512, |
| "learning_rate": 1.2288613303269447e-05, |
| "loss": 0.0357, |
| "step": 1911 |
| }, |
| { |
| "epoch": 6.168012924071083, |
| "grad_norm": 0.18414311110973358, |
| "learning_rate": 1.2284103720405865e-05, |
| "loss": 0.0239, |
| "step": 1912 |
| }, |
| { |
| "epoch": 6.17124394184168, |
| "grad_norm": 0.2954137921333313, |
| "learning_rate": 1.227959413754228e-05, |
| "loss": 0.0315, |
| "step": 1913 |
| }, |
| { |
| "epoch": 6.174474959612278, |
| "grad_norm": 0.3954852521419525, |
| "learning_rate": 1.2275084554678694e-05, |
| "loss": 0.0461, |
| "step": 1914 |
| }, |
| { |
| "epoch": 6.1777059773828755, |
| "grad_norm": 0.33128318190574646, |
| "learning_rate": 1.2270574971815108e-05, |
| "loss": 0.0386, |
| "step": 1915 |
| }, |
| { |
| "epoch": 6.180936995153473, |
| "grad_norm": 0.2270084172487259, |
| "learning_rate": 1.2266065388951522e-05, |
| "loss": 0.0255, |
| "step": 1916 |
| }, |
| { |
| "epoch": 6.184168012924071, |
| "grad_norm": 0.3373894989490509, |
| "learning_rate": 1.2261555806087937e-05, |
| "loss": 0.0403, |
| "step": 1917 |
| }, |
| { |
| "epoch": 6.187399030694669, |
| "grad_norm": 0.30850750207901, |
| "learning_rate": 1.2257046223224353e-05, |
| "loss": 0.0354, |
| "step": 1918 |
| }, |
| { |
| "epoch": 6.190630048465267, |
| "grad_norm": 0.1884108930826187, |
| "learning_rate": 1.2252536640360767e-05, |
| "loss": 0.0232, |
| "step": 1919 |
| }, |
| { |
| "epoch": 6.193861066235864, |
| "grad_norm": 0.2788880169391632, |
| "learning_rate": 1.2248027057497183e-05, |
| "loss": 0.0316, |
| "step": 1920 |
| }, |
| { |
| "epoch": 6.197092084006462, |
| "grad_norm": 0.32791954278945923, |
| "learning_rate": 1.2243517474633598e-05, |
| "loss": 0.0369, |
| "step": 1921 |
| }, |
| { |
| "epoch": 6.20032310177706, |
| "grad_norm": 0.3660040497779846, |
| "learning_rate": 1.2239007891770012e-05, |
| "loss": 0.0344, |
| "step": 1922 |
| }, |
| { |
| "epoch": 6.203554119547658, |
| "grad_norm": 0.30265817046165466, |
| "learning_rate": 1.2234498308906427e-05, |
| "loss": 0.0315, |
| "step": 1923 |
| }, |
| { |
| "epoch": 6.206785137318255, |
| "grad_norm": 0.22006791830062866, |
| "learning_rate": 1.2229988726042843e-05, |
| "loss": 0.0249, |
| "step": 1924 |
| }, |
| { |
| "epoch": 6.210016155088853, |
| "grad_norm": 0.30455026030540466, |
| "learning_rate": 1.2225479143179257e-05, |
| "loss": 0.0382, |
| "step": 1925 |
| }, |
| { |
| "epoch": 6.21324717285945, |
| "grad_norm": 0.2880460321903229, |
| "learning_rate": 1.2220969560315671e-05, |
| "loss": 0.032, |
| "step": 1926 |
| }, |
| { |
| "epoch": 6.216478190630048, |
| "grad_norm": 0.2904002368450165, |
| "learning_rate": 1.2216459977452086e-05, |
| "loss": 0.0338, |
| "step": 1927 |
| }, |
| { |
| "epoch": 6.219709208400646, |
| "grad_norm": 0.24469764530658722, |
| "learning_rate": 1.22119503945885e-05, |
| "loss": 0.0275, |
| "step": 1928 |
| }, |
| { |
| "epoch": 6.222940226171244, |
| "grad_norm": 0.2588971257209778, |
| "learning_rate": 1.2207440811724915e-05, |
| "loss": 0.0287, |
| "step": 1929 |
| }, |
| { |
| "epoch": 6.226171243941842, |
| "grad_norm": 0.32264140248298645, |
| "learning_rate": 1.2202931228861332e-05, |
| "loss": 0.0335, |
| "step": 1930 |
| }, |
| { |
| "epoch": 6.22940226171244, |
| "grad_norm": 0.19703605771064758, |
| "learning_rate": 1.2198421645997747e-05, |
| "loss": 0.0213, |
| "step": 1931 |
| }, |
| { |
| "epoch": 6.2326332794830375, |
| "grad_norm": 0.34524163603782654, |
| "learning_rate": 1.2193912063134161e-05, |
| "loss": 0.0407, |
| "step": 1932 |
| }, |
| { |
| "epoch": 6.2358642972536344, |
| "grad_norm": 0.3203098177909851, |
| "learning_rate": 1.2189402480270576e-05, |
| "loss": 0.037, |
| "step": 1933 |
| }, |
| { |
| "epoch": 6.239095315024232, |
| "grad_norm": 0.24202866852283478, |
| "learning_rate": 1.218489289740699e-05, |
| "loss": 0.0285, |
| "step": 1934 |
| }, |
| { |
| "epoch": 6.24232633279483, |
| "grad_norm": 0.35560938715934753, |
| "learning_rate": 1.2180383314543404e-05, |
| "loss": 0.0388, |
| "step": 1935 |
| }, |
| { |
| "epoch": 6.245557350565428, |
| "grad_norm": 0.3043487071990967, |
| "learning_rate": 1.2175873731679822e-05, |
| "loss": 0.0352, |
| "step": 1936 |
| }, |
| { |
| "epoch": 6.248788368336026, |
| "grad_norm": 0.26735958456993103, |
| "learning_rate": 1.2171364148816237e-05, |
| "loss": 0.0286, |
| "step": 1937 |
| }, |
| { |
| "epoch": 6.252019386106624, |
| "grad_norm": 0.29427802562713623, |
| "learning_rate": 1.2166854565952651e-05, |
| "loss": 0.031, |
| "step": 1938 |
| }, |
| { |
| "epoch": 6.255250403877222, |
| "grad_norm": 0.2908162772655487, |
| "learning_rate": 1.2162344983089065e-05, |
| "loss": 0.0292, |
| "step": 1939 |
| }, |
| { |
| "epoch": 6.258481421647819, |
| "grad_norm": 0.27389881014823914, |
| "learning_rate": 1.215783540022548e-05, |
| "loss": 0.0327, |
| "step": 1940 |
| }, |
| { |
| "epoch": 6.2617124394184165, |
| "grad_norm": 0.2960582375526428, |
| "learning_rate": 1.2153325817361894e-05, |
| "loss": 0.0347, |
| "step": 1941 |
| }, |
| { |
| "epoch": 6.264943457189014, |
| "grad_norm": 0.24950234591960907, |
| "learning_rate": 1.214881623449831e-05, |
| "loss": 0.0303, |
| "step": 1942 |
| }, |
| { |
| "epoch": 6.268174474959612, |
| "grad_norm": 0.23290525376796722, |
| "learning_rate": 1.2144306651634725e-05, |
| "loss": 0.0283, |
| "step": 1943 |
| }, |
| { |
| "epoch": 6.27140549273021, |
| "grad_norm": 0.2551596164703369, |
| "learning_rate": 1.2139797068771139e-05, |
| "loss": 0.0275, |
| "step": 1944 |
| }, |
| { |
| "epoch": 6.274636510500808, |
| "grad_norm": 0.3014620244503021, |
| "learning_rate": 1.2135287485907555e-05, |
| "loss": 0.0281, |
| "step": 1945 |
| }, |
| { |
| "epoch": 6.277867528271406, |
| "grad_norm": 0.25358477234840393, |
| "learning_rate": 1.213077790304397e-05, |
| "loss": 0.0292, |
| "step": 1946 |
| }, |
| { |
| "epoch": 6.281098546042003, |
| "grad_norm": 0.40006953477859497, |
| "learning_rate": 1.2126268320180384e-05, |
| "loss": 0.0389, |
| "step": 1947 |
| }, |
| { |
| "epoch": 6.284329563812601, |
| "grad_norm": 0.3093584179878235, |
| "learning_rate": 1.21217587373168e-05, |
| "loss": 0.0349, |
| "step": 1948 |
| }, |
| { |
| "epoch": 6.2875605815831985, |
| "grad_norm": 0.3144993484020233, |
| "learning_rate": 1.2117249154453214e-05, |
| "loss": 0.0353, |
| "step": 1949 |
| }, |
| { |
| "epoch": 6.290791599353796, |
| "grad_norm": 0.24986687302589417, |
| "learning_rate": 1.2112739571589629e-05, |
| "loss": 0.0319, |
| "step": 1950 |
| }, |
| { |
| "epoch": 6.290791599353796, |
| "eval_loss": 0.08407828956842422, |
| "eval_runtime": 187.9593, |
| "eval_samples_per_second": 1.048, |
| "eval_steps_per_second": 1.048, |
| "step": 1950 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 4635, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 15, |
| "save_steps": 75, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.2979829657338933e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|