{
  "best_global_step": 1125,
  "best_metric": 0.0719488188624382,
  "best_model_checkpoint": "outputs_3/checkpoint-1125",
  "epoch": 3.630048465266559,
  "eval_steps": 75,
  "global_step": 1125,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0032310177705977385,
      "grad_norm": 0.0250613521784544,
      "learning_rate": 0.0,
      "loss": 0.1103,
      "step": 1
    },
    {
      "epoch": 0.006462035541195477,
      "grad_norm": 0.03354981914162636,
      "learning_rate": 1.0000000000000001e-07,
      "loss": 0.1336,
      "step": 2
    },
    {
      "epoch": 0.009693053311793215,
      "grad_norm": 0.024361876770853996,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 0.1158,
      "step": 3
    },
    {
      "epoch": 0.012924071082390954,
      "grad_norm": 0.024172648787498474,
      "learning_rate": 3.0000000000000004e-07,
      "loss": 0.0937,
      "step": 4
    },
    {
      "epoch": 0.01615508885298869,
      "grad_norm": 0.024678289890289307,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 0.1047,
      "step": 5
    },
    {
      "epoch": 0.01938610662358643,
      "grad_norm": 0.031230105087161064,
      "learning_rate": 5.000000000000001e-07,
      "loss": 0.1479,
      "step": 6
    },
    {
      "epoch": 0.022617124394184167,
      "grad_norm": 0.024539202451705933,
      "learning_rate": 6.000000000000001e-07,
      "loss": 0.126,
      "step": 7
    },
    {
      "epoch": 0.025848142164781908,
      "grad_norm": 0.02983587421476841,
      "learning_rate": 7.000000000000001e-07,
      "loss": 0.1218,
      "step": 8
    },
    {
      "epoch": 0.029079159935379646,
      "grad_norm": 0.02570008486509323,
      "learning_rate": 8.000000000000001e-07,
      "loss": 0.1153,
      "step": 9
    },
    {
      "epoch": 0.03231017770597738,
      "grad_norm": 0.024239273741841316,
      "learning_rate": 9.000000000000001e-07,
      "loss": 0.0952,
      "step": 10
    },
    {
      "epoch": 0.035541195476575124,
      "grad_norm": 0.020495450124144554,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 0.0958,
      "step": 11
    },
    {
      "epoch": 0.03877221324717286,
      "grad_norm": 0.028608962893486023,
      "learning_rate": 1.1e-06,
      "loss": 0.1169,
      "step": 12
    },
    {
      "epoch": 0.0420032310177706,
      "grad_norm": 0.03878644108772278,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 0.1371,
      "step": 13
    },
    {
      "epoch": 0.045234248788368334,
      "grad_norm": 0.02789674885571003,
      "learning_rate": 1.3e-06,
      "loss": 0.1266,
      "step": 14
    },
    {
      "epoch": 0.048465266558966075,
      "grad_norm": 0.03313566744327545,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 0.1301,
      "step": 15
    },
    {
      "epoch": 0.051696284329563816,
      "grad_norm": 0.0248391292989254,
      "learning_rate": 1.5e-06,
      "loss": 0.1056,
      "step": 16
    },
    {
      "epoch": 0.05492730210016155,
      "grad_norm": 0.024395154789090157,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 0.112,
      "step": 17
    },
    {
      "epoch": 0.05815831987075929,
      "grad_norm": 0.03043658658862114,
      "learning_rate": 1.7000000000000002e-06,
      "loss": 0.1037,
      "step": 18
    },
    {
      "epoch": 0.061389337641357025,
      "grad_norm": 0.02323235385119915,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 0.0996,
      "step": 19
    },
    {
      "epoch": 0.06462035541195477,
      "grad_norm": 0.03656580671668053,
      "learning_rate": 1.9000000000000002e-06,
      "loss": 0.1327,
      "step": 20
    },
    {
      "epoch": 0.06785137318255251,
      "grad_norm": 0.02677535079419613,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 0.1205,
      "step": 21
    },
    {
      "epoch": 0.07108239095315025,
      "grad_norm": 0.029969926923513412,
      "learning_rate": 2.1000000000000002e-06,
      "loss": 0.126,
      "step": 22
    },
    {
      "epoch": 0.07431340872374798,
      "grad_norm": 0.02832009270787239,
      "learning_rate": 2.2e-06,
      "loss": 0.1208,
      "step": 23
    },
    {
      "epoch": 0.07754442649434572,
      "grad_norm": 0.023000000044703484,
      "learning_rate": 2.3000000000000004e-06,
      "loss": 0.102,
      "step": 24
    },
    {
      "epoch": 0.08077544426494346,
      "grad_norm": 0.04773552715778351,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 0.1403,
      "step": 25
    },
    {
      "epoch": 0.0840064620355412,
      "grad_norm": 0.03021993860602379,
      "learning_rate": 2.5e-06,
      "loss": 0.1116,
      "step": 26
    },
    {
      "epoch": 0.08723747980613894,
      "grad_norm": 0.026623567566275597,
      "learning_rate": 2.6e-06,
      "loss": 0.1052,
      "step": 27
    },
    {
      "epoch": 0.09046849757673667,
      "grad_norm": 0.02503894828259945,
      "learning_rate": 2.7000000000000004e-06,
      "loss": 0.1016,
      "step": 28
    },
    {
      "epoch": 0.09369951534733441,
      "grad_norm": 0.026578862220048904,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 0.1026,
      "step": 29
    },
    {
      "epoch": 0.09693053311793215,
      "grad_norm": 0.029514916241168976,
      "learning_rate": 2.9e-06,
      "loss": 0.1047,
      "step": 30
    },
    {
      "epoch": 0.10016155088852989,
      "grad_norm": 0.025679711252450943,
      "learning_rate": 3e-06,
      "loss": 0.1228,
      "step": 31
    },
    {
      "epoch": 0.10339256865912763,
      "grad_norm": 0.023719631135463715,
      "learning_rate": 3.1000000000000004e-06,
      "loss": 0.0997,
      "step": 32
    },
    {
      "epoch": 0.10662358642972536,
      "grad_norm": 0.03468646854162216,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 0.1354,
      "step": 33
    },
    {
      "epoch": 0.1098546042003231,
      "grad_norm": 0.02570706605911255,
      "learning_rate": 3.3000000000000006e-06,
      "loss": 0.1007,
      "step": 34
    },
    {
      "epoch": 0.11308562197092084,
      "grad_norm": 0.03752969205379486,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 0.1512,
      "step": 35
    },
    {
      "epoch": 0.11631663974151858,
      "grad_norm": 0.034612756222486496,
      "learning_rate": 3.5e-06,
      "loss": 0.1212,
      "step": 36
    },
    {
      "epoch": 0.11954765751211632,
      "grad_norm": 0.022866638377308846,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 0.0962,
      "step": 37
    },
    {
      "epoch": 0.12277867528271405,
      "grad_norm": 0.03314971551299095,
      "learning_rate": 3.7e-06,
      "loss": 0.1275,
      "step": 38
    },
    {
      "epoch": 0.1260096930533118,
      "grad_norm": 0.026049382984638214,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 0.1095,
      "step": 39
    },
    {
      "epoch": 0.12924071082390953,
      "grad_norm": 0.027846891433000565,
      "learning_rate": 3.900000000000001e-06,
      "loss": 0.1208,
      "step": 40
    },
    {
      "epoch": 0.13247172859450726,
      "grad_norm": 0.03061266988515854,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.1071,
      "step": 41
    },
    {
      "epoch": 0.13570274636510501,
      "grad_norm": 0.028831277042627335,
      "learning_rate": 4.1e-06,
      "loss": 0.122,
      "step": 42
    },
    {
      "epoch": 0.13893376413570274,
      "grad_norm": 0.024587510153651237,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 0.0891,
      "step": 43
    },
    {
      "epoch": 0.1421647819063005,
      "grad_norm": 0.03614000231027603,
      "learning_rate": 4.3e-06,
      "loss": 0.1443,
      "step": 44
    },
    {
      "epoch": 0.14539579967689822,
      "grad_norm": 0.0399002730846405,
      "learning_rate": 4.4e-06,
      "loss": 0.1306,
      "step": 45
    },
    {
      "epoch": 0.14862681744749595,
      "grad_norm": 0.03731178864836693,
      "learning_rate": 4.5e-06,
      "loss": 0.1552,
      "step": 46
    },
    {
      "epoch": 0.1518578352180937,
      "grad_norm": 0.03669052943587303,
      "learning_rate": 4.600000000000001e-06,
      "loss": 0.1285,
      "step": 47
    },
    {
      "epoch": 0.15508885298869143,
      "grad_norm": 0.029436811804771423,
      "learning_rate": 4.7e-06,
      "loss": 0.1128,
      "step": 48
    },
    {
      "epoch": 0.1583198707592892,
      "grad_norm": 0.03553691506385803,
      "learning_rate": 4.800000000000001e-06,
      "loss": 0.1295,
      "step": 49
    },
    {
      "epoch": 0.16155088852988692,
      "grad_norm": 0.045196086168289185,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 0.1589,
      "step": 50
    },
    {
      "epoch": 0.16478190630048464,
      "grad_norm": 0.03583088517189026,
      "learning_rate": 5e-06,
      "loss": 0.1195,
      "step": 51
    },
    {
      "epoch": 0.1680129240710824,
      "grad_norm": 0.03799896687269211,
      "learning_rate": 5.1e-06,
      "loss": 0.1293,
      "step": 52
    },
    {
      "epoch": 0.17124394184168013,
      "grad_norm": 0.0341208279132843,
      "learning_rate": 5.2e-06,
      "loss": 0.1121,
      "step": 53
    },
    {
      "epoch": 0.17447495961227788,
      "grad_norm": 0.0367840901017189,
      "learning_rate": 5.300000000000001e-06,
      "loss": 0.1288,
      "step": 54
    },
    {
      "epoch": 0.1777059773828756,
      "grad_norm": 0.03497275337576866,
      "learning_rate": 5.400000000000001e-06,
      "loss": 0.1146,
      "step": 55
    },
    {
      "epoch": 0.18093699515347333,
      "grad_norm": 0.04450898617506027,
      "learning_rate": 5.500000000000001e-06,
      "loss": 0.1261,
      "step": 56
    },
    {
      "epoch": 0.1841680129240711,
      "grad_norm": 0.029873637482523918,
      "learning_rate": 5.600000000000001e-06,
      "loss": 0.0981,
      "step": 57
    },
    {
      "epoch": 0.18739903069466882,
      "grad_norm": 0.030145753175020218,
      "learning_rate": 5.7e-06,
      "loss": 0.1121,
      "step": 58
    },
    {
      "epoch": 0.19063004846526657,
      "grad_norm": 0.03658242151141167,
      "learning_rate": 5.8e-06,
      "loss": 0.1416,
      "step": 59
    },
    {
      "epoch": 0.1938610662358643,
      "grad_norm": 0.049440935254096985,
      "learning_rate": 5.9e-06,
      "loss": 0.125,
      "step": 60
    },
    {
      "epoch": 0.19709208400646203,
      "grad_norm": 0.0388176292181015,
      "learning_rate": 6e-06,
      "loss": 0.1195,
      "step": 61
    },
    {
      "epoch": 0.20032310177705978,
      "grad_norm": 0.03422081843018532,
      "learning_rate": 6.1e-06,
      "loss": 0.1191,
      "step": 62
    },
    {
      "epoch": 0.2035541195476575,
      "grad_norm": 0.047777559608221054,
      "learning_rate": 6.200000000000001e-06,
      "loss": 0.1304,
      "step": 63
    },
    {
      "epoch": 0.20678513731825526,
      "grad_norm": 0.031583212316036224,
      "learning_rate": 6.300000000000001e-06,
      "loss": 0.0996,
      "step": 64
    },
    {
      "epoch": 0.210016155088853,
      "grad_norm": 0.03744835779070854,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 0.1124,
      "step": 65
    },
    {
      "epoch": 0.21324717285945072,
      "grad_norm": 0.04165760055184364,
      "learning_rate": 6.5000000000000004e-06,
      "loss": 0.1041,
      "step": 66
    },
    {
      "epoch": 0.21647819063004847,
      "grad_norm": 0.03987026587128639,
      "learning_rate": 6.600000000000001e-06,
      "loss": 0.133,
      "step": 67
    },
    {
      "epoch": 0.2197092084006462,
      "grad_norm": 0.040481116622686386,
      "learning_rate": 6.700000000000001e-06,
      "loss": 0.0944,
      "step": 68
    },
    {
      "epoch": 0.22294022617124395,
      "grad_norm": 0.03691282495856285,
      "learning_rate": 6.800000000000001e-06,
      "loss": 0.1018,
      "step": 69
    },
    {
      "epoch": 0.22617124394184168,
      "grad_norm": 0.04485676437616348,
      "learning_rate": 6.9e-06,
      "loss": 0.137,
      "step": 70
    },
    {
      "epoch": 0.2294022617124394,
      "grad_norm": 0.02854587510228157,
      "learning_rate": 7e-06,
      "loss": 0.0885,
      "step": 71
    },
    {
      "epoch": 0.23263327948303716,
      "grad_norm": 0.04462384432554245,
      "learning_rate": 7.100000000000001e-06,
      "loss": 0.1429,
      "step": 72
    },
    {
      "epoch": 0.2358642972536349,
      "grad_norm": 0.040676336735486984,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 0.121,
      "step": 73
    },
    {
      "epoch": 0.23909531502423265,
      "grad_norm": 0.0420430488884449,
      "learning_rate": 7.3e-06,
      "loss": 0.1197,
      "step": 74
    },
    {
      "epoch": 0.24232633279483037,
      "grad_norm": 0.034694962203502655,
      "learning_rate": 7.4e-06,
      "loss": 0.1177,
      "step": 75
    },
    {
      "epoch": 0.24232633279483037,
      "eval_loss": 0.12372970581054688,
      "eval_runtime": 188.2884,
      "eval_samples_per_second": 1.046,
      "eval_steps_per_second": 1.046,
      "step": 75
    },
    {
      "epoch": 0.2455573505654281,
      "grad_norm": 0.03579672798514366,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.1058,
      "step": 76
    },
    {
      "epoch": 0.24878836833602586,
      "grad_norm": 0.029704652726650238,
      "learning_rate": 7.600000000000001e-06,
      "loss": 0.0882,
      "step": 77
    },
    {
      "epoch": 0.2520193861066236,
      "grad_norm": 0.04732828587293625,
      "learning_rate": 7.7e-06,
      "loss": 0.1277,
      "step": 78
    },
    {
      "epoch": 0.2552504038772213,
      "grad_norm": 0.027987899258732796,
      "learning_rate": 7.800000000000002e-06,
      "loss": 0.0751,
      "step": 79
    },
    {
      "epoch": 0.25848142164781907,
      "grad_norm": 0.04119185730814934,
      "learning_rate": 7.9e-06,
      "loss": 0.1043,
      "step": 80
    },
    {
      "epoch": 0.2617124394184168,
      "grad_norm": 0.04631367698311806,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.118,
      "step": 81
    },
    {
      "epoch": 0.2649434571890145,
      "grad_norm": 0.03165270760655403,
      "learning_rate": 8.1e-06,
      "loss": 0.0889,
      "step": 82
    },
    {
      "epoch": 0.2681744749596123,
      "grad_norm": 0.03328806161880493,
      "learning_rate": 8.2e-06,
      "loss": 0.097,
      "step": 83
    },
    {
      "epoch": 0.27140549273021003,
      "grad_norm": 0.05337163060903549,
      "learning_rate": 8.3e-06,
      "loss": 0.1373,
      "step": 84
    },
    {
      "epoch": 0.27463651050080773,
      "grad_norm": 0.030968431383371353,
      "learning_rate": 8.400000000000001e-06,
      "loss": 0.077,
      "step": 85
    },
    {
      "epoch": 0.2778675282714055,
      "grad_norm": 0.03477643430233002,
      "learning_rate": 8.5e-06,
      "loss": 0.1016,
      "step": 86
    },
    {
      "epoch": 0.28109854604200324,
      "grad_norm": 0.04051528871059418,
      "learning_rate": 8.6e-06,
      "loss": 0.1017,
      "step": 87
    },
    {
      "epoch": 0.284329563812601,
      "grad_norm": 0.039160750806331635,
      "learning_rate": 8.700000000000001e-06,
      "loss": 0.112,
      "step": 88
    },
    {
      "epoch": 0.2875605815831987,
      "grad_norm": 0.03572917729616165,
      "learning_rate": 8.8e-06,
      "loss": 0.083,
      "step": 89
    },
    {
      "epoch": 0.29079159935379645,
      "grad_norm": 0.05116155743598938,
      "learning_rate": 8.900000000000001e-06,
      "loss": 0.1262,
      "step": 90
    },
    {
      "epoch": 0.2940226171243942,
      "grad_norm": 0.043991196900606155,
      "learning_rate": 9e-06,
      "loss": 0.1147,
      "step": 91
    },
    {
      "epoch": 0.2972536348949919,
      "grad_norm": 0.03514918312430382,
      "learning_rate": 9.100000000000001e-06,
      "loss": 0.0874,
      "step": 92
    },
    {
      "epoch": 0.30048465266558966,
      "grad_norm": 0.03676354140043259,
      "learning_rate": 9.200000000000002e-06,
      "loss": 0.0988,
      "step": 93
    },
    {
      "epoch": 0.3037156704361874,
      "grad_norm": 0.04684548079967499,
      "learning_rate": 9.3e-06,
      "loss": 0.129,
      "step": 94
    },
    {
      "epoch": 0.3069466882067851,
      "grad_norm": 0.033971093595027924,
      "learning_rate": 9.4e-06,
      "loss": 0.09,
      "step": 95
    },
    {
      "epoch": 0.31017770597738287,
      "grad_norm": 0.0339505635201931,
      "learning_rate": 9.5e-06,
      "loss": 0.083,
      "step": 96
    },
    {
      "epoch": 0.3134087237479806,
      "grad_norm": 0.04067440703511238,
      "learning_rate": 9.600000000000001e-06,
      "loss": 0.1093,
      "step": 97
    },
    {
      "epoch": 0.3166397415185784,
      "grad_norm": 0.036671143025159836,
      "learning_rate": 9.7e-06,
      "loss": 0.0929,
      "step": 98
    },
    {
      "epoch": 0.3198707592891761,
      "grad_norm": 0.0459955595433712,
      "learning_rate": 9.800000000000001e-06,
      "loss": 0.1273,
      "step": 99
    },
    {
      "epoch": 0.32310177705977383,
      "grad_norm": 0.05120276287198067,
      "learning_rate": 9.9e-06,
      "loss": 0.1349,
      "step": 100
    },
    {
      "epoch": 0.3263327948303716,
      "grad_norm": 0.03149951994419098,
      "learning_rate": 1e-05,
      "loss": 0.074,
      "step": 101
    },
    {
      "epoch": 0.3295638126009693,
      "grad_norm": 0.2025747299194336,
      "learning_rate": 1.0100000000000002e-05,
      "loss": 0.099,
      "step": 102
    },
    {
      "epoch": 0.33279483037156704,
      "grad_norm": 0.03870416432619095,
      "learning_rate": 1.02e-05,
      "loss": 0.0984,
      "step": 103
    },
    {
      "epoch": 0.3360258481421648,
      "grad_norm": 0.05344085767865181,
      "learning_rate": 1.0300000000000001e-05,
      "loss": 0.1057,
      "step": 104
    },
    {
      "epoch": 0.3392568659127625,
      "grad_norm": 0.04757027328014374,
      "learning_rate": 1.04e-05,
      "loss": 0.124,
      "step": 105
    },
    {
      "epoch": 0.34248788368336025,
      "grad_norm": 0.03201949968934059,
      "learning_rate": 1.0500000000000001e-05,
      "loss": 0.0807,
      "step": 106
    },
    {
      "epoch": 0.345718901453958,
      "grad_norm": 0.03873045742511749,
      "learning_rate": 1.0600000000000002e-05,
      "loss": 0.1048,
      "step": 107
    },
    {
      "epoch": 0.34894991922455576,
      "grad_norm": 0.027501031756401062,
      "learning_rate": 1.0700000000000001e-05,
      "loss": 0.0613,
      "step": 108
    },
    {
      "epoch": 0.35218093699515346,
      "grad_norm": 0.03909388184547424,
      "learning_rate": 1.0800000000000002e-05,
      "loss": 0.0929,
      "step": 109
    },
    {
      "epoch": 0.3554119547657512,
      "grad_norm": 0.08006097376346588,
      "learning_rate": 1.0900000000000002e-05,
      "loss": 0.0908,
      "step": 110
    },
    {
      "epoch": 0.35864297253634897,
      "grad_norm": 0.03813672810792923,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 0.0764,
      "step": 111
    },
    {
      "epoch": 0.36187399030694667,
      "grad_norm": 0.030327340587973595,
      "learning_rate": 1.1100000000000002e-05,
      "loss": 0.077,
      "step": 112
    },
    {
      "epoch": 0.3651050080775444,
      "grad_norm": 0.03776196017861366,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.0759,
      "step": 113
    },
    {
      "epoch": 0.3683360258481422,
      "grad_norm": 0.039037927985191345,
      "learning_rate": 1.13e-05,
      "loss": 0.0926,
      "step": 114
    },
    {
      "epoch": 0.3715670436187399,
      "grad_norm": 0.0416683666408062,
      "learning_rate": 1.14e-05,
      "loss": 0.0914,
      "step": 115
    },
    {
      "epoch": 0.37479806138933763,
      "grad_norm": 0.04185537248849869,
      "learning_rate": 1.15e-05,
      "loss": 0.0893,
      "step": 116
    },
    {
      "epoch": 0.3780290791599354,
      "grad_norm": 0.04651897773146629,
      "learning_rate": 1.16e-05,
      "loss": 0.1094,
      "step": 117
    },
    {
      "epoch": 0.38126009693053314,
      "grad_norm": 0.04604775831103325,
      "learning_rate": 1.17e-05,
      "loss": 0.1068,
      "step": 118
    },
    {
      "epoch": 0.38449111470113084,
      "grad_norm": 0.02846536412835121,
      "learning_rate": 1.18e-05,
      "loss": 0.064,
      "step": 119
    },
    {
      "epoch": 0.3877221324717286,
      "grad_norm": 0.033402059227228165,
      "learning_rate": 1.1900000000000001e-05,
      "loss": 0.0725,
      "step": 120
    },
    {
      "epoch": 0.39095315024232635,
      "grad_norm": 0.044676899909973145,
      "learning_rate": 1.2e-05,
      "loss": 0.1017,
      "step": 121
    },
    {
      "epoch": 0.39418416801292405,
      "grad_norm": 0.05336389318108559,
      "learning_rate": 1.2100000000000001e-05,
      "loss": 0.1063,
      "step": 122
    },
    {
      "epoch": 0.3974151857835218,
      "grad_norm": 0.0402502678334713,
      "learning_rate": 1.22e-05,
      "loss": 0.0764,
      "step": 123
    },
    {
      "epoch": 0.40064620355411956,
      "grad_norm": 0.04342082887887955,
      "learning_rate": 1.23e-05,
      "loss": 0.1008,
      "step": 124
    },
    {
      "epoch": 0.40387722132471726,
      "grad_norm": 0.047081444412469864,
      "learning_rate": 1.2400000000000002e-05,
      "loss": 0.1029,
      "step": 125
    },
    {
      "epoch": 0.407108239095315,
      "grad_norm": 0.038031261414289474,
      "learning_rate": 1.25e-05,
      "loss": 0.0783,
      "step": 126
    },
    {
      "epoch": 0.41033925686591277,
      "grad_norm": 0.03746628388762474,
      "learning_rate": 1.2600000000000001e-05,
      "loss": 0.0764,
      "step": 127
    },
    {
      "epoch": 0.4135702746365105,
      "grad_norm": 0.04288509115576744,
      "learning_rate": 1.27e-05,
      "loss": 0.0892,
      "step": 128
    },
    {
      "epoch": 0.4168012924071082,
      "grad_norm": 0.042407114058732986,
      "learning_rate": 1.2800000000000001e-05,
      "loss": 0.0925,
      "step": 129
    },
    {
      "epoch": 0.420032310177706,
      "grad_norm": 0.03754522651433945,
      "learning_rate": 1.2900000000000002e-05,
      "loss": 0.0833,
      "step": 130
    },
    {
      "epoch": 0.42326332794830374,
      "grad_norm": 0.04337688535451889,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 0.0976,
      "step": 131
    },
    {
      "epoch": 0.42649434571890144,
      "grad_norm": 0.03174331411719322,
      "learning_rate": 1.3100000000000002e-05,
      "loss": 0.0684,
      "step": 132
    },
    {
      "epoch": 0.4297253634894992,
      "grad_norm": 0.04556446522474289,
      "learning_rate": 1.3200000000000002e-05,
      "loss": 0.0979,
      "step": 133
    },
    {
      "epoch": 0.43295638126009695,
      "grad_norm": 0.04222508519887924,
      "learning_rate": 1.3300000000000001e-05,
      "loss": 0.0923,
      "step": 134
    },
    {
      "epoch": 0.43618739903069464,
      "grad_norm": 0.047948531806468964,
      "learning_rate": 1.3400000000000002e-05,
      "loss": 0.0991,
      "step": 135
    },
    {
      "epoch": 0.4394184168012924,
      "grad_norm": 0.04752594605088234,
      "learning_rate": 1.3500000000000001e-05,
      "loss": 0.0864,
      "step": 136
    },
    {
      "epoch": 0.44264943457189015,
      "grad_norm": 0.049508459866046906,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 0.1035,
      "step": 137
    },
    {
      "epoch": 0.4458804523424879,
      "grad_norm": 0.05521553382277489,
      "learning_rate": 1.3700000000000003e-05,
      "loss": 0.0978,
      "step": 138
    },
    {
      "epoch": 0.4491114701130856,
      "grad_norm": 0.05352664738893509,
      "learning_rate": 1.38e-05,
      "loss": 0.1097,
      "step": 139
    },
    {
      "epoch": 0.45234248788368336,
      "grad_norm": 0.04332451522350311,
      "learning_rate": 1.39e-05,
      "loss": 0.0917,
      "step": 140
    },
    {
      "epoch": 0.4555735056542811,
      "grad_norm": 0.05932965502142906,
      "learning_rate": 1.4e-05,
      "loss": 0.0909,
      "step": 141
    },
    {
      "epoch": 0.4588045234248788,
      "grad_norm": 0.04634483903646469,
      "learning_rate": 1.41e-05,
      "loss": 0.0806,
      "step": 142
    },
    {
      "epoch": 0.4620355411954766,
      "grad_norm": 0.037898797541856766,
      "learning_rate": 1.4200000000000001e-05,
      "loss": 0.0802,
      "step": 143
    },
    {
      "epoch": 0.46526655896607433,
      "grad_norm": 0.04366337135434151,
      "learning_rate": 1.43e-05,
      "loss": 0.0835,
      "step": 144
    },
    {
      "epoch": 0.46849757673667203,
      "grad_norm": 0.03588287532329559,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 0.0746,
      "step": 145
    },
    {
      "epoch": 0.4717285945072698,
      "grad_norm": 0.04979556053876877,
      "learning_rate": 1.45e-05,
      "loss": 0.0914,
      "step": 146
    },
    {
      "epoch": 0.47495961227786754,
      "grad_norm": 0.03938375040888786,
      "learning_rate": 1.46e-05,
      "loss": 0.0696,
      "step": 147
    },
    {
      "epoch": 0.4781906300484653,
      "grad_norm": 0.04531609266996384,
      "learning_rate": 1.4700000000000002e-05,
      "loss": 0.0847,
      "step": 148
    },
    {
      "epoch": 0.481421647819063,
      "grad_norm": 0.04314682260155678,
      "learning_rate": 1.48e-05,
      "loss": 0.0872,
      "step": 149
    },
    {
      "epoch": 0.48465266558966075,
      "grad_norm": 0.04925313591957092,
      "learning_rate": 1.4900000000000001e-05,
      "loss": 0.0905,
      "step": 150
    },
    {
      "epoch": 0.48465266558966075,
      "eval_loss": 0.09467896819114685,
      "eval_runtime": 188.3915,
      "eval_samples_per_second": 1.046,
      "eval_steps_per_second": 1.046,
      "step": 150
    },
    {
      "epoch": 0.4878836833602585,
      "grad_norm": 0.04450507089495659,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.0704,
      "step": 151
    },
    {
      "epoch": 0.4911147011308562,
      "grad_norm": 0.03887806460261345,
      "learning_rate": 1.5100000000000001e-05,
      "loss": 0.0727,
      "step": 152
    },
    {
      "epoch": 0.49434571890145396,
      "grad_norm": 0.04298221319913864,
      "learning_rate": 1.5200000000000002e-05,
      "loss": 0.0763,
      "step": 153
    },
    {
      "epoch": 0.4975767366720517,
      "grad_norm": 0.046640265733003616,
      "learning_rate": 1.5300000000000003e-05,
      "loss": 0.0794,
      "step": 154
    },
    {
      "epoch": 0.5008077544426495,
      "grad_norm": 0.03670887276530266,
      "learning_rate": 1.54e-05,
      "loss": 0.0681,
      "step": 155
    },
    {
      "epoch": 0.5040387722132472,
      "grad_norm": 0.03891611844301224,
      "learning_rate": 1.55e-05,
      "loss": 0.0712,
      "step": 156
    },
    {
      "epoch": 0.5072697899838449,
      "grad_norm": 0.042465128004550934,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 0.0828,
      "step": 157
    },
    {
      "epoch": 0.5105008077544426,
      "grad_norm": 0.054676711559295654,
      "learning_rate": 1.5700000000000002e-05,
      "loss": 0.0978,
      "step": 158
    },
    {
      "epoch": 0.5137318255250404,
      "grad_norm": 0.040950365364551544,
      "learning_rate": 1.58e-05,
      "loss": 0.0822,
      "step": 159
    },
    {
      "epoch": 0.5169628432956381,
      "grad_norm": 0.04756248742341995,
      "learning_rate": 1.5900000000000004e-05,
      "loss": 0.0914,
      "step": 160
    },
    {
      "epoch": 0.5201938610662359,
      "grad_norm": 0.046163927763700485,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.089,
      "step": 161
    },
    {
      "epoch": 0.5234248788368336,
      "grad_norm": 0.04039200022816658,
      "learning_rate": 1.6100000000000002e-05,
      "loss": 0.0657,
      "step": 162
    },
    {
      "epoch": 0.5266558966074314,
      "grad_norm": 0.05602607503533363,
      "learning_rate": 1.62e-05,
      "loss": 0.1044,
      "step": 163
    },
    {
      "epoch": 0.529886914378029,
      "grad_norm": 0.04312260076403618,
      "learning_rate": 1.63e-05,
      "loss": 0.0821,
      "step": 164
    },
    {
      "epoch": 0.5331179321486268,
      "grad_norm": 0.047874368727207184,
      "learning_rate": 1.64e-05,
      "loss": 0.0824,
      "step": 165
    },
    {
      "epoch": 0.5363489499192245,
      "grad_norm": 0.06421804428100586,
      "learning_rate": 1.65e-05,
      "loss": 0.1126,
      "step": 166
    },
    {
      "epoch": 0.5395799676898223,
      "grad_norm": 0.05542091280221939,
      "learning_rate": 1.66e-05,
      "loss": 0.0959,
      "step": 167
    },
    {
      "epoch": 0.5428109854604201,
      "grad_norm": 0.05822491645812988,
      "learning_rate": 1.67e-05,
      "loss": 0.099,
      "step": 168
    },
    {
      "epoch": 0.5460420032310178,
      "grad_norm": 0.04154228791594505,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 0.0737,
      "step": 169
    },
    {
      "epoch": 0.5492730210016155,
      "grad_norm": 0.04687827080488205,
      "learning_rate": 1.69e-05,
      "loss": 0.0725,
      "step": 170
    },
    {
      "epoch": 0.5525040387722132,
      "grad_norm": 0.056682366877794266,
      "learning_rate": 1.7e-05,
      "loss": 0.1004,
      "step": 171
    },
    {
      "epoch": 0.555735056542811,
      "grad_norm": 0.0667276531457901,
      "learning_rate": 1.7100000000000002e-05,
      "loss": 0.0917,
      "step": 172
    },
    {
      "epoch": 0.5589660743134087,
      "grad_norm": 0.044881295412778854,
      "learning_rate": 1.72e-05,
      "loss": 0.0689,
      "step": 173
    },
    {
      "epoch": 0.5621970920840065,
      "grad_norm": 0.0385456345975399,
      "learning_rate": 1.73e-05,
      "loss": 0.0592,
      "step": 174
    },
    {
      "epoch": 0.5654281098546042,
      "grad_norm": 0.05141144245862961,
      "learning_rate": 1.7400000000000003e-05,
      "loss": 0.0895,
      "step": 175
    },
    {
      "epoch": 0.568659127625202,
      "grad_norm": 0.06854357570409775,
      "learning_rate": 1.7500000000000002e-05,
      "loss": 0.0692,
      "step": 176
    },
    {
      "epoch": 0.5718901453957996,
      "grad_norm": 0.04410829395055771,
      "learning_rate": 1.76e-05,
      "loss": 0.0688,
      "step": 177
    },
    {
      "epoch": 0.5751211631663974,
      "grad_norm": 0.03727763518691063,
      "learning_rate": 1.77e-05,
      "loss": 0.0638,
      "step": 178
    },
    {
      "epoch": 0.5783521809369951,
      "grad_norm": 0.044415879994630814,
      "learning_rate": 1.7800000000000002e-05,
      "loss": 0.0682,
      "step": 179
    },
    {
      "epoch": 0.5815831987075929,
      "grad_norm": 0.06855777651071548,
      "learning_rate": 1.79e-05,
      "loss": 0.1018,
      "step": 180
    },
    {
      "epoch": 0.5848142164781907,
      "grad_norm": 0.053684502840042114,
      "learning_rate": 1.8e-05,
      "loss": 0.0862,
      "step": 181
    },
    {
      "epoch": 0.5880452342487884,
      "grad_norm": 0.0487506277859211,
      "learning_rate": 1.8100000000000003e-05,
      "loss": 0.073,
      "step": 182
    },
    {
      "epoch": 0.5912762520193862,
      "grad_norm": 0.04568934440612793,
      "learning_rate": 1.8200000000000002e-05,
      "loss": 0.0726,
      "step": 183
    },
    {
      "epoch": 0.5945072697899838,
      "grad_norm": 0.04607719928026199,
      "learning_rate": 1.83e-05,
      "loss": 0.0685,
      "step": 184
    },
    {
      "epoch": 0.5977382875605816,
      "grad_norm": 0.05040200799703598,
      "learning_rate": 1.8400000000000003e-05,
      "loss": 0.0721,
      "step": 185
    },
    {
      "epoch": 0.6009693053311793,
      "grad_norm": 0.0538799948990345,
      "learning_rate": 1.8500000000000002e-05,
      "loss": 0.0769,
      "step": 186
    },
    {
      "epoch": 0.6042003231017771,
      "grad_norm": 0.058767516165971756,
      "learning_rate": 1.86e-05,
      "loss": 0.1015,
      "step": 187
    },
    {
      "epoch": 0.6074313408723748,
      "grad_norm": 0.056379787623882294,
      "learning_rate": 1.8700000000000004e-05,
      "loss": 0.0887,
      "step": 188
    },
    {
      "epoch": 0.6106623586429726,
      "grad_norm": 0.04885280132293701,
      "learning_rate": 1.88e-05,
      "loss": 0.0767,
      "step": 189
    },
    {
      "epoch": 0.6138933764135702,
      "grad_norm": 0.04340769350528717,
      "learning_rate": 1.8900000000000002e-05,
      "loss": 0.061,
      "step": 190
    },
    {
      "epoch": 0.617124394184168,
      "grad_norm": 0.051385678350925446,
      "learning_rate": 1.9e-05,
      "loss": 0.0794,
      "step": 191
    },
    {
      "epoch": 0.6203554119547657,
      "grad_norm": 0.03737674281001091,
      "learning_rate": 1.91e-05,
      "loss": 0.0594,
      "step": 192
    },
    {
      "epoch": 0.6235864297253635,
      "grad_norm": 0.047995615750551224,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 0.06,
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.6268174474959612, | |
| "grad_norm": 0.04322716221213341, | |
| "learning_rate": 1.93e-05, | |
| "loss": 0.0712, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.630048465266559, | |
| "grad_norm": 0.044713038951158524, | |
| "learning_rate": 1.94e-05, | |
| "loss": 0.0679, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.6332794830371568, | |
| "grad_norm": 0.05132253095507622, | |
| "learning_rate": 1.95e-05, | |
| "loss": 0.0723, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.6365105008077544, | |
| "grad_norm": 0.039808765053749084, | |
| "learning_rate": 1.9600000000000002e-05, | |
| "loss": 0.057, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.6397415185783522, | |
| "grad_norm": 0.05255698040127754, | |
| "learning_rate": 1.97e-05, | |
| "loss": 0.0774, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.6429725363489499, | |
| "grad_norm": 0.05560529604554176, | |
| "learning_rate": 1.98e-05, | |
| "loss": 0.0839, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.6462035541195477, | |
| "grad_norm": 0.0640430748462677, | |
| "learning_rate": 1.9900000000000003e-05, | |
| "loss": 0.0993, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.6494345718901454, | |
| "grad_norm": 0.05945132300257683, | |
| "learning_rate": 2e-05, | |
| "loss": 0.0917, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.6526655896607432, | |
| "grad_norm": 0.0569840632379055, | |
| "learning_rate": 1.9995490417136416e-05, | |
| "loss": 0.096, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.6558966074313409, | |
| "grad_norm": 0.052612412720918655, | |
| "learning_rate": 1.999098083427283e-05, | |
| "loss": 0.079, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.6591276252019386, | |
| "grad_norm": 0.05563991889357567, | |
| "learning_rate": 1.9986471251409248e-05, | |
| "loss": 0.0748, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.6623586429725363, | |
| "grad_norm": 0.0527903214097023, | |
| "learning_rate": 1.9981961668545663e-05, | |
| "loss": 0.0789, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.6655896607431341, | |
| "grad_norm": 0.06214462220668793, | |
| "learning_rate": 1.9977452085682077e-05, | |
| "loss": 0.0806, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.6688206785137318, | |
| "grad_norm": 0.05324917659163475, | |
| "learning_rate": 1.997294250281849e-05, | |
| "loss": 0.0778, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.6720516962843296, | |
| "grad_norm": 0.03270899876952171, | |
| "learning_rate": 1.9968432919954906e-05, | |
| "loss": 0.0469, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.6752827140549273, | |
| "grad_norm": 0.060554295778274536, | |
| "learning_rate": 1.996392333709132e-05, | |
| "loss": 0.0926, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.678513731825525, | |
| "grad_norm": 0.05076554790139198, | |
| "learning_rate": 1.9959413754227738e-05, | |
| "loss": 0.0752, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.6817447495961227, | |
| "grad_norm": 0.05301008000969887, | |
| "learning_rate": 1.9954904171364152e-05, | |
| "loss": 0.0783, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.6849757673667205, | |
| "grad_norm": 0.04376392439007759, | |
| "learning_rate": 1.9950394588500567e-05, | |
| "loss": 0.0603, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.6882067851373183, | |
| "grad_norm": 0.049395546317100525, | |
| "learning_rate": 1.994588500563698e-05, | |
| "loss": 0.071, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.691437802907916, | |
| "grad_norm": 0.061137910932302475, | |
| "learning_rate": 1.9941375422773395e-05, | |
| "loss": 0.0995, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.6946688206785138, | |
| "grad_norm": 0.05603012442588806, | |
| "learning_rate": 1.993686583990981e-05, | |
| "loss": 0.0787, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.6978998384491115, | |
| "grad_norm": 0.06130882352590561, | |
| "learning_rate": 1.9932356257046224e-05, | |
| "loss": 0.0745, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.7011308562197092, | |
| "grad_norm": 0.06154336780309677, | |
| "learning_rate": 1.9927846674182642e-05, | |
| "loss": 0.0707, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.7043618739903069, | |
| "grad_norm": 0.06344747543334961, | |
| "learning_rate": 1.9923337091319056e-05, | |
| "loss": 0.0978, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.7075928917609047, | |
| "grad_norm": 0.06630375236272812, | |
| "learning_rate": 1.991882750845547e-05, | |
| "loss": 0.0721, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.7108239095315024, | |
| "grad_norm": 0.04976838827133179, | |
| "learning_rate": 1.9914317925591885e-05, | |
| "loss": 0.0595, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7140549273021002, | |
| "grad_norm": 0.05968477949500084, | |
| "learning_rate": 1.99098083427283e-05, | |
| "loss": 0.0792, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.7172859450726979, | |
| "grad_norm": 0.0983344092965126, | |
| "learning_rate": 1.9905298759864714e-05, | |
| "loss": 0.0872, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.7205169628432956, | |
| "grad_norm": 0.08061773329973221, | |
| "learning_rate": 1.990078917700113e-05, | |
| "loss": 0.0824, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.7237479806138933, | |
| "grad_norm": 0.055074963718652725, | |
| "learning_rate": 1.9896279594137543e-05, | |
| "loss": 0.0677, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.7269789983844911, | |
| "grad_norm": 0.06062469258904457, | |
| "learning_rate": 1.9891770011273957e-05, | |
| "loss": 0.0658, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.7269789983844911, | |
| "eval_loss": 0.08582010865211487, | |
| "eval_runtime": 188.2111, | |
| "eval_samples_per_second": 1.047, | |
| "eval_steps_per_second": 1.047, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.7302100161550888, | |
| "grad_norm": 0.07606152445077896, | |
| "learning_rate": 1.988726042841037e-05, | |
| "loss": 0.1078, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.7334410339256866, | |
| "grad_norm": 0.06171920895576477, | |
| "learning_rate": 1.988275084554679e-05, | |
| "loss": 0.086, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.7366720516962844, | |
| "grad_norm": 0.03934045881032944, | |
| "learning_rate": 1.9878241262683204e-05, | |
| "loss": 0.0516, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.7399030694668821, | |
| "grad_norm": 0.0552021786570549, | |
| "learning_rate": 1.9873731679819618e-05, | |
| "loss": 0.0761, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.7431340872374798, | |
| "grad_norm": 0.05151893198490143, | |
| "learning_rate": 1.9869222096956032e-05, | |
| "loss": 0.0726, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.7463651050080775, | |
| "grad_norm": 0.0533306822180748, | |
| "learning_rate": 1.9864712514092447e-05, | |
| "loss": 0.0717, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.7495961227786753, | |
| "grad_norm": 0.052841685712337494, | |
| "learning_rate": 1.986020293122886e-05, | |
| "loss": 0.0755, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.752827140549273, | |
| "grad_norm": 0.040998924523591995, | |
| "learning_rate": 1.9855693348365276e-05, | |
| "loss": 0.0554, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.7560581583198708, | |
| "grad_norm": 0.057859815657138824, | |
| "learning_rate": 1.9851183765501693e-05, | |
| "loss": 0.0719, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.7592891760904685, | |
| "grad_norm": 0.04167502373456955, | |
| "learning_rate": 1.9846674182638108e-05, | |
| "loss": 0.0625, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.7625201938610663, | |
| "grad_norm": 0.058570049703121185, | |
| "learning_rate": 1.9842164599774522e-05, | |
| "loss": 0.0773, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.7657512116316639, | |
| "grad_norm": 0.06181475892663002, | |
| "learning_rate": 1.9837655016910937e-05, | |
| "loss": 0.083, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.7689822294022617, | |
| "grad_norm": 0.06188640370965004, | |
| "learning_rate": 1.983314543404735e-05, | |
| "loss": 0.0838, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.7722132471728594, | |
| "grad_norm": 0.0784875750541687, | |
| "learning_rate": 1.9828635851183765e-05, | |
| "loss": 0.1032, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.7754442649434572, | |
| "grad_norm": 0.06791771203279495, | |
| "learning_rate": 1.9824126268320183e-05, | |
| "loss": 0.0888, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.778675282714055, | |
| "grad_norm": 0.06790699809789658, | |
| "learning_rate": 1.9819616685456598e-05, | |
| "loss": 0.0919, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.7819063004846527, | |
| "grad_norm": 0.04812704026699066, | |
| "learning_rate": 1.9815107102593012e-05, | |
| "loss": 0.058, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.7851373182552503, | |
| "grad_norm": 0.0617465041577816, | |
| "learning_rate": 1.9810597519729426e-05, | |
| "loss": 0.0852, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.7883683360258481, | |
| "grad_norm": 0.049844078719615936, | |
| "learning_rate": 1.980608793686584e-05, | |
| "loss": 0.0668, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.7915993537964459, | |
| "grad_norm": 0.0725836232304573, | |
| "learning_rate": 1.9801578354002255e-05, | |
| "loss": 0.0929, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.7948303715670436, | |
| "grad_norm": 0.0587320439517498, | |
| "learning_rate": 1.9797068771138673e-05, | |
| "loss": 0.0764, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.7980613893376414, | |
| "grad_norm": 0.04824487864971161, | |
| "learning_rate": 1.9792559188275087e-05, | |
| "loss": 0.0562, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.8012924071082391, | |
| "grad_norm": 0.06668104231357574, | |
| "learning_rate": 1.97880496054115e-05, | |
| "loss": 0.0842, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.8045234248788369, | |
| "grad_norm": 0.057721976190805435, | |
| "learning_rate": 1.9783540022547916e-05, | |
| "loss": 0.0838, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.8077544426494345, | |
| "grad_norm": 0.07014774531126022, | |
| "learning_rate": 1.977903043968433e-05, | |
| "loss": 0.0831, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.8109854604200323, | |
| "grad_norm": 0.0693356841802597, | |
| "learning_rate": 1.9774520856820745e-05, | |
| "loss": 0.0903, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.81421647819063, | |
| "grad_norm": 0.05464401841163635, | |
| "learning_rate": 1.9770011273957163e-05, | |
| "loss": 0.0682, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.8174474959612278, | |
| "grad_norm": 0.053677983582019806, | |
| "learning_rate": 1.9765501691093577e-05, | |
| "loss": 0.0588, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.8206785137318255, | |
| "grad_norm": 0.04152385890483856, | |
| "learning_rate": 1.976099210822999e-05, | |
| "loss": 0.0564, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.8239095315024233, | |
| "grad_norm": 0.06150719150900841, | |
| "learning_rate": 1.9756482525366406e-05, | |
| "loss": 0.0694, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.827140549273021, | |
| "grad_norm": 0.05433864891529083, | |
| "learning_rate": 1.975197294250282e-05, | |
| "loss": 0.067, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.8303715670436187, | |
| "grad_norm": 0.04325372353196144, | |
| "learning_rate": 1.9747463359639235e-05, | |
| "loss": 0.0575, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.8336025848142165, | |
| "grad_norm": 0.049097690731287, | |
| "learning_rate": 1.9742953776775652e-05, | |
| "loss": 0.0571, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.8368336025848142, | |
| "grad_norm": 0.06370379030704498, | |
| "learning_rate": 1.9738444193912067e-05, | |
| "loss": 0.0877, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.840064620355412, | |
| "grad_norm": 0.05573710799217224, | |
| "learning_rate": 1.973393461104848e-05, | |
| "loss": 0.0759, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.8432956381260097, | |
| "grad_norm": 0.06537079066038132, | |
| "learning_rate": 1.9729425028184896e-05, | |
| "loss": 0.0759, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.8465266558966075, | |
| "grad_norm": 0.04301934316754341, | |
| "learning_rate": 1.972491544532131e-05, | |
| "loss": 0.0564, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.8497576736672051, | |
| "grad_norm": 0.07281677424907684, | |
| "learning_rate": 1.9720405862457724e-05, | |
| "loss": 0.0685, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.8529886914378029, | |
| "grad_norm": 0.06059825047850609, | |
| "learning_rate": 1.971589627959414e-05, | |
| "loss": 0.071, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.8562197092084006, | |
| "grad_norm": 0.05605108663439751, | |
| "learning_rate": 1.9711386696730553e-05, | |
| "loss": 0.0634, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.8594507269789984, | |
| "grad_norm": 0.07372546941041946, | |
| "learning_rate": 1.9706877113866967e-05, | |
| "loss": 0.0916, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.8626817447495961, | |
| "grad_norm": 0.051352906972169876, | |
| "learning_rate": 1.9702367531003382e-05, | |
| "loss": 0.0653, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.8659127625201939, | |
| "grad_norm": 0.059334397315979004, | |
| "learning_rate": 1.96978579481398e-05, | |
| "loss": 0.0717, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.8691437802907916, | |
| "grad_norm": 0.06220857426524162, | |
| "learning_rate": 1.9693348365276214e-05, | |
| "loss": 0.0612, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.8723747980613893, | |
| "grad_norm": 0.053203944116830826, | |
| "learning_rate": 1.968883878241263e-05, | |
| "loss": 0.0699, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.875605815831987, | |
| "grad_norm": 0.06943807750940323, | |
| "learning_rate": 1.9684329199549043e-05, | |
| "loss": 0.0793, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.8788368336025848, | |
| "grad_norm": 0.07023902982473373, | |
| "learning_rate": 1.9679819616685457e-05, | |
| "loss": 0.0859, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.8820678513731826, | |
| "grad_norm": 0.06727661192417145, | |
| "learning_rate": 1.967531003382187e-05, | |
| "loss": 0.0796, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.8852988691437803, | |
| "grad_norm": 0.08004336804151535, | |
| "learning_rate": 1.9670800450958286e-05, | |
| "loss": 0.0989, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.8885298869143781, | |
| "grad_norm": 0.06687738746404648, | |
| "learning_rate": 1.96662908680947e-05, | |
| "loss": 0.0674, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.8917609046849758, | |
| "grad_norm": 0.06280867755413055, | |
| "learning_rate": 1.9661781285231118e-05, | |
| "loss": 0.0735, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.8949919224555735, | |
| "grad_norm": 0.06883740425109863, | |
| "learning_rate": 1.9657271702367533e-05, | |
| "loss": 0.0893, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.8982229402261712, | |
| "grad_norm": 0.059292376041412354, | |
| "learning_rate": 1.9652762119503947e-05, | |
| "loss": 0.0819, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.901453957996769, | |
| "grad_norm": 0.0578530877828598, | |
| "learning_rate": 1.964825253664036e-05, | |
| "loss": 0.0724, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.9046849757673667, | |
| "grad_norm": 0.08932427316904068, | |
| "learning_rate": 1.9643742953776776e-05, | |
| "loss": 0.0939, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.9079159935379645, | |
| "grad_norm": 0.07406419515609741, | |
| "learning_rate": 1.963923337091319e-05, | |
| "loss": 0.0868, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.9111470113085622, | |
| "grad_norm": 0.05354011803865433, | |
| "learning_rate": 1.9634723788049608e-05, | |
| "loss": 0.0685, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.9143780290791599, | |
| "grad_norm": 0.06414072960615158, | |
| "learning_rate": 1.9630214205186022e-05, | |
| "loss": 0.0686, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.9176090468497576, | |
| "grad_norm": 0.058192793279886246, | |
| "learning_rate": 1.9625704622322437e-05, | |
| "loss": 0.0674, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.9208400646203554, | |
| "grad_norm": 0.10264746099710464, | |
| "learning_rate": 1.962119503945885e-05, | |
| "loss": 0.1152, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.9240710823909531, | |
| "grad_norm": 0.066757932305336, | |
| "learning_rate": 1.9616685456595265e-05, | |
| "loss": 0.0757, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.9273021001615509, | |
| "grad_norm": 0.06598404794931412, | |
| "learning_rate": 1.9612175873731683e-05, | |
| "loss": 0.0768, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.9305331179321487, | |
| "grad_norm": 0.07162454724311829, | |
| "learning_rate": 1.9607666290868098e-05, | |
| "loss": 0.081, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.9337641357027464, | |
| "grad_norm": 0.05917588993906975, | |
| "learning_rate": 1.9603156708004512e-05, | |
| "loss": 0.0645, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.9369951534733441, | |
| "grad_norm": 0.06051475182175636, | |
| "learning_rate": 1.9598647125140926e-05, | |
| "loss": 0.0656, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.9402261712439418, | |
| "grad_norm": 0.06452775001525879, | |
| "learning_rate": 1.959413754227734e-05, | |
| "loss": 0.0704, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.9434571890145396, | |
| "grad_norm": 0.06445769965648651, | |
| "learning_rate": 1.9589627959413755e-05, | |
| "loss": 0.0759, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.9466882067851373, | |
| "grad_norm": 0.06948834657669067, | |
| "learning_rate": 1.9585118376550173e-05, | |
| "loss": 0.0776, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.9499192245557351, | |
| "grad_norm": 0.05026319995522499, | |
| "learning_rate": 1.9580608793686587e-05, | |
| "loss": 0.0574, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.9531502423263328, | |
| "grad_norm": 0.08733383566141129, | |
| "learning_rate": 1.9576099210823002e-05, | |
| "loss": 0.0898, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.9563812600969306, | |
| "grad_norm": 0.05138668045401573, | |
| "learning_rate": 1.9571589627959416e-05, | |
| "loss": 0.0525, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.9596122778675282, | |
| "grad_norm": 0.0710444375872612, | |
| "learning_rate": 1.956708004509583e-05, | |
| "loss": 0.0832, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.962843295638126, | |
| "grad_norm": 0.06288463622331619, | |
| "learning_rate": 1.9562570462232245e-05, | |
| "loss": 0.0673, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.9660743134087237, | |
| "grad_norm": 0.05722356587648392, | |
| "learning_rate": 1.9558060879368663e-05, | |
| "loss": 0.0702, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.9693053311793215, | |
| "grad_norm": 0.07167758047580719, | |
| "learning_rate": 1.9553551296505077e-05, | |
| "loss": 0.0809, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9693053311793215, | |
| "eval_loss": 0.08212888240814209, | |
| "eval_runtime": 188.3411, | |
| "eval_samples_per_second": 1.046, | |
| "eval_steps_per_second": 1.046, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.9725363489499192, | |
| "grad_norm": 0.04536513611674309, | |
| "learning_rate": 1.954904171364149e-05, | |
| "loss": 0.0485, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.975767366720517, | |
| "grad_norm": 0.07035136222839355, | |
| "learning_rate": 1.9544532130777906e-05, | |
| "loss": 0.0763, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.9789983844911146, | |
| "grad_norm": 0.06417107582092285, | |
| "learning_rate": 1.954002254791432e-05, | |
| "loss": 0.0735, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.9822294022617124, | |
| "grad_norm": 0.06369137018918991, | |
| "learning_rate": 1.9535512965050735e-05, | |
| "loss": 0.07, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.9854604200323102, | |
| "grad_norm": 0.053664304316043854, | |
| "learning_rate": 1.953100338218715e-05, | |
| "loss": 0.058, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.9886914378029079, | |
| "grad_norm": 0.07393426448106766, | |
| "learning_rate": 1.9526493799323563e-05, | |
| "loss": 0.0783, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.9919224555735057, | |
| "grad_norm": 0.05141662806272507, | |
| "learning_rate": 1.9521984216459978e-05, | |
| "loss": 0.0556, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.9951534733441034, | |
| "grad_norm": 0.06411275267601013, | |
| "learning_rate": 1.9517474633596392e-05, | |
| "loss": 0.0687, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.9983844911147012, | |
| "grad_norm": 0.08877477794885635, | |
| "learning_rate": 1.951296505073281e-05, | |
| "loss": 0.0846, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.08283062279224396, | |
| "learning_rate": 1.9508455467869224e-05, | |
| "loss": 0.0541, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.0032310177705976, | |
| "grad_norm": 0.06769707798957825, | |
| "learning_rate": 1.950394588500564e-05, | |
| "loss": 0.0752, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.0064620355411955, | |
| "grad_norm": 0.06125921383500099, | |
| "learning_rate": 1.9499436302142053e-05, | |
| "loss": 0.0719, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.0096930533117932, | |
| "grad_norm": 0.03994071111083031, | |
| "learning_rate": 1.9494926719278468e-05, | |
| "loss": 0.0421, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.012924071082391, | |
| "grad_norm": 0.05433064326643944, | |
| "learning_rate": 1.9490417136414882e-05, | |
| "loss": 0.066, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.0161550888529887, | |
| "grad_norm": 0.06107380986213684, | |
| "learning_rate": 1.9485907553551296e-05, | |
| "loss": 0.0724, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.0193861066235865, | |
| "grad_norm": 0.0669042244553566, | |
| "learning_rate": 1.948139797068771e-05, | |
| "loss": 0.0772, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.0226171243941842, | |
| "grad_norm": 0.0474565327167511, | |
| "learning_rate": 1.947688838782413e-05, | |
| "loss": 0.0491, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.0258481421647818, | |
| "grad_norm": 0.054098691791296005, | |
| "learning_rate": 1.9472378804960543e-05, | |
| "loss": 0.0618, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.0290791599353797, | |
| "grad_norm": 0.06151336431503296, | |
| "learning_rate": 1.9467869222096957e-05, | |
| "loss": 0.0604, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.0323101777059773, | |
| "grad_norm": 0.051618464291095734, | |
| "learning_rate": 1.946335963923337e-05, | |
| "loss": 0.0551, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.0355411954765752, | |
| "grad_norm": 0.08121399581432343, | |
| "learning_rate": 1.9458850056369786e-05, | |
| "loss": 0.0939, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.0387722132471728, | |
| "grad_norm": 0.05889379233121872, | |
| "learning_rate": 1.94543404735062e-05, | |
| "loss": 0.0687, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.0420032310177707, | |
| "grad_norm": 0.06208242103457451, | |
| "learning_rate": 1.9449830890642618e-05, | |
| "loss": 0.0744, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.0452342487883683, | |
| "grad_norm": 0.06454786658287048, | |
| "learning_rate": 1.9445321307779033e-05, | |
| "loss": 0.0684, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.048465266558966, | |
| "grad_norm": 0.07085470855236053, | |
| "learning_rate": 1.9440811724915447e-05, | |
| "loss": 0.0727, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.0516962843295639, | |
| "grad_norm": 0.07236117869615555, | |
| "learning_rate": 1.943630214205186e-05, | |
| "loss": 0.0892, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.0549273021001615, | |
| "grad_norm": 0.054056137800216675, | |
| "learning_rate": 1.9431792559188276e-05, | |
| "loss": 0.0634, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.0581583198707594, | |
| "grad_norm": 0.05462612211704254, | |
| "learning_rate": 1.942728297632469e-05, | |
| "loss": 0.0631, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.061389337641357, | |
| "grad_norm": 0.0674949586391449, | |
| "learning_rate": 1.9422773393461108e-05, | |
| "loss": 0.0641, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.0646203554119547, | |
| "grad_norm": 0.07532529532909393, | |
| "learning_rate": 1.9418263810597522e-05, | |
| "loss": 0.0765, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.0678513731825525, | |
| "grad_norm": 0.06264142692089081, | |
| "learning_rate": 1.9413754227733937e-05, | |
| "loss": 0.0704, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.0710823909531502, | |
| "grad_norm": 0.06789285689592361, | |
| "learning_rate": 1.940924464487035e-05, | |
| "loss": 0.0667, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.074313408723748, | |
| "grad_norm": 0.06181450933218002, | |
| "learning_rate": 1.9404735062006765e-05, | |
| "loss": 0.0648, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.0775444264943457, | |
| "grad_norm": 0.07014179229736328, | |
| "learning_rate": 1.940022547914318e-05, | |
| "loss": 0.076, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.0807754442649435, | |
| "grad_norm": 0.07433414459228516, | |
| "learning_rate": 1.9395715896279598e-05, | |
| "loss": 0.0757, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.0840064620355412, | |
| "grad_norm": 0.04758503660559654, | |
| "learning_rate": 1.9391206313416012e-05, | |
| "loss": 0.0492, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.0872374798061388, | |
| "grad_norm": 0.06751306354999542, | |
| "learning_rate": 1.9386696730552426e-05, | |
| "loss": 0.0682, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.0904684975767367, | |
| "grad_norm": 0.06028216332197189, | |
| "learning_rate": 1.938218714768884e-05, | |
| "loss": 0.059, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.0936995153473343, | |
| "grad_norm": 0.060358040034770966, | |
| "learning_rate": 1.9377677564825255e-05, | |
| "loss": 0.0659, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.0969305331179322, | |
| "grad_norm": 0.06687436252832413, | |
| "learning_rate": 1.937316798196167e-05, | |
| "loss": 0.0517, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.1001615508885298, | |
| "grad_norm": 0.07463373243808746, | |
| "learning_rate": 1.9368658399098087e-05, | |
| "loss": 0.0674, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.1033925686591277, | |
| "grad_norm": 0.06248531863093376, | |
| "learning_rate": 1.9364148816234502e-05, | |
| "loss": 0.0681, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.1066235864297254, | |
| "grad_norm": 0.06864578276872635, | |
| "learning_rate": 1.9359639233370916e-05, | |
| "loss": 0.0703, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.109854604200323, | |
| "grad_norm": 0.0693066269159317, | |
| "learning_rate": 1.935512965050733e-05, | |
| "loss": 0.0599, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.1130856219709209, | |
| "grad_norm": 0.13610310852527618, | |
| "learning_rate": 1.9350620067643745e-05, | |
| "loss": 0.0789, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.1163166397415185, | |
| "grad_norm": 0.09487364441156387, | |
| "learning_rate": 1.934611048478016e-05, | |
| "loss": 0.094, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.1195476575121164, | |
| "grad_norm": 0.0767926499247551, | |
| "learning_rate": 1.9341600901916574e-05, | |
| "loss": 0.0751, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.122778675282714, | |
| "grad_norm": 0.1105605959892273, | |
| "learning_rate": 1.9337091319052988e-05, | |
| "loss": 0.0908, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.1260096930533119, | |
| "grad_norm": 0.06821838766336441, | |
| "learning_rate": 1.9332581736189403e-05, | |
| "loss": 0.0702, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.1292407108239095, | |
| "grad_norm": 0.07123742997646332, | |
| "learning_rate": 1.932807215332582e-05, | |
| "loss": 0.0637, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.1324717285945072, | |
| "grad_norm": 0.08340942859649658, | |
| "learning_rate": 1.9323562570462235e-05, | |
| "loss": 0.0803, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.135702746365105, | |
| "grad_norm": 0.06730187684297562, | |
| "learning_rate": 1.931905298759865e-05, | |
| "loss": 0.0644, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.1389337641357027, | |
| "grad_norm": 0.06728731095790863, | |
| "learning_rate": 1.9314543404735063e-05, | |
| "loss": 0.0633, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.1421647819063006, | |
| "grad_norm": 0.07192697376012802, | |
| "learning_rate": 1.9310033821871478e-05, | |
| "loss": 0.0664, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.1453957996768982, | |
| "grad_norm": 0.07150010764598846, | |
| "learning_rate": 1.9305524239007892e-05, | |
| "loss": 0.0745, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.148626817447496, | |
| "grad_norm": 0.05815986543893814, | |
| "learning_rate": 1.9301014656144307e-05, | |
| "loss": 0.0501, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.1518578352180937, | |
| "grad_norm": 0.063558429479599, | |
| "learning_rate": 1.929650507328072e-05, | |
| "loss": 0.064, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.1550888529886914, | |
| "grad_norm": 0.08062389492988586, | |
| "learning_rate": 1.9291995490417135e-05, | |
| "loss": 0.0792, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.1583198707592892, | |
| "grad_norm": 0.06872212886810303, | |
| "learning_rate": 1.9287485907553553e-05, | |
| "loss": 0.0756, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.1615508885298869, | |
| "grad_norm": 0.06003013253211975, | |
| "learning_rate": 1.9282976324689968e-05, | |
| "loss": 0.0577, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.1647819063004847, | |
| "grad_norm": 0.07533125579357147, | |
| "learning_rate": 1.9278466741826382e-05, | |
| "loss": 0.0745, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.1680129240710824, | |
| "grad_norm": 0.0708516389131546, | |
| "learning_rate": 1.9273957158962796e-05, | |
| "loss": 0.0671, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.1712439418416802, | |
| "grad_norm": 0.10226985812187195, | |
| "learning_rate": 1.926944757609921e-05, | |
| "loss": 0.1126, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.1744749596122779, | |
| "grad_norm": 0.067733995616436, | |
| "learning_rate": 1.9264937993235625e-05, | |
| "loss": 0.0554, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 1.1777059773828755, | |
| "grad_norm": 0.08708222955465317, | |
| "learning_rate": 1.9260428410372043e-05, | |
| "loss": 0.0806, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.1809369951534734, | |
| "grad_norm": 0.06153462454676628, | |
| "learning_rate": 1.9255918827508457e-05, | |
| "loss": 0.0532, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 1.184168012924071, | |
| "grad_norm": 0.051941219717264175, | |
| "learning_rate": 1.9251409244644872e-05, | |
| "loss": 0.0503, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 1.187399030694669, | |
| "grad_norm": 0.09817774593830109, | |
| "learning_rate": 1.9246899661781286e-05, | |
| "loss": 0.0801, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 1.1906300484652665, | |
| "grad_norm": 0.08504205197095871, | |
| "learning_rate": 1.92423900789177e-05, | |
| "loss": 0.08, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 1.1938610662358644, | |
| "grad_norm": 0.0611301064491272, | |
| "learning_rate": 1.9237880496054118e-05, | |
| "loss": 0.061, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.197092084006462, | |
| "grad_norm": 0.06038827449083328, | |
| "learning_rate": 1.9233370913190533e-05, | |
| "loss": 0.0512, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 1.2003231017770597, | |
| "grad_norm": 0.08283468335866928, | |
| "learning_rate": 1.9228861330326947e-05, | |
| "loss": 0.0753, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 1.2035541195476576, | |
| "grad_norm": 0.09547346830368042, | |
| "learning_rate": 1.922435174746336e-05, | |
| "loss": 0.0809, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 1.2067851373182552, | |
| "grad_norm": 0.058611780405044556, | |
| "learning_rate": 1.9219842164599776e-05, | |
| "loss": 0.0563, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 1.210016155088853, | |
| "grad_norm": 0.08549389988183975, | |
| "learning_rate": 1.921533258173619e-05, | |
| "loss": 0.0733, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.210016155088853, | |
| "eval_loss": 0.0792868584394455, | |
| "eval_runtime": 188.2838, | |
| "eval_samples_per_second": 1.046, | |
| "eval_steps_per_second": 1.046, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.2132471728594507, | |
| "grad_norm": 0.08417635411024094, | |
| "learning_rate": 1.9210822998872608e-05, | |
| "loss": 0.0841, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 1.2164781906300486, | |
| "grad_norm": 0.08157463371753693, | |
| "learning_rate": 1.9206313416009022e-05, | |
| "loss": 0.0714, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 1.2197092084006462, | |
| "grad_norm": 0.05649822950363159, | |
| "learning_rate": 1.9201803833145437e-05, | |
| "loss": 0.0503, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 1.2229402261712439, | |
| "grad_norm": 0.07617928087711334, | |
| "learning_rate": 1.919729425028185e-05, | |
| "loss": 0.0727, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 1.2261712439418417, | |
| "grad_norm": 0.0574098639190197, | |
| "learning_rate": 1.9192784667418266e-05, | |
| "loss": 0.0506, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.2294022617124394, | |
| "grad_norm": 0.07354257255792618, | |
| "learning_rate": 1.918827508455468e-05, | |
| "loss": 0.0728, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 1.2326332794830372, | |
| "grad_norm": 0.07268121838569641, | |
| "learning_rate": 1.9183765501691098e-05, | |
| "loss": 0.0679, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 1.235864297253635, | |
| "grad_norm": 0.07641527056694031, | |
| "learning_rate": 1.9179255918827512e-05, | |
| "loss": 0.0663, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 1.2390953150242328, | |
| "grad_norm": 0.059996772557497025, | |
| "learning_rate": 1.9174746335963926e-05, | |
| "loss": 0.0523, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 1.2423263327948304, | |
| "grad_norm": 0.07397306710481644, | |
| "learning_rate": 1.917023675310034e-05, | |
| "loss": 0.0662, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.245557350565428, | |
| "grad_norm": 0.09324625134468079, | |
| "learning_rate": 1.9165727170236755e-05, | |
| "loss": 0.083, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 1.248788368336026, | |
| "grad_norm": 0.08019818365573883, | |
| "learning_rate": 1.916121758737317e-05, | |
| "loss": 0.0682, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 1.2520193861066236, | |
| "grad_norm": 0.08203406631946564, | |
| "learning_rate": 1.9156708004509584e-05, | |
| "loss": 0.0788, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 1.2552504038772212, | |
| "grad_norm": 0.07293461263179779, | |
| "learning_rate": 1.9152198421646e-05, | |
| "loss": 0.0583, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 1.258481421647819, | |
| "grad_norm": 0.07020010054111481, | |
| "learning_rate": 1.9147688838782413e-05, | |
| "loss": 0.0546, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.261712439418417, | |
| "grad_norm": 0.0655217245221138, | |
| "learning_rate": 1.914317925591883e-05, | |
| "loss": 0.0561, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 1.2649434571890146, | |
| "grad_norm": 0.0773930773139, | |
| "learning_rate": 1.9138669673055245e-05, | |
| "loss": 0.0805, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 1.2681744749596122, | |
| "grad_norm": 0.06243716925382614, | |
| "learning_rate": 1.913416009019166e-05, | |
| "loss": 0.0573, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 1.27140549273021, | |
| "grad_norm": 0.07922864705324173, | |
| "learning_rate": 1.9129650507328074e-05, | |
| "loss": 0.0679, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 1.2746365105008077, | |
| "grad_norm": 0.10133316367864609, | |
| "learning_rate": 1.9125140924464488e-05, | |
| "loss": 0.1135, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.2778675282714054, | |
| "grad_norm": 0.0727897360920906, | |
| "learning_rate": 1.9120631341600903e-05, | |
| "loss": 0.0658, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 1.2810985460420032, | |
| "grad_norm": 0.0690392330288887, | |
| "learning_rate": 1.9116121758737317e-05, | |
| "loss": 0.0656, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 1.284329563812601, | |
| "grad_norm": 0.062050607055425644, | |
| "learning_rate": 1.911161217587373e-05, | |
| "loss": 0.0542, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 1.2875605815831987, | |
| "grad_norm": 0.0690266340970993, | |
| "learning_rate": 1.9107102593010146e-05, | |
| "loss": 0.065, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 1.2907915993537964, | |
| "grad_norm": 0.07588627934455872, | |
| "learning_rate": 1.9102593010146564e-05, | |
| "loss": 0.0761, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.2940226171243943, | |
| "grad_norm": 0.07368933409452438, | |
| "learning_rate": 1.9098083427282978e-05, | |
| "loss": 0.0637, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 1.297253634894992, | |
| "grad_norm": 0.0670572817325592, | |
| "learning_rate": 1.9093573844419392e-05, | |
| "loss": 0.0656, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 1.3004846526655895, | |
| "grad_norm": 0.06778164952993393, | |
| "learning_rate": 1.9089064261555807e-05, | |
| "loss": 0.0577, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 1.3037156704361874, | |
| "grad_norm": 0.10589181631803513, | |
| "learning_rate": 1.908455467869222e-05, | |
| "loss": 0.0849, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 1.306946688206785, | |
| "grad_norm": 0.060739271342754364, | |
| "learning_rate": 1.9080045095828635e-05, | |
| "loss": 0.0563, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.310177705977383, | |
| "grad_norm": 0.062488917261362076, | |
| "learning_rate": 1.9075535512965053e-05, | |
| "loss": 0.0555, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 1.3134087237479806, | |
| "grad_norm": 0.08088962733745575, | |
| "learning_rate": 1.9071025930101468e-05, | |
| "loss": 0.0683, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 1.3166397415185784, | |
| "grad_norm": 0.07679299265146255, | |
| "learning_rate": 1.9066516347237882e-05, | |
| "loss": 0.0713, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 1.319870759289176, | |
| "grad_norm": 0.09731165319681168, | |
| "learning_rate": 1.9062006764374296e-05, | |
| "loss": 0.094, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 1.3231017770597737, | |
| "grad_norm": 0.09488274902105331, | |
| "learning_rate": 1.905749718151071e-05, | |
| "loss": 0.0855, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.3263327948303716, | |
| "grad_norm": 0.08556380867958069, | |
| "learning_rate": 1.9052987598647125e-05, | |
| "loss": 0.0872, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 1.3295638126009692, | |
| "grad_norm": 0.0821579322218895, | |
| "learning_rate": 1.9048478015783543e-05, | |
| "loss": 0.0802, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 1.332794830371567, | |
| "grad_norm": 0.08073097467422485, | |
| "learning_rate": 1.9043968432919957e-05, | |
| "loss": 0.0764, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 1.3360258481421647, | |
| "grad_norm": 0.07449216395616531, | |
| "learning_rate": 1.9039458850056372e-05, | |
| "loss": 0.0681, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 1.3392568659127626, | |
| "grad_norm": 0.05690048635005951, | |
| "learning_rate": 1.9034949267192786e-05, | |
| "loss": 0.0479, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.3424878836833603, | |
| "grad_norm": 0.08106525242328644, | |
| "learning_rate": 1.90304396843292e-05, | |
| "loss": 0.0751, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 1.345718901453958, | |
| "grad_norm": 0.06518511474132538, | |
| "learning_rate": 1.9025930101465615e-05, | |
| "loss": 0.0594, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 1.3489499192245558, | |
| "grad_norm": 0.08395849913358688, | |
| "learning_rate": 1.9021420518602033e-05, | |
| "loss": 0.0756, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 1.3521809369951534, | |
| "grad_norm": 0.05958770960569382, | |
| "learning_rate": 1.9016910935738447e-05, | |
| "loss": 0.0491, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 1.3554119547657513, | |
| "grad_norm": 0.07311136275529861, | |
| "learning_rate": 1.901240135287486e-05, | |
| "loss": 0.0719, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.358642972536349, | |
| "grad_norm": 0.09682740271091461, | |
| "learning_rate": 1.9007891770011276e-05, | |
| "loss": 0.0895, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 1.3618739903069468, | |
| "grad_norm": 0.06295045465230942, | |
| "learning_rate": 1.900338218714769e-05, | |
| "loss": 0.0589, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 1.3651050080775444, | |
| "grad_norm": 0.0831819698214531, | |
| "learning_rate": 1.8998872604284105e-05, | |
| "loss": 0.0727, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 1.368336025848142, | |
| "grad_norm": 0.06702585518360138, | |
| "learning_rate": 1.8994363021420522e-05, | |
| "loss": 0.0617, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 1.37156704361874, | |
| "grad_norm": 0.06618952006101608, | |
| "learning_rate": 1.8989853438556937e-05, | |
| "loss": 0.0517, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.3747980613893376, | |
| "grad_norm": 0.07830128818750381, | |
| "learning_rate": 1.898534385569335e-05, | |
| "loss": 0.0747, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 1.3780290791599354, | |
| "grad_norm": 0.07554402947425842, | |
| "learning_rate": 1.8980834272829766e-05, | |
| "loss": 0.078, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 1.381260096930533, | |
| "grad_norm": 0.07517927139997482, | |
| "learning_rate": 1.897632468996618e-05, | |
| "loss": 0.0715, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 1.384491114701131, | |
| "grad_norm": 0.05810945853590965, | |
| "learning_rate": 1.8971815107102594e-05, | |
| "loss": 0.0524, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 1.3877221324717286, | |
| "grad_norm": 0.1092490404844284, | |
| "learning_rate": 1.896730552423901e-05, | |
| "loss": 0.0987, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.3909531502423262, | |
| "grad_norm": 0.08325308561325073, | |
| "learning_rate": 1.8962795941375423e-05, | |
| "loss": 0.0746, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 1.394184168012924, | |
| "grad_norm": 0.08017408847808838, | |
| "learning_rate": 1.895828635851184e-05, | |
| "loss": 0.0676, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 1.3974151857835218, | |
| "grad_norm": 0.09756331145763397, | |
| "learning_rate": 1.8953776775648255e-05, | |
| "loss": 0.0788, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 1.4006462035541196, | |
| "grad_norm": 0.0654483512043953, | |
| "learning_rate": 1.894926719278467e-05, | |
| "loss": 0.0552, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 1.4038772213247173, | |
| "grad_norm": 0.07338982075452805, | |
| "learning_rate": 1.8944757609921084e-05, | |
| "loss": 0.0597, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.4071082390953151, | |
| "grad_norm": 0.06292750686407089, | |
| "learning_rate": 1.89402480270575e-05, | |
| "loss": 0.0482, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 1.4103392568659128, | |
| "grad_norm": 0.09405938535928726, | |
| "learning_rate": 1.8935738444193913e-05, | |
| "loss": 0.0795, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 1.4135702746365104, | |
| "grad_norm": 0.09486392885446548, | |
| "learning_rate": 1.8931228861330327e-05, | |
| "loss": 0.0811, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 1.4168012924071083, | |
| "grad_norm": 0.0729052945971489, | |
| "learning_rate": 1.892671927846674e-05, | |
| "loss": 0.0682, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 1.420032310177706, | |
| "grad_norm": 0.06790515035390854, | |
| "learning_rate": 1.8922209695603156e-05, | |
| "loss": 0.0541, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.4232633279483038, | |
| "grad_norm": 0.08173596858978271, | |
| "learning_rate": 1.891770011273957e-05, | |
| "loss": 0.0697, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 1.4264943457189014, | |
| "grad_norm": 0.0874050036072731, | |
| "learning_rate": 1.8913190529875988e-05, | |
| "loss": 0.0689, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 1.4297253634894993, | |
| "grad_norm": 0.07508452981710434, | |
| "learning_rate": 1.8908680947012403e-05, | |
| "loss": 0.069, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 1.432956381260097, | |
| "grad_norm": 0.09134234488010406, | |
| "learning_rate": 1.8904171364148817e-05, | |
| "loss": 0.072, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 1.4361873990306946, | |
| "grad_norm": 0.0830577090382576, | |
| "learning_rate": 1.889966178128523e-05, | |
| "loss": 0.0681, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.4394184168012925, | |
| "grad_norm": 0.0741642490029335, | |
| "learning_rate": 1.8895152198421646e-05, | |
| "loss": 0.0642, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 1.44264943457189, | |
| "grad_norm": 0.07305614650249481, | |
| "learning_rate": 1.889064261555806e-05, | |
| "loss": 0.0569, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 1.445880452342488, | |
| "grad_norm": 0.05348379164934158, | |
| "learning_rate": 1.8886133032694478e-05, | |
| "loss": 0.0434, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 1.4491114701130856, | |
| "grad_norm": 0.09780937433242798, | |
| "learning_rate": 1.8881623449830892e-05, | |
| "loss": 0.0856, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 1.4523424878836835, | |
| "grad_norm": 0.081721231341362, | |
| "learning_rate": 1.8877113866967307e-05, | |
| "loss": 0.0655, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.4523424878836835, | |
| "eval_loss": 0.0769171267747879, | |
| "eval_runtime": 188.2463, | |
| "eval_samples_per_second": 1.047, | |
| "eval_steps_per_second": 1.047, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.4555735056542811, | |
| "grad_norm": 0.06182597577571869, | |
| "learning_rate": 1.887260428410372e-05, | |
| "loss": 0.0549, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 1.4588045234248788, | |
| "grad_norm": 0.0831274464726448, | |
| "learning_rate": 1.8868094701240136e-05, | |
| "loss": 0.0664, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 1.4620355411954766, | |
| "grad_norm": 0.07277555763721466, | |
| "learning_rate": 1.8863585118376553e-05, | |
| "loss": 0.0619, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 1.4652665589660743, | |
| "grad_norm": 0.09069440513849258, | |
| "learning_rate": 1.8859075535512968e-05, | |
| "loss": 0.0717, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 1.468497576736672, | |
| "grad_norm": 0.08567981421947479, | |
| "learning_rate": 1.8854565952649382e-05, | |
| "loss": 0.0761, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.4717285945072698, | |
| "grad_norm": 0.08353572338819504, | |
| "learning_rate": 1.8850056369785796e-05, | |
| "loss": 0.0621, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 1.4749596122778676, | |
| "grad_norm": 0.06493799388408661, | |
| "learning_rate": 1.884554678692221e-05, | |
| "loss": 0.0549, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 1.4781906300484653, | |
| "grad_norm": 0.07239842414855957, | |
| "learning_rate": 1.8841037204058625e-05, | |
| "loss": 0.0574, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 1.481421647819063, | |
| "grad_norm": 0.1062210276722908, | |
| "learning_rate": 1.8836527621195043e-05, | |
| "loss": 0.0831, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 1.4846526655896608, | |
| "grad_norm": 0.06695660948753357, | |
| "learning_rate": 1.8832018038331457e-05, | |
| "loss": 0.0488, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.4878836833602584, | |
| "grad_norm": 0.08332875370979309, | |
| "learning_rate": 1.8827508455467872e-05, | |
| "loss": 0.0752, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 1.491114701130856, | |
| "grad_norm": 0.09285688400268555, | |
| "learning_rate": 1.8822998872604286e-05, | |
| "loss": 0.0811, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 1.494345718901454, | |
| "grad_norm": 0.07672538608312607, | |
| "learning_rate": 1.88184892897407e-05, | |
| "loss": 0.0565, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 1.4975767366720518, | |
| "grad_norm": 0.07295355945825577, | |
| "learning_rate": 1.8813979706877115e-05, | |
| "loss": 0.0615, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 1.5008077544426495, | |
| "grad_norm": 0.05997586250305176, | |
| "learning_rate": 1.8809470124013533e-05, | |
| "loss": 0.0494, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.504038772213247, | |
| "grad_norm": 0.08460883051156998, | |
| "learning_rate": 1.8804960541149947e-05, | |
| "loss": 0.0715, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 1.507269789983845, | |
| "grad_norm": 0.08083106577396393, | |
| "learning_rate": 1.880045095828636e-05, | |
| "loss": 0.0615, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 1.5105008077544426, | |
| "grad_norm": 0.09291260689496994, | |
| "learning_rate": 1.8795941375422776e-05, | |
| "loss": 0.0767, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 1.5137318255250403, | |
| "grad_norm": 0.0817233994603157, | |
| "learning_rate": 1.879143179255919e-05, | |
| "loss": 0.0728, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 1.5169628432956381, | |
| "grad_norm": 0.07894831895828247, | |
| "learning_rate": 1.8786922209695605e-05, | |
| "loss": 0.0697, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.520193861066236, | |
| "grad_norm": 0.05180181935429573, | |
| "learning_rate": 1.878241262683202e-05, | |
| "loss": 0.0407, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 1.5234248788368336, | |
| "grad_norm": 0.08214667439460754, | |
| "learning_rate": 1.8777903043968433e-05, | |
| "loss": 0.0673, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 1.5266558966074313, | |
| "grad_norm": 0.06972946226596832, | |
| "learning_rate": 1.877339346110485e-05, | |
| "loss": 0.0501, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 1.5298869143780292, | |
| "grad_norm": 0.08416459709405899, | |
| "learning_rate": 1.8768883878241266e-05, | |
| "loss": 0.0666, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 1.5331179321486268, | |
| "grad_norm": 0.07642164081335068, | |
| "learning_rate": 1.876437429537768e-05, | |
| "loss": 0.0592, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.5363489499192244, | |
| "grad_norm": 0.0762806385755539, | |
| "learning_rate": 1.8759864712514094e-05, | |
| "loss": 0.0573, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 1.5395799676898223, | |
| "grad_norm": 0.06152572110295296, | |
| "learning_rate": 1.875535512965051e-05, | |
| "loss": 0.0509, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 1.5428109854604202, | |
| "grad_norm": 0.08461987972259521, | |
| "learning_rate": 1.8750845546786923e-05, | |
| "loss": 0.0693, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 1.5460420032310178, | |
| "grad_norm": 0.06401054561138153, | |
| "learning_rate": 1.8746335963923338e-05, | |
| "loss": 0.0523, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 1.5492730210016155, | |
| "grad_norm": 0.07567861676216125, | |
| "learning_rate": 1.8741826381059752e-05, | |
| "loss": 0.0632, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.5525040387722133, | |
| "grad_norm": 0.07169700413942337, | |
| "learning_rate": 1.8737316798196166e-05, | |
| "loss": 0.0541, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 1.555735056542811, | |
| "grad_norm": 0.067410409450531, | |
| "learning_rate": 1.873280721533258e-05, | |
| "loss": 0.0576, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 1.5589660743134086, | |
| "grad_norm": 0.0794718787074089, | |
| "learning_rate": 1.8728297632469e-05, | |
| "loss": 0.0602, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 1.5621970920840065, | |
| "grad_norm": 0.09098870307207108, | |
| "learning_rate": 1.8723788049605413e-05, | |
| "loss": 0.0741, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 1.5654281098546043, | |
| "grad_norm": 0.07266968488693237, | |
| "learning_rate": 1.8719278466741827e-05, | |
| "loss": 0.0535, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.568659127625202, | |
| "grad_norm": 0.07994985580444336, | |
| "learning_rate": 1.8714768883878242e-05, | |
| "loss": 0.0642, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 1.5718901453957996, | |
| "grad_norm": 0.09563203901052475, | |
| "learning_rate": 1.8710259301014656e-05, | |
| "loss": 0.0738, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 1.5751211631663975, | |
| "grad_norm": 0.07337169349193573, | |
| "learning_rate": 1.870574971815107e-05, | |
| "loss": 0.0615, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 1.5783521809369951, | |
| "grad_norm": 0.08605758100748062, | |
| "learning_rate": 1.8701240135287488e-05, | |
| "loss": 0.0737, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 1.5815831987075928, | |
| "grad_norm": 0.08178628236055374, | |
| "learning_rate": 1.8696730552423903e-05, | |
| "loss": 0.0562, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.5848142164781907, | |
| "grad_norm": 0.08635883033275604, | |
| "learning_rate": 1.8692220969560317e-05, | |
| "loss": 0.0655, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 1.5880452342487885, | |
| "grad_norm": 0.10575321316719055, | |
| "learning_rate": 1.868771138669673e-05, | |
| "loss": 0.0857, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 1.5912762520193862, | |
| "grad_norm": 0.10067257285118103, | |
| "learning_rate": 1.8683201803833146e-05, | |
| "loss": 0.0817, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 1.5945072697899838, | |
| "grad_norm": 0.07644681632518768, | |
| "learning_rate": 1.867869222096956e-05, | |
| "loss": 0.0625, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 1.5977382875605817, | |
| "grad_norm": 0.07164619863033295, | |
| "learning_rate": 1.8674182638105978e-05, | |
| "loss": 0.0597, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.6009693053311793, | |
| "grad_norm": 0.07293085008859634, | |
| "learning_rate": 1.8669673055242392e-05, | |
| "loss": 0.0604, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 1.604200323101777, | |
| "grad_norm": 0.09480689465999603, | |
| "learning_rate": 1.8665163472378807e-05, | |
| "loss": 0.0737, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 1.6074313408723748, | |
| "grad_norm": 0.09798948466777802, | |
| "learning_rate": 1.866065388951522e-05, | |
| "loss": 0.0706, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 1.6106623586429727, | |
| "grad_norm": 0.08216292411088943, | |
| "learning_rate": 1.8656144306651636e-05, | |
| "loss": 0.0585, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 1.6138933764135701, | |
| "grad_norm": 0.10146701335906982, | |
| "learning_rate": 1.865163472378805e-05, | |
| "loss": 0.0631, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.617124394184168, | |
| "grad_norm": 0.07699297368526459, | |
| "learning_rate": 1.8647125140924468e-05, | |
| "loss": 0.0592, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 1.6203554119547658, | |
| "grad_norm": 0.07803017646074295, | |
| "learning_rate": 1.8642615558060882e-05, | |
| "loss": 0.063, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 1.6235864297253635, | |
| "grad_norm": 0.08820293843746185, | |
| "learning_rate": 1.8638105975197297e-05, | |
| "loss": 0.0733, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 1.6268174474959611, | |
| "grad_norm": 0.10102511942386627, | |
| "learning_rate": 1.863359639233371e-05, | |
| "loss": 0.0735, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 1.630048465266559, | |
| "grad_norm": 0.08669153600931168, | |
| "learning_rate": 1.8629086809470125e-05, | |
| "loss": 0.0757, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.6332794830371569, | |
| "grad_norm": 0.08120600879192352, | |
| "learning_rate": 1.862457722660654e-05, | |
| "loss": 0.0586, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 1.6365105008077543, | |
| "grad_norm": 0.06960420310497284, | |
| "learning_rate": 1.8620067643742957e-05, | |
| "loss": 0.0519, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 1.6397415185783522, | |
| "grad_norm": 0.08567452430725098, | |
| "learning_rate": 1.8615558060879372e-05, | |
| "loss": 0.0703, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 1.64297253634895, | |
| "grad_norm": 0.08288481831550598, | |
| "learning_rate": 1.8611048478015786e-05, | |
| "loss": 0.0624, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 1.6462035541195477, | |
| "grad_norm": 0.10185632109642029, | |
| "learning_rate": 1.86065388951522e-05, | |
| "loss": 0.072, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.6494345718901453, | |
| "grad_norm": 0.0914456769824028, | |
| "learning_rate": 1.8602029312288615e-05, | |
| "loss": 0.0683, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 1.6526655896607432, | |
| "grad_norm": 0.08204774558544159, | |
| "learning_rate": 1.859751972942503e-05, | |
| "loss": 0.0657, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 1.655896607431341, | |
| "grad_norm": 0.11823786050081253, | |
| "learning_rate": 1.8593010146561444e-05, | |
| "loss": 0.0872, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 1.6591276252019385, | |
| "grad_norm": 0.13115671277046204, | |
| "learning_rate": 1.858850056369786e-05, | |
| "loss": 0.1004, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 1.6623586429725363, | |
| "grad_norm": 0.09443841129541397, | |
| "learning_rate": 1.8583990980834276e-05, | |
| "loss": 0.0722, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.6655896607431342, | |
| "grad_norm": 0.06847009062767029, | |
| "learning_rate": 1.857948139797069e-05, | |
| "loss": 0.054, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 1.6688206785137318, | |
| "grad_norm": 0.07960178703069687, | |
| "learning_rate": 1.8574971815107105e-05, | |
| "loss": 0.065, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 1.6720516962843295, | |
| "grad_norm": 0.07255195826292038, | |
| "learning_rate": 1.857046223224352e-05, | |
| "loss": 0.0523, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 1.6752827140549273, | |
| "grad_norm": 0.08610787242650986, | |
| "learning_rate": 1.8565952649379934e-05, | |
| "loss": 0.0667, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 1.678513731825525, | |
| "grad_norm": 0.09422770887613297, | |
| "learning_rate": 1.8561443066516348e-05, | |
| "loss": 0.0772, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.6817447495961226, | |
| "grad_norm": 0.10716807097196579, | |
| "learning_rate": 1.8556933483652762e-05, | |
| "loss": 0.0825, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 1.6849757673667205, | |
| "grad_norm": 0.09894333779811859, | |
| "learning_rate": 1.8552423900789177e-05, | |
| "loss": 0.084, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 1.6882067851373184, | |
| "grad_norm": 0.08127731829881668, | |
| "learning_rate": 1.854791431792559e-05, | |
| "loss": 0.063, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 1.691437802907916, | |
| "grad_norm": 0.08127739280462265, | |
| "learning_rate": 1.854340473506201e-05, | |
| "loss": 0.0602, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 1.6946688206785137, | |
| "grad_norm": 0.08109954744577408, | |
| "learning_rate": 1.8538895152198423e-05, | |
| "loss": 0.0618, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.6946688206785137, | |
| "eval_loss": 0.07574764639139175, | |
| "eval_runtime": 188.1431, | |
| "eval_samples_per_second": 1.047, | |
| "eval_steps_per_second": 1.047, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.6978998384491115, | |
| "grad_norm": 0.07915576547384262, | |
| "learning_rate": 1.8534385569334838e-05, | |
| "loss": 0.0566, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 1.7011308562197092, | |
| "grad_norm": 0.09259936213493347, | |
| "learning_rate": 1.8529875986471252e-05, | |
| "loss": 0.0821, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 1.7043618739903068, | |
| "grad_norm": 0.06958405673503876, | |
| "learning_rate": 1.8525366403607666e-05, | |
| "loss": 0.0513, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 1.7075928917609047, | |
| "grad_norm": 0.11260278522968292, | |
| "learning_rate": 1.852085682074408e-05, | |
| "loss": 0.0791, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 1.7108239095315025, | |
| "grad_norm": 0.08565714955329895, | |
| "learning_rate": 1.8516347237880495e-05, | |
| "loss": 0.0704, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.7140549273021002, | |
| "grad_norm": 0.0977453961968422, | |
| "learning_rate": 1.8511837655016913e-05, | |
| "loss": 0.0666, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 1.7172859450726978, | |
| "grad_norm": 0.09589142352342606, | |
| "learning_rate": 1.8507328072153327e-05, | |
| "loss": 0.0678, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 1.7205169628432957, | |
| "grad_norm": 0.10372763872146606, | |
| "learning_rate": 1.8502818489289742e-05, | |
| "loss": 0.0755, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 1.7237479806138933, | |
| "grad_norm": 0.09707041829824448, | |
| "learning_rate": 1.8498308906426156e-05, | |
| "loss": 0.0759, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 1.726978998384491, | |
| "grad_norm": 0.07280156016349792, | |
| "learning_rate": 1.849379932356257e-05, | |
| "loss": 0.058, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.7302100161550888, | |
| "grad_norm": 0.08810850977897644, | |
| "learning_rate": 1.848928974069899e-05, | |
| "loss": 0.0691, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 1.7334410339256867, | |
| "grad_norm": 0.09844056516885757, | |
| "learning_rate": 1.8484780157835403e-05, | |
| "loss": 0.0682, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 1.7366720516962844, | |
| "grad_norm": 0.06963982433080673, | |
| "learning_rate": 1.8480270574971817e-05, | |
| "loss": 0.0513, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 1.739903069466882, | |
| "grad_norm": 0.08248520642518997, | |
| "learning_rate": 1.847576099210823e-05, | |
| "loss": 0.0611, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 1.7431340872374799, | |
| "grad_norm": 0.09553173929452896, | |
| "learning_rate": 1.8471251409244646e-05, | |
| "loss": 0.0754, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.7463651050080775, | |
| "grad_norm": 0.06919584423303604, | |
| "learning_rate": 1.846674182638106e-05, | |
| "loss": 0.0508, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 1.7495961227786752, | |
| "grad_norm": 0.07004183530807495, | |
| "learning_rate": 1.8462232243517478e-05, | |
| "loss": 0.0498, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 1.752827140549273, | |
| "grad_norm": 0.08570928126573563, | |
| "learning_rate": 1.8457722660653892e-05, | |
| "loss": 0.0631, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 1.7560581583198709, | |
| "grad_norm": 0.0732467994093895, | |
| "learning_rate": 1.8453213077790307e-05, | |
| "loss": 0.0557, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 1.7592891760904685, | |
| "grad_norm": 0.07687011361122131, | |
| "learning_rate": 1.844870349492672e-05, | |
| "loss": 0.0573, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.7625201938610662, | |
| "grad_norm": 0.07186026871204376, | |
| "learning_rate": 1.8444193912063136e-05, | |
| "loss": 0.0502, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 1.765751211631664, | |
| "grad_norm": 0.07176259905099869, | |
| "learning_rate": 1.843968432919955e-05, | |
| "loss": 0.0529, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 1.7689822294022617, | |
| "grad_norm": 0.0842595249414444, | |
| "learning_rate": 1.8435174746335968e-05, | |
| "loss": 0.0674, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 1.7722132471728593, | |
| "grad_norm": 0.07965710759162903, | |
| "learning_rate": 1.8430665163472382e-05, | |
| "loss": 0.0619, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 1.7754442649434572, | |
| "grad_norm": 0.08953316509723663, | |
| "learning_rate": 1.8426155580608797e-05, | |
| "loss": 0.0643, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.778675282714055, | |
| "grad_norm": 0.0928904190659523, | |
| "learning_rate": 1.842164599774521e-05, | |
| "loss": 0.0712, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 1.7819063004846527, | |
| "grad_norm": 0.08743231743574142, | |
| "learning_rate": 1.8417136414881625e-05, | |
| "loss": 0.0657, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 1.7851373182552503, | |
| "grad_norm": 0.07706678658723831, | |
| "learning_rate": 1.841262683201804e-05, | |
| "loss": 0.0568, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 1.7883683360258482, | |
| "grad_norm": 0.0831725150346756, | |
| "learning_rate": 1.8408117249154454e-05, | |
| "loss": 0.0578, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 1.7915993537964459, | |
| "grad_norm": 0.09395398199558258, | |
| "learning_rate": 1.8403607666290872e-05, | |
| "loss": 0.0715, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.7948303715670435, | |
| "grad_norm": 0.0942830964922905, | |
| "learning_rate": 1.8399098083427286e-05, | |
| "loss": 0.0634, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 1.7980613893376414, | |
| "grad_norm": 0.0980205312371254, | |
| "learning_rate": 1.83945885005637e-05, | |
| "loss": 0.0694, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 1.8012924071082392, | |
| "grad_norm": 0.10699216276407242, | |
| "learning_rate": 1.8390078917700115e-05, | |
| "loss": 0.0871, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 1.8045234248788369, | |
| "grad_norm": 0.09851755946874619, | |
| "learning_rate": 1.838556933483653e-05, | |
| "loss": 0.0644, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 1.8077544426494345, | |
| "grad_norm": 0.09926044940948486, | |
| "learning_rate": 1.8381059751972944e-05, | |
| "loss": 0.0629, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.8109854604200324, | |
| "grad_norm": 0.09520839154720306, | |
| "learning_rate": 1.8376550169109358e-05, | |
| "loss": 0.0607, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 1.81421647819063, | |
| "grad_norm": 0.06896607577800751, | |
| "learning_rate": 1.8372040586245773e-05, | |
| "loss": 0.0559, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 1.8174474959612277, | |
| "grad_norm": 0.09539300203323364, | |
| "learning_rate": 1.8367531003382187e-05, | |
| "loss": 0.0683, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 1.8206785137318255, | |
| "grad_norm": 0.10596197098493576, | |
| "learning_rate": 1.83630214205186e-05, | |
| "loss": 0.075, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 1.8239095315024234, | |
| "grad_norm": 0.13212427496910095, | |
| "learning_rate": 1.835851183765502e-05, | |
| "loss": 0.0755, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.827140549273021, | |
| "grad_norm": 0.12438125163316727, | |
| "learning_rate": 1.8354002254791434e-05, | |
| "loss": 0.0853, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 1.8303715670436187, | |
| "grad_norm": 0.06944366544485092, | |
| "learning_rate": 1.8349492671927848e-05, | |
| "loss": 0.0434, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 1.8336025848142166, | |
| "grad_norm": 0.10360438376665115, | |
| "learning_rate": 1.8344983089064262e-05, | |
| "loss": 0.0778, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 1.8368336025848142, | |
| "grad_norm": 0.1002860888838768, | |
| "learning_rate": 1.8340473506200677e-05, | |
| "loss": 0.0733, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 1.8400646203554119, | |
| "grad_norm": 0.10875017940998077, | |
| "learning_rate": 1.833596392333709e-05, | |
| "loss": 0.0781, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.8432956381260097, | |
| "grad_norm": 0.08669572323560715, | |
| "learning_rate": 1.8331454340473506e-05, | |
| "loss": 0.0587, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 1.8465266558966076, | |
| "grad_norm": 0.09304548799991608, | |
| "learning_rate": 1.8326944757609923e-05, | |
| "loss": 0.0675, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 1.849757673667205, | |
| "grad_norm": 0.09815046936273575, | |
| "learning_rate": 1.8322435174746338e-05, | |
| "loss": 0.0752, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 1.8529886914378029, | |
| "grad_norm": 0.08040884137153625, | |
| "learning_rate": 1.8317925591882752e-05, | |
| "loss": 0.06, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 1.8562197092084007, | |
| "grad_norm": 0.08228793740272522, | |
| "learning_rate": 1.8313416009019166e-05, | |
| "loss": 0.0547, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.8594507269789984, | |
| "grad_norm": 0.07945281267166138, | |
| "learning_rate": 1.830890642615558e-05, | |
| "loss": 0.0467, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 1.862681744749596, | |
| "grad_norm": 0.07659505307674408, | |
| "learning_rate": 1.8304396843291995e-05, | |
| "loss": 0.0571, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 1.865912762520194, | |
| "grad_norm": 0.07296533137559891, | |
| "learning_rate": 1.8299887260428413e-05, | |
| "loss": 0.0567, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 1.8691437802907918, | |
| "grad_norm": 0.10132135450839996, | |
| "learning_rate": 1.8295377677564827e-05, | |
| "loss": 0.0589, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 1.8723747980613892, | |
| "grad_norm": 0.0985584482550621, | |
| "learning_rate": 1.8290868094701242e-05, | |
| "loss": 0.0707, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.875605815831987, | |
| "grad_norm": 0.09575635194778442, | |
| "learning_rate": 1.8286358511837656e-05, | |
| "loss": 0.0687, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 1.878836833602585, | |
| "grad_norm": 0.10488908737897873, | |
| "learning_rate": 1.828184892897407e-05, | |
| "loss": 0.0752, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 1.8820678513731826, | |
| "grad_norm": 0.0739881619811058, | |
| "learning_rate": 1.8277339346110485e-05, | |
| "loss": 0.0537, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 1.8852988691437802, | |
| "grad_norm": 0.06086435914039612, | |
| "learning_rate": 1.8272829763246903e-05, | |
| "loss": 0.051, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 1.888529886914378, | |
| "grad_norm": 0.09947849065065384, | |
| "learning_rate": 1.8268320180383317e-05, | |
| "loss": 0.0705, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.891760904684976, | |
| "grad_norm": 0.09509633481502533, | |
| "learning_rate": 1.826381059751973e-05, | |
| "loss": 0.0665, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 1.8949919224555734, | |
| "grad_norm": 0.12639068067073822, | |
| "learning_rate": 1.8259301014656146e-05, | |
| "loss": 0.0538, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 1.8982229402261712, | |
| "grad_norm": 0.09957147389650345, | |
| "learning_rate": 1.825479143179256e-05, | |
| "loss": 0.0649, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 1.901453957996769, | |
| "grad_norm": 0.10096530616283417, | |
| "learning_rate": 1.8250281848928975e-05, | |
| "loss": 0.0607, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 1.9046849757673667, | |
| "grad_norm": 0.10449621081352234, | |
| "learning_rate": 1.8245772266065392e-05, | |
| "loss": 0.0742, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.9079159935379644, | |
| "grad_norm": 0.09589482843875885, | |
| "learning_rate": 1.8241262683201807e-05, | |
| "loss": 0.069, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 1.9111470113085622, | |
| "grad_norm": 0.10444579273462296, | |
| "learning_rate": 1.823675310033822e-05, | |
| "loss": 0.0726, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 1.9143780290791599, | |
| "grad_norm": 0.09007811546325684, | |
| "learning_rate": 1.8232243517474636e-05, | |
| "loss": 0.0668, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 1.9176090468497575, | |
| "grad_norm": 0.093568354845047, | |
| "learning_rate": 1.822773393461105e-05, | |
| "loss": 0.0658, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 1.9208400646203554, | |
| "grad_norm": 0.0825546383857727, | |
| "learning_rate": 1.8223224351747464e-05, | |
| "loss": 0.0586, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.9240710823909533, | |
| "grad_norm": 0.08507188409566879, | |
| "learning_rate": 1.8218714768883882e-05, | |
| "loss": 0.0617, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 1.927302100161551, | |
| "grad_norm": 0.0858079269528389, | |
| "learning_rate": 1.8214205186020297e-05, | |
| "loss": 0.0601, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 1.9305331179321485, | |
| "grad_norm": 0.1205410435795784, | |
| "learning_rate": 1.820969560315671e-05, | |
| "loss": 0.0883, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 1.9337641357027464, | |
| "grad_norm": 0.1289929449558258, | |
| "learning_rate": 1.8205186020293125e-05, | |
| "loss": 0.076, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 1.936995153473344, | |
| "grad_norm": 0.11139614135026932, | |
| "learning_rate": 1.820067643742954e-05, | |
| "loss": 0.0846, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.936995153473344, | |
| "eval_loss": 0.07442964613437653, | |
| "eval_runtime": 188.1343, | |
| "eval_samples_per_second": 1.047, | |
| "eval_steps_per_second": 1.047, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.9402261712439417, | |
| "grad_norm": 0.0923788920044899, | |
| "learning_rate": 1.8196166854565954e-05, | |
| "loss": 0.0605, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 1.9434571890145396, | |
| "grad_norm": 0.1251497119665146, | |
| "learning_rate": 1.819165727170237e-05, | |
| "loss": 0.0894, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 1.9466882067851374, | |
| "grad_norm": 0.11357556283473969, | |
| "learning_rate": 1.8187147688838783e-05, | |
| "loss": 0.0819, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 1.949919224555735, | |
| "grad_norm": 0.09567239135503769, | |
| "learning_rate": 1.8182638105975197e-05, | |
| "loss": 0.0624, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 1.9531502423263327, | |
| "grad_norm": 0.09191922098398209, | |
| "learning_rate": 1.8178128523111612e-05, | |
| "loss": 0.0578, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.9563812600969306, | |
| "grad_norm": 0.09503104537725449, | |
| "learning_rate": 1.817361894024803e-05, | |
| "loss": 0.0641, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 1.9596122778675282, | |
| "grad_norm": 0.11036618053913116, | |
| "learning_rate": 1.8169109357384444e-05, | |
| "loss": 0.0666, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 1.9628432956381259, | |
| "grad_norm": 0.09104762971401215, | |
| "learning_rate": 1.8164599774520858e-05, | |
| "loss": 0.0649, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 1.9660743134087237, | |
| "grad_norm": 0.10882871598005295, | |
| "learning_rate": 1.8160090191657273e-05, | |
| "loss": 0.0824, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 1.9693053311793216, | |
| "grad_norm": 0.09632111340761185, | |
| "learning_rate": 1.8155580608793687e-05, | |
| "loss": 0.069, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.9725363489499192, | |
| "grad_norm": 0.07966237515211105, | |
| "learning_rate": 1.81510710259301e-05, | |
| "loss": 0.0557, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 1.975767366720517, | |
| "grad_norm": 0.10335849225521088, | |
| "learning_rate": 1.8146561443066516e-05, | |
| "loss": 0.0787, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 1.9789983844911148, | |
| "grad_norm": 0.12096443772315979, | |
| "learning_rate": 1.814205186020293e-05, | |
| "loss": 0.0731, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 1.9822294022617124, | |
| "grad_norm": 0.09432677179574966, | |
| "learning_rate": 1.8137542277339348e-05, | |
| "loss": 0.0635, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 1.98546042003231, | |
| "grad_norm": 0.11708611994981766, | |
| "learning_rate": 1.8133032694475762e-05, | |
| "loss": 0.0874, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.988691437802908, | |
| "grad_norm": 0.1113506332039833, | |
| "learning_rate": 1.8128523111612177e-05, | |
| "loss": 0.0678, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 1.9919224555735058, | |
| "grad_norm": 0.09246299415826797, | |
| "learning_rate": 1.812401352874859e-05, | |
| "loss": 0.0672, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 1.9951534733441034, | |
| "grad_norm": 0.1115182563662529, | |
| "learning_rate": 1.8119503945885006e-05, | |
| "loss": 0.0758, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 1.998384491114701, | |
| "grad_norm": 0.10478319972753525, | |
| "learning_rate": 1.811499436302142e-05, | |
| "loss": 0.0648, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.13386856019496918, | |
| "learning_rate": 1.8110484780157838e-05, | |
| "loss": 0.0624, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.003231017770598, | |
| "grad_norm": 0.09262198954820633, | |
| "learning_rate": 1.8105975197294252e-05, | |
| "loss": 0.0608, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 2.0064620355411953, | |
| "grad_norm": 0.07091473788022995, | |
| "learning_rate": 1.8101465614430667e-05, | |
| "loss": 0.0434, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 2.009693053311793, | |
| "grad_norm": 0.10324624925851822, | |
| "learning_rate": 1.809695603156708e-05, | |
| "loss": 0.0764, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 2.012924071082391, | |
| "grad_norm": 0.10515467822551727, | |
| "learning_rate": 1.8092446448703495e-05, | |
| "loss": 0.0609, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 2.016155088852989, | |
| "grad_norm": 0.10561127960681915, | |
| "learning_rate": 1.8087936865839913e-05, | |
| "loss": 0.0719, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 2.0193861066235863, | |
| "grad_norm": 0.1146024838089943, | |
| "learning_rate": 1.8083427282976327e-05, | |
| "loss": 0.0726, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 2.022617124394184, | |
| "grad_norm": 0.07930684089660645, | |
| "learning_rate": 1.8078917700112742e-05, | |
| "loss": 0.0481, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 2.025848142164782, | |
| "grad_norm": 0.09927454590797424, | |
| "learning_rate": 1.8074408117249156e-05, | |
| "loss": 0.0608, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 2.0290791599353795, | |
| "grad_norm": 0.08592136949300766, | |
| "learning_rate": 1.806989853438557e-05, | |
| "loss": 0.0577, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 2.0323101777059773, | |
| "grad_norm": 0.09232696890830994, | |
| "learning_rate": 1.8065388951521985e-05, | |
| "loss": 0.0528, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.035541195476575, | |
| "grad_norm": 0.08930118381977081, | |
| "learning_rate": 1.8060879368658403e-05, | |
| "loss": 0.0633, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 2.038772213247173, | |
| "grad_norm": 0.09835111349821091, | |
| "learning_rate": 1.8056369785794817e-05, | |
| "loss": 0.0648, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 2.0420032310177705, | |
| "grad_norm": 0.10789217799901962, | |
| "learning_rate": 1.805186020293123e-05, | |
| "loss": 0.0695, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 2.0452342487883683, | |
| "grad_norm": 0.10506349802017212, | |
| "learning_rate": 1.8047350620067646e-05, | |
| "loss": 0.0635, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 2.048465266558966, | |
| "grad_norm": 0.13068322837352753, | |
| "learning_rate": 1.804284103720406e-05, | |
| "loss": 0.0802, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 2.0516962843295636, | |
| "grad_norm": 0.09663469344377518, | |
| "learning_rate": 1.8038331454340475e-05, | |
| "loss": 0.062, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 2.0549273021001615, | |
| "grad_norm": 0.07054325938224792, | |
| "learning_rate": 1.8033821871476893e-05, | |
| "loss": 0.0453, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 2.0581583198707594, | |
| "grad_norm": 0.07739470899105072, | |
| "learning_rate": 1.8029312288613307e-05, | |
| "loss": 0.0463, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 2.0613893376413572, | |
| "grad_norm": 0.09022580832242966, | |
| "learning_rate": 1.802480270574972e-05, | |
| "loss": 0.0587, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 2.0646203554119547, | |
| "grad_norm": 0.09953221678733826, | |
| "learning_rate": 1.8020293122886136e-05, | |
| "loss": 0.0585, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.0678513731825525, | |
| "grad_norm": 0.10069511830806732, | |
| "learning_rate": 1.801578354002255e-05, | |
| "loss": 0.0587, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 2.0710823909531504, | |
| "grad_norm": 0.09177737683057785, | |
| "learning_rate": 1.8011273957158964e-05, | |
| "loss": 0.0523, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 2.074313408723748, | |
| "grad_norm": 0.1010020524263382, | |
| "learning_rate": 1.800676437429538e-05, | |
| "loss": 0.0552, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 2.0775444264943457, | |
| "grad_norm": 0.0997423455119133, | |
| "learning_rate": 1.8002254791431793e-05, | |
| "loss": 0.0662, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 2.0807754442649435, | |
| "grad_norm": 0.09295801818370819, | |
| "learning_rate": 1.7997745208568208e-05, | |
| "loss": 0.0599, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 2.0840064620355414, | |
| "grad_norm": 0.1053297147154808, | |
| "learning_rate": 1.7993235625704622e-05, | |
| "loss": 0.064, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 2.087237479806139, | |
| "grad_norm": 0.11978495121002197, | |
| "learning_rate": 1.798872604284104e-05, | |
| "loss": 0.0727, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 2.0904684975767367, | |
| "grad_norm": 0.07878235727548599, | |
| "learning_rate": 1.7984216459977454e-05, | |
| "loss": 0.0486, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 2.0936995153473346, | |
| "grad_norm": 0.14993903040885925, | |
| "learning_rate": 1.797970687711387e-05, | |
| "loss": 0.0458, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 2.096930533117932, | |
| "grad_norm": 0.0925765186548233, | |
| "learning_rate": 1.7975197294250283e-05, | |
| "loss": 0.0546, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.10016155088853, | |
| "grad_norm": 0.09530377388000488, | |
| "learning_rate": 1.7970687711386697e-05, | |
| "loss": 0.0514, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 2.1033925686591277, | |
| "grad_norm": 0.0945788025856018, | |
| "learning_rate": 1.7966178128523112e-05, | |
| "loss": 0.0604, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 2.106623586429725, | |
| "grad_norm": 0.11486334353685379, | |
| "learning_rate": 1.7961668545659526e-05, | |
| "loss": 0.0633, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 2.109854604200323, | |
| "grad_norm": 0.1077791377902031, | |
| "learning_rate": 1.795715896279594e-05, | |
| "loss": 0.0602, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 2.113085621970921, | |
| "grad_norm": 0.10789015889167786, | |
| "learning_rate": 1.795264937993236e-05, | |
| "loss": 0.0541, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 2.1163166397415187, | |
| "grad_norm": 0.10327862948179245, | |
| "learning_rate": 1.7948139797068773e-05, | |
| "loss": 0.0652, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 2.119547657512116, | |
| "grad_norm": 0.10486488789319992, | |
| "learning_rate": 1.7943630214205187e-05, | |
| "loss": 0.0617, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 2.122778675282714, | |
| "grad_norm": 0.0882355272769928, | |
| "learning_rate": 1.79391206313416e-05, | |
| "loss": 0.0553, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 2.126009693053312, | |
| "grad_norm": 0.08177275210618973, | |
| "learning_rate": 1.7934611048478016e-05, | |
| "loss": 0.0522, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 2.1292407108239093, | |
| "grad_norm": 0.1455976665019989, | |
| "learning_rate": 1.793010146561443e-05, | |
| "loss": 0.0909, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.132471728594507, | |
| "grad_norm": 0.1145886555314064, | |
| "learning_rate": 1.7925591882750848e-05, | |
| "loss": 0.0733, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 2.135702746365105, | |
| "grad_norm": 0.1092807874083519, | |
| "learning_rate": 1.7921082299887262e-05, | |
| "loss": 0.0581, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 2.138933764135703, | |
| "grad_norm": 0.07647505402565002, | |
| "learning_rate": 1.7916572717023677e-05, | |
| "loss": 0.0461, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 2.1421647819063003, | |
| "grad_norm": 0.09198980778455734, | |
| "learning_rate": 1.791206313416009e-05, | |
| "loss": 0.0549, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 2.145395799676898, | |
| "grad_norm": 0.10971511900424957, | |
| "learning_rate": 1.7907553551296506e-05, | |
| "loss": 0.0643, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 2.148626817447496, | |
| "grad_norm": 0.11374619603157043, | |
| "learning_rate": 1.790304396843292e-05, | |
| "loss": 0.0754, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 2.1518578352180935, | |
| "grad_norm": 0.09252484142780304, | |
| "learning_rate": 1.7898534385569338e-05, | |
| "loss": 0.0502, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 2.1550888529886914, | |
| "grad_norm": 0.09586004912853241, | |
| "learning_rate": 1.7894024802705752e-05, | |
| "loss": 0.0529, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 2.158319870759289, | |
| "grad_norm": 0.10206209868192673, | |
| "learning_rate": 1.7889515219842167e-05, | |
| "loss": 0.0605, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 2.161550888529887, | |
| "grad_norm": 0.15015992522239685, | |
| "learning_rate": 1.788500563697858e-05, | |
| "loss": 0.0926, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.1647819063004845, | |
| "grad_norm": 0.10588102042675018, | |
| "learning_rate": 1.7880496054114995e-05, | |
| "loss": 0.0623, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 2.1680129240710824, | |
| "grad_norm": 0.09418896585702896, | |
| "learning_rate": 1.787598647125141e-05, | |
| "loss": 0.0564, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 2.1712439418416802, | |
| "grad_norm": 0.08213125914335251, | |
| "learning_rate": 1.7871476888387828e-05, | |
| "loss": 0.0464, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 2.1744749596122777, | |
| "grad_norm": 0.09321248531341553, | |
| "learning_rate": 1.7866967305524242e-05, | |
| "loss": 0.0531, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 2.1777059773828755, | |
| "grad_norm": 0.10642002522945404, | |
| "learning_rate": 1.7862457722660656e-05, | |
| "loss": 0.0669, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 2.1777059773828755, | |
| "eval_loss": 0.0747215747833252, | |
| "eval_runtime": 188.1708, | |
| "eval_samples_per_second": 1.047, | |
| "eval_steps_per_second": 1.047, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 2.1809369951534734, | |
| "grad_norm": 0.12946507334709167, | |
| "learning_rate": 1.785794813979707e-05, | |
| "loss": 0.072, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 2.1841680129240713, | |
| "grad_norm": 0.10074260830879211, | |
| "learning_rate": 1.7853438556933485e-05, | |
| "loss": 0.0521, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 2.1873990306946687, | |
| "grad_norm": 0.12798738479614258, | |
| "learning_rate": 1.78489289740699e-05, | |
| "loss": 0.0758, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 2.1906300484652665, | |
| "grad_norm": 0.10193175077438354, | |
| "learning_rate": 1.7844419391206317e-05, | |
| "loss": 0.0632, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 2.1938610662358644, | |
| "grad_norm": 0.12570485472679138, | |
| "learning_rate": 1.783990980834273e-05, | |
| "loss": 0.0724, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.197092084006462, | |
| "grad_norm": 0.10074017196893692, | |
| "learning_rate": 1.7835400225479146e-05, | |
| "loss": 0.0563, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 2.2003231017770597, | |
| "grad_norm": 0.08727949112653732, | |
| "learning_rate": 1.783089064261556e-05, | |
| "loss": 0.0524, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 2.2035541195476576, | |
| "grad_norm": 0.11030570417642593, | |
| "learning_rate": 1.7826381059751975e-05, | |
| "loss": 0.0668, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 2.2067851373182554, | |
| "grad_norm": 0.10606499761343002, | |
| "learning_rate": 1.782187147688839e-05, | |
| "loss": 0.0606, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 2.210016155088853, | |
| "grad_norm": 0.11735937744379044, | |
| "learning_rate": 1.7817361894024804e-05, | |
| "loss": 0.0648, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 2.2132471728594507, | |
| "grad_norm": 0.106626495718956, | |
| "learning_rate": 1.7812852311161218e-05, | |
| "loss": 0.0649, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 2.2164781906300486, | |
| "grad_norm": 0.12231657654047012, | |
| "learning_rate": 1.7808342728297632e-05, | |
| "loss": 0.0702, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 2.219709208400646, | |
| "grad_norm": 0.08800094574689865, | |
| "learning_rate": 1.780383314543405e-05, | |
| "loss": 0.0515, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 2.222940226171244, | |
| "grad_norm": 0.08806774020195007, | |
| "learning_rate": 1.7799323562570465e-05, | |
| "loss": 0.0493, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 2.2261712439418417, | |
| "grad_norm": 0.10804681479930878, | |
| "learning_rate": 1.779481397970688e-05, | |
| "loss": 0.0604, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.2294022617124396, | |
| "grad_norm": 0.11405564099550247, | |
| "learning_rate": 1.7790304396843293e-05, | |
| "loss": 0.0597, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 2.232633279483037, | |
| "grad_norm": 0.11010053753852844, | |
| "learning_rate": 1.7785794813979708e-05, | |
| "loss": 0.0634, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 2.235864297253635, | |
| "grad_norm": 0.10657312721014023, | |
| "learning_rate": 1.7781285231116122e-05, | |
| "loss": 0.0539, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 2.2390953150242328, | |
| "grad_norm": 0.08584710210561752, | |
| "learning_rate": 1.7776775648252536e-05, | |
| "loss": 0.0571, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 2.24232633279483, | |
| "grad_norm": 0.10155533254146576, | |
| "learning_rate": 1.777226606538895e-05, | |
| "loss": 0.0597, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 2.245557350565428, | |
| "grad_norm": 0.11395770311355591, | |
| "learning_rate": 1.7767756482525365e-05, | |
| "loss": 0.0675, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 2.248788368336026, | |
| "grad_norm": 0.11109079420566559, | |
| "learning_rate": 1.7763246899661783e-05, | |
| "loss": 0.062, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 2.2520193861066238, | |
| "grad_norm": 0.13479241728782654, | |
| "learning_rate": 1.7758737316798197e-05, | |
| "loss": 0.0782, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 2.255250403877221, | |
| "grad_norm": 0.12003345042467117, | |
| "learning_rate": 1.7754227733934612e-05, | |
| "loss": 0.0683, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 2.258481421647819, | |
| "grad_norm": 0.13395312428474426, | |
| "learning_rate": 1.7749718151071026e-05, | |
| "loss": 0.0764, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.261712439418417, | |
| "grad_norm": 0.10561169683933258, | |
| "learning_rate": 1.774520856820744e-05, | |
| "loss": 0.0552, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 2.2649434571890144, | |
| "grad_norm": 0.1412249207496643, | |
| "learning_rate": 1.7740698985343855e-05, | |
| "loss": 0.0812, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 2.268174474959612, | |
| "grad_norm": 0.11307451128959656, | |
| "learning_rate": 1.7736189402480273e-05, | |
| "loss": 0.0671, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 2.27140549273021, | |
| "grad_norm": 0.10989584773778915, | |
| "learning_rate": 1.7731679819616687e-05, | |
| "loss": 0.059, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 2.274636510500808, | |
| "grad_norm": 0.0964912474155426, | |
| "learning_rate": 1.77271702367531e-05, | |
| "loss": 0.0515, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 2.2778675282714054, | |
| "grad_norm": 0.09640849381685257, | |
| "learning_rate": 1.7722660653889516e-05, | |
| "loss": 0.0548, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 2.2810985460420032, | |
| "grad_norm": 0.08393755555152893, | |
| "learning_rate": 1.771815107102593e-05, | |
| "loss": 0.0477, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 2.284329563812601, | |
| "grad_norm": 0.08865144103765488, | |
| "learning_rate": 1.7713641488162348e-05, | |
| "loss": 0.0527, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 2.2875605815831985, | |
| "grad_norm": 0.10840681195259094, | |
| "learning_rate": 1.7709131905298762e-05, | |
| "loss": 0.061, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 2.2907915993537964, | |
| "grad_norm": 0.1336364448070526, | |
| "learning_rate": 1.7704622322435177e-05, | |
| "loss": 0.0655, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.2940226171243943, | |
| "grad_norm": 0.10070191323757172, | |
| "learning_rate": 1.770011273957159e-05, | |
| "loss": 0.0599, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 2.297253634894992, | |
| "grad_norm": 0.12378398329019547, | |
| "learning_rate": 1.7695603156708006e-05, | |
| "loss": 0.0712, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 2.3004846526655895, | |
| "grad_norm": 0.08809908479452133, | |
| "learning_rate": 1.769109357384442e-05, | |
| "loss": 0.0419, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 2.3037156704361874, | |
| "grad_norm": 0.10536797344684601, | |
| "learning_rate": 1.7686583990980838e-05, | |
| "loss": 0.062, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 2.3069466882067853, | |
| "grad_norm": 0.09373629838228226, | |
| "learning_rate": 1.7682074408117252e-05, | |
| "loss": 0.0455, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 2.3101777059773827, | |
| "grad_norm": 0.08422086387872696, | |
| "learning_rate": 1.7677564825253667e-05, | |
| "loss": 0.0481, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 2.3134087237479806, | |
| "grad_norm": 0.12226711213588715, | |
| "learning_rate": 1.767305524239008e-05, | |
| "loss": 0.0713, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 2.3166397415185784, | |
| "grad_norm": 0.11593876034021378, | |
| "learning_rate": 1.7668545659526495e-05, | |
| "loss": 0.065, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 2.3198707592891763, | |
| "grad_norm": 0.10055369138717651, | |
| "learning_rate": 1.766403607666291e-05, | |
| "loss": 0.0515, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 2.3231017770597737, | |
| "grad_norm": 0.1200050637125969, | |
| "learning_rate": 1.7659526493799328e-05, | |
| "loss": 0.0664, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.3263327948303716, | |
| "grad_norm": 0.10001233220100403, | |
| "learning_rate": 1.7655016910935742e-05, | |
| "loss": 0.0578, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 2.3295638126009695, | |
| "grad_norm": 0.08621415495872498, | |
| "learning_rate": 1.7650507328072156e-05, | |
| "loss": 0.0426, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 2.332794830371567, | |
| "grad_norm": 0.08662088960409164, | |
| "learning_rate": 1.764599774520857e-05, | |
| "loss": 0.048, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 2.3360258481421647, | |
| "grad_norm": 0.09761569648981094, | |
| "learning_rate": 1.7641488162344985e-05, | |
| "loss": 0.0585, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 2.3392568659127626, | |
| "grad_norm": 0.1272287666797638, | |
| "learning_rate": 1.76369785794814e-05, | |
| "loss": 0.0722, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 2.3424878836833605, | |
| "grad_norm": 0.0827430784702301, | |
| "learning_rate": 1.7632468996617814e-05, | |
| "loss": 0.0517, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 2.345718901453958, | |
| "grad_norm": 0.08261015266180038, | |
| "learning_rate": 1.7627959413754228e-05, | |
| "loss": 0.0464, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 2.3489499192245558, | |
| "grad_norm": 0.10019004344940186, | |
| "learning_rate": 1.7623449830890643e-05, | |
| "loss": 0.0539, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 2.3521809369951536, | |
| "grad_norm": 0.11860109120607376, | |
| "learning_rate": 1.761894024802706e-05, | |
| "loss": 0.0604, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 2.355411954765751, | |
| "grad_norm": 0.13498210906982422, | |
| "learning_rate": 1.7614430665163475e-05, | |
| "loss": 0.0497, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.358642972536349, | |
| "grad_norm": 0.09867555648088455, | |
| "learning_rate": 1.760992108229989e-05, | |
| "loss": 0.0572, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 2.361873990306947, | |
| "grad_norm": 0.10520780086517334, | |
| "learning_rate": 1.7605411499436304e-05, | |
| "loss": 0.0613, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 2.3651050080775446, | |
| "grad_norm": 0.1396726369857788, | |
| "learning_rate": 1.7600901916572718e-05, | |
| "loss": 0.0808, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 2.368336025848142, | |
| "grad_norm": 0.09852424263954163, | |
| "learning_rate": 1.7596392333709132e-05, | |
| "loss": 0.0602, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 2.37156704361874, | |
| "grad_norm": 0.08897744864225388, | |
| "learning_rate": 1.7591882750845547e-05, | |
| "loss": 0.0463, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 2.374798061389338, | |
| "grad_norm": 0.12664619088172913, | |
| "learning_rate": 1.758737316798196e-05, | |
| "loss": 0.0731, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 2.378029079159935, | |
| "grad_norm": 0.0975411906838417, | |
| "learning_rate": 1.7582863585118376e-05, | |
| "loss": 0.0541, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 2.381260096930533, | |
| "grad_norm": 0.10056427121162415, | |
| "learning_rate": 1.7578354002254793e-05, | |
| "loss": 0.0564, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 2.384491114701131, | |
| "grad_norm": 0.0751773938536644, | |
| "learning_rate": 1.7573844419391208e-05, | |
| "loss": 0.043, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 2.387722132471729, | |
| "grad_norm": 0.12571515142917633, | |
| "learning_rate": 1.7569334836527622e-05, | |
| "loss": 0.0715, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.3909531502423262, | |
| "grad_norm": 0.09152042865753174, | |
| "learning_rate": 1.7564825253664037e-05, | |
| "loss": 0.0513, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 2.394184168012924, | |
| "grad_norm": 0.16221173107624054, | |
| "learning_rate": 1.756031567080045e-05, | |
| "loss": 0.1003, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 2.397415185783522, | |
| "grad_norm": 0.09910274296998978, | |
| "learning_rate": 1.7555806087936865e-05, | |
| "loss": 0.0543, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 2.4006462035541194, | |
| "grad_norm": 0.10756971687078476, | |
| "learning_rate": 1.7551296505073283e-05, | |
| "loss": 0.0573, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 2.4038772213247173, | |
| "grad_norm": 0.08702822029590607, | |
| "learning_rate": 1.7546786922209697e-05, | |
| "loss": 0.047, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 2.407108239095315, | |
| "grad_norm": 0.14440016448497772, | |
| "learning_rate": 1.7542277339346112e-05, | |
| "loss": 0.0685, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 2.410339256865913, | |
| "grad_norm": 0.09141986817121506, | |
| "learning_rate": 1.7537767756482526e-05, | |
| "loss": 0.0521, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 2.4135702746365104, | |
| "grad_norm": 0.12515199184417725, | |
| "learning_rate": 1.753325817361894e-05, | |
| "loss": 0.0605, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 2.4168012924071083, | |
| "grad_norm": 0.12448819726705551, | |
| "learning_rate": 1.7528748590755355e-05, | |
| "loss": 0.0723, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 2.420032310177706, | |
| "grad_norm": 0.13118943572044373, | |
| "learning_rate": 1.7524239007891773e-05, | |
| "loss": 0.0654, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.420032310177706, | |
| "eval_loss": 0.07308099418878555, | |
| "eval_runtime": 188.3306, | |
| "eval_samples_per_second": 1.046, | |
| "eval_steps_per_second": 1.046, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.4232633279483036, | |
| "grad_norm": 0.1215345561504364, | |
| "learning_rate": 1.7519729425028187e-05, | |
| "loss": 0.0644, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 2.4264943457189014, | |
| "grad_norm": 0.12855304777622223, | |
| "learning_rate": 1.75152198421646e-05, | |
| "loss": 0.0666, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 2.4297253634894993, | |
| "grad_norm": 0.11538267880678177, | |
| "learning_rate": 1.7510710259301016e-05, | |
| "loss": 0.0545, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 2.432956381260097, | |
| "grad_norm": 0.10273373872041702, | |
| "learning_rate": 1.750620067643743e-05, | |
| "loss": 0.0594, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 2.4361873990306946, | |
| "grad_norm": 0.10953179746866226, | |
| "learning_rate": 1.7501691093573845e-05, | |
| "loss": 0.0587, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 2.4394184168012925, | |
| "grad_norm": 0.09215240180492401, | |
| "learning_rate": 1.7497181510710263e-05, | |
| "loss": 0.0501, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 2.4426494345718903, | |
| "grad_norm": 0.11669941246509552, | |
| "learning_rate": 1.7492671927846677e-05, | |
| "loss": 0.0585, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 2.4458804523424877, | |
| "grad_norm": 0.11698901653289795, | |
| "learning_rate": 1.748816234498309e-05, | |
| "loss": 0.0551, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 2.4491114701130856, | |
| "grad_norm": 0.1258348822593689, | |
| "learning_rate": 1.7483652762119506e-05, | |
| "loss": 0.0629, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 2.4523424878836835, | |
| "grad_norm": 0.12607377767562866, | |
| "learning_rate": 1.747914317925592e-05, | |
| "loss": 0.0728, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.4555735056542813, | |
| "grad_norm": 0.0982760339975357, | |
| "learning_rate": 1.7474633596392334e-05, | |
| "loss": 0.051, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 2.4588045234248788, | |
| "grad_norm": 0.15601739287376404, | |
| "learning_rate": 1.7470124013528752e-05, | |
| "loss": 0.076, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 2.4620355411954766, | |
| "grad_norm": 0.13090789318084717, | |
| "learning_rate": 1.7465614430665167e-05, | |
| "loss": 0.0739, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 2.4652665589660745, | |
| "grad_norm": 0.10627159476280212, | |
| "learning_rate": 1.746110484780158e-05, | |
| "loss": 0.0509, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 2.468497576736672, | |
| "grad_norm": 0.07108946144580841, | |
| "learning_rate": 1.7456595264937995e-05, | |
| "loss": 0.039, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 2.47172859450727, | |
| "grad_norm": 0.14733023941516876, | |
| "learning_rate": 1.745208568207441e-05, | |
| "loss": 0.0788, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 2.4749596122778676, | |
| "grad_norm": 0.10821715742349625, | |
| "learning_rate": 1.7447576099210824e-05, | |
| "loss": 0.0524, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 2.4781906300484655, | |
| "grad_norm": 0.11846361309289932, | |
| "learning_rate": 1.744306651634724e-05, | |
| "loss": 0.0579, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 2.481421647819063, | |
| "grad_norm": 0.10738200694322586, | |
| "learning_rate": 1.7438556933483653e-05, | |
| "loss": 0.0589, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 2.484652665589661, | |
| "grad_norm": 0.1159975603222847, | |
| "learning_rate": 1.743404735062007e-05, | |
| "loss": 0.0565, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.4878836833602587, | |
| "grad_norm": 0.10056610405445099, | |
| "learning_rate": 1.7429537767756485e-05, | |
| "loss": 0.0538, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 2.491114701130856, | |
| "grad_norm": 0.10329104959964752, | |
| "learning_rate": 1.74250281848929e-05, | |
| "loss": 0.0566, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 2.494345718901454, | |
| "grad_norm": 0.1422542929649353, | |
| "learning_rate": 1.7420518602029314e-05, | |
| "loss": 0.0747, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 2.497576736672052, | |
| "grad_norm": 0.12898680567741394, | |
| "learning_rate": 1.741600901916573e-05, | |
| "loss": 0.0735, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 2.5008077544426497, | |
| "grad_norm": 0.13066206872463226, | |
| "learning_rate": 1.7411499436302143e-05, | |
| "loss": 0.062, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 2.504038772213247, | |
| "grad_norm": 0.12379497289657593, | |
| "learning_rate": 1.7406989853438557e-05, | |
| "loss": 0.0631, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 2.507269789983845, | |
| "grad_norm": 0.1296347826719284, | |
| "learning_rate": 1.740248027057497e-05, | |
| "loss": 0.0634, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 2.5105008077544424, | |
| "grad_norm": 0.10818596184253693, | |
| "learning_rate": 1.7397970687711386e-05, | |
| "loss": 0.0606, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 2.5137318255250403, | |
| "grad_norm": 0.12639783322811127, | |
| "learning_rate": 1.73934611048478e-05, | |
| "loss": 0.0459, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 2.516962843295638, | |
| "grad_norm": 0.1167321428656578, | |
| "learning_rate": 1.7388951521984218e-05, | |
| "loss": 0.0667, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.520193861066236, | |
| "grad_norm": 0.10153870284557343, | |
| "learning_rate": 1.7384441939120632e-05, | |
| "loss": 0.0589, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 2.523424878836834, | |
| "grad_norm": 0.09986142814159393, | |
| "learning_rate": 1.7379932356257047e-05, | |
| "loss": 0.045, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 2.5266558966074313, | |
| "grad_norm": 0.10662157833576202, | |
| "learning_rate": 1.737542277339346e-05, | |
| "loss": 0.0586, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 2.529886914378029, | |
| "grad_norm": 0.11709077656269073, | |
| "learning_rate": 1.7370913190529876e-05, | |
| "loss": 0.0613, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 2.5331179321486266, | |
| "grad_norm": 0.13120310008525848, | |
| "learning_rate": 1.736640360766629e-05, | |
| "loss": 0.0664, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 2.5363489499192244, | |
| "grad_norm": 0.13849826157093048, | |
| "learning_rate": 1.7361894024802708e-05, | |
| "loss": 0.0673, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 2.5395799676898223, | |
| "grad_norm": 0.08833606541156769, | |
| "learning_rate": 1.7357384441939122e-05, | |
| "loss": 0.0459, | |
| "step": 787 | |
| }, | |
| { | |
| "epoch": 2.54281098546042, | |
| "grad_norm": 0.09421700984239578, | |
| "learning_rate": 1.7352874859075537e-05, | |
| "loss": 0.0481, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 2.546042003231018, | |
| "grad_norm": 0.1201411634683609, | |
| "learning_rate": 1.734836527621195e-05, | |
| "loss": 0.0608, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 2.5492730210016155, | |
| "grad_norm": 0.09896653145551682, | |
| "learning_rate": 1.7343855693348365e-05, | |
| "loss": 0.0465, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.5525040387722133, | |
| "grad_norm": 0.12088964134454727, | |
| "learning_rate": 1.7339346110484783e-05, | |
| "loss": 0.0614, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 2.5557350565428107, | |
| "grad_norm": 0.11183801293373108, | |
| "learning_rate": 1.7334836527621198e-05, | |
| "loss": 0.0545, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 2.5589660743134086, | |
| "grad_norm": 0.11126703768968582, | |
| "learning_rate": 1.7330326944757612e-05, | |
| "loss": 0.0509, | |
| "step": 793 | |
| }, | |
| { | |
| "epoch": 2.5621970920840065, | |
| "grad_norm": 0.1374976634979248, | |
| "learning_rate": 1.7325817361894026e-05, | |
| "loss": 0.0664, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 2.5654281098546043, | |
| "grad_norm": 0.16783633828163147, | |
| "learning_rate": 1.732130777903044e-05, | |
| "loss": 0.0868, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 2.568659127625202, | |
| "grad_norm": 0.11534145474433899, | |
| "learning_rate": 1.7316798196166855e-05, | |
| "loss": 0.053, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 2.5718901453957996, | |
| "grad_norm": 0.13769778609275818, | |
| "learning_rate": 1.7312288613303273e-05, | |
| "loss": 0.0756, | |
| "step": 797 | |
| }, | |
| { | |
| "epoch": 2.5751211631663975, | |
| "grad_norm": 0.09577422589063644, | |
| "learning_rate": 1.7307779030439687e-05, | |
| "loss": 0.0441, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 2.578352180936995, | |
| "grad_norm": 0.11375096440315247, | |
| "learning_rate": 1.73032694475761e-05, | |
| "loss": 0.0524, | |
| "step": 799 | |
| }, | |
| { | |
| "epoch": 2.581583198707593, | |
| "grad_norm": 0.11465324461460114, | |
| "learning_rate": 1.7298759864712516e-05, | |
| "loss": 0.0526, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.5848142164781907, | |
| "grad_norm": 0.11597500741481781, | |
| "learning_rate": 1.729425028184893e-05, | |
| "loss": 0.0591, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 2.5880452342487885, | |
| "grad_norm": 0.09817709028720856, | |
| "learning_rate": 1.7289740698985345e-05, | |
| "loss": 0.0503, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 2.5912762520193864, | |
| "grad_norm": 0.10352802276611328, | |
| "learning_rate": 1.7285231116121763e-05, | |
| "loss": 0.0461, | |
| "step": 803 | |
| }, | |
| { | |
| "epoch": 2.594507269789984, | |
| "grad_norm": 0.12035888433456421, | |
| "learning_rate": 1.7280721533258177e-05, | |
| "loss": 0.0576, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 2.5977382875605817, | |
| "grad_norm": 0.12561960518360138, | |
| "learning_rate": 1.727621195039459e-05, | |
| "loss": 0.0599, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 2.600969305331179, | |
| "grad_norm": 0.11348681896924973, | |
| "learning_rate": 1.7271702367531006e-05, | |
| "loss": 0.0503, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 2.604200323101777, | |
| "grad_norm": 0.09772437810897827, | |
| "learning_rate": 1.726719278466742e-05, | |
| "loss": 0.0471, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 2.607431340872375, | |
| "grad_norm": 0.10316241532564163, | |
| "learning_rate": 1.7262683201803835e-05, | |
| "loss": 0.0514, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 2.6106623586429727, | |
| "grad_norm": 0.11204390227794647, | |
| "learning_rate": 1.725817361894025e-05, | |
| "loss": 0.0578, | |
| "step": 809 | |
| }, | |
| { | |
| "epoch": 2.61389337641357, | |
| "grad_norm": 0.10899617522954941, | |
| "learning_rate": 1.7253664036076663e-05, | |
| "loss": 0.0495, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.617124394184168, | |
| "grad_norm": 0.1386026293039322, | |
| "learning_rate": 1.724915445321308e-05, | |
| "loss": 0.0542, | |
| "step": 811 | |
| }, | |
| { | |
| "epoch": 2.620355411954766, | |
| "grad_norm": 0.13927608728408813, | |
| "learning_rate": 1.7244644870349495e-05, | |
| "loss": 0.0628, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 2.6235864297253633, | |
| "grad_norm": 0.099461629986763, | |
| "learning_rate": 1.724013528748591e-05, | |
| "loss": 0.0501, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 2.626817447495961, | |
| "grad_norm": 0.09142296761274338, | |
| "learning_rate": 1.7235625704622324e-05, | |
| "loss": 0.0475, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 2.630048465266559, | |
| "grad_norm": 0.12531687319278717, | |
| "learning_rate": 1.723111612175874e-05, | |
| "loss": 0.059, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 2.633279483037157, | |
| "grad_norm": 0.1252615749835968, | |
| "learning_rate": 1.7226606538895153e-05, | |
| "loss": 0.0589, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 2.6365105008077543, | |
| "grad_norm": 0.12725740671157837, | |
| "learning_rate": 1.7222096956031567e-05, | |
| "loss": 0.0522, | |
| "step": 817 | |
| }, | |
| { | |
| "epoch": 2.639741518578352, | |
| "grad_norm": 0.12746059894561768, | |
| "learning_rate": 1.7217587373167982e-05, | |
| "loss": 0.0661, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 2.64297253634895, | |
| "grad_norm": 0.2133682370185852, | |
| "learning_rate": 1.7213077790304396e-05, | |
| "loss": 0.0639, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 2.6462035541195474, | |
| "grad_norm": 0.11452341079711914, | |
| "learning_rate": 1.720856820744081e-05, | |
| "loss": 0.0512, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.6494345718901453, | |
| "grad_norm": 0.12344635277986526, | |
| "learning_rate": 1.720405862457723e-05, | |
| "loss": 0.0503, | |
| "step": 821 | |
| }, | |
| { | |
| "epoch": 2.652665589660743, | |
| "grad_norm": 0.12654437124729156, | |
| "learning_rate": 1.7199549041713643e-05, | |
| "loss": 0.0543, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 2.655896607431341, | |
| "grad_norm": 0.12805619835853577, | |
| "learning_rate": 1.7195039458850057e-05, | |
| "loss": 0.0646, | |
| "step": 823 | |
| }, | |
| { | |
| "epoch": 2.6591276252019385, | |
| "grad_norm": 0.11218256503343582, | |
| "learning_rate": 1.719052987598647e-05, | |
| "loss": 0.0575, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 2.6623586429725363, | |
| "grad_norm": 0.12950399518013, | |
| "learning_rate": 1.7186020293122886e-05, | |
| "loss": 0.0673, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 2.6623586429725363, | |
| "eval_loss": 0.07266557961702347, | |
| "eval_runtime": 187.9396, | |
| "eval_samples_per_second": 1.048, | |
| "eval_steps_per_second": 1.048, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 2.665589660743134, | |
| "grad_norm": 0.11642561107873917, | |
| "learning_rate": 1.71815107102593e-05, | |
| "loss": 0.0664, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 2.6688206785137316, | |
| "grad_norm": 0.09707733243703842, | |
| "learning_rate": 1.7177001127395718e-05, | |
| "loss": 0.0503, | |
| "step": 827 | |
| }, | |
| { | |
| "epoch": 2.6720516962843295, | |
| "grad_norm": 0.07535319775342941, | |
| "learning_rate": 1.7172491544532133e-05, | |
| "loss": 0.038, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 2.6752827140549273, | |
| "grad_norm": 0.12683290243148804, | |
| "learning_rate": 1.7167981961668547e-05, | |
| "loss": 0.0614, | |
| "step": 829 | |
| }, | |
| { | |
| "epoch": 2.678513731825525, | |
| "grad_norm": 0.1531742513179779, | |
| "learning_rate": 1.716347237880496e-05, | |
| "loss": 0.069, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.6817447495961226, | |
| "grad_norm": 0.13030219078063965, | |
| "learning_rate": 1.7158962795941376e-05, | |
| "loss": 0.0593, | |
| "step": 831 | |
| }, | |
| { | |
| "epoch": 2.6849757673667205, | |
| "grad_norm": 0.13288383185863495, | |
| "learning_rate": 1.715445321307779e-05, | |
| "loss": 0.0607, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 2.6882067851373184, | |
| "grad_norm": 0.1245107427239418, | |
| "learning_rate": 1.7149943630214208e-05, | |
| "loss": 0.059, | |
| "step": 833 | |
| }, | |
| { | |
| "epoch": 2.691437802907916, | |
| "grad_norm": 0.10677826404571533, | |
| "learning_rate": 1.7145434047350622e-05, | |
| "loss": 0.0532, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 2.6946688206785137, | |
| "grad_norm": 0.11118808388710022, | |
| "learning_rate": 1.7140924464487037e-05, | |
| "loss": 0.055, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 2.6978998384491115, | |
| "grad_norm": 0.11494432389736176, | |
| "learning_rate": 1.713641488162345e-05, | |
| "loss": 0.0556, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 2.7011308562197094, | |
| "grad_norm": 0.14139457046985626, | |
| "learning_rate": 1.7131905298759865e-05, | |
| "loss": 0.0687, | |
| "step": 837 | |
| }, | |
| { | |
| "epoch": 2.704361873990307, | |
| "grad_norm": 0.12973995506763458, | |
| "learning_rate": 1.712739571589628e-05, | |
| "loss": 0.0639, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 2.7075928917609047, | |
| "grad_norm": 0.12217195332050323, | |
| "learning_rate": 1.7122886133032698e-05, | |
| "loss": 0.0636, | |
| "step": 839 | |
| }, | |
| { | |
| "epoch": 2.7108239095315025, | |
| "grad_norm": 0.08900095522403717, | |
| "learning_rate": 1.7118376550169112e-05, | |
| "loss": 0.0491, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.7140549273021, | |
| "grad_norm": 0.12005368620157242, | |
| "learning_rate": 1.7113866967305526e-05, | |
| "loss": 0.0605, | |
| "step": 841 | |
| }, | |
| { | |
| "epoch": 2.717285945072698, | |
| "grad_norm": 0.1201101690530777, | |
| "learning_rate": 1.710935738444194e-05, | |
| "loss": 0.0597, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 2.7205169628432957, | |
| "grad_norm": 0.12422793358564377, | |
| "learning_rate": 1.7104847801578355e-05, | |
| "loss": 0.0604, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 2.7237479806138936, | |
| "grad_norm": 0.11504203826189041, | |
| "learning_rate": 1.710033821871477e-05, | |
| "loss": 0.0567, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 2.726978998384491, | |
| "grad_norm": 0.13158410787582397, | |
| "learning_rate": 1.7095828635851187e-05, | |
| "loss": 0.0751, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 2.730210016155089, | |
| "grad_norm": 0.13026569783687592, | |
| "learning_rate": 1.70913190529876e-05, | |
| "loss": 0.057, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 2.7334410339256867, | |
| "grad_norm": 0.5227922201156616, | |
| "learning_rate": 1.7086809470124016e-05, | |
| "loss": 0.0567, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 2.736672051696284, | |
| "grad_norm": 0.08213207870721817, | |
| "learning_rate": 1.708229988726043e-05, | |
| "loss": 0.0385, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 2.739903069466882, | |
| "grad_norm": 0.14717501401901245, | |
| "learning_rate": 1.7077790304396845e-05, | |
| "loss": 0.0746, | |
| "step": 849 | |
| }, | |
| { | |
| "epoch": 2.74313408723748, | |
| "grad_norm": 0.11484480649232864, | |
| "learning_rate": 1.707328072153326e-05, | |
| "loss": 0.0513, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.7463651050080777, | |
| "grad_norm": 0.13454794883728027, | |
| "learning_rate": 1.7068771138669674e-05, | |
| "loss": 0.0668, | |
| "step": 851 | |
| }, | |
| { | |
| "epoch": 2.749596122778675, | |
| "grad_norm": 0.16599783301353455, | |
| "learning_rate": 1.706426155580609e-05, | |
| "loss": 0.0659, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 2.752827140549273, | |
| "grad_norm": 0.11365890502929688, | |
| "learning_rate": 1.7059751972942506e-05, | |
| "loss": 0.0591, | |
| "step": 853 | |
| }, | |
| { | |
| "epoch": 2.756058158319871, | |
| "grad_norm": 0.12964101135730743, | |
| "learning_rate": 1.705524239007892e-05, | |
| "loss": 0.0598, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 2.7592891760904683, | |
| "grad_norm": 0.10415180772542953, | |
| "learning_rate": 1.7050732807215335e-05, | |
| "loss": 0.0499, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 2.762520193861066, | |
| "grad_norm": 0.1433461755514145, | |
| "learning_rate": 1.704622322435175e-05, | |
| "loss": 0.0668, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 2.765751211631664, | |
| "grad_norm": 0.12921610474586487, | |
| "learning_rate": 1.7041713641488163e-05, | |
| "loss": 0.0525, | |
| "step": 857 | |
| }, | |
| { | |
| "epoch": 2.768982229402262, | |
| "grad_norm": 0.11878325045108795, | |
| "learning_rate": 1.7037204058624578e-05, | |
| "loss": 0.06, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 2.7722132471728593, | |
| "grad_norm": 0.09403812140226364, | |
| "learning_rate": 1.7032694475760992e-05, | |
| "loss": 0.0472, | |
| "step": 859 | |
| }, | |
| { | |
| "epoch": 2.775444264943457, | |
| "grad_norm": 0.09613174945116043, | |
| "learning_rate": 1.7028184892897407e-05, | |
| "loss": 0.0508, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 2.778675282714055, | |
| "grad_norm": 0.11891157180070877, | |
| "learning_rate": 1.702367531003382e-05, | |
| "loss": 0.0589, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 2.7819063004846525, | |
| "grad_norm": 0.1563875377178192, | |
| "learning_rate": 1.701916572717024e-05, | |
| "loss": 0.0794, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 2.7851373182552503, | |
| "grad_norm": 0.13382849097251892, | |
| "learning_rate": 1.7014656144306653e-05, | |
| "loss": 0.0683, | |
| "step": 863 | |
| }, | |
| { | |
| "epoch": 2.788368336025848, | |
| "grad_norm": 0.15156099200248718, | |
| "learning_rate": 1.7010146561443067e-05, | |
| "loss": 0.0694, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 2.791599353796446, | |
| "grad_norm": 0.12621727585792542, | |
| "learning_rate": 1.7005636978579482e-05, | |
| "loss": 0.0567, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 2.7948303715670435, | |
| "grad_norm": 0.15388263761997223, | |
| "learning_rate": 1.7001127395715896e-05, | |
| "loss": 0.0738, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 2.7980613893376414, | |
| "grad_norm": 0.1349688321352005, | |
| "learning_rate": 1.699661781285231e-05, | |
| "loss": 0.0546, | |
| "step": 867 | |
| }, | |
| { | |
| "epoch": 2.8012924071082392, | |
| "grad_norm": 0.11894084513187408, | |
| "learning_rate": 1.6992108229988725e-05, | |
| "loss": 0.0538, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 2.8045234248788367, | |
| "grad_norm": 0.21414369344711304, | |
| "learning_rate": 1.6987598647125143e-05, | |
| "loss": 0.0511, | |
| "step": 869 | |
| }, | |
| { | |
| "epoch": 2.8077544426494345, | |
| "grad_norm": 0.1107967421412468, | |
| "learning_rate": 1.6983089064261557e-05, | |
| "loss": 0.0551, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 2.8109854604200324, | |
| "grad_norm": 0.16710782051086426, | |
| "learning_rate": 1.697857948139797e-05, | |
| "loss": 0.0805, | |
| "step": 871 | |
| }, | |
| { | |
| "epoch": 2.8142164781906303, | |
| "grad_norm": 0.12345987558364868, | |
| "learning_rate": 1.6974069898534386e-05, | |
| "loss": 0.0576, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 2.8174474959612277, | |
| "grad_norm": 0.11037538200616837, | |
| "learning_rate": 1.69695603156708e-05, | |
| "loss": 0.0486, | |
| "step": 873 | |
| }, | |
| { | |
| "epoch": 2.8206785137318255, | |
| "grad_norm": 0.10175740718841553, | |
| "learning_rate": 1.6965050732807218e-05, | |
| "loss": 0.0529, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 2.8239095315024234, | |
| "grad_norm": 0.1126103326678276, | |
| "learning_rate": 1.6960541149943633e-05, | |
| "loss": 0.0546, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 2.827140549273021, | |
| "grad_norm": 0.09911254793405533, | |
| "learning_rate": 1.6956031567080047e-05, | |
| "loss": 0.0455, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 2.8303715670436187, | |
| "grad_norm": 0.1377602368593216, | |
| "learning_rate": 1.695152198421646e-05, | |
| "loss": 0.0599, | |
| "step": 877 | |
| }, | |
| { | |
| "epoch": 2.8336025848142166, | |
| "grad_norm": 0.11292906850576401, | |
| "learning_rate": 1.6947012401352876e-05, | |
| "loss": 0.053, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 2.8368336025848144, | |
| "grad_norm": 0.13102898001670837, | |
| "learning_rate": 1.694250281848929e-05, | |
| "loss": 0.0629, | |
| "step": 879 | |
| }, | |
| { | |
| "epoch": 2.840064620355412, | |
| "grad_norm": 0.12573762238025665, | |
| "learning_rate": 1.6937993235625708e-05, | |
| "loss": 0.052, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 2.8432956381260097, | |
| "grad_norm": 0.11562048643827438, | |
| "learning_rate": 1.6933483652762122e-05, | |
| "loss": 0.0594, | |
| "step": 881 | |
| }, | |
| { | |
| "epoch": 2.8465266558966076, | |
| "grad_norm": 0.15344659984111786, | |
| "learning_rate": 1.6928974069898537e-05, | |
| "loss": 0.0636, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 2.849757673667205, | |
| "grad_norm": 0.11969706416130066, | |
| "learning_rate": 1.692446448703495e-05, | |
| "loss": 0.0549, | |
| "step": 883 | |
| }, | |
| { | |
| "epoch": 2.852988691437803, | |
| "grad_norm": 0.0930706337094307, | |
| "learning_rate": 1.6919954904171365e-05, | |
| "loss": 0.0434, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 2.8562197092084007, | |
| "grad_norm": 0.1458914428949356, | |
| "learning_rate": 1.691544532130778e-05, | |
| "loss": 0.0707, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 2.8594507269789986, | |
| "grad_norm": 0.11928731948137283, | |
| "learning_rate": 1.6910935738444198e-05, | |
| "loss": 0.0584, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 2.862681744749596, | |
| "grad_norm": 0.1225530207157135, | |
| "learning_rate": 1.6906426155580612e-05, | |
| "loss": 0.0619, | |
| "step": 887 | |
| }, | |
| { | |
| "epoch": 2.865912762520194, | |
| "grad_norm": 0.13734394311904907, | |
| "learning_rate": 1.6901916572717026e-05, | |
| "loss": 0.065, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 2.8691437802907918, | |
| "grad_norm": 0.13331110775470734, | |
| "learning_rate": 1.689740698985344e-05, | |
| "loss": 0.0652, | |
| "step": 889 | |
| }, | |
| { | |
| "epoch": 2.872374798061389, | |
| "grad_norm": 0.1149471327662468, | |
| "learning_rate": 1.6892897406989855e-05, | |
| "loss": 0.0495, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 2.875605815831987, | |
| "grad_norm": 0.14575156569480896, | |
| "learning_rate": 1.688838782412627e-05, | |
| "loss": 0.069, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 2.878836833602585, | |
| "grad_norm": 0.1205376535654068, | |
| "learning_rate": 1.6883878241262684e-05, | |
| "loss": 0.0595, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 2.8820678513731828, | |
| "grad_norm": 0.13029593229293823, | |
| "learning_rate": 1.6879368658399102e-05, | |
| "loss": 0.0652, | |
| "step": 893 | |
| }, | |
| { | |
| "epoch": 2.88529886914378, | |
| "grad_norm": 0.1242680773139, | |
| "learning_rate": 1.6874859075535516e-05, | |
| "loss": 0.0628, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 2.888529886914378, | |
| "grad_norm": 0.1066925972700119, | |
| "learning_rate": 1.687034949267193e-05, | |
| "loss": 0.0571, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 2.891760904684976, | |
| "grad_norm": 0.09622512012720108, | |
| "learning_rate": 1.6865839909808345e-05, | |
| "loss": 0.0447, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 2.8949919224555734, | |
| "grad_norm": 0.14432470500469208, | |
| "learning_rate": 1.686133032694476e-05, | |
| "loss": 0.0658, | |
| "step": 897 | |
| }, | |
| { | |
| "epoch": 2.898222940226171, | |
| "grad_norm": 0.1262982338666916, | |
| "learning_rate": 1.6856820744081174e-05, | |
| "loss": 0.057, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 2.901453957996769, | |
| "grad_norm": 0.12278001755475998, | |
| "learning_rate": 1.6852311161217588e-05, | |
| "loss": 0.0533, | |
| "step": 899 | |
| }, | |
| { | |
| "epoch": 2.904684975767367, | |
| "grad_norm": 0.13526810705661774, | |
| "learning_rate": 1.6847801578354002e-05, | |
| "loss": 0.0536, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.904684975767367, | |
| "eval_loss": 0.07217078655958176, | |
| "eval_runtime": 188.4791, | |
| "eval_samples_per_second": 1.045, | |
| "eval_steps_per_second": 1.045, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.9079159935379644, | |
| "grad_norm": 0.07958260923624039, | |
| "learning_rate": 1.6843291995490417e-05, | |
| "loss": 0.0395, | |
| "step": 901 | |
| }, | |
| { | |
| "epoch": 2.9111470113085622, | |
| "grad_norm": 0.10193248093128204, | |
| "learning_rate": 1.683878241262683e-05, | |
| "loss": 0.0495, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 2.9143780290791597, | |
| "grad_norm": 0.09985180199146271, | |
| "learning_rate": 1.683427282976325e-05, | |
| "loss": 0.0496, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 2.9176090468497575, | |
| "grad_norm": 0.15160292387008667, | |
| "learning_rate": 1.6829763246899663e-05, | |
| "loss": 0.0668, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 2.9208400646203554, | |
| "grad_norm": 0.13049964606761932, | |
| "learning_rate": 1.6825253664036078e-05, | |
| "loss": 0.067, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 2.9240710823909533, | |
| "grad_norm": 0.13118034601211548, | |
| "learning_rate": 1.6820744081172492e-05, | |
| "loss": 0.06, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 2.927302100161551, | |
| "grad_norm": 0.11038261651992798, | |
| "learning_rate": 1.6816234498308907e-05, | |
| "loss": 0.0542, | |
| "step": 907 | |
| }, | |
| { | |
| "epoch": 2.9305331179321485, | |
| "grad_norm": 0.11064022779464722, | |
| "learning_rate": 1.681172491544532e-05, | |
| "loss": 0.0535, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 2.9337641357027464, | |
| "grad_norm": 0.10448973625898361, | |
| "learning_rate": 1.6807215332581735e-05, | |
| "loss": 0.0444, | |
| "step": 909 | |
| }, | |
| { | |
| "epoch": 2.936995153473344, | |
| "grad_norm": 0.09960347414016724, | |
| "learning_rate": 1.6802705749718153e-05, | |
| "loss": 0.0477, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 2.9402261712439417, | |
| "grad_norm": 0.10175690799951553, | |
| "learning_rate": 1.6798196166854568e-05, | |
| "loss": 0.0521, | |
| "step": 911 | |
| }, | |
| { | |
| "epoch": 2.9434571890145396, | |
| "grad_norm": 0.11306945234537125, | |
| "learning_rate": 1.6793686583990982e-05, | |
| "loss": 0.0483, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 2.9466882067851374, | |
| "grad_norm": 0.12629052996635437, | |
| "learning_rate": 1.6789177001127396e-05, | |
| "loss": 0.0594, | |
| "step": 913 | |
| }, | |
| { | |
| "epoch": 2.9499192245557353, | |
| "grad_norm": 0.16772274672985077, | |
| "learning_rate": 1.678466741826381e-05, | |
| "loss": 0.0808, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 2.9531502423263327, | |
| "grad_norm": 0.14857324957847595, | |
| "learning_rate": 1.6780157835400225e-05, | |
| "loss": 0.0626, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 2.9563812600969306, | |
| "grad_norm": 0.12077292054891586, | |
| "learning_rate": 1.6775648252536643e-05, | |
| "loss": 0.0547, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 2.959612277867528, | |
| "grad_norm": 0.08560369163751602, | |
| "learning_rate": 1.6771138669673057e-05, | |
| "loss": 0.0402, | |
| "step": 917 | |
| }, | |
| { | |
| "epoch": 2.962843295638126, | |
| "grad_norm": 0.1328180432319641, | |
| "learning_rate": 1.676662908680947e-05, | |
| "loss": 0.0603, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 2.9660743134087237, | |
| "grad_norm": 0.13140498101711273, | |
| "learning_rate": 1.6762119503945886e-05, | |
| "loss": 0.0541, | |
| "step": 919 | |
| }, | |
| { | |
| "epoch": 2.9693053311793216, | |
| "grad_norm": 0.13546602427959442, | |
| "learning_rate": 1.67576099210823e-05, | |
| "loss": 0.0644, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 2.9725363489499195, | |
| "grad_norm": 0.13099107146263123, | |
| "learning_rate": 1.6753100338218715e-05, | |
| "loss": 0.0544, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 2.975767366720517, | |
| "grad_norm": 0.12933450937271118, | |
| "learning_rate": 1.6748590755355133e-05, | |
| "loss": 0.0632, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 2.9789983844911148, | |
| "grad_norm": 0.12769202888011932, | |
| "learning_rate": 1.6744081172491547e-05, | |
| "loss": 0.0591, | |
| "step": 923 | |
| }, | |
| { | |
| "epoch": 2.982229402261712, | |
| "grad_norm": 0.12964068353176117, | |
| "learning_rate": 1.673957158962796e-05, | |
| "loss": 0.0602, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 2.98546042003231, | |
| "grad_norm": 0.1714252084493637, | |
| "learning_rate": 1.6735062006764376e-05, | |
| "loss": 0.076, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 2.988691437802908, | |
| "grad_norm": 0.15382935106754303, | |
| "learning_rate": 1.673055242390079e-05, | |
| "loss": 0.0624, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 2.991922455573506, | |
| "grad_norm": 0.15337888896465302, | |
| "learning_rate": 1.6726042841037205e-05, | |
| "loss": 0.0652, | |
| "step": 927 | |
| }, | |
| { | |
| "epoch": 2.9951534733441036, | |
| "grad_norm": 0.1587558090686798, | |
| "learning_rate": 1.6721533258173622e-05, | |
| "loss": 0.0754, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 2.998384491114701, | |
| "grad_norm": 0.09836894273757935, | |
| "learning_rate": 1.6717023675310037e-05, | |
| "loss": 0.046, | |
| "step": 929 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 0.20606525242328644, | |
| "learning_rate": 1.671251409244645e-05, | |
| "loss": 0.0692, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 3.003231017770598, | |
| "grad_norm": 0.12933456897735596, | |
| "learning_rate": 1.6708004509582866e-05, | |
| "loss": 0.0601, | |
| "step": 931 | |
| }, | |
| { | |
| "epoch": 3.0064620355411953, | |
| "grad_norm": 0.15069305896759033, | |
| "learning_rate": 1.670349492671928e-05, | |
| "loss": 0.0691, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 3.009693053311793, | |
| "grad_norm": 0.12518665194511414, | |
| "learning_rate": 1.6698985343855694e-05, | |
| "loss": 0.0594, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 3.012924071082391, | |
| "grad_norm": 0.13509726524353027, | |
| "learning_rate": 1.6694475760992112e-05, | |
| "loss": 0.0553, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 3.016155088852989, | |
| "grad_norm": 0.18207424879074097, | |
| "learning_rate": 1.6689966178128526e-05, | |
| "loss": 0.0691, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 3.0193861066235863, | |
| "grad_norm": 0.10155311226844788, | |
| "learning_rate": 1.668545659526494e-05, | |
| "loss": 0.0442, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 3.022617124394184, | |
| "grad_norm": 0.08462440967559814, | |
| "learning_rate": 1.6680947012401355e-05, | |
| "loss": 0.0397, | |
| "step": 937 | |
| }, | |
| { | |
| "epoch": 3.025848142164782, | |
| "grad_norm": 0.1074729785323143, | |
| "learning_rate": 1.667643742953777e-05, | |
| "loss": 0.0476, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 3.0290791599353795, | |
| "grad_norm": 0.10128747671842575, | |
| "learning_rate": 1.6671927846674184e-05, | |
| "loss": 0.044, | |
| "step": 939 | |
| }, | |
| { | |
| "epoch": 3.0323101777059773, | |
| "grad_norm": 0.10703253746032715, | |
| "learning_rate": 1.66674182638106e-05, | |
| "loss": 0.0434, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 3.035541195476575, | |
| "grad_norm": 0.16827581822872162, | |
| "learning_rate": 1.6662908680947013e-05, | |
| "loss": 0.058, | |
| "step": 941 | |
| }, | |
| { | |
| "epoch": 3.038772213247173, | |
| "grad_norm": 0.12423544377088547, | |
| "learning_rate": 1.6658399098083427e-05, | |
| "loss": 0.0529, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 3.0420032310177705, | |
| "grad_norm": 0.11421461403369904, | |
| "learning_rate": 1.665388951521984e-05, | |
| "loss": 0.0462, | |
| "step": 943 | |
| }, | |
| { | |
| "epoch": 3.0452342487883683, | |
| "grad_norm": 0.1504746377468109, | |
| "learning_rate": 1.664937993235626e-05, | |
| "loss": 0.0607, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 3.048465266558966, | |
| "grad_norm": 0.1171237900853157, | |
| "learning_rate": 1.6644870349492674e-05, | |
| "loss": 0.0488, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 3.0516962843295636, | |
| "grad_norm": 0.12751275300979614, | |
| "learning_rate": 1.6640360766629088e-05, | |
| "loss": 0.0566, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 3.0549273021001615, | |
| "grad_norm": 0.10137461870908737, | |
| "learning_rate": 1.6635851183765503e-05, | |
| "loss": 0.042, | |
| "step": 947 | |
| }, | |
| { | |
| "epoch": 3.0581583198707594, | |
| "grad_norm": 0.10805993527173996, | |
| "learning_rate": 1.6631341600901917e-05, | |
| "loss": 0.0436, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 3.0613893376413572, | |
| "grad_norm": 0.15429779887199402, | |
| "learning_rate": 1.662683201803833e-05, | |
| "loss": 0.0611, | |
| "step": 949 | |
| }, | |
| { | |
| "epoch": 3.0646203554119547, | |
| "grad_norm": 0.15192106366157532, | |
| "learning_rate": 1.6622322435174746e-05, | |
| "loss": 0.0558, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 3.0678513731825525, | |
| "grad_norm": 0.14291639626026154, | |
| "learning_rate": 1.661781285231116e-05, | |
| "loss": 0.0582, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 3.0710823909531504, | |
| "grad_norm": 0.11516083776950836, | |
| "learning_rate": 1.6613303269447578e-05, | |
| "loss": 0.0455, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 3.074313408723748, | |
| "grad_norm": 0.11716248095035553, | |
| "learning_rate": 1.6608793686583992e-05, | |
| "loss": 0.0419, | |
| "step": 953 | |
| }, | |
| { | |
| "epoch": 3.0775444264943457, | |
| "grad_norm": 0.13777975738048553, | |
| "learning_rate": 1.6604284103720407e-05, | |
| "loss": 0.0587, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 3.0807754442649435, | |
| "grad_norm": 0.15481697022914886, | |
| "learning_rate": 1.659977452085682e-05, | |
| "loss": 0.058, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 3.0840064620355414, | |
| "grad_norm": 0.11290151625871658, | |
| "learning_rate": 1.6595264937993235e-05, | |
| "loss": 0.0428, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 3.087237479806139, | |
| "grad_norm": 0.11138515174388885, | |
| "learning_rate": 1.6590755355129653e-05, | |
| "loss": 0.0445, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 3.0904684975767367, | |
| "grad_norm": 0.13892598450183868, | |
| "learning_rate": 1.6586245772266068e-05, | |
| "loss": 0.0532, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 3.0936995153473346, | |
| "grad_norm": 0.14099125564098358, | |
| "learning_rate": 1.6581736189402482e-05, | |
| "loss": 0.0532, | |
| "step": 959 | |
| }, | |
| { | |
| "epoch": 3.096930533117932, | |
| "grad_norm": 0.1620667278766632, | |
| "learning_rate": 1.6577226606538896e-05, | |
| "loss": 0.0716, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 3.10016155088853, | |
| "grad_norm": 0.1435079723596573, | |
| "learning_rate": 1.657271702367531e-05, | |
| "loss": 0.0587, | |
| "step": 961 | |
| }, | |
| { | |
| "epoch": 3.1033925686591277, | |
| "grad_norm": 0.1412099003791809, | |
| "learning_rate": 1.6568207440811725e-05, | |
| "loss": 0.0599, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 3.106623586429725, | |
| "grad_norm": 0.16996391117572784, | |
| "learning_rate": 1.6563697857948143e-05, | |
| "loss": 0.0577, | |
| "step": 963 | |
| }, | |
| { | |
| "epoch": 3.109854604200323, | |
| "grad_norm": 0.14544463157653809, | |
| "learning_rate": 1.6559188275084557e-05, | |
| "loss": 0.0595, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 3.113085621970921, | |
| "grad_norm": 0.12646666169166565, | |
| "learning_rate": 1.6554678692220972e-05, | |
| "loss": 0.0496, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 3.1163166397415187, | |
| "grad_norm": 0.16260091960430145, | |
| "learning_rate": 1.6550169109357386e-05, | |
| "loss": 0.0588, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 3.119547657512116, | |
| "grad_norm": 0.14531579613685608, | |
| "learning_rate": 1.65456595264938e-05, | |
| "loss": 0.0654, | |
| "step": 967 | |
| }, | |
| { | |
| "epoch": 3.122778675282714, | |
| "grad_norm": 0.13838277757167816, | |
| "learning_rate": 1.6541149943630215e-05, | |
| "loss": 0.058, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 3.126009693053312, | |
| "grad_norm": 0.10179346054792404, | |
| "learning_rate": 1.6536640360766633e-05, | |
| "loss": 0.0394, | |
| "step": 969 | |
| }, | |
| { | |
| "epoch": 3.1292407108239093, | |
| "grad_norm": 0.14759835600852966, | |
| "learning_rate": 1.6532130777903047e-05, | |
| "loss": 0.0616, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 3.132471728594507, | |
| "grad_norm": 0.12317320704460144, | |
| "learning_rate": 1.652762119503946e-05, | |
| "loss": 0.0457, | |
| "step": 971 | |
| }, | |
| { | |
| "epoch": 3.135702746365105, | |
| "grad_norm": 0.14770029485225677, | |
| "learning_rate": 1.6523111612175876e-05, | |
| "loss": 0.0606, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 3.138933764135703, | |
| "grad_norm": 0.14644749462604523, | |
| "learning_rate": 1.651860202931229e-05, | |
| "loss": 0.0576, | |
| "step": 973 | |
| }, | |
| { | |
| "epoch": 3.1421647819063003, | |
| "grad_norm": 0.15745016932487488, | |
| "learning_rate": 1.6514092446448705e-05, | |
| "loss": 0.0658, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 3.145395799676898, | |
| "grad_norm": 0.15281431376934052, | |
| "learning_rate": 1.6509582863585122e-05, | |
| "loss": 0.0579, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 3.145395799676898, | |
| "eval_loss": 0.07240297645330429, | |
| "eval_runtime": 188.3995, | |
| "eval_samples_per_second": 1.046, | |
| "eval_steps_per_second": 1.046, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 3.148626817447496, | |
| "grad_norm": 0.186857670545578, | |
| "learning_rate": 1.6505073280721537e-05, | |
| "loss": 0.0712, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 3.1518578352180935, | |
| "grad_norm": 0.11668923497200012, | |
| "learning_rate": 1.650056369785795e-05, | |
| "loss": 0.0432, | |
| "step": 977 | |
| }, | |
| { | |
| "epoch": 3.1550888529886914, | |
| "grad_norm": 0.1078757792711258, | |
| "learning_rate": 1.6496054114994366e-05, | |
| "loss": 0.0438, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 3.158319870759289, | |
| "grad_norm": 0.10889827460050583, | |
| "learning_rate": 1.649154453213078e-05, | |
| "loss": 0.0404, | |
| "step": 979 | |
| }, | |
| { | |
| "epoch": 3.161550888529887, | |
| "grad_norm": 0.11770477145910263, | |
| "learning_rate": 1.6487034949267194e-05, | |
| "loss": 0.0379, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 3.1647819063004845, | |
| "grad_norm": 0.1730085015296936, | |
| "learning_rate": 1.648252536640361e-05, | |
| "loss": 0.0527, | |
| "step": 981 | |
| }, | |
| { | |
| "epoch": 3.1680129240710824, | |
| "grad_norm": 0.17005400359630585, | |
| "learning_rate": 1.6478015783540023e-05, | |
| "loss": 0.0608, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 3.1712439418416802, | |
| "grad_norm": 0.15987516939640045, | |
| "learning_rate": 1.6473506200676438e-05, | |
| "loss": 0.0657, | |
| "step": 983 | |
| }, | |
| { | |
| "epoch": 3.1744749596122777, | |
| "grad_norm": 0.12467172741889954, | |
| "learning_rate": 1.6468996617812852e-05, | |
| "loss": 0.0532, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 3.1777059773828755, | |
| "grad_norm": 0.13436008989810944, | |
| "learning_rate": 1.646448703494927e-05, | |
| "loss": 0.0464, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 3.1809369951534734, | |
| "grad_norm": 0.15260566771030426, | |
| "learning_rate": 1.6459977452085684e-05, | |
| "loss": 0.0585, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 3.1841680129240713, | |
| "grad_norm": 0.1228412613272667, | |
| "learning_rate": 1.64554678692221e-05, | |
| "loss": 0.042, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 3.1873990306946687, | |
| "grad_norm": 0.1621600091457367, | |
| "learning_rate": 1.6450958286358513e-05, | |
| "loss": 0.0705, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 3.1906300484652665, | |
| "grad_norm": 0.14798057079315186, | |
| "learning_rate": 1.6446448703494927e-05, | |
| "loss": 0.0536, | |
| "step": 989 | |
| }, | |
| { | |
| "epoch": 3.1938610662358644, | |
| "grad_norm": 0.17002591490745544, | |
| "learning_rate": 1.644193912063134e-05, | |
| "loss": 0.0624, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 3.197092084006462, | |
| "grad_norm": 0.11882289499044418, | |
| "learning_rate": 1.6437429537767756e-05, | |
| "loss": 0.0386, | |
| "step": 991 | |
| }, | |
| { | |
| "epoch": 3.2003231017770597, | |
| "grad_norm": 0.177546426653862, | |
| "learning_rate": 1.643291995490417e-05, | |
| "loss": 0.0684, | |
| "step": 992 | |
| }, | |
| { | |
| "epoch": 3.2035541195476576, | |
| "grad_norm": 0.1515907645225525, | |
| "learning_rate": 1.6428410372040588e-05, | |
| "loss": 0.0588, | |
| "step": 993 | |
| }, | |
| { | |
| "epoch": 3.2067851373182554, | |
| "grad_norm": 0.13172346353530884, | |
| "learning_rate": 1.6423900789177003e-05, | |
| "loss": 0.0502, | |
| "step": 994 | |
| }, | |
| { | |
| "epoch": 3.210016155088853, | |
| "grad_norm": 0.1430046707391739, | |
| "learning_rate": 1.6419391206313417e-05, | |
| "loss": 0.0538, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 3.2132471728594507, | |
| "grad_norm": 0.10192380100488663, | |
| "learning_rate": 1.641488162344983e-05, | |
| "loss": 0.0414, | |
| "step": 996 | |
| }, | |
| { | |
| "epoch": 3.2164781906300486, | |
| "grad_norm": 0.12296223640441895, | |
| "learning_rate": 1.6410372040586246e-05, | |
| "loss": 0.0466, | |
| "step": 997 | |
| }, | |
| { | |
| "epoch": 3.219709208400646, | |
| "grad_norm": 0.1641893982887268, | |
| "learning_rate": 1.640586245772266e-05, | |
| "loss": 0.0606, | |
| "step": 998 | |
| }, | |
| { | |
| "epoch": 3.222940226171244, | |
| "grad_norm": 0.109470434486866, | |
| "learning_rate": 1.6401352874859078e-05, | |
| "loss": 0.0424, | |
| "step": 999 | |
| }, | |
| { | |
| "epoch": 3.2261712439418417, | |
| "grad_norm": 0.10068835318088531, | |
| "learning_rate": 1.6396843291995492e-05, | |
| "loss": 0.0364, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.2294022617124396, | |
| "grad_norm": 0.09672326594591141, | |
| "learning_rate": 1.6392333709131907e-05, | |
| "loss": 0.0403, | |
| "step": 1001 | |
| }, | |
| { | |
| "epoch": 3.232633279483037, | |
| "grad_norm": 0.21638123691082, | |
| "learning_rate": 1.638782412626832e-05, | |
| "loss": 0.0785, | |
| "step": 1002 | |
| }, | |
| { | |
| "epoch": 3.235864297253635, | |
| "grad_norm": 0.15015633404254913, | |
| "learning_rate": 1.6383314543404735e-05, | |
| "loss": 0.0513, | |
| "step": 1003 | |
| }, | |
| { | |
| "epoch": 3.2390953150242328, | |
| "grad_norm": 0.11658553779125214, | |
| "learning_rate": 1.637880496054115e-05, | |
| "loss": 0.0444, | |
| "step": 1004 | |
| }, | |
| { | |
| "epoch": 3.24232633279483, | |
| "grad_norm": 0.142287939786911, | |
| "learning_rate": 1.6374295377677568e-05, | |
| "loss": 0.0576, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 3.245557350565428, | |
| "grad_norm": 0.11885146051645279, | |
| "learning_rate": 1.6369785794813982e-05, | |
| "loss": 0.0442, | |
| "step": 1006 | |
| }, | |
| { | |
| "epoch": 3.248788368336026, | |
| "grad_norm": 0.1423695832490921, | |
| "learning_rate": 1.6365276211950396e-05, | |
| "loss": 0.0539, | |
| "step": 1007 | |
| }, | |
| { | |
| "epoch": 3.2520193861066238, | |
| "grad_norm": 0.14337588846683502, | |
| "learning_rate": 1.636076662908681e-05, | |
| "loss": 0.056, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 3.255250403877221, | |
| "grad_norm": 0.15875791013240814, | |
| "learning_rate": 1.6356257046223225e-05, | |
| "loss": 0.0672, | |
| "step": 1009 | |
| }, | |
| { | |
| "epoch": 3.258481421647819, | |
| "grad_norm": 0.1158808171749115, | |
| "learning_rate": 1.635174746335964e-05, | |
| "loss": 0.0464, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 3.261712439418417, | |
| "grad_norm": 0.12882259488105774, | |
| "learning_rate": 1.6347237880496057e-05, | |
| "loss": 0.0497, | |
| "step": 1011 | |
| }, | |
| { | |
| "epoch": 3.2649434571890144, | |
| "grad_norm": 0.15846951305866241, | |
| "learning_rate": 1.6342728297632472e-05, | |
| "loss": 0.0656, | |
| "step": 1012 | |
| }, | |
| { | |
| "epoch": 3.268174474959612, | |
| "grad_norm": 0.16542094945907593, | |
| "learning_rate": 1.6338218714768886e-05, | |
| "loss": 0.06, | |
| "step": 1013 | |
| }, | |
| { | |
| "epoch": 3.27140549273021, | |
| "grad_norm": 0.1318148821592331, | |
| "learning_rate": 1.63337091319053e-05, | |
| "loss": 0.0475, | |
| "step": 1014 | |
| }, | |
| { | |
| "epoch": 3.274636510500808, | |
| "grad_norm": 0.110074482858181, | |
| "learning_rate": 1.6329199549041715e-05, | |
| "loss": 0.0411, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 3.2778675282714054, | |
| "grad_norm": 0.14870020747184753, | |
| "learning_rate": 1.632468996617813e-05, | |
| "loss": 0.0491, | |
| "step": 1016 | |
| }, | |
| { | |
| "epoch": 3.2810985460420032, | |
| "grad_norm": 0.16925957798957825, | |
| "learning_rate": 1.6320180383314547e-05, | |
| "loss": 0.0648, | |
| "step": 1017 | |
| }, | |
| { | |
| "epoch": 3.284329563812601, | |
| "grad_norm": 0.11573273688554764, | |
| "learning_rate": 1.631567080045096e-05, | |
| "loss": 0.0457, | |
| "step": 1018 | |
| }, | |
| { | |
| "epoch": 3.2875605815831985, | |
| "grad_norm": 0.16456320881843567, | |
| "learning_rate": 1.6311161217587376e-05, | |
| "loss": 0.0608, | |
| "step": 1019 | |
| }, | |
| { | |
| "epoch": 3.2907915993537964, | |
| "grad_norm": 0.14626148343086243, | |
| "learning_rate": 1.630665163472379e-05, | |
| "loss": 0.0543, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 3.2940226171243943, | |
| "grad_norm": 0.15516629815101624, | |
| "learning_rate": 1.6302142051860205e-05, | |
| "loss": 0.0628, | |
| "step": 1021 | |
| }, | |
| { | |
| "epoch": 3.297253634894992, | |
| "grad_norm": 0.18173731863498688, | |
| "learning_rate": 1.629763246899662e-05, | |
| "loss": 0.0677, | |
| "step": 1022 | |
| }, | |
| { | |
| "epoch": 3.3004846526655895, | |
| "grad_norm": 0.146285742521286, | |
| "learning_rate": 1.6293122886133033e-05, | |
| "loss": 0.052, | |
| "step": 1023 | |
| }, | |
| { | |
| "epoch": 3.3037156704361874, | |
| "grad_norm": 0.19001832604408264, | |
| "learning_rate": 1.6288613303269448e-05, | |
| "loss": 0.0693, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 3.3069466882067853, | |
| "grad_norm": 0.09959913045167923, | |
| "learning_rate": 1.6284103720405862e-05, | |
| "loss": 0.0339, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 3.3101777059773827, | |
| "grad_norm": 0.16015265882015228, | |
| "learning_rate": 1.627959413754228e-05, | |
| "loss": 0.0615, | |
| "step": 1026 | |
| }, | |
| { | |
| "epoch": 3.3134087237479806, | |
| "grad_norm": 0.13552674651145935, | |
| "learning_rate": 1.6275084554678694e-05, | |
| "loss": 0.0475, | |
| "step": 1027 | |
| }, | |
| { | |
| "epoch": 3.3166397415185784, | |
| "grad_norm": 0.16153255105018616, | |
| "learning_rate": 1.627057497181511e-05, | |
| "loss": 0.0601, | |
| "step": 1028 | |
| }, | |
| { | |
| "epoch": 3.3198707592891763, | |
| "grad_norm": 0.14683452248573303, | |
| "learning_rate": 1.6266065388951523e-05, | |
| "loss": 0.0526, | |
| "step": 1029 | |
| }, | |
| { | |
| "epoch": 3.3231017770597737, | |
| "grad_norm": 0.11259462684392929, | |
| "learning_rate": 1.6261555806087938e-05, | |
| "loss": 0.0429, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 3.3263327948303716, | |
| "grad_norm": 0.1688949465751648, | |
| "learning_rate": 1.6257046223224352e-05, | |
| "loss": 0.0624, | |
| "step": 1031 | |
| }, | |
| { | |
| "epoch": 3.3295638126009695, | |
| "grad_norm": 0.10584679245948792, | |
| "learning_rate": 1.6252536640360766e-05, | |
| "loss": 0.0376, | |
| "step": 1032 | |
| }, | |
| { | |
| "epoch": 3.332794830371567, | |
| "grad_norm": 0.16506820917129517, | |
| "learning_rate": 1.624802705749718e-05, | |
| "loss": 0.0585, | |
| "step": 1033 | |
| }, | |
| { | |
| "epoch": 3.3360258481421647, | |
| "grad_norm": 0.11264611780643463, | |
| "learning_rate": 1.6243517474633595e-05, | |
| "loss": 0.0432, | |
| "step": 1034 | |
| }, | |
| { | |
| "epoch": 3.3392568659127626, | |
| "grad_norm": 0.17402660846710205, | |
| "learning_rate": 1.6239007891770013e-05, | |
| "loss": 0.0566, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 3.3424878836833605, | |
| "grad_norm": 0.15407973527908325, | |
| "learning_rate": 1.6234498308906427e-05, | |
| "loss": 0.0519, | |
| "step": 1036 | |
| }, | |
| { | |
| "epoch": 3.345718901453958, | |
| "grad_norm": 0.1423128843307495, | |
| "learning_rate": 1.622998872604284e-05, | |
| "loss": 0.0511, | |
| "step": 1037 | |
| }, | |
| { | |
| "epoch": 3.3489499192245558, | |
| "grad_norm": 0.11291830986738205, | |
| "learning_rate": 1.6225479143179256e-05, | |
| "loss": 0.0413, | |
| "step": 1038 | |
| }, | |
| { | |
| "epoch": 3.3521809369951536, | |
| "grad_norm": 0.17067734897136688, | |
| "learning_rate": 1.622096956031567e-05, | |
| "loss": 0.0587, | |
| "step": 1039 | |
| }, | |
| { | |
| "epoch": 3.355411954765751, | |
| "grad_norm": 0.17072725296020508, | |
| "learning_rate": 1.6216459977452088e-05, | |
| "loss": 0.0606, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 3.358642972536349, | |
| "grad_norm": 0.13390378654003143, | |
| "learning_rate": 1.6211950394588503e-05, | |
| "loss": 0.0411, | |
| "step": 1041 | |
| }, | |
| { | |
| "epoch": 3.361873990306947, | |
| "grad_norm": 0.10424879193305969, | |
| "learning_rate": 1.6207440811724917e-05, | |
| "loss": 0.0356, | |
| "step": 1042 | |
| }, | |
| { | |
| "epoch": 3.3651050080775446, | |
| "grad_norm": 0.26167649030685425, | |
| "learning_rate": 1.620293122886133e-05, | |
| "loss": 0.0569, | |
| "step": 1043 | |
| }, | |
| { | |
| "epoch": 3.368336025848142, | |
| "grad_norm": 0.1589985489845276, | |
| "learning_rate": 1.6198421645997746e-05, | |
| "loss": 0.0539, | |
| "step": 1044 | |
| }, | |
| { | |
| "epoch": 3.37156704361874, | |
| "grad_norm": 0.14946100115776062, | |
| "learning_rate": 1.619391206313416e-05, | |
| "loss": 0.0531, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 3.374798061389338, | |
| "grad_norm": 0.1740565448999405, | |
| "learning_rate": 1.6189402480270578e-05, | |
| "loss": 0.0626, | |
| "step": 1046 | |
| }, | |
| { | |
| "epoch": 3.378029079159935, | |
| "grad_norm": 0.16527672111988068, | |
| "learning_rate": 1.6184892897406992e-05, | |
| "loss": 0.061, | |
| "step": 1047 | |
| }, | |
| { | |
| "epoch": 3.381260096930533, | |
| "grad_norm": 0.11640432476997375, | |
| "learning_rate": 1.6180383314543407e-05, | |
| "loss": 0.0433, | |
| "step": 1048 | |
| }, | |
| { | |
| "epoch": 3.384491114701131, | |
| "grad_norm": 0.11857109516859055, | |
| "learning_rate": 1.617587373167982e-05, | |
| "loss": 0.0374, | |
| "step": 1049 | |
| }, | |
| { | |
| "epoch": 3.387722132471729, | |
| "grad_norm": 0.15555702149868011, | |
| "learning_rate": 1.6171364148816236e-05, | |
| "loss": 0.0505, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.387722132471729, | |
| "eval_loss": 0.07340462505817413, | |
| "eval_runtime": 188.3173, | |
| "eval_samples_per_second": 1.046, | |
| "eval_steps_per_second": 1.046, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 3.3909531502423262, | |
| "grad_norm": 0.1614767611026764, | |
| "learning_rate": 1.616685456595265e-05, | |
| "loss": 0.0616, | |
| "step": 1051 | |
| }, | |
| { | |
| "epoch": 3.394184168012924, | |
| "grad_norm": 0.13632138073444366, | |
| "learning_rate": 1.6162344983089068e-05, | |
| "loss": 0.0495, | |
| "step": 1052 | |
| }, | |
| { | |
| "epoch": 3.397415185783522, | |
| "grad_norm": 0.1372632533311844, | |
| "learning_rate": 1.6157835400225482e-05, | |
| "loss": 0.0433, | |
| "step": 1053 | |
| }, | |
| { | |
| "epoch": 3.4006462035541194, | |
| "grad_norm": 0.17442293465137482, | |
| "learning_rate": 1.6153325817361896e-05, | |
| "loss": 0.0578, | |
| "step": 1054 | |
| }, | |
| { | |
| "epoch": 3.4038772213247173, | |
| "grad_norm": 0.18292829394340515, | |
| "learning_rate": 1.614881623449831e-05, | |
| "loss": 0.0648, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 3.407108239095315, | |
| "grad_norm": 0.15629050135612488, | |
| "learning_rate": 1.6144306651634725e-05, | |
| "loss": 0.0504, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 3.410339256865913, | |
| "grad_norm": 0.1325637549161911, | |
| "learning_rate": 1.613979706877114e-05, | |
| "loss": 0.039, | |
| "step": 1057 | |
| }, | |
| { | |
| "epoch": 3.4135702746365104, | |
| "grad_norm": 0.17488038539886475, | |
| "learning_rate": 1.6135287485907557e-05, | |
| "loss": 0.0557, | |
| "step": 1058 | |
| }, | |
| { | |
| "epoch": 3.4168012924071083, | |
| "grad_norm": 0.1492920219898224, | |
| "learning_rate": 1.6130777903043972e-05, | |
| "loss": 0.0537, | |
| "step": 1059 | |
| }, | |
| { | |
| "epoch": 3.420032310177706, | |
| "grad_norm": 0.14212766289710999, | |
| "learning_rate": 1.6126268320180386e-05, | |
| "loss": 0.0422, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 3.4232633279483036, | |
| "grad_norm": 0.1526009440422058, | |
| "learning_rate": 1.61217587373168e-05, | |
| "loss": 0.0572, | |
| "step": 1061 | |
| }, | |
| { | |
| "epoch": 3.4264943457189014, | |
| "grad_norm": 0.11353015899658203, | |
| "learning_rate": 1.6117249154453215e-05, | |
| "loss": 0.0383, | |
| "step": 1062 | |
| }, | |
| { | |
| "epoch": 3.4297253634894993, | |
| "grad_norm": 0.18253804743289948, | |
| "learning_rate": 1.611273957158963e-05, | |
| "loss": 0.0657, | |
| "step": 1063 | |
| }, | |
| { | |
| "epoch": 3.432956381260097, | |
| "grad_norm": 0.1422210931777954, | |
| "learning_rate": 1.6108229988726044e-05, | |
| "loss": 0.0528, | |
| "step": 1064 | |
| }, | |
| { | |
| "epoch": 3.4361873990306946, | |
| "grad_norm": 0.15058425068855286, | |
| "learning_rate": 1.6103720405862458e-05, | |
| "loss": 0.052, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 3.4394184168012925, | |
| "grad_norm": 0.13702097535133362, | |
| "learning_rate": 1.6099210822998873e-05, | |
| "loss": 0.0478, | |
| "step": 1066 | |
| }, | |
| { | |
| "epoch": 3.4426494345718903, | |
| "grad_norm": 0.19859325885772705, | |
| "learning_rate": 1.609470124013529e-05, | |
| "loss": 0.0512, | |
| "step": 1067 | |
| }, | |
| { | |
| "epoch": 3.4458804523424877, | |
| "grad_norm": 0.1942027509212494, | |
| "learning_rate": 1.6090191657271705e-05, | |
| "loss": 0.0711, | |
| "step": 1068 | |
| }, | |
| { | |
| "epoch": 3.4491114701130856, | |
| "grad_norm": 0.1615460216999054, | |
| "learning_rate": 1.608568207440812e-05, | |
| "loss": 0.0532, | |
| "step": 1069 | |
| }, | |
| { | |
| "epoch": 3.4523424878836835, | |
| "grad_norm": 0.15197354555130005, | |
| "learning_rate": 1.6081172491544533e-05, | |
| "loss": 0.0566, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 3.4555735056542813, | |
| "grad_norm": 0.131342813372612, | |
| "learning_rate": 1.6076662908680948e-05, | |
| "loss": 0.0448, | |
| "step": 1071 | |
| }, | |
| { | |
| "epoch": 3.4588045234248788, | |
| "grad_norm": 0.1431513875722885, | |
| "learning_rate": 1.6072153325817362e-05, | |
| "loss": 0.0502, | |
| "step": 1072 | |
| }, | |
| { | |
| "epoch": 3.4620355411954766, | |
| "grad_norm": 0.1926133781671524, | |
| "learning_rate": 1.6067643742953777e-05, | |
| "loss": 0.0625, | |
| "step": 1073 | |
| }, | |
| { | |
| "epoch": 3.4652665589660745, | |
| "grad_norm": 0.13775743544101715, | |
| "learning_rate": 1.606313416009019e-05, | |
| "loss": 0.0529, | |
| "step": 1074 | |
| }, | |
| { | |
| "epoch": 3.468497576736672, | |
| "grad_norm": 0.1370486617088318, | |
| "learning_rate": 1.6058624577226605e-05, | |
| "loss": 0.0489, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 3.47172859450727, | |
| "grad_norm": 0.18667002022266388, | |
| "learning_rate": 1.6054114994363023e-05, | |
| "loss": 0.0734, | |
| "step": 1076 | |
| }, | |
| { | |
| "epoch": 3.4749596122778676, | |
| "grad_norm": 0.16826723515987396, | |
| "learning_rate": 1.6049605411499438e-05, | |
| "loss": 0.0568, | |
| "step": 1077 | |
| }, | |
| { | |
| "epoch": 3.4781906300484655, | |
| "grad_norm": 0.1706121861934662, | |
| "learning_rate": 1.6045095828635852e-05, | |
| "loss": 0.064, | |
| "step": 1078 | |
| }, | |
| { | |
| "epoch": 3.481421647819063, | |
| "grad_norm": 0.12642326951026917, | |
| "learning_rate": 1.6040586245772266e-05, | |
| "loss": 0.0441, | |
| "step": 1079 | |
| }, | |
| { | |
| "epoch": 3.484652665589661, | |
| "grad_norm": 0.14685548841953278, | |
| "learning_rate": 1.603607666290868e-05, | |
| "loss": 0.0484, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 3.4878836833602587, | |
| "grad_norm": 0.13969610631465912, | |
| "learning_rate": 1.6031567080045095e-05, | |
| "loss": 0.0467, | |
| "step": 1081 | |
| }, | |
| { | |
| "epoch": 3.491114701130856, | |
| "grad_norm": 0.18751631677150726, | |
| "learning_rate": 1.6027057497181513e-05, | |
| "loss": 0.0653, | |
| "step": 1082 | |
| }, | |
| { | |
| "epoch": 3.494345718901454, | |
| "grad_norm": 0.14187730848789215, | |
| "learning_rate": 1.6022547914317927e-05, | |
| "loss": 0.0485, | |
| "step": 1083 | |
| }, | |
| { | |
| "epoch": 3.497576736672052, | |
| "grad_norm": 0.13812421262264252, | |
| "learning_rate": 1.6018038331454342e-05, | |
| "loss": 0.0494, | |
| "step": 1084 | |
| }, | |
| { | |
| "epoch": 3.5008077544426497, | |
| "grad_norm": 0.13007357716560364, | |
| "learning_rate": 1.6013528748590756e-05, | |
| "loss": 0.0454, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 3.504038772213247, | |
| "grad_norm": 0.16555847227573395, | |
| "learning_rate": 1.600901916572717e-05, | |
| "loss": 0.0474, | |
| "step": 1086 | |
| }, | |
| { | |
| "epoch": 3.507269789983845, | |
| "grad_norm": 0.11033131182193756, | |
| "learning_rate": 1.6004509582863585e-05, | |
| "loss": 0.0367, | |
| "step": 1087 | |
| }, | |
| { | |
| "epoch": 3.5105008077544424, | |
| "grad_norm": 0.14931395649909973, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 0.0489, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 3.5137318255250403, | |
| "grad_norm": 0.16848234832286835, | |
| "learning_rate": 1.5995490417136417e-05, | |
| "loss": 0.0578, | |
| "step": 1089 | |
| }, | |
| { | |
| "epoch": 3.516962843295638, | |
| "grad_norm": 0.15877306461334229, | |
| "learning_rate": 1.599098083427283e-05, | |
| "loss": 0.0524, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 3.520193861066236, | |
| "grad_norm": 0.16530410945415497, | |
| "learning_rate": 1.5986471251409246e-05, | |
| "loss": 0.0619, | |
| "step": 1091 | |
| }, | |
| { | |
| "epoch": 3.523424878836834, | |
| "grad_norm": 0.14331963658332825, | |
| "learning_rate": 1.598196166854566e-05, | |
| "loss": 0.0486, | |
| "step": 1092 | |
| }, | |
| { | |
| "epoch": 3.5266558966074313, | |
| "grad_norm": 0.15027795732021332, | |
| "learning_rate": 1.5977452085682075e-05, | |
| "loss": 0.048, | |
| "step": 1093 | |
| }, | |
| { | |
| "epoch": 3.529886914378029, | |
| "grad_norm": 0.15376311540603638, | |
| "learning_rate": 1.5972942502818492e-05, | |
| "loss": 0.0559, | |
| "step": 1094 | |
| }, | |
| { | |
| "epoch": 3.5331179321486266, | |
| "grad_norm": 0.1315869837999344, | |
| "learning_rate": 1.5968432919954907e-05, | |
| "loss": 0.0451, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 3.5363489499192244, | |
| "grad_norm": 0.13606947660446167, | |
| "learning_rate": 1.596392333709132e-05, | |
| "loss": 0.0433, | |
| "step": 1096 | |
| }, | |
| { | |
| "epoch": 3.5395799676898223, | |
| "grad_norm": 0.20028483867645264, | |
| "learning_rate": 1.5959413754227736e-05, | |
| "loss": 0.0593, | |
| "step": 1097 | |
| }, | |
| { | |
| "epoch": 3.54281098546042, | |
| "grad_norm": 0.15004722774028778, | |
| "learning_rate": 1.595490417136415e-05, | |
| "loss": 0.0501, | |
| "step": 1098 | |
| }, | |
| { | |
| "epoch": 3.546042003231018, | |
| "grad_norm": 0.14318561553955078, | |
| "learning_rate": 1.5950394588500564e-05, | |
| "loss": 0.0463, | |
| "step": 1099 | |
| }, | |
| { | |
| "epoch": 3.5492730210016155, | |
| "grad_norm": 0.11907773464918137, | |
| "learning_rate": 1.5945885005636982e-05, | |
| "loss": 0.0367, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.5525040387722133, | |
| "grad_norm": 0.19116652011871338, | |
| "learning_rate": 1.5941375422773397e-05, | |
| "loss": 0.0741, | |
| "step": 1101 | |
| }, | |
| { | |
| "epoch": 3.5557350565428107, | |
| "grad_norm": 0.14904284477233887, | |
| "learning_rate": 1.593686583990981e-05, | |
| "loss": 0.0493, | |
| "step": 1102 | |
| }, | |
| { | |
| "epoch": 3.5589660743134086, | |
| "grad_norm": 0.134224995970726, | |
| "learning_rate": 1.5932356257046225e-05, | |
| "loss": 0.0519, | |
| "step": 1103 | |
| }, | |
| { | |
| "epoch": 3.5621970920840065, | |
| "grad_norm": 0.16584910452365875, | |
| "learning_rate": 1.592784667418264e-05, | |
| "loss": 0.0604, | |
| "step": 1104 | |
| }, | |
| { | |
| "epoch": 3.5654281098546043, | |
| "grad_norm": 0.19957157969474792, | |
| "learning_rate": 1.5923337091319054e-05, | |
| "loss": 0.0874, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 3.568659127625202, | |
| "grad_norm": 0.16732187569141388, | |
| "learning_rate": 1.591882750845547e-05, | |
| "loss": 0.0521, | |
| "step": 1106 | |
| }, | |
| { | |
| "epoch": 3.5718901453957996, | |
| "grad_norm": 0.10181103646755219, | |
| "learning_rate": 1.5914317925591883e-05, | |
| "loss": 0.0356, | |
| "step": 1107 | |
| }, | |
| { | |
| "epoch": 3.5751211631663975, | |
| "grad_norm": 0.1692725121974945, | |
| "learning_rate": 1.59098083427283e-05, | |
| "loss": 0.0546, | |
| "step": 1108 | |
| }, | |
| { | |
| "epoch": 3.578352180936995, | |
| "grad_norm": 0.15010833740234375, | |
| "learning_rate": 1.5905298759864715e-05, | |
| "loss": 0.0525, | |
| "step": 1109 | |
| }, | |
| { | |
| "epoch": 3.581583198707593, | |
| "grad_norm": 0.14599505066871643, | |
| "learning_rate": 1.590078917700113e-05, | |
| "loss": 0.0503, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 3.5848142164781907, | |
| "grad_norm": 0.1455962210893631, | |
| "learning_rate": 1.5896279594137544e-05, | |
| "loss": 0.0538, | |
| "step": 1111 | |
| }, | |
| { | |
| "epoch": 3.5880452342487885, | |
| "grad_norm": 0.16955074667930603, | |
| "learning_rate": 1.5891770011273958e-05, | |
| "loss": 0.0562, | |
| "step": 1112 | |
| }, | |
| { | |
| "epoch": 3.5912762520193864, | |
| "grad_norm": 0.12441671639680862, | |
| "learning_rate": 1.5887260428410373e-05, | |
| "loss": 0.0423, | |
| "step": 1113 | |
| }, | |
| { | |
| "epoch": 3.594507269789984, | |
| "grad_norm": 0.1585661768913269, | |
| "learning_rate": 1.5882750845546787e-05, | |
| "loss": 0.0585, | |
| "step": 1114 | |
| }, | |
| { | |
| "epoch": 3.5977382875605817, | |
| "grad_norm": 0.1167236939072609, | |
| "learning_rate": 1.58782412626832e-05, | |
| "loss": 0.0396, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 3.600969305331179, | |
| "grad_norm": 0.11918371170759201, | |
| "learning_rate": 1.5873731679819616e-05, | |
| "loss": 0.0443, | |
| "step": 1116 | |
| }, | |
| { | |
| "epoch": 3.604200323101777, | |
| "grad_norm": 0.1533348709344864, | |
| "learning_rate": 1.586922209695603e-05, | |
| "loss": 0.0554, | |
| "step": 1117 | |
| }, | |
| { | |
| "epoch": 3.607431340872375, | |
| "grad_norm": 0.14366954565048218, | |
| "learning_rate": 1.5864712514092448e-05, | |
| "loss": 0.046, | |
| "step": 1118 | |
| }, | |
| { | |
| "epoch": 3.6106623586429727, | |
| "grad_norm": 0.17497192323207855, | |
| "learning_rate": 1.5860202931228862e-05, | |
| "loss": 0.0572, | |
| "step": 1119 | |
| }, | |
| { | |
| "epoch": 3.61389337641357, | |
| "grad_norm": 0.13877834379673004, | |
| "learning_rate": 1.5855693348365277e-05, | |
| "loss": 0.0493, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 3.617124394184168, | |
| "grad_norm": 0.19728821516036987, | |
| "learning_rate": 1.585118376550169e-05, | |
| "loss": 0.0712, | |
| "step": 1121 | |
| }, | |
| { | |
| "epoch": 3.620355411954766, | |
| "grad_norm": 0.11815082281827927, | |
| "learning_rate": 1.5846674182638105e-05, | |
| "loss": 0.0431, | |
| "step": 1122 | |
| }, | |
| { | |
| "epoch": 3.6235864297253633, | |
| "grad_norm": 0.1730748862028122, | |
| "learning_rate": 1.5842164599774523e-05, | |
| "loss": 0.0604, | |
| "step": 1123 | |
| }, | |
| { | |
| "epoch": 3.626817447495961, | |
| "grad_norm": 0.10923890024423599, | |
| "learning_rate": 1.5837655016910938e-05, | |
| "loss": 0.0355, | |
| "step": 1124 | |
| }, | |
| { | |
| "epoch": 3.630048465266559, | |
| "grad_norm": 0.12295415252447128, | |
| "learning_rate": 1.5833145434047352e-05, | |
| "loss": 0.0412, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 3.630048465266559, | |
| "eval_loss": 0.0719488188624382, | |
| "eval_runtime": 188.3414, | |
| "eval_samples_per_second": 1.046, | |
| "eval_steps_per_second": 1.046, | |
| "step": 1125 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 4635, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 15, | |
| "save_steps": 75, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3264284514179097e+19, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |