{ "best_global_step": 270, "best_metric": 0.031980086117982864, "best_model_checkpoint": "saves_multiple/lora/llama-3-8b-instruct/train_copa_42_1760623607/checkpoint-270", "epoch": 20.0, "eval_steps": 90, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05555555555555555, "grad_norm": 4.979881286621094, "learning_rate": 1.1111111111111112e-06, "loss": 0.4642, "num_input_tokens_seen": 1600, "step": 5, "train_runtime": 3.0252, "train_tokens_per_second": 528.885 }, { "epoch": 0.1111111111111111, "grad_norm": 7.964423656463623, "learning_rate": 2.5e-06, "loss": 0.5471, "num_input_tokens_seen": 3200, "step": 10, "train_runtime": 4.0395, "train_tokens_per_second": 792.173 }, { "epoch": 0.16666666666666666, "grad_norm": 7.779398441314697, "learning_rate": 3.888888888888889e-06, "loss": 0.5976, "num_input_tokens_seen": 4768, "step": 15, "train_runtime": 5.0299, "train_tokens_per_second": 947.926 }, { "epoch": 0.2222222222222222, "grad_norm": 10.463203430175781, "learning_rate": 5.277777777777778e-06, "loss": 0.7557, "num_input_tokens_seen": 6336, "step": 20, "train_runtime": 6.0174, "train_tokens_per_second": 1052.951 }, { "epoch": 0.2777777777777778, "grad_norm": 8.324784278869629, "learning_rate": 6.666666666666667e-06, "loss": 0.5813, "num_input_tokens_seen": 7904, "step": 25, "train_runtime": 7.0043, "train_tokens_per_second": 1128.443 }, { "epoch": 0.3333333333333333, "grad_norm": 4.771843433380127, "learning_rate": 8.055555555555557e-06, "loss": 0.45, "num_input_tokens_seen": 9504, "step": 30, "train_runtime": 7.9907, "train_tokens_per_second": 1189.377 }, { "epoch": 0.3888888888888889, "grad_norm": 1.3658571243286133, "learning_rate": 9.444444444444445e-06, "loss": 0.1443, "num_input_tokens_seen": 11072, "step": 35, "train_runtime": 8.9725, "train_tokens_per_second": 1233.995 }, { "epoch": 0.4444444444444444, "grad_norm": 1.2481129169464111, "learning_rate": 1.0833333333333334e-05, "loss": 0.1337, "num_input_tokens_seen": 12672, "step": 40, "train_runtime": 9.9542, "train_tokens_per_second": 1273.032 }, { "epoch": 0.5, "grad_norm": 0.28093546628952026, "learning_rate": 1.2222222222222222e-05, "loss": 0.0038, "num_input_tokens_seen": 14176, "step": 45, "train_runtime": 10.931, "train_tokens_per_second": 1296.857 }, { "epoch": 0.5555555555555556, "grad_norm": 3.8658318519592285, "learning_rate": 1.3611111111111111e-05, "loss": 0.1347, "num_input_tokens_seen": 15776, "step": 50, "train_runtime": 11.9114, "train_tokens_per_second": 1324.449 }, { "epoch": 0.6111111111111112, "grad_norm": 0.2225475311279297, "learning_rate": 1.5e-05, "loss": 0.2198, "num_input_tokens_seen": 17312, "step": 55, "train_runtime": 12.8971, "train_tokens_per_second": 1342.319 }, { "epoch": 0.6666666666666666, "grad_norm": 7.42499303817749, "learning_rate": 1.638888888888889e-05, "loss": 0.1586, "num_input_tokens_seen": 18848, "step": 60, "train_runtime": 13.8816, "train_tokens_per_second": 1357.769 }, { "epoch": 0.7222222222222222, "grad_norm": 0.841469943523407, "learning_rate": 1.777777777777778e-05, "loss": 0.0848, "num_input_tokens_seen": 20448, "step": 65, "train_runtime": 14.8732, "train_tokens_per_second": 1374.823 }, { "epoch": 0.7777777777777778, "grad_norm": 0.05004585161805153, "learning_rate": 1.9166666666666667e-05, "loss": 0.0214, "num_input_tokens_seen": 22016, "step": 70, "train_runtime": 15.863, "train_tokens_per_second": 1387.886 }, { "epoch": 0.8333333333333334, "grad_norm": 6.2881693840026855, "learning_rate": 2.0555555555555555e-05, "loss": 0.1651, "num_input_tokens_seen": 23616, "step": 75, "train_runtime": 16.8625, "train_tokens_per_second": 1400.508 }, { "epoch": 0.8888888888888888, "grad_norm": 1.709317922592163, "learning_rate": 2.1944444444444445e-05, "loss": 0.1222, "num_input_tokens_seen": 25152, "step": 80, "train_runtime": 17.8403, "train_tokens_per_second": 1409.845 }, { "epoch": 0.9444444444444444, "grad_norm": 0.8996117115020752, "learning_rate": 2.3333333333333336e-05, "loss": 0.0104, "num_input_tokens_seen": 26688, "step": 85, "train_runtime": 18.8188, "train_tokens_per_second": 1418.156 }, { "epoch": 1.0, "grad_norm": 3.802898406982422, "learning_rate": 2.4722222222222223e-05, "loss": 0.0823, "num_input_tokens_seen": 28256, "step": 90, "train_runtime": 19.8711, "train_tokens_per_second": 1421.968 }, { "epoch": 1.0, "eval_loss": 0.0761384591460228, "eval_runtime": 0.6056, "eval_samples_per_second": 66.045, "eval_steps_per_second": 16.511, "num_input_tokens_seen": 28256, "step": 90 }, { "epoch": 1.0555555555555556, "grad_norm": 2.656207799911499, "learning_rate": 2.6111111111111114e-05, "loss": 0.0515, "num_input_tokens_seen": 29824, "step": 95, "train_runtime": 22.881, "train_tokens_per_second": 1303.442 }, { "epoch": 1.1111111111111112, "grad_norm": 2.034520387649536, "learning_rate": 2.7500000000000004e-05, "loss": 0.1462, "num_input_tokens_seen": 31360, "step": 100, "train_runtime": 23.8876, "train_tokens_per_second": 1312.818 }, { "epoch": 1.1666666666666667, "grad_norm": 4.51493501663208, "learning_rate": 2.8888888888888888e-05, "loss": 0.0746, "num_input_tokens_seen": 32960, "step": 105, "train_runtime": 24.89, "train_tokens_per_second": 1324.226 }, { "epoch": 1.2222222222222223, "grad_norm": 1.581369400024414, "learning_rate": 3.0277777777777776e-05, "loss": 0.132, "num_input_tokens_seen": 34464, "step": 110, "train_runtime": 25.902, "train_tokens_per_second": 1330.554 }, { "epoch": 1.2777777777777777, "grad_norm": 0.4392402172088623, "learning_rate": 3.1666666666666666e-05, "loss": 0.0438, "num_input_tokens_seen": 36032, "step": 115, "train_runtime": 26.8987, "train_tokens_per_second": 1339.545 }, { "epoch": 1.3333333333333333, "grad_norm": 0.129021555185318, "learning_rate": 3.3055555555555553e-05, "loss": 0.0689, "num_input_tokens_seen": 37600, "step": 120, "train_runtime": 28.1811, "train_tokens_per_second": 1334.226 }, { "epoch": 1.3888888888888888, "grad_norm": 1.9013866186141968, "learning_rate": 3.444444444444445e-05, "loss": 0.0283, "num_input_tokens_seen": 39168, "step": 125, "train_runtime": 29.2593, "train_tokens_per_second": 1338.65 }, { "epoch": 1.4444444444444444, "grad_norm": 2.102815628051758, "learning_rate": 3.5833333333333335e-05, "loss": 0.1178, "num_input_tokens_seen": 40736, "step": 130, "train_runtime": 30.2678, "train_tokens_per_second": 1345.851 }, { "epoch": 1.5, "grad_norm": 4.758823394775391, "learning_rate": 3.722222222222222e-05, "loss": 0.0915, "num_input_tokens_seen": 42240, "step": 135, "train_runtime": 31.2777, "train_tokens_per_second": 1350.485 }, { "epoch": 1.5555555555555556, "grad_norm": 2.1376194953918457, "learning_rate": 3.8611111111111116e-05, "loss": 0.0715, "num_input_tokens_seen": 43840, "step": 140, "train_runtime": 32.2733, "train_tokens_per_second": 1358.399 }, { "epoch": 1.6111111111111112, "grad_norm": 0.31904762983322144, "learning_rate": 4e-05, "loss": 0.0238, "num_input_tokens_seen": 45408, "step": 145, "train_runtime": 33.2683, "train_tokens_per_second": 1364.901 }, { "epoch": 1.6666666666666665, "grad_norm": 0.1338454931974411, "learning_rate": 4.138888888888889e-05, "loss": 0.1054, "num_input_tokens_seen": 46976, "step": 150, "train_runtime": 34.2698, "train_tokens_per_second": 1370.768 }, { "epoch": 1.7222222222222223, "grad_norm": 10.176454544067383, "learning_rate": 4.277777777777778e-05, "loss": 0.0814, "num_input_tokens_seen": 48512, "step": 155, "train_runtime": 35.2598, "train_tokens_per_second": 1375.843 }, { "epoch": 1.7777777777777777, "grad_norm": 0.6385492086410522, "learning_rate": 4.4166666666666665e-05, "loss": 0.0153, "num_input_tokens_seen": 50112, "step": 160, "train_runtime": 36.2473, "train_tokens_per_second": 1382.503 }, { "epoch": 1.8333333333333335, "grad_norm": 0.11571057885885239, "learning_rate": 4.555555555555556e-05, "loss": 0.0289, "num_input_tokens_seen": 51712, "step": 165, "train_runtime": 37.2398, "train_tokens_per_second": 1388.623 }, { "epoch": 1.8888888888888888, "grad_norm": 0.007955890148878098, "learning_rate": 4.6944444444444446e-05, "loss": 0.1408, "num_input_tokens_seen": 53280, "step": 170, "train_runtime": 38.2292, "train_tokens_per_second": 1393.697 }, { "epoch": 1.9444444444444444, "grad_norm": 2.387082099914551, "learning_rate": 4.8333333333333334e-05, "loss": 0.0322, "num_input_tokens_seen": 54880, "step": 175, "train_runtime": 39.2168, "train_tokens_per_second": 1399.399 }, { "epoch": 2.0, "grad_norm": 0.00578233040869236, "learning_rate": 4.972222222222223e-05, "loss": 0.0632, "num_input_tokens_seen": 56480, "step": 180, "train_runtime": 40.2337, "train_tokens_per_second": 1403.799 }, { "epoch": 2.0, "eval_loss": 0.04248045012354851, "eval_runtime": 0.5977, "eval_samples_per_second": 66.92, "eval_steps_per_second": 16.73, "num_input_tokens_seen": 56480, "step": 180 }, { "epoch": 2.0555555555555554, "grad_norm": 0.011782662943005562, "learning_rate": 4.9999247861994194e-05, "loss": 0.0047, "num_input_tokens_seen": 58048, "step": 185, "train_runtime": 43.8286, "train_tokens_per_second": 1324.432 }, { "epoch": 2.111111111111111, "grad_norm": 0.003803877392783761, "learning_rate": 4.9996192378909786e-05, "loss": 0.0001, "num_input_tokens_seen": 59584, "step": 190, "train_runtime": 44.8302, "train_tokens_per_second": 1329.105 }, { "epoch": 2.1666666666666665, "grad_norm": 1.5047531127929688, "learning_rate": 4.999078682916774e-05, "loss": 0.003, "num_input_tokens_seen": 61216, "step": 195, "train_runtime": 45.8336, "train_tokens_per_second": 1335.615 }, { "epoch": 2.2222222222222223, "grad_norm": 0.005229149479418993, "learning_rate": 4.998303172098155e-05, "loss": 0.0324, "num_input_tokens_seen": 62784, "step": 200, "train_runtime": 46.8344, "train_tokens_per_second": 1340.552 }, { "epoch": 2.2777777777777777, "grad_norm": 0.006047505419701338, "learning_rate": 4.997292778346312e-05, "loss": 0.0052, "num_input_tokens_seen": 64352, "step": 205, "train_runtime": 48.0207, "train_tokens_per_second": 1340.09 }, { "epoch": 2.3333333333333335, "grad_norm": 0.0012461725855246186, "learning_rate": 4.996047596655418e-05, "loss": 0.145, "num_input_tokens_seen": 65952, "step": 210, "train_runtime": 49.188, "train_tokens_per_second": 1340.816 }, { "epoch": 2.388888888888889, "grad_norm": 0.10850047320127487, "learning_rate": 4.994567744093703e-05, "loss": 0.0034, "num_input_tokens_seen": 67552, "step": 215, "train_runtime": 50.1911, "train_tokens_per_second": 1345.896 }, { "epoch": 2.4444444444444446, "grad_norm": 0.23551766574382782, "learning_rate": 4.992853359792444e-05, "loss": 0.0821, "num_input_tokens_seen": 69120, "step": 220, "train_runtime": 51.1853, "train_tokens_per_second": 1350.388 }, { "epoch": 2.5, "grad_norm": 0.0021248296834528446, "learning_rate": 4.9909046049328846e-05, "loss": 0.1169, "num_input_tokens_seen": 70688, "step": 225, "train_runtime": 52.1695, "train_tokens_per_second": 1354.968 }, { "epoch": 2.5555555555555554, "grad_norm": 0.0033943697344511747, "learning_rate": 4.988721662731083e-05, "loss": 0.0178, "num_input_tokens_seen": 72288, "step": 230, "train_runtime": 53.1709, "train_tokens_per_second": 1359.539 }, { "epoch": 2.611111111111111, "grad_norm": 0.0016524430830031633, "learning_rate": 4.9863047384206835e-05, "loss": 0.0005, "num_input_tokens_seen": 73856, "step": 235, "train_runtime": 54.1665, "train_tokens_per_second": 1363.5 }, { "epoch": 2.6666666666666665, "grad_norm": 0.0009248171118088067, "learning_rate": 4.983654059233626e-05, "loss": 0.0029, "num_input_tokens_seen": 75392, "step": 240, "train_runtime": 55.1584, "train_tokens_per_second": 1366.827 }, { "epoch": 2.7222222222222223, "grad_norm": 3.975990056991577, "learning_rate": 4.9807698743787744e-05, "loss": 0.0065, "num_input_tokens_seen": 76960, "step": 245, "train_runtime": 56.1528, "train_tokens_per_second": 1370.547 }, { "epoch": 2.7777777777777777, "grad_norm": 4.601274013519287, "learning_rate": 4.9776524550184965e-05, "loss": 0.0153, "num_input_tokens_seen": 78496, "step": 250, "train_runtime": 57.1444, "train_tokens_per_second": 1373.644 }, { "epoch": 2.8333333333333335, "grad_norm": 0.0008291220874525607, "learning_rate": 4.974302094243164e-05, "loss": 0.0017, "num_input_tokens_seen": 80000, "step": 255, "train_runtime": 58.1432, "train_tokens_per_second": 1375.914 }, { "epoch": 2.888888888888889, "grad_norm": 0.0009602424106560647, "learning_rate": 4.970719107043595e-05, "loss": 0.0003, "num_input_tokens_seen": 81568, "step": 260, "train_runtime": 59.1423, "train_tokens_per_second": 1379.182 }, { "epoch": 2.9444444444444446, "grad_norm": 17.70364761352539, "learning_rate": 4.966903830281449e-05, "loss": 0.0911, "num_input_tokens_seen": 83168, "step": 265, "train_runtime": 60.1379, "train_tokens_per_second": 1382.955 }, { "epoch": 3.0, "grad_norm": 0.0032549037132412195, "learning_rate": 4.962856622657541e-05, "loss": 0.0001, "num_input_tokens_seen": 84736, "step": 270, "train_runtime": 61.1564, "train_tokens_per_second": 1385.562 }, { "epoch": 3.0, "eval_loss": 0.031980086117982864, "eval_runtime": 0.6134, "eval_samples_per_second": 65.21, "eval_steps_per_second": 16.302, "num_input_tokens_seen": 84736, "step": 270 }, { "epoch": 3.0555555555555554, "grad_norm": 0.0005670373793691397, "learning_rate": 4.9585778646781364e-05, "loss": 0.0, "num_input_tokens_seen": 86304, "step": 275, "train_runtime": 64.1475, "train_tokens_per_second": 1345.399 }, { "epoch": 3.111111111111111, "grad_norm": 0.0004655662050936371, "learning_rate": 4.9540679586191605e-05, "loss": 0.0, "num_input_tokens_seen": 87904, "step": 280, "train_runtime": 65.1433, "train_tokens_per_second": 1349.395 }, { "epoch": 3.1666666666666665, "grad_norm": 0.013010970316827297, "learning_rate": 4.9493273284883854e-05, "loss": 0.0002, "num_input_tokens_seen": 89408, "step": 285, "train_runtime": 66.1464, "train_tokens_per_second": 1351.668 }, { "epoch": 3.2222222222222223, "grad_norm": 0.03000788763165474, "learning_rate": 4.9443564199855666e-05, "loss": 0.0431, "num_input_tokens_seen": 91008, "step": 290, "train_runtime": 67.5302, "train_tokens_per_second": 1347.664 }, { "epoch": 3.2777777777777777, "grad_norm": 0.011430704034864902, "learning_rate": 4.939155700460536e-05, "loss": 0.0001, "num_input_tokens_seen": 92512, "step": 295, "train_runtime": 68.5407, "train_tokens_per_second": 1349.738 }, { "epoch": 3.3333333333333335, "grad_norm": 0.0016691834898665547, "learning_rate": 4.933725658869267e-05, "loss": 0.0001, "num_input_tokens_seen": 94080, "step": 300, "train_runtime": 69.5493, "train_tokens_per_second": 1352.71 }, { "epoch": 3.388888888888889, "grad_norm": 0.0007329596555791795, "learning_rate": 4.9280668057279014e-05, "loss": 0.0003, "num_input_tokens_seen": 95680, "step": 305, "train_runtime": 70.5521, "train_tokens_per_second": 1356.16 }, { "epoch": 3.4444444444444446, "grad_norm": 0.0010038951877504587, "learning_rate": 4.9221796730647516e-05, "loss": 0.0001, "num_input_tokens_seen": 97248, "step": 310, "train_runtime": 71.5538, "train_tokens_per_second": 1359.09 }, { "epoch": 3.5, "grad_norm": 0.0010289876954630017, "learning_rate": 4.916064814370287e-05, "loss": 0.0001, "num_input_tokens_seen": 98784, "step": 315, "train_runtime": 72.5566, "train_tokens_per_second": 1361.476 }, { "epoch": 3.5555555555555554, "grad_norm": 0.0014408943243324757, "learning_rate": 4.9097228045450864e-05, "loss": 0.0, "num_input_tokens_seen": 100384, "step": 320, "train_runtime": 73.5624, "train_tokens_per_second": 1364.611 }, { "epoch": 3.611111111111111, "grad_norm": 0.0009136789012700319, "learning_rate": 4.9031542398457974e-05, "loss": 0.0001, "num_input_tokens_seen": 101952, "step": 325, "train_runtime": 74.564, "train_tokens_per_second": 1367.308 }, { "epoch": 3.6666666666666665, "grad_norm": 0.0063927448354661465, "learning_rate": 4.896359737829071e-05, "loss": 0.0, "num_input_tokens_seen": 103520, "step": 330, "train_runtime": 75.5715, "train_tokens_per_second": 1369.829 }, { "epoch": 3.7222222222222223, "grad_norm": 0.0008054813952185214, "learning_rate": 4.889339937293508e-05, "loss": 0.0, "num_input_tokens_seen": 105120, "step": 335, "train_runtime": 76.5593, "train_tokens_per_second": 1373.053 }, { "epoch": 3.7777777777777777, "grad_norm": 0.00381942605599761, "learning_rate": 4.8820954982195905e-05, "loss": 0.0, "num_input_tokens_seen": 106720, "step": 340, "train_runtime": 77.5551, "train_tokens_per_second": 1376.053 }, { "epoch": 3.8333333333333335, "grad_norm": 0.0009083571494556963, "learning_rate": 4.874627101707644e-05, "loss": 0.003, "num_input_tokens_seen": 108320, "step": 345, "train_runtime": 78.5473, "train_tokens_per_second": 1379.042 }, { "epoch": 3.888888888888889, "grad_norm": 0.0005886949365958571, "learning_rate": 4.8669354499137955e-05, "loss": 0.0001, "num_input_tokens_seen": 109888, "step": 350, "train_runtime": 79.5433, "train_tokens_per_second": 1381.486 }, { "epoch": 3.9444444444444446, "grad_norm": 0.0010697426041588187, "learning_rate": 4.859021265983959e-05, "loss": 0.0001, "num_input_tokens_seen": 111424, "step": 355, "train_runtime": 80.5314, "train_tokens_per_second": 1383.61 }, { "epoch": 4.0, "grad_norm": 0.0009298754739575088, "learning_rate": 4.850885293985853e-05, "loss": 0.0004, "num_input_tokens_seen": 113024, "step": 360, "train_runtime": 81.5524, "train_tokens_per_second": 1385.906 }, { "epoch": 4.0, "eval_loss": 0.13314101099967957, "eval_runtime": 0.6074, "eval_samples_per_second": 65.852, "eval_steps_per_second": 16.463, "num_input_tokens_seen": 113024, "step": 360 }, { "epoch": 4.055555555555555, "grad_norm": 0.00112513592466712, "learning_rate": 4.8425282988390376e-05, "loss": 0.0001, "num_input_tokens_seen": 114624, "step": 365, "train_runtime": 84.5041, "train_tokens_per_second": 1356.432 }, { "epoch": 4.111111111111111, "grad_norm": 0.0016354748513549566, "learning_rate": 4.8339510662430046e-05, "loss": 0.0, "num_input_tokens_seen": 116224, "step": 370, "train_runtime": 85.4986, "train_tokens_per_second": 1359.367 }, { "epoch": 4.166666666666667, "grad_norm": 0.0012682407395914197, "learning_rate": 4.825154402603308e-05, "loss": 0.0002, "num_input_tokens_seen": 117760, "step": 375, "train_runtime": 86.4897, "train_tokens_per_second": 1361.549 }, { "epoch": 4.222222222222222, "grad_norm": 0.0007402659975923598, "learning_rate": 4.816139134955746e-05, "loss": 0.0002, "num_input_tokens_seen": 119360, "step": 380, "train_runtime": 87.4932, "train_tokens_per_second": 1364.221 }, { "epoch": 4.277777777777778, "grad_norm": 0.0005341200157999992, "learning_rate": 4.806906110888606e-05, "loss": 0.0, "num_input_tokens_seen": 120960, "step": 385, "train_runtime": 88.8595, "train_tokens_per_second": 1361.249 }, { "epoch": 4.333333333333333, "grad_norm": 0.0011014444753527641, "learning_rate": 4.797456198462979e-05, "loss": 0.0, "num_input_tokens_seen": 122528, "step": 390, "train_runtime": 89.8594, "train_tokens_per_second": 1363.552 }, { "epoch": 4.388888888888889, "grad_norm": 0.0006175978342071176, "learning_rate": 4.7877902861311446e-05, "loss": 0.0, "num_input_tokens_seen": 124096, "step": 395, "train_runtime": 90.8578, "train_tokens_per_second": 1365.827 }, { "epoch": 4.444444444444445, "grad_norm": 0.01581227220594883, "learning_rate": 4.777909282653042e-05, "loss": 0.0, "num_input_tokens_seen": 125696, "step": 400, "train_runtime": 91.8515, "train_tokens_per_second": 1368.469 }, { "epoch": 4.5, "grad_norm": 0.0006168749532662332, "learning_rate": 4.7678141170108345e-05, "loss": 0.0, "num_input_tokens_seen": 127264, "step": 405, "train_runtime": 92.8418, "train_tokens_per_second": 1370.762 }, { "epoch": 4.555555555555555, "grad_norm": 0.0005042373668402433, "learning_rate": 4.757505738321563e-05, "loss": 0.0, "num_input_tokens_seen": 128832, "step": 410, "train_runtime": 93.837, "train_tokens_per_second": 1372.934 }, { "epoch": 4.611111111111111, "grad_norm": 0.0050198351964354515, "learning_rate": 4.7469851157479177e-05, "loss": 0.0, "num_input_tokens_seen": 130464, "step": 415, "train_runtime": 94.8275, "train_tokens_per_second": 1375.803 }, { "epoch": 4.666666666666667, "grad_norm": 0.006533036939799786, "learning_rate": 4.736253238407119e-05, "loss": 0.0, "num_input_tokens_seen": 132032, "step": 420, "train_runtime": 95.8227, "train_tokens_per_second": 1377.878 }, { "epoch": 4.722222222222222, "grad_norm": 0.019177285954356194, "learning_rate": 4.725311115277924e-05, "loss": 0.0, "num_input_tokens_seen": 133632, "step": 425, "train_runtime": 96.8089, "train_tokens_per_second": 1380.369 }, { "epoch": 4.777777777777778, "grad_norm": 0.00037291410262696445, "learning_rate": 4.714159775105765e-05, "loss": 0.0, "num_input_tokens_seen": 135232, "step": 430, "train_runtime": 97.7946, "train_tokens_per_second": 1382.817 }, { "epoch": 4.833333333333333, "grad_norm": 0.0007687642355449498, "learning_rate": 4.70280026630603e-05, "loss": 0.0, "num_input_tokens_seen": 136768, "step": 435, "train_runtime": 98.7848, "train_tokens_per_second": 1384.505 }, { "epoch": 4.888888888888889, "grad_norm": 0.0013276153476908803, "learning_rate": 4.6912336568654925e-05, "loss": 0.0, "num_input_tokens_seen": 138368, "step": 440, "train_runtime": 99.776, "train_tokens_per_second": 1386.786 }, { "epoch": 4.944444444444445, "grad_norm": 0.0006149167311377823, "learning_rate": 4.679461034241906e-05, "loss": 0.0, "num_input_tokens_seen": 139904, "step": 445, "train_runtime": 100.7675, "train_tokens_per_second": 1388.385 }, { "epoch": 5.0, "grad_norm": 0.0007349227671511471, "learning_rate": 4.667483505261762e-05, "loss": 0.0, "num_input_tokens_seen": 141440, "step": 450, "train_runtime": 101.7817, "train_tokens_per_second": 1389.641 }, { "epoch": 5.0, "eval_loss": 0.0918540507555008, "eval_runtime": 0.5945, "eval_samples_per_second": 67.279, "eval_steps_per_second": 16.82, "num_input_tokens_seen": 141440, "step": 450 }, { "epoch": 5.055555555555555, "grad_norm": 0.00029102564440108836, "learning_rate": 4.655302196016228e-05, "loss": 0.0, "num_input_tokens_seen": 142976, "step": 455, "train_runtime": 105.861, "train_tokens_per_second": 1350.601 }, { "epoch": 5.111111111111111, "grad_norm": 0.004522264935076237, "learning_rate": 4.642918251755281e-05, "loss": 0.0, "num_input_tokens_seen": 144576, "step": 460, "train_runtime": 106.8662, "train_tokens_per_second": 1352.869 }, { "epoch": 5.166666666666667, "grad_norm": 0.0003640787035692483, "learning_rate": 4.6303328367800284e-05, "loss": 0.0, "num_input_tokens_seen": 146144, "step": 465, "train_runtime": 107.8667, "train_tokens_per_second": 1354.858 }, { "epoch": 5.222222222222222, "grad_norm": 0.0005836294149048626, "learning_rate": 4.6175471343332485e-05, "loss": 0.0, "num_input_tokens_seen": 147712, "step": 470, "train_runtime": 109.244, "train_tokens_per_second": 1352.129 }, { "epoch": 5.277777777777778, "grad_norm": 0.0006122889462858438, "learning_rate": 4.604562346488144e-05, "loss": 0.0, "num_input_tokens_seen": 149248, "step": 475, "train_runtime": 110.2327, "train_tokens_per_second": 1353.935 }, { "epoch": 5.333333333333333, "grad_norm": 0.003949976991862059, "learning_rate": 4.591379694035325e-05, "loss": 0.0, "num_input_tokens_seen": 150816, "step": 480, "train_runtime": 111.2319, "train_tokens_per_second": 1355.87 }, { "epoch": 5.388888888888889, "grad_norm": 0.00033874381915666163, "learning_rate": 4.5780004163680365e-05, "loss": 0.0, "num_input_tokens_seen": 152352, "step": 485, "train_runtime": 112.2295, "train_tokens_per_second": 1357.504 }, { "epoch": 5.444444444444445, "grad_norm": 0.00028005815693177283, "learning_rate": 4.5644257713656356e-05, "loss": 0.0, "num_input_tokens_seen": 153888, "step": 490, "train_runtime": 113.2177, "train_tokens_per_second": 1359.222 }, { "epoch": 5.5, "grad_norm": 0.0004653169307857752, "learning_rate": 4.550657035275323e-05, "loss": 0.0, "num_input_tokens_seen": 155488, "step": 495, "train_runtime": 114.2099, "train_tokens_per_second": 1361.423 }, { "epoch": 5.555555555555555, "grad_norm": 0.0005347527912817895, "learning_rate": 4.536695502592162e-05, "loss": 0.0, "num_input_tokens_seen": 157024, "step": 500, "train_runtime": 115.1947, "train_tokens_per_second": 1363.118 }, { "epoch": 5.611111111111111, "grad_norm": 0.00025865226052701473, "learning_rate": 4.522542485937369e-05, "loss": 0.0, "num_input_tokens_seen": 158528, "step": 505, "train_runtime": 116.1728, "train_tokens_per_second": 1364.587 }, { "epoch": 5.666666666666667, "grad_norm": 0.0004275833198335022, "learning_rate": 4.5081993159349056e-05, "loss": 0.0, "num_input_tokens_seen": 160064, "step": 510, "train_runtime": 117.156, "train_tokens_per_second": 1366.247 }, { "epoch": 5.722222222222222, "grad_norm": 0.0003096760483458638, "learning_rate": 4.493667341086379e-05, "loss": 0.0, "num_input_tokens_seen": 161664, "step": 515, "train_runtime": 118.1395, "train_tokens_per_second": 1368.416 }, { "epoch": 5.777777777777778, "grad_norm": 0.00047315939445979893, "learning_rate": 4.478947927644258e-05, "loss": 0.0, "num_input_tokens_seen": 163264, "step": 520, "train_runtime": 119.1311, "train_tokens_per_second": 1370.457 }, { "epoch": 5.833333333333333, "grad_norm": 0.00034214110928587615, "learning_rate": 4.464042459483425e-05, "loss": 0.0, "num_input_tokens_seen": 164864, "step": 525, "train_runtime": 120.1224, "train_tokens_per_second": 1372.466 }, { "epoch": 5.888888888888889, "grad_norm": 0.00308756111189723, "learning_rate": 4.448952337971064e-05, "loss": 0.0, "num_input_tokens_seen": 166432, "step": 530, "train_runtime": 121.1211, "train_tokens_per_second": 1374.096 }, { "epoch": 5.944444444444445, "grad_norm": 0.004324956797063351, "learning_rate": 4.43367898183491e-05, "loss": 0.0, "num_input_tokens_seen": 168032, "step": 535, "train_runtime": 122.1198, "train_tokens_per_second": 1375.961 }, { "epoch": 6.0, "grad_norm": 0.004404026083648205, "learning_rate": 4.418223827029867e-05, "loss": 0.0, "num_input_tokens_seen": 169600, "step": 540, "train_runtime": 123.1419, "train_tokens_per_second": 1377.272 }, { "epoch": 6.0, "eval_loss": 0.0915924459695816, "eval_runtime": 0.596, "eval_samples_per_second": 67.115, "eval_steps_per_second": 16.779, "num_input_tokens_seen": 169600, "step": 540 }, { "epoch": 6.055555555555555, "grad_norm": 0.002966878702864051, "learning_rate": 4.402588326603002e-05, "loss": 0.0, "num_input_tokens_seen": 171168, "step": 545, "train_runtime": 126.2563, "train_tokens_per_second": 1355.718 }, { "epoch": 6.111111111111111, "grad_norm": 0.0008398924255743623, "learning_rate": 4.386773950556931e-05, "loss": 0.0, "num_input_tokens_seen": 172672, "step": 550, "train_runtime": 127.2547, "train_tokens_per_second": 1356.901 }, { "epoch": 6.166666666666667, "grad_norm": 0.000296527985483408, "learning_rate": 4.3707821857116176e-05, "loss": 0.0, "num_input_tokens_seen": 174240, "step": 555, "train_runtime": 128.2549, "train_tokens_per_second": 1358.544 }, { "epoch": 6.222222222222222, "grad_norm": 0.0003234162868466228, "learning_rate": 4.354614535564588e-05, "loss": 0.0, "num_input_tokens_seen": 175776, "step": 560, "train_runtime": 129.2509, "train_tokens_per_second": 1359.96 }, { "epoch": 6.277777777777778, "grad_norm": 0.0002368101995671168, "learning_rate": 4.3382725201495723e-05, "loss": 0.0, "num_input_tokens_seen": 177376, "step": 565, "train_runtime": 130.6234, "train_tokens_per_second": 1357.919 }, { "epoch": 6.333333333333333, "grad_norm": 0.0002874001220334321, "learning_rate": 4.321757675893596e-05, "loss": 0.0, "num_input_tokens_seen": 178912, "step": 570, "train_runtime": 131.6129, "train_tokens_per_second": 1359.38 }, { "epoch": 6.388888888888889, "grad_norm": 0.00027711590519174933, "learning_rate": 4.305071555472534e-05, "loss": 0.0, "num_input_tokens_seen": 180480, "step": 575, "train_runtime": 132.6048, "train_tokens_per_second": 1361.036 }, { "epoch": 6.444444444444445, "grad_norm": 0.010155759751796722, "learning_rate": 4.288215727665129e-05, "loss": 0.0, "num_input_tokens_seen": 182048, "step": 580, "train_runtime": 133.6061, "train_tokens_per_second": 1362.572 }, { "epoch": 6.5, "grad_norm": 0.0001757553982315585, "learning_rate": 4.2711917772055e-05, "loss": 0.0, "num_input_tokens_seen": 183648, "step": 585, "train_runtime": 134.6109, "train_tokens_per_second": 1364.288 }, { "epoch": 6.555555555555555, "grad_norm": 0.001011924701742828, "learning_rate": 4.254001304634151e-05, "loss": 0.0, "num_input_tokens_seen": 185248, "step": 590, "train_runtime": 135.6027, "train_tokens_per_second": 1366.109 }, { "epoch": 6.611111111111111, "grad_norm": 0.00028477245359681547, "learning_rate": 4.2366459261474933e-05, "loss": 0.0, "num_input_tokens_seen": 186720, "step": 595, "train_runtime": 136.5916, "train_tokens_per_second": 1366.994 }, { "epoch": 6.666666666666667, "grad_norm": 0.000251986290095374, "learning_rate": 4.2191272734458955e-05, "loss": 0.0, "num_input_tokens_seen": 188288, "step": 600, "train_runtime": 137.5781, "train_tokens_per_second": 1368.59 }, { "epoch": 6.722222222222222, "grad_norm": 0.00020348228281363845, "learning_rate": 4.201446993580276e-05, "loss": 0.0, "num_input_tokens_seen": 189888, "step": 605, "train_runtime": 138.5612, "train_tokens_per_second": 1370.427 }, { "epoch": 6.777777777777778, "grad_norm": 0.00034336469252593815, "learning_rate": 4.183606748797251e-05, "loss": 0.0, "num_input_tokens_seen": 191424, "step": 610, "train_runtime": 139.5474, "train_tokens_per_second": 1371.749 }, { "epoch": 6.833333333333333, "grad_norm": 0.00023455290647689253, "learning_rate": 4.1656082163828566e-05, "loss": 0.0, "num_input_tokens_seen": 193056, "step": 615, "train_runtime": 140.5386, "train_tokens_per_second": 1373.687 }, { "epoch": 6.888888888888889, "grad_norm": 0.00036284461384639144, "learning_rate": 4.147453088504854e-05, "loss": 0.0, "num_input_tokens_seen": 194592, "step": 620, "train_runtime": 141.5234, "train_tokens_per_second": 1374.981 }, { "epoch": 6.944444444444445, "grad_norm": 0.00021292144083417952, "learning_rate": 4.129143072053638e-05, "loss": 0.0, "num_input_tokens_seen": 196192, "step": 625, "train_runtime": 142.5109, "train_tokens_per_second": 1376.68 }, { "epoch": 7.0, "grad_norm": 0.0002205904747825116, "learning_rate": 4.110679888481763e-05, "loss": 0.0, "num_input_tokens_seen": 197792, "step": 630, "train_runtime": 143.5317, "train_tokens_per_second": 1378.037 }, { "epoch": 7.0, "eval_loss": 0.09356953203678131, "eval_runtime": 0.6071, "eval_samples_per_second": 65.885, "eval_steps_per_second": 16.471, "num_input_tokens_seen": 197792, "step": 630 }, { "epoch": 7.055555555555555, "grad_norm": 0.0002980381832458079, "learning_rate": 4.09206527364209e-05, "loss": 0.0, "num_input_tokens_seen": 199392, "step": 635, "train_runtime": 147.3203, "train_tokens_per_second": 1353.459 }, { "epoch": 7.111111111111111, "grad_norm": 0.00024097529239952564, "learning_rate": 4.073300977624594e-05, "loss": 0.0, "num_input_tokens_seen": 200992, "step": 640, "train_runtime": 148.3313, "train_tokens_per_second": 1355.021 }, { "epoch": 7.166666666666667, "grad_norm": 0.0004512780287768692, "learning_rate": 4.054388764591822e-05, "loss": 0.0, "num_input_tokens_seen": 202592, "step": 645, "train_runtime": 149.3397, "train_tokens_per_second": 1356.585 }, { "epoch": 7.222222222222222, "grad_norm": 0.00019677304953802377, "learning_rate": 4.035330412613035e-05, "loss": 0.0, "num_input_tokens_seen": 204064, "step": 650, "train_runtime": 150.7088, "train_tokens_per_second": 1354.028 }, { "epoch": 7.277777777777778, "grad_norm": 0.0002680430479813367, "learning_rate": 4.0161277134970345e-05, "loss": 0.0, "num_input_tokens_seen": 205664, "step": 655, "train_runtime": 151.7101, "train_tokens_per_second": 1355.638 }, { "epoch": 7.333333333333333, "grad_norm": 0.0026542171835899353, "learning_rate": 3.996782472623705e-05, "loss": 0.0, "num_input_tokens_seen": 207264, "step": 660, "train_runtime": 152.7074, "train_tokens_per_second": 1357.262 }, { "epoch": 7.388888888888889, "grad_norm": 0.000235486586461775, "learning_rate": 3.977296508774278e-05, "loss": 0.0, "num_input_tokens_seen": 208832, "step": 665, "train_runtime": 153.6955, "train_tokens_per_second": 1358.738 }, { "epoch": 7.444444444444445, "grad_norm": 0.00028199254302307963, "learning_rate": 3.957671653960337e-05, "loss": 0.0, "num_input_tokens_seen": 210368, "step": 670, "train_runtime": 154.6827, "train_tokens_per_second": 1359.997 }, { "epoch": 7.5, "grad_norm": 0.0038107491564005613, "learning_rate": 3.9379097532515725e-05, "loss": 0.0, "num_input_tokens_seen": 211936, "step": 675, "train_runtime": 155.6733, "train_tokens_per_second": 1361.416 }, { "epoch": 7.555555555555555, "grad_norm": 0.00019210667232982814, "learning_rate": 3.918012664602317e-05, "loss": 0.0, "num_input_tokens_seen": 213536, "step": 680, "train_runtime": 156.657, "train_tokens_per_second": 1363.08 }, { "epoch": 7.611111111111111, "grad_norm": 0.00032003698288463056, "learning_rate": 3.897982258676867e-05, "loss": 0.0, "num_input_tokens_seen": 215136, "step": 685, "train_runtime": 157.6422, "train_tokens_per_second": 1364.71 }, { "epoch": 7.666666666666667, "grad_norm": 0.005916266702115536, "learning_rate": 3.8778204186736076e-05, "loss": 0.0, "num_input_tokens_seen": 216736, "step": 690, "train_runtime": 158.6275, "train_tokens_per_second": 1366.32 }, { "epoch": 7.722222222222222, "grad_norm": 0.00015416365931741893, "learning_rate": 3.8575290401479586e-05, "loss": 0.0, "num_input_tokens_seen": 218272, "step": 695, "train_runtime": 159.6154, "train_tokens_per_second": 1367.487 }, { "epoch": 7.777777777777778, "grad_norm": 0.00023018961655907333, "learning_rate": 3.837110030834161e-05, "loss": 0.0, "num_input_tokens_seen": 219808, "step": 700, "train_runtime": 160.6075, "train_tokens_per_second": 1368.603 }, { "epoch": 7.833333333333333, "grad_norm": 0.00026057587820105255, "learning_rate": 3.8165653104659185e-05, "loss": 0.0, "num_input_tokens_seen": 221312, "step": 705, "train_runtime": 161.5993, "train_tokens_per_second": 1369.511 }, { "epoch": 7.888888888888889, "grad_norm": 0.00022156370687298477, "learning_rate": 3.79589681059591e-05, "loss": 0.0, "num_input_tokens_seen": 222880, "step": 710, "train_runtime": 162.5872, "train_tokens_per_second": 1370.834 }, { "epoch": 7.944444444444445, "grad_norm": 0.0018801233964040875, "learning_rate": 3.775106474414188e-05, "loss": 0.0, "num_input_tokens_seen": 224416, "step": 715, "train_runtime": 163.5753, "train_tokens_per_second": 1371.943 }, { "epoch": 8.0, "grad_norm": 0.00017104309517890215, "learning_rate": 3.75419625656549e-05, "loss": 0.0, "num_input_tokens_seen": 225984, "step": 720, "train_runtime": 164.5945, "train_tokens_per_second": 1372.974 }, { "epoch": 8.0, "eval_loss": 0.09462722390890121, "eval_runtime": 0.598, "eval_samples_per_second": 66.886, "eval_steps_per_second": 16.721, "num_input_tokens_seen": 225984, "step": 720 }, { "epoch": 8.055555555555555, "grad_norm": 0.0003053999971598387, "learning_rate": 3.7331681229654635e-05, "loss": 0.0, "num_input_tokens_seen": 227552, "step": 725, "train_runtime": 168.3399, "train_tokens_per_second": 1351.741 }, { "epoch": 8.11111111111111, "grad_norm": 0.0002048484020633623, "learning_rate": 3.712024050615843e-05, "loss": 0.0, "num_input_tokens_seen": 229088, "step": 730, "train_runtime": 169.3378, "train_tokens_per_second": 1352.846 }, { "epoch": 8.166666666666666, "grad_norm": 0.0001294966641580686, "learning_rate": 3.690766027418573e-05, "loss": 0.0, "num_input_tokens_seen": 230656, "step": 735, "train_runtime": 170.3353, "train_tokens_per_second": 1354.129 }, { "epoch": 8.222222222222221, "grad_norm": 0.002143693622201681, "learning_rate": 3.6693960519889106e-05, "loss": 0.0, "num_input_tokens_seen": 232224, "step": 740, "train_runtime": 171.6703, "train_tokens_per_second": 1352.733 }, { "epoch": 8.277777777777779, "grad_norm": 0.00016797089483588934, "learning_rate": 3.6479161334675296e-05, "loss": 0.0, "num_input_tokens_seen": 233792, "step": 745, "train_runtime": 172.6956, "train_tokens_per_second": 1353.781 }, { "epoch": 8.333333333333334, "grad_norm": 0.0014062707778066397, "learning_rate": 3.626328291331618e-05, "loss": 0.0, "num_input_tokens_seen": 235328, "step": 750, "train_runtime": 173.6966, "train_tokens_per_second": 1354.822 }, { "epoch": 8.38888888888889, "grad_norm": 0.0001441802887711674, "learning_rate": 3.60463455520502e-05, "loss": 0.0, "num_input_tokens_seen": 236864, "step": 755, "train_runtime": 174.7044, "train_tokens_per_second": 1355.799 }, { "epoch": 8.444444444444445, "grad_norm": 0.00014861173985991627, "learning_rate": 3.582836964667408e-05, "loss": 0.0, "num_input_tokens_seen": 238368, "step": 760, "train_runtime": 175.6965, "train_tokens_per_second": 1356.703 }, { "epoch": 8.5, "grad_norm": 0.005405796226114035, "learning_rate": 3.560937569062538e-05, "loss": 0.0, "num_input_tokens_seen": 239936, "step": 765, "train_runtime": 176.6915, "train_tokens_per_second": 1357.938 }, { "epoch": 8.555555555555555, "grad_norm": 0.00024216774909291416, "learning_rate": 3.538938427305573e-05, "loss": 0.0, "num_input_tokens_seen": 241536, "step": 770, "train_runtime": 177.6905, "train_tokens_per_second": 1359.308 }, { "epoch": 8.61111111111111, "grad_norm": 0.00018038312555290759, "learning_rate": 3.516841607689501e-05, "loss": 0.0, "num_input_tokens_seen": 243136, "step": 775, "train_runtime": 178.6849, "train_tokens_per_second": 1360.697 }, { "epoch": 8.666666666666666, "grad_norm": 0.00010779478179756552, "learning_rate": 3.494649187690695e-05, "loss": 0.0, "num_input_tokens_seen": 244704, "step": 780, "train_runtime": 179.6734, "train_tokens_per_second": 1361.938 }, { "epoch": 8.722222222222221, "grad_norm": 0.00021039468992967159, "learning_rate": 3.4723632537735846e-05, "loss": 0.0, "num_input_tokens_seen": 246272, "step": 785, "train_runtime": 180.663, "train_tokens_per_second": 1363.157 }, { "epoch": 8.777777777777779, "grad_norm": 0.00020861504890490323, "learning_rate": 3.449985901194498e-05, "loss": 0.0, "num_input_tokens_seen": 247808, "step": 790, "train_runtime": 181.6553, "train_tokens_per_second": 1364.166 }, { "epoch": 8.833333333333334, "grad_norm": 0.00012843194417655468, "learning_rate": 3.427519233804667e-05, "loss": 0.0, "num_input_tokens_seen": 249376, "step": 795, "train_runtime": 182.6534, "train_tokens_per_second": 1365.296 }, { "epoch": 8.88888888888889, "grad_norm": 0.0010842857882380486, "learning_rate": 3.404965363852437e-05, "loss": 0.0, "num_input_tokens_seen": 250944, "step": 800, "train_runtime": 183.6402, "train_tokens_per_second": 1366.498 }, { "epoch": 8.944444444444445, "grad_norm": 0.00013960848446004093, "learning_rate": 3.382326411784672e-05, "loss": 0.0, "num_input_tokens_seen": 252512, "step": 805, "train_runtime": 184.6256, "train_tokens_per_second": 1367.698 }, { "epoch": 9.0, "grad_norm": 0.00010977009515045211, "learning_rate": 3.359604506047403e-05, "loss": 0.0, "num_input_tokens_seen": 254112, "step": 810, "train_runtime": 185.6479, "train_tokens_per_second": 1368.785 }, { "epoch": 9.0, "eval_loss": 0.09544342756271362, "eval_runtime": 0.6013, "eval_samples_per_second": 66.52, "eval_steps_per_second": 16.63, "num_input_tokens_seen": 254112, "step": 810 }, { "epoch": 9.055555555555555, "grad_norm": 0.00026078041992150247, "learning_rate": 3.336801782885712e-05, "loss": 0.0, "num_input_tokens_seen": 255680, "step": 815, "train_runtime": 188.653, "train_tokens_per_second": 1355.292 }, { "epoch": 9.11111111111111, "grad_norm": 0.001745498855598271, "learning_rate": 3.313920386142892e-05, "loss": 0.0, "num_input_tokens_seen": 257216, "step": 820, "train_runtime": 189.6522, "train_tokens_per_second": 1356.251 }, { "epoch": 9.166666666666666, "grad_norm": 0.00015339218953158706, "learning_rate": 3.290962467058891e-05, "loss": 0.0, "num_input_tokens_seen": 258816, "step": 825, "train_runtime": 190.653, "train_tokens_per_second": 1357.524 }, { "epoch": 9.222222222222221, "grad_norm": 0.00017147160542663187, "learning_rate": 3.267930184068057e-05, "loss": 0.0, "num_input_tokens_seen": 260384, "step": 830, "train_runtime": 192.0257, "train_tokens_per_second": 1355.985 }, { "epoch": 9.277777777777779, "grad_norm": 0.001096663880161941, "learning_rate": 3.244825702596205e-05, "loss": 0.0, "num_input_tokens_seen": 262048, "step": 835, "train_runtime": 193.0221, "train_tokens_per_second": 1357.607 }, { "epoch": 9.333333333333334, "grad_norm": 0.00014087087765801698, "learning_rate": 3.2216511948570374e-05, "loss": 0.0, "num_input_tokens_seen": 263616, "step": 840, "train_runtime": 194.0181, "train_tokens_per_second": 1358.718 }, { "epoch": 9.38888888888889, "grad_norm": 0.001216462580487132, "learning_rate": 3.198408839647911e-05, "loss": 0.0, "num_input_tokens_seen": 265152, "step": 845, "train_runtime": 195.0127, "train_tokens_per_second": 1359.665 }, { "epoch": 9.444444444444445, "grad_norm": 0.00014244158228393644, "learning_rate": 3.1751008221450025e-05, "loss": 0.0, "num_input_tokens_seen": 266688, "step": 850, "train_runtime": 196.0079, "train_tokens_per_second": 1360.598 }, { "epoch": 9.5, "grad_norm": 0.00025856212596409023, "learning_rate": 3.151729333697854e-05, "loss": 0.0, "num_input_tokens_seen": 268256, "step": 855, "train_runtime": 197.0069, "train_tokens_per_second": 1361.658 }, { "epoch": 9.555555555555555, "grad_norm": 0.0008981631835922599, "learning_rate": 3.1282965716233594e-05, "loss": 0.0, "num_input_tokens_seen": 269824, "step": 860, "train_runtime": 197.9964, "train_tokens_per_second": 1362.772 }, { "epoch": 9.61111111111111, "grad_norm": 0.0005464640562422574, "learning_rate": 3.104804738999169e-05, "loss": 0.0, "num_input_tokens_seen": 271424, "step": 865, "train_runtime": 198.9842, "train_tokens_per_second": 1364.048 }, { "epoch": 9.666666666666666, "grad_norm": 0.0024021826684474945, "learning_rate": 3.0812560444565745e-05, "loss": 0.0, "num_input_tokens_seen": 272960, "step": 870, "train_runtime": 199.9757, "train_tokens_per_second": 1364.966 }, { "epoch": 9.722222222222221, "grad_norm": 0.00012538804730866104, "learning_rate": 3.057652701972848e-05, "loss": 0.0, "num_input_tokens_seen": 274528, "step": 875, "train_runtime": 200.9703, "train_tokens_per_second": 1366.013 }, { "epoch": 9.777777777777779, "grad_norm": 0.00012139719910919666, "learning_rate": 3.0339969306631005e-05, "loss": 0.0, "num_input_tokens_seen": 276128, "step": 880, "train_runtime": 201.9634, "train_tokens_per_second": 1367.218 }, { "epoch": 9.833333333333334, "grad_norm": 9.653216693550348e-05, "learning_rate": 3.0102909545716396e-05, "loss": 0.0, "num_input_tokens_seen": 277664, "step": 885, "train_runtime": 202.9471, "train_tokens_per_second": 1368.159 }, { "epoch": 9.88888888888889, "grad_norm": 0.00013615642092190683, "learning_rate": 2.9865370024628775e-05, "loss": 0.0, "num_input_tokens_seen": 279232, "step": 890, "train_runtime": 203.9343, "train_tokens_per_second": 1369.225 }, { "epoch": 9.944444444444445, "grad_norm": 0.0023047905415296555, "learning_rate": 2.9627373076117863e-05, "loss": 0.0, "num_input_tokens_seen": 280768, "step": 895, "train_runtime": 204.9205, "train_tokens_per_second": 1370.131 }, { "epoch": 10.0, "grad_norm": 9.74706417764537e-05, "learning_rate": 2.9388941075939334e-05, "loss": 0.0, "num_input_tokens_seen": 282368, "step": 900, "train_runtime": 205.9446, "train_tokens_per_second": 1371.087 }, { "epoch": 10.0, "eval_loss": 0.09752228856086731, "eval_runtime": 0.6023, "eval_samples_per_second": 66.412, "eval_steps_per_second": 16.603, "num_input_tokens_seen": 282368, "step": 900 }, { "epoch": 10.055555555555555, "grad_norm": 0.0006159305921755731, "learning_rate": 2.9150096440751107e-05, "loss": 0.0, "num_input_tokens_seen": 283936, "step": 905, "train_runtime": 208.9239, "train_tokens_per_second": 1359.04 }, { "epoch": 10.11111111111111, "grad_norm": 0.0004533717001322657, "learning_rate": 2.8910861626005776e-05, "loss": 0.0, "num_input_tokens_seen": 285504, "step": 910, "train_runtime": 209.9285, "train_tokens_per_second": 1360.006 }, { "epoch": 10.166666666666666, "grad_norm": 0.00022959569469094276, "learning_rate": 2.8671259123839472e-05, "loss": 0.0, "num_input_tokens_seen": 287072, "step": 915, "train_runtime": 210.9305, "train_tokens_per_second": 1360.979 }, { "epoch": 10.222222222222221, "grad_norm": 0.00010036968888016418, "learning_rate": 2.843131146095719e-05, "loss": 0.0, "num_input_tokens_seen": 288576, "step": 920, "train_runtime": 212.3072, "train_tokens_per_second": 1359.238 }, { "epoch": 10.277777777777779, "grad_norm": 0.0005590940127149224, "learning_rate": 2.8191041196514873e-05, "loss": 0.0, "num_input_tokens_seen": 290144, "step": 925, "train_runtime": 213.3028, "train_tokens_per_second": 1360.244 }, { "epoch": 10.333333333333334, "grad_norm": 9.856712131295353e-05, "learning_rate": 2.795047091999849e-05, "loss": 0.0, "num_input_tokens_seen": 291744, "step": 930, "train_runtime": 214.306, "train_tokens_per_second": 1361.343 }, { "epoch": 10.38888888888889, "grad_norm": 0.0005754076410084963, "learning_rate": 2.770962324910027e-05, "loss": 0.0, "num_input_tokens_seen": 293344, "step": 935, "train_runtime": 215.2976, "train_tokens_per_second": 1362.505 }, { "epoch": 10.444444444444445, "grad_norm": 0.0001776302233338356, "learning_rate": 2.7468520827592197e-05, "loss": 0.0, "num_input_tokens_seen": 294912, "step": 940, "train_runtime": 216.2823, "train_tokens_per_second": 1363.551 }, { "epoch": 10.5, "grad_norm": 9.569031681166962e-05, "learning_rate": 2.7227186323197162e-05, "loss": 0.0, "num_input_tokens_seen": 296480, "step": 945, "train_runtime": 217.2754, "train_tokens_per_second": 1364.536 }, { "epoch": 10.555555555555555, "grad_norm": 0.00022195291239768267, "learning_rate": 2.6985642425457757e-05, "loss": 0.0, "num_input_tokens_seen": 298048, "step": 950, "train_runtime": 218.2618, "train_tokens_per_second": 1365.553 }, { "epoch": 10.61111111111111, "grad_norm": 0.00029812438879162073, "learning_rate": 2.674391184360313e-05, "loss": 0.0, "num_input_tokens_seen": 299648, "step": 955, "train_runtime": 219.2478, "train_tokens_per_second": 1366.709 }, { "epoch": 10.666666666666666, "grad_norm": 0.00017939717508852482, "learning_rate": 2.650201730441392e-05, "loss": 0.0, "num_input_tokens_seen": 301216, "step": 960, "train_runtime": 220.2385, "train_tokens_per_second": 1367.681 }, { "epoch": 10.722222222222221, "grad_norm": 0.00010067274706671014, "learning_rate": 2.6259981550085504e-05, "loss": 0.0, "num_input_tokens_seen": 302784, "step": 965, "train_runtime": 221.234, "train_tokens_per_second": 1368.614 }, { "epoch": 10.777777777777779, "grad_norm": 0.0001221598795382306, "learning_rate": 2.60178273360899e-05, "loss": 0.0, "num_input_tokens_seen": 304320, "step": 970, "train_runtime": 222.2212, "train_tokens_per_second": 1369.446 }, { "epoch": 10.833333333333334, "grad_norm": 8.320201595779508e-05, "learning_rate": 2.5775577429036345e-05, "loss": 0.0, "num_input_tokens_seen": 305856, "step": 975, "train_runtime": 223.2099, "train_tokens_per_second": 1370.262 }, { "epoch": 10.88888888888889, "grad_norm": 9.071001841221005e-05, "learning_rate": 2.553325460453086e-05, "loss": 0.0, "num_input_tokens_seen": 307424, "step": 980, "train_runtime": 224.1941, "train_tokens_per_second": 1371.241 }, { "epoch": 10.944444444444445, "grad_norm": 8.711887494428083e-05, "learning_rate": 2.5290881645034932e-05, "loss": 0.0, "num_input_tokens_seen": 308992, "step": 985, "train_runtime": 225.1769, "train_tokens_per_second": 1372.219 }, { "epoch": 11.0, "grad_norm": 0.0004310951044317335, "learning_rate": 2.504848133772358e-05, "loss": 0.0, "num_input_tokens_seen": 310560, "step": 990, "train_runtime": 226.1973, "train_tokens_per_second": 1372.961 }, { "epoch": 11.0, "eval_loss": 0.09742120653390884, "eval_runtime": 0.609, "eval_samples_per_second": 65.684, "eval_steps_per_second": 16.421, "num_input_tokens_seen": 310560, "step": 990 }, { "epoch": 11.055555555555555, "grad_norm": 0.0001357738219667226, "learning_rate": 2.4806076472342997e-05, "loss": 0.0, "num_input_tokens_seen": 312160, "step": 995, "train_runtime": 229.2017, "train_tokens_per_second": 1361.944 }, { "epoch": 11.11111111111111, "grad_norm": 0.0001030619241646491, "learning_rate": 2.4563689839067913e-05, "loss": 0.0, "num_input_tokens_seen": 313728, "step": 1000, "train_runtime": 230.2071, "train_tokens_per_second": 1362.808 }, { "epoch": 11.166666666666666, "grad_norm": 0.00016433015116490424, "learning_rate": 2.432134422635893e-05, "loss": 0.0, "num_input_tokens_seen": 315264, "step": 1005, "train_runtime": 231.2095, "train_tokens_per_second": 1363.542 }, { "epoch": 11.222222222222221, "grad_norm": 9.156288433587179e-05, "learning_rate": 2.4079062418820002e-05, "loss": 0.0, "num_input_tokens_seen": 316864, "step": 1010, "train_runtime": 232.5834, "train_tokens_per_second": 1362.367 }, { "epoch": 11.277777777777779, "grad_norm": 0.00010515288158785552, "learning_rate": 2.3836867195056335e-05, "loss": 0.0, "num_input_tokens_seen": 318432, "step": 1015, "train_runtime": 233.5728, "train_tokens_per_second": 1363.309 }, { "epoch": 11.333333333333334, "grad_norm": 9.929422958521172e-05, "learning_rate": 2.3594781325532784e-05, "loss": 0.0, "num_input_tokens_seen": 320032, "step": 1020, "train_runtime": 234.5725, "train_tokens_per_second": 1364.32 }, { "epoch": 11.38888888888889, "grad_norm": 0.00018657129839994013, "learning_rate": 2.3352827570433036e-05, "loss": 0.0, "num_input_tokens_seen": 321536, "step": 1025, "train_runtime": 235.5754, "train_tokens_per_second": 1364.896 }, { "epoch": 11.444444444444445, "grad_norm": 0.0009437872213311493, "learning_rate": 2.3111028677519804e-05, "loss": 0.0, "num_input_tokens_seen": 323040, "step": 1030, "train_runtime": 236.5719, "train_tokens_per_second": 1365.505 }, { "epoch": 11.5, "grad_norm": 0.0004917692276649177, "learning_rate": 2.2869407379996088e-05, "loss": 0.0, "num_input_tokens_seen": 324608, "step": 1035, "train_runtime": 237.5635, "train_tokens_per_second": 1366.405 }, { "epoch": 11.555555555555555, "grad_norm": 0.00029067799914628267, "learning_rate": 2.2627986394367938e-05, "loss": 0.0, "num_input_tokens_seen": 326144, "step": 1040, "train_runtime": 238.5496, "train_tokens_per_second": 1367.196 }, { "epoch": 11.61111111111111, "grad_norm": 0.0002513430663384497, "learning_rate": 2.238678841830867e-05, "loss": 0.0, "num_input_tokens_seen": 327712, "step": 1045, "train_runtime": 239.545, "train_tokens_per_second": 1368.061 }, { "epoch": 11.666666666666666, "grad_norm": 9.253092866856605e-05, "learning_rate": 2.2145836128524902e-05, "loss": 0.0, "num_input_tokens_seen": 329248, "step": 1050, "train_runtime": 240.5431, "train_tokens_per_second": 1368.769 }, { "epoch": 11.722222222222221, "grad_norm": 0.0007531816372647882, "learning_rate": 2.1905152178624595e-05, "loss": 0.0, "num_input_tokens_seen": 330816, "step": 1055, "train_runtime": 241.5329, "train_tokens_per_second": 1369.652 }, { "epoch": 11.777777777777779, "grad_norm": 0.00010041355562862009, "learning_rate": 2.1664759196987182e-05, "loss": 0.0, "num_input_tokens_seen": 332416, "step": 1060, "train_runtime": 242.5302, "train_tokens_per_second": 1370.617 }, { "epoch": 11.833333333333334, "grad_norm": 9.113392297876999e-05, "learning_rate": 2.1424679784636144e-05, "loss": 0.0, "num_input_tokens_seen": 334016, "step": 1065, "train_runtime": 243.5141, "train_tokens_per_second": 1371.649 }, { "epoch": 11.88888888888889, "grad_norm": 0.00012517881987150759, "learning_rate": 2.118493651311413e-05, "loss": 0.0, "num_input_tokens_seen": 335616, "step": 1070, "train_runtime": 244.4977, "train_tokens_per_second": 1372.676 }, { "epoch": 11.944444444444445, "grad_norm": 0.00023915937345009297, "learning_rate": 2.0945551922360818e-05, "loss": 0.0, "num_input_tokens_seen": 337152, "step": 1075, "train_runtime": 245.4795, "train_tokens_per_second": 1373.442 }, { "epoch": 12.0, "grad_norm": 0.00010040521738119423, "learning_rate": 2.070654851859383e-05, "loss": 0.0, "num_input_tokens_seen": 338784, "step": 1080, "train_runtime": 246.4941, "train_tokens_per_second": 1374.41 }, { "epoch": 12.0, "eval_loss": 0.09843836724758148, "eval_runtime": 0.5963, "eval_samples_per_second": 67.075, "eval_steps_per_second": 16.769, "num_input_tokens_seen": 338784, "step": 1080 }, { "epoch": 12.055555555555555, "grad_norm": 0.00010358745203120634, "learning_rate": 2.0467948772192713e-05, "loss": 0.0, "num_input_tokens_seen": 340288, "step": 1085, "train_runtime": 250.4628, "train_tokens_per_second": 1358.637 }, { "epoch": 12.11111111111111, "grad_norm": 0.00046444806503131986, "learning_rate": 2.022977511558638e-05, "loss": 0.0, "num_input_tokens_seen": 341888, "step": 1090, "train_runtime": 251.4632, "train_tokens_per_second": 1359.595 }, { "epoch": 12.166666666666666, "grad_norm": 9.370278712594882e-05, "learning_rate": 1.9992049941144066e-05, "loss": 0.0, "num_input_tokens_seen": 343488, "step": 1095, "train_runtime": 252.474, "train_tokens_per_second": 1360.489 }, { "epoch": 12.222222222222221, "grad_norm": 9.531097748549655e-05, "learning_rate": 1.9754795599070068e-05, "loss": 0.0, "num_input_tokens_seen": 344992, "step": 1100, "train_runtime": 253.4783, "train_tokens_per_second": 1361.032 }, { "epoch": 12.277777777777779, "grad_norm": 0.000848272698931396, "learning_rate": 1.9518034395302414e-05, "loss": 0.0, "num_input_tokens_seen": 346560, "step": 1105, "train_runtime": 254.8455, "train_tokens_per_second": 1359.883 }, { "epoch": 12.333333333333334, "grad_norm": 0.00033023158903233707, "learning_rate": 1.9281788589415804e-05, "loss": 0.0, "num_input_tokens_seen": 348160, "step": 1110, "train_runtime": 255.8278, "train_tokens_per_second": 1360.915 }, { "epoch": 12.38888888888889, "grad_norm": 7.174572965595871e-05, "learning_rate": 1.9046080392528735e-05, "loss": 0.0, "num_input_tokens_seen": 349760, "step": 1115, "train_runtime": 256.8207, "train_tokens_per_second": 1361.884 }, { "epoch": 12.444444444444445, "grad_norm": 0.00012129171955166385, "learning_rate": 1.8810931965215356e-05, "loss": 0.0, "num_input_tokens_seen": 351328, "step": 1120, "train_runtime": 257.8137, "train_tokens_per_second": 1362.721 }, { "epoch": 12.5, "grad_norm": 0.00013478209439199418, "learning_rate": 1.857636541542195e-05, "loss": 0.0, "num_input_tokens_seen": 352896, "step": 1125, "train_runtime": 258.8048, "train_tokens_per_second": 1363.561 }, { "epoch": 12.555555555555555, "grad_norm": 0.00010490142449270934, "learning_rate": 1.8342402796388445e-05, "loss": 0.0, "num_input_tokens_seen": 354464, "step": 1130, "train_runtime": 259.7902, "train_tokens_per_second": 1364.424 }, { "epoch": 12.61111111111111, "grad_norm": 0.0001263675221707672, "learning_rate": 1.8109066104575023e-05, "loss": 0.0, "num_input_tokens_seen": 356032, "step": 1135, "train_runtime": 260.7839, "train_tokens_per_second": 1365.238 }, { "epoch": 12.666666666666666, "grad_norm": 0.0018804783467203379, "learning_rate": 1.7876377277594053e-05, "loss": 0.0, "num_input_tokens_seen": 357632, "step": 1140, "train_runtime": 261.7738, "train_tokens_per_second": 1366.187 }, { "epoch": 12.722222222222221, "grad_norm": 0.000149234474520199, "learning_rate": 1.764435819214762e-05, "loss": 0.0, "num_input_tokens_seen": 359168, "step": 1145, "train_runtime": 262.7594, "train_tokens_per_second": 1366.908 }, { "epoch": 12.777777777777779, "grad_norm": 0.0024518172722309828, "learning_rate": 1.7413030661970742e-05, "loss": 0.0, "num_input_tokens_seen": 360736, "step": 1150, "train_runtime": 263.7558, "train_tokens_per_second": 1367.689 }, { "epoch": 12.833333333333334, "grad_norm": 0.0021246427204459906, "learning_rate": 1.7182416435780454e-05, "loss": 0.0, "num_input_tokens_seen": 362304, "step": 1155, "train_runtime": 264.7498, "train_tokens_per_second": 1368.477 }, { "epoch": 12.88888888888889, "grad_norm": 0.0012594735017046332, "learning_rate": 1.695253719523115e-05, "loss": 0.0, "num_input_tokens_seen": 363872, "step": 1160, "train_runtime": 265.7411, "train_tokens_per_second": 1369.273 }, { "epoch": 12.944444444444445, "grad_norm": 0.0007461548666469753, "learning_rate": 1.672341455287605e-05, "loss": 0.0, "num_input_tokens_seen": 365376, "step": 1165, "train_runtime": 266.7353, "train_tokens_per_second": 1369.807 }, { "epoch": 13.0, "grad_norm": 8.894857455743477e-05, "learning_rate": 1.649507005013532e-05, "loss": 0.0, "num_input_tokens_seen": 366944, "step": 1170, "train_runtime": 267.7547, "train_tokens_per_second": 1370.448 }, { "epoch": 13.0, "eval_loss": 0.10053114593029022, "eval_runtime": 0.5957, "eval_samples_per_second": 67.146, "eval_steps_per_second": 16.786, "num_input_tokens_seen": 366944, "step": 1170 }, { "epoch": 13.055555555555555, "grad_norm": 0.00011566525063244626, "learning_rate": 1.6267525155270773e-05, "loss": 0.0, "num_input_tokens_seen": 368384, "step": 1175, "train_runtime": 270.7088, "train_tokens_per_second": 1360.813 }, { "epoch": 13.11111111111111, "grad_norm": 8.19827473605983e-05, "learning_rate": 1.6040801261367493e-05, "loss": 0.0, "num_input_tokens_seen": 369984, "step": 1180, "train_runtime": 271.7086, "train_tokens_per_second": 1361.694 }, { "epoch": 13.166666666666666, "grad_norm": 0.0003286909486632794, "learning_rate": 1.5814919684322545e-05, "loss": 0.0, "num_input_tokens_seen": 371520, "step": 1185, "train_runtime": 272.7163, "train_tokens_per_second": 1362.295 }, { "epoch": 13.222222222222221, "grad_norm": 0.00017523310089018196, "learning_rate": 1.5589901660840896e-05, "loss": 0.0, "num_input_tokens_seen": 373120, "step": 1190, "train_runtime": 273.9929, "train_tokens_per_second": 1361.787 }, { "epoch": 13.277777777777779, "grad_norm": 8.97664504009299e-05, "learning_rate": 1.5365768346438797e-05, "loss": 0.0, "num_input_tokens_seen": 374688, "step": 1195, "train_runtime": 275.0855, "train_tokens_per_second": 1362.079 }, { "epoch": 13.333333333333334, "grad_norm": 0.0008034742204472423, "learning_rate": 1.5142540813454836e-05, "loss": 0.0, "num_input_tokens_seen": 376288, "step": 1200, "train_runtime": 276.0763, "train_tokens_per_second": 1362.986 }, { "epoch": 13.38888888888889, "grad_norm": 8.930965122999623e-05, "learning_rate": 1.4920240049068748e-05, "loss": 0.0, "num_input_tokens_seen": 377888, "step": 1205, "train_runtime": 277.0638, "train_tokens_per_second": 1363.903 }, { "epoch": 13.444444444444445, "grad_norm": 8.921877451939508e-05, "learning_rate": 1.4698886953328292e-05, "loss": 0.0, "num_input_tokens_seen": 379424, "step": 1210, "train_runtime": 278.052, "train_tokens_per_second": 1364.579 }, { "epoch": 13.5, "grad_norm": 0.00013365161430556327, "learning_rate": 1.4478502337184274e-05, "loss": 0.0, "num_input_tokens_seen": 380992, "step": 1215, "train_runtime": 279.0442, "train_tokens_per_second": 1365.346 }, { "epoch": 13.555555555555555, "grad_norm": 0.0007560288649983704, "learning_rate": 1.4259106920533955e-05, "loss": 0.0, "num_input_tokens_seen": 382592, "step": 1220, "train_runtime": 280.0463, "train_tokens_per_second": 1366.174 }, { "epoch": 13.61111111111111, "grad_norm": 0.0001108413707697764, "learning_rate": 1.4040721330273062e-05, "loss": 0.0, "num_input_tokens_seen": 384192, "step": 1225, "train_runtime": 281.0547, "train_tokens_per_second": 1366.965 }, { "epoch": 13.666666666666666, "grad_norm": 0.0013877995079383254, "learning_rate": 1.3823366098356487e-05, "loss": 0.0, "num_input_tokens_seen": 385760, "step": 1230, "train_runtime": 282.0425, "train_tokens_per_second": 1367.737 }, { "epoch": 13.722222222222221, "grad_norm": 8.269518002634868e-05, "learning_rate": 1.3607061659867892e-05, "loss": 0.0, "num_input_tokens_seen": 387328, "step": 1235, "train_runtime": 283.0315, "train_tokens_per_second": 1368.498 }, { "epoch": 13.777777777777779, "grad_norm": 8.950541814556345e-05, "learning_rate": 1.3391828351098578e-05, "loss": 0.0, "num_input_tokens_seen": 388896, "step": 1240, "train_runtime": 284.0182, "train_tokens_per_second": 1369.264 }, { "epoch": 13.833333333333334, "grad_norm": 8.850841550156474e-05, "learning_rate": 1.3177686407635417e-05, "loss": 0.0, "num_input_tokens_seen": 390496, "step": 1245, "train_runtime": 285.0002, "train_tokens_per_second": 1370.16 }, { "epoch": 13.88888888888889, "grad_norm": 9.216259786626324e-05, "learning_rate": 1.29646559624584e-05, "loss": 0.0, "num_input_tokens_seen": 392064, "step": 1250, "train_runtime": 285.991, "train_tokens_per_second": 1370.896 }, { "epoch": 13.944444444444445, "grad_norm": 0.00014821415243204683, "learning_rate": 1.2752757044047827e-05, "loss": 0.0, "num_input_tokens_seen": 393632, "step": 1255, "train_runtime": 286.9869, "train_tokens_per_second": 1371.603 }, { "epoch": 14.0, "grad_norm": 0.0001271448127226904, "learning_rate": 1.2542009574501246e-05, "loss": 0.0, "num_input_tokens_seen": 395104, "step": 1260, "train_runtime": 288.0114, "train_tokens_per_second": 1371.835 }, { "epoch": 14.0, "eval_loss": 0.1015445813536644, "eval_runtime": 0.5977, "eval_samples_per_second": 66.924, "eval_steps_per_second": 16.731, "num_input_tokens_seen": 395104, "step": 1260 }, { "epoch": 14.055555555555555, "grad_norm": 0.0006528616067953408, "learning_rate": 1.2332433367660442e-05, "loss": 0.0, "num_input_tokens_seen": 396672, "step": 1265, "train_runtime": 291.0816, "train_tokens_per_second": 1362.752 }, { "epoch": 14.11111111111111, "grad_norm": 6.952865805942565e-05, "learning_rate": 1.2124048127248644e-05, "loss": 0.0, "num_input_tokens_seen": 398304, "step": 1270, "train_runtime": 292.0907, "train_tokens_per_second": 1363.631 }, { "epoch": 14.166666666666666, "grad_norm": 0.0010912856087088585, "learning_rate": 1.1916873445017982e-05, "loss": 0.0, "num_input_tokens_seen": 399840, "step": 1275, "train_runtime": 293.0967, "train_tokens_per_second": 1364.191 }, { "epoch": 14.222222222222221, "grad_norm": 8.120456186588854e-05, "learning_rate": 1.1710928798907556e-05, "loss": 0.0, "num_input_tokens_seen": 401440, "step": 1280, "train_runtime": 294.4827, "train_tokens_per_second": 1363.204 }, { "epoch": 14.277777777777779, "grad_norm": 7.50229082768783e-05, "learning_rate": 1.1506233551212186e-05, "loss": 0.0, "num_input_tokens_seen": 403040, "step": 1285, "train_runtime": 295.4802, "train_tokens_per_second": 1364.017 }, { "epoch": 14.333333333333334, "grad_norm": 0.0005424844566732645, "learning_rate": 1.1302806946762004e-05, "loss": 0.0, "num_input_tokens_seen": 404640, "step": 1290, "train_runtime": 296.4859, "train_tokens_per_second": 1364.787 }, { "epoch": 14.38888888888889, "grad_norm": 0.0002076294185826555, "learning_rate": 1.1100668111113166e-05, "loss": 0.0, "num_input_tokens_seen": 406208, "step": 1295, "train_runtime": 297.4831, "train_tokens_per_second": 1365.483 }, { "epoch": 14.444444444444445, "grad_norm": 7.367003854596987e-05, "learning_rate": 1.0899836048749645e-05, "loss": 0.0, "num_input_tokens_seen": 407776, "step": 1300, "train_runtime": 298.4873, "train_tokens_per_second": 1366.142 }, { "epoch": 14.5, "grad_norm": 0.00013382034376263618, "learning_rate": 1.0700329641296541e-05, "loss": 0.0, "num_input_tokens_seen": 409312, "step": 1305, "train_runtime": 299.4789, "train_tokens_per_second": 1366.747 }, { "epoch": 14.555555555555555, "grad_norm": 8.281052578240633e-05, "learning_rate": 1.0502167645744895e-05, "loss": 0.0, "num_input_tokens_seen": 410816, "step": 1310, "train_runtime": 300.4767, "train_tokens_per_second": 1367.214 }, { "epoch": 14.61111111111111, "grad_norm": 7.426820957334712e-05, "learning_rate": 1.0305368692688174e-05, "loss": 0.0, "num_input_tokens_seen": 412416, "step": 1315, "train_runtime": 301.4875, "train_tokens_per_second": 1367.937 }, { "epoch": 14.666666666666666, "grad_norm": 7.392842962872237e-05, "learning_rate": 1.01099512845707e-05, "loss": 0.0, "num_input_tokens_seen": 414016, "step": 1320, "train_runtime": 302.4896, "train_tokens_per_second": 1368.695 }, { "epoch": 14.722222222222221, "grad_norm": 0.0008706842781975865, "learning_rate": 9.91593379394811e-06, "loss": 0.0, "num_input_tokens_seen": 415552, "step": 1325, "train_runtime": 303.4701, "train_tokens_per_second": 1369.334 }, { "epoch": 14.777777777777779, "grad_norm": 6.673461030004546e-05, "learning_rate": 9.723334461760006e-06, "loss": 0.0, "num_input_tokens_seen": 417088, "step": 1330, "train_runtime": 304.4572, "train_tokens_per_second": 1369.94 }, { "epoch": 14.833333333333334, "grad_norm": 0.00035145695437677205, "learning_rate": 9.532171395615036e-06, "loss": 0.0, "num_input_tokens_seen": 418656, "step": 1335, "train_runtime": 305.4513, "train_tokens_per_second": 1370.615 }, { "epoch": 14.88888888888889, "grad_norm": 6.785539881093428e-05, "learning_rate": 9.342462568088416e-06, "loss": 0.0, "num_input_tokens_seen": 420256, "step": 1340, "train_runtime": 306.4583, "train_tokens_per_second": 1371.332 }, { "epoch": 14.944444444444445, "grad_norm": 0.00039442972047254443, "learning_rate": 9.154225815032242e-06, "loss": 0.0, "num_input_tokens_seen": 421792, "step": 1345, "train_runtime": 307.4536, "train_tokens_per_second": 1371.888 }, { "epoch": 15.0, "grad_norm": 9.206620597979054e-05, "learning_rate": 8.967478833898612e-06, "loss": 0.0, "num_input_tokens_seen": 423360, "step": 1350, "train_runtime": 308.4816, "train_tokens_per_second": 1372.4 }, { "epoch": 15.0, "eval_loss": 0.09843109548091888, "eval_runtime": 0.5996, "eval_samples_per_second": 66.707, "eval_steps_per_second": 16.677, "num_input_tokens_seen": 423360, "step": 1350 }, { "epoch": 15.055555555555555, "grad_norm": 0.00019164431432727724, "learning_rate": 8.78223918207575e-06, "loss": 0.0, "num_input_tokens_seen": 424992, "step": 1355, "train_runtime": 311.4312, "train_tokens_per_second": 1364.642 }, { "epoch": 15.11111111111111, "grad_norm": 0.00013541574298869818, "learning_rate": 8.598524275237322e-06, "loss": 0.0, "num_input_tokens_seen": 426528, "step": 1360, "train_runtime": 312.4427, "train_tokens_per_second": 1365.14 }, { "epoch": 15.166666666666666, "grad_norm": 9.736158972373232e-05, "learning_rate": 8.41635138570507e-06, "loss": 0.0, "num_input_tokens_seen": 428096, "step": 1365, "train_runtime": 313.4539, "train_tokens_per_second": 1365.738 }, { "epoch": 15.222222222222221, "grad_norm": 8.587204501964152e-05, "learning_rate": 8.235737640824908e-06, "loss": 0.0, "num_input_tokens_seen": 429600, "step": 1370, "train_runtime": 314.4616, "train_tokens_per_second": 1366.144 }, { "epoch": 15.277777777777779, "grad_norm": 0.0007348962826654315, "learning_rate": 8.056700021356694e-06, "loss": 0.0, "num_input_tokens_seen": 431200, "step": 1375, "train_runtime": 315.8366, "train_tokens_per_second": 1365.263 }, { "epoch": 15.333333333333334, "grad_norm": 8.820810035103932e-05, "learning_rate": 7.879255359877705e-06, "loss": 0.0, "num_input_tokens_seen": 432736, "step": 1380, "train_runtime": 316.8401, "train_tokens_per_second": 1365.787 }, { "epoch": 15.38888888888889, "grad_norm": 8.136556425597519e-05, "learning_rate": 7.703420339200101e-06, "loss": 0.0, "num_input_tokens_seen": 434336, "step": 1385, "train_runtime": 317.8502, "train_tokens_per_second": 1366.48 }, { "epoch": 15.444444444444445, "grad_norm": 6.743312405887991e-05, "learning_rate": 7.529211490802498e-06, "loss": 0.0, "num_input_tokens_seen": 435904, "step": 1390, "train_runtime": 318.8467, "train_tokens_per_second": 1367.127 }, { "epoch": 15.5, "grad_norm": 0.00012355401122476906, "learning_rate": 7.3566451932756744e-06, "loss": 0.0, "num_input_tokens_seen": 437440, "step": 1395, "train_runtime": 319.8478, "train_tokens_per_second": 1367.65 }, { "epoch": 15.555555555555555, "grad_norm": 7.22238328307867e-05, "learning_rate": 7.185737670782727e-06, "loss": 0.0, "num_input_tokens_seen": 438976, "step": 1400, "train_runtime": 320.843, "train_tokens_per_second": 1368.196 }, { "epoch": 15.61111111111111, "grad_norm": 0.00056417629821226, "learning_rate": 7.016504991533726e-06, "loss": 0.0, "num_input_tokens_seen": 440512, "step": 1405, "train_runtime": 321.855, "train_tokens_per_second": 1368.666 }, { "epoch": 15.666666666666666, "grad_norm": 0.00010159982048207894, "learning_rate": 6.848963066275027e-06, "loss": 0.0, "num_input_tokens_seen": 442112, "step": 1410, "train_runtime": 322.864, "train_tokens_per_second": 1369.344 }, { "epoch": 15.722222222222221, "grad_norm": 7.245803135447204e-05, "learning_rate": 6.683127646793411e-06, "loss": 0.0, "num_input_tokens_seen": 443616, "step": 1415, "train_runtime": 323.8761, "train_tokens_per_second": 1369.709 }, { "epoch": 15.777777777777779, "grad_norm": 0.00023071092437021434, "learning_rate": 6.519014324435102e-06, "loss": 0.0, "num_input_tokens_seen": 445184, "step": 1420, "train_runtime": 324.8846, "train_tokens_per_second": 1370.283 }, { "epoch": 15.833333333333334, "grad_norm": 0.00012904863979201764, "learning_rate": 6.356638528639955e-06, "loss": 0.0, "num_input_tokens_seen": 446752, "step": 1425, "train_runtime": 325.889, "train_tokens_per_second": 1370.872 }, { "epoch": 15.88888888888889, "grad_norm": 0.00011090516636613756, "learning_rate": 6.196015525490825e-06, "loss": 0.0, "num_input_tokens_seen": 448352, "step": 1430, "train_runtime": 326.9002, "train_tokens_per_second": 1371.526 }, { "epoch": 15.944444444444445, "grad_norm": 8.183154568541795e-05, "learning_rate": 6.037160416278278e-06, "loss": 0.0, "num_input_tokens_seen": 449888, "step": 1435, "train_runtime": 327.9062, "train_tokens_per_second": 1372.002 }, { "epoch": 16.0, "grad_norm": 6.404591840691864e-05, "learning_rate": 5.880088136080814e-06, "loss": 0.0, "num_input_tokens_seen": 451424, "step": 1440, "train_runtime": 328.9466, "train_tokens_per_second": 1372.332 }, { "epoch": 16.0, "eval_loss": 0.10147368907928467, "eval_runtime": 0.6127, "eval_samples_per_second": 65.282, "eval_steps_per_second": 16.321, "num_input_tokens_seen": 451424, "step": 1440 }, { "epoch": 16.055555555555557, "grad_norm": 9.41158868954517e-05, "learning_rate": 5.724813452360736e-06, "loss": 0.0, "num_input_tokens_seen": 452992, "step": 1445, "train_runtime": 332.0143, "train_tokens_per_second": 1364.375 }, { "epoch": 16.11111111111111, "grad_norm": 7.823929627193138e-05, "learning_rate": 5.571350963575728e-06, "loss": 0.0, "num_input_tokens_seen": 454496, "step": 1450, "train_runtime": 333.0203, "train_tokens_per_second": 1364.77 }, { "epoch": 16.166666666666668, "grad_norm": 8.621180313639343e-05, "learning_rate": 5.4197150978063965e-06, "loss": 0.0, "num_input_tokens_seen": 456096, "step": 1455, "train_runtime": 334.0333, "train_tokens_per_second": 1365.421 }, { "epoch": 16.22222222222222, "grad_norm": 0.0001230365305673331, "learning_rate": 5.269920111399732e-06, "loss": 0.0, "num_input_tokens_seen": 457696, "step": 1460, "train_runtime": 335.4085, "train_tokens_per_second": 1364.593 }, { "epoch": 16.27777777777778, "grad_norm": 0.0001045580575009808, "learning_rate": 5.121980087628803e-06, "loss": 0.0, "num_input_tokens_seen": 459232, "step": 1465, "train_runtime": 336.4013, "train_tokens_per_second": 1365.131 }, { "epoch": 16.333333333333332, "grad_norm": 0.0001560485252412036, "learning_rate": 4.975908935368701e-06, "loss": 0.0, "num_input_tokens_seen": 460832, "step": 1470, "train_runtime": 337.4008, "train_tokens_per_second": 1365.829 }, { "epoch": 16.38888888888889, "grad_norm": 9.085774217965081e-05, "learning_rate": 4.831720387788827e-06, "loss": 0.0, "num_input_tokens_seen": 462432, "step": 1475, "train_runtime": 338.3975, "train_tokens_per_second": 1366.535 }, { "epoch": 16.444444444444443, "grad_norm": 8.979045378509909e-05, "learning_rate": 4.689428001061774e-06, "loss": 0.0, "num_input_tokens_seen": 464000, "step": 1480, "train_runtime": 339.3925, "train_tokens_per_second": 1367.149 }, { "epoch": 16.5, "grad_norm": 0.0014842926757410169, "learning_rate": 4.549045153088813e-06, "loss": 0.0, "num_input_tokens_seen": 465536, "step": 1485, "train_runtime": 340.3809, "train_tokens_per_second": 1367.692 }, { "epoch": 16.555555555555557, "grad_norm": 6.461729208240286e-05, "learning_rate": 4.410585042242124e-06, "loss": 0.0, "num_input_tokens_seen": 467136, "step": 1490, "train_runtime": 341.379, "train_tokens_per_second": 1368.379 }, { "epoch": 16.61111111111111, "grad_norm": 8.179433643817902e-05, "learning_rate": 4.274060686123959e-06, "loss": 0.0, "num_input_tokens_seen": 468672, "step": 1495, "train_runtime": 342.3622, "train_tokens_per_second": 1368.936 }, { "epoch": 16.666666666666668, "grad_norm": 0.0006601639906875789, "learning_rate": 4.1394849203427284e-06, "loss": 0.0, "num_input_tokens_seen": 470272, "step": 1500, "train_runtime": 343.3439, "train_tokens_per_second": 1369.682 }, { "epoch": 16.72222222222222, "grad_norm": 7.60215989430435e-05, "learning_rate": 4.006870397306256e-06, "loss": 0.0, "num_input_tokens_seen": 471872, "step": 1505, "train_runtime": 344.3462, "train_tokens_per_second": 1370.342 }, { "epoch": 16.77777777777778, "grad_norm": 7.329335494432598e-05, "learning_rate": 3.876229585032245e-06, "loss": 0.0, "num_input_tokens_seen": 473440, "step": 1510, "train_runtime": 345.3573, "train_tokens_per_second": 1370.87 }, { "epoch": 16.833333333333332, "grad_norm": 0.0019351422088220716, "learning_rate": 3.7475747659760502e-06, "loss": 0.0, "num_input_tokens_seen": 475008, "step": 1515, "train_runtime": 346.362, "train_tokens_per_second": 1371.421 }, { "epoch": 16.88888888888889, "grad_norm": 0.0013020788319408894, "learning_rate": 3.6209180358759394e-06, "loss": 0.0, "num_input_tokens_seen": 476608, "step": 1520, "train_runtime": 347.3666, "train_tokens_per_second": 1372.06 }, { "epoch": 16.944444444444443, "grad_norm": 9.357877570437267e-05, "learning_rate": 3.4962713026158694e-06, "loss": 0.0, "num_input_tokens_seen": 478176, "step": 1525, "train_runtime": 348.3691, "train_tokens_per_second": 1372.613 }, { "epoch": 17.0, "grad_norm": 0.0017041382379829884, "learning_rate": 3.373646285105958e-06, "loss": 0.0, "num_input_tokens_seen": 479744, "step": 1530, "train_runtime": 349.4009, "train_tokens_per_second": 1373.048 }, { "epoch": 17.0, "eval_loss": 0.10042545944452286, "eval_runtime": 0.6044, "eval_samples_per_second": 66.186, "eval_steps_per_second": 16.547, "num_input_tokens_seen": 479744, "step": 1530 }, { "epoch": 17.055555555555557, "grad_norm": 7.093072053976357e-05, "learning_rate": 3.2530545121807145e-06, "loss": 0.0, "num_input_tokens_seen": 481344, "step": 1535, "train_runtime": 353.2337, "train_tokens_per_second": 1362.679 }, { "epoch": 17.11111111111111, "grad_norm": 8.884620910976082e-05, "learning_rate": 3.1345073215151066e-06, "loss": 0.0, "num_input_tokens_seen": 482944, "step": 1540, "train_runtime": 354.2327, "train_tokens_per_second": 1363.352 }, { "epoch": 17.166666666666668, "grad_norm": 8.379245264222845e-05, "learning_rate": 3.0180158585586397e-06, "loss": 0.0, "num_input_tokens_seen": 484480, "step": 1545, "train_runtime": 355.2312, "train_tokens_per_second": 1363.844 }, { "epoch": 17.22222222222222, "grad_norm": 0.000972072419244796, "learning_rate": 2.9035910754875136e-06, "loss": 0.0, "num_input_tokens_seen": 486016, "step": 1550, "train_runtime": 356.4143, "train_tokens_per_second": 1363.627 }, { "epoch": 17.27777777777778, "grad_norm": 6.56789488857612e-05, "learning_rate": 2.7912437301749026e-06, "loss": 0.0, "num_input_tokens_seen": 487584, "step": 1555, "train_runtime": 357.5927, "train_tokens_per_second": 1363.518 }, { "epoch": 17.333333333333332, "grad_norm": 8.938275277614594e-05, "learning_rate": 2.6809843851795357e-06, "loss": 0.0, "num_input_tokens_seen": 489088, "step": 1560, "train_runtime": 358.5777, "train_tokens_per_second": 1363.966 }, { "epoch": 17.38888888888889, "grad_norm": 0.0002912400523200631, "learning_rate": 2.57282340675267e-06, "loss": 0.0, "num_input_tokens_seen": 490688, "step": 1565, "train_runtime": 359.563, "train_tokens_per_second": 1364.679 }, { "epoch": 17.444444444444443, "grad_norm": 0.00030330222216434777, "learning_rate": 2.4667709638634434e-06, "loss": 0.0, "num_input_tokens_seen": 492288, "step": 1570, "train_runtime": 360.5596, "train_tokens_per_second": 1365.344 }, { "epoch": 17.5, "grad_norm": 0.0004344982444308698, "learning_rate": 2.3628370272428564e-06, "loss": 0.0, "num_input_tokens_seen": 493824, "step": 1575, "train_runtime": 361.5646, "train_tokens_per_second": 1365.797 }, { "epoch": 17.555555555555557, "grad_norm": 8.363231609109789e-05, "learning_rate": 2.2610313684463177e-06, "loss": 0.0, "num_input_tokens_seen": 495456, "step": 1580, "train_runtime": 362.567, "train_tokens_per_second": 1366.523 }, { "epoch": 17.61111111111111, "grad_norm": 7.062214717734605e-05, "learning_rate": 2.1613635589349756e-06, "loss": 0.0, "num_input_tokens_seen": 497024, "step": 1585, "train_runtime": 363.5616, "train_tokens_per_second": 1367.097 }, { "epoch": 17.666666666666668, "grad_norm": 6.458121060859412e-05, "learning_rate": 2.063842969175847e-06, "loss": 0.0, "num_input_tokens_seen": 498592, "step": 1590, "train_runtime": 364.5519, "train_tokens_per_second": 1367.685 }, { "epoch": 17.72222222222222, "grad_norm": 0.0005226784851402044, "learning_rate": 1.968478767760812e-06, "loss": 0.0, "num_input_tokens_seen": 500128, "step": 1595, "train_runtime": 365.5389, "train_tokens_per_second": 1368.194 }, { "epoch": 17.77777777777778, "grad_norm": 7.787663344061002e-05, "learning_rate": 1.8752799205445982e-06, "loss": 0.0, "num_input_tokens_seen": 501696, "step": 1600, "train_runtime": 366.5276, "train_tokens_per_second": 1368.781 }, { "epoch": 17.833333333333332, "grad_norm": 0.00036964804166927934, "learning_rate": 1.784255189801895e-06, "loss": 0.0, "num_input_tokens_seen": 503232, "step": 1605, "train_runtime": 367.5159, "train_tokens_per_second": 1369.28 }, { "epoch": 17.88888888888889, "grad_norm": 0.0005482187261804938, "learning_rate": 1.6954131334034922e-06, "loss": 0.0, "num_input_tokens_seen": 504736, "step": 1610, "train_runtime": 368.5023, "train_tokens_per_second": 1369.696 }, { "epoch": 17.944444444444443, "grad_norm": 9.297586802858859e-05, "learning_rate": 1.6087621040117157e-06, "loss": 0.0, "num_input_tokens_seen": 506304, "step": 1615, "train_runtime": 369.4882, "train_tokens_per_second": 1370.285 }, { "epoch": 18.0, "grad_norm": 0.0001506548869656399, "learning_rate": 1.524310248295152e-06, "loss": 0.0, "num_input_tokens_seen": 507872, "step": 1620, "train_runtime": 370.504, "train_tokens_per_second": 1370.76 }, { "epoch": 18.0, "eval_loss": 0.10146409273147583, "eval_runtime": 0.5964, "eval_samples_per_second": 67.067, "eval_steps_per_second": 16.767, "num_input_tokens_seen": 507872, "step": 1620 }, { "epoch": 18.055555555555557, "grad_norm": 7.465356611646712e-05, "learning_rate": 1.4420655061626932e-06, "loss": 0.0, "num_input_tokens_seen": 509408, "step": 1625, "train_runtime": 374.0225, "train_tokens_per_second": 1361.972 }, { "epoch": 18.11111111111111, "grad_norm": 8.583228191128e-05, "learning_rate": 1.362035610017079e-06, "loss": 0.0, "num_input_tokens_seen": 510912, "step": 1630, "train_runtime": 375.02, "train_tokens_per_second": 1362.359 }, { "epoch": 18.166666666666668, "grad_norm": 0.0012792643392458558, "learning_rate": 1.2842280840278997e-06, "loss": 0.0, "num_input_tokens_seen": 512384, "step": 1635, "train_runtime": 376.0171, "train_tokens_per_second": 1362.661 }, { "epoch": 18.22222222222222, "grad_norm": 0.0011630634544417262, "learning_rate": 1.2086502434241865e-06, "loss": 0.0, "num_input_tokens_seen": 513952, "step": 1640, "train_runtime": 377.0188, "train_tokens_per_second": 1363.2 }, { "epoch": 18.27777777777778, "grad_norm": 6.724595732521266e-05, "learning_rate": 1.1353091938067023e-06, "loss": 0.0, "num_input_tokens_seen": 515520, "step": 1645, "train_runtime": 378.3782, "train_tokens_per_second": 1362.446 }, { "epoch": 18.333333333333332, "grad_norm": 0.0009548751986585557, "learning_rate": 1.0642118304798442e-06, "loss": 0.0, "num_input_tokens_seen": 517120, "step": 1650, "train_runtime": 379.3741, "train_tokens_per_second": 1363.087 }, { "epoch": 18.38888888888889, "grad_norm": 0.00011085678124800324, "learning_rate": 9.95364837803392e-07, "loss": 0.0, "num_input_tokens_seen": 518688, "step": 1655, "train_runtime": 380.3654, "train_tokens_per_second": 1363.657 }, { "epoch": 18.444444444444443, "grad_norm": 6.726537685608491e-05, "learning_rate": 9.287746885640603e-07, "loss": 0.0, "num_input_tokens_seen": 520224, "step": 1660, "train_runtime": 381.3662, "train_tokens_per_second": 1364.106 }, { "epoch": 18.5, "grad_norm": 7.827718218322843e-05, "learning_rate": 8.64447643366953e-07, "loss": 0.0, "num_input_tokens_seen": 521824, "step": 1665, "train_runtime": 382.3614, "train_tokens_per_second": 1364.74 }, { "epoch": 18.555555555555557, "grad_norm": 6.854932871647179e-05, "learning_rate": 8.023897500469391e-07, "loss": 0.0, "num_input_tokens_seen": 523424, "step": 1670, "train_runtime": 383.3573, "train_tokens_per_second": 1365.368 }, { "epoch": 18.61111111111111, "grad_norm": 7.369754166575149e-05, "learning_rate": 7.426068431000882e-07, "loss": 0.0, "num_input_tokens_seen": 524960, "step": 1675, "train_runtime": 384.3485, "train_tokens_per_second": 1365.844 }, { "epoch": 18.666666666666668, "grad_norm": 8.4144230640959e-05, "learning_rate": 6.851045431350927e-07, "loss": 0.0, "num_input_tokens_seen": 526496, "step": 1680, "train_runtime": 385.3428, "train_tokens_per_second": 1366.306 }, { "epoch": 18.72222222222222, "grad_norm": 9.67139785643667e-05, "learning_rate": 6.298882563448599e-07, "loss": 0.0, "num_input_tokens_seen": 528064, "step": 1685, "train_runtime": 386.3413, "train_tokens_per_second": 1366.833 }, { "epoch": 18.77777777777778, "grad_norm": 8.19860870251432e-05, "learning_rate": 5.769631739982267e-07, "loss": 0.0, "num_input_tokens_seen": 529632, "step": 1690, "train_runtime": 387.3257, "train_tokens_per_second": 1367.407 }, { "epoch": 18.833333333333332, "grad_norm": 8.164047176251188e-05, "learning_rate": 5.263342719518921e-07, "loss": 0.0, "num_input_tokens_seen": 531232, "step": 1695, "train_runtime": 388.308, "train_tokens_per_second": 1368.069 }, { "epoch": 18.88888888888889, "grad_norm": 8.214041008614004e-05, "learning_rate": 4.780063101826132e-07, "loss": 0.0, "num_input_tokens_seen": 532800, "step": 1700, "train_runtime": 389.2971, "train_tokens_per_second": 1368.62 }, { "epoch": 18.944444444444443, "grad_norm": 0.0002978986594825983, "learning_rate": 4.319838323396691e-07, "loss": 0.0, "num_input_tokens_seen": 534400, "step": 1705, "train_runtime": 390.2917, "train_tokens_per_second": 1369.232 }, { "epoch": 19.0, "grad_norm": 0.00022920536866877228, "learning_rate": 3.88271165317694e-07, "loss": 0.0, "num_input_tokens_seen": 535968, "step": 1710, "train_runtime": 391.4404, "train_tokens_per_second": 1369.22 }, { "epoch": 19.0, "eval_loss": 0.101484015583992, "eval_runtime": 0.6037, "eval_samples_per_second": 66.259, "eval_steps_per_second": 16.565, "num_input_tokens_seen": 535968, "step": 1710 }, { "epoch": 19.055555555555557, "grad_norm": 0.0005205800989642739, "learning_rate": 3.468724188498751e-07, "loss": 0.0, "num_input_tokens_seen": 537536, "step": 1715, "train_runtime": 396.0619, "train_tokens_per_second": 1357.202 }, { "epoch": 19.11111111111111, "grad_norm": 8.487372542731464e-05, "learning_rate": 3.077914851215585e-07, "loss": 0.0, "num_input_tokens_seen": 539072, "step": 1720, "train_runtime": 397.0577, "train_tokens_per_second": 1357.666 }, { "epoch": 19.166666666666668, "grad_norm": 7.69746329751797e-05, "learning_rate": 2.71032038404323e-07, "loss": 0.0, "num_input_tokens_seen": 540608, "step": 1725, "train_runtime": 398.0558, "train_tokens_per_second": 1358.121 }, { "epoch": 19.22222222222222, "grad_norm": 5.529871486942284e-05, "learning_rate": 2.365975347105448e-07, "loss": 0.0, "num_input_tokens_seen": 542208, "step": 1730, "train_runtime": 399.3251, "train_tokens_per_second": 1357.811 }, { "epoch": 19.27777777777778, "grad_norm": 0.0011178107233718038, "learning_rate": 2.0449121146845774e-07, "loss": 0.0, "num_input_tokens_seen": 543776, "step": 1735, "train_runtime": 400.4282, "train_tokens_per_second": 1357.986 }, { "epoch": 19.333333333333332, "grad_norm": 0.00038915197364985943, "learning_rate": 1.747160872177883e-07, "loss": 0.0, "num_input_tokens_seen": 545280, "step": 1740, "train_runtime": 401.4192, "train_tokens_per_second": 1358.381 }, { "epoch": 19.38888888888889, "grad_norm": 0.0006458798889070749, "learning_rate": 1.472749613259661e-07, "loss": 0.0, "num_input_tokens_seen": 546848, "step": 1745, "train_runtime": 402.4075, "train_tokens_per_second": 1358.941 }, { "epoch": 19.444444444444443, "grad_norm": 0.0004789363010786474, "learning_rate": 1.22170413724923e-07, "loss": 0.0, "num_input_tokens_seen": 548416, "step": 1750, "train_runtime": 403.4048, "train_tokens_per_second": 1359.468 }, { "epoch": 19.5, "grad_norm": 7.903060031821951e-05, "learning_rate": 9.940480466855417e-08, "loss": 0.0, "num_input_tokens_seen": 550016, "step": 1755, "train_runtime": 404.4032, "train_tokens_per_second": 1360.068 }, { "epoch": 19.555555555555557, "grad_norm": 7.166919385781512e-05, "learning_rate": 7.898027451078982e-08, "loss": 0.0, "num_input_tokens_seen": 551584, "step": 1760, "train_runtime": 405.3892, "train_tokens_per_second": 1360.628 }, { "epoch": 19.61111111111111, "grad_norm": 9.801947453524917e-05, "learning_rate": 6.089874350439506e-08, "loss": 0.0, "num_input_tokens_seen": 553152, "step": 1765, "train_runtime": 406.3826, "train_tokens_per_second": 1361.161 }, { "epoch": 19.666666666666668, "grad_norm": 0.0003389905614312738, "learning_rate": 4.516191162040051e-08, "loss": 0.0, "num_input_tokens_seen": 554752, "step": 1770, "train_runtime": 407.3778, "train_tokens_per_second": 1361.763 }, { "epoch": 19.72222222222222, "grad_norm": 5.742942084907554e-05, "learning_rate": 3.177125838830786e-08, "loss": 0.0, "num_input_tokens_seen": 556288, "step": 1775, "train_runtime": 408.3618, "train_tokens_per_second": 1362.243 }, { "epoch": 19.77777777777778, "grad_norm": 0.0005994142848066986, "learning_rate": 2.0728042756967824e-08, "loss": 0.0, "num_input_tokens_seen": 557888, "step": 1780, "train_runtime": 409.3472, "train_tokens_per_second": 1362.872 }, { "epoch": 19.833333333333332, "grad_norm": 6.821275019319728e-05, "learning_rate": 1.2033302976222071e-08, "loss": 0.0, "num_input_tokens_seen": 559424, "step": 1785, "train_runtime": 410.3322, "train_tokens_per_second": 1363.344 }, { "epoch": 19.88888888888889, "grad_norm": 0.0002234447019873187, "learning_rate": 5.687856499297928e-09, "loss": 0.0, "num_input_tokens_seen": 560960, "step": 1790, "train_runtime": 411.3205, "train_tokens_per_second": 1363.803 }, { "epoch": 19.944444444444443, "grad_norm": 0.00014750372793059796, "learning_rate": 1.692299905944883e-09, "loss": 0.0, "num_input_tokens_seen": 562592, "step": 1795, "train_runtime": 412.3054, "train_tokens_per_second": 1364.503 }, { "epoch": 20.0, "grad_norm": 6.993868737481534e-05, "learning_rate": 4.700884634611935e-11, "loss": 0.0, "num_input_tokens_seen": 564096, "step": 1800, "train_runtime": 413.3282, "train_tokens_per_second": 1364.765 }, { "epoch": 20.0, "eval_loss": 0.10150279104709625, "eval_runtime": 0.5986, "eval_samples_per_second": 66.824, "eval_steps_per_second": 16.706, "num_input_tokens_seen": 564096, "step": 1800 }, { "epoch": 20.0, "num_input_tokens_seen": 564096, "step": 1800, "total_flos": 2.5450673718951936e+16, "train_loss": 0.018261103643923585, "train_runtime": 415.2132, "train_samples_per_second": 17.34, "train_steps_per_second": 4.335 } ], "logging_steps": 5, "max_steps": 1800, "num_input_tokens_seen": 564096, "num_train_epochs": 20, "save_steps": 90, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.5450673718951936e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }