{ "best_global_step": 1650, "best_metric": 0.006614842917770147, "best_model_checkpoint": "/content/NH-SQL-finetuned/checkpoint-1650", "epoch": 50.0, "eval_steps": 500, "global_step": 1650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030303030303030304, "grad_norm": 6.595283031463623, "learning_rate": 0.0, "loss": 1.9666, "step": 1 }, { "epoch": 0.6060606060606061, "grad_norm": 6.3463263511657715, "learning_rate": 5.757575757575758e-07, "loss": 1.8746, "step": 20 }, { "epoch": 1.0, "eval_loss": 1.7834347486495972, "eval_runtime": 0.9389, "eval_samples_per_second": 140.592, "eval_steps_per_second": 18.107, "step": 33 }, { "epoch": 1.2121212121212122, "grad_norm": 4.871561050415039, "learning_rate": 1.181818181818182e-06, "loss": 1.7412, "step": 40 }, { "epoch": 1.8181818181818183, "grad_norm": 3.2557735443115234, "learning_rate": 1.787878787878788e-06, "loss": 1.5522, "step": 60 }, { "epoch": 2.0, "eval_loss": 1.2882661819458008, "eval_runtime": 0.9475, "eval_samples_per_second": 139.315, "eval_steps_per_second": 17.942, "step": 66 }, { "epoch": 2.4242424242424243, "grad_norm": 1.9632468223571777, "learning_rate": 2.393939393939394e-06, "loss": 1.2669, "step": 80 }, { "epoch": 3.0, "eval_loss": 0.8929286003112793, "eval_runtime": 0.9528, "eval_samples_per_second": 138.545, "eval_steps_per_second": 17.843, "step": 99 }, { "epoch": 3.0303030303030303, "grad_norm": 1.6738139390945435, "learning_rate": 3e-06, "loss": 1.0527, "step": 100 }, { "epoch": 3.6363636363636362, "grad_norm": 1.4490736722946167, "learning_rate": 3.606060606060606e-06, "loss": 0.8491, "step": 120 }, { "epoch": 4.0, "eval_loss": 0.5932092070579529, "eval_runtime": 0.9509, "eval_samples_per_second": 138.815, "eval_steps_per_second": 17.878, "step": 132 }, { "epoch": 4.242424242424242, "grad_norm": 1.0791282653808594, "learning_rate": 4.212121212121212e-06, "loss": 0.6586, "step": 140 }, { "epoch": 4.848484848484849, "grad_norm": 0.8789141178131104, "learning_rate": 4.818181818181819e-06, "loss": 0.4801, "step": 160 }, { "epoch": 5.0, "eval_loss": 0.42336800694465637, "eval_runtime": 0.9459, "eval_samples_per_second": 139.555, "eval_steps_per_second": 17.973, "step": 165 }, { "epoch": 5.454545454545454, "grad_norm": 0.8390600085258484, "learning_rate": 4.9989035693310165e-06, "loss": 0.4134, "step": 180 }, { "epoch": 6.0, "eval_loss": 0.3166239559650421, "eval_runtime": 0.9402, "eval_samples_per_second": 140.394, "eval_steps_per_second": 18.081, "step": 198 }, { "epoch": 6.0606060606060606, "grad_norm": 0.908724308013916, "learning_rate": 4.993535611735464e-06, "loss": 0.33, "step": 200 }, { "epoch": 6.666666666666667, "grad_norm": 0.8689959645271301, "learning_rate": 4.983704338371375e-06, "loss": 0.2941, "step": 220 }, { "epoch": 7.0, "eval_loss": 0.246794193983078, "eval_runtime": 0.9539, "eval_samples_per_second": 138.381, "eval_steps_per_second": 17.822, "step": 231 }, { "epoch": 7.2727272727272725, "grad_norm": 0.806349515914917, "learning_rate": 4.969427346772643e-06, "loss": 0.2513, "step": 240 }, { "epoch": 7.878787878787879, "grad_norm": 0.8216381072998047, "learning_rate": 4.950730192107368e-06, "loss": 0.2244, "step": 260 }, { "epoch": 8.0, "eval_loss": 0.19389495253562927, "eval_runtime": 0.9398, "eval_samples_per_second": 140.46, "eval_steps_per_second": 18.09, "step": 264 }, { "epoch": 8.484848484848484, "grad_norm": 0.8309475183486938, "learning_rate": 4.927646341435276e-06, "loss": 0.1756, "step": 280 }, { "epoch": 9.0, "eval_loss": 0.1608426719903946, "eval_runtime": 0.9567, "eval_samples_per_second": 137.969, "eval_steps_per_second": 17.769, "step": 297 }, { "epoch": 9.090909090909092, "grad_norm": 0.807766854763031, "learning_rate": 4.900217113803193e-06, "loss": 0.1666, "step": 300 }, { "epoch": 9.696969696969697, "grad_norm": 0.638713538646698, "learning_rate": 4.868491606285823e-06, "loss": 0.1756, "step": 320 }, { "epoch": 10.0, "eval_loss": 0.13285745680332184, "eval_runtime": 0.9431, "eval_samples_per_second": 139.968, "eval_steps_per_second": 18.026, "step": 330 }, { "epoch": 10.303030303030303, "grad_norm": 0.6577040553092957, "learning_rate": 4.832526606104213e-06, "loss": 0.1407, "step": 340 }, { "epoch": 10.909090909090908, "grad_norm": 0.7107315063476562, "learning_rate": 4.792386488979193e-06, "loss": 0.1218, "step": 360 }, { "epoch": 11.0, "eval_loss": 0.1115950271487236, "eval_runtime": 0.9468, "eval_samples_per_second": 139.417, "eval_steps_per_second": 17.955, "step": 363 }, { "epoch": 11.515151515151516, "grad_norm": 0.7401019334793091, "learning_rate": 4.74814310390176e-06, "loss": 0.1362, "step": 380 }, { "epoch": 12.0, "eval_loss": 0.09207186847925186, "eval_runtime": 0.9332, "eval_samples_per_second": 141.446, "eval_steps_per_second": 18.216, "step": 396 }, { "epoch": 12.121212121212121, "grad_norm": 0.68468177318573, "learning_rate": 4.699875644526633e-06, "loss": 0.0987, "step": 400 }, { "epoch": 12.727272727272727, "grad_norm": 0.7722787857055664, "learning_rate": 4.647670507419206e-06, "loss": 0.0989, "step": 420 }, { "epoch": 13.0, "eval_loss": 0.0792744979262352, "eval_runtime": 0.935, "eval_samples_per_second": 141.172, "eval_steps_per_second": 18.181, "step": 429 }, { "epoch": 13.333333333333334, "grad_norm": 0.48431074619293213, "learning_rate": 4.591621137409602e-06, "loss": 0.0936, "step": 440 }, { "epoch": 13.93939393939394, "grad_norm": 0.555296778678894, "learning_rate": 4.53182786033067e-06, "loss": 0.0945, "step": 460 }, { "epoch": 14.0, "eval_loss": 0.07043830305337906, "eval_runtime": 0.9312, "eval_samples_per_second": 141.753, "eval_steps_per_second": 18.256, "step": 462 }, { "epoch": 14.545454545454545, "grad_norm": 0.5280194878578186, "learning_rate": 4.468397703439282e-06, "loss": 0.0811, "step": 480 }, { "epoch": 15.0, "eval_loss": 0.06017066538333893, "eval_runtime": 0.9305, "eval_samples_per_second": 141.866, "eval_steps_per_second": 18.271, "step": 495 }, { "epoch": 15.151515151515152, "grad_norm": 0.5912495851516724, "learning_rate": 4.401444203842396e-06, "loss": 0.0742, "step": 500 }, { "epoch": 15.757575757575758, "grad_norm": 1.0043439865112305, "learning_rate": 4.331087205270778e-06, "loss": 0.0778, "step": 520 }, { "epoch": 16.0, "eval_loss": 0.052856337279081345, "eval_runtime": 0.9363, "eval_samples_per_second": 140.988, "eval_steps_per_second": 18.158, "step": 528 }, { "epoch": 16.363636363636363, "grad_norm": 0.37315091490745544, "learning_rate": 4.257452643564155e-06, "loss": 0.0746, "step": 540 }, { "epoch": 16.96969696969697, "grad_norm": 0.5725280046463013, "learning_rate": 4.180672321251766e-06, "loss": 0.0651, "step": 560 }, { "epoch": 17.0, "eval_loss": 0.04653371125459671, "eval_runtime": 0.9263, "eval_samples_per_second": 142.504, "eval_steps_per_second": 18.353, "step": 561 }, { "epoch": 17.575757575757574, "grad_norm": 0.6575049161911011, "learning_rate": 4.100883671631806e-06, "loss": 0.0529, "step": 580 }, { "epoch": 18.0, "eval_loss": 0.04210372641682625, "eval_runtime": 0.9448, "eval_samples_per_second": 139.719, "eval_steps_per_second": 17.994, "step": 594 }, { "epoch": 18.181818181818183, "grad_norm": 0.7622207999229431, "learning_rate": 4.018229512772053e-06, "loss": 0.0644, "step": 600 }, { "epoch": 18.78787878787879, "grad_norm": 0.5367633700370789, "learning_rate": 3.9328577918719916e-06, "loss": 0.0551, "step": 620 }, { "epoch": 19.0, "eval_loss": 0.03950377553701401, "eval_runtime": 0.9419, "eval_samples_per_second": 140.144, "eval_steps_per_second": 18.049, "step": 627 }, { "epoch": 19.393939393939394, "grad_norm": 0.581658661365509, "learning_rate": 3.844921320444031e-06, "loss": 0.0566, "step": 640 }, { "epoch": 20.0, "grad_norm": 0.7774003148078918, "learning_rate": 3.754577500787828e-06, "loss": 0.0532, "step": 660 }, { "epoch": 20.0, "eval_loss": 0.0353802889585495, "eval_runtime": 0.9534, "eval_samples_per_second": 138.453, "eval_steps_per_second": 17.831, "step": 660 }, { "epoch": 20.606060606060606, "grad_norm": 0.6069843769073486, "learning_rate": 3.66198804424729e-06, "loss": 0.0436, "step": 680 }, { "epoch": 21.0, "eval_loss": 0.031004376709461212, "eval_runtime": 0.9391, "eval_samples_per_second": 140.556, "eval_steps_per_second": 18.102, "step": 693 }, { "epoch": 21.21212121212121, "grad_norm": 0.5869702696800232, "learning_rate": 3.5673186817546047e-06, "loss": 0.0487, "step": 700 }, { "epoch": 21.818181818181817, "grad_norm": 0.39276406168937683, "learning_rate": 3.4707388671793814e-06, "loss": 0.0459, "step": 720 }, { "epoch": 22.0, "eval_loss": 0.027377676218748093, "eval_runtime": 0.9405, "eval_samples_per_second": 140.352, "eval_steps_per_second": 18.076, "step": 726 }, { "epoch": 22.424242424242426, "grad_norm": 0.5247741937637329, "learning_rate": 3.3724214740138933e-06, "loss": 0.0461, "step": 740 }, { "epoch": 23.0, "eval_loss": 0.02525358274579048, "eval_runtime": 0.9593, "eval_samples_per_second": 137.596, "eval_steps_per_second": 17.721, "step": 759 }, { "epoch": 23.03030303030303, "grad_norm": 0.4420148730278015, "learning_rate": 3.272542485937369e-06, "loss": 0.0424, "step": 760 }, { "epoch": 23.636363636363637, "grad_norm": 0.4988000690937042, "learning_rate": 3.171280681813174e-06, "loss": 0.0443, "step": 780 }, { "epoch": 24.0, "eval_loss": 0.02300359681248665, "eval_runtime": 0.9498, "eval_samples_per_second": 138.973, "eval_steps_per_second": 17.898, "step": 792 }, { "epoch": 24.242424242424242, "grad_norm": 0.6696539521217346, "learning_rate": 3.0688173156827454e-06, "loss": 0.0346, "step": 800 }, { "epoch": 24.848484848484848, "grad_norm": 0.6659572720527649, "learning_rate": 2.9653357923290753e-06, "loss": 0.0394, "step": 820 }, { "epoch": 25.0, "eval_loss": 0.02064535580575466, "eval_runtime": 0.9419, "eval_samples_per_second": 140.14, "eval_steps_per_second": 18.048, "step": 825 }, { "epoch": 25.454545454545453, "grad_norm": 0.477318674325943, "learning_rate": 2.86102133899045e-06, "loss": 0.0354, "step": 840 }, { "epoch": 26.0, "eval_loss": 0.017997996881604195, "eval_runtime": 0.9337, "eval_samples_per_second": 141.375, "eval_steps_per_second": 18.207, "step": 858 }, { "epoch": 26.060606060606062, "grad_norm": 0.4188827574253082, "learning_rate": 2.7560606738120947e-06, "loss": 0.0379, "step": 860 }, { "epoch": 26.666666666666668, "grad_norm": 0.37732234597206116, "learning_rate": 2.6506416716291466e-06, "loss": 0.0369, "step": 880 }, { "epoch": 27.0, "eval_loss": 0.01666710339486599, "eval_runtime": 0.9423, "eval_samples_per_second": 140.084, "eval_steps_per_second": 18.041, "step": 891 }, { "epoch": 27.272727272727273, "grad_norm": 0.5058871507644653, "learning_rate": 2.544953027679216e-06, "loss": 0.0327, "step": 900 }, { "epoch": 27.87878787878788, "grad_norm": 0.5595805644989014, "learning_rate": 2.4391839198464613e-06, "loss": 0.0338, "step": 920 }, { "epoch": 28.0, "eval_loss": 0.015013493597507477, "eval_runtime": 0.93, "eval_samples_per_second": 141.934, "eval_steps_per_second": 18.279, "step": 924 }, { "epoch": 28.484848484848484, "grad_norm": 0.4609270393848419, "learning_rate": 2.3335236700417404e-06, "loss": 0.0306, "step": 940 }, { "epoch": 29.0, "eval_loss": 0.014108900912106037, "eval_runtime": 0.9248, "eval_samples_per_second": 142.737, "eval_steps_per_second": 18.383, "step": 957 }, { "epoch": 29.09090909090909, "grad_norm": 0.3746008276939392, "learning_rate": 2.2281614053249796e-06, "loss": 0.0307, "step": 960 }, { "epoch": 29.696969696969695, "grad_norm": 0.5330935716629028, "learning_rate": 2.1232857193762923e-06, "loss": 0.0298, "step": 980 }, { "epoch": 30.0, "eval_loss": 0.013387720100581646, "eval_runtime": 0.9587, "eval_samples_per_second": 137.688, "eval_steps_per_second": 17.733, "step": 990 }, { "epoch": 30.303030303030305, "grad_norm": 0.31854015588760376, "learning_rate": 2.019084334921849e-06, "loss": 0.028, "step": 1000 }, { "epoch": 30.90909090909091, "grad_norm": 0.38515424728393555, "learning_rate": 1.9157437677186903e-06, "loss": 0.031, "step": 1020 }, { "epoch": 31.0, "eval_loss": 0.012279902584850788, "eval_runtime": 0.9342, "eval_samples_per_second": 141.305, "eval_steps_per_second": 18.198, "step": 1023 }, { "epoch": 31.515151515151516, "grad_norm": 0.45346567034721375, "learning_rate": 1.8134489926999837e-06, "loss": 0.033, "step": 1040 }, { "epoch": 32.0, "eval_loss": 0.011261457577347755, "eval_runtime": 0.9421, "eval_samples_per_second": 140.119, "eval_steps_per_second": 18.046, "step": 1056 }, { "epoch": 32.121212121212125, "grad_norm": 0.44893690943717957, "learning_rate": 1.7123831128782686e-06, "loss": 0.0246, "step": 1060 }, { "epoch": 32.72727272727273, "grad_norm": 0.4021283984184265, "learning_rate": 1.612727031599356e-06, "loss": 0.03, "step": 1080 }, { "epoch": 33.0, "eval_loss": 0.010378457605838776, "eval_runtime": 0.9543, "eval_samples_per_second": 138.318, "eval_steps_per_second": 17.814, "step": 1089 }, { "epoch": 33.333333333333336, "grad_norm": 0.6586357951164246, "learning_rate": 1.5146591287335452e-06, "loss": 0.0266, "step": 1100 }, { "epoch": 33.93939393939394, "grad_norm": 0.4133249521255493, "learning_rate": 1.4183549413837288e-06, "loss": 0.026, "step": 1120 }, { "epoch": 34.0, "eval_loss": 0.009874224662780762, "eval_runtime": 0.9426, "eval_samples_per_second": 140.041, "eval_steps_per_second": 18.036, "step": 1122 }, { "epoch": 34.54545454545455, "grad_norm": 0.4618055522441864, "learning_rate": 1.3239868496819407e-06, "loss": 0.0278, "step": 1140 }, { "epoch": 35.0, "eval_loss": 0.009275372140109539, "eval_runtime": 0.9399, "eval_samples_per_second": 140.441, "eval_steps_per_second": 18.087, "step": 1155 }, { "epoch": 35.15151515151515, "grad_norm": 0.3481239676475525, "learning_rate": 1.2317237682367178e-06, "loss": 0.0253, "step": 1160 }, { "epoch": 35.75757575757576, "grad_norm": 0.42644399404525757, "learning_rate": 1.1417308437836181e-06, "loss": 0.0269, "step": 1180 }, { "epoch": 36.0, "eval_loss": 0.008818729780614376, "eval_runtime": 0.9417, "eval_samples_per_second": 140.172, "eval_steps_per_second": 18.052, "step": 1188 }, { "epoch": 36.36363636363637, "grad_norm": 0.5186192393302917, "learning_rate": 1.0541691595800338e-06, "loss": 0.0231, "step": 1200 }, { "epoch": 36.96969696969697, "grad_norm": 0.3892291784286499, "learning_rate": 9.691954470734692e-07, "loss": 0.0273, "step": 1220 }, { "epoch": 37.0, "eval_loss": 0.008245617151260376, "eval_runtime": 0.9541, "eval_samples_per_second": 138.353, "eval_steps_per_second": 17.818, "step": 1221 }, { "epoch": 37.57575757575758, "grad_norm": 0.5318649411201477, "learning_rate": 8.869618053593429e-07, "loss": 0.0251, "step": 1240 }, { "epoch": 38.0, "eval_loss": 0.007936290465295315, "eval_runtime": 0.9281, "eval_samples_per_second": 142.225, "eval_steps_per_second": 18.317, "step": 1254 }, { "epoch": 38.18181818181818, "grad_norm": 0.37824392318725586, "learning_rate": 8.076154289305019e-07, "loss": 0.0258, "step": 1260 }, { "epoch": 38.78787878787879, "grad_norm": 0.402786523103714, "learning_rate": 7.312983442057497e-07, "loss": 0.0263, "step": 1280 }, { "epoch": 39.0, "eval_loss": 0.007719958666712046, "eval_runtime": 0.9423, "eval_samples_per_second": 140.079, "eval_steps_per_second": 18.041, "step": 1287 }, { "epoch": 39.39393939393939, "grad_norm": 0.41106194257736206, "learning_rate": 6.581471553089874e-07, "loss": 0.0243, "step": 1300 }, { "epoch": 40.0, "grad_norm": 0.5733346939086914, "learning_rate": 5.882927995540266e-07, "loss": 0.0247, "step": 1320 }, { "epoch": 40.0, "eval_loss": 0.00738176517188549, "eval_runtime": 0.9247, "eval_samples_per_second": 142.756, "eval_steps_per_second": 18.385, "step": 1320 }, { "epoch": 40.60606060606061, "grad_norm": 0.3024619221687317, "learning_rate": 5.218603130727243e-07, "loss": 0.0243, "step": 1340 }, { "epoch": 41.0, "eval_loss": 0.007284797262400389, "eval_runtime": 0.9333, "eval_samples_per_second": 141.426, "eval_steps_per_second": 18.214, "step": 1353 }, { "epoch": 41.21212121212121, "grad_norm": 0.48434221744537354, "learning_rate": 4.589686070059762e-07, "loss": 0.0245, "step": 1360 }, { "epoch": 41.81818181818182, "grad_norm": 0.4191039204597473, "learning_rate": 3.997302546581597e-07, "loss": 0.0259, "step": 1380 }, { "epoch": 42.0, "eval_loss": 0.007058488205075264, "eval_runtime": 0.9549, "eval_samples_per_second": 138.232, "eval_steps_per_second": 17.803, "step": 1386 }, { "epoch": 42.42424242424242, "grad_norm": 0.3011131286621094, "learning_rate": 3.4425128999602265e-07, "loss": 0.0234, "step": 1400 }, { "epoch": 43.0, "eval_loss": 0.006919534411281347, "eval_runtime": 0.9466, "eval_samples_per_second": 139.447, "eval_steps_per_second": 17.959, "step": 1419 }, { "epoch": 43.03030303030303, "grad_norm": 0.49921727180480957, "learning_rate": 2.9263101785268253e-07, "loss": 0.0268, "step": 1420 }, { "epoch": 43.63636363636363, "grad_norm": 0.41327381134033203, "learning_rate": 2.449618361764788e-07, "loss": 0.0232, "step": 1440 }, { "epoch": 44.0, "eval_loss": 0.0067825643345713615, "eval_runtime": 0.9457, "eval_samples_per_second": 139.574, "eval_steps_per_second": 17.975, "step": 1452 }, { "epoch": 44.24242424242424, "grad_norm": 0.42085084319114685, "learning_rate": 2.0132907064282837e-07, "loss": 0.0222, "step": 1460 }, { "epoch": 44.84848484848485, "grad_norm": 0.4812968075275421, "learning_rate": 1.6181082192513352e-07, "loss": 0.0245, "step": 1480 }, { "epoch": 45.0, "eval_loss": 0.006708750035613775, "eval_runtime": 0.944, "eval_samples_per_second": 139.826, "eval_steps_per_second": 18.008, "step": 1485 }, { "epoch": 45.45454545454545, "grad_norm": 0.41410696506500244, "learning_rate": 1.264778258981178e-07, "loss": 0.0234, "step": 1500 }, { "epoch": 46.0, "eval_loss": 0.006636774633079767, "eval_runtime": 0.9246, "eval_samples_per_second": 142.77, "eval_steps_per_second": 18.387, "step": 1518 }, { "epoch": 46.06060606060606, "grad_norm": 0.36860212683677673, "learning_rate": 9.539332702381026e-08, "loss": 0.0264, "step": 1520 }, { "epoch": 46.666666666666664, "grad_norm": 0.3396029770374298, "learning_rate": 6.86129651468273e-08, "loss": 0.0229, "step": 1540 }, { "epoch": 47.0, "eval_loss": 0.00661947438493371, "eval_runtime": 0.9503, "eval_samples_per_second": 138.905, "eval_steps_per_second": 17.889, "step": 1551 }, { "epoch": 47.27272727272727, "grad_norm": 0.4342035949230194, "learning_rate": 4.618467590157133e-08, "loss": 0.0233, "step": 1560 }, { "epoch": 47.878787878787875, "grad_norm": 0.4722955822944641, "learning_rate": 2.814860490961607e-08, "loss": 0.0248, "step": 1580 }, { "epoch": 48.0, "eval_loss": 0.006618270184844732, "eval_runtime": 0.9496, "eval_samples_per_second": 139.011, "eval_steps_per_second": 17.903, "step": 1584 }, { "epoch": 48.484848484848484, "grad_norm": 0.22162474691867828, "learning_rate": 1.453703592086353e-08, "loss": 0.0239, "step": 1600 }, { "epoch": 49.0, "eval_loss": 0.006619932595640421, "eval_runtime": 0.947, "eval_samples_per_second": 139.389, "eval_steps_per_second": 17.952, "step": 1617 }, { "epoch": 49.09090909090909, "grad_norm": 0.4756720960140228, "learning_rate": 5.374333027093892e-09, "loss": 0.0236, "step": 1620 }, { "epoch": 49.696969696969695, "grad_norm": 0.42004162073135376, "learning_rate": 6.768970513457151e-10, "loss": 0.0228, "step": 1640 }, { "epoch": 50.0, "eval_loss": 0.006614842917770147, "eval_runtime": 0.9549, "eval_samples_per_second": 138.227, "eval_steps_per_second": 17.802, "step": 1650 } ], "logging_steps": 20, "max_steps": 1650, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.91340755288064e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }