| { | |
| "best_global_step": 1650, | |
| "best_metric": 0.006614842917770147, | |
| "best_model_checkpoint": "/content/NH-SQL-finetuned/checkpoint-1650", | |
| "epoch": 50.0, | |
| "eval_steps": 500, | |
| "global_step": 1650, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.030303030303030304, | |
| "grad_norm": 6.595283031463623, | |
| "learning_rate": 0.0, | |
| "loss": 1.9666, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.6060606060606061, | |
| "grad_norm": 6.3463263511657715, | |
| "learning_rate": 5.757575757575758e-07, | |
| "loss": 1.8746, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.7834347486495972, | |
| "eval_runtime": 0.9389, | |
| "eval_samples_per_second": 140.592, | |
| "eval_steps_per_second": 18.107, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 1.2121212121212122, | |
| "grad_norm": 4.871561050415039, | |
| "learning_rate": 1.181818181818182e-06, | |
| "loss": 1.7412, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 1.8181818181818183, | |
| "grad_norm": 3.2557735443115234, | |
| "learning_rate": 1.787878787878788e-06, | |
| "loss": 1.5522, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.2882661819458008, | |
| "eval_runtime": 0.9475, | |
| "eval_samples_per_second": 139.315, | |
| "eval_steps_per_second": 17.942, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 2.4242424242424243, | |
| "grad_norm": 1.9632468223571777, | |
| "learning_rate": 2.393939393939394e-06, | |
| "loss": 1.2669, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.8929286003112793, | |
| "eval_runtime": 0.9528, | |
| "eval_samples_per_second": 138.545, | |
| "eval_steps_per_second": 17.843, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 3.0303030303030303, | |
| "grad_norm": 1.6738139390945435, | |
| "learning_rate": 3e-06, | |
| "loss": 1.0527, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 3.6363636363636362, | |
| "grad_norm": 1.4490736722946167, | |
| "learning_rate": 3.606060606060606e-06, | |
| "loss": 0.8491, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.5932092070579529, | |
| "eval_runtime": 0.9509, | |
| "eval_samples_per_second": 138.815, | |
| "eval_steps_per_second": 17.878, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 4.242424242424242, | |
| "grad_norm": 1.0791282653808594, | |
| "learning_rate": 4.212121212121212e-06, | |
| "loss": 0.6586, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 4.848484848484849, | |
| "grad_norm": 0.8789141178131104, | |
| "learning_rate": 4.818181818181819e-06, | |
| "loss": 0.4801, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.42336800694465637, | |
| "eval_runtime": 0.9459, | |
| "eval_samples_per_second": 139.555, | |
| "eval_steps_per_second": 17.973, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 5.454545454545454, | |
| "grad_norm": 0.8390600085258484, | |
| "learning_rate": 4.9989035693310165e-06, | |
| "loss": 0.4134, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.3166239559650421, | |
| "eval_runtime": 0.9402, | |
| "eval_samples_per_second": 140.394, | |
| "eval_steps_per_second": 18.081, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 6.0606060606060606, | |
| "grad_norm": 0.908724308013916, | |
| "learning_rate": 4.993535611735464e-06, | |
| "loss": 0.33, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.8689959645271301, | |
| "learning_rate": 4.983704338371375e-06, | |
| "loss": 0.2941, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.246794193983078, | |
| "eval_runtime": 0.9539, | |
| "eval_samples_per_second": 138.381, | |
| "eval_steps_per_second": 17.822, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 7.2727272727272725, | |
| "grad_norm": 0.806349515914917, | |
| "learning_rate": 4.969427346772643e-06, | |
| "loss": 0.2513, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 7.878787878787879, | |
| "grad_norm": 0.8216381072998047, | |
| "learning_rate": 4.950730192107368e-06, | |
| "loss": 0.2244, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.19389495253562927, | |
| "eval_runtime": 0.9398, | |
| "eval_samples_per_second": 140.46, | |
| "eval_steps_per_second": 18.09, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 8.484848484848484, | |
| "grad_norm": 0.8309475183486938, | |
| "learning_rate": 4.927646341435276e-06, | |
| "loss": 0.1756, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 0.1608426719903946, | |
| "eval_runtime": 0.9567, | |
| "eval_samples_per_second": 137.969, | |
| "eval_steps_per_second": 17.769, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 9.090909090909092, | |
| "grad_norm": 0.807766854763031, | |
| "learning_rate": 4.900217113803193e-06, | |
| "loss": 0.1666, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 9.696969696969697, | |
| "grad_norm": 0.638713538646698, | |
| "learning_rate": 4.868491606285823e-06, | |
| "loss": 0.1756, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 0.13285745680332184, | |
| "eval_runtime": 0.9431, | |
| "eval_samples_per_second": 139.968, | |
| "eval_steps_per_second": 18.026, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 10.303030303030303, | |
| "grad_norm": 0.6577040553092957, | |
| "learning_rate": 4.832526606104213e-06, | |
| "loss": 0.1407, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 10.909090909090908, | |
| "grad_norm": 0.7107315063476562, | |
| "learning_rate": 4.792386488979193e-06, | |
| "loss": 0.1218, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 0.1115950271487236, | |
| "eval_runtime": 0.9468, | |
| "eval_samples_per_second": 139.417, | |
| "eval_steps_per_second": 17.955, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 11.515151515151516, | |
| "grad_norm": 0.7401019334793091, | |
| "learning_rate": 4.74814310390176e-06, | |
| "loss": 0.1362, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 0.09207186847925186, | |
| "eval_runtime": 0.9332, | |
| "eval_samples_per_second": 141.446, | |
| "eval_steps_per_second": 18.216, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 12.121212121212121, | |
| "grad_norm": 0.68468177318573, | |
| "learning_rate": 4.699875644526633e-06, | |
| "loss": 0.0987, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 12.727272727272727, | |
| "grad_norm": 0.7722787857055664, | |
| "learning_rate": 4.647670507419206e-06, | |
| "loss": 0.0989, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 0.0792744979262352, | |
| "eval_runtime": 0.935, | |
| "eval_samples_per_second": 141.172, | |
| "eval_steps_per_second": 18.181, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 13.333333333333334, | |
| "grad_norm": 0.48431074619293213, | |
| "learning_rate": 4.591621137409602e-06, | |
| "loss": 0.0936, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 13.93939393939394, | |
| "grad_norm": 0.555296778678894, | |
| "learning_rate": 4.53182786033067e-06, | |
| "loss": 0.0945, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 0.07043830305337906, | |
| "eval_runtime": 0.9312, | |
| "eval_samples_per_second": 141.753, | |
| "eval_steps_per_second": 18.256, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 14.545454545454545, | |
| "grad_norm": 0.5280194878578186, | |
| "learning_rate": 4.468397703439282e-06, | |
| "loss": 0.0811, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 0.06017066538333893, | |
| "eval_runtime": 0.9305, | |
| "eval_samples_per_second": 141.866, | |
| "eval_steps_per_second": 18.271, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 15.151515151515152, | |
| "grad_norm": 0.5912495851516724, | |
| "learning_rate": 4.401444203842396e-06, | |
| "loss": 0.0742, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 15.757575757575758, | |
| "grad_norm": 1.0043439865112305, | |
| "learning_rate": 4.331087205270778e-06, | |
| "loss": 0.0778, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 0.052856337279081345, | |
| "eval_runtime": 0.9363, | |
| "eval_samples_per_second": 140.988, | |
| "eval_steps_per_second": 18.158, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 16.363636363636363, | |
| "grad_norm": 0.37315091490745544, | |
| "learning_rate": 4.257452643564155e-06, | |
| "loss": 0.0746, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 16.96969696969697, | |
| "grad_norm": 0.5725280046463013, | |
| "learning_rate": 4.180672321251766e-06, | |
| "loss": 0.0651, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 0.04653371125459671, | |
| "eval_runtime": 0.9263, | |
| "eval_samples_per_second": 142.504, | |
| "eval_steps_per_second": 18.353, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 17.575757575757574, | |
| "grad_norm": 0.6575049161911011, | |
| "learning_rate": 4.100883671631806e-06, | |
| "loss": 0.0529, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 0.04210372641682625, | |
| "eval_runtime": 0.9448, | |
| "eval_samples_per_second": 139.719, | |
| "eval_steps_per_second": 17.994, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 18.181818181818183, | |
| "grad_norm": 0.7622207999229431, | |
| "learning_rate": 4.018229512772053e-06, | |
| "loss": 0.0644, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 18.78787878787879, | |
| "grad_norm": 0.5367633700370789, | |
| "learning_rate": 3.9328577918719916e-06, | |
| "loss": 0.0551, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 0.03950377553701401, | |
| "eval_runtime": 0.9419, | |
| "eval_samples_per_second": 140.144, | |
| "eval_steps_per_second": 18.049, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 19.393939393939394, | |
| "grad_norm": 0.581658661365509, | |
| "learning_rate": 3.844921320444031e-06, | |
| "loss": 0.0566, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 0.7774003148078918, | |
| "learning_rate": 3.754577500787828e-06, | |
| "loss": 0.0532, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 0.0353802889585495, | |
| "eval_runtime": 0.9534, | |
| "eval_samples_per_second": 138.453, | |
| "eval_steps_per_second": 17.831, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 20.606060606060606, | |
| "grad_norm": 0.6069843769073486, | |
| "learning_rate": 3.66198804424729e-06, | |
| "loss": 0.0436, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 0.031004376709461212, | |
| "eval_runtime": 0.9391, | |
| "eval_samples_per_second": 140.556, | |
| "eval_steps_per_second": 18.102, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 21.21212121212121, | |
| "grad_norm": 0.5869702696800232, | |
| "learning_rate": 3.5673186817546047e-06, | |
| "loss": 0.0487, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 21.818181818181817, | |
| "grad_norm": 0.39276406168937683, | |
| "learning_rate": 3.4707388671793814e-06, | |
| "loss": 0.0459, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 0.027377676218748093, | |
| "eval_runtime": 0.9405, | |
| "eval_samples_per_second": 140.352, | |
| "eval_steps_per_second": 18.076, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 22.424242424242426, | |
| "grad_norm": 0.5247741937637329, | |
| "learning_rate": 3.3724214740138933e-06, | |
| "loss": 0.0461, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 0.02525358274579048, | |
| "eval_runtime": 0.9593, | |
| "eval_samples_per_second": 137.596, | |
| "eval_steps_per_second": 17.721, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 23.03030303030303, | |
| "grad_norm": 0.4420148730278015, | |
| "learning_rate": 3.272542485937369e-06, | |
| "loss": 0.0424, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 23.636363636363637, | |
| "grad_norm": 0.4988000690937042, | |
| "learning_rate": 3.171280681813174e-06, | |
| "loss": 0.0443, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 0.02300359681248665, | |
| "eval_runtime": 0.9498, | |
| "eval_samples_per_second": 138.973, | |
| "eval_steps_per_second": 17.898, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 24.242424242424242, | |
| "grad_norm": 0.6696539521217346, | |
| "learning_rate": 3.0688173156827454e-06, | |
| "loss": 0.0346, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 24.848484848484848, | |
| "grad_norm": 0.6659572720527649, | |
| "learning_rate": 2.9653357923290753e-06, | |
| "loss": 0.0394, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 0.02064535580575466, | |
| "eval_runtime": 0.9419, | |
| "eval_samples_per_second": 140.14, | |
| "eval_steps_per_second": 18.048, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 25.454545454545453, | |
| "grad_norm": 0.477318674325943, | |
| "learning_rate": 2.86102133899045e-06, | |
| "loss": 0.0354, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_loss": 0.017997996881604195, | |
| "eval_runtime": 0.9337, | |
| "eval_samples_per_second": 141.375, | |
| "eval_steps_per_second": 18.207, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 26.060606060606062, | |
| "grad_norm": 0.4188827574253082, | |
| "learning_rate": 2.7560606738120947e-06, | |
| "loss": 0.0379, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 26.666666666666668, | |
| "grad_norm": 0.37732234597206116, | |
| "learning_rate": 2.6506416716291466e-06, | |
| "loss": 0.0369, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 0.01666710339486599, | |
| "eval_runtime": 0.9423, | |
| "eval_samples_per_second": 140.084, | |
| "eval_steps_per_second": 18.041, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 27.272727272727273, | |
| "grad_norm": 0.5058871507644653, | |
| "learning_rate": 2.544953027679216e-06, | |
| "loss": 0.0327, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 27.87878787878788, | |
| "grad_norm": 0.5595805644989014, | |
| "learning_rate": 2.4391839198464613e-06, | |
| "loss": 0.0338, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 0.015013493597507477, | |
| "eval_runtime": 0.93, | |
| "eval_samples_per_second": 141.934, | |
| "eval_steps_per_second": 18.279, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 28.484848484848484, | |
| "grad_norm": 0.4609270393848419, | |
| "learning_rate": 2.3335236700417404e-06, | |
| "loss": 0.0306, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_loss": 0.014108900912106037, | |
| "eval_runtime": 0.9248, | |
| "eval_samples_per_second": 142.737, | |
| "eval_steps_per_second": 18.383, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 29.09090909090909, | |
| "grad_norm": 0.3746008276939392, | |
| "learning_rate": 2.2281614053249796e-06, | |
| "loss": 0.0307, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 29.696969696969695, | |
| "grad_norm": 0.5330935716629028, | |
| "learning_rate": 2.1232857193762923e-06, | |
| "loss": 0.0298, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 0.013387720100581646, | |
| "eval_runtime": 0.9587, | |
| "eval_samples_per_second": 137.688, | |
| "eval_steps_per_second": 17.733, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 30.303030303030305, | |
| "grad_norm": 0.31854015588760376, | |
| "learning_rate": 2.019084334921849e-06, | |
| "loss": 0.028, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 30.90909090909091, | |
| "grad_norm": 0.38515424728393555, | |
| "learning_rate": 1.9157437677186903e-06, | |
| "loss": 0.031, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_loss": 0.012279902584850788, | |
| "eval_runtime": 0.9342, | |
| "eval_samples_per_second": 141.305, | |
| "eval_steps_per_second": 18.198, | |
| "step": 1023 | |
| }, | |
| { | |
| "epoch": 31.515151515151516, | |
| "grad_norm": 0.45346567034721375, | |
| "learning_rate": 1.8134489926999837e-06, | |
| "loss": 0.033, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 0.011261457577347755, | |
| "eval_runtime": 0.9421, | |
| "eval_samples_per_second": 140.119, | |
| "eval_steps_per_second": 18.046, | |
| "step": 1056 | |
| }, | |
| { | |
| "epoch": 32.121212121212125, | |
| "grad_norm": 0.44893690943717957, | |
| "learning_rate": 1.7123831128782686e-06, | |
| "loss": 0.0246, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 32.72727272727273, | |
| "grad_norm": 0.4021283984184265, | |
| "learning_rate": 1.612727031599356e-06, | |
| "loss": 0.03, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_loss": 0.010378457605838776, | |
| "eval_runtime": 0.9543, | |
| "eval_samples_per_second": 138.318, | |
| "eval_steps_per_second": 17.814, | |
| "step": 1089 | |
| }, | |
| { | |
| "epoch": 33.333333333333336, | |
| "grad_norm": 0.6586357951164246, | |
| "learning_rate": 1.5146591287335452e-06, | |
| "loss": 0.0266, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 33.93939393939394, | |
| "grad_norm": 0.4133249521255493, | |
| "learning_rate": 1.4183549413837288e-06, | |
| "loss": 0.026, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_loss": 0.009874224662780762, | |
| "eval_runtime": 0.9426, | |
| "eval_samples_per_second": 140.041, | |
| "eval_steps_per_second": 18.036, | |
| "step": 1122 | |
| }, | |
| { | |
| "epoch": 34.54545454545455, | |
| "grad_norm": 0.4618055522441864, | |
| "learning_rate": 1.3239868496819407e-06, | |
| "loss": 0.0278, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_loss": 0.009275372140109539, | |
| "eval_runtime": 0.9399, | |
| "eval_samples_per_second": 140.441, | |
| "eval_steps_per_second": 18.087, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 35.15151515151515, | |
| "grad_norm": 0.3481239676475525, | |
| "learning_rate": 1.2317237682367178e-06, | |
| "loss": 0.0253, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 35.75757575757576, | |
| "grad_norm": 0.42644399404525757, | |
| "learning_rate": 1.1417308437836181e-06, | |
| "loss": 0.0269, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_loss": 0.008818729780614376, | |
| "eval_runtime": 0.9417, | |
| "eval_samples_per_second": 140.172, | |
| "eval_steps_per_second": 18.052, | |
| "step": 1188 | |
| }, | |
| { | |
| "epoch": 36.36363636363637, | |
| "grad_norm": 0.5186192393302917, | |
| "learning_rate": 1.0541691595800338e-06, | |
| "loss": 0.0231, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 36.96969696969697, | |
| "grad_norm": 0.3892291784286499, | |
| "learning_rate": 9.691954470734692e-07, | |
| "loss": 0.0273, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_loss": 0.008245617151260376, | |
| "eval_runtime": 0.9541, | |
| "eval_samples_per_second": 138.353, | |
| "eval_steps_per_second": 17.818, | |
| "step": 1221 | |
| }, | |
| { | |
| "epoch": 37.57575757575758, | |
| "grad_norm": 0.5318649411201477, | |
| "learning_rate": 8.869618053593429e-07, | |
| "loss": 0.0251, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_loss": 0.007936290465295315, | |
| "eval_runtime": 0.9281, | |
| "eval_samples_per_second": 142.225, | |
| "eval_steps_per_second": 18.317, | |
| "step": 1254 | |
| }, | |
| { | |
| "epoch": 38.18181818181818, | |
| "grad_norm": 0.37824392318725586, | |
| "learning_rate": 8.076154289305019e-07, | |
| "loss": 0.0258, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 38.78787878787879, | |
| "grad_norm": 0.402786523103714, | |
| "learning_rate": 7.312983442057497e-07, | |
| "loss": 0.0263, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_loss": 0.007719958666712046, | |
| "eval_runtime": 0.9423, | |
| "eval_samples_per_second": 140.079, | |
| "eval_steps_per_second": 18.041, | |
| "step": 1287 | |
| }, | |
| { | |
| "epoch": 39.39393939393939, | |
| "grad_norm": 0.41106194257736206, | |
| "learning_rate": 6.581471553089874e-07, | |
| "loss": 0.0243, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 0.5733346939086914, | |
| "learning_rate": 5.882927995540266e-07, | |
| "loss": 0.0247, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_loss": 0.00738176517188549, | |
| "eval_runtime": 0.9247, | |
| "eval_samples_per_second": 142.756, | |
| "eval_steps_per_second": 18.385, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 40.60606060606061, | |
| "grad_norm": 0.3024619221687317, | |
| "learning_rate": 5.218603130727243e-07, | |
| "loss": 0.0243, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_loss": 0.007284797262400389, | |
| "eval_runtime": 0.9333, | |
| "eval_samples_per_second": 141.426, | |
| "eval_steps_per_second": 18.214, | |
| "step": 1353 | |
| }, | |
| { | |
| "epoch": 41.21212121212121, | |
| "grad_norm": 0.48434221744537354, | |
| "learning_rate": 4.589686070059762e-07, | |
| "loss": 0.0245, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 41.81818181818182, | |
| "grad_norm": 0.4191039204597473, | |
| "learning_rate": 3.997302546581597e-07, | |
| "loss": 0.0259, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_loss": 0.007058488205075264, | |
| "eval_runtime": 0.9549, | |
| "eval_samples_per_second": 138.232, | |
| "eval_steps_per_second": 17.803, | |
| "step": 1386 | |
| }, | |
| { | |
| "epoch": 42.42424242424242, | |
| "grad_norm": 0.3011131286621094, | |
| "learning_rate": 3.4425128999602265e-07, | |
| "loss": 0.0234, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_loss": 0.006919534411281347, | |
| "eval_runtime": 0.9466, | |
| "eval_samples_per_second": 139.447, | |
| "eval_steps_per_second": 17.959, | |
| "step": 1419 | |
| }, | |
| { | |
| "epoch": 43.03030303030303, | |
| "grad_norm": 0.49921727180480957, | |
| "learning_rate": 2.9263101785268253e-07, | |
| "loss": 0.0268, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 43.63636363636363, | |
| "grad_norm": 0.41327381134033203, | |
| "learning_rate": 2.449618361764788e-07, | |
| "loss": 0.0232, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_loss": 0.0067825643345713615, | |
| "eval_runtime": 0.9457, | |
| "eval_samples_per_second": 139.574, | |
| "eval_steps_per_second": 17.975, | |
| "step": 1452 | |
| }, | |
| { | |
| "epoch": 44.24242424242424, | |
| "grad_norm": 0.42085084319114685, | |
| "learning_rate": 2.0132907064282837e-07, | |
| "loss": 0.0222, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 44.84848484848485, | |
| "grad_norm": 0.4812968075275421, | |
| "learning_rate": 1.6181082192513352e-07, | |
| "loss": 0.0245, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_loss": 0.006708750035613775, | |
| "eval_runtime": 0.944, | |
| "eval_samples_per_second": 139.826, | |
| "eval_steps_per_second": 18.008, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 45.45454545454545, | |
| "grad_norm": 0.41410696506500244, | |
| "learning_rate": 1.264778258981178e-07, | |
| "loss": 0.0234, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_loss": 0.006636774633079767, | |
| "eval_runtime": 0.9246, | |
| "eval_samples_per_second": 142.77, | |
| "eval_steps_per_second": 18.387, | |
| "step": 1518 | |
| }, | |
| { | |
| "epoch": 46.06060606060606, | |
| "grad_norm": 0.36860212683677673, | |
| "learning_rate": 9.539332702381026e-08, | |
| "loss": 0.0264, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 46.666666666666664, | |
| "grad_norm": 0.3396029770374298, | |
| "learning_rate": 6.86129651468273e-08, | |
| "loss": 0.0229, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_loss": 0.00661947438493371, | |
| "eval_runtime": 0.9503, | |
| "eval_samples_per_second": 138.905, | |
| "eval_steps_per_second": 17.889, | |
| "step": 1551 | |
| }, | |
| { | |
| "epoch": 47.27272727272727, | |
| "grad_norm": 0.4342035949230194, | |
| "learning_rate": 4.618467590157133e-08, | |
| "loss": 0.0233, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 47.878787878787875, | |
| "grad_norm": 0.4722955822944641, | |
| "learning_rate": 2.814860490961607e-08, | |
| "loss": 0.0248, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_loss": 0.006618270184844732, | |
| "eval_runtime": 0.9496, | |
| "eval_samples_per_second": 139.011, | |
| "eval_steps_per_second": 17.903, | |
| "step": 1584 | |
| }, | |
| { | |
| "epoch": 48.484848484848484, | |
| "grad_norm": 0.22162474691867828, | |
| "learning_rate": 1.453703592086353e-08, | |
| "loss": 0.0239, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_loss": 0.006619932595640421, | |
| "eval_runtime": 0.947, | |
| "eval_samples_per_second": 139.389, | |
| "eval_steps_per_second": 17.952, | |
| "step": 1617 | |
| }, | |
| { | |
| "epoch": 49.09090909090909, | |
| "grad_norm": 0.4756720960140228, | |
| "learning_rate": 5.374333027093892e-09, | |
| "loss": 0.0236, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 49.696969696969695, | |
| "grad_norm": 0.42004162073135376, | |
| "learning_rate": 6.768970513457151e-10, | |
| "loss": 0.0228, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_loss": 0.006614842917770147, | |
| "eval_runtime": 0.9549, | |
| "eval_samples_per_second": 138.227, | |
| "eval_steps_per_second": 17.802, | |
| "step": 1650 | |
| } | |
| ], | |
| "logging_steps": 20, | |
| "max_steps": 1650, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.91340755288064e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |