| { | |
| "best_metric": 1.3527668714523315, | |
| "best_model_checkpoint": "/scratch/gpfs/BG11/suze-mdl-steps/model_combo_10.txt/checkpoint-22506", | |
| "epoch": 46.118852459016395, | |
| "eval_steps": 121, | |
| "global_step": 22506, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.24795081967213115, | |
| "grad_norm": 6.242262363433838, | |
| "learning_rate": 1.990081967213115e-05, | |
| "loss": 1.5517, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.24795081967213115, | |
| "eval_loss": 1.451869249343872, | |
| "eval_runtime": 4.1207, | |
| "eval_samples_per_second": 105.321, | |
| "eval_steps_per_second": 13.347, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.4959016393442623, | |
| "grad_norm": 5.890889644622803, | |
| "learning_rate": 1.9801639344262295e-05, | |
| "loss": 1.5921, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.4959016393442623, | |
| "eval_loss": 1.410286784172058, | |
| "eval_runtime": 4.1324, | |
| "eval_samples_per_second": 105.023, | |
| "eval_steps_per_second": 13.309, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.7438524590163934, | |
| "grad_norm": 5.299322128295898, | |
| "learning_rate": 1.9702459016393446e-05, | |
| "loss": 1.6134, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.7438524590163934, | |
| "eval_loss": 1.4782627820968628, | |
| "eval_runtime": 4.1505, | |
| "eval_samples_per_second": 104.567, | |
| "eval_steps_per_second": 13.252, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.9918032786885246, | |
| "grad_norm": 5.992594242095947, | |
| "learning_rate": 1.960327868852459e-05, | |
| "loss": 1.554, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.9918032786885246, | |
| "eval_loss": 1.4408639669418335, | |
| "eval_runtime": 4.1339, | |
| "eval_samples_per_second": 104.987, | |
| "eval_steps_per_second": 13.305, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 1.2397540983606556, | |
| "grad_norm": 5.52030611038208, | |
| "learning_rate": 1.950409836065574e-05, | |
| "loss": 1.5205, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.2397540983606556, | |
| "eval_loss": 1.4437590837478638, | |
| "eval_runtime": 4.1338, | |
| "eval_samples_per_second": 104.988, | |
| "eval_steps_per_second": 13.305, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.4877049180327868, | |
| "grad_norm": 5.520483016967773, | |
| "learning_rate": 1.9404918032786887e-05, | |
| "loss": 1.5542, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 1.4877049180327868, | |
| "eval_loss": 1.4612840414047241, | |
| "eval_runtime": 4.1326, | |
| "eval_samples_per_second": 105.019, | |
| "eval_steps_per_second": 13.309, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 1.735655737704918, | |
| "grad_norm": 5.218242168426514, | |
| "learning_rate": 1.9305737704918036e-05, | |
| "loss": 1.5079, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 1.735655737704918, | |
| "eval_loss": 1.4480838775634766, | |
| "eval_runtime": 4.13, | |
| "eval_samples_per_second": 105.086, | |
| "eval_steps_per_second": 13.317, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 1.9836065573770492, | |
| "grad_norm": 5.828314304351807, | |
| "learning_rate": 1.920655737704918e-05, | |
| "loss": 1.525, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 1.9836065573770492, | |
| "eval_loss": 1.4295066595077515, | |
| "eval_runtime": 4.129, | |
| "eval_samples_per_second": 105.11, | |
| "eval_steps_per_second": 13.32, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 2.2315573770491803, | |
| "grad_norm": 6.473424911499023, | |
| "learning_rate": 1.910737704918033e-05, | |
| "loss": 1.5396, | |
| "step": 1089 | |
| }, | |
| { | |
| "epoch": 2.2315573770491803, | |
| "eval_loss": 1.4381992816925049, | |
| "eval_runtime": 4.1264, | |
| "eval_samples_per_second": 105.178, | |
| "eval_steps_per_second": 13.329, | |
| "step": 1089 | |
| }, | |
| { | |
| "epoch": 2.4795081967213113, | |
| "grad_norm": 5.705435276031494, | |
| "learning_rate": 1.9008196721311477e-05, | |
| "loss": 1.4863, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.4795081967213113, | |
| "eval_loss": 1.4149571657180786, | |
| "eval_runtime": 4.1261, | |
| "eval_samples_per_second": 105.183, | |
| "eval_steps_per_second": 13.33, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 2.7274590163934427, | |
| "grad_norm": 6.025406837463379, | |
| "learning_rate": 1.8909016393442625e-05, | |
| "loss": 1.497, | |
| "step": 1331 | |
| }, | |
| { | |
| "epoch": 2.7274590163934427, | |
| "eval_loss": 1.4381588697433472, | |
| "eval_runtime": 4.1242, | |
| "eval_samples_per_second": 105.232, | |
| "eval_steps_per_second": 13.336, | |
| "step": 1331 | |
| }, | |
| { | |
| "epoch": 2.9754098360655736, | |
| "grad_norm": 5.826422214508057, | |
| "learning_rate": 1.8809836065573773e-05, | |
| "loss": 1.5218, | |
| "step": 1452 | |
| }, | |
| { | |
| "epoch": 2.9754098360655736, | |
| "eval_loss": 1.4456675052642822, | |
| "eval_runtime": 4.1254, | |
| "eval_samples_per_second": 105.203, | |
| "eval_steps_per_second": 13.332, | |
| "step": 1452 | |
| }, | |
| { | |
| "epoch": 3.223360655737705, | |
| "grad_norm": 5.673182010650635, | |
| "learning_rate": 1.8710655737704918e-05, | |
| "loss": 1.4819, | |
| "step": 1573 | |
| }, | |
| { | |
| "epoch": 3.223360655737705, | |
| "eval_loss": 1.4289182424545288, | |
| "eval_runtime": 4.1236, | |
| "eval_samples_per_second": 105.249, | |
| "eval_steps_per_second": 13.338, | |
| "step": 1573 | |
| }, | |
| { | |
| "epoch": 3.471311475409836, | |
| "grad_norm": 5.801868438720703, | |
| "learning_rate": 1.8611475409836066e-05, | |
| "loss": 1.4607, | |
| "step": 1694 | |
| }, | |
| { | |
| "epoch": 3.471311475409836, | |
| "eval_loss": 1.4265520572662354, | |
| "eval_runtime": 4.1254, | |
| "eval_samples_per_second": 105.202, | |
| "eval_steps_per_second": 13.332, | |
| "step": 1694 | |
| }, | |
| { | |
| "epoch": 3.7192622950819674, | |
| "grad_norm": 6.084210395812988, | |
| "learning_rate": 1.8512295081967214e-05, | |
| "loss": 1.5112, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 3.7192622950819674, | |
| "eval_loss": 1.4315944910049438, | |
| "eval_runtime": 4.1231, | |
| "eval_samples_per_second": 105.26, | |
| "eval_steps_per_second": 13.339, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 3.9672131147540983, | |
| "grad_norm": 5.915734767913818, | |
| "learning_rate": 1.8413114754098362e-05, | |
| "loss": 1.4653, | |
| "step": 1936 | |
| }, | |
| { | |
| "epoch": 3.9672131147540983, | |
| "eval_loss": 1.4230977296829224, | |
| "eval_runtime": 4.1241, | |
| "eval_samples_per_second": 105.235, | |
| "eval_steps_per_second": 13.336, | |
| "step": 1936 | |
| }, | |
| { | |
| "epoch": 4.215163934426229, | |
| "grad_norm": 5.40634822845459, | |
| "learning_rate": 1.831393442622951e-05, | |
| "loss": 1.4445, | |
| "step": 2057 | |
| }, | |
| { | |
| "epoch": 4.215163934426229, | |
| "eval_loss": 1.4059760570526123, | |
| "eval_runtime": 4.1255, | |
| "eval_samples_per_second": 105.2, | |
| "eval_steps_per_second": 13.332, | |
| "step": 2057 | |
| }, | |
| { | |
| "epoch": 4.463114754098361, | |
| "grad_norm": 5.65101432800293, | |
| "learning_rate": 1.8214754098360655e-05, | |
| "loss": 1.4843, | |
| "step": 2178 | |
| }, | |
| { | |
| "epoch": 4.463114754098361, | |
| "eval_loss": 1.4199328422546387, | |
| "eval_runtime": 4.1264, | |
| "eval_samples_per_second": 105.178, | |
| "eval_steps_per_second": 13.329, | |
| "step": 2178 | |
| }, | |
| { | |
| "epoch": 4.711065573770492, | |
| "grad_norm": 5.768662452697754, | |
| "learning_rate": 1.8115573770491807e-05, | |
| "loss": 1.4737, | |
| "step": 2299 | |
| }, | |
| { | |
| "epoch": 4.711065573770492, | |
| "eval_loss": 1.4175463914871216, | |
| "eval_runtime": 4.1255, | |
| "eval_samples_per_second": 105.2, | |
| "eval_steps_per_second": 13.332, | |
| "step": 2299 | |
| }, | |
| { | |
| "epoch": 4.959016393442623, | |
| "grad_norm": 5.345828056335449, | |
| "learning_rate": 1.8016393442622952e-05, | |
| "loss": 1.4308, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 4.959016393442623, | |
| "eval_loss": 1.3921349048614502, | |
| "eval_runtime": 4.1245, | |
| "eval_samples_per_second": 105.224, | |
| "eval_steps_per_second": 13.335, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 5.206967213114754, | |
| "grad_norm": 5.558488845825195, | |
| "learning_rate": 1.79172131147541e-05, | |
| "loss": 1.4288, | |
| "step": 2541 | |
| }, | |
| { | |
| "epoch": 5.206967213114754, | |
| "eval_loss": 1.4271553754806519, | |
| "eval_runtime": 4.123, | |
| "eval_samples_per_second": 105.263, | |
| "eval_steps_per_second": 13.34, | |
| "step": 2541 | |
| }, | |
| { | |
| "epoch": 5.454918032786885, | |
| "grad_norm": 5.293428897857666, | |
| "learning_rate": 1.7818032786885248e-05, | |
| "loss": 1.4291, | |
| "step": 2662 | |
| }, | |
| { | |
| "epoch": 5.454918032786885, | |
| "eval_loss": 1.430025577545166, | |
| "eval_runtime": 4.1248, | |
| "eval_samples_per_second": 105.218, | |
| "eval_steps_per_second": 13.334, | |
| "step": 2662 | |
| }, | |
| { | |
| "epoch": 5.702868852459017, | |
| "grad_norm": 5.373508453369141, | |
| "learning_rate": 1.7718852459016396e-05, | |
| "loss": 1.4616, | |
| "step": 2783 | |
| }, | |
| { | |
| "epoch": 5.702868852459017, | |
| "eval_loss": 1.452791452407837, | |
| "eval_runtime": 4.1221, | |
| "eval_samples_per_second": 105.286, | |
| "eval_steps_per_second": 13.343, | |
| "step": 2783 | |
| }, | |
| { | |
| "epoch": 5.950819672131147, | |
| "grad_norm": 4.782499313354492, | |
| "learning_rate": 1.761967213114754e-05, | |
| "loss": 1.4241, | |
| "step": 2904 | |
| }, | |
| { | |
| "epoch": 5.950819672131147, | |
| "eval_loss": 1.4025797843933105, | |
| "eval_runtime": 4.1264, | |
| "eval_samples_per_second": 105.176, | |
| "eval_steps_per_second": 13.329, | |
| "step": 2904 | |
| }, | |
| { | |
| "epoch": 6.198770491803279, | |
| "grad_norm": 5.344179630279541, | |
| "learning_rate": 1.752049180327869e-05, | |
| "loss": 1.4187, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 6.198770491803279, | |
| "eval_loss": 1.4222002029418945, | |
| "eval_runtime": 4.1238, | |
| "eval_samples_per_second": 105.242, | |
| "eval_steps_per_second": 13.337, | |
| "step": 3025 | |
| }, | |
| { | |
| "epoch": 6.44672131147541, | |
| "grad_norm": 5.498497009277344, | |
| "learning_rate": 1.7421311475409838e-05, | |
| "loss": 1.4179, | |
| "step": 3146 | |
| }, | |
| { | |
| "epoch": 6.44672131147541, | |
| "eval_loss": 1.425389051437378, | |
| "eval_runtime": 4.1255, | |
| "eval_samples_per_second": 105.201, | |
| "eval_steps_per_second": 13.332, | |
| "step": 3146 | |
| }, | |
| { | |
| "epoch": 6.6946721311475414, | |
| "grad_norm": 6.103499412536621, | |
| "learning_rate": 1.7322131147540986e-05, | |
| "loss": 1.4036, | |
| "step": 3267 | |
| }, | |
| { | |
| "epoch": 6.6946721311475414, | |
| "eval_loss": 1.4255503416061401, | |
| "eval_runtime": 4.1267, | |
| "eval_samples_per_second": 105.168, | |
| "eval_steps_per_second": 13.328, | |
| "step": 3267 | |
| }, | |
| { | |
| "epoch": 6.942622950819672, | |
| "grad_norm": 5.6141462326049805, | |
| "learning_rate": 1.7222950819672134e-05, | |
| "loss": 1.4237, | |
| "step": 3388 | |
| }, | |
| { | |
| "epoch": 6.942622950819672, | |
| "eval_loss": 1.4209522008895874, | |
| "eval_runtime": 4.1246, | |
| "eval_samples_per_second": 105.221, | |
| "eval_steps_per_second": 13.334, | |
| "step": 3388 | |
| }, | |
| { | |
| "epoch": 7.190573770491803, | |
| "grad_norm": 6.086358547210693, | |
| "learning_rate": 1.712377049180328e-05, | |
| "loss": 1.3858, | |
| "step": 3509 | |
| }, | |
| { | |
| "epoch": 7.190573770491803, | |
| "eval_loss": 1.4121168851852417, | |
| "eval_runtime": 4.1269, | |
| "eval_samples_per_second": 105.163, | |
| "eval_steps_per_second": 13.327, | |
| "step": 3509 | |
| }, | |
| { | |
| "epoch": 7.438524590163935, | |
| "grad_norm": 5.1807637214660645, | |
| "learning_rate": 1.7024590163934427e-05, | |
| "loss": 1.4206, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 7.438524590163935, | |
| "eval_loss": 1.421557903289795, | |
| "eval_runtime": 4.1232, | |
| "eval_samples_per_second": 105.259, | |
| "eval_steps_per_second": 13.339, | |
| "step": 3630 | |
| }, | |
| { | |
| "epoch": 7.686475409836065, | |
| "grad_norm": 5.515040397644043, | |
| "learning_rate": 1.6925409836065575e-05, | |
| "loss": 1.4213, | |
| "step": 3751 | |
| }, | |
| { | |
| "epoch": 7.686475409836065, | |
| "eval_loss": 1.392737627029419, | |
| "eval_runtime": 4.1243, | |
| "eval_samples_per_second": 105.23, | |
| "eval_steps_per_second": 13.336, | |
| "step": 3751 | |
| }, | |
| { | |
| "epoch": 7.934426229508197, | |
| "grad_norm": 5.375150203704834, | |
| "learning_rate": 1.6826229508196723e-05, | |
| "loss": 1.3999, | |
| "step": 3872 | |
| }, | |
| { | |
| "epoch": 7.934426229508197, | |
| "eval_loss": 1.425716519355774, | |
| "eval_runtime": 4.1231, | |
| "eval_samples_per_second": 105.259, | |
| "eval_steps_per_second": 13.339, | |
| "step": 3872 | |
| }, | |
| { | |
| "epoch": 8.182377049180328, | |
| "grad_norm": 5.733981609344482, | |
| "learning_rate": 1.672704918032787e-05, | |
| "loss": 1.4004, | |
| "step": 3993 | |
| }, | |
| { | |
| "epoch": 8.182377049180328, | |
| "eval_loss": 1.4038772583007812, | |
| "eval_runtime": 4.122, | |
| "eval_samples_per_second": 105.29, | |
| "eval_steps_per_second": 13.343, | |
| "step": 3993 | |
| }, | |
| { | |
| "epoch": 8.430327868852459, | |
| "grad_norm": 5.9191083908081055, | |
| "learning_rate": 1.6627868852459016e-05, | |
| "loss": 1.3726, | |
| "step": 4114 | |
| }, | |
| { | |
| "epoch": 8.430327868852459, | |
| "eval_loss": 1.3957583904266357, | |
| "eval_runtime": 4.122, | |
| "eval_samples_per_second": 105.288, | |
| "eval_steps_per_second": 13.343, | |
| "step": 4114 | |
| }, | |
| { | |
| "epoch": 8.67827868852459, | |
| "grad_norm": 5.487318992614746, | |
| "learning_rate": 1.6528688524590168e-05, | |
| "loss": 1.385, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 8.67827868852459, | |
| "eval_loss": 1.457805871963501, | |
| "eval_runtime": 4.177, | |
| "eval_samples_per_second": 103.902, | |
| "eval_steps_per_second": 13.167, | |
| "step": 4235 | |
| }, | |
| { | |
| "epoch": 8.926229508196721, | |
| "grad_norm": 5.596704959869385, | |
| "learning_rate": 1.6429508196721313e-05, | |
| "loss": 1.401, | |
| "step": 4356 | |
| }, | |
| { | |
| "epoch": 8.926229508196721, | |
| "eval_loss": 1.3986533880233765, | |
| "eval_runtime": 4.1733, | |
| "eval_samples_per_second": 103.994, | |
| "eval_steps_per_second": 13.179, | |
| "step": 4356 | |
| }, | |
| { | |
| "epoch": 9.174180327868852, | |
| "grad_norm": 5.978359222412109, | |
| "learning_rate": 1.633032786885246e-05, | |
| "loss": 1.3846, | |
| "step": 4477 | |
| }, | |
| { | |
| "epoch": 9.174180327868852, | |
| "eval_loss": 1.4179773330688477, | |
| "eval_runtime": 4.1466, | |
| "eval_samples_per_second": 104.664, | |
| "eval_steps_per_second": 13.264, | |
| "step": 4477 | |
| }, | |
| { | |
| "epoch": 9.422131147540984, | |
| "grad_norm": 5.524637222290039, | |
| "learning_rate": 1.6231147540983606e-05, | |
| "loss": 1.3677, | |
| "step": 4598 | |
| }, | |
| { | |
| "epoch": 9.422131147540984, | |
| "eval_loss": 1.41084623336792, | |
| "eval_runtime": 4.1689, | |
| "eval_samples_per_second": 104.103, | |
| "eval_steps_per_second": 13.193, | |
| "step": 4598 | |
| }, | |
| { | |
| "epoch": 9.670081967213115, | |
| "grad_norm": 5.55858039855957, | |
| "learning_rate": 1.6131967213114757e-05, | |
| "loss": 1.366, | |
| "step": 4719 | |
| }, | |
| { | |
| "epoch": 9.670081967213115, | |
| "eval_loss": 1.404625415802002, | |
| "eval_runtime": 4.1425, | |
| "eval_samples_per_second": 104.768, | |
| "eval_steps_per_second": 13.277, | |
| "step": 4719 | |
| }, | |
| { | |
| "epoch": 9.918032786885245, | |
| "grad_norm": 5.1674275398254395, | |
| "learning_rate": 1.6032786885245902e-05, | |
| "loss": 1.3817, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 9.918032786885245, | |
| "eval_loss": 1.4428249597549438, | |
| "eval_runtime": 4.1564, | |
| "eval_samples_per_second": 104.418, | |
| "eval_steps_per_second": 13.233, | |
| "step": 4840 | |
| }, | |
| { | |
| "epoch": 10.165983606557377, | |
| "grad_norm": 5.132227897644043, | |
| "learning_rate": 1.593360655737705e-05, | |
| "loss": 1.3653, | |
| "step": 4961 | |
| }, | |
| { | |
| "epoch": 10.165983606557377, | |
| "eval_loss": 1.3900526762008667, | |
| "eval_runtime": 4.153, | |
| "eval_samples_per_second": 104.502, | |
| "eval_steps_per_second": 13.243, | |
| "step": 4961 | |
| }, | |
| { | |
| "epoch": 10.413934426229508, | |
| "grad_norm": 6.354508876800537, | |
| "learning_rate": 1.5834426229508198e-05, | |
| "loss": 1.3543, | |
| "step": 5082 | |
| }, | |
| { | |
| "epoch": 10.413934426229508, | |
| "eval_loss": 1.415820598602295, | |
| "eval_runtime": 4.155, | |
| "eval_samples_per_second": 104.452, | |
| "eval_steps_per_second": 13.237, | |
| "step": 5082 | |
| }, | |
| { | |
| "epoch": 10.66188524590164, | |
| "grad_norm": 5.173457622528076, | |
| "learning_rate": 1.5735245901639346e-05, | |
| "loss": 1.3692, | |
| "step": 5203 | |
| }, | |
| { | |
| "epoch": 10.66188524590164, | |
| "eval_loss": 1.3986164331436157, | |
| "eval_runtime": 4.1614, | |
| "eval_samples_per_second": 104.292, | |
| "eval_steps_per_second": 13.217, | |
| "step": 5203 | |
| }, | |
| { | |
| "epoch": 10.90983606557377, | |
| "grad_norm": 5.250792026519775, | |
| "learning_rate": 1.5636065573770495e-05, | |
| "loss": 1.364, | |
| "step": 5324 | |
| }, | |
| { | |
| "epoch": 10.90983606557377, | |
| "eval_loss": 1.4086618423461914, | |
| "eval_runtime": 4.1657, | |
| "eval_samples_per_second": 104.185, | |
| "eval_steps_per_second": 13.203, | |
| "step": 5324 | |
| }, | |
| { | |
| "epoch": 11.157786885245901, | |
| "grad_norm": 4.989199161529541, | |
| "learning_rate": 1.553688524590164e-05, | |
| "loss": 1.3471, | |
| "step": 5445 | |
| }, | |
| { | |
| "epoch": 11.157786885245901, | |
| "eval_loss": 1.4073328971862793, | |
| "eval_runtime": 4.1729, | |
| "eval_samples_per_second": 104.003, | |
| "eval_steps_per_second": 13.18, | |
| "step": 5445 | |
| }, | |
| { | |
| "epoch": 11.405737704918034, | |
| "grad_norm": 5.10518741607666, | |
| "learning_rate": 1.5437704918032788e-05, | |
| "loss": 1.3468, | |
| "step": 5566 | |
| }, | |
| { | |
| "epoch": 11.405737704918034, | |
| "eval_loss": 1.37214994430542, | |
| "eval_runtime": 4.1352, | |
| "eval_samples_per_second": 104.953, | |
| "eval_steps_per_second": 13.3, | |
| "step": 5566 | |
| }, | |
| { | |
| "epoch": 11.653688524590164, | |
| "grad_norm": 5.374734878540039, | |
| "learning_rate": 1.5338524590163936e-05, | |
| "loss": 1.3465, | |
| "step": 5687 | |
| }, | |
| { | |
| "epoch": 11.653688524590164, | |
| "eval_loss": 1.3922464847564697, | |
| "eval_runtime": 4.1462, | |
| "eval_samples_per_second": 104.675, | |
| "eval_steps_per_second": 13.265, | |
| "step": 5687 | |
| }, | |
| { | |
| "epoch": 11.901639344262295, | |
| "grad_norm": 5.95257568359375, | |
| "learning_rate": 1.5239344262295084e-05, | |
| "loss": 1.3421, | |
| "step": 5808 | |
| }, | |
| { | |
| "epoch": 11.901639344262295, | |
| "eval_loss": 1.4019742012023926, | |
| "eval_runtime": 4.1249, | |
| "eval_samples_per_second": 105.214, | |
| "eval_steps_per_second": 13.334, | |
| "step": 5808 | |
| }, | |
| { | |
| "epoch": 12.149590163934427, | |
| "grad_norm": 5.823298931121826, | |
| "learning_rate": 1.514016393442623e-05, | |
| "loss": 1.3324, | |
| "step": 5929 | |
| }, | |
| { | |
| "epoch": 12.149590163934427, | |
| "eval_loss": 1.4440032243728638, | |
| "eval_runtime": 4.1244, | |
| "eval_samples_per_second": 105.227, | |
| "eval_steps_per_second": 13.335, | |
| "step": 5929 | |
| }, | |
| { | |
| "epoch": 12.397540983606557, | |
| "grad_norm": 5.080052375793457, | |
| "learning_rate": 1.5040983606557377e-05, | |
| "loss": 1.2988, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 12.397540983606557, | |
| "eval_loss": 1.3977147340774536, | |
| "eval_runtime": 4.1245, | |
| "eval_samples_per_second": 105.225, | |
| "eval_steps_per_second": 13.335, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 12.645491803278688, | |
| "grad_norm": 5.39788293838501, | |
| "learning_rate": 1.4941803278688525e-05, | |
| "loss": 1.3222, | |
| "step": 6171 | |
| }, | |
| { | |
| "epoch": 12.645491803278688, | |
| "eval_loss": 1.4123514890670776, | |
| "eval_runtime": 4.1278, | |
| "eval_samples_per_second": 105.14, | |
| "eval_steps_per_second": 13.324, | |
| "step": 6171 | |
| }, | |
| { | |
| "epoch": 12.89344262295082, | |
| "grad_norm": 5.456956386566162, | |
| "learning_rate": 1.4842622950819673e-05, | |
| "loss": 1.3274, | |
| "step": 6292 | |
| }, | |
| { | |
| "epoch": 12.89344262295082, | |
| "eval_loss": 1.3916068077087402, | |
| "eval_runtime": 4.1237, | |
| "eval_samples_per_second": 105.245, | |
| "eval_steps_per_second": 13.338, | |
| "step": 6292 | |
| }, | |
| { | |
| "epoch": 13.14139344262295, | |
| "grad_norm": 6.0245490074157715, | |
| "learning_rate": 1.4743442622950822e-05, | |
| "loss": 1.3313, | |
| "step": 6413 | |
| }, | |
| { | |
| "epoch": 13.14139344262295, | |
| "eval_loss": 1.419415831565857, | |
| "eval_runtime": 4.1248, | |
| "eval_samples_per_second": 105.218, | |
| "eval_steps_per_second": 13.334, | |
| "step": 6413 | |
| }, | |
| { | |
| "epoch": 13.389344262295081, | |
| "grad_norm": 5.695260524749756, | |
| "learning_rate": 1.4644262295081968e-05, | |
| "loss": 1.3364, | |
| "step": 6534 | |
| }, | |
| { | |
| "epoch": 13.389344262295081, | |
| "eval_loss": 1.4173094034194946, | |
| "eval_runtime": 4.1242, | |
| "eval_samples_per_second": 105.233, | |
| "eval_steps_per_second": 13.336, | |
| "step": 6534 | |
| }, | |
| { | |
| "epoch": 13.637295081967213, | |
| "grad_norm": 5.108427047729492, | |
| "learning_rate": 1.4545081967213115e-05, | |
| "loss": 1.3057, | |
| "step": 6655 | |
| }, | |
| { | |
| "epoch": 13.637295081967213, | |
| "eval_loss": 1.4033712148666382, | |
| "eval_runtime": 4.1221, | |
| "eval_samples_per_second": 105.287, | |
| "eval_steps_per_second": 13.343, | |
| "step": 6655 | |
| }, | |
| { | |
| "epoch": 13.885245901639344, | |
| "grad_norm": 7.01995849609375, | |
| "learning_rate": 1.4445901639344264e-05, | |
| "loss": 1.3075, | |
| "step": 6776 | |
| }, | |
| { | |
| "epoch": 13.885245901639344, | |
| "eval_loss": 1.4299310445785522, | |
| "eval_runtime": 4.1219, | |
| "eval_samples_per_second": 105.291, | |
| "eval_steps_per_second": 13.343, | |
| "step": 6776 | |
| }, | |
| { | |
| "epoch": 14.133196721311476, | |
| "grad_norm": 6.163614749908447, | |
| "learning_rate": 1.4346721311475411e-05, | |
| "loss": 1.319, | |
| "step": 6897 | |
| }, | |
| { | |
| "epoch": 14.133196721311476, | |
| "eval_loss": 1.4211634397506714, | |
| "eval_runtime": 4.1221, | |
| "eval_samples_per_second": 105.286, | |
| "eval_steps_per_second": 13.343, | |
| "step": 6897 | |
| }, | |
| { | |
| "epoch": 14.381147540983607, | |
| "grad_norm": 4.972089767456055, | |
| "learning_rate": 1.4247540983606557e-05, | |
| "loss": 1.2971, | |
| "step": 7018 | |
| }, | |
| { | |
| "epoch": 14.381147540983607, | |
| "eval_loss": 1.4057807922363281, | |
| "eval_runtime": 4.125, | |
| "eval_samples_per_second": 105.211, | |
| "eval_steps_per_second": 13.333, | |
| "step": 7018 | |
| }, | |
| { | |
| "epoch": 14.629098360655737, | |
| "grad_norm": 5.575927734375, | |
| "learning_rate": 1.4148360655737706e-05, | |
| "loss": 1.2984, | |
| "step": 7139 | |
| }, | |
| { | |
| "epoch": 14.629098360655737, | |
| "eval_loss": 1.4220715761184692, | |
| "eval_runtime": 4.1218, | |
| "eval_samples_per_second": 105.294, | |
| "eval_steps_per_second": 13.344, | |
| "step": 7139 | |
| }, | |
| { | |
| "epoch": 14.87704918032787, | |
| "grad_norm": 5.530676364898682, | |
| "learning_rate": 1.4049180327868854e-05, | |
| "loss": 1.2919, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 14.87704918032787, | |
| "eval_loss": 1.410522222518921, | |
| "eval_runtime": 4.122, | |
| "eval_samples_per_second": 105.288, | |
| "eval_steps_per_second": 13.343, | |
| "step": 7260 | |
| }, | |
| { | |
| "epoch": 15.125, | |
| "grad_norm": 5.002058029174805, | |
| "learning_rate": 1.3950000000000002e-05, | |
| "loss": 1.2886, | |
| "step": 7381 | |
| }, | |
| { | |
| "epoch": 15.125, | |
| "eval_loss": 1.3917385339736938, | |
| "eval_runtime": 4.1253, | |
| "eval_samples_per_second": 105.203, | |
| "eval_steps_per_second": 13.332, | |
| "step": 7381 | |
| }, | |
| { | |
| "epoch": 15.37295081967213, | |
| "grad_norm": 4.9464874267578125, | |
| "learning_rate": 1.3850819672131148e-05, | |
| "loss": 1.3125, | |
| "step": 7502 | |
| }, | |
| { | |
| "epoch": 15.37295081967213, | |
| "eval_loss": 1.3845915794372559, | |
| "eval_runtime": 4.1253, | |
| "eval_samples_per_second": 105.204, | |
| "eval_steps_per_second": 13.332, | |
| "step": 7502 | |
| }, | |
| { | |
| "epoch": 15.620901639344263, | |
| "grad_norm": 5.266591548919678, | |
| "learning_rate": 1.3751639344262295e-05, | |
| "loss": 1.2923, | |
| "step": 7623 | |
| }, | |
| { | |
| "epoch": 15.620901639344263, | |
| "eval_loss": 1.4047952890396118, | |
| "eval_runtime": 4.1253, | |
| "eval_samples_per_second": 105.205, | |
| "eval_steps_per_second": 13.332, | |
| "step": 7623 | |
| }, | |
| { | |
| "epoch": 15.868852459016393, | |
| "grad_norm": 5.416749000549316, | |
| "learning_rate": 1.3652459016393445e-05, | |
| "loss": 1.3015, | |
| "step": 7744 | |
| }, | |
| { | |
| "epoch": 15.868852459016393, | |
| "eval_loss": 1.428416132926941, | |
| "eval_runtime": 4.1255, | |
| "eval_samples_per_second": 105.2, | |
| "eval_steps_per_second": 13.332, | |
| "step": 7744 | |
| }, | |
| { | |
| "epoch": 16.116803278688526, | |
| "grad_norm": 4.929660797119141, | |
| "learning_rate": 1.3553278688524591e-05, | |
| "loss": 1.257, | |
| "step": 7865 | |
| }, | |
| { | |
| "epoch": 16.116803278688526, | |
| "eval_loss": 1.4093120098114014, | |
| "eval_runtime": 4.1273, | |
| "eval_samples_per_second": 105.154, | |
| "eval_steps_per_second": 13.326, | |
| "step": 7865 | |
| }, | |
| { | |
| "epoch": 16.364754098360656, | |
| "grad_norm": 5.573082447052002, | |
| "learning_rate": 1.3454098360655738e-05, | |
| "loss": 1.2802, | |
| "step": 7986 | |
| }, | |
| { | |
| "epoch": 16.364754098360656, | |
| "eval_loss": 1.4037296772003174, | |
| "eval_runtime": 4.125, | |
| "eval_samples_per_second": 105.212, | |
| "eval_steps_per_second": 13.333, | |
| "step": 7986 | |
| }, | |
| { | |
| "epoch": 16.612704918032787, | |
| "grad_norm": 5.440979480743408, | |
| "learning_rate": 1.3354918032786886e-05, | |
| "loss": 1.2671, | |
| "step": 8107 | |
| }, | |
| { | |
| "epoch": 16.612704918032787, | |
| "eval_loss": 1.392979383468628, | |
| "eval_runtime": 4.123, | |
| "eval_samples_per_second": 105.264, | |
| "eval_steps_per_second": 13.34, | |
| "step": 8107 | |
| }, | |
| { | |
| "epoch": 16.860655737704917, | |
| "grad_norm": 6.133276462554932, | |
| "learning_rate": 1.3255737704918034e-05, | |
| "loss": 1.2816, | |
| "step": 8228 | |
| }, | |
| { | |
| "epoch": 16.860655737704917, | |
| "eval_loss": 1.3886833190917969, | |
| "eval_runtime": 4.1222, | |
| "eval_samples_per_second": 105.284, | |
| "eval_steps_per_second": 13.342, | |
| "step": 8228 | |
| }, | |
| { | |
| "epoch": 17.108606557377048, | |
| "grad_norm": 5.235976219177246, | |
| "learning_rate": 1.3156557377049182e-05, | |
| "loss": 1.2669, | |
| "step": 8349 | |
| }, | |
| { | |
| "epoch": 17.108606557377048, | |
| "eval_loss": 1.4218270778656006, | |
| "eval_runtime": 4.1262, | |
| "eval_samples_per_second": 105.181, | |
| "eval_steps_per_second": 13.329, | |
| "step": 8349 | |
| }, | |
| { | |
| "epoch": 17.35655737704918, | |
| "grad_norm": 5.8139519691467285, | |
| "learning_rate": 1.3057377049180329e-05, | |
| "loss": 1.2582, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 17.35655737704918, | |
| "eval_loss": 1.419556736946106, | |
| "eval_runtime": 4.1304, | |
| "eval_samples_per_second": 105.074, | |
| "eval_steps_per_second": 13.316, | |
| "step": 8470 | |
| }, | |
| { | |
| "epoch": 17.604508196721312, | |
| "grad_norm": 4.849159240722656, | |
| "learning_rate": 1.2958196721311475e-05, | |
| "loss": 1.2908, | |
| "step": 8591 | |
| }, | |
| { | |
| "epoch": 17.604508196721312, | |
| "eval_loss": 1.4165232181549072, | |
| "eval_runtime": 4.1257, | |
| "eval_samples_per_second": 105.193, | |
| "eval_steps_per_second": 13.331, | |
| "step": 8591 | |
| }, | |
| { | |
| "epoch": 17.852459016393443, | |
| "grad_norm": 5.0550360679626465, | |
| "learning_rate": 1.2859016393442625e-05, | |
| "loss": 1.2312, | |
| "step": 8712 | |
| }, | |
| { | |
| "epoch": 17.852459016393443, | |
| "eval_loss": 1.4022456407546997, | |
| "eval_runtime": 4.1255, | |
| "eval_samples_per_second": 105.198, | |
| "eval_steps_per_second": 13.332, | |
| "step": 8712 | |
| }, | |
| { | |
| "epoch": 18.100409836065573, | |
| "grad_norm": 5.549849987030029, | |
| "learning_rate": 1.2759836065573772e-05, | |
| "loss": 1.2664, | |
| "step": 8833 | |
| }, | |
| { | |
| "epoch": 18.100409836065573, | |
| "eval_loss": 1.3795406818389893, | |
| "eval_runtime": 4.1227, | |
| "eval_samples_per_second": 105.27, | |
| "eval_steps_per_second": 13.341, | |
| "step": 8833 | |
| }, | |
| { | |
| "epoch": 18.348360655737704, | |
| "grad_norm": 5.974192142486572, | |
| "learning_rate": 1.2660655737704918e-05, | |
| "loss": 1.25, | |
| "step": 8954 | |
| }, | |
| { | |
| "epoch": 18.348360655737704, | |
| "eval_loss": 1.4104958772659302, | |
| "eval_runtime": 4.1253, | |
| "eval_samples_per_second": 105.204, | |
| "eval_steps_per_second": 13.332, | |
| "step": 8954 | |
| }, | |
| { | |
| "epoch": 18.596311475409838, | |
| "grad_norm": 5.209727764129639, | |
| "learning_rate": 1.2561475409836066e-05, | |
| "loss": 1.259, | |
| "step": 9075 | |
| }, | |
| { | |
| "epoch": 18.596311475409838, | |
| "eval_loss": 1.3699523210525513, | |
| "eval_runtime": 4.1271, | |
| "eval_samples_per_second": 105.159, | |
| "eval_steps_per_second": 13.327, | |
| "step": 9075 | |
| }, | |
| { | |
| "epoch": 18.84426229508197, | |
| "grad_norm": 5.296604156494141, | |
| "learning_rate": 1.2462295081967215e-05, | |
| "loss": 1.2593, | |
| "step": 9196 | |
| }, | |
| { | |
| "epoch": 18.84426229508197, | |
| "eval_loss": 1.3999890089035034, | |
| "eval_runtime": 4.1243, | |
| "eval_samples_per_second": 105.231, | |
| "eval_steps_per_second": 13.336, | |
| "step": 9196 | |
| }, | |
| { | |
| "epoch": 19.0922131147541, | |
| "grad_norm": 5.599063396453857, | |
| "learning_rate": 1.2363114754098363e-05, | |
| "loss": 1.2479, | |
| "step": 9317 | |
| }, | |
| { | |
| "epoch": 19.0922131147541, | |
| "eval_loss": 1.4049623012542725, | |
| "eval_runtime": 4.1272, | |
| "eval_samples_per_second": 105.155, | |
| "eval_steps_per_second": 13.326, | |
| "step": 9317 | |
| }, | |
| { | |
| "epoch": 19.34016393442623, | |
| "grad_norm": 5.4184346199035645, | |
| "learning_rate": 1.226393442622951e-05, | |
| "loss": 1.2436, | |
| "step": 9438 | |
| }, | |
| { | |
| "epoch": 19.34016393442623, | |
| "eval_loss": 1.4119490385055542, | |
| "eval_runtime": 4.1249, | |
| "eval_samples_per_second": 105.213, | |
| "eval_steps_per_second": 13.334, | |
| "step": 9438 | |
| }, | |
| { | |
| "epoch": 19.58811475409836, | |
| "grad_norm": 4.9451189041137695, | |
| "learning_rate": 1.2164754098360656e-05, | |
| "loss": 1.2554, | |
| "step": 9559 | |
| }, | |
| { | |
| "epoch": 19.58811475409836, | |
| "eval_loss": 1.3934992551803589, | |
| "eval_runtime": 4.1257, | |
| "eval_samples_per_second": 105.195, | |
| "eval_steps_per_second": 13.331, | |
| "step": 9559 | |
| }, | |
| { | |
| "epoch": 19.83606557377049, | |
| "grad_norm": 5.6321845054626465, | |
| "learning_rate": 1.2065573770491806e-05, | |
| "loss": 1.2534, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 19.83606557377049, | |
| "eval_loss": 1.3982270956039429, | |
| "eval_runtime": 4.1253, | |
| "eval_samples_per_second": 105.206, | |
| "eval_steps_per_second": 13.333, | |
| "step": 9680 | |
| }, | |
| { | |
| "epoch": 20.084016393442624, | |
| "grad_norm": 5.988424301147461, | |
| "learning_rate": 1.1966393442622952e-05, | |
| "loss": 1.2402, | |
| "step": 9801 | |
| }, | |
| { | |
| "epoch": 20.084016393442624, | |
| "eval_loss": 1.4115673303604126, | |
| "eval_runtime": 4.1245, | |
| "eval_samples_per_second": 105.225, | |
| "eval_steps_per_second": 13.335, | |
| "step": 9801 | |
| }, | |
| { | |
| "epoch": 20.331967213114755, | |
| "grad_norm": 5.451227188110352, | |
| "learning_rate": 1.1867213114754099e-05, | |
| "loss": 1.2298, | |
| "step": 9922 | |
| }, | |
| { | |
| "epoch": 20.331967213114755, | |
| "eval_loss": 1.381482720375061, | |
| "eval_runtime": 4.1454, | |
| "eval_samples_per_second": 104.695, | |
| "eval_steps_per_second": 13.268, | |
| "step": 9922 | |
| }, | |
| { | |
| "epoch": 20.579918032786885, | |
| "grad_norm": 5.616585731506348, | |
| "learning_rate": 1.1768032786885247e-05, | |
| "loss": 1.2807, | |
| "step": 10043 | |
| }, | |
| { | |
| "epoch": 20.579918032786885, | |
| "eval_loss": 1.380122423171997, | |
| "eval_runtime": 4.1243, | |
| "eval_samples_per_second": 105.229, | |
| "eval_steps_per_second": 13.336, | |
| "step": 10043 | |
| }, | |
| { | |
| "epoch": 20.827868852459016, | |
| "grad_norm": 4.926662445068359, | |
| "learning_rate": 1.1668852459016395e-05, | |
| "loss": 1.2016, | |
| "step": 10164 | |
| }, | |
| { | |
| "epoch": 20.827868852459016, | |
| "eval_loss": 1.365037202835083, | |
| "eval_runtime": 4.1243, | |
| "eval_samples_per_second": 105.23, | |
| "eval_steps_per_second": 13.336, | |
| "step": 10164 | |
| }, | |
| { | |
| "epoch": 21.075819672131146, | |
| "grad_norm": 5.1953959465026855, | |
| "learning_rate": 1.1569672131147543e-05, | |
| "loss": 1.2534, | |
| "step": 10285 | |
| }, | |
| { | |
| "epoch": 21.075819672131146, | |
| "eval_loss": 1.419316291809082, | |
| "eval_runtime": 4.1263, | |
| "eval_samples_per_second": 105.18, | |
| "eval_steps_per_second": 13.329, | |
| "step": 10285 | |
| }, | |
| { | |
| "epoch": 21.32377049180328, | |
| "grad_norm": 5.846003532409668, | |
| "learning_rate": 1.147049180327869e-05, | |
| "loss": 1.2345, | |
| "step": 10406 | |
| }, | |
| { | |
| "epoch": 21.32377049180328, | |
| "eval_loss": 1.4061498641967773, | |
| "eval_runtime": 4.1277, | |
| "eval_samples_per_second": 105.143, | |
| "eval_steps_per_second": 13.325, | |
| "step": 10406 | |
| }, | |
| { | |
| "epoch": 21.57172131147541, | |
| "grad_norm": 5.565953731536865, | |
| "learning_rate": 1.1371311475409836e-05, | |
| "loss": 1.229, | |
| "step": 10527 | |
| }, | |
| { | |
| "epoch": 21.57172131147541, | |
| "eval_loss": 1.3947559595108032, | |
| "eval_runtime": 4.1245, | |
| "eval_samples_per_second": 105.226, | |
| "eval_steps_per_second": 13.335, | |
| "step": 10527 | |
| }, | |
| { | |
| "epoch": 21.81967213114754, | |
| "grad_norm": 4.984577655792236, | |
| "learning_rate": 1.1272131147540986e-05, | |
| "loss": 1.2148, | |
| "step": 10648 | |
| }, | |
| { | |
| "epoch": 21.81967213114754, | |
| "eval_loss": 1.398898720741272, | |
| "eval_runtime": 4.1264, | |
| "eval_samples_per_second": 105.176, | |
| "eval_steps_per_second": 13.329, | |
| "step": 10648 | |
| }, | |
| { | |
| "epoch": 22.067622950819672, | |
| "grad_norm": 6.103436470031738, | |
| "learning_rate": 1.1172950819672133e-05, | |
| "loss": 1.2339, | |
| "step": 10769 | |
| }, | |
| { | |
| "epoch": 22.067622950819672, | |
| "eval_loss": 1.4189534187316895, | |
| "eval_runtime": 4.1257, | |
| "eval_samples_per_second": 105.193, | |
| "eval_steps_per_second": 13.331, | |
| "step": 10769 | |
| }, | |
| { | |
| "epoch": 22.315573770491802, | |
| "grad_norm": 5.6105732917785645, | |
| "learning_rate": 1.1073770491803279e-05, | |
| "loss": 1.2476, | |
| "step": 10890 | |
| }, | |
| { | |
| "epoch": 22.315573770491802, | |
| "eval_loss": 1.4122945070266724, | |
| "eval_runtime": 4.1262, | |
| "eval_samples_per_second": 105.181, | |
| "eval_steps_per_second": 13.329, | |
| "step": 10890 | |
| }, | |
| { | |
| "epoch": 22.563524590163933, | |
| "grad_norm": 6.014953136444092, | |
| "learning_rate": 1.0974590163934427e-05, | |
| "loss": 1.2173, | |
| "step": 11011 | |
| }, | |
| { | |
| "epoch": 22.563524590163933, | |
| "eval_loss": 1.417721152305603, | |
| "eval_runtime": 4.1262, | |
| "eval_samples_per_second": 105.182, | |
| "eval_steps_per_second": 13.33, | |
| "step": 11011 | |
| }, | |
| { | |
| "epoch": 22.811475409836067, | |
| "grad_norm": 5.092090129852295, | |
| "learning_rate": 1.0875409836065575e-05, | |
| "loss": 1.2119, | |
| "step": 11132 | |
| }, | |
| { | |
| "epoch": 22.811475409836067, | |
| "eval_loss": 1.4074928760528564, | |
| "eval_runtime": 4.1242, | |
| "eval_samples_per_second": 105.233, | |
| "eval_steps_per_second": 13.336, | |
| "step": 11132 | |
| }, | |
| { | |
| "epoch": 23.059426229508198, | |
| "grad_norm": 5.808371067047119, | |
| "learning_rate": 1.0776229508196724e-05, | |
| "loss": 1.2049, | |
| "step": 11253 | |
| }, | |
| { | |
| "epoch": 23.059426229508198, | |
| "eval_loss": 1.4072319269180298, | |
| "eval_runtime": 4.131, | |
| "eval_samples_per_second": 105.06, | |
| "eval_steps_per_second": 13.314, | |
| "step": 11253 | |
| }, | |
| { | |
| "epoch": 23.307377049180328, | |
| "grad_norm": 5.78179407119751, | |
| "learning_rate": 1.067704918032787e-05, | |
| "loss": 1.1968, | |
| "step": 11374 | |
| }, | |
| { | |
| "epoch": 23.307377049180328, | |
| "eval_loss": 1.4036682844161987, | |
| "eval_runtime": 4.1268, | |
| "eval_samples_per_second": 105.165, | |
| "eval_steps_per_second": 13.327, | |
| "step": 11374 | |
| }, | |
| { | |
| "epoch": 23.55532786885246, | |
| "grad_norm": 5.584239482879639, | |
| "learning_rate": 1.0577868852459017e-05, | |
| "loss": 1.1862, | |
| "step": 11495 | |
| }, | |
| { | |
| "epoch": 23.55532786885246, | |
| "eval_loss": 1.4230456352233887, | |
| "eval_runtime": 4.1296, | |
| "eval_samples_per_second": 105.095, | |
| "eval_steps_per_second": 13.318, | |
| "step": 11495 | |
| }, | |
| { | |
| "epoch": 23.80327868852459, | |
| "grad_norm": 6.3946685791015625, | |
| "learning_rate": 1.0478688524590163e-05, | |
| "loss": 1.2407, | |
| "step": 11616 | |
| }, | |
| { | |
| "epoch": 23.80327868852459, | |
| "eval_loss": 1.3906227350234985, | |
| "eval_runtime": 4.1272, | |
| "eval_samples_per_second": 105.157, | |
| "eval_steps_per_second": 13.326, | |
| "step": 11616 | |
| }, | |
| { | |
| "epoch": 24.05122950819672, | |
| "grad_norm": 5.362001419067383, | |
| "learning_rate": 1.0379508196721313e-05, | |
| "loss": 1.2213, | |
| "step": 11737 | |
| }, | |
| { | |
| "epoch": 24.05122950819672, | |
| "eval_loss": 1.406900405883789, | |
| "eval_runtime": 4.1286, | |
| "eval_samples_per_second": 105.121, | |
| "eval_steps_per_second": 13.322, | |
| "step": 11737 | |
| }, | |
| { | |
| "epoch": 24.299180327868854, | |
| "grad_norm": 5.9641265869140625, | |
| "learning_rate": 1.028032786885246e-05, | |
| "loss": 1.2041, | |
| "step": 11858 | |
| }, | |
| { | |
| "epoch": 24.299180327868854, | |
| "eval_loss": 1.3982616662979126, | |
| "eval_runtime": 4.1239, | |
| "eval_samples_per_second": 105.24, | |
| "eval_steps_per_second": 13.337, | |
| "step": 11858 | |
| }, | |
| { | |
| "epoch": 24.547131147540984, | |
| "grad_norm": 5.0064568519592285, | |
| "learning_rate": 1.0181147540983608e-05, | |
| "loss": 1.1758, | |
| "step": 11979 | |
| }, | |
| { | |
| "epoch": 24.547131147540984, | |
| "eval_loss": 1.4083583354949951, | |
| "eval_runtime": 4.122, | |
| "eval_samples_per_second": 105.289, | |
| "eval_steps_per_second": 13.343, | |
| "step": 11979 | |
| }, | |
| { | |
| "epoch": 24.795081967213115, | |
| "grad_norm": 5.097125053405762, | |
| "learning_rate": 1.0081967213114754e-05, | |
| "loss": 1.2335, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 24.795081967213115, | |
| "eval_loss": 1.3814207315444946, | |
| "eval_runtime": 4.1238, | |
| "eval_samples_per_second": 105.243, | |
| "eval_steps_per_second": 13.337, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 25.043032786885245, | |
| "grad_norm": 5.703944206237793, | |
| "learning_rate": 9.982786885245902e-06, | |
| "loss": 1.2038, | |
| "step": 12221 | |
| }, | |
| { | |
| "epoch": 25.043032786885245, | |
| "eval_loss": 1.3744884729385376, | |
| "eval_runtime": 4.1288, | |
| "eval_samples_per_second": 105.115, | |
| "eval_steps_per_second": 13.321, | |
| "step": 12221 | |
| }, | |
| { | |
| "epoch": 25.290983606557376, | |
| "grad_norm": 6.049025535583496, | |
| "learning_rate": 9.88360655737705e-06, | |
| "loss": 1.1871, | |
| "step": 12342 | |
| }, | |
| { | |
| "epoch": 25.290983606557376, | |
| "eval_loss": 1.3687578439712524, | |
| "eval_runtime": 4.1247, | |
| "eval_samples_per_second": 105.22, | |
| "eval_steps_per_second": 13.334, | |
| "step": 12342 | |
| }, | |
| { | |
| "epoch": 25.53893442622951, | |
| "grad_norm": 5.135526180267334, | |
| "learning_rate": 9.784426229508197e-06, | |
| "loss": 1.2076, | |
| "step": 12463 | |
| }, | |
| { | |
| "epoch": 25.53893442622951, | |
| "eval_loss": 1.368115782737732, | |
| "eval_runtime": 4.1255, | |
| "eval_samples_per_second": 105.2, | |
| "eval_steps_per_second": 13.332, | |
| "step": 12463 | |
| }, | |
| { | |
| "epoch": 25.78688524590164, | |
| "grad_norm": 5.23483419418335, | |
| "learning_rate": 9.685245901639345e-06, | |
| "loss": 1.1994, | |
| "step": 12584 | |
| }, | |
| { | |
| "epoch": 25.78688524590164, | |
| "eval_loss": 1.397793173789978, | |
| "eval_runtime": 4.1272, | |
| "eval_samples_per_second": 105.155, | |
| "eval_steps_per_second": 13.326, | |
| "step": 12584 | |
| }, | |
| { | |
| "epoch": 26.03483606557377, | |
| "grad_norm": 5.80164098739624, | |
| "learning_rate": 9.586065573770492e-06, | |
| "loss": 1.2154, | |
| "step": 12705 | |
| }, | |
| { | |
| "epoch": 26.03483606557377, | |
| "eval_loss": 1.3746296167373657, | |
| "eval_runtime": 4.125, | |
| "eval_samples_per_second": 105.211, | |
| "eval_steps_per_second": 13.333, | |
| "step": 12705 | |
| }, | |
| { | |
| "epoch": 26.2827868852459, | |
| "grad_norm": 5.612881183624268, | |
| "learning_rate": 9.48688524590164e-06, | |
| "loss": 1.1805, | |
| "step": 12826 | |
| }, | |
| { | |
| "epoch": 26.2827868852459, | |
| "eval_loss": 1.4100968837738037, | |
| "eval_runtime": 4.1236, | |
| "eval_samples_per_second": 105.248, | |
| "eval_steps_per_second": 13.338, | |
| "step": 12826 | |
| }, | |
| { | |
| "epoch": 26.53073770491803, | |
| "grad_norm": 5.775158882141113, | |
| "learning_rate": 9.387704918032788e-06, | |
| "loss": 1.1533, | |
| "step": 12947 | |
| }, | |
| { | |
| "epoch": 26.53073770491803, | |
| "eval_loss": 1.387868046760559, | |
| "eval_runtime": 4.1263, | |
| "eval_samples_per_second": 105.178, | |
| "eval_steps_per_second": 13.329, | |
| "step": 12947 | |
| }, | |
| { | |
| "epoch": 26.778688524590162, | |
| "grad_norm": 5.682550430297852, | |
| "learning_rate": 9.288524590163936e-06, | |
| "loss": 1.1848, | |
| "step": 13068 | |
| }, | |
| { | |
| "epoch": 26.778688524590162, | |
| "eval_loss": 1.4003947973251343, | |
| "eval_runtime": 4.1236, | |
| "eval_samples_per_second": 105.247, | |
| "eval_steps_per_second": 13.338, | |
| "step": 13068 | |
| }, | |
| { | |
| "epoch": 27.026639344262296, | |
| "grad_norm": 5.132202625274658, | |
| "learning_rate": 9.189344262295083e-06, | |
| "loss": 1.2298, | |
| "step": 13189 | |
| }, | |
| { | |
| "epoch": 27.026639344262296, | |
| "eval_loss": 1.3953262567520142, | |
| "eval_runtime": 4.127, | |
| "eval_samples_per_second": 105.162, | |
| "eval_steps_per_second": 13.327, | |
| "step": 13189 | |
| }, | |
| { | |
| "epoch": 27.274590163934427, | |
| "grad_norm": 4.892816066741943, | |
| "learning_rate": 9.09016393442623e-06, | |
| "loss": 1.1734, | |
| "step": 13310 | |
| }, | |
| { | |
| "epoch": 27.274590163934427, | |
| "eval_loss": 1.3932636976242065, | |
| "eval_runtime": 4.1237, | |
| "eval_samples_per_second": 105.245, | |
| "eval_steps_per_second": 13.338, | |
| "step": 13310 | |
| }, | |
| { | |
| "epoch": 27.522540983606557, | |
| "grad_norm": 5.972357273101807, | |
| "learning_rate": 8.990983606557377e-06, | |
| "loss": 1.1857, | |
| "step": 13431 | |
| }, | |
| { | |
| "epoch": 27.522540983606557, | |
| "eval_loss": 1.373570203781128, | |
| "eval_runtime": 4.1234, | |
| "eval_samples_per_second": 105.254, | |
| "eval_steps_per_second": 13.339, | |
| "step": 13431 | |
| }, | |
| { | |
| "epoch": 27.770491803278688, | |
| "grad_norm": 5.362318992614746, | |
| "learning_rate": 8.891803278688526e-06, | |
| "loss": 1.1913, | |
| "step": 13552 | |
| }, | |
| { | |
| "epoch": 27.770491803278688, | |
| "eval_loss": 1.3928087949752808, | |
| "eval_runtime": 4.1266, | |
| "eval_samples_per_second": 105.171, | |
| "eval_steps_per_second": 13.328, | |
| "step": 13552 | |
| }, | |
| { | |
| "epoch": 28.01844262295082, | |
| "grad_norm": 6.388829708099365, | |
| "learning_rate": 8.792622950819672e-06, | |
| "loss": 1.1802, | |
| "step": 13673 | |
| }, | |
| { | |
| "epoch": 28.01844262295082, | |
| "eval_loss": 1.4076062440872192, | |
| "eval_runtime": 4.1256, | |
| "eval_samples_per_second": 105.196, | |
| "eval_steps_per_second": 13.331, | |
| "step": 13673 | |
| }, | |
| { | |
| "epoch": 28.266393442622952, | |
| "grad_norm": 5.451745510101318, | |
| "learning_rate": 8.69344262295082e-06, | |
| "loss": 1.1592, | |
| "step": 13794 | |
| }, | |
| { | |
| "epoch": 28.266393442622952, | |
| "eval_loss": 1.4036214351654053, | |
| "eval_runtime": 4.1264, | |
| "eval_samples_per_second": 105.177, | |
| "eval_steps_per_second": 13.329, | |
| "step": 13794 | |
| }, | |
| { | |
| "epoch": 28.514344262295083, | |
| "grad_norm": 5.723804473876953, | |
| "learning_rate": 8.594262295081968e-06, | |
| "loss": 1.159, | |
| "step": 13915 | |
| }, | |
| { | |
| "epoch": 28.514344262295083, | |
| "eval_loss": 1.4207592010498047, | |
| "eval_runtime": 4.1242, | |
| "eval_samples_per_second": 105.233, | |
| "eval_steps_per_second": 13.336, | |
| "step": 13915 | |
| }, | |
| { | |
| "epoch": 28.762295081967213, | |
| "grad_norm": 5.422088146209717, | |
| "learning_rate": 8.495081967213117e-06, | |
| "loss": 1.1692, | |
| "step": 14036 | |
| }, | |
| { | |
| "epoch": 28.762295081967213, | |
| "eval_loss": 1.4013181924819946, | |
| "eval_runtime": 4.1286, | |
| "eval_samples_per_second": 105.119, | |
| "eval_steps_per_second": 13.322, | |
| "step": 14036 | |
| }, | |
| { | |
| "epoch": 29.010245901639344, | |
| "grad_norm": 6.352570056915283, | |
| "learning_rate": 8.395901639344263e-06, | |
| "loss": 1.1904, | |
| "step": 14157 | |
| }, | |
| { | |
| "epoch": 29.010245901639344, | |
| "eval_loss": 1.404019832611084, | |
| "eval_runtime": 4.1262, | |
| "eval_samples_per_second": 105.181, | |
| "eval_steps_per_second": 13.329, | |
| "step": 14157 | |
| }, | |
| { | |
| "epoch": 29.258196721311474, | |
| "grad_norm": 5.4049177169799805, | |
| "learning_rate": 8.296721311475411e-06, | |
| "loss": 1.177, | |
| "step": 14278 | |
| }, | |
| { | |
| "epoch": 29.258196721311474, | |
| "eval_loss": 1.4221630096435547, | |
| "eval_runtime": 4.1266, | |
| "eval_samples_per_second": 105.171, | |
| "eval_steps_per_second": 13.328, | |
| "step": 14278 | |
| }, | |
| { | |
| "epoch": 29.506147540983605, | |
| "grad_norm": 5.744983196258545, | |
| "learning_rate": 8.197540983606558e-06, | |
| "loss": 1.1714, | |
| "step": 14399 | |
| }, | |
| { | |
| "epoch": 29.506147540983605, | |
| "eval_loss": 1.3760550022125244, | |
| "eval_runtime": 4.1269, | |
| "eval_samples_per_second": 105.164, | |
| "eval_steps_per_second": 13.327, | |
| "step": 14399 | |
| }, | |
| { | |
| "epoch": 29.75409836065574, | |
| "grad_norm": 5.267645359039307, | |
| "learning_rate": 8.098360655737706e-06, | |
| "loss": 1.1455, | |
| "step": 14520 | |
| }, | |
| { | |
| "epoch": 29.75409836065574, | |
| "eval_loss": 1.3890068531036377, | |
| "eval_runtime": 4.1267, | |
| "eval_samples_per_second": 105.168, | |
| "eval_steps_per_second": 13.328, | |
| "step": 14520 | |
| }, | |
| { | |
| "epoch": 30.00204918032787, | |
| "grad_norm": 5.940517425537109, | |
| "learning_rate": 7.999180327868852e-06, | |
| "loss": 1.1909, | |
| "step": 14641 | |
| }, | |
| { | |
| "epoch": 30.00204918032787, | |
| "eval_loss": 1.4213520288467407, | |
| "eval_runtime": 4.1245, | |
| "eval_samples_per_second": 105.226, | |
| "eval_steps_per_second": 13.335, | |
| "step": 14641 | |
| }, | |
| { | |
| "epoch": 30.25, | |
| "grad_norm": 4.8279643058776855, | |
| "learning_rate": 7.9e-06, | |
| "loss": 1.1729, | |
| "step": 14762 | |
| }, | |
| { | |
| "epoch": 30.25, | |
| "eval_loss": 1.387317419052124, | |
| "eval_runtime": 4.1264, | |
| "eval_samples_per_second": 105.177, | |
| "eval_steps_per_second": 13.329, | |
| "step": 14762 | |
| }, | |
| { | |
| "epoch": 30.49795081967213, | |
| "grad_norm": 5.653112411499023, | |
| "learning_rate": 7.800819672131149e-06, | |
| "loss": 1.1609, | |
| "step": 14883 | |
| }, | |
| { | |
| "epoch": 30.49795081967213, | |
| "eval_loss": 1.3974180221557617, | |
| "eval_runtime": 4.1211, | |
| "eval_samples_per_second": 105.311, | |
| "eval_steps_per_second": 13.346, | |
| "step": 14883 | |
| }, | |
| { | |
| "epoch": 30.74590163934426, | |
| "grad_norm": 5.124967098236084, | |
| "learning_rate": 7.701639344262295e-06, | |
| "loss": 1.1414, | |
| "step": 15004 | |
| }, | |
| { | |
| "epoch": 30.74590163934426, | |
| "eval_loss": 1.4140675067901611, | |
| "eval_runtime": 4.1217, | |
| "eval_samples_per_second": 105.297, | |
| "eval_steps_per_second": 13.344, | |
| "step": 15004 | |
| }, | |
| { | |
| "epoch": 30.993852459016395, | |
| "grad_norm": 5.296119689941406, | |
| "learning_rate": 7.6024590163934435e-06, | |
| "loss": 1.192, | |
| "step": 15125 | |
| }, | |
| { | |
| "epoch": 30.993852459016395, | |
| "eval_loss": 1.412348747253418, | |
| "eval_runtime": 4.1325, | |
| "eval_samples_per_second": 105.022, | |
| "eval_steps_per_second": 13.309, | |
| "step": 15125 | |
| }, | |
| { | |
| "epoch": 31.241803278688526, | |
| "grad_norm": 6.010870456695557, | |
| "learning_rate": 7.503278688524591e-06, | |
| "loss": 1.1565, | |
| "step": 15246 | |
| }, | |
| { | |
| "epoch": 31.241803278688526, | |
| "eval_loss": 1.3736042976379395, | |
| "eval_runtime": 4.1233, | |
| "eval_samples_per_second": 105.254, | |
| "eval_steps_per_second": 13.339, | |
| "step": 15246 | |
| }, | |
| { | |
| "epoch": 31.489754098360656, | |
| "grad_norm": 5.853750705718994, | |
| "learning_rate": 7.404098360655738e-06, | |
| "loss": 1.1371, | |
| "step": 15367 | |
| }, | |
| { | |
| "epoch": 31.489754098360656, | |
| "eval_loss": 1.3946257829666138, | |
| "eval_runtime": 4.1239, | |
| "eval_samples_per_second": 105.24, | |
| "eval_steps_per_second": 13.337, | |
| "step": 15367 | |
| }, | |
| { | |
| "epoch": 31.737704918032787, | |
| "grad_norm": 5.3367815017700195, | |
| "learning_rate": 7.304918032786886e-06, | |
| "loss": 1.1387, | |
| "step": 15488 | |
| }, | |
| { | |
| "epoch": 31.737704918032787, | |
| "eval_loss": 1.4051241874694824, | |
| "eval_runtime": 4.1214, | |
| "eval_samples_per_second": 105.303, | |
| "eval_steps_per_second": 13.345, | |
| "step": 15488 | |
| }, | |
| { | |
| "epoch": 31.985655737704917, | |
| "grad_norm": 4.613351821899414, | |
| "learning_rate": 7.205737704918034e-06, | |
| "loss": 1.1757, | |
| "step": 15609 | |
| }, | |
| { | |
| "epoch": 31.985655737704917, | |
| "eval_loss": 1.3825907707214355, | |
| "eval_runtime": 4.1236, | |
| "eval_samples_per_second": 105.248, | |
| "eval_steps_per_second": 13.338, | |
| "step": 15609 | |
| }, | |
| { | |
| "epoch": 32.23360655737705, | |
| "grad_norm": 5.111568450927734, | |
| "learning_rate": 7.106557377049181e-06, | |
| "loss": 1.1681, | |
| "step": 15730 | |
| }, | |
| { | |
| "epoch": 32.23360655737705, | |
| "eval_loss": 1.3923113346099854, | |
| "eval_runtime": 4.1224, | |
| "eval_samples_per_second": 105.278, | |
| "eval_steps_per_second": 13.342, | |
| "step": 15730 | |
| }, | |
| { | |
| "epoch": 32.48155737704918, | |
| "grad_norm": 5.637566089630127, | |
| "learning_rate": 7.007377049180328e-06, | |
| "loss": 1.1408, | |
| "step": 15851 | |
| }, | |
| { | |
| "epoch": 32.48155737704918, | |
| "eval_loss": 1.3988559246063232, | |
| "eval_runtime": 4.1222, | |
| "eval_samples_per_second": 105.283, | |
| "eval_steps_per_second": 13.342, | |
| "step": 15851 | |
| }, | |
| { | |
| "epoch": 32.72950819672131, | |
| "grad_norm": 5.146847724914551, | |
| "learning_rate": 6.9081967213114765e-06, | |
| "loss": 1.1424, | |
| "step": 15972 | |
| }, | |
| { | |
| "epoch": 32.72950819672131, | |
| "eval_loss": 1.3839340209960938, | |
| "eval_runtime": 4.124, | |
| "eval_samples_per_second": 105.237, | |
| "eval_steps_per_second": 13.336, | |
| "step": 15972 | |
| }, | |
| { | |
| "epoch": 32.97745901639344, | |
| "grad_norm": 4.275019645690918, | |
| "learning_rate": 6.809016393442624e-06, | |
| "loss": 1.1464, | |
| "step": 16093 | |
| }, | |
| { | |
| "epoch": 32.97745901639344, | |
| "eval_loss": 1.4004756212234497, | |
| "eval_runtime": 4.1251, | |
| "eval_samples_per_second": 105.211, | |
| "eval_steps_per_second": 13.333, | |
| "step": 16093 | |
| }, | |
| { | |
| "epoch": 33.22540983606557, | |
| "grad_norm": 4.860910892486572, | |
| "learning_rate": 6.709836065573771e-06, | |
| "loss": 1.145, | |
| "step": 16214 | |
| }, | |
| { | |
| "epoch": 33.22540983606557, | |
| "eval_loss": 1.380777359008789, | |
| "eval_runtime": 4.1264, | |
| "eval_samples_per_second": 105.177, | |
| "eval_steps_per_second": 13.329, | |
| "step": 16214 | |
| }, | |
| { | |
| "epoch": 33.47336065573771, | |
| "grad_norm": 5.620210647583008, | |
| "learning_rate": 6.6106557377049185e-06, | |
| "loss": 1.1134, | |
| "step": 16335 | |
| }, | |
| { | |
| "epoch": 33.47336065573771, | |
| "eval_loss": 1.414490818977356, | |
| "eval_runtime": 4.1242, | |
| "eval_samples_per_second": 105.231, | |
| "eval_steps_per_second": 13.336, | |
| "step": 16335 | |
| }, | |
| { | |
| "epoch": 33.721311475409834, | |
| "grad_norm": 5.423040866851807, | |
| "learning_rate": 6.511475409836067e-06, | |
| "loss": 1.1455, | |
| "step": 16456 | |
| }, | |
| { | |
| "epoch": 33.721311475409834, | |
| "eval_loss": 1.3776419162750244, | |
| "eval_runtime": 4.1244, | |
| "eval_samples_per_second": 105.227, | |
| "eval_steps_per_second": 13.335, | |
| "step": 16456 | |
| }, | |
| { | |
| "epoch": 33.96926229508197, | |
| "grad_norm": 4.687174320220947, | |
| "learning_rate": 6.412295081967213e-06, | |
| "loss": 1.1581, | |
| "step": 16577 | |
| }, | |
| { | |
| "epoch": 33.96926229508197, | |
| "eval_loss": 1.3872053623199463, | |
| "eval_runtime": 4.1225, | |
| "eval_samples_per_second": 105.276, | |
| "eval_steps_per_second": 13.341, | |
| "step": 16577 | |
| }, | |
| { | |
| "epoch": 34.217213114754095, | |
| "grad_norm": 4.981241703033447, | |
| "learning_rate": 6.313114754098361e-06, | |
| "loss": 1.1465, | |
| "step": 16698 | |
| }, | |
| { | |
| "epoch": 34.217213114754095, | |
| "eval_loss": 1.4010353088378906, | |
| "eval_runtime": 4.1257, | |
| "eval_samples_per_second": 105.195, | |
| "eval_steps_per_second": 13.331, | |
| "step": 16698 | |
| }, | |
| { | |
| "epoch": 34.46516393442623, | |
| "grad_norm": 5.649103164672852, | |
| "learning_rate": 6.213934426229509e-06, | |
| "loss": 1.1247, | |
| "step": 16819 | |
| }, | |
| { | |
| "epoch": 34.46516393442623, | |
| "eval_loss": 1.3929107189178467, | |
| "eval_runtime": 4.124, | |
| "eval_samples_per_second": 105.238, | |
| "eval_steps_per_second": 13.337, | |
| "step": 16819 | |
| }, | |
| { | |
| "epoch": 34.71311475409836, | |
| "grad_norm": 4.71021032333374, | |
| "learning_rate": 6.114754098360656e-06, | |
| "loss": 1.1658, | |
| "step": 16940 | |
| }, | |
| { | |
| "epoch": 34.71311475409836, | |
| "eval_loss": 1.4008798599243164, | |
| "eval_runtime": 4.1267, | |
| "eval_samples_per_second": 105.17, | |
| "eval_steps_per_second": 13.328, | |
| "step": 16940 | |
| }, | |
| { | |
| "epoch": 34.96106557377049, | |
| "grad_norm": 5.667390823364258, | |
| "learning_rate": 6.015573770491803e-06, | |
| "loss": 1.1465, | |
| "step": 17061 | |
| }, | |
| { | |
| "epoch": 34.96106557377049, | |
| "eval_loss": 1.3762644529342651, | |
| "eval_runtime": 4.1235, | |
| "eval_samples_per_second": 105.25, | |
| "eval_steps_per_second": 13.338, | |
| "step": 17061 | |
| }, | |
| { | |
| "epoch": 35.209016393442624, | |
| "grad_norm": 5.304030418395996, | |
| "learning_rate": 5.916393442622951e-06, | |
| "loss": 1.1292, | |
| "step": 17182 | |
| }, | |
| { | |
| "epoch": 35.209016393442624, | |
| "eval_loss": 1.3737713098526, | |
| "eval_runtime": 4.1273, | |
| "eval_samples_per_second": 105.153, | |
| "eval_steps_per_second": 13.326, | |
| "step": 17182 | |
| }, | |
| { | |
| "epoch": 35.45696721311475, | |
| "grad_norm": 5.078277587890625, | |
| "learning_rate": 5.817213114754099e-06, | |
| "loss": 1.0997, | |
| "step": 17303 | |
| }, | |
| { | |
| "epoch": 35.45696721311475, | |
| "eval_loss": 1.3623582124710083, | |
| "eval_runtime": 4.1241, | |
| "eval_samples_per_second": 105.236, | |
| "eval_steps_per_second": 13.336, | |
| "step": 17303 | |
| }, | |
| { | |
| "epoch": 35.704918032786885, | |
| "grad_norm": 5.220365524291992, | |
| "learning_rate": 5.718032786885246e-06, | |
| "loss": 1.1463, | |
| "step": 17424 | |
| }, | |
| { | |
| "epoch": 35.704918032786885, | |
| "eval_loss": 1.3832011222839355, | |
| "eval_runtime": 4.1256, | |
| "eval_samples_per_second": 105.197, | |
| "eval_steps_per_second": 13.331, | |
| "step": 17424 | |
| }, | |
| { | |
| "epoch": 35.95286885245902, | |
| "grad_norm": 5.473215103149414, | |
| "learning_rate": 5.618852459016394e-06, | |
| "loss": 1.1426, | |
| "step": 17545 | |
| }, | |
| { | |
| "epoch": 35.95286885245902, | |
| "eval_loss": 1.3841850757598877, | |
| "eval_runtime": 4.124, | |
| "eval_samples_per_second": 105.237, | |
| "eval_steps_per_second": 13.336, | |
| "step": 17545 | |
| }, | |
| { | |
| "epoch": 36.200819672131146, | |
| "grad_norm": 5.864542484283447, | |
| "learning_rate": 5.519672131147541e-06, | |
| "loss": 1.1237, | |
| "step": 17666 | |
| }, | |
| { | |
| "epoch": 36.200819672131146, | |
| "eval_loss": 1.4124358892440796, | |
| "eval_runtime": 4.1246, | |
| "eval_samples_per_second": 105.223, | |
| "eval_steps_per_second": 13.335, | |
| "step": 17666 | |
| }, | |
| { | |
| "epoch": 36.44877049180328, | |
| "grad_norm": 5.416541576385498, | |
| "learning_rate": 5.420491803278689e-06, | |
| "loss": 1.0961, | |
| "step": 17787 | |
| }, | |
| { | |
| "epoch": 36.44877049180328, | |
| "eval_loss": 1.3820034265518188, | |
| "eval_runtime": 4.1254, | |
| "eval_samples_per_second": 105.203, | |
| "eval_steps_per_second": 13.332, | |
| "step": 17787 | |
| }, | |
| { | |
| "epoch": 36.69672131147541, | |
| "grad_norm": 5.523916721343994, | |
| "learning_rate": 5.3213114754098365e-06, | |
| "loss": 1.1677, | |
| "step": 17908 | |
| }, | |
| { | |
| "epoch": 36.69672131147541, | |
| "eval_loss": 1.391912817955017, | |
| "eval_runtime": 4.124, | |
| "eval_samples_per_second": 105.238, | |
| "eval_steps_per_second": 13.337, | |
| "step": 17908 | |
| }, | |
| { | |
| "epoch": 36.94467213114754, | |
| "grad_norm": 6.368148326873779, | |
| "learning_rate": 5.222131147540984e-06, | |
| "loss": 1.1142, | |
| "step": 18029 | |
| }, | |
| { | |
| "epoch": 36.94467213114754, | |
| "eval_loss": 1.427830457687378, | |
| "eval_runtime": 4.1227, | |
| "eval_samples_per_second": 105.27, | |
| "eval_steps_per_second": 13.341, | |
| "step": 18029 | |
| }, | |
| { | |
| "epoch": 37.192622950819676, | |
| "grad_norm": 4.822169303894043, | |
| "learning_rate": 5.122950819672131e-06, | |
| "loss": 1.146, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 37.192622950819676, | |
| "eval_loss": 1.3914594650268555, | |
| "eval_runtime": 4.1302, | |
| "eval_samples_per_second": 105.081, | |
| "eval_steps_per_second": 13.317, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 37.4405737704918, | |
| "grad_norm": 5.081784248352051, | |
| "learning_rate": 5.023770491803279e-06, | |
| "loss": 1.1352, | |
| "step": 18271 | |
| }, | |
| { | |
| "epoch": 37.4405737704918, | |
| "eval_loss": 1.38656485080719, | |
| "eval_runtime": 4.1258, | |
| "eval_samples_per_second": 105.192, | |
| "eval_steps_per_second": 13.331, | |
| "step": 18271 | |
| }, | |
| { | |
| "epoch": 37.68852459016394, | |
| "grad_norm": 5.341165065765381, | |
| "learning_rate": 4.924590163934427e-06, | |
| "loss": 1.1353, | |
| "step": 18392 | |
| }, | |
| { | |
| "epoch": 37.68852459016394, | |
| "eval_loss": 1.3952445983886719, | |
| "eval_runtime": 4.1257, | |
| "eval_samples_per_second": 105.195, | |
| "eval_steps_per_second": 13.331, | |
| "step": 18392 | |
| }, | |
| { | |
| "epoch": 37.93647540983606, | |
| "grad_norm": 5.281919002532959, | |
| "learning_rate": 4.825409836065574e-06, | |
| "loss": 1.1099, | |
| "step": 18513 | |
| }, | |
| { | |
| "epoch": 37.93647540983606, | |
| "eval_loss": 1.3772515058517456, | |
| "eval_runtime": 4.1261, | |
| "eval_samples_per_second": 105.184, | |
| "eval_steps_per_second": 13.33, | |
| "step": 18513 | |
| }, | |
| { | |
| "epoch": 38.1844262295082, | |
| "grad_norm": 5.485556125640869, | |
| "learning_rate": 4.726229508196722e-06, | |
| "loss": 1.1088, | |
| "step": 18634 | |
| }, | |
| { | |
| "epoch": 38.1844262295082, | |
| "eval_loss": 1.3951290845870972, | |
| "eval_runtime": 4.1248, | |
| "eval_samples_per_second": 105.218, | |
| "eval_steps_per_second": 13.334, | |
| "step": 18634 | |
| }, | |
| { | |
| "epoch": 38.432377049180324, | |
| "grad_norm": 4.903234004974365, | |
| "learning_rate": 4.6270491803278695e-06, | |
| "loss": 1.1141, | |
| "step": 18755 | |
| }, | |
| { | |
| "epoch": 38.432377049180324, | |
| "eval_loss": 1.4029154777526855, | |
| "eval_runtime": 4.1245, | |
| "eval_samples_per_second": 105.224, | |
| "eval_steps_per_second": 13.335, | |
| "step": 18755 | |
| }, | |
| { | |
| "epoch": 38.68032786885246, | |
| "grad_norm": 6.011325359344482, | |
| "learning_rate": 4.527868852459017e-06, | |
| "loss": 1.1115, | |
| "step": 18876 | |
| }, | |
| { | |
| "epoch": 38.68032786885246, | |
| "eval_loss": 1.3774739503860474, | |
| "eval_runtime": 4.1241, | |
| "eval_samples_per_second": 105.234, | |
| "eval_steps_per_second": 13.336, | |
| "step": 18876 | |
| }, | |
| { | |
| "epoch": 38.92827868852459, | |
| "grad_norm": 5.690525531768799, | |
| "learning_rate": 4.428688524590164e-06, | |
| "loss": 1.1552, | |
| "step": 18997 | |
| }, | |
| { | |
| "epoch": 38.92827868852459, | |
| "eval_loss": 1.3836060762405396, | |
| "eval_runtime": 4.1288, | |
| "eval_samples_per_second": 105.116, | |
| "eval_steps_per_second": 13.321, | |
| "step": 18997 | |
| }, | |
| { | |
| "epoch": 39.17622950819672, | |
| "grad_norm": 4.86214017868042, | |
| "learning_rate": 4.329508196721312e-06, | |
| "loss": 1.1291, | |
| "step": 19118 | |
| }, | |
| { | |
| "epoch": 39.17622950819672, | |
| "eval_loss": 1.3931368589401245, | |
| "eval_runtime": 4.1259, | |
| "eval_samples_per_second": 105.189, | |
| "eval_steps_per_second": 13.33, | |
| "step": 19118 | |
| }, | |
| { | |
| "epoch": 39.424180327868854, | |
| "grad_norm": 5.337844371795654, | |
| "learning_rate": 4.23032786885246e-06, | |
| "loss": 1.1261, | |
| "step": 19239 | |
| }, | |
| { | |
| "epoch": 39.424180327868854, | |
| "eval_loss": 1.393588662147522, | |
| "eval_runtime": 4.1276, | |
| "eval_samples_per_second": 105.146, | |
| "eval_steps_per_second": 13.325, | |
| "step": 19239 | |
| }, | |
| { | |
| "epoch": 39.67213114754098, | |
| "grad_norm": 5.509841442108154, | |
| "learning_rate": 4.131147540983607e-06, | |
| "loss": 1.1209, | |
| "step": 19360 | |
| }, | |
| { | |
| "epoch": 39.67213114754098, | |
| "eval_loss": 1.4069132804870605, | |
| "eval_runtime": 4.1261, | |
| "eval_samples_per_second": 105.184, | |
| "eval_steps_per_second": 13.33, | |
| "step": 19360 | |
| }, | |
| { | |
| "epoch": 39.920081967213115, | |
| "grad_norm": 4.939957141876221, | |
| "learning_rate": 4.031967213114754e-06, | |
| "loss": 1.124, | |
| "step": 19481 | |
| }, | |
| { | |
| "epoch": 39.920081967213115, | |
| "eval_loss": 1.391053318977356, | |
| "eval_runtime": 4.1267, | |
| "eval_samples_per_second": 105.169, | |
| "eval_steps_per_second": 13.328, | |
| "step": 19481 | |
| }, | |
| { | |
| "epoch": 40.16803278688525, | |
| "grad_norm": 5.964232444763184, | |
| "learning_rate": 3.932786885245902e-06, | |
| "loss": 1.1374, | |
| "step": 19602 | |
| }, | |
| { | |
| "epoch": 40.16803278688525, | |
| "eval_loss": 1.3899863958358765, | |
| "eval_runtime": 4.1261, | |
| "eval_samples_per_second": 105.183, | |
| "eval_steps_per_second": 13.33, | |
| "step": 19602 | |
| }, | |
| { | |
| "epoch": 40.415983606557376, | |
| "grad_norm": 5.4043049812316895, | |
| "learning_rate": 3.833606557377049e-06, | |
| "loss": 1.1079, | |
| "step": 19723 | |
| }, | |
| { | |
| "epoch": 40.415983606557376, | |
| "eval_loss": 1.386367678642273, | |
| "eval_runtime": 4.127, | |
| "eval_samples_per_second": 105.162, | |
| "eval_steps_per_second": 13.327, | |
| "step": 19723 | |
| }, | |
| { | |
| "epoch": 40.66393442622951, | |
| "grad_norm": 5.620067596435547, | |
| "learning_rate": 3.734426229508197e-06, | |
| "loss": 1.1286, | |
| "step": 19844 | |
| }, | |
| { | |
| "epoch": 40.66393442622951, | |
| "eval_loss": 1.3876351118087769, | |
| "eval_runtime": 4.1244, | |
| "eval_samples_per_second": 105.228, | |
| "eval_steps_per_second": 13.335, | |
| "step": 19844 | |
| }, | |
| { | |
| "epoch": 40.91188524590164, | |
| "grad_norm": 5.134771347045898, | |
| "learning_rate": 3.635245901639344e-06, | |
| "loss": 1.103, | |
| "step": 19965 | |
| }, | |
| { | |
| "epoch": 40.91188524590164, | |
| "eval_loss": 1.401169776916504, | |
| "eval_runtime": 4.125, | |
| "eval_samples_per_second": 105.212, | |
| "eval_steps_per_second": 13.333, | |
| "step": 19965 | |
| }, | |
| { | |
| "epoch": 41.15983606557377, | |
| "grad_norm": 5.033725261688232, | |
| "learning_rate": 3.536065573770492e-06, | |
| "loss": 1.1017, | |
| "step": 20086 | |
| }, | |
| { | |
| "epoch": 41.15983606557377, | |
| "eval_loss": 1.413968801498413, | |
| "eval_runtime": 4.1265, | |
| "eval_samples_per_second": 105.175, | |
| "eval_steps_per_second": 13.329, | |
| "step": 20086 | |
| }, | |
| { | |
| "epoch": 41.407786885245905, | |
| "grad_norm": 5.851089000701904, | |
| "learning_rate": 3.4368852459016393e-06, | |
| "loss": 1.1036, | |
| "step": 20207 | |
| }, | |
| { | |
| "epoch": 41.407786885245905, | |
| "eval_loss": 1.369694709777832, | |
| "eval_runtime": 4.1274, | |
| "eval_samples_per_second": 105.151, | |
| "eval_steps_per_second": 13.326, | |
| "step": 20207 | |
| }, | |
| { | |
| "epoch": 41.65573770491803, | |
| "grad_norm": 5.828238010406494, | |
| "learning_rate": 3.337704918032787e-06, | |
| "loss": 1.1052, | |
| "step": 20328 | |
| }, | |
| { | |
| "epoch": 41.65573770491803, | |
| "eval_loss": 1.4037874937057495, | |
| "eval_runtime": 4.1236, | |
| "eval_samples_per_second": 105.247, | |
| "eval_steps_per_second": 13.338, | |
| "step": 20328 | |
| }, | |
| { | |
| "epoch": 41.903688524590166, | |
| "grad_norm": 5.322448253631592, | |
| "learning_rate": 3.2385245901639344e-06, | |
| "loss": 1.0977, | |
| "step": 20449 | |
| }, | |
| { | |
| "epoch": 41.903688524590166, | |
| "eval_loss": 1.3905967473983765, | |
| "eval_runtime": 4.1253, | |
| "eval_samples_per_second": 105.204, | |
| "eval_steps_per_second": 13.332, | |
| "step": 20449 | |
| }, | |
| { | |
| "epoch": 42.15163934426229, | |
| "grad_norm": 5.0981526374816895, | |
| "learning_rate": 3.139344262295082e-06, | |
| "loss": 1.1343, | |
| "step": 20570 | |
| }, | |
| { | |
| "epoch": 42.15163934426229, | |
| "eval_loss": 1.3920131921768188, | |
| "eval_runtime": 4.1259, | |
| "eval_samples_per_second": 105.19, | |
| "eval_steps_per_second": 13.331, | |
| "step": 20570 | |
| }, | |
| { | |
| "epoch": 42.39959016393443, | |
| "grad_norm": 5.493677616119385, | |
| "learning_rate": 3.0401639344262295e-06, | |
| "loss": 1.103, | |
| "step": 20691 | |
| }, | |
| { | |
| "epoch": 42.39959016393443, | |
| "eval_loss": 1.387071132659912, | |
| "eval_runtime": 4.1293, | |
| "eval_samples_per_second": 105.102, | |
| "eval_steps_per_second": 13.319, | |
| "step": 20691 | |
| }, | |
| { | |
| "epoch": 42.64754098360656, | |
| "grad_norm": 5.632537364959717, | |
| "learning_rate": 2.9409836065573772e-06, | |
| "loss": 1.1092, | |
| "step": 20812 | |
| }, | |
| { | |
| "epoch": 42.64754098360656, | |
| "eval_loss": 1.387178659439087, | |
| "eval_runtime": 4.1265, | |
| "eval_samples_per_second": 105.175, | |
| "eval_steps_per_second": 13.329, | |
| "step": 20812 | |
| }, | |
| { | |
| "epoch": 42.89549180327869, | |
| "grad_norm": 5.351899147033691, | |
| "learning_rate": 2.8418032786885246e-06, | |
| "loss": 1.0858, | |
| "step": 20933 | |
| }, | |
| { | |
| "epoch": 42.89549180327869, | |
| "eval_loss": 1.3848626613616943, | |
| "eval_runtime": 4.1255, | |
| "eval_samples_per_second": 105.199, | |
| "eval_steps_per_second": 13.332, | |
| "step": 20933 | |
| }, | |
| { | |
| "epoch": 43.14344262295082, | |
| "grad_norm": 7.1183247566223145, | |
| "learning_rate": 2.7426229508196723e-06, | |
| "loss": 1.0998, | |
| "step": 21054 | |
| }, | |
| { | |
| "epoch": 43.14344262295082, | |
| "eval_loss": 1.3888462781906128, | |
| "eval_runtime": 4.1286, | |
| "eval_samples_per_second": 105.12, | |
| "eval_steps_per_second": 13.322, | |
| "step": 21054 | |
| }, | |
| { | |
| "epoch": 43.39139344262295, | |
| "grad_norm": 5.993564128875732, | |
| "learning_rate": 2.6434426229508197e-06, | |
| "loss": 1.1082, | |
| "step": 21175 | |
| }, | |
| { | |
| "epoch": 43.39139344262295, | |
| "eval_loss": 1.3925265073776245, | |
| "eval_runtime": 4.1287, | |
| "eval_samples_per_second": 105.118, | |
| "eval_steps_per_second": 13.321, | |
| "step": 21175 | |
| }, | |
| { | |
| "epoch": 43.63934426229508, | |
| "grad_norm": 4.941683292388916, | |
| "learning_rate": 2.5442622950819674e-06, | |
| "loss": 1.1112, | |
| "step": 21296 | |
| }, | |
| { | |
| "epoch": 43.63934426229508, | |
| "eval_loss": 1.4261186122894287, | |
| "eval_runtime": 4.1253, | |
| "eval_samples_per_second": 105.205, | |
| "eval_steps_per_second": 13.332, | |
| "step": 21296 | |
| }, | |
| { | |
| "epoch": 43.88729508196721, | |
| "grad_norm": 5.67835807800293, | |
| "learning_rate": 2.4450819672131148e-06, | |
| "loss": 1.0969, | |
| "step": 21417 | |
| }, | |
| { | |
| "epoch": 43.88729508196721, | |
| "eval_loss": 1.3613173961639404, | |
| "eval_runtime": 4.1247, | |
| "eval_samples_per_second": 105.219, | |
| "eval_steps_per_second": 13.334, | |
| "step": 21417 | |
| }, | |
| { | |
| "epoch": 44.135245901639344, | |
| "grad_norm": 5.428313255310059, | |
| "learning_rate": 2.3459016393442625e-06, | |
| "loss": 1.112, | |
| "step": 21538 | |
| }, | |
| { | |
| "epoch": 44.135245901639344, | |
| "eval_loss": 1.387001395225525, | |
| "eval_runtime": 4.1267, | |
| "eval_samples_per_second": 105.17, | |
| "eval_steps_per_second": 13.328, | |
| "step": 21538 | |
| }, | |
| { | |
| "epoch": 44.38319672131148, | |
| "grad_norm": 5.432636260986328, | |
| "learning_rate": 2.24672131147541e-06, | |
| "loss": 1.0919, | |
| "step": 21659 | |
| }, | |
| { | |
| "epoch": 44.38319672131148, | |
| "eval_loss": 1.4082472324371338, | |
| "eval_runtime": 4.1279, | |
| "eval_samples_per_second": 105.139, | |
| "eval_steps_per_second": 13.324, | |
| "step": 21659 | |
| }, | |
| { | |
| "epoch": 44.631147540983605, | |
| "grad_norm": 5.585594177246094, | |
| "learning_rate": 2.1475409836065576e-06, | |
| "loss": 1.0921, | |
| "step": 21780 | |
| }, | |
| { | |
| "epoch": 44.631147540983605, | |
| "eval_loss": 1.3896039724349976, | |
| "eval_runtime": 4.1241, | |
| "eval_samples_per_second": 105.234, | |
| "eval_steps_per_second": 13.336, | |
| "step": 21780 | |
| }, | |
| { | |
| "epoch": 44.87909836065574, | |
| "grad_norm": 5.751293659210205, | |
| "learning_rate": 2.048360655737705e-06, | |
| "loss": 1.1125, | |
| "step": 21901 | |
| }, | |
| { | |
| "epoch": 44.87909836065574, | |
| "eval_loss": 1.3886334896087646, | |
| "eval_runtime": 4.1257, | |
| "eval_samples_per_second": 105.194, | |
| "eval_steps_per_second": 13.331, | |
| "step": 21901 | |
| }, | |
| { | |
| "epoch": 45.127049180327866, | |
| "grad_norm": 5.387383937835693, | |
| "learning_rate": 1.9491803278688527e-06, | |
| "loss": 1.0884, | |
| "step": 22022 | |
| }, | |
| { | |
| "epoch": 45.127049180327866, | |
| "eval_loss": 1.3778773546218872, | |
| "eval_runtime": 4.1273, | |
| "eval_samples_per_second": 105.154, | |
| "eval_steps_per_second": 13.326, | |
| "step": 22022 | |
| }, | |
| { | |
| "epoch": 45.375, | |
| "grad_norm": 4.957601070404053, | |
| "learning_rate": 1.85e-06, | |
| "loss": 1.0855, | |
| "step": 22143 | |
| }, | |
| { | |
| "epoch": 45.375, | |
| "eval_loss": 1.3757483959197998, | |
| "eval_runtime": 4.1246, | |
| "eval_samples_per_second": 105.223, | |
| "eval_steps_per_second": 13.335, | |
| "step": 22143 | |
| }, | |
| { | |
| "epoch": 45.622950819672134, | |
| "grad_norm": 5.841992378234863, | |
| "learning_rate": 1.7508196721311476e-06, | |
| "loss": 1.1281, | |
| "step": 22264 | |
| }, | |
| { | |
| "epoch": 45.622950819672134, | |
| "eval_loss": 1.385141372680664, | |
| "eval_runtime": 4.132, | |
| "eval_samples_per_second": 105.034, | |
| "eval_steps_per_second": 13.311, | |
| "step": 22264 | |
| }, | |
| { | |
| "epoch": 45.87090163934426, | |
| "grad_norm": 5.011885643005371, | |
| "learning_rate": 1.6516393442622952e-06, | |
| "loss": 1.0856, | |
| "step": 22385 | |
| }, | |
| { | |
| "epoch": 45.87090163934426, | |
| "eval_loss": 1.364372968673706, | |
| "eval_runtime": 4.1268, | |
| "eval_samples_per_second": 105.167, | |
| "eval_steps_per_second": 13.328, | |
| "step": 22385 | |
| }, | |
| { | |
| "epoch": 46.118852459016395, | |
| "grad_norm": 5.6464996337890625, | |
| "learning_rate": 1.5524590163934427e-06, | |
| "loss": 1.0963, | |
| "step": 22506 | |
| }, | |
| { | |
| "epoch": 46.118852459016395, | |
| "eval_loss": 1.3527668714523315, | |
| "eval_runtime": 4.122, | |
| "eval_samples_per_second": 105.289, | |
| "eval_steps_per_second": 13.343, | |
| "step": 22506 | |
| } | |
| ], | |
| "logging_steps": 121, | |
| "max_steps": 24400, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 121, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.732761177640141e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |