diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,17 +1,17 @@ { - "best_global_step": 40, - "best_metric": 0.7394412159919739, - "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-40", - "epoch": 2.1052631578947367, + "best_global_step": 240, + "best_metric": 0.0008580058929510415, + "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-240", + "epoch": 12.631578947368421, "eval_steps": 1, - "global_step": 40, + "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05263157894736842, - "grad_norm": 2.3607187271118164, + "grad_norm": 2.392829656600952, "learning_rate": 0.0, "loss": 3.2235, "step": 1 @@ -19,601 +19,3601 @@ { "epoch": 0.05263157894736842, "eval_loss": 3.15524959564209, - "eval_runtime": 3.3685, - "eval_samples_per_second": 8.906, - "eval_steps_per_second": 1.187, + "eval_runtime": 3.3805, + "eval_samples_per_second": 8.874, + "eval_steps_per_second": 1.183, "step": 1 }, { "epoch": 0.10526315789473684, - "grad_norm": 2.431220531463623, + "grad_norm": 2.4646599292755127, "learning_rate": 3.3333333333333335e-05, "loss": 3.165, "step": 2 }, { "epoch": 0.10526315789473684, - "eval_loss": 3.0020461082458496, - "eval_runtime": 3.299, - "eval_samples_per_second": 9.094, - "eval_steps_per_second": 1.212, + "eval_loss": 3.0003557205200195, + "eval_runtime": 3.3887, + "eval_samples_per_second": 8.853, + "eval_steps_per_second": 1.18, "step": 2 }, { "epoch": 0.15789473684210525, - "grad_norm": 1.8372516632080078, + "grad_norm": 1.8603581190109253, "learning_rate": 6.666666666666667e-05, "loss": 2.7821, "step": 3 }, { "epoch": 0.15789473684210525, - "eval_loss": 2.6930112838745117, - "eval_runtime": 3.3119, - "eval_samples_per_second": 9.058, - "eval_steps_per_second": 1.208, + "eval_loss": 2.6881206035614014, + "eval_runtime": 3.3941, + "eval_samples_per_second": 8.839, + "eval_steps_per_second": 1.179, "step": 3 }, { "epoch": 0.21052631578947367, - "grad_norm": 1.6948609352111816, + "grad_norm": 1.7065073251724243, "learning_rate": 0.0001, - "loss": 2.7014, + "loss": 2.6959, "step": 4 }, { "epoch": 0.21052631578947367, - "eval_loss": 2.349722146987915, - "eval_runtime": 3.3194, - "eval_samples_per_second": 9.038, - "eval_steps_per_second": 1.205, + "eval_loss": 2.3458807468414307, + "eval_runtime": 3.4084, + "eval_samples_per_second": 8.802, + "eval_steps_per_second": 1.174, "step": 4 }, { "epoch": 0.2631578947368421, - "grad_norm": 1.333439826965332, + "grad_norm": 1.3534406423568726, "learning_rate": 0.00013333333333333334, - "loss": 2.3248, + "loss": 2.324, "step": 5 }, { "epoch": 0.2631578947368421, - "eval_loss": 2.06449294090271, - "eval_runtime": 3.331, - "eval_samples_per_second": 9.006, - "eval_steps_per_second": 1.201, + "eval_loss": 2.0636515617370605, + "eval_runtime": 3.4023, + "eval_samples_per_second": 8.818, + "eval_steps_per_second": 1.176, "step": 5 }, { "epoch": 0.3157894736842105, - "grad_norm": 1.3499835729599, + "grad_norm": 1.3654303550720215, "learning_rate": 0.00016666666666666666, - "loss": 2.0069, + "loss": 2.0045, "step": 6 }, { "epoch": 0.3157894736842105, - "eval_loss": 1.8060506582260132, - "eval_runtime": 3.3417, - "eval_samples_per_second": 8.977, - "eval_steps_per_second": 1.197, + "eval_loss": 1.8059495687484741, + "eval_runtime": 3.3904, + "eval_samples_per_second": 8.849, + "eval_steps_per_second": 1.18, "step": 6 }, { "epoch": 0.3684210526315789, - "grad_norm": 1.649509310722351, + "grad_norm": 1.6560827493667603, "learning_rate": 0.0002, - "loss": 1.8274, + "loss": 1.8291, "step": 7 }, { "epoch": 0.3684210526315789, - "eval_loss": 1.554451584815979, - "eval_runtime": 3.3577, - "eval_samples_per_second": 8.935, - "eval_steps_per_second": 1.191, + "eval_loss": 1.5527682304382324, + "eval_runtime": 3.3766, + "eval_samples_per_second": 8.885, + "eval_steps_per_second": 1.185, "step": 7 }, { "epoch": 0.42105263157894735, - "grad_norm": 1.6009737253189087, + "grad_norm": 1.5807621479034424, "learning_rate": 0.00023333333333333333, - "loss": 1.5562, + "loss": 1.5517, "step": 8 }, { "epoch": 0.42105263157894735, - "eval_loss": 1.3974536657333374, - "eval_runtime": 3.3654, - "eval_samples_per_second": 8.914, - "eval_steps_per_second": 1.189, + "eval_loss": 1.3975309133529663, + "eval_runtime": 3.3664, + "eval_samples_per_second": 8.912, + "eval_steps_per_second": 1.188, "step": 8 }, { "epoch": 0.47368421052631576, - "grad_norm": 1.7731741666793823, + "grad_norm": 1.7972851991653442, "learning_rate": 0.0002666666666666667, - "loss": 1.4525, + "loss": 1.4511, "step": 9 }, { "epoch": 0.47368421052631576, - "eval_loss": 1.3451876640319824, - "eval_runtime": 3.3744, - "eval_samples_per_second": 8.89, - "eval_steps_per_second": 1.185, + "eval_loss": 1.346282958984375, + "eval_runtime": 3.3698, + "eval_samples_per_second": 8.903, + "eval_steps_per_second": 1.187, "step": 9 }, { "epoch": 0.5263157894736842, - "grad_norm": 1.629805088043213, + "grad_norm": 1.6609612703323364, "learning_rate": 0.0003, - "loss": 1.4081, + "loss": 1.412, "step": 10 }, { "epoch": 0.5263157894736842, - "eval_loss": 1.2556439638137817, - "eval_runtime": 3.3912, - "eval_samples_per_second": 8.847, - "eval_steps_per_second": 1.18, + "eval_loss": 1.2566777467727661, + "eval_runtime": 3.3636, + "eval_samples_per_second": 8.919, + "eval_steps_per_second": 1.189, "step": 10 }, { "epoch": 0.5789473684210527, - "grad_norm": 1.313006043434143, + "grad_norm": 1.3480994701385498, "learning_rate": 0.0003333333333333333, - "loss": 1.3422, + "loss": 1.3464, "step": 11 }, { "epoch": 0.5789473684210527, - "eval_loss": 1.1746076345443726, - "eval_runtime": 3.3899, - "eval_samples_per_second": 8.85, - "eval_steps_per_second": 1.18, + "eval_loss": 1.177741527557373, + "eval_runtime": 3.3574, + "eval_samples_per_second": 8.935, + "eval_steps_per_second": 1.191, "step": 11 }, { "epoch": 0.631578947368421, - "grad_norm": 0.9396845698356628, + "grad_norm": 0.9665437936782837, "learning_rate": 0.00036666666666666667, - "loss": 1.2091, + "loss": 1.217, "step": 12 }, { "epoch": 0.631578947368421, - "eval_loss": 1.1337084770202637, - "eval_runtime": 3.3948, - "eval_samples_per_second": 8.837, - "eval_steps_per_second": 1.178, + "eval_loss": 1.1329197883605957, + "eval_runtime": 3.3641, + "eval_samples_per_second": 8.918, + "eval_steps_per_second": 1.189, "step": 12 }, { "epoch": 0.6842105263157895, - "grad_norm": 1.076097846031189, + "grad_norm": 1.0468593835830688, "learning_rate": 0.0004, - "loss": 1.1891, + "loss": 1.1858, "step": 13 }, { "epoch": 0.6842105263157895, - "eval_loss": 1.0741407871246338, - "eval_runtime": 3.3911, - "eval_samples_per_second": 8.847, - "eval_steps_per_second": 1.18, + "eval_loss": 1.0717506408691406, + "eval_runtime": 3.3742, + "eval_samples_per_second": 8.891, + "eval_steps_per_second": 1.185, "step": 13 }, { "epoch": 0.7368421052631579, - "grad_norm": 0.8671520352363586, + "grad_norm": 0.876053512096405, "learning_rate": 0.00043333333333333337, - "loss": 1.0924, + "loss": 1.09, "step": 14 }, { "epoch": 0.7368421052631579, - "eval_loss": 1.050424575805664, - "eval_runtime": 3.3794, - "eval_samples_per_second": 8.877, - "eval_steps_per_second": 1.184, + "eval_loss": 1.0482743978500366, + "eval_runtime": 3.3843, + "eval_samples_per_second": 8.864, + "eval_steps_per_second": 1.182, "step": 14 }, { "epoch": 0.7894736842105263, - "grad_norm": 0.8102416396141052, + "grad_norm": 0.8056855797767639, "learning_rate": 0.00046666666666666666, - "loss": 1.1182, + "loss": 1.1206, "step": 15 }, { "epoch": 0.7894736842105263, - "eval_loss": 1.02986741065979, - "eval_runtime": 3.3781, - "eval_samples_per_second": 8.881, - "eval_steps_per_second": 1.184, + "eval_loss": 1.026999592781067, + "eval_runtime": 3.3874, + "eval_samples_per_second": 8.856, + "eval_steps_per_second": 1.181, "step": 15 }, { "epoch": 0.8421052631578947, - "grad_norm": 1.4678000211715698, + "grad_norm": 0.8941182494163513, "learning_rate": 0.0005, - "loss": 1.1182, + "loss": 1.1109, "step": 16 }, { "epoch": 0.8421052631578947, - "eval_loss": 1.0076123476028442, - "eval_runtime": 3.3672, - "eval_samples_per_second": 8.91, - "eval_steps_per_second": 1.188, + "eval_loss": 0.9942652583122253, + "eval_runtime": 3.3784, + "eval_samples_per_second": 8.88, + "eval_steps_per_second": 1.184, "step": 16 }, { "epoch": 0.8947368421052632, - "grad_norm": 0.8557516932487488, - "learning_rate": 0.0004993910125649561, - "loss": 1.1433, + "grad_norm": 0.7703595161437988, + "learning_rate": 0.0004999776608025946, + "loss": 1.1183, "step": 17 }, { "epoch": 0.8947368421052632, - "eval_loss": 0.9948338866233826, - "eval_runtime": 3.3641, - "eval_samples_per_second": 8.918, - "eval_steps_per_second": 1.189, + "eval_loss": 0.9911304116249084, + "eval_runtime": 3.3747, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.185, "step": 17 }, { "epoch": 0.9473684210526315, - "grad_norm": 1.181545376777649, - "learning_rate": 0.0004975670171853926, - "loss": 1.0207, + "grad_norm": 0.7860077619552612, + "learning_rate": 0.000499910647202696, + "loss": 1.0252, "step": 18 }, { "epoch": 0.9473684210526315, - "eval_loss": 0.959977924823761, - "eval_runtime": 3.3618, - "eval_samples_per_second": 8.924, - "eval_steps_per_second": 1.19, + "eval_loss": 0.951456606388092, + "eval_runtime": 3.3745, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.185, "step": 18 }, { "epoch": 1.0, - "grad_norm": 0.7064942121505737, - "learning_rate": 0.0004945369001834514, - "loss": 1.0768, + "grad_norm": 0.7760164737701416, + "learning_rate": 0.0004997989711765446, + "loss": 1.0673, "step": 19 }, { "epoch": 1.0, - "eval_loss": 0.9442862272262573, - "eval_runtime": 3.3598, - "eval_samples_per_second": 8.929, - "eval_steps_per_second": 1.191, + "eval_loss": 0.9409392476081848, + "eval_runtime": 3.3722, + "eval_samples_per_second": 8.896, + "eval_steps_per_second": 1.186, "step": 19 }, { "epoch": 1.0526315789473684, - "grad_norm": 0.7763754725456238, - "learning_rate": 0.0004903154239845797, - "loss": 0.9409, + "grad_norm": 0.8154846429824829, + "learning_rate": 0.0004996426526821629, + "loss": 0.9349, "step": 20 }, { "epoch": 1.0526315789473684, - "eval_loss": 0.9225653409957886, - "eval_runtime": 3.3593, - "eval_samples_per_second": 8.93, - "eval_steps_per_second": 1.191, + "eval_loss": 0.9243174195289612, + "eval_runtime": 3.364, + "eval_samples_per_second": 8.918, + "eval_steps_per_second": 1.189, "step": 20 }, { "epoch": 1.1052631578947367, - "grad_norm": 0.6782916188240051, - "learning_rate": 0.0004849231551964771, - "loss": 0.9597, + "grad_norm": 0.6889561414718628, + "learning_rate": 0.0004994417196557883, + "loss": 0.964, "step": 21 }, { "epoch": 1.1052631578947367, - "eval_loss": 0.9122769832611084, - "eval_runtime": 3.3624, - "eval_samples_per_second": 8.922, - "eval_steps_per_second": 1.19, + "eval_loss": 0.9147518873214722, + "eval_runtime": 3.3599, + "eval_samples_per_second": 8.929, + "eval_steps_per_second": 1.191, "step": 21 }, { "epoch": 1.1578947368421053, - "grad_norm": 0.638238251209259, - "learning_rate": 0.0004783863644106502, - "loss": 0.9609, + "grad_norm": 0.6425629258155823, + "learning_rate": 0.0004991962080068813, + "loss": 0.9766, "step": 22 }, { "epoch": 1.1578947368421053, - "eval_loss": 0.8951469659805298, - "eval_runtime": 3.3744, - "eval_samples_per_second": 8.891, - "eval_steps_per_second": 1.185, + "eval_loss": 0.896856427192688, + "eval_runtime": 3.3719, + "eval_samples_per_second": 8.897, + "eval_steps_per_second": 1.186, "step": 22 }, { "epoch": 1.2105263157894737, - "grad_norm": 0.6865942478179932, - "learning_rate": 0.00047073689821473173, - "loss": 0.894, + "grad_norm": 0.6940252184867859, + "learning_rate": 0.0004989061616117073, + "loss": 0.9007, "step": 23 }, { "epoch": 1.2105263157894737, - "eval_loss": 0.8961806893348694, - "eval_runtime": 3.3869, - "eval_samples_per_second": 8.858, - "eval_steps_per_second": 1.181, + "eval_loss": 0.8906852602958679, + "eval_runtime": 3.3829, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, "step": 23 }, { "epoch": 1.263157894736842, - "grad_norm": 0.7614845633506775, - "learning_rate": 0.00046201202403910646, - "loss": 0.9654, + "grad_norm": 0.6726508736610413, + "learning_rate": 0.0004985716323054959, + "loss": 0.9631, "step": 24 }, { "epoch": 1.263157894736842, - "eval_loss": 0.9240673184394836, - "eval_runtime": 3.3864, - "eval_samples_per_second": 8.859, - "eval_steps_per_second": 1.181, + "eval_loss": 0.9088860154151917, + "eval_runtime": 3.3738, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.186, "step": 24 }, { "epoch": 1.3157894736842106, - "grad_norm": 0.8841014504432678, - "learning_rate": 0.0004522542485937369, - "loss": 0.8996, + "grad_norm": 0.8016015291213989, + "learning_rate": 0.0004981926798731766, + "loss": 0.8834, "step": 25 }, { "epoch": 1.3157894736842106, - "eval_loss": 0.8987072706222534, - "eval_runtime": 3.3804, - "eval_samples_per_second": 8.875, - "eval_steps_per_second": 1.183, + "eval_loss": 0.8827565312385559, + "eval_runtime": 3.3734, + "eval_samples_per_second": 8.893, + "eval_steps_per_second": 1.186, "step": 25 }, { "epoch": 1.368421052631579, - "grad_norm": 0.695126473903656, - "learning_rate": 0.0004415111107797445, - "loss": 0.9224, + "grad_norm": 0.6770251989364624, + "learning_rate": 0.000497769372038695, + "loss": 0.9293, "step": 26 }, { "epoch": 1.368421052631579, - "eval_loss": 0.8950093388557434, - "eval_runtime": 3.3744, - "eval_samples_per_second": 8.89, + "eval_loss": 0.8824294209480286, + "eval_runtime": 3.3743, + "eval_samples_per_second": 8.891, "eval_steps_per_second": 1.185, "step": 26 }, { "epoch": 1.4210526315789473, - "grad_norm": 0.6917558908462524, - "learning_rate": 0.0004298349500846628, - "loss": 0.8954, + "grad_norm": 0.5907787084579468, + "learning_rate": 0.0004973017844529094, + "loss": 0.8882, "step": 27 }, { "epoch": 1.4210526315789473, - "eval_loss": 0.8965355157852173, - "eval_runtime": 3.3739, - "eval_samples_per_second": 8.892, - "eval_steps_per_second": 1.186, + "eval_loss": 0.8865768313407898, + "eval_runtime": 3.3689, + "eval_samples_per_second": 8.905, + "eval_steps_per_second": 1.187, "step": 27 }, { "epoch": 1.4736842105263157, - "grad_norm": 0.6432511806488037, - "learning_rate": 0.0004172826515897146, - "loss": 0.7978, + "grad_norm": 0.6106719970703125, + "learning_rate": 0.0004967900006800708, + "loss": 0.7885, "step": 28 }, { "epoch": 1.4736842105263157, - "eval_loss": 0.8845272660255432, - "eval_runtime": 3.3701, - "eval_samples_per_second": 8.902, + "eval_loss": 0.8721387982368469, + "eval_runtime": 3.3686, + "eval_samples_per_second": 8.906, "eval_steps_per_second": 1.187, "step": 28 }, { "epoch": 1.526315789473684, - "grad_norm": 0.6906137466430664, - "learning_rate": 0.00040391536883141455, - "loss": 0.9925, + "grad_norm": 0.7571881413459778, + "learning_rate": 0.000496234112182889, + "loss": 0.9921, "step": 29 }, { "epoch": 1.526315789473684, - "eval_loss": 0.8681280016899109, - "eval_runtime": 3.368, - "eval_samples_per_second": 8.907, - "eval_steps_per_second": 1.188, + "eval_loss": 0.8667643666267395, + "eval_runtime": 3.3694, + "eval_samples_per_second": 8.904, + "eval_steps_per_second": 1.187, "step": 29 }, { "epoch": 1.5789473684210527, - "grad_norm": 0.6398982405662537, - "learning_rate": 0.0003897982258676867, - "loss": 0.8644, + "grad_norm": 0.7024618983268738, + "learning_rate": 0.000495634218306187, + "loss": 0.8551, "step": 30 }, { "epoch": 1.5789473684210527, - "eval_loss": 0.857525110244751, - "eval_runtime": 3.3617, - "eval_samples_per_second": 8.924, - "eval_steps_per_second": 1.19, + "eval_loss": 0.8591945767402649, + "eval_runtime": 3.3679, + "eval_samples_per_second": 8.908, + "eval_steps_per_second": 1.188, "step": 30 }, { "epoch": 1.631578947368421, - "grad_norm": 0.6282161474227905, - "learning_rate": 0.000375, - "loss": 0.9207, + "grad_norm": 0.6524854302406311, + "learning_rate": 0.0004949904262591467, + "loss": 0.9371, "step": 31 }, { "epoch": 1.631578947368421, - "eval_loss": 0.8413797616958618, - "eval_runtime": 3.3632, - "eval_samples_per_second": 8.92, - "eval_steps_per_second": 1.189, + "eval_loss": 0.8411317467689514, + "eval_runtime": 3.3624, + "eval_samples_per_second": 8.922, + "eval_steps_per_second": 1.19, "step": 31 }, { "epoch": 1.6842105263157894, - "grad_norm": 0.5699971914291382, - "learning_rate": 0.00035959278669726934, - "loss": 0.8974, + "grad_norm": 0.6261035799980164, + "learning_rate": 0.0004943028510961491, + "loss": 0.9069, "step": 32 }, { "epoch": 1.6842105263157894, - "eval_loss": 0.8179092407226562, - "eval_runtime": 3.3714, - "eval_samples_per_second": 8.898, - "eval_steps_per_second": 1.186, + "eval_loss": 0.8188798427581787, + "eval_runtime": 3.3764, + "eval_samples_per_second": 8.885, + "eval_steps_per_second": 1.185, "step": 32 }, { "epoch": 1.736842105263158, - "grad_norm": 0.7283058762550354, - "learning_rate": 0.00034365164835397803, - "loss": 1.0363, + "grad_norm": 0.6555256247520447, + "learning_rate": 0.0004935716156962127, + "loss": 1.053, "step": 33 }, { "epoch": 1.736842105263158, - "eval_loss": 0.8006649017333984, - "eval_runtime": 3.3726, - "eval_samples_per_second": 8.895, - "eval_steps_per_second": 1.186, + "eval_loss": 0.8079219460487366, + "eval_runtime": 3.386, + "eval_samples_per_second": 8.86, + "eval_steps_per_second": 1.181, "step": 33 }, { "epoch": 1.7894736842105263, - "grad_norm": 0.8358228206634521, - "learning_rate": 0.00032725424859373687, - "loss": 0.8818, + "grad_norm": 0.5852477550506592, + "learning_rate": 0.000492796850741033, + "loss": 0.8765, "step": 34 }, { "epoch": 1.7894736842105263, - "eval_loss": 0.796642005443573, - "eval_runtime": 3.3722, - "eval_samples_per_second": 8.896, - "eval_steps_per_second": 1.186, + "eval_loss": 0.813165545463562, + "eval_runtime": 3.3846, + "eval_samples_per_second": 8.864, + "eval_steps_per_second": 1.182, "step": 34 }, { "epoch": 1.8421052631578947, - "grad_norm": 0.6364978551864624, - "learning_rate": 0.0003104804738999169, - "loss": 0.9305, + "grad_norm": 0.639496922492981, + "learning_rate": 0.0004919786946916281, + "loss": 0.9568, "step": 35 }, { "epoch": 1.8421052631578947, - "eval_loss": 0.7924755215644836, - "eval_runtime": 3.3733, - "eval_samples_per_second": 8.893, - "eval_steps_per_second": 1.186, + "eval_loss": 0.805493950843811, + "eval_runtime": 3.3855, + "eval_samples_per_second": 8.861, + "eval_steps_per_second": 1.182, "step": 35 }, { "epoch": 1.8947368421052633, - "grad_norm": 0.8200335502624512, - "learning_rate": 0.00029341204441673266, - "loss": 0.8827, + "grad_norm": 0.7414811253547668, + "learning_rate": 0.0004911172937635942, + "loss": 0.8907, "step": 36 }, { "epoch": 1.8947368421052633, - "eval_loss": 0.7788340449333191, - "eval_runtime": 3.3722, - "eval_samples_per_second": 8.896, - "eval_steps_per_second": 1.186, + "eval_loss": 0.7844607830047607, + "eval_runtime": 3.3839, + "eval_samples_per_second": 8.866, + "eval_steps_per_second": 1.182, "step": 36 }, { "epoch": 1.9473684210526314, - "grad_norm": 0.775111198425293, - "learning_rate": 0.0002761321158169134, - "loss": 0.9169, + "grad_norm": 0.6827616095542908, + "learning_rate": 0.0004902128019009741, + "loss": 0.9307, "step": 37 }, { "epoch": 1.9473684210526314, - "eval_loss": 0.7667044401168823, - "eval_runtime": 3.3756, - "eval_samples_per_second": 8.887, - "eval_steps_per_second": 1.185, + "eval_loss": 0.7627255320549011, + "eval_runtime": 3.3799, + "eval_samples_per_second": 8.876, + "eval_steps_per_second": 1.183, "step": 37 }, { "epoch": 2.0, - "grad_norm": 0.727277934551239, - "learning_rate": 0.0002587248741756253, - "loss": 1.0112, + "grad_norm": 0.6943689584732056, + "learning_rate": 0.000489265380748746, + "loss": 1.0176, "step": 38 }, { "epoch": 2.0, - "eval_loss": 0.7591570615768433, - "eval_runtime": 3.3728, - "eval_samples_per_second": 8.895, - "eval_steps_per_second": 1.186, + "eval_loss": 0.747063398361206, + "eval_runtime": 3.377, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.184, "step": 38 }, { "epoch": 2.0526315789473686, - "grad_norm": 0.5648457407951355, - "learning_rate": 0.00024127512582437484, - "loss": 0.8317, + "grad_norm": 0.5756209492683411, + "learning_rate": 0.0004882751996239352, + "loss": 0.8116, "step": 39 }, { "epoch": 2.0526315789473686, - "eval_loss": 0.7463916540145874, - "eval_runtime": 3.3708, - "eval_samples_per_second": 8.9, - "eval_steps_per_second": 1.187, + "eval_loss": 0.7395761013031006, + "eval_runtime": 3.3787, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, "step": 39 }, { "epoch": 2.1052631578947367, - "grad_norm": 0.5476389527320862, - "learning_rate": 0.00022386788418308668, - "loss": 0.7733, + "grad_norm": 0.5255556106567383, + "learning_rate": 0.0004872424354853545, + "loss": 0.7513, "step": 40 }, { "epoch": 2.1052631578947367, - "eval_loss": 0.7394412159919739, - "eval_runtime": 3.3669, - "eval_samples_per_second": 8.91, - "eval_steps_per_second": 1.188, + "eval_loss": 0.7269207239151001, + "eval_runtime": 3.3751, + "eval_samples_per_second": 8.889, + "eval_steps_per_second": 1.185, "step": 40 + }, + { + "epoch": 2.1578947368421053, + "grad_norm": 0.4454876482486725, + "learning_rate": 0.0004861672729019797, + "loss": 0.7191, + "step": 41 + }, + { + "epoch": 2.1578947368421053, + "eval_loss": 0.7170627117156982, + "eval_runtime": 3.3598, + "eval_samples_per_second": 8.929, + "eval_steps_per_second": 1.191, + "step": 41 + }, + { + "epoch": 2.2105263157894735, + "grad_norm": 0.47672373056411743, + "learning_rate": 0.0004850499040199643, + "loss": 0.6647, + "step": 42 + }, + { + "epoch": 2.2105263157894735, + "eval_loss": 0.7082312703132629, + "eval_runtime": 3.3769, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.185, + "step": 42 + }, + { + "epoch": 2.263157894736842, + "grad_norm": 0.7529894709587097, + "learning_rate": 0.0004838905285283005, + "loss": 0.7894, + "step": 43 + }, + { + "epoch": 2.263157894736842, + "eval_loss": 0.696483314037323, + "eval_runtime": 3.3804, + "eval_samples_per_second": 8.875, + "eval_steps_per_second": 1.183, + "step": 43 + }, + { + "epoch": 2.3157894736842106, + "grad_norm": 0.9218277931213379, + "learning_rate": 0.00048268935362313215, + "loss": 0.7369, + "step": 44 + }, + { + "epoch": 2.3157894736842106, + "eval_loss": 0.6849729418754578, + "eval_runtime": 3.3802, + "eval_samples_per_second": 8.875, + "eval_steps_per_second": 1.183, + "step": 44 + }, + { + "epoch": 2.3684210526315788, + "grad_norm": 0.5996107459068298, + "learning_rate": 0.00048144659397072586, + "loss": 0.6475, + "step": 45 + }, + { + "epoch": 2.3684210526315788, + "eval_loss": 0.6888956427574158, + "eval_runtime": 3.3809, + "eval_samples_per_second": 8.873, + "eval_steps_per_second": 1.183, + "step": 45 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 0.7164121866226196, + "learning_rate": 0.0004801624716691072, + "loss": 0.7686, + "step": 46 + }, + { + "epoch": 2.4210526315789473, + "eval_loss": 0.6782687902450562, + "eval_runtime": 3.3792, + "eval_samples_per_second": 8.878, + "eval_steps_per_second": 1.184, + "step": 46 + }, + { + "epoch": 2.473684210526316, + "grad_norm": 0.6449366807937622, + "learning_rate": 0.00047883721620836894, + "loss": 0.7207, + "step": 47 + }, + { + "epoch": 2.473684210526316, + "eval_loss": 0.6781312227249146, + "eval_runtime": 3.3778, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 1.184, + "step": 47 + }, + { + "epoch": 2.526315789473684, + "grad_norm": 0.7168515920639038, + "learning_rate": 0.0004774710644296578, + "loss": 0.6626, + "step": 48 + }, + { + "epoch": 2.526315789473684, + "eval_loss": 0.6798893213272095, + "eval_runtime": 3.3753, + "eval_samples_per_second": 8.888, + "eval_steps_per_second": 1.185, + "step": 48 + }, + { + "epoch": 2.5789473684210527, + "grad_norm": 0.990449845790863, + "learning_rate": 0.00047606426048284813, + "loss": 0.7714, + "step": 49 + }, + { + "epoch": 2.5789473684210527, + "eval_loss": 0.6772938966751099, + "eval_runtime": 3.3745, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.185, + "step": 49 + }, + { + "epoch": 2.6315789473684212, + "grad_norm": 0.5531617999076843, + "learning_rate": 0.00047461705578290833, + "loss": 0.6755, + "step": 50 + }, + { + "epoch": 2.6315789473684212, + "eval_loss": 0.6659534573554993, + "eval_runtime": 3.3717, + "eval_samples_per_second": 8.898, + "eval_steps_per_second": 1.186, + "step": 50 + }, + { + "epoch": 2.6842105263157894, + "grad_norm": 0.6090494394302368, + "learning_rate": 0.0004731297089649703, + "loss": 0.7172, + "step": 51 + }, + { + "epoch": 2.6842105263157894, + "eval_loss": 0.6523382067680359, + "eval_runtime": 3.3674, + "eval_samples_per_second": 8.909, + "eval_steps_per_second": 1.188, + "step": 51 + }, + { + "epoch": 2.736842105263158, + "grad_norm": 0.6239104270935059, + "learning_rate": 0.0004716024858381075, + "loss": 0.7175, + "step": 52 + }, + { + "epoch": 2.736842105263158, + "eval_loss": 0.6453693509101868, + "eval_runtime": 3.3771, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 52 + }, + { + "epoch": 2.7894736842105265, + "grad_norm": 0.7118521332740784, + "learning_rate": 0.00047003565933783123, + "loss": 0.8214, + "step": 53 + }, + { + "epoch": 2.7894736842105265, + "eval_loss": 0.6597944498062134, + "eval_runtime": 3.3753, + "eval_samples_per_second": 8.888, + "eval_steps_per_second": 1.185, + "step": 53 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 0.5826541185379028, + "learning_rate": 0.0004684295094773134, + "loss": 0.7019, + "step": 54 + }, + { + "epoch": 2.8421052631578947, + "eval_loss": 0.6597120761871338, + "eval_runtime": 3.3811, + "eval_samples_per_second": 8.873, + "eval_steps_per_second": 1.183, + "step": 54 + }, + { + "epoch": 2.8947368421052633, + "grad_norm": 0.5552948117256165, + "learning_rate": 0.00046678432329734434, + "loss": 0.7227, + "step": 55 + }, + { + "epoch": 2.8947368421052633, + "eval_loss": 0.6556410193443298, + "eval_runtime": 3.3751, + "eval_samples_per_second": 8.889, + "eval_steps_per_second": 1.185, + "step": 55 + }, + { + "epoch": 2.9473684210526314, + "grad_norm": 0.6333251595497131, + "learning_rate": 0.00046510039481503486, + "loss": 0.7787, + "step": 56 + }, + { + "epoch": 2.9473684210526314, + "eval_loss": 0.6368284821510315, + "eval_runtime": 3.3752, + "eval_samples_per_second": 8.888, + "eval_steps_per_second": 1.185, + "step": 56 + }, + { + "epoch": 3.0, + "grad_norm": 0.7100315690040588, + "learning_rate": 0.00046337802497127117, + "loss": 0.764, + "step": 57 + }, + { + "epoch": 3.0, + "eval_loss": 0.6208627223968506, + "eval_runtime": 3.3706, + "eval_samples_per_second": 8.9, + "eval_steps_per_second": 1.187, + "step": 57 + }, + { + "epoch": 3.0526315789473686, + "grad_norm": 0.597659170627594, + "learning_rate": 0.00046161752157693284, + "loss": 0.6185, + "step": 58 + }, + { + "epoch": 3.0526315789473686, + "eval_loss": 0.6139974594116211, + "eval_runtime": 3.3674, + "eval_samples_per_second": 8.909, + "eval_steps_per_second": 1.188, + "step": 58 + }, + { + "epoch": 3.1052631578947367, + "grad_norm": 0.7387836575508118, + "learning_rate": 0.0004598191992578828, + "loss": 0.6251, + "step": 59 + }, + { + "epoch": 3.1052631578947367, + "eval_loss": 0.6229983568191528, + "eval_runtime": 3.37, + "eval_samples_per_second": 8.902, + "eval_steps_per_second": 1.187, + "step": 59 + }, + { + "epoch": 3.1578947368421053, + "grad_norm": 0.9215511083602905, + "learning_rate": 0.00045798337939873923, + "loss": 0.6738, + "step": 60 + }, + { + "epoch": 3.1578947368421053, + "eval_loss": 0.5894501805305481, + "eval_runtime": 3.3696, + "eval_samples_per_second": 8.903, + "eval_steps_per_second": 1.187, + "step": 60 + }, + { + "epoch": 3.2105263157894735, + "grad_norm": 0.6679036021232605, + "learning_rate": 0.0004561103900854401, + "loss": 0.5474, + "step": 61 + }, + { + "epoch": 3.2105263157894735, + "eval_loss": 0.5716174840927124, + "eval_runtime": 3.3568, + "eval_samples_per_second": 8.937, + "eval_steps_per_second": 1.192, + "step": 61 + }, + { + "epoch": 3.263157894736842, + "grad_norm": 0.6849400401115417, + "learning_rate": 0.0004542005660466094, + "loss": 0.6281, + "step": 62 + }, + { + "epoch": 3.263157894736842, + "eval_loss": 0.5605036020278931, + "eval_runtime": 3.3739, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.186, + "step": 62 + }, + { + "epoch": 3.3157894736842106, + "grad_norm": 0.6399258971214294, + "learning_rate": 0.0004522542485937369, + "loss": 0.5701, + "step": 63 + }, + { + "epoch": 3.3157894736842106, + "eval_loss": 0.5636869072914124, + "eval_runtime": 3.3835, + "eval_samples_per_second": 8.867, + "eval_steps_per_second": 1.182, + "step": 63 + }, + { + "epoch": 3.3684210526315788, + "grad_norm": 0.5196012854576111, + "learning_rate": 0.0004502717855601809, + "loss": 0.5712, + "step": 64 + }, + { + "epoch": 3.3684210526315788, + "eval_loss": 0.5717850923538208, + "eval_runtime": 3.3926, + "eval_samples_per_second": 8.843, + "eval_steps_per_second": 1.179, + "step": 64 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 0.640343964099884, + "learning_rate": 0.0004482535312390058, + "loss": 0.5722, + "step": 65 + }, + { + "epoch": 3.4210526315789473, + "eval_loss": 0.5577818751335144, + "eval_runtime": 3.3885, + "eval_samples_per_second": 8.853, + "eval_steps_per_second": 1.18, + "step": 65 + }, + { + "epoch": 3.473684210526316, + "grad_norm": 0.8241782784461975, + "learning_rate": 0.00044619984631966527, + "loss": 0.4595, + "step": 66 + }, + { + "epoch": 3.473684210526316, + "eval_loss": 0.5444250106811523, + "eval_runtime": 3.3826, + "eval_samples_per_second": 8.869, + "eval_steps_per_second": 1.183, + "step": 66 + }, + { + "epoch": 3.526315789473684, + "grad_norm": 0.8776825666427612, + "learning_rate": 0.0004441110978235418, + "loss": 0.6452, + "step": 67 + }, + { + "epoch": 3.526315789473684, + "eval_loss": 0.5407612919807434, + "eval_runtime": 3.3745, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.185, + "step": 67 + }, + { + "epoch": 3.5789473684210527, + "grad_norm": 0.7806735038757324, + "learning_rate": 0.0004419876590383554, + "loss": 0.6386, + "step": 68 + }, + { + "epoch": 3.5789473684210527, + "eval_loss": 0.546351432800293, + "eval_runtime": 3.3647, + "eval_samples_per_second": 8.916, + "eval_steps_per_second": 1.189, + "step": 68 + }, + { + "epoch": 3.6315789473684212, + "grad_norm": 0.6366623640060425, + "learning_rate": 0.00043982990945145146, + "loss": 0.5596, + "step": 69 + }, + { + "epoch": 3.6315789473684212, + "eval_loss": 0.5475739240646362, + "eval_runtime": 3.364, + "eval_samples_per_second": 8.918, + "eval_steps_per_second": 1.189, + "step": 69 + }, + { + "epoch": 3.6842105263157894, + "grad_norm": 0.6691639423370361, + "learning_rate": 0.0004376382346819819, + "loss": 0.577, + "step": 70 + }, + { + "epoch": 3.6842105263157894, + "eval_loss": 0.5291566848754883, + "eval_runtime": 3.3701, + "eval_samples_per_second": 8.902, + "eval_steps_per_second": 1.187, + "step": 70 + }, + { + "epoch": 3.736842105263158, + "grad_norm": 0.6550089120864868, + "learning_rate": 0.00043541302641198946, + "loss": 0.6525, + "step": 71 + }, + { + "epoch": 3.736842105263158, + "eval_loss": 0.5043501257896423, + "eval_runtime": 3.3542, + "eval_samples_per_second": 8.944, + "eval_steps_per_second": 1.193, + "step": 71 + }, + { + "epoch": 3.7894736842105265, + "grad_norm": 0.5108755826950073, + "learning_rate": 0.00043315468231640834, + "loss": 0.5603, + "step": 72 + }, + { + "epoch": 3.7894736842105265, + "eval_loss": 0.5015916228294373, + "eval_runtime": 3.373, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.186, + "step": 72 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 0.63949054479599, + "learning_rate": 0.00043086360599199516, + "loss": 0.5848, + "step": 73 + }, + { + "epoch": 3.8421052631578947, + "eval_loss": 0.4879189729690552, + "eval_runtime": 3.3828, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, + "step": 73 + }, + { + "epoch": 3.8947368421052633, + "grad_norm": 0.6961733102798462, + "learning_rate": 0.0004285402068852002, + "loss": 0.6082, + "step": 74 + }, + { + "epoch": 3.8947368421052633, + "eval_loss": 0.48071128129959106, + "eval_runtime": 3.3853, + "eval_samples_per_second": 8.862, + "eval_steps_per_second": 1.182, + "step": 74 + }, + { + "epoch": 3.9473684210526314, + "grad_norm": 0.5982790589332581, + "learning_rate": 0.00042618490021899383, + "loss": 0.5669, + "step": 75 + }, + { + "epoch": 3.9473684210526314, + "eval_loss": 0.48517391085624695, + "eval_runtime": 3.3794, + "eval_samples_per_second": 8.877, + "eval_steps_per_second": 1.184, + "step": 75 + }, + { + "epoch": 4.0, + "grad_norm": 0.5146693587303162, + "learning_rate": 0.00042379810691866064, + "loss": 0.5407, + "step": 76 + }, + { + "epoch": 4.0, + "eval_loss": 0.49224573373794556, + "eval_runtime": 3.375, + "eval_samples_per_second": 8.889, + "eval_steps_per_second": 1.185, + "step": 76 + }, + { + "epoch": 4.052631578947368, + "grad_norm": 0.5795295238494873, + "learning_rate": 0.00042138025353657407, + "loss": 0.4275, + "step": 77 + }, + { + "epoch": 4.052631578947368, + "eval_loss": 0.46562132239341736, + "eval_runtime": 3.3759, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.185, + "step": 77 + }, + { + "epoch": 4.105263157894737, + "grad_norm": 0.6563876867294312, + "learning_rate": 0.00041893177217596633, + "loss": 0.4278, + "step": 78 + }, + { + "epoch": 4.105263157894737, + "eval_loss": 0.43292027711868286, + "eval_runtime": 3.3665, + "eval_samples_per_second": 8.911, + "eval_steps_per_second": 1.188, + "step": 78 + }, + { + "epoch": 4.157894736842105, + "grad_norm": 0.5894250869750977, + "learning_rate": 0.0004164531004137049, + "loss": 0.399, + "step": 79 + }, + { + "epoch": 4.157894736842105, + "eval_loss": 0.4149712026119232, + "eval_runtime": 3.3714, + "eval_samples_per_second": 8.898, + "eval_steps_per_second": 1.186, + "step": 79 + }, + { + "epoch": 4.2105263157894735, + "grad_norm": 0.6952552795410156, + "learning_rate": 0.0004139446812220924, + "loss": 0.3984, + "step": 80 + }, + { + "epoch": 4.2105263157894735, + "eval_loss": 0.4051551818847656, + "eval_runtime": 3.3691, + "eval_samples_per_second": 8.905, + "eval_steps_per_second": 1.187, + "step": 80 + }, + { + "epoch": 4.2631578947368425, + "grad_norm": 1.0759810209274292, + "learning_rate": 0.0004114069628897006, + "loss": 0.4491, + "step": 81 + }, + { + "epoch": 4.2631578947368425, + "eval_loss": 0.38415494561195374, + "eval_runtime": 3.3619, + "eval_samples_per_second": 8.924, + "eval_steps_per_second": 1.19, + "step": 81 + }, + { + "epoch": 4.315789473684211, + "grad_norm": 0.7915282249450684, + "learning_rate": 0.0004088403989412559, + "loss": 0.4249, + "step": 82 + }, + { + "epoch": 4.315789473684211, + "eval_loss": 0.3752954602241516, + "eval_runtime": 3.3693, + "eval_samples_per_second": 8.904, + "eval_steps_per_second": 1.187, + "step": 82 + }, + { + "epoch": 4.368421052631579, + "grad_norm": 0.8162989020347595, + "learning_rate": 0.00040624544805658794, + "loss": 0.4117, + "step": 83 + }, + { + "epoch": 4.368421052631579, + "eval_loss": 0.3794693648815155, + "eval_runtime": 3.3773, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 83 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 0.838843822479248, + "learning_rate": 0.00040362257398865713, + "loss": 0.479, + "step": 84 + }, + { + "epoch": 4.421052631578947, + "eval_loss": 0.37851518392562866, + "eval_runtime": 3.3773, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 84 + }, + { + "epoch": 4.473684210526316, + "grad_norm": 0.7240565419197083, + "learning_rate": 0.00040097224548067613, + "loss": 0.3968, + "step": 85 + }, + { + "epoch": 4.473684210526316, + "eval_loss": 0.3777390718460083, + "eval_runtime": 3.3799, + "eval_samples_per_second": 8.876, + "eval_steps_per_second": 1.183, + "step": 85 + }, + { + "epoch": 4.526315789473684, + "grad_norm": 0.6041298508644104, + "learning_rate": 0.0003982949361823388, + "loss": 0.3958, + "step": 86 + }, + { + "epoch": 4.526315789473684, + "eval_loss": 0.3891761004924774, + "eval_runtime": 3.3811, + "eval_samples_per_second": 8.873, + "eval_steps_per_second": 1.183, + "step": 86 + }, + { + "epoch": 4.578947368421053, + "grad_norm": 0.7584108710289001, + "learning_rate": 0.0003955911245651726, + "loss": 0.4224, + "step": 87 + }, + { + "epoch": 4.578947368421053, + "eval_loss": 0.387052059173584, + "eval_runtime": 3.3769, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.185, + "step": 87 + }, + { + "epoch": 4.631578947368421, + "grad_norm": 0.6544305682182312, + "learning_rate": 0.0003928612938370292, + "loss": 0.3959, + "step": 88 + }, + { + "epoch": 4.631578947368421, + "eval_loss": 0.3794018626213074, + "eval_runtime": 3.3754, + "eval_samples_per_second": 8.888, + "eval_steps_per_second": 1.185, + "step": 88 + }, + { + "epoch": 4.684210526315789, + "grad_norm": 0.6925466060638428, + "learning_rate": 0.00039010593185572867, + "loss": 0.3221, + "step": 89 + }, + { + "epoch": 4.684210526315789, + "eval_loss": 0.3659925162792206, + "eval_runtime": 3.3793, + "eval_samples_per_second": 8.878, + "eval_steps_per_second": 1.184, + "step": 89 + }, + { + "epoch": 4.7368421052631575, + "grad_norm": 0.6043423414230347, + "learning_rate": 0.00038732553104187296, + "loss": 0.3331, + "step": 90 + }, + { + "epoch": 4.7368421052631575, + "eval_loss": 0.3581882417201996, + "eval_runtime": 3.3757, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.185, + "step": 90 + }, + { + "epoch": 4.7894736842105265, + "grad_norm": 0.8853951692581177, + "learning_rate": 0.0003845205882908432, + "loss": 0.4611, + "step": 91 + }, + { + "epoch": 4.7894736842105265, + "eval_loss": 0.3461619019508362, + "eval_runtime": 3.3654, + "eval_samples_per_second": 8.914, + "eval_steps_per_second": 1.189, + "step": 91 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 0.7234694361686707, + "learning_rate": 0.0003816916048839979, + "loss": 0.3553, + "step": 92 + }, + { + "epoch": 4.842105263157895, + "eval_loss": 0.35203301906585693, + "eval_runtime": 3.3746, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.185, + "step": 92 + }, + { + "epoch": 4.894736842105263, + "grad_norm": 0.8052714467048645, + "learning_rate": 0.0003788390863990875, + "loss": 0.5006, + "step": 93 + }, + { + "epoch": 4.894736842105263, + "eval_loss": 0.35418692231178284, + "eval_runtime": 3.3786, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 93 + }, + { + "epoch": 4.947368421052632, + "grad_norm": 0.6172913312911987, + "learning_rate": 0.00037596354261990007, + "loss": 0.4838, + "step": 94 + }, + { + "epoch": 4.947368421052632, + "eval_loss": 0.3642553687095642, + "eval_runtime": 3.379, + "eval_samples_per_second": 8.878, + "eval_steps_per_second": 1.184, + "step": 94 + }, + { + "epoch": 5.0, + "grad_norm": 1.04489004611969, + "learning_rate": 0.0003730654874451569, + "loss": 0.4829, + "step": 95 + }, + { + "epoch": 5.0, + "eval_loss": 0.3575284779071808, + "eval_runtime": 3.3794, + "eval_samples_per_second": 8.877, + "eval_steps_per_second": 1.184, + "step": 95 + }, + { + "epoch": 5.052631578947368, + "grad_norm": 0.6183101534843445, + "learning_rate": 0.00037014543879667093, + "loss": 0.267, + "step": 96 + }, + { + "epoch": 5.052631578947368, + "eval_loss": 0.3375321924686432, + "eval_runtime": 3.3808, + "eval_samples_per_second": 8.874, + "eval_steps_per_second": 1.183, + "step": 96 + }, + { + "epoch": 5.105263157894737, + "grad_norm": 0.5247947573661804, + "learning_rate": 0.0003672039185267878, + "loss": 0.3576, + "step": 97 + }, + { + "epoch": 5.105263157894737, + "eval_loss": 0.3204007148742676, + "eval_runtime": 3.3778, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 1.184, + "step": 97 + }, + { + "epoch": 5.157894736842105, + "grad_norm": 0.5456224679946899, + "learning_rate": 0.00036424145232512333, + "loss": 0.3071, + "step": 98 + }, + { + "epoch": 5.157894736842105, + "eval_loss": 0.3128179907798767, + "eval_runtime": 3.3777, + "eval_samples_per_second": 8.882, + "eval_steps_per_second": 1.184, + "step": 98 + }, + { + "epoch": 5.2105263157894735, + "grad_norm": 0.6962345838546753, + "learning_rate": 0.0003612585696246158, + "loss": 0.2541, + "step": 99 + }, + { + "epoch": 5.2105263157894735, + "eval_loss": 0.30804356932640076, + "eval_runtime": 3.3769, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.185, + "step": 99 + }, + { + "epoch": 5.2631578947368425, + "grad_norm": 0.8855291604995728, + "learning_rate": 0.0003582558035069091, + "loss": 0.3128, + "step": 100 + }, + { + "epoch": 5.2631578947368425, + "eval_loss": 0.29729241132736206, + "eval_runtime": 3.3756, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.185, + "step": 100 + }, + { + "epoch": 5.315789473684211, + "grad_norm": 0.8910918831825256, + "learning_rate": 0.0003552336906070838, + "loss": 0.3038, + "step": 101 + }, + { + "epoch": 5.315789473684211, + "eval_loss": 0.2814747989177704, + "eval_runtime": 3.3697, + "eval_samples_per_second": 8.903, + "eval_steps_per_second": 1.187, + "step": 101 + }, + { + "epoch": 5.368421052631579, + "grad_norm": 0.6493738293647766, + "learning_rate": 0.000352192771017753, + "loss": 0.2874, + "step": 102 + }, + { + "epoch": 5.368421052631579, + "eval_loss": 0.2786155939102173, + "eval_runtime": 3.3754, + "eval_samples_per_second": 8.888, + "eval_steps_per_second": 1.185, + "step": 102 + }, + { + "epoch": 5.421052631578947, + "grad_norm": 0.6749550104141235, + "learning_rate": 0.0003491335881925407, + "loss": 0.3021, + "step": 103 + }, + { + "epoch": 5.421052631578947, + "eval_loss": 0.27060186862945557, + "eval_runtime": 3.3768, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.185, + "step": 103 + }, + { + "epoch": 5.473684210526316, + "grad_norm": 0.6542089581489563, + "learning_rate": 0.0003460566888489593, + "loss": 0.2936, + "step": 104 + }, + { + "epoch": 5.473684210526316, + "eval_loss": 0.2601727545261383, + "eval_runtime": 3.3792, + "eval_samples_per_second": 8.878, + "eval_steps_per_second": 1.184, + "step": 104 + }, + { + "epoch": 5.526315789473684, + "grad_norm": 0.6920850872993469, + "learning_rate": 0.00034296262287070335, + "loss": 0.263, + "step": 105 + }, + { + "epoch": 5.526315789473684, + "eval_loss": 0.25834792852401733, + "eval_runtime": 3.3788, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 105 + }, + { + "epoch": 5.578947368421053, + "grad_norm": 0.9174802899360657, + "learning_rate": 0.0003398519432093782, + "loss": 0.3511, + "step": 106 + }, + { + "epoch": 5.578947368421053, + "eval_loss": 0.24882812798023224, + "eval_runtime": 3.3807, + "eval_samples_per_second": 8.874, + "eval_steps_per_second": 1.183, + "step": 106 + }, + { + "epoch": 5.631578947368421, + "grad_norm": 0.8116822242736816, + "learning_rate": 0.0003367252057856802, + "loss": 0.2829, + "step": 107 + }, + { + "epoch": 5.631578947368421, + "eval_loss": 0.2406025230884552, + "eval_runtime": 3.377, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.184, + "step": 107 + }, + { + "epoch": 5.684210526315789, + "grad_norm": 0.7639219760894775, + "learning_rate": 0.00033358296939004547, + "loss": 0.2807, + "step": 108 + }, + { + "epoch": 5.684210526315789, + "eval_loss": 0.23070518672466278, + "eval_runtime": 3.3772, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 108 + }, + { + "epoch": 5.7368421052631575, + "grad_norm": 0.6583912372589111, + "learning_rate": 0.00033042579558278717, + "loss": 0.265, + "step": 109 + }, + { + "epoch": 5.7368421052631575, + "eval_loss": 0.22192642092704773, + "eval_runtime": 3.3773, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 109 + }, + { + "epoch": 5.7894736842105265, + "grad_norm": 0.8119845390319824, + "learning_rate": 0.00032725424859373687, + "loss": 0.3047, + "step": 110 + }, + { + "epoch": 5.7894736842105265, + "eval_loss": 0.21517355740070343, + "eval_runtime": 3.3769, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.185, + "step": 110 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 0.730301558971405, + "learning_rate": 0.0003240688952214085, + "loss": 0.2843, + "step": 111 + }, + { + "epoch": 5.842105263157895, + "eval_loss": 0.2068045288324356, + "eval_runtime": 3.37, + "eval_samples_per_second": 8.902, + "eval_steps_per_second": 1.187, + "step": 111 + }, + { + "epoch": 5.894736842105263, + "grad_norm": 0.6950502991676331, + "learning_rate": 0.00032087030473170445, + "loss": 0.2386, + "step": 112 + }, + { + "epoch": 5.894736842105263, + "eval_loss": 0.19935902953147888, + "eval_runtime": 3.3727, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.186, + "step": 112 + }, + { + "epoch": 5.947368421052632, + "grad_norm": 0.6581968665122986, + "learning_rate": 0.00031765904875617973, + "loss": 0.2102, + "step": 113 + }, + { + "epoch": 5.947368421052632, + "eval_loss": 0.19789482653141022, + "eval_runtime": 3.3829, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, + "step": 113 + }, + { + "epoch": 6.0, + "grad_norm": 0.6693621277809143, + "learning_rate": 0.00031443570118988356, + "loss": 0.2301, + "step": 114 + }, + { + "epoch": 6.0, + "eval_loss": 0.20284578204154968, + "eval_runtime": 3.3782, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 1.184, + "step": 114 + }, + { + "epoch": 6.052631578947368, + "grad_norm": 0.919114351272583, + "learning_rate": 0.00031120083808879663, + "loss": 0.2117, + "step": 115 + }, + { + "epoch": 6.052631578947368, + "eval_loss": 0.19495722651481628, + "eval_runtime": 3.3789, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 115 + }, + { + "epoch": 6.105263157894737, + "grad_norm": 0.5941214561462402, + "learning_rate": 0.0003079550375668821, + "loss": 0.1839, + "step": 116 + }, + { + "epoch": 6.105263157894737, + "eval_loss": 0.18467417359352112, + "eval_runtime": 3.3763, + "eval_samples_per_second": 8.885, + "eval_steps_per_second": 1.185, + "step": 116 + }, + { + "epoch": 6.157894736842105, + "grad_norm": 0.6383218169212341, + "learning_rate": 0.00030469887969276877, + "loss": 0.1538, + "step": 117 + }, + { + "epoch": 6.157894736842105, + "eval_loss": 0.17883038520812988, + "eval_runtime": 3.3776, + "eval_samples_per_second": 8.882, + "eval_steps_per_second": 1.184, + "step": 117 + }, + { + "epoch": 6.2105263157894735, + "grad_norm": 0.7167591452598572, + "learning_rate": 0.00030143294638608487, + "loss": 0.1457, + "step": 118 + }, + { + "epoch": 6.2105263157894735, + "eval_loss": 0.173020139336586, + "eval_runtime": 3.3758, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.185, + "step": 118 + }, + { + "epoch": 6.2631578947368425, + "grad_norm": 1.0165019035339355, + "learning_rate": 0.00029815782131346137, + "loss": 0.1968, + "step": 119 + }, + { + "epoch": 6.2631578947368425, + "eval_loss": 0.1655978560447693, + "eval_runtime": 3.3745, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.185, + "step": 119 + }, + { + "epoch": 6.315789473684211, + "grad_norm": 0.7757887244224548, + "learning_rate": 0.0002948740897842223, + "loss": 0.1797, + "step": 120 + }, + { + "epoch": 6.315789473684211, + "eval_loss": 0.1618230789899826, + "eval_runtime": 3.3739, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.186, + "step": 120 + }, + { + "epoch": 6.368421052631579, + "grad_norm": 0.766126275062561, + "learning_rate": 0.00029158233864578256, + "loss": 0.1328, + "step": 121 + }, + { + "epoch": 6.368421052631579, + "eval_loss": 0.16206717491149902, + "eval_runtime": 3.3696, + "eval_samples_per_second": 8.903, + "eval_steps_per_second": 1.187, + "step": 121 + }, + { + "epoch": 6.421052631578947, + "grad_norm": 0.7166094183921814, + "learning_rate": 0.00028828315617877, + "loss": 0.1305, + "step": 122 + }, + { + "epoch": 6.421052631578947, + "eval_loss": 0.15461011230945587, + "eval_runtime": 3.3756, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.185, + "step": 122 + }, + { + "epoch": 6.473684210526316, + "grad_norm": 0.8475157618522644, + "learning_rate": 0.0002849771319918922, + "loss": 0.2097, + "step": 123 + }, + { + "epoch": 6.473684210526316, + "eval_loss": 0.1477242410182953, + "eval_runtime": 3.3806, + "eval_samples_per_second": 8.874, + "eval_steps_per_second": 1.183, + "step": 123 + }, + { + "epoch": 6.526315789473684, + "grad_norm": 0.7089868783950806, + "learning_rate": 0.00028166485691656423, + "loss": 0.1661, + "step": 124 + }, + { + "epoch": 6.526315789473684, + "eval_loss": 0.14408369362354279, + "eval_runtime": 3.3756, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.185, + "step": 124 + }, + { + "epoch": 6.578947368421053, + "grad_norm": 0.5744835734367371, + "learning_rate": 0.00027834692290132053, + "loss": 0.1315, + "step": 125 + }, + { + "epoch": 6.578947368421053, + "eval_loss": 0.13885816931724548, + "eval_runtime": 3.3819, + "eval_samples_per_second": 8.871, + "eval_steps_per_second": 1.183, + "step": 125 + }, + { + "epoch": 6.631578947368421, + "grad_norm": 0.6334472298622131, + "learning_rate": 0.0002750239229060246, + "loss": 0.1052, + "step": 126 + }, + { + "epoch": 6.631578947368421, + "eval_loss": 0.13384667038917542, + "eval_runtime": 3.381, + "eval_samples_per_second": 8.873, + "eval_steps_per_second": 1.183, + "step": 126 + }, + { + "epoch": 6.684210526315789, + "grad_norm": 0.6963412761688232, + "learning_rate": 0.0002716964507958994, + "loss": 0.1668, + "step": 127 + }, + { + "epoch": 6.684210526315789, + "eval_loss": 0.11921342462301254, + "eval_runtime": 3.3807, + "eval_samples_per_second": 8.874, + "eval_steps_per_second": 1.183, + "step": 127 + }, + { + "epoch": 6.7368421052631575, + "grad_norm": 0.7505935430526733, + "learning_rate": 0.0002683651012353955, + "loss": 0.1627, + "step": 128 + }, + { + "epoch": 6.7368421052631575, + "eval_loss": 0.11014129221439362, + "eval_runtime": 3.3787, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 128 + }, + { + "epoch": 6.7894736842105265, + "grad_norm": 0.674117922782898, + "learning_rate": 0.0002650304695819168, + "loss": 0.1501, + "step": 129 + }, + { + "epoch": 6.7894736842105265, + "eval_loss": 0.10147183388471603, + "eval_runtime": 3.3743, + "eval_samples_per_second": 8.891, + "eval_steps_per_second": 1.185, + "step": 129 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 0.7658000588417053, + "learning_rate": 0.00026169315177942135, + "loss": 0.227, + "step": 130 + }, + { + "epoch": 6.842105263157895, + "eval_loss": 0.09033489227294922, + "eval_runtime": 3.3725, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.186, + "step": 130 + }, + { + "epoch": 6.894736842105263, + "grad_norm": 0.5885961651802063, + "learning_rate": 0.0002583537442519187, + "loss": 0.1572, + "step": 131 + }, + { + "epoch": 6.894736842105263, + "eval_loss": 0.0783379077911377, + "eval_runtime": 3.3652, + "eval_samples_per_second": 8.915, + "eval_steps_per_second": 1.189, + "step": 131 + }, + { + "epoch": 6.947368421052632, + "grad_norm": 0.6617143750190735, + "learning_rate": 0.00025501284379688067, + "loss": 0.1475, + "step": 132 + }, + { + "epoch": 6.947368421052632, + "eval_loss": 0.0756978839635849, + "eval_runtime": 3.3713, + "eval_samples_per_second": 8.899, + "eval_steps_per_second": 1.186, + "step": 132 + }, + { + "epoch": 7.0, + "grad_norm": 0.6922865509986877, + "learning_rate": 0.0002516710474785856, + "loss": 0.1381, + "step": 133 + }, + { + "epoch": 7.0, + "eval_loss": 0.07574369013309479, + "eval_runtime": 3.3776, + "eval_samples_per_second": 8.882, + "eval_steps_per_second": 1.184, + "step": 133 + }, + { + "epoch": 7.052631578947368, + "grad_norm": 0.44237321615219116, + "learning_rate": 0.0002483289525214145, + "loss": 0.056, + "step": 134 + }, + { + "epoch": 7.052631578947368, + "eval_loss": 0.07430978864431381, + "eval_runtime": 3.379, + "eval_samples_per_second": 8.878, + "eval_steps_per_second": 1.184, + "step": 134 + }, + { + "epoch": 7.105263157894737, + "grad_norm": 0.4937044680118561, + "learning_rate": 0.00024498715620311935, + "loss": 0.0546, + "step": 135 + }, + { + "epoch": 7.105263157894737, + "eval_loss": 0.07223793864250183, + "eval_runtime": 3.3798, + "eval_samples_per_second": 8.876, + "eval_steps_per_second": 1.183, + "step": 135 + }, + { + "epoch": 7.157894736842105, + "grad_norm": 0.843362033367157, + "learning_rate": 0.00024164625574808144, + "loss": 0.0676, + "step": 136 + }, + { + "epoch": 7.157894736842105, + "eval_loss": 0.0715818703174591, + "eval_runtime": 3.3784, + "eval_samples_per_second": 8.88, + "eval_steps_per_second": 1.184, + "step": 136 + }, + { + "epoch": 7.2105263157894735, + "grad_norm": 0.5508689284324646, + "learning_rate": 0.00023830684822057877, + "loss": 0.0849, + "step": 137 + }, + { + "epoch": 7.2105263157894735, + "eval_loss": 0.0706930160522461, + "eval_runtime": 3.3794, + "eval_samples_per_second": 8.877, + "eval_steps_per_second": 1.184, + "step": 137 + }, + { + "epoch": 7.2631578947368425, + "grad_norm": 0.45906394720077515, + "learning_rate": 0.00023496953041808325, + "loss": 0.037, + "step": 138 + }, + { + "epoch": 7.2631578947368425, + "eval_loss": 0.06921949237585068, + "eval_runtime": 3.38, + "eval_samples_per_second": 8.876, + "eval_steps_per_second": 1.183, + "step": 138 + }, + { + "epoch": 7.315789473684211, + "grad_norm": 0.9073092341423035, + "learning_rate": 0.0002316348987646045, + "loss": 0.0821, + "step": 139 + }, + { + "epoch": 7.315789473684211, + "eval_loss": 0.06192217394709587, + "eval_runtime": 3.3725, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.186, + "step": 139 + }, + { + "epoch": 7.368421052631579, + "grad_norm": 0.6644508242607117, + "learning_rate": 0.00022830354920410064, + "loss": 0.0687, + "step": 140 + }, + { + "epoch": 7.368421052631579, + "eval_loss": 0.06147347763180733, + "eval_runtime": 3.3708, + "eval_samples_per_second": 8.9, + "eval_steps_per_second": 1.187, + "step": 140 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 0.6074717044830322, + "learning_rate": 0.0002249760770939754, + "loss": 0.0494, + "step": 141 + }, + { + "epoch": 7.421052631578947, + "eval_loss": 0.06366048008203506, + "eval_runtime": 3.3644, + "eval_samples_per_second": 8.917, + "eval_steps_per_second": 1.189, + "step": 141 + }, + { + "epoch": 7.473684210526316, + "grad_norm": 0.5519073009490967, + "learning_rate": 0.0002216530770986795, + "loss": 0.0569, + "step": 142 + }, + { + "epoch": 7.473684210526316, + "eval_loss": 0.060868822038173676, + "eval_runtime": 3.3772, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 142 + }, + { + "epoch": 7.526315789473684, + "grad_norm": 0.5936044454574585, + "learning_rate": 0.0002183351430834358, + "loss": 0.0529, + "step": 143 + }, + { + "epoch": 7.526315789473684, + "eval_loss": 0.060183968394994736, + "eval_runtime": 3.3842, + "eval_samples_per_second": 8.865, + "eval_steps_per_second": 1.182, + "step": 143 + }, + { + "epoch": 7.578947368421053, + "grad_norm": 0.4775611162185669, + "learning_rate": 0.0002150228680081079, + "loss": 0.0606, + "step": 144 + }, + { + "epoch": 7.578947368421053, + "eval_loss": 0.060147836804389954, + "eval_runtime": 3.377, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.184, + "step": 144 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 0.5507313013076782, + "learning_rate": 0.00021171684382123, + "loss": 0.0515, + "step": 145 + }, + { + "epoch": 7.631578947368421, + "eval_loss": 0.05933203548192978, + "eval_runtime": 3.3739, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.186, + "step": 145 + }, + { + "epoch": 7.684210526315789, + "grad_norm": 0.6271359324455261, + "learning_rate": 0.0002084176613542175, + "loss": 0.0768, + "step": 146 + }, + { + "epoch": 7.684210526315789, + "eval_loss": 0.05137402191758156, + "eval_runtime": 3.3705, + "eval_samples_per_second": 8.901, + "eval_steps_per_second": 1.187, + "step": 146 + }, + { + "epoch": 7.7368421052631575, + "grad_norm": 0.511416494846344, + "learning_rate": 0.00020512591021577773, + "loss": 0.0559, + "step": 147 + }, + { + "epoch": 7.7368421052631575, + "eval_loss": 0.04617203772068024, + "eval_runtime": 3.3685, + "eval_samples_per_second": 8.906, + "eval_steps_per_second": 1.187, + "step": 147 + }, + { + "epoch": 7.7894736842105265, + "grad_norm": 0.43504372239112854, + "learning_rate": 0.00020184217868653867, + "loss": 0.0495, + "step": 148 + }, + { + "epoch": 7.7894736842105265, + "eval_loss": 0.039726559072732925, + "eval_runtime": 3.3679, + "eval_samples_per_second": 8.907, + "eval_steps_per_second": 1.188, + "step": 148 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 0.5811814069747925, + "learning_rate": 0.0001985670536139151, + "loss": 0.0741, + "step": 149 + }, + { + "epoch": 7.842105263157895, + "eval_loss": 0.03227859362959862, + "eval_runtime": 3.3641, + "eval_samples_per_second": 8.918, + "eval_steps_per_second": 1.189, + "step": 149 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 0.41861817240715027, + "learning_rate": 0.0001953011203072312, + "loss": 0.0477, + "step": 150 + }, + { + "epoch": 7.894736842105263, + "eval_loss": 0.02963736467063427, + "eval_runtime": 3.3713, + "eval_samples_per_second": 8.899, + "eval_steps_per_second": 1.186, + "step": 150 + }, + { + "epoch": 7.947368421052632, + "grad_norm": 0.610363245010376, + "learning_rate": 0.00019204496243311792, + "loss": 0.0794, + "step": 151 + }, + { + "epoch": 7.947368421052632, + "eval_loss": 0.02523585595190525, + "eval_runtime": 3.3702, + "eval_samples_per_second": 8.902, + "eval_steps_per_second": 1.187, + "step": 151 + }, + { + "epoch": 8.0, + "grad_norm": 0.6150274276733398, + "learning_rate": 0.00018879916191120349, + "loss": 0.0613, + "step": 152 + }, + { + "epoch": 8.0, + "eval_loss": 0.021767113357782364, + "eval_runtime": 3.3739, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.186, + "step": 152 + }, + { + "epoch": 8.052631578947368, + "grad_norm": 0.23665758967399597, + "learning_rate": 0.00018556429881011656, + "loss": 0.0141, + "step": 153 + }, + { + "epoch": 8.052631578947368, + "eval_loss": 0.021793341264128685, + "eval_runtime": 3.3739, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 1.186, + "step": 153 + }, + { + "epoch": 8.105263157894736, + "grad_norm": 0.27234676480293274, + "learning_rate": 0.0001823409512438203, + "loss": 0.019, + "step": 154 + }, + { + "epoch": 8.105263157894736, + "eval_loss": 0.021968627348542213, + "eval_runtime": 3.3796, + "eval_samples_per_second": 8.877, + "eval_steps_per_second": 1.184, + "step": 154 + }, + { + "epoch": 8.157894736842104, + "grad_norm": 0.1709115207195282, + "learning_rate": 0.00017912969526829559, + "loss": 0.0113, + "step": 155 + }, + { + "epoch": 8.157894736842104, + "eval_loss": 0.02176038548350334, + "eval_runtime": 3.382, + "eval_samples_per_second": 8.87, + "eval_steps_per_second": 1.183, + "step": 155 + }, + { + "epoch": 8.210526315789474, + "grad_norm": 0.27976280450820923, + "learning_rate": 0.00017593110477859153, + "loss": 0.0208, + "step": 156 + }, + { + "epoch": 8.210526315789474, + "eval_loss": 0.01957644335925579, + "eval_runtime": 3.3846, + "eval_samples_per_second": 8.864, + "eval_steps_per_second": 1.182, + "step": 156 + }, + { + "epoch": 8.263157894736842, + "grad_norm": 0.38411393761634827, + "learning_rate": 0.00017274575140626317, + "loss": 0.018, + "step": 157 + }, + { + "epoch": 8.263157894736842, + "eval_loss": 0.01841222681105137, + "eval_runtime": 3.3829, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, + "step": 157 + }, + { + "epoch": 8.31578947368421, + "grad_norm": 0.28138333559036255, + "learning_rate": 0.00016957420441721284, + "loss": 0.0121, + "step": 158 + }, + { + "epoch": 8.31578947368421, + "eval_loss": 0.01750197820365429, + "eval_runtime": 3.3767, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.185, + "step": 158 + }, + { + "epoch": 8.368421052631579, + "grad_norm": 0.3294574022293091, + "learning_rate": 0.00016641703060995457, + "loss": 0.0167, + "step": 159 + }, + { + "epoch": 8.368421052631579, + "eval_loss": 0.01637749746441841, + "eval_runtime": 3.3774, + "eval_samples_per_second": 8.882, + "eval_steps_per_second": 1.184, + "step": 159 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 0.3949958384037018, + "learning_rate": 0.00016327479421431983, + "loss": 0.0245, + "step": 160 + }, + { + "epoch": 8.421052631578947, + "eval_loss": 0.014502634294331074, + "eval_runtime": 3.3714, + "eval_samples_per_second": 8.898, + "eval_steps_per_second": 1.186, + "step": 160 + }, + { + "epoch": 8.473684210526315, + "grad_norm": 0.4900204539299011, + "learning_rate": 0.00016014805679062183, + "loss": 0.0262, + "step": 161 + }, + { + "epoch": 8.473684210526315, + "eval_loss": 0.012611261568963528, + "eval_runtime": 3.3701, + "eval_samples_per_second": 8.902, + "eval_steps_per_second": 1.187, + "step": 161 + }, + { + "epoch": 8.526315789473685, + "grad_norm": 0.34749501943588257, + "learning_rate": 0.0001570373771292967, + "loss": 0.0156, + "step": 162 + }, + { + "epoch": 8.526315789473685, + "eval_loss": 0.012049918994307518, + "eval_runtime": 3.3768, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.185, + "step": 162 + }, + { + "epoch": 8.578947368421053, + "grad_norm": 0.3857935667037964, + "learning_rate": 0.00015394331115104075, + "loss": 0.023, + "step": 163 + }, + { + "epoch": 8.578947368421053, + "eval_loss": 0.010976273566484451, + "eval_runtime": 3.378, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 1.184, + "step": 163 + }, + { + "epoch": 8.631578947368421, + "grad_norm": 0.3299902379512787, + "learning_rate": 0.00015086641180745932, + "loss": 0.0159, + "step": 164 + }, + { + "epoch": 8.631578947368421, + "eval_loss": 0.010536915622651577, + "eval_runtime": 3.3822, + "eval_samples_per_second": 8.87, + "eval_steps_per_second": 1.183, + "step": 164 + }, + { + "epoch": 8.68421052631579, + "grad_norm": 0.40735048055648804, + "learning_rate": 0.00014780722898224708, + "loss": 0.0247, + "step": 165 + }, + { + "epoch": 8.68421052631579, + "eval_loss": 0.008787935599684715, + "eval_runtime": 3.3787, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 165 + }, + { + "epoch": 8.736842105263158, + "grad_norm": 0.24645276367664337, + "learning_rate": 0.0001447663093929163, + "loss": 0.0146, + "step": 166 + }, + { + "epoch": 8.736842105263158, + "eval_loss": 0.008263031020760536, + "eval_runtime": 3.3807, + "eval_samples_per_second": 8.874, + "eval_steps_per_second": 1.183, + "step": 166 + }, + { + "epoch": 8.789473684210526, + "grad_norm": 0.22991032898426056, + "learning_rate": 0.00014174419649309089, + "loss": 0.0103, + "step": 167 + }, + { + "epoch": 8.789473684210526, + "eval_loss": 0.008267836645245552, + "eval_runtime": 3.3789, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 167 + }, + { + "epoch": 8.842105263157894, + "grad_norm": 0.20900578796863556, + "learning_rate": 0.00013874143037538418, + "loss": 0.0107, + "step": 168 + }, + { + "epoch": 8.842105263157894, + "eval_loss": 0.008572892285883427, + "eval_runtime": 3.3801, + "eval_samples_per_second": 8.875, + "eval_steps_per_second": 1.183, + "step": 168 + }, + { + "epoch": 8.894736842105264, + "grad_norm": 0.30114251375198364, + "learning_rate": 0.0001357585476748766, + "loss": 0.0213, + "step": 169 + }, + { + "epoch": 8.894736842105264, + "eval_loss": 0.006935921497642994, + "eval_runtime": 3.376, + "eval_samples_per_second": 8.886, + "eval_steps_per_second": 1.185, + "step": 169 + }, + { + "epoch": 8.947368421052632, + "grad_norm": 0.21789942681789398, + "learning_rate": 0.00013279608147321223, + "loss": 0.0079, + "step": 170 + }, + { + "epoch": 8.947368421052632, + "eval_loss": 0.006833930965512991, + "eval_runtime": 3.3765, + "eval_samples_per_second": 8.885, + "eval_steps_per_second": 1.185, + "step": 170 + }, + { + "epoch": 9.0, + "grad_norm": 0.4003734886646271, + "learning_rate": 0.00012985456120332905, + "loss": 0.0176, + "step": 171 + }, + { + "epoch": 9.0, + "eval_loss": 0.006935244891792536, + "eval_runtime": 3.3641, + "eval_samples_per_second": 8.918, + "eval_steps_per_second": 1.189, + "step": 171 + }, + { + "epoch": 9.052631578947368, + "grad_norm": 0.1696169078350067, + "learning_rate": 0.00012693451255484312, + "loss": 0.0033, + "step": 172 + }, + { + "epoch": 9.052631578947368, + "eval_loss": 0.00692379754036665, + "eval_runtime": 3.3823, + "eval_samples_per_second": 8.87, + "eval_steps_per_second": 1.183, + "step": 172 + }, + { + "epoch": 9.105263157894736, + "grad_norm": 0.1699666529893875, + "learning_rate": 0.00012403645738009997, + "loss": 0.0072, + "step": 173 + }, + { + "epoch": 9.105263157894736, + "eval_loss": 0.006103164982050657, + "eval_runtime": 3.3831, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, + "step": 173 + }, + { + "epoch": 9.157894736842104, + "grad_norm": 0.1330655813217163, + "learning_rate": 0.00012116091360091261, + "loss": 0.0039, + "step": 174 + }, + { + "epoch": 9.157894736842104, + "eval_loss": 0.005647609941661358, + "eval_runtime": 3.3762, + "eval_samples_per_second": 8.886, + "eval_steps_per_second": 1.185, + "step": 174 + }, + { + "epoch": 9.210526315789474, + "grad_norm": 0.09521008282899857, + "learning_rate": 0.00011830839511600211, + "loss": 0.0034, + "step": 175 + }, + { + "epoch": 9.210526315789474, + "eval_loss": 0.005605170503258705, + "eval_runtime": 3.3771, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 175 + }, + { + "epoch": 9.263157894736842, + "grad_norm": 0.07198765128850937, + "learning_rate": 0.00011547941170915685, + "loss": 0.0046, + "step": 176 + }, + { + "epoch": 9.263157894736842, + "eval_loss": 0.00548657588660717, + "eval_runtime": 3.3809, + "eval_samples_per_second": 8.873, + "eval_steps_per_second": 1.183, + "step": 176 + }, + { + "epoch": 9.31578947368421, + "grad_norm": 0.19620288908481598, + "learning_rate": 0.00011267446895812702, + "loss": 0.0054, + "step": 177 + }, + { + "epoch": 9.31578947368421, + "eval_loss": 0.005347335711121559, + "eval_runtime": 3.378, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 1.184, + "step": 177 + }, + { + "epoch": 9.368421052631579, + "grad_norm": 0.12617181241512299, + "learning_rate": 0.0001098940681442713, + "loss": 0.0105, + "step": 178 + }, + { + "epoch": 9.368421052631579, + "eval_loss": 0.005180263426154852, + "eval_runtime": 3.3687, + "eval_samples_per_second": 8.905, + "eval_steps_per_second": 1.187, + "step": 178 + }, + { + "epoch": 9.421052631578947, + "grad_norm": 0.11038723587989807, + "learning_rate": 0.00010713870616297092, + "loss": 0.0039, + "step": 179 + }, + { + "epoch": 9.421052631578947, + "eval_loss": 0.005051607731729746, + "eval_runtime": 3.372, + "eval_samples_per_second": 8.897, + "eval_steps_per_second": 1.186, + "step": 179 + }, + { + "epoch": 9.473684210526315, + "grad_norm": 0.17400430142879486, + "learning_rate": 0.00010440887543482746, + "loss": 0.0073, + "step": 180 + }, + { + "epoch": 9.473684210526315, + "eval_loss": 0.004222193732857704, + "eval_runtime": 3.369, + "eval_samples_per_second": 8.905, + "eval_steps_per_second": 1.187, + "step": 180 + }, + { + "epoch": 9.526315789473685, + "grad_norm": 0.14349240064620972, + "learning_rate": 0.0001017050638176612, + "loss": 0.0062, + "step": 181 + }, + { + "epoch": 9.526315789473685, + "eval_loss": 0.0033600516617298126, + "eval_runtime": 3.3638, + "eval_samples_per_second": 8.919, + "eval_steps_per_second": 1.189, + "step": 181 + }, + { + "epoch": 9.578947368421053, + "grad_norm": 0.06090744957327843, + "learning_rate": 9.902775451932386e-05, + "loss": 0.0021, + "step": 182 + }, + { + "epoch": 9.578947368421053, + "eval_loss": 0.0029148412868380547, + "eval_runtime": 3.3733, + "eval_samples_per_second": 8.893, + "eval_steps_per_second": 1.186, + "step": 182 + }, + { + "epoch": 9.631578947368421, + "grad_norm": 0.08567659556865692, + "learning_rate": 9.637742601134286e-05, + "loss": 0.0051, + "step": 183 + }, + { + "epoch": 9.631578947368421, + "eval_loss": 0.0028052127454429865, + "eval_runtime": 3.378, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 1.184, + "step": 183 + }, + { + "epoch": 9.68421052631579, + "grad_norm": 0.11216646432876587, + "learning_rate": 9.375455194341214e-05, + "loss": 0.0041, + "step": 184 + }, + { + "epoch": 9.68421052631579, + "eval_loss": 0.0025057741440832615, + "eval_runtime": 3.3825, + "eval_samples_per_second": 8.869, + "eval_steps_per_second": 1.183, + "step": 184 + }, + { + "epoch": 9.736842105263158, + "grad_norm": 0.13728925585746765, + "learning_rate": 9.11596010587441e-05, + "loss": 0.0041, + "step": 185 + }, + { + "epoch": 9.736842105263158, + "eval_loss": 0.0024123352486640215, + "eval_runtime": 3.3829, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, + "step": 185 + }, + { + "epoch": 9.789473684210526, + "grad_norm": 0.1449730098247528, + "learning_rate": 8.85930371102994e-05, + "loss": 0.0119, + "step": 186 + }, + { + "epoch": 9.789473684210526, + "eval_loss": 0.002238348126411438, + "eval_runtime": 3.3801, + "eval_samples_per_second": 8.875, + "eval_steps_per_second": 1.183, + "step": 186 + }, + { + "epoch": 9.842105263157894, + "grad_norm": 0.24199621379375458, + "learning_rate": 8.605531877790762e-05, + "loss": 0.0067, + "step": 187 + }, + { + "epoch": 9.842105263157894, + "eval_loss": 0.002172964159399271, + "eval_runtime": 3.3833, + "eval_samples_per_second": 8.867, + "eval_steps_per_second": 1.182, + "step": 187 + }, + { + "epoch": 9.894736842105264, + "grad_norm": 0.09489741921424866, + "learning_rate": 8.354689958629513e-05, + "loss": 0.003, + "step": 188 + }, + { + "epoch": 9.894736842105264, + "eval_loss": 0.0021200678311288357, + "eval_runtime": 3.378, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 1.184, + "step": 188 + }, + { + "epoch": 9.947368421052632, + "grad_norm": 0.053513191640377045, + "learning_rate": 8.106822782403376e-05, + "loss": 0.0014, + "step": 189 + }, + { + "epoch": 9.947368421052632, + "eval_loss": 0.0020873628091067076, + "eval_runtime": 3.3756, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.185, + "step": 189 + }, + { + "epoch": 10.0, + "grad_norm": 0.11307157576084137, + "learning_rate": 7.861974646342596e-05, + "loss": 0.0043, + "step": 190 + }, + { + "epoch": 10.0, + "eval_loss": 0.0019308760529384017, + "eval_runtime": 3.3719, + "eval_samples_per_second": 8.897, + "eval_steps_per_second": 1.186, + "step": 190 + }, + { + "epoch": 10.052631578947368, + "grad_norm": 0.021253453567624092, + "learning_rate": 7.620189308133943e-05, + "loss": 0.0012, + "step": 191 + }, + { + "epoch": 10.052631578947368, + "eval_loss": 0.001897598267532885, + "eval_runtime": 3.3667, + "eval_samples_per_second": 8.911, + "eval_steps_per_second": 1.188, + "step": 191 + }, + { + "epoch": 10.105263157894736, + "grad_norm": 0.0331900380551815, + "learning_rate": 7.381509978100626e-05, + "loss": 0.0014, + "step": 192 + }, + { + "epoch": 10.105263157894736, + "eval_loss": 0.0019330073846504092, + "eval_runtime": 3.383, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, + "step": 192 + }, + { + "epoch": 10.157894736842104, + "grad_norm": 0.04711827635765076, + "learning_rate": 7.145979311479986e-05, + "loss": 0.0021, + "step": 193 + }, + { + "epoch": 10.157894736842104, + "eval_loss": 0.001935520558618009, + "eval_runtime": 3.3784, + "eval_samples_per_second": 8.88, + "eval_steps_per_second": 1.184, + "step": 193 + }, + { + "epoch": 10.210526315789474, + "grad_norm": 0.022507954388856888, + "learning_rate": 6.913639400800489e-05, + "loss": 0.0011, + "step": 194 + }, + { + "epoch": 10.210526315789474, + "eval_loss": 0.0018882205476984382, + "eval_runtime": 3.3787, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 194 + }, + { + "epoch": 10.263157894736842, + "grad_norm": 0.037808384746313095, + "learning_rate": 6.684531768359173e-05, + "loss": 0.0018, + "step": 195 + }, + { + "epoch": 10.263157894736842, + "eval_loss": 0.0018376539228484035, + "eval_runtime": 3.381, + "eval_samples_per_second": 8.873, + "eval_steps_per_second": 1.183, + "step": 195 + }, + { + "epoch": 10.31578947368421, + "grad_norm": 0.013078493997454643, + "learning_rate": 6.458697358801061e-05, + "loss": 0.0008, + "step": 196 + }, + { + "epoch": 10.31578947368421, + "eval_loss": 0.0018605777295306325, + "eval_runtime": 3.3801, + "eval_samples_per_second": 8.876, + "eval_steps_per_second": 1.183, + "step": 196 + }, + { + "epoch": 10.368421052631579, + "grad_norm": 0.10678137093782425, + "learning_rate": 6.236176531801813e-05, + "loss": 0.0025, + "step": 197 + }, + { + "epoch": 10.368421052631579, + "eval_loss": 0.0017640648875385523, + "eval_runtime": 3.3773, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 197 + }, + { + "epoch": 10.421052631578947, + "grad_norm": 0.028472477570176125, + "learning_rate": 6.017009054854858e-05, + "loss": 0.0013, + "step": 198 + }, + { + "epoch": 10.421052631578947, + "eval_loss": 0.0017073862254619598, + "eval_runtime": 3.38, + "eval_samples_per_second": 8.876, + "eval_steps_per_second": 1.183, + "step": 198 + }, + { + "epoch": 10.473684210526315, + "grad_norm": 0.092536062002182, + "learning_rate": 5.801234096164468e-05, + "loss": 0.0029, + "step": 199 + }, + { + "epoch": 10.473684210526315, + "eval_loss": 0.0017115242080762982, + "eval_runtime": 3.3757, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 1.185, + "step": 199 + }, + { + "epoch": 10.526315789473685, + "grad_norm": 0.06385708600282669, + "learning_rate": 5.58889021764582e-05, + "loss": 0.0025, + "step": 200 + }, + { + "epoch": 10.526315789473685, + "eval_loss": 0.0015509836375713348, + "eval_runtime": 3.3775, + "eval_samples_per_second": 8.882, + "eval_steps_per_second": 1.184, + "step": 200 + }, + { + "epoch": 10.578947368421053, + "grad_norm": 0.01563839800655842, + "learning_rate": 5.3800153680334754e-05, + "loss": 0.0009, + "step": 201 + }, + { + "epoch": 10.578947368421053, + "eval_loss": 0.00150794826913625, + "eval_runtime": 3.3669, + "eval_samples_per_second": 8.91, + "eval_steps_per_second": 1.188, + "step": 201 + }, + { + "epoch": 10.631578947368421, + "grad_norm": 0.023204339668154716, + "learning_rate": 5.17464687609942e-05, + "loss": 0.0013, + "step": 202 + }, + { + "epoch": 10.631578947368421, + "eval_loss": 0.0015423657605424523, + "eval_runtime": 3.3837, + "eval_samples_per_second": 8.866, + "eval_steps_per_second": 1.182, + "step": 202 + }, + { + "epoch": 10.68421052631579, + "grad_norm": 0.06458425521850586, + "learning_rate": 4.97282144398192e-05, + "loss": 0.0019, + "step": 203 + }, + { + "epoch": 10.68421052631579, + "eval_loss": 0.0014141839928925037, + "eval_runtime": 3.3837, + "eval_samples_per_second": 8.866, + "eval_steps_per_second": 1.182, + "step": 203 + }, + { + "epoch": 10.736842105263158, + "grad_norm": 0.11526883393526077, + "learning_rate": 4.7745751406263163e-05, + "loss": 0.0057, + "step": 204 + }, + { + "epoch": 10.736842105263158, + "eval_loss": 0.0014135593082755804, + "eval_runtime": 3.3846, + "eval_samples_per_second": 8.864, + "eval_steps_per_second": 1.182, + "step": 204 + }, + { + "epoch": 10.789473684210526, + "grad_norm": 0.016495322808623314, + "learning_rate": 4.5799433953390616e-05, + "loss": 0.001, + "step": 205 + }, + { + "epoch": 10.789473684210526, + "eval_loss": 0.0013517830520868301, + "eval_runtime": 3.3801, + "eval_samples_per_second": 8.876, + "eval_steps_per_second": 1.183, + "step": 205 + }, + { + "epoch": 10.842105263157894, + "grad_norm": 0.014301777817308903, + "learning_rate": 4.388960991455998e-05, + "loss": 0.0009, + "step": 206 + }, + { + "epoch": 10.842105263157894, + "eval_loss": 0.0013313922099769115, + "eval_runtime": 3.3771, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 206 + }, + { + "epoch": 10.894736842105264, + "grad_norm": 0.136412113904953, + "learning_rate": 4.2016620601260796e-05, + "loss": 0.002, + "step": 207 + }, + { + "epoch": 10.894736842105264, + "eval_loss": 0.0011529050534591079, + "eval_runtime": 3.3723, + "eval_samples_per_second": 8.896, + "eval_steps_per_second": 1.186, + "step": 207 + }, + { + "epoch": 10.947368421052632, + "grad_norm": 0.088556207716465, + "learning_rate": 4.0180800742117244e-05, + "loss": 0.0008, + "step": 208 + }, + { + "epoch": 10.947368421052632, + "eval_loss": 0.00109326362144202, + "eval_runtime": 3.3736, + "eval_samples_per_second": 8.893, + "eval_steps_per_second": 1.186, + "step": 208 + }, + { + "epoch": 11.0, + "grad_norm": 0.01456160843372345, + "learning_rate": 3.838247842306716e-05, + "loss": 0.001, + "step": 209 + }, + { + "epoch": 11.0, + "eval_loss": 0.0010605588322505355, + "eval_runtime": 3.3718, + "eval_samples_per_second": 8.897, + "eval_steps_per_second": 1.186, + "step": 209 + }, + { + "epoch": 11.052631578947368, + "grad_norm": 0.011590982787311077, + "learning_rate": 3.662197502872885e-05, + "loss": 0.0008, + "step": 210 + }, + { + "epoch": 11.052631578947368, + "eval_loss": 0.0010736124822869897, + "eval_runtime": 3.3769, + "eval_samples_per_second": 8.884, + "eval_steps_per_second": 1.185, + "step": 210 + }, + { + "epoch": 11.105263157894736, + "grad_norm": 0.012225581333041191, + "learning_rate": 3.489960518496521e-05, + "loss": 0.0007, + "step": 211 + }, + { + "epoch": 11.105263157894736, + "eval_loss": 0.001024996628984809, + "eval_runtime": 3.3708, + "eval_samples_per_second": 8.9, + "eval_steps_per_second": 1.187, + "step": 211 + }, + { + "epoch": 11.157894736842104, + "grad_norm": 0.014904456213116646, + "learning_rate": 3.321567670265568e-05, + "loss": 0.001, + "step": 212 + }, + { + "epoch": 11.157894736842104, + "eval_loss": 0.001045219600200653, + "eval_runtime": 3.3788, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 212 + }, + { + "epoch": 11.210526315789474, + "grad_norm": 0.02280263416469097, + "learning_rate": 3.157049052268662e-05, + "loss": 0.001, + "step": 213 + }, + { + "epoch": 11.210526315789474, + "eval_loss": 0.0009931708918884397, + "eval_runtime": 3.3801, + "eval_samples_per_second": 8.875, + "eval_steps_per_second": 1.183, + "step": 213 + }, + { + "epoch": 11.263157894736842, + "grad_norm": 0.011518572457134724, + "learning_rate": 2.9964340662168772e-05, + "loss": 0.0007, + "step": 214 + }, + { + "epoch": 11.263157894736842, + "eval_loss": 0.0009575362200848758, + "eval_runtime": 3.3816, + "eval_samples_per_second": 8.871, + "eval_steps_per_second": 1.183, + "step": 214 + }, + { + "epoch": 11.31578947368421, + "grad_norm": 0.012095076963305473, + "learning_rate": 2.8397514161892484e-05, + "loss": 0.0008, + "step": 215 + }, + { + "epoch": 11.31578947368421, + "eval_loss": 0.000961105281021446, + "eval_runtime": 3.3813, + "eval_samples_per_second": 8.872, + "eval_steps_per_second": 1.183, + "step": 215 + }, + { + "epoch": 11.368421052631579, + "grad_norm": 0.012648813426494598, + "learning_rate": 2.687029103502972e-05, + "loss": 0.0008, + "step": 216 + }, + { + "epoch": 11.368421052631579, + "eval_loss": 0.000959594442974776, + "eval_runtime": 3.3812, + "eval_samples_per_second": 8.873, + "eval_steps_per_second": 1.183, + "step": 216 + }, + { + "epoch": 11.421052631578947, + "grad_norm": 0.02259828709065914, + "learning_rate": 2.5382944217091723e-05, + "loss": 0.0012, + "step": 217 + }, + { + "epoch": 11.421052631578947, + "eval_loss": 0.0009408009937033057, + "eval_runtime": 3.3766, + "eval_samples_per_second": 8.885, + "eval_steps_per_second": 1.185, + "step": 217 + }, + { + "epoch": 11.473684210526315, + "grad_norm": 0.011007304303348064, + "learning_rate": 2.3935739517151916e-05, + "loss": 0.0008, + "step": 218 + }, + { + "epoch": 11.473684210526315, + "eval_loss": 0.0009279777877964079, + "eval_runtime": 3.3762, + "eval_samples_per_second": 8.886, + "eval_steps_per_second": 1.185, + "step": 218 + }, + { + "epoch": 11.526315789473685, + "grad_norm": 0.0120201725512743, + "learning_rate": 2.2528935570342164e-05, + "loss": 0.0009, + "step": 219 + }, + { + "epoch": 11.526315789473685, + "eval_loss": 0.000922050909139216, + "eval_runtime": 3.3727, + "eval_samples_per_second": 8.895, + "eval_steps_per_second": 1.186, + "step": 219 + }, + { + "epoch": 11.578947368421053, + "grad_norm": 0.010646814480423927, + "learning_rate": 2.1162783791631057e-05, + "loss": 0.0007, + "step": 220 + }, + { + "epoch": 11.578947368421053, + "eval_loss": 0.0009199492633342743, + "eval_runtime": 3.3745, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.185, + "step": 220 + }, + { + "epoch": 11.631578947368421, + "grad_norm": 0.01419683638960123, + "learning_rate": 1.9837528330892778e-05, + "loss": 0.0009, + "step": 221 + }, + { + "epoch": 11.631578947368421, + "eval_loss": 0.0009105838253162801, + "eval_runtime": 3.3685, + "eval_samples_per_second": 8.906, + "eval_steps_per_second": 1.187, + "step": 221 + }, + { + "epoch": 11.68421052631579, + "grad_norm": 0.04302644729614258, + "learning_rate": 1.8553406029274188e-05, + "loss": 0.0017, + "step": 222 + }, + { + "epoch": 11.68421052631579, + "eval_loss": 0.0009061154560185969, + "eval_runtime": 3.3707, + "eval_samples_per_second": 8.9, + "eval_steps_per_second": 1.187, + "step": 222 + }, + { + "epoch": 11.736842105263158, + "grad_norm": 0.053855180740356445, + "learning_rate": 1.7310646376867885e-05, + "loss": 0.0019, + "step": 223 + }, + { + "epoch": 11.736842105263158, + "eval_loss": 0.0008960295235738158, + "eval_runtime": 3.3744, + "eval_samples_per_second": 8.89, + "eval_steps_per_second": 1.185, + "step": 223 + }, + { + "epoch": 11.789473684210526, + "grad_norm": 0.042215775698423386, + "learning_rate": 1.6109471471699556e-05, + "loss": 0.0017, + "step": 224 + }, + { + "epoch": 11.789473684210526, + "eval_loss": 0.0008963170694187284, + "eval_runtime": 3.3801, + "eval_samples_per_second": 8.875, + "eval_steps_per_second": 1.183, + "step": 224 + }, + { + "epoch": 11.842105263157894, + "grad_norm": 0.06074398756027222, + "learning_rate": 1.4950095980035772e-05, + "loss": 0.0021, + "step": 225 + }, + { + "epoch": 11.842105263157894, + "eval_loss": 0.0009151420672424138, + "eval_runtime": 3.3792, + "eval_samples_per_second": 8.878, + "eval_steps_per_second": 1.184, + "step": 225 + }, + { + "epoch": 11.894736842105264, + "grad_norm": 0.03642531856894493, + "learning_rate": 1.3832727098020331e-05, + "loss": 0.0012, + "step": 226 + }, + { + "epoch": 11.894736842105264, + "eval_loss": 0.0008949197363108397, + "eval_runtime": 3.3812, + "eval_samples_per_second": 8.873, + "eval_steps_per_second": 1.183, + "step": 226 + }, + { + "epoch": 11.947368421052632, + "grad_norm": 0.012237842194736004, + "learning_rate": 1.2757564514645492e-05, + "loss": 0.0007, + "step": 227 + }, + { + "epoch": 11.947368421052632, + "eval_loss": 0.0008931474294513464, + "eval_runtime": 3.38, + "eval_samples_per_second": 8.876, + "eval_steps_per_second": 1.183, + "step": 227 + }, + { + "epoch": 12.0, + "grad_norm": 0.013033509254455566, + "learning_rate": 1.1724800376064798e-05, + "loss": 0.0008, + "step": 228 + }, + { + "epoch": 12.0, + "eval_loss": 0.0009141464834101498, + "eval_runtime": 3.3787, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, + "step": 228 + }, + { + "epoch": 12.052631578947368, + "grad_norm": 0.00906482245773077, + "learning_rate": 1.0734619251253963e-05, + "loss": 0.0006, + "step": 229 + }, + { + "epoch": 12.052631578947368, + "eval_loss": 0.0009004678577184677, + "eval_runtime": 3.3773, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 229 + }, + { + "epoch": 12.105263157894736, + "grad_norm": 0.008960971608757973, + "learning_rate": 9.78719809902598e-06, + "loss": 0.0006, + "step": 230 + }, + { + "epoch": 12.105263157894736, + "eval_loss": 0.0008675061399117112, + "eval_runtime": 3.3731, + "eval_samples_per_second": 8.894, + "eval_steps_per_second": 1.186, + "step": 230 + }, + { + "epoch": 12.157894736842104, + "grad_norm": 0.013526183553040028, + "learning_rate": 8.882706236405884e-06, + "loss": 0.0008, + "step": 231 + }, + { + "epoch": 12.157894736842104, + "eval_loss": 0.000881785003002733, + "eval_runtime": 3.3662, + "eval_samples_per_second": 8.912, + "eval_steps_per_second": 1.188, + "step": 231 + }, + { + "epoch": 12.210526315789474, + "grad_norm": 0.05892425775527954, + "learning_rate": 8.02130530837189e-06, + "loss": 0.0024, + "step": 232 + }, + { + "epoch": 12.210526315789474, + "eval_loss": 0.0008758410695008934, + "eval_runtime": 3.3724, + "eval_samples_per_second": 8.896, + "eval_steps_per_second": 1.186, + "step": 232 + }, + { + "epoch": 12.263157894736842, + "grad_norm": 0.009400282986462116, + "learning_rate": 7.203149258967034e-06, + "loss": 0.0007, + "step": 233 + }, + { + "epoch": 12.263157894736842, + "eval_loss": 0.0008620958542451262, + "eval_runtime": 3.3864, + "eval_samples_per_second": 8.859, + "eval_steps_per_second": 1.181, + "step": 233 + }, + { + "epoch": 12.31578947368421, + "grad_norm": 0.045264292508363724, + "learning_rate": 6.428384303787282e-06, + "loss": 0.0015, + "step": 234 + }, + { + "epoch": 12.31578947368421, + "eval_loss": 0.0008675272110849619, + "eval_runtime": 3.3813, + "eval_samples_per_second": 8.872, + "eval_steps_per_second": 1.183, + "step": 234 + }, + { + "epoch": 12.368421052631579, + "grad_norm": 0.01382332295179367, + "learning_rate": 5.697148903850868e-06, + "loss": 0.0009, + "step": 235 + }, + { + "epoch": 12.368421052631579, + "eval_loss": 0.0008773647132329643, + "eval_runtime": 3.3807, + "eval_samples_per_second": 8.874, + "eval_steps_per_second": 1.183, + "step": 235 + }, + { + "epoch": 12.421052631578947, + "grad_norm": 0.020148828625679016, + "learning_rate": 5.009573740853312e-06, + "loss": 0.0011, + "step": 236 + }, + { + "epoch": 12.421052631578947, + "eval_loss": 0.0008683075429871678, + "eval_runtime": 3.3783, + "eval_samples_per_second": 8.88, + "eval_steps_per_second": 1.184, + "step": 236 + }, + { + "epoch": 12.473684210526315, + "grad_norm": 0.012339909560978413, + "learning_rate": 4.365781693813048e-06, + "loss": 0.0009, + "step": 237 + }, + { + "epoch": 12.473684210526315, + "eval_loss": 0.0008661063038744032, + "eval_runtime": 3.3733, + "eval_samples_per_second": 8.893, + "eval_steps_per_second": 1.186, + "step": 237 + }, + { + "epoch": 12.526315789473685, + "grad_norm": 0.015052181668579578, + "learning_rate": 3.765887817111069e-06, + "loss": 0.001, + "step": 238 + }, + { + "epoch": 12.526315789473685, + "eval_loss": 0.0008630995289422572, + "eval_runtime": 3.3706, + "eval_samples_per_second": 8.901, + "eval_steps_per_second": 1.187, + "step": 238 + }, + { + "epoch": 12.578947368421053, + "grad_norm": 0.00892038643360138, + "learning_rate": 3.2099993199292688e-06, + "loss": 0.0005, + "step": 239 + }, + { + "epoch": 12.578947368421053, + "eval_loss": 0.0009093356784433126, + "eval_runtime": 3.3705, + "eval_samples_per_second": 8.901, + "eval_steps_per_second": 1.187, + "step": 239 + }, + { + "epoch": 12.631578947368421, + "grad_norm": 0.01133538968861103, + "learning_rate": 2.698215547090599e-06, + "loss": 0.0008, + "step": 240 + }, + { + "epoch": 12.631578947368421, + "eval_loss": 0.0008580058929510415, + "eval_runtime": 3.367, + "eval_samples_per_second": 8.91, + "eval_steps_per_second": 1.188, + "step": 240 } ], "logging_steps": 1, - "max_steps": 60, + "max_steps": 250, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 14, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { @@ -627,7 +3627,7 @@ "attributes": {} } }, - "total_flos": 1661495727175680.0, + "total_flos": 9951159912714240.0, "train_batch_size": 1, "trial_name": null, "trial_params": null