diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,17 +1,17 @@ { - "best_global_step": 135, - "best_metric": 0.028628086671233177, - "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-130", - "epoch": 7.368421052631579, + "best_global_step": 247, + "best_metric": 0.0044091795571148396, + "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-240", + "epoch": 13.157894736842104, "eval_steps": 1, - "global_step": 140, + "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05263157894736842, - "grad_norm": 3.286113739013672, + "grad_norm": 1.1712253093719482, "learning_rate": 0.0, "loss": 3.2235, "step": 1 @@ -19,996 +19,996 @@ { "epoch": 0.05263157894736842, "eval_loss": 3.15524959564209, - "eval_runtime": 3.3576, - "eval_samples_per_second": 8.935, - "eval_steps_per_second": 1.191, + "eval_runtime": 3.3832, + "eval_samples_per_second": 8.867, + "eval_steps_per_second": 1.182, "step": 1 }, { "epoch": 0.10526315789473684, - "grad_norm": 3.493290424346924, + "grad_norm": 1.2426623106002808, "learning_rate": 3.3333333333333335e-05, "loss": 3.165, "step": 2 }, { "epoch": 0.10526315789473684, - "eval_loss": 2.8496124744415283, - "eval_runtime": 3.3189, - "eval_samples_per_second": 9.039, - "eval_steps_per_second": 1.205, + "eval_loss": 3.1208913326263428, + "eval_runtime": 3.3506, + "eval_samples_per_second": 8.954, + "eval_steps_per_second": 1.194, "step": 2 }, { "epoch": 0.15789473684210525, - "grad_norm": 2.2974886894226074, + "grad_norm": 1.0169581174850464, "learning_rate": 6.666666666666667e-05, - "loss": 2.6575, + "loss": 2.8762, "step": 3 }, { "epoch": 0.15789473684210525, - "eval_loss": 2.4071192741394043, - "eval_runtime": 3.3333, - "eval_samples_per_second": 9.0, - "eval_steps_per_second": 1.2, + "eval_loss": 3.033876895904541, + "eval_runtime": 3.3775, + "eval_samples_per_second": 8.882, + "eval_steps_per_second": 1.184, "step": 3 }, { "epoch": 0.21052631578947367, - "grad_norm": 1.9221198558807373, + "grad_norm": 1.1397525072097778, "learning_rate": 0.0001, - "loss": 2.4219, + "loss": 3.0285, "step": 4 }, { "epoch": 0.21052631578947367, - "eval_loss": 2.0516483783721924, - "eval_runtime": 3.3382, - "eval_samples_per_second": 8.987, - "eval_steps_per_second": 1.198, + "eval_loss": 2.882239818572998, + "eval_runtime": 3.3981, + "eval_samples_per_second": 8.829, + "eval_steps_per_second": 1.177, "step": 4 }, { "epoch": 0.2631578947368421, - "grad_norm": 1.8296281099319458, + "grad_norm": 1.1276919841766357, "learning_rate": 0.00013333333333333334, - "loss": 2.0637, + "loss": 2.8059, "step": 5 }, { "epoch": 0.2631578947368421, - "eval_loss": 1.7472976446151733, - "eval_runtime": 3.3483, - "eval_samples_per_second": 8.96, - "eval_steps_per_second": 1.195, + "eval_loss": 2.671700954437256, + "eval_runtime": 3.4282, + "eval_samples_per_second": 8.751, + "eval_steps_per_second": 1.167, "step": 5 }, { "epoch": 0.3157894736842105, - "grad_norm": 1.8663091659545898, + "grad_norm": 1.1082642078399658, "learning_rate": 0.00016666666666666666, - "loss": 1.7628, + "loss": 2.5492, "step": 6 }, { "epoch": 0.3157894736842105, - "eval_loss": 1.4761121273040771, - "eval_runtime": 3.3561, - "eval_samples_per_second": 8.939, - "eval_steps_per_second": 1.192, + "eval_loss": 2.4450764656066895, + "eval_runtime": 3.4466, + "eval_samples_per_second": 8.704, + "eval_steps_per_second": 1.161, "step": 6 }, { "epoch": 0.3684210526315789, - "grad_norm": 1.6261323690414429, + "grad_norm": 1.0461392402648926, "learning_rate": 0.0002, - "loss": 1.5255, + "loss": 2.4397, "step": 7 }, { "epoch": 0.3684210526315789, - "eval_loss": 1.385249376296997, - "eval_runtime": 3.3779, - "eval_samples_per_second": 8.881, - "eval_steps_per_second": 1.184, + "eval_loss": 2.230668544769287, + "eval_runtime": 3.4595, + "eval_samples_per_second": 8.672, + "eval_steps_per_second": 1.156, "step": 7 }, { "epoch": 0.42105263157894735, - "grad_norm": 2.0629477500915527, + "grad_norm": 1.2636622190475464, "learning_rate": 0.00023333333333333333, - "loss": 1.3881, + "loss": 2.2026, "step": 8 }, { "epoch": 0.42105263157894735, - "eval_loss": 1.352536678314209, - "eval_runtime": 3.3911, - "eval_samples_per_second": 8.847, - "eval_steps_per_second": 1.18, + "eval_loss": 2.028376340866089, + "eval_runtime": 3.4413, + "eval_samples_per_second": 8.718, + "eval_steps_per_second": 1.162, "step": 8 }, { "epoch": 0.47368421052631576, - "grad_norm": 2.1097559928894043, + "grad_norm": 1.4393274784088135, "learning_rate": 0.0002666666666666667, - "loss": 1.384, + "loss": 2.0811, "step": 9 }, { "epoch": 0.47368421052631576, - "eval_loss": 1.2356091737747192, - "eval_runtime": 3.4035, - "eval_samples_per_second": 8.814, - "eval_steps_per_second": 1.175, + "eval_loss": 1.8259222507476807, + "eval_runtime": 3.4225, + "eval_samples_per_second": 8.765, + "eval_steps_per_second": 1.169, "step": 9 }, { "epoch": 0.5263157894736842, - "grad_norm": 1.2191412448883057, + "grad_norm": 1.4638570547103882, "learning_rate": 0.0003, - "loss": 1.294, + "loss": 1.8383, "step": 10 }, { "epoch": 0.5263157894736842, - "eval_loss": 1.1559504270553589, - "eval_runtime": 3.4138, - "eval_samples_per_second": 8.788, - "eval_steps_per_second": 1.172, + "eval_loss": 1.6318742036819458, + "eval_runtime": 3.4158, + "eval_samples_per_second": 8.783, + "eval_steps_per_second": 1.171, "step": 10 }, { "epoch": 0.5789473684210527, - "grad_norm": 1.2134253978729248, + "grad_norm": 1.4234288930892944, "learning_rate": 0.0003333333333333333, - "loss": 1.244, + "loss": 1.6943, "step": 11 }, { "epoch": 0.5789473684210527, - "eval_loss": 1.0915534496307373, - "eval_runtime": 3.4124, - "eval_samples_per_second": 8.791, - "eval_steps_per_second": 1.172, + "eval_loss": 1.4664249420166016, + "eval_runtime": 3.385, + "eval_samples_per_second": 8.863, + "eval_steps_per_second": 1.182, "step": 11 }, { "epoch": 0.631578947368421, - "grad_norm": 0.9994729161262512, + "grad_norm": 1.2770508527755737, "learning_rate": 0.00036666666666666667, - "loss": 1.1283, + "loss": 1.4634, "step": 12 }, { "epoch": 0.631578947368421, - "eval_loss": 1.06654691696167, - "eval_runtime": 3.4154, - "eval_samples_per_second": 8.784, - "eval_steps_per_second": 1.171, + "eval_loss": 1.37418794631958, + "eval_runtime": 3.3866, + "eval_samples_per_second": 8.858, + "eval_steps_per_second": 1.181, "step": 12 }, { "epoch": 0.6842105263157895, - "grad_norm": 1.1173161268234253, + "grad_norm": 1.5616014003753662, "learning_rate": 0.0004, - "loss": 1.1519, + "loss": 1.4361, "step": 13 }, { "epoch": 0.6842105263157895, - "eval_loss": 1.0182068347930908, - "eval_runtime": 3.4202, - "eval_samples_per_second": 8.771, - "eval_steps_per_second": 1.17, + "eval_loss": 1.3023313283920288, + "eval_runtime": 3.3859, + "eval_samples_per_second": 8.86, + "eval_steps_per_second": 1.181, "step": 13 }, { "epoch": 0.7368421052631579, - "grad_norm": 0.9380443096160889, + "grad_norm": 1.475995421409607, "learning_rate": 0.00043333333333333337, - "loss": 1.0633, + "loss": 1.3218, "step": 14 }, { "epoch": 0.7368421052631579, - "eval_loss": 1.0030596256256104, - "eval_runtime": 3.411, - "eval_samples_per_second": 8.795, - "eval_steps_per_second": 1.173, + "eval_loss": 1.237278699874878, + "eval_runtime": 3.3787, + "eval_samples_per_second": 8.879, + "eval_steps_per_second": 1.184, "step": 14 }, { "epoch": 0.7894736842105263, - "grad_norm": 0.9056739211082458, + "grad_norm": 1.1547696590423584, "learning_rate": 0.00046666666666666666, - "loss": 1.0771, + "loss": 1.3192, "step": 15 }, { "epoch": 0.7894736842105263, - "eval_loss": 0.9875913262367249, - "eval_runtime": 3.4061, - "eval_samples_per_second": 8.808, - "eval_steps_per_second": 1.174, + "eval_loss": 1.1772326231002808, + "eval_runtime": 3.3856, + "eval_samples_per_second": 8.861, + "eval_steps_per_second": 1.181, "step": 15 }, { "epoch": 0.8421052631578947, - "grad_norm": 0.9518136978149414, + "grad_norm": 0.8922737240791321, "learning_rate": 0.0005, - "loss": 1.0736, + "loss": 1.2386, "step": 16 }, { "epoch": 0.8421052631578947, - "eval_loss": 0.992712676525116, - "eval_runtime": 3.3965, - "eval_samples_per_second": 8.833, - "eval_steps_per_second": 1.178, + "eval_loss": 1.1432918310165405, + "eval_runtime": 3.3926, + "eval_samples_per_second": 8.843, + "eval_steps_per_second": 1.179, "step": 16 }, { "epoch": 0.8947368421052632, - "grad_norm": 0.8723889589309692, + "grad_norm": 0.8703598380088806, "learning_rate": 0.0004999776608025946, - "loss": 1.1262, + "loss": 1.2852, "step": 17 }, { "epoch": 0.8947368421052632, - "eval_loss": 0.9731314182281494, - "eval_runtime": 3.3937, - "eval_samples_per_second": 8.84, - "eval_steps_per_second": 1.179, + "eval_loss": 1.1231766939163208, + "eval_runtime": 3.401, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, "step": 17 }, { "epoch": 0.9473684210526315, - "grad_norm": 0.8370358347892761, + "grad_norm": 0.8985245823860168, "learning_rate": 0.000499910647202696, - "loss": 0.9972, + "loss": 1.1268, "step": 18 }, { "epoch": 0.9473684210526315, - "eval_loss": 0.9297105669975281, - "eval_runtime": 3.3902, - "eval_samples_per_second": 8.849, - "eval_steps_per_second": 1.18, + "eval_loss": 1.0892575979232788, + "eval_runtime": 3.4075, + "eval_samples_per_second": 8.804, + "eval_steps_per_second": 1.174, "step": 18 }, { "epoch": 1.0, - "grad_norm": 0.829403281211853, + "grad_norm": 0.705194890499115, "learning_rate": 0.0004997989711765446, - "loss": 1.0713, + "loss": 1.1925, "step": 19 }, { "epoch": 1.0, - "eval_loss": 0.9343510270118713, - "eval_runtime": 3.3917, - "eval_samples_per_second": 8.845, - "eval_steps_per_second": 1.179, + "eval_loss": 1.0620007514953613, + "eval_runtime": 3.4023, + "eval_samples_per_second": 8.817, + "eval_steps_per_second": 1.176, "step": 19 }, { "epoch": 1.0526315789473684, - "grad_norm": 0.7440597414970398, + "grad_norm": 0.6432715654373169, "learning_rate": 0.0004996426526821629, - "loss": 0.8964, + "loss": 1.0677, "step": 20 }, { "epoch": 1.0526315789473684, - "eval_loss": 0.9188496470451355, - "eval_runtime": 3.3924, - "eval_samples_per_second": 8.843, - "eval_steps_per_second": 1.179, + "eval_loss": 1.0364060401916504, + "eval_runtime": 3.4021, + "eval_samples_per_second": 8.818, + "eval_steps_per_second": 1.176, "step": 20 }, { "epoch": 1.1052631578947367, - "grad_norm": 0.7966375350952148, + "grad_norm": 0.5805476903915405, "learning_rate": 0.0004994417196557883, - "loss": 0.945, + "loss": 1.0514, "step": 21 }, { "epoch": 1.1052631578947367, - "eval_loss": 0.8843300938606262, - "eval_runtime": 3.3876, - "eval_samples_per_second": 8.856, - "eval_steps_per_second": 1.181, + "eval_loss": 1.0189239978790283, + "eval_runtime": 3.3998, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.177, "step": 21 }, { "epoch": 1.1578947368421053, - "grad_norm": 0.7709185481071472, + "grad_norm": 0.5795720219612122, "learning_rate": 0.0004991962080068813, - "loss": 0.9541, + "loss": 1.0788, "step": 22 }, { "epoch": 1.1578947368421053, - "eval_loss": 0.8968064785003662, - "eval_runtime": 3.4082, - "eval_samples_per_second": 8.802, - "eval_steps_per_second": 1.174, + "eval_loss": 1.0024681091308594, + "eval_runtime": 3.3973, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, "step": 22 }, { "epoch": 1.2105263157894737, - "grad_norm": 1.4057934284210205, + "grad_norm": 0.7284250855445862, "learning_rate": 0.0004989061616117073, - "loss": 0.9209, + "loss": 0.9834, "step": 23 }, { "epoch": 1.2105263157894737, - "eval_loss": 0.9101091027259827, - "eval_runtime": 3.42, - "eval_samples_per_second": 8.772, - "eval_steps_per_second": 1.17, + "eval_loss": 0.9821510910987854, + "eval_runtime": 3.3979, + "eval_samples_per_second": 8.829, + "eval_steps_per_second": 1.177, "step": 23 }, { "epoch": 1.263157894736842, - "grad_norm": 0.8100272417068481, + "grad_norm": 0.7955266833305359, "learning_rate": 0.0004985716323054959, - "loss": 0.9458, + "loss": 1.0999, "step": 24 }, { "epoch": 1.263157894736842, - "eval_loss": 0.8911253213882446, - "eval_runtime": 3.4224, - "eval_samples_per_second": 8.766, - "eval_steps_per_second": 1.169, + "eval_loss": 0.973588228225708, + "eval_runtime": 3.3958, + "eval_samples_per_second": 8.834, + "eval_steps_per_second": 1.178, "step": 24 }, { "epoch": 1.3157894736842106, - "grad_norm": 0.8154920935630798, + "grad_norm": 0.6546872854232788, "learning_rate": 0.0004981926798731766, - "loss": 0.8877, + "loss": 0.9389, "step": 25 }, { "epoch": 1.3157894736842106, - "eval_loss": 0.8730154633522034, - "eval_runtime": 3.4191, - "eval_samples_per_second": 8.774, - "eval_steps_per_second": 1.17, + "eval_loss": 0.9707676768302917, + "eval_runtime": 3.4001, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 1.176, "step": 25 }, { "epoch": 1.368421052631579, - "grad_norm": 0.8646735548973083, + "grad_norm": 0.6482366323471069, "learning_rate": 0.000497769372038695, - "loss": 0.9094, + "loss": 1.0285, "step": 26 }, { "epoch": 1.368421052631579, - "eval_loss": 0.8862187266349792, - "eval_runtime": 3.4088, - "eval_samples_per_second": 8.801, - "eval_steps_per_second": 1.173, + "eval_loss": 0.9686868190765381, + "eval_runtime": 3.4003, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 1.176, "step": 26 }, { "epoch": 1.4210526315789473, - "grad_norm": 0.7782988548278809, + "grad_norm": 0.5976347327232361, "learning_rate": 0.0004973017844529094, - "loss": 0.915, + "loss": 0.9571, "step": 27 }, { "epoch": 1.4210526315789473, - "eval_loss": 0.8803919553756714, - "eval_runtime": 3.4031, - "eval_samples_per_second": 8.816, - "eval_steps_per_second": 1.175, + "eval_loss": 0.9679729342460632, + "eval_runtime": 3.3978, + "eval_samples_per_second": 8.829, + "eval_steps_per_second": 1.177, "step": 27 }, { "epoch": 1.4736842105263157, - "grad_norm": 0.6737643480300903, + "grad_norm": 0.6117852926254272, "learning_rate": 0.0004967900006800708, - "loss": 0.7767, + "loss": 0.8988, "step": 28 }, { "epoch": 1.4736842105263157, - "eval_loss": 0.8594701290130615, - "eval_runtime": 3.3956, - "eval_samples_per_second": 8.835, - "eval_steps_per_second": 1.178, + "eval_loss": 0.955328643321991, + "eval_runtime": 3.3986, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, "step": 28 }, { "epoch": 1.526315789473684, - "grad_norm": 0.7649226784706116, + "grad_norm": 0.8034415245056152, "learning_rate": 0.000496234112182889, - "loss": 0.9874, + "loss": 1.0419, "step": 29 }, { "epoch": 1.526315789473684, - "eval_loss": 0.8485653400421143, - "eval_runtime": 3.3962, - "eval_samples_per_second": 8.833, - "eval_steps_per_second": 1.178, + "eval_loss": 0.9429832696914673, + "eval_runtime": 3.3993, + "eval_samples_per_second": 8.825, + "eval_steps_per_second": 1.177, "step": 29 }, { "epoch": 1.5789473684210527, - "grad_norm": 0.7090509533882141, + "grad_norm": 0.6744455099105835, "learning_rate": 0.000495634218306187, - "loss": 0.7892, + "loss": 1.0113, "step": 30 }, { "epoch": 1.5789473684210527, - "eval_loss": 0.8408802151679993, - "eval_runtime": 3.3904, - "eval_samples_per_second": 8.848, - "eval_steps_per_second": 1.18, + "eval_loss": 0.9402546286582947, + "eval_runtime": 3.4024, + "eval_samples_per_second": 8.817, + "eval_steps_per_second": 1.176, "step": 30 }, { "epoch": 1.631578947368421, - "grad_norm": 0.7707118391990662, + "grad_norm": 0.8540083169937134, "learning_rate": 0.0004949904262591467, - "loss": 0.9378, + "loss": 0.9779, "step": 31 }, { "epoch": 1.631578947368421, - "eval_loss": 0.8314517736434937, - "eval_runtime": 3.3741, - "eval_samples_per_second": 8.891, - "eval_steps_per_second": 1.186, + "eval_loss": 0.9174972176551819, + "eval_runtime": 3.3971, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, "step": 31 }, { "epoch": 1.6842105263157894, - "grad_norm": 0.843680202960968, + "grad_norm": 0.5661184787750244, "learning_rate": 0.0004943028510961491, - "loss": 0.9247, + "loss": 0.967, "step": 32 }, { "epoch": 1.6842105263157894, - "eval_loss": 0.8086416125297546, - "eval_runtime": 3.3954, - "eval_samples_per_second": 8.836, - "eval_steps_per_second": 1.178, + "eval_loss": 0.8996461629867554, + "eval_runtime": 3.401, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, "step": 32 }, { "epoch": 1.736842105263158, - "grad_norm": 0.9877843260765076, + "grad_norm": 0.6420716643333435, "learning_rate": 0.0004935716156962127, - "loss": 1.0658, + "loss": 1.0637, "step": 33 }, { "epoch": 1.736842105263158, - "eval_loss": 0.7970016002655029, - "eval_runtime": 3.4095, - "eval_samples_per_second": 8.799, - "eval_steps_per_second": 1.173, + "eval_loss": 0.8879114389419556, + "eval_runtime": 3.3843, + "eval_samples_per_second": 8.865, + "eval_steps_per_second": 1.182, "step": 33 }, { "epoch": 1.7894736842105263, - "grad_norm": 0.6670656800270081, + "grad_norm": 0.5820953249931335, "learning_rate": 0.000492796850741033, - "loss": 0.8875, + "loss": 0.9406, "step": 34 }, { "epoch": 1.7894736842105263, - "eval_loss": 0.8034056425094604, - "eval_runtime": 3.4129, - "eval_samples_per_second": 8.79, - "eval_steps_per_second": 1.172, + "eval_loss": 0.8790176510810852, + "eval_runtime": 3.3978, + "eval_samples_per_second": 8.829, + "eval_steps_per_second": 1.177, "step": 34 }, { "epoch": 1.8421052631578947, - "grad_norm": 0.765784502029419, + "grad_norm": 0.5555437207221985, "learning_rate": 0.0004919786946916281, - "loss": 0.9924, + "loss": 0.9973, "step": 35 }, { "epoch": 1.8421052631578947, - "eval_loss": 0.7739899754524231, - "eval_runtime": 3.4037, - "eval_samples_per_second": 8.814, - "eval_steps_per_second": 1.175, + "eval_loss": 0.8706895112991333, + "eval_runtime": 3.4025, + "eval_samples_per_second": 8.817, + "eval_steps_per_second": 1.176, "step": 35 }, { "epoch": 1.8947368421052633, - "grad_norm": 0.814027726650238, + "grad_norm": 0.6944723129272461, "learning_rate": 0.0004911172937635942, - "loss": 0.8918, + "loss": 0.9624, "step": 36 }, { "epoch": 1.8947368421052633, - "eval_loss": 0.7576621770858765, - "eval_runtime": 3.3993, - "eval_samples_per_second": 8.825, + "eval_loss": 0.8582616448402405, + "eval_runtime": 3.3987, + "eval_samples_per_second": 8.827, "eval_steps_per_second": 1.177, "step": 36 }, { "epoch": 1.9473684210526314, - "grad_norm": 0.7614129781723022, + "grad_norm": 0.516936182975769, "learning_rate": 0.0004902128019009741, - "loss": 0.9083, + "loss": 1.0242, "step": 37 }, { "epoch": 1.9473684210526314, - "eval_loss": 0.7541573643684387, - "eval_runtime": 3.4048, - "eval_samples_per_second": 8.811, - "eval_steps_per_second": 1.175, + "eval_loss": 0.8482629060745239, + "eval_runtime": 3.3973, + "eval_samples_per_second": 8.83, + "eval_steps_per_second": 1.177, "step": 37 }, { "epoch": 2.0, - "grad_norm": 0.8814827799797058, + "grad_norm": 0.6250211596488953, "learning_rate": 0.000489265380748746, - "loss": 1.0339, + "loss": 1.0646, "step": 38 }, { "epoch": 2.0, - "eval_loss": 0.7439185380935669, - "eval_runtime": 3.4, - "eval_samples_per_second": 8.824, - "eval_steps_per_second": 1.176, + "eval_loss": 0.8396931290626526, + "eval_runtime": 3.3968, + "eval_samples_per_second": 8.832, + "eval_steps_per_second": 1.178, "step": 38 }, { "epoch": 2.0526315789473686, - "grad_norm": 0.6238395571708679, + "grad_norm": 0.6457982659339905, "learning_rate": 0.0004882751996239352, - "loss": 0.7953, + "loss": 0.9107, "step": 39 }, { "epoch": 2.0526315789473686, - "eval_loss": 0.7419844269752502, - "eval_runtime": 3.3924, - "eval_samples_per_second": 8.843, - "eval_steps_per_second": 1.179, + "eval_loss": 0.8291558027267456, + "eval_runtime": 3.3988, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, "step": 39 }, { "epoch": 2.1052631578947367, - "grad_norm": 0.7721253633499146, + "grad_norm": 0.49637654423713684, "learning_rate": 0.0004872424354853545, - "loss": 0.7831, + "loss": 0.8729, "step": 40 }, { "epoch": 2.1052631578947367, - "eval_loss": 0.7217150926589966, - "eval_runtime": 3.3936, - "eval_samples_per_second": 8.84, - "eval_steps_per_second": 1.179, + "eval_loss": 0.8166154026985168, + "eval_runtime": 3.3997, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.177, "step": 40 }, { "epoch": 2.1578947368421053, - "grad_norm": 0.7047480344772339, + "grad_norm": 0.6060866713523865, "learning_rate": 0.0004861672729019797, - "loss": 0.7399, + "loss": 0.8154, "step": 41 }, { "epoch": 2.1578947368421053, - "eval_loss": 0.7065999507904053, - "eval_runtime": 3.3827, - "eval_samples_per_second": 8.869, - "eval_steps_per_second": 1.182, + "eval_loss": 0.8058971762657166, + "eval_runtime": 3.3964, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 1.178, "step": 41 }, { "epoch": 2.2105263157894735, - "grad_norm": 0.5893692970275879, + "grad_norm": 0.5285487771034241, "learning_rate": 0.0004850499040199643, - "loss": 0.6439, + "loss": 0.7798, "step": 42 }, { "epoch": 2.2105263157894735, - "eval_loss": 0.70515376329422, - "eval_runtime": 3.4046, - "eval_samples_per_second": 8.812, - "eval_steps_per_second": 1.175, + "eval_loss": 0.7971588969230652, + "eval_runtime": 3.4012, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, "step": 42 }, { "epoch": 2.263157894736842, - "grad_norm": 0.7542199492454529, + "grad_norm": 0.7103962898254395, "learning_rate": 0.0004838905285283005, - "loss": 0.7689, + "loss": 0.9025, "step": 43 }, { "epoch": 2.263157894736842, - "eval_loss": 0.6860254406929016, - "eval_runtime": 3.4227, - "eval_samples_per_second": 8.765, - "eval_steps_per_second": 1.169, + "eval_loss": 0.7828482985496521, + "eval_runtime": 3.4, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.176, "step": 43 }, { "epoch": 2.3157894736842106, - "grad_norm": 0.8690117597579956, + "grad_norm": 0.6385390758514404, "learning_rate": 0.00048268935362313215, - "loss": 0.7773, + "loss": 0.8484, "step": 44 }, { "epoch": 2.3157894736842106, - "eval_loss": 0.6773452162742615, - "eval_runtime": 3.426, - "eval_samples_per_second": 8.757, - "eval_steps_per_second": 1.168, + "eval_loss": 0.7740622758865356, + "eval_runtime": 3.4013, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, "step": 44 }, { "epoch": 2.3684210526315788, - "grad_norm": 0.804668664932251, + "grad_norm": 0.6478577852249146, "learning_rate": 0.00048144659397072586, - "loss": 0.6301, + "loss": 0.794, "step": 45 }, { "epoch": 2.3684210526315788, - "eval_loss": 0.677211344242096, - "eval_runtime": 3.4133, - "eval_samples_per_second": 8.789, - "eval_steps_per_second": 1.172, + "eval_loss": 0.7711488604545593, + "eval_runtime": 3.4029, + "eval_samples_per_second": 8.816, + "eval_steps_per_second": 1.175, "step": 45 }, { "epoch": 2.4210526315789473, - "grad_norm": 0.8675904870033264, + "grad_norm": 0.6230824589729309, "learning_rate": 0.0004801624716691072, - "loss": 0.765, + "loss": 0.8394, "step": 46 }, { "epoch": 2.4210526315789473, - "eval_loss": 0.6618334054946899, - "eval_runtime": 3.4058, - "eval_samples_per_second": 8.808, - "eval_steps_per_second": 1.174, + "eval_loss": 0.7640188932418823, + "eval_runtime": 3.3993, + "eval_samples_per_second": 8.825, + "eval_steps_per_second": 1.177, "step": 46 }, { "epoch": 2.473684210526316, - "grad_norm": 0.653973400592804, + "grad_norm": 0.5779664516448975, "learning_rate": 0.00047883721620836894, - "loss": 0.7057, + "loss": 0.7857, "step": 47 }, { "epoch": 2.473684210526316, - "eval_loss": 0.6598765850067139, - "eval_runtime": 3.4014, - "eval_samples_per_second": 8.82, - "eval_steps_per_second": 1.176, + "eval_loss": 0.758138120174408, + "eval_runtime": 3.3991, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 1.177, "step": 47 }, { "epoch": 2.526315789473684, - "grad_norm": 0.6789913177490234, + "grad_norm": 0.5758649110794067, "learning_rate": 0.0004774710644296578, - "loss": 0.6472, + "loss": 0.7685, "step": 48 }, { "epoch": 2.526315789473684, - "eval_loss": 0.6624078154563904, - "eval_runtime": 3.392, - "eval_samples_per_second": 8.844, - "eval_steps_per_second": 1.179, + "eval_loss": 0.7491741180419922, + "eval_runtime": 3.4037, + "eval_samples_per_second": 8.814, + "eval_steps_per_second": 1.175, "step": 48 }, { "epoch": 2.5789473684210527, - "grad_norm": 0.7310296893119812, + "grad_norm": 0.7427331805229187, "learning_rate": 0.00047606426048284813, - "loss": 0.7564, + "loss": 0.8529, "step": 49 }, { "epoch": 2.5789473684210527, - "eval_loss": 0.6666865944862366, - "eval_runtime": 3.3909, - "eval_samples_per_second": 8.847, - "eval_steps_per_second": 1.18, + "eval_loss": 0.7381884455680847, + "eval_runtime": 3.3985, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, "step": 49 }, { "epoch": 2.6315789473684212, - "grad_norm": 0.6358530521392822, + "grad_norm": 0.5156267285346985, "learning_rate": 0.00047461705578290833, - "loss": 0.6885, + "loss": 0.7453, "step": 50 }, { "epoch": 2.6315789473684212, - "eval_loss": 0.6546332240104675, - "eval_runtime": 3.3936, - "eval_samples_per_second": 8.84, - "eval_steps_per_second": 1.179, + "eval_loss": 0.735011637210846, + "eval_runtime": 3.3975, + "eval_samples_per_second": 8.83, + "eval_steps_per_second": 1.177, "step": 50 }, { "epoch": 2.6842105263157894, - "grad_norm": 0.7048445343971252, + "grad_norm": 0.5465694665908813, "learning_rate": 0.0004731297089649703, - "loss": 0.741, + "loss": 0.7681, "step": 51 }, { "epoch": 2.6842105263157894, - "eval_loss": 0.6262445449829102, - "eval_runtime": 3.3788, - "eval_samples_per_second": 8.879, - "eval_steps_per_second": 1.184, + "eval_loss": 0.7380778193473816, + "eval_runtime": 3.3945, + "eval_samples_per_second": 8.838, + "eval_steps_per_second": 1.178, "step": 51 }, { "epoch": 2.736842105263158, - "grad_norm": 0.6233330368995667, + "grad_norm": 0.5591109991073608, "learning_rate": 0.0004716024858381075, - "loss": 0.7005, + "loss": 0.7583, "step": 52 }, { "epoch": 2.736842105263158, - "eval_loss": 0.625355064868927, - "eval_runtime": 3.3944, - "eval_samples_per_second": 8.838, - "eval_steps_per_second": 1.178, + "eval_loss": 0.735223650932312, + "eval_runtime": 3.3923, + "eval_samples_per_second": 8.844, + "eval_steps_per_second": 1.179, "step": 52 }, { "epoch": 2.7894736842105265, - "grad_norm": 0.9111106991767883, + "grad_norm": 0.6300053596496582, "learning_rate": 0.00047003565933783123, - "loss": 0.7832, + "loss": 0.8622, "step": 53 }, { "epoch": 2.7894736842105265, - "eval_loss": 0.6406391263008118, - "eval_runtime": 3.4066, - "eval_samples_per_second": 8.806, - "eval_steps_per_second": 1.174, + "eval_loss": 0.7290965914726257, + "eval_runtime": 3.4013, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, "step": 53 }, { "epoch": 2.8421052631578947, - "grad_norm": 0.6745730042457581, + "grad_norm": 0.6577848792076111, "learning_rate": 0.0004684295094773134, - "loss": 0.7067, + "loss": 0.7678, "step": 54 }, { "epoch": 2.8421052631578947, - "eval_loss": 0.6424602270126343, - "eval_runtime": 3.4221, - "eval_samples_per_second": 8.766, - "eval_steps_per_second": 1.169, + "eval_loss": 0.7240878343582153, + "eval_runtime": 3.3989, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 1.177, "step": 54 }, { "epoch": 2.8947368421052633, - "grad_norm": 0.6174259185791016, + "grad_norm": 0.48959189653396606, "learning_rate": 0.00046678432329734434, - "loss": 0.6662, + "loss": 0.7592, "step": 55 }, { "epoch": 2.8947368421052633, - "eval_loss": 0.636626660823822, - "eval_runtime": 3.4142, - "eval_samples_per_second": 8.787, - "eval_steps_per_second": 1.172, + "eval_loss": 0.7289024591445923, + "eval_runtime": 3.4003, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 1.176, "step": 55 }, { "epoch": 2.9473684210526314, - "grad_norm": 1.0136440992355347, + "grad_norm": 0.6378675699234009, "learning_rate": 0.00046510039481503486, - "loss": 0.7744, + "loss": 0.8768, "step": 56 }, { "epoch": 2.9473684210526314, - "eval_loss": 0.606927752494812, - "eval_runtime": 3.4082, - "eval_samples_per_second": 8.802, + "eval_loss": 0.7245283722877502, + "eval_runtime": 3.4071, + "eval_samples_per_second": 8.805, "eval_steps_per_second": 1.174, "step": 56 }, { "epoch": 3.0, - "grad_norm": 0.9457768201828003, + "grad_norm": 0.533486545085907, "learning_rate": 0.00046337802497127117, - "loss": 0.7527, + "loss": 0.8078, "step": 57 }, { "epoch": 3.0, - "eval_loss": 0.5853554010391235, - "eval_runtime": 3.4062, - "eval_samples_per_second": 8.807, - "eval_steps_per_second": 1.174, + "eval_loss": 0.7103175520896912, + "eval_runtime": 3.4012, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, "step": 57 }, { "epoch": 3.0526315789473686, - "grad_norm": 0.590958297252655, + "grad_norm": 0.5410111546516418, "learning_rate": 0.00046161752157693284, - "loss": 0.5788, + "loss": 0.7147, "step": 58 }, { "epoch": 3.0526315789473686, - "eval_loss": 0.5719938278198242, - "eval_runtime": 3.3853, - "eval_samples_per_second": 8.862, - "eval_steps_per_second": 1.182, + "eval_loss": 0.6982213258743286, + "eval_runtime": 3.3871, + "eval_samples_per_second": 8.857, + "eval_steps_per_second": 1.181, "step": 58 }, { "epoch": 3.1052631578947367, - "grad_norm": 0.7152032256126404, + "grad_norm": 0.5490122437477112, "learning_rate": 0.0004598191992578828, - "loss": 0.5682, + "loss": 0.7584, "step": 59 }, { "epoch": 3.1052631578947367, - "eval_loss": 0.5641719102859497, - "eval_runtime": 3.3849, - "eval_samples_per_second": 8.863, - "eval_steps_per_second": 1.182, + "eval_loss": 0.6866177320480347, + "eval_runtime": 3.3873, + "eval_samples_per_second": 8.857, + "eval_steps_per_second": 1.181, "step": 59 }, { "epoch": 3.1578947368421053, - "grad_norm": 0.7634884119033813, + "grad_norm": 0.49469825625419617, "learning_rate": 0.00045798337939873923, - "loss": 0.6203, + "loss": 0.7261, "step": 60 }, { "epoch": 3.1578947368421053, - "eval_loss": 0.5324738621711731, - "eval_runtime": 3.381, - "eval_samples_per_second": 8.873, - "eval_steps_per_second": 1.183, + "eval_loss": 0.6730698943138123, + "eval_runtime": 3.3973, + "eval_samples_per_second": 8.83, + "eval_steps_per_second": 1.177, "step": 60 }, { "epoch": 3.2105263157894735, - "grad_norm": 0.792768120765686, + "grad_norm": 0.8399549126625061, "learning_rate": 0.0004561103900854401, - "loss": 0.4925, + "loss": 0.6503, "step": 61 }, { "epoch": 3.2105263157894735, - "eval_loss": 0.5288674235343933, - "eval_runtime": 3.3809, - "eval_samples_per_second": 8.873, - "eval_steps_per_second": 1.183, + "eval_loss": 0.6618488430976868, + "eval_runtime": 3.3947, + "eval_samples_per_second": 8.837, + "eval_steps_per_second": 1.178, "step": 61 }, { "epoch": 3.263157894736842, - "grad_norm": 0.9308955669403076, + "grad_norm": 0.5458311438560486, "learning_rate": 0.0004542005660466094, - "loss": 0.5549, + "loss": 0.7217, "step": 62 }, { "epoch": 3.263157894736842, - "eval_loss": 0.518241822719574, - "eval_runtime": 3.4116, - "eval_samples_per_second": 8.793, - "eval_steps_per_second": 1.172, + "eval_loss": 0.6508110761642456, + "eval_runtime": 3.4003, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 1.176, "step": 62 }, { "epoch": 3.3157894736842106, - "grad_norm": 0.8600835204124451, + "grad_norm": 0.9009385704994202, "learning_rate": 0.0004522542485937369, - "loss": 0.5303, + "loss": 0.6747, "step": 63 }, { "epoch": 3.3157894736842106, - "eval_loss": 0.5275946259498596, - "eval_runtime": 3.418, - "eval_samples_per_second": 8.777, - "eval_steps_per_second": 1.17, + "eval_loss": 0.6464059948921204, + "eval_runtime": 3.4046, + "eval_samples_per_second": 8.812, + "eval_steps_per_second": 1.175, "step": 63 }, { "epoch": 3.3684210526315788, - "grad_norm": 0.9408547282218933, + "grad_norm": 0.5399370193481445, "learning_rate": 0.0004502717855601809, - "loss": 0.5266, + "loss": 0.6838, "step": 64 }, { "epoch": 3.3684210526315788, - "eval_loss": 0.5335482954978943, - "eval_runtime": 3.4039, - "eval_samples_per_second": 8.813, - "eval_steps_per_second": 1.175, + "eval_loss": 0.6449176669120789, + "eval_runtime": 3.3903, + "eval_samples_per_second": 8.849, + "eval_steps_per_second": 1.18, "step": 64 }, { "epoch": 3.4210526315789473, - "grad_norm": 0.9287356734275818, + "grad_norm": 0.664746880531311, "learning_rate": 0.0004482535312390058, - "loss": 0.5422, + "loss": 0.6601, "step": 65 }, { "epoch": 3.4210526315789473, - "eval_loss": 0.5340169668197632, - "eval_runtime": 3.4, - "eval_samples_per_second": 8.824, - "eval_steps_per_second": 1.176, + "eval_loss": 0.6410928964614868, + "eval_runtime": 3.3948, + "eval_samples_per_second": 8.837, + "eval_steps_per_second": 1.178, "step": 65 }, { "epoch": 3.473684210526316, - "grad_norm": 0.7495890855789185, + "grad_norm": 0.7200000882148743, "learning_rate": 0.00044619984631966527, - "loss": 0.4125, + "loss": 0.5722, "step": 66 }, { "epoch": 3.473684210526316, - "eval_loss": 0.5400173664093018, - "eval_runtime": 3.3956, - "eval_samples_per_second": 8.835, - "eval_steps_per_second": 1.178, + "eval_loss": 0.6338309645652771, + "eval_runtime": 3.3867, + "eval_samples_per_second": 8.858, + "eval_steps_per_second": 1.181, "step": 66 }, { "epoch": 3.526315789473684, - "grad_norm": 0.7854607105255127, + "grad_norm": 0.8224210739135742, "learning_rate": 0.0004441110978235418, - "loss": 0.6212, + "loss": 0.6984, "step": 67 }, { "epoch": 3.526315789473684, - "eval_loss": 0.5298979878425598, + "eval_loss": 0.6232346892356873, "eval_runtime": 3.3872, "eval_samples_per_second": 8.857, "eval_steps_per_second": 1.181, @@ -1016,1098 +1016,2748 @@ }, { "epoch": 3.5789473684210527, - "grad_norm": 0.7772982120513916, + "grad_norm": 0.6948024034500122, "learning_rate": 0.0004419876590383554, - "loss": 0.6686, + "loss": 0.6921, "step": 68 }, { "epoch": 3.5789473684210527, - "eval_loss": 0.5085378885269165, - "eval_runtime": 3.3947, - "eval_samples_per_second": 8.837, - "eval_steps_per_second": 1.178, + "eval_loss": 0.6190816164016724, + "eval_runtime": 3.4096, + "eval_samples_per_second": 8.799, + "eval_steps_per_second": 1.173, "step": 68 }, { "epoch": 3.6315789473684212, - "grad_norm": 0.5734469890594482, + "grad_norm": 0.5954806804656982, "learning_rate": 0.00043982990945145146, - "loss": 0.5029, + "loss": 0.6452, "step": 69 }, { "epoch": 3.6315789473684212, - "eval_loss": 0.4952623248100281, - "eval_runtime": 3.394, - "eval_samples_per_second": 8.839, - "eval_steps_per_second": 1.179, + "eval_loss": 0.6215729117393494, + "eval_runtime": 3.4023, + "eval_samples_per_second": 8.818, + "eval_steps_per_second": 1.176, "step": 69 }, { "epoch": 3.6842105263157894, - "grad_norm": 0.7065844535827637, + "grad_norm": 0.6146106719970703, "learning_rate": 0.0004376382346819819, - "loss": 0.5456, + "loss": 0.6753, "step": 70 }, { "epoch": 3.6842105263157894, - "eval_loss": 0.4688592255115509, - "eval_runtime": 3.3926, - "eval_samples_per_second": 8.843, - "eval_steps_per_second": 1.179, + "eval_loss": 0.616372287273407, + "eval_runtime": 3.4004, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, "step": 70 }, { "epoch": 3.736842105263158, - "grad_norm": 0.7695831656455994, + "grad_norm": 0.6286161541938782, "learning_rate": 0.00043541302641198946, - "loss": 0.6181, + "loss": 0.7126, "step": 71 }, { "epoch": 3.736842105263158, - "eval_loss": 0.4361562430858612, - "eval_runtime": 3.3683, - "eval_samples_per_second": 8.907, - "eval_steps_per_second": 1.188, + "eval_loss": 0.6052109599113464, + "eval_runtime": 3.3873, + "eval_samples_per_second": 8.857, + "eval_steps_per_second": 1.181, "step": 71 }, { "epoch": 3.7894736842105265, - "grad_norm": 0.7607541084289551, + "grad_norm": 0.5700982213020325, "learning_rate": 0.00043315468231640834, - "loss": 0.5418, + "loss": 0.6126, "step": 72 }, { "epoch": 3.7894736842105265, - "eval_loss": 0.43303337693214417, - "eval_runtime": 3.4041, - "eval_samples_per_second": 8.813, - "eval_steps_per_second": 1.175, + "eval_loss": 0.6031004786491394, + "eval_runtime": 3.3922, + "eval_samples_per_second": 8.844, + "eval_steps_per_second": 1.179, "step": 72 }, { "epoch": 3.8421052631578947, - "grad_norm": 0.7018841505050659, + "grad_norm": 0.8683762550354004, "learning_rate": 0.00043086360599199516, - "loss": 0.5615, + "loss": 0.7278, "step": 73 }, { "epoch": 3.8421052631578947, - "eval_loss": 0.4260921776294708, - "eval_runtime": 3.4211, - "eval_samples_per_second": 8.769, - "eval_steps_per_second": 1.169, + "eval_loss": 0.5932725667953491, + "eval_runtime": 3.3962, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 1.178, "step": 73 }, { "epoch": 3.8947368421052633, - "grad_norm": 0.8144612312316895, + "grad_norm": 0.8634172081947327, "learning_rate": 0.0004285402068852002, - "loss": 0.5775, + "loss": 0.6826, "step": 74 }, { "epoch": 3.8947368421052633, - "eval_loss": 0.43498238921165466, - "eval_runtime": 3.4168, - "eval_samples_per_second": 8.78, - "eval_steps_per_second": 1.171, + "eval_loss": 0.5909937620162964, + "eval_runtime": 3.3983, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, "step": 74 }, { "epoch": 3.9473684210526314, - "grad_norm": 0.7396026849746704, + "grad_norm": 0.556474506855011, "learning_rate": 0.00042618490021899383, - "loss": 0.5516, + "loss": 0.65, "step": 75 }, { "epoch": 3.9473684210526314, - "eval_loss": 0.42656373977661133, - "eval_runtime": 3.4118, - "eval_samples_per_second": 8.793, - "eval_steps_per_second": 1.172, + "eval_loss": 0.5868418216705322, + "eval_runtime": 3.399, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 1.177, "step": 75 }, { "epoch": 4.0, - "grad_norm": 0.6484639048576355, + "grad_norm": 0.5346130728721619, "learning_rate": 0.00042379810691866064, - "loss": 0.5277, + "loss": 0.6475, "step": 76 }, { "epoch": 4.0, - "eval_loss": 0.4239371418952942, - "eval_runtime": 3.4094, - "eval_samples_per_second": 8.799, - "eval_steps_per_second": 1.173, - "step": 76 + "eval_loss": 0.588336706161499, + "eval_runtime": 3.3932, + "eval_samples_per_second": 8.841, + "eval_steps_per_second": 1.179, + "step": 76 }, { "epoch": 4.052631578947368, - "grad_norm": 0.6360411643981934, + "grad_norm": 0.4865156412124634, "learning_rate": 0.00042138025353657407, - "loss": 0.3413, + "loss": 0.5485, "step": 77 }, { "epoch": 4.052631578947368, - "eval_loss": 0.3958459198474884, - "eval_runtime": 3.4011, - "eval_samples_per_second": 8.821, - "eval_steps_per_second": 1.176, + "eval_loss": 0.5785155892372131, + "eval_runtime": 3.3941, + "eval_samples_per_second": 8.839, + "eval_steps_per_second": 1.179, "step": 77 }, { "epoch": 4.105263157894737, - "grad_norm": 1.2476710081100464, + "grad_norm": 0.5607722997665405, "learning_rate": 0.00041893177217596633, - "loss": 0.385, + "loss": 0.5699, "step": 78 }, { "epoch": 4.105263157894737, - "eval_loss": 0.36877045035362244, - "eval_runtime": 3.3944, - "eval_samples_per_second": 8.838, + "eval_loss": 0.5646374821662903, + "eval_runtime": 3.3965, + "eval_samples_per_second": 8.833, "eval_steps_per_second": 1.178, "step": 78 }, { "epoch": 4.157894736842105, - "grad_norm": 0.7502722144126892, + "grad_norm": 0.5337282419204712, "learning_rate": 0.0004164531004137049, - "loss": 0.3582, + "loss": 0.5308, "step": 79 }, { "epoch": 4.157894736842105, - "eval_loss": 0.3534345030784607, - "eval_runtime": 3.3925, + "eval_loss": 0.5542218685150146, + "eval_runtime": 3.3924, "eval_samples_per_second": 8.843, "eval_steps_per_second": 1.179, "step": 79 }, { "epoch": 4.2105263157894735, - "grad_norm": 0.8109405040740967, + "grad_norm": 1.7681509256362915, "learning_rate": 0.0004139446812220924, - "loss": 0.373, + "loss": 0.5458, "step": 80 }, { "epoch": 4.2105263157894735, - "eval_loss": 0.33822891116142273, - "eval_runtime": 3.3885, - "eval_samples_per_second": 8.853, - "eval_steps_per_second": 1.18, + "eval_loss": 0.5494810938835144, + "eval_runtime": 3.3951, + "eval_samples_per_second": 8.836, + "eval_steps_per_second": 1.178, "step": 80 }, { "epoch": 4.2631578947368425, - "grad_norm": 1.0521330833435059, + "grad_norm": 0.8153849244117737, "learning_rate": 0.0004114069628897006, - "loss": 0.3709, + "loss": 0.592, "step": 81 }, { "epoch": 4.2631578947368425, - "eval_loss": 0.33233410120010376, - "eval_runtime": 3.3863, - "eval_samples_per_second": 8.859, - "eval_steps_per_second": 1.181, + "eval_loss": 0.5404940843582153, + "eval_runtime": 3.3937, + "eval_samples_per_second": 8.84, + "eval_steps_per_second": 1.179, "step": 81 }, { "epoch": 4.315789473684211, - "grad_norm": 0.7640473246574402, + "grad_norm": 0.7037251591682434, "learning_rate": 0.0004088403989412559, - "loss": 0.3946, + "loss": 0.579, "step": 82 }, { "epoch": 4.315789473684211, - "eval_loss": 0.328426718711853, - "eval_runtime": 3.3972, - "eval_samples_per_second": 8.831, - "eval_steps_per_second": 1.177, + "eval_loss": 0.530238926410675, + "eval_runtime": 3.3957, + "eval_samples_per_second": 8.835, + "eval_steps_per_second": 1.178, "step": 82 }, { "epoch": 4.368421052631579, - "grad_norm": 0.8075594305992126, + "grad_norm": 0.6703127026557922, "learning_rate": 0.00040624544805658794, - "loss": 0.3656, + "loss": 0.5513, "step": 83 }, { "epoch": 4.368421052631579, - "eval_loss": 0.33396559953689575, - "eval_runtime": 3.4167, - "eval_samples_per_second": 8.78, - "eval_steps_per_second": 1.171, + "eval_loss": 0.5282605290412903, + "eval_runtime": 3.4012, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, "step": 83 }, { "epoch": 4.421052631578947, - "grad_norm": 0.8138145208358765, + "grad_norm": 0.7324157357215881, "learning_rate": 0.00040362257398865713, - "loss": 0.4204, + "loss": 0.6175, "step": 84 }, { "epoch": 4.421052631578947, - "eval_loss": 0.3294268548488617, - "eval_runtime": 3.4263, - "eval_samples_per_second": 8.756, - "eval_steps_per_second": 1.167, + "eval_loss": 0.5271756052970886, + "eval_runtime": 3.3941, + "eval_samples_per_second": 8.839, + "eval_steps_per_second": 1.179, "step": 84 }, { "epoch": 4.473684210526316, - "grad_norm": 0.7655662894248962, + "grad_norm": 0.7354516386985779, "learning_rate": 0.00040097224548067613, - "loss": 0.3523, + "loss": 0.5497, "step": 85 }, { "epoch": 4.473684210526316, - "eval_loss": 0.3323798179626465, - "eval_runtime": 3.4171, - "eval_samples_per_second": 8.779, - "eval_steps_per_second": 1.171, + "eval_loss": 0.5268288850784302, + "eval_runtime": 3.397, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, "step": 85 }, { "epoch": 4.526315789473684, - "grad_norm": 1.1450239419937134, + "grad_norm": 0.6430884599685669, "learning_rate": 0.0003982949361823388, - "loss": 0.3841, + "loss": 0.5323, "step": 86 }, { "epoch": 4.526315789473684, - "eval_loss": 0.32615649700164795, - "eval_runtime": 3.4067, - "eval_samples_per_second": 8.806, + "eval_loss": 0.5271150469779968, + "eval_runtime": 3.4081, + "eval_samples_per_second": 8.803, "eval_steps_per_second": 1.174, "step": 86 }, { "epoch": 4.578947368421053, - "grad_norm": 0.906433641910553, + "grad_norm": 0.6861183643341064, "learning_rate": 0.0003955911245651726, - "loss": 0.3944, + "loss": 0.555, "step": 87 }, { "epoch": 4.578947368421053, - "eval_loss": 0.31898513436317444, - "eval_runtime": 3.4041, - "eval_samples_per_second": 8.813, - "eval_steps_per_second": 1.175, + "eval_loss": 0.5218092799186707, + "eval_runtime": 3.3947, + "eval_samples_per_second": 8.837, + "eval_steps_per_second": 1.178, "step": 87 }, { "epoch": 4.631578947368421, - "grad_norm": 0.8530308604240417, + "grad_norm": 0.6339515447616577, "learning_rate": 0.0003928612938370292, - "loss": 0.3458, + "loss": 0.5396, "step": 88 }, { "epoch": 4.631578947368421, - "eval_loss": 0.3205549418926239, - "eval_runtime": 3.3926, - "eval_samples_per_second": 8.843, - "eval_steps_per_second": 1.179, + "eval_loss": 0.5187237858772278, + "eval_runtime": 3.3968, + "eval_samples_per_second": 8.832, + "eval_steps_per_second": 1.178, "step": 88 }, { "epoch": 4.684210526315789, - "grad_norm": 0.8352761268615723, + "grad_norm": 0.5840083360671997, "learning_rate": 0.00039010593185572867, - "loss": 0.2743, + "loss": 0.5043, "step": 89 }, { "epoch": 4.684210526315789, - "eval_loss": 0.2936182916164398, - "eval_runtime": 3.387, - "eval_samples_per_second": 8.857, - "eval_steps_per_second": 1.181, + "eval_loss": 0.5117171406745911, + "eval_runtime": 3.3945, + "eval_samples_per_second": 8.838, + "eval_steps_per_second": 1.178, "step": 89 }, { "epoch": 4.7368421052631575, - "grad_norm": 0.5991601943969727, + "grad_norm": 0.6243887543678284, "learning_rate": 0.00038732553104187296, - "loss": 0.3021, + "loss": 0.4985, "step": 90 }, { "epoch": 4.7368421052631575, - "eval_loss": 0.27553170919418335, - "eval_runtime": 3.3868, - "eval_samples_per_second": 8.858, - "eval_steps_per_second": 1.181, + "eval_loss": 0.5013009905815125, + "eval_runtime": 3.3983, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, "step": 90 }, { "epoch": 4.7894736842105265, - "grad_norm": 0.8881103992462158, + "grad_norm": 0.7383096814155579, "learning_rate": 0.0003845205882908432, - "loss": 0.4067, + "loss": 0.5446, "step": 91 }, { "epoch": 4.7894736842105265, - "eval_loss": 0.2532661557197571, - "eval_runtime": 3.37, - "eval_samples_per_second": 8.902, - "eval_steps_per_second": 1.187, + "eval_loss": 0.48944994807243347, + "eval_runtime": 3.3912, + "eval_samples_per_second": 8.846, + "eval_steps_per_second": 1.18, "step": 91 }, { "epoch": 4.842105263157895, - "grad_norm": 0.8522405624389648, + "grad_norm": 0.7017186880111694, "learning_rate": 0.0003816916048839979, - "loss": 0.3154, + "loss": 0.4855, "step": 92 }, { "epoch": 4.842105263157895, - "eval_loss": 0.2513314485549927, - "eval_runtime": 3.3938, - "eval_samples_per_second": 8.84, + "eval_loss": 0.490288108587265, + "eval_runtime": 3.392, + "eval_samples_per_second": 8.844, "eval_steps_per_second": 1.179, "step": 92 }, { "epoch": 4.894736842105263, - "grad_norm": 0.8646958470344543, + "grad_norm": 0.803577184677124, "learning_rate": 0.0003788390863990875, - "loss": 0.4164, + "loss": 0.599, "step": 93 }, { "epoch": 4.894736842105263, - "eval_loss": 0.2513067126274109, - "eval_runtime": 3.4157, - "eval_samples_per_second": 8.783, - "eval_steps_per_second": 1.171, + "eval_loss": 0.48545849323272705, + "eval_runtime": 3.3984, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, "step": 93 }, { "epoch": 4.947368421052632, - "grad_norm": 0.8191807270050049, + "grad_norm": 0.719249963760376, "learning_rate": 0.00037596354261990007, - "loss": 0.4242, + "loss": 0.5539, "step": 94 }, { "epoch": 4.947368421052632, - "eval_loss": 0.26223641633987427, - "eval_runtime": 3.4207, - "eval_samples_per_second": 8.77, - "eval_steps_per_second": 1.169, + "eval_loss": 0.4850545823574066, + "eval_runtime": 3.4015, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, "step": 94 }, { "epoch": 5.0, - "grad_norm": 0.8828155994415283, + "grad_norm": 0.7983654141426086, "learning_rate": 0.0003730654874451569, - "loss": 0.4332, + "loss": 0.5899, "step": 95 }, { "epoch": 5.0, - "eval_loss": 0.2630893886089325, - "eval_runtime": 3.4111, - "eval_samples_per_second": 8.795, - "eval_steps_per_second": 1.173, + "eval_loss": 0.47937095165252686, + "eval_runtime": 3.4007, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, "step": 95 }, { "epoch": 5.052631578947368, - "grad_norm": 0.8348397612571716, + "grad_norm": 0.6120598316192627, "learning_rate": 0.00037014543879667093, - "loss": 0.1959, + "loss": 0.4219, "step": 96 }, { "epoch": 5.052631578947368, - "eval_loss": 0.2434958517551422, - "eval_runtime": 3.404, - "eval_samples_per_second": 8.813, - "eval_steps_per_second": 1.175, + "eval_loss": 0.46941977739334106, + "eval_runtime": 3.3985, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, "step": 96 }, { "epoch": 5.105263157894737, - "grad_norm": 0.6735607981681824, + "grad_norm": 0.7291161417961121, "learning_rate": 0.0003672039185267878, - "loss": 0.2632, + "loss": 0.5002, "step": 97 }, { "epoch": 5.105263157894737, - "eval_loss": 0.21788616478443146, - "eval_runtime": 3.4036, - "eval_samples_per_second": 8.814, + "eval_loss": 0.45138782262802124, + "eval_runtime": 3.4042, + "eval_samples_per_second": 8.813, "eval_steps_per_second": 1.175, "step": 97 }, { "epoch": 5.157894736842105, - "grad_norm": 0.8541862368583679, + "grad_norm": 0.5574305057525635, "learning_rate": 0.00036424145232512333, - "loss": 0.2315, + "loss": 0.4445, "step": 98 }, { "epoch": 5.157894736842105, - "eval_loss": 0.20070011913776398, - "eval_runtime": 3.4, - "eval_samples_per_second": 8.824, + "eval_loss": 0.43881431221961975, + "eval_runtime": 3.4021, + "eval_samples_per_second": 8.818, "eval_steps_per_second": 1.176, "step": 98 }, { "epoch": 5.2105263157894735, - "grad_norm": 0.8920652270317078, + "grad_norm": 0.7164113521575928, "learning_rate": 0.0003612585696246158, - "loss": 0.2045, + "loss": 0.4292, "step": 99 }, { "epoch": 5.2105263157894735, - "eval_loss": 0.19080133736133575, - "eval_runtime": 3.3957, - "eval_samples_per_second": 8.835, - "eval_steps_per_second": 1.178, + "eval_loss": 0.43201857805252075, + "eval_runtime": 3.404, + "eval_samples_per_second": 8.813, + "eval_steps_per_second": 1.175, "step": 99 }, { "epoch": 5.2631578947368425, - "grad_norm": 1.0066344738006592, + "grad_norm": 0.7618677020072937, "learning_rate": 0.0003582558035069091, - "loss": 0.2322, + "loss": 0.4598, "step": 100 }, { "epoch": 5.2631578947368425, - "eval_loss": 0.19216839969158173, - "eval_runtime": 3.393, - "eval_samples_per_second": 8.842, - "eval_steps_per_second": 1.179, + "eval_loss": 0.434807151556015, + "eval_runtime": 3.3997, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.177, "step": 100 }, { "epoch": 5.315789473684211, - "grad_norm": 1.0094186067581177, + "grad_norm": 0.8746724128723145, "learning_rate": 0.0003552336906070838, - "loss": 0.234, + "loss": 0.4326, "step": 101 }, { "epoch": 5.315789473684211, - "eval_loss": 0.18470846116542816, - "eval_runtime": 3.3781, - "eval_samples_per_second": 8.881, - "eval_steps_per_second": 1.184, + "eval_loss": 0.4299829304218292, + "eval_runtime": 3.3922, + "eval_samples_per_second": 8.844, + "eval_steps_per_second": 1.179, "step": 101 }, { "epoch": 5.368421052631579, - "grad_norm": 0.7846252918243408, + "grad_norm": 0.8440446257591248, "learning_rate": 0.000352192771017753, - "loss": 0.2102, + "loss": 0.4344, "step": 102 }, { "epoch": 5.368421052631579, - "eval_loss": 0.1826123297214508, - "eval_runtime": 3.4041, - "eval_samples_per_second": 8.813, - "eval_steps_per_second": 1.175, + "eval_loss": 0.4243197739124298, + "eval_runtime": 3.3935, + "eval_samples_per_second": 8.84, + "eval_steps_per_second": 1.179, "step": 102 }, { "epoch": 5.421052631578947, - "grad_norm": 0.9099224209785461, + "grad_norm": 0.7834837436676025, "learning_rate": 0.0003491335881925407, - "loss": 0.2158, + "loss": 0.4662, "step": 103 }, { "epoch": 5.421052631578947, - "eval_loss": 0.17790839076042175, - "eval_runtime": 3.4112, - "eval_samples_per_second": 8.794, - "eval_steps_per_second": 1.173, + "eval_loss": 0.42057812213897705, + "eval_runtime": 3.3965, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 1.178, "step": 103 }, { "epoch": 5.473684210526316, - "grad_norm": 0.9225228428840637, + "grad_norm": 0.7472103834152222, "learning_rate": 0.0003460566888489593, - "loss": 0.2508, + "loss": 0.418, "step": 104 }, { "epoch": 5.473684210526316, - "eval_loss": 0.1678052842617035, - "eval_runtime": 3.4176, - "eval_samples_per_second": 8.778, - "eval_steps_per_second": 1.17, + "eval_loss": 0.4140828549861908, + "eval_runtime": 3.3928, + "eval_samples_per_second": 8.842, + "eval_steps_per_second": 1.179, "step": 104 }, { "epoch": 5.526315789473684, - "grad_norm": 0.7504759430885315, + "grad_norm": 0.8624552488327026, "learning_rate": 0.00034296262287070335, - "loss": 0.1729, + "loss": 0.3972, "step": 105 }, { "epoch": 5.526315789473684, - "eval_loss": 0.16910715401172638, - "eval_runtime": 3.4029, - "eval_samples_per_second": 8.816, - "eval_steps_per_second": 1.175, + "eval_loss": 0.4123520851135254, + "eval_runtime": 3.3971, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, "step": 105 }, { "epoch": 5.578947368421053, - "grad_norm": 0.9986382126808167, + "grad_norm": 0.8890901207923889, "learning_rate": 0.0003398519432093782, - "loss": 0.2501, + "loss": 0.4657, "step": 106 }, { "epoch": 5.578947368421053, - "eval_loss": 0.15678884088993073, - "eval_runtime": 3.402, - "eval_samples_per_second": 8.818, - "eval_steps_per_second": 1.176, + "eval_loss": 0.40413472056388855, + "eval_runtime": 3.3959, + "eval_samples_per_second": 8.834, + "eval_steps_per_second": 1.178, "step": 106 }, { "epoch": 5.631578947368421, - "grad_norm": 0.8715238571166992, + "grad_norm": 0.7559741139411926, "learning_rate": 0.0003367252057856802, - "loss": 0.2378, + "loss": 0.4583, "step": 107 }, { "epoch": 5.631578947368421, - "eval_loss": 0.15522931516170502, - "eval_runtime": 3.4027, - "eval_samples_per_second": 8.817, - "eval_steps_per_second": 1.176, + "eval_loss": 0.3957214951515198, + "eval_runtime": 3.3997, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.177, "step": 107 }, { "epoch": 5.684210526315789, - "grad_norm": 0.7107959985733032, + "grad_norm": 0.7100098729133606, "learning_rate": 0.00033358296939004547, - "loss": 0.2041, + "loss": 0.4226, "step": 108 }, { "epoch": 5.684210526315789, - "eval_loss": 0.15928159654140472, - "eval_runtime": 3.3917, - "eval_samples_per_second": 8.845, - "eval_steps_per_second": 1.179, + "eval_loss": 0.3925686478614807, + "eval_runtime": 3.3977, + "eval_samples_per_second": 8.83, + "eval_steps_per_second": 1.177, "step": 108 }, { "epoch": 5.7368421052631575, - "grad_norm": 0.9013569951057434, + "grad_norm": 0.693897545337677, "learning_rate": 0.00033042579558278717, - "loss": 0.2041, + "loss": 0.4317, "step": 109 }, { "epoch": 5.7368421052631575, - "eval_loss": 0.14394816756248474, - "eval_runtime": 3.3922, - "eval_samples_per_second": 8.844, - "eval_steps_per_second": 1.179, + "eval_loss": 0.38951781392097473, + "eval_runtime": 3.3988, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, "step": 109 }, { "epoch": 5.7894736842105265, - "grad_norm": 0.8455482125282288, + "grad_norm": 0.8033037781715393, "learning_rate": 0.00032725424859373687, - "loss": 0.2566, + "loss": 0.4543, "step": 110 }, { "epoch": 5.7894736842105265, - "eval_loss": 0.13271644711494446, - "eval_runtime": 3.3907, - "eval_samples_per_second": 8.848, - "eval_steps_per_second": 1.18, + "eval_loss": 0.38237908482551575, + "eval_runtime": 3.3962, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 1.178, "step": 110 }, { "epoch": 5.842105263157895, - "grad_norm": 0.8474248647689819, + "grad_norm": 0.8621124625205994, "learning_rate": 0.0003240688952214085, - "loss": 0.2349, + "loss": 0.4746, "step": 111 }, { "epoch": 5.842105263157895, - "eval_loss": 0.12245019525289536, - "eval_runtime": 3.3761, - "eval_samples_per_second": 8.886, - "eval_steps_per_second": 1.185, + "eval_loss": 0.373757928609848, + "eval_runtime": 3.3971, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, "step": 111 }, { "epoch": 5.894736842105263, - "grad_norm": 0.6457234025001526, + "grad_norm": 0.8101131319999695, "learning_rate": 0.00032087030473170445, - "loss": 0.1739, + "loss": 0.3917, "step": 112 }, { "epoch": 5.894736842105263, - "eval_loss": 0.11227525770664215, - "eval_runtime": 3.3894, - "eval_samples_per_second": 8.851, + "eval_loss": 0.3614272177219391, + "eval_runtime": 3.39, + "eval_samples_per_second": 8.85, "eval_steps_per_second": 1.18, "step": 112 }, { "epoch": 5.947368421052632, - "grad_norm": 0.7094982862472534, + "grad_norm": 0.6941331624984741, "learning_rate": 0.00031765904875617973, - "loss": 0.1764, + "loss": 0.3344, "step": 113 }, { "epoch": 5.947368421052632, - "eval_loss": 0.10451077669858932, - "eval_runtime": 3.4083, - "eval_samples_per_second": 8.802, - "eval_steps_per_second": 1.174, + "eval_loss": 0.35045164823532104, + "eval_runtime": 3.3993, + "eval_samples_per_second": 8.825, + "eval_steps_per_second": 1.177, "step": 113 }, { "epoch": 6.0, - "grad_norm": 0.7440356612205505, + "grad_norm": 0.6586763262748718, "learning_rate": 0.00031443570118988356, - "loss": 0.183, + "loss": 0.3539, "step": 114 }, { "epoch": 6.0, - "eval_loss": 0.1312665045261383, - "eval_runtime": 3.4165, - "eval_samples_per_second": 8.781, - "eval_steps_per_second": 1.171, + "eval_loss": 0.34484100341796875, + "eval_runtime": 3.3948, + "eval_samples_per_second": 8.837, + "eval_steps_per_second": 1.178, "step": 114 }, { "epoch": 6.052631578947368, - "grad_norm": 3.956552743911743, + "grad_norm": 0.7052369713783264, "learning_rate": 0.00031120083808879663, - "loss": 0.1362, + "loss": 0.3257, "step": 115 }, { "epoch": 6.052631578947368, - "eval_loss": 0.10527395457029343, - "eval_runtime": 3.4101, - "eval_samples_per_second": 8.797, - "eval_steps_per_second": 1.173, + "eval_loss": 0.3385400176048279, + "eval_runtime": 3.4041, + "eval_samples_per_second": 8.813, + "eval_steps_per_second": 1.175, "step": 115 }, { "epoch": 6.105263157894737, - "grad_norm": 0.6579440236091614, + "grad_norm": 0.8040263056755066, "learning_rate": 0.0003079550375668821, - "loss": 0.1144, + "loss": 0.335, "step": 116 }, { "epoch": 6.105263157894737, - "eval_loss": 0.1004921942949295, - "eval_runtime": 3.4068, - "eval_samples_per_second": 8.806, - "eval_steps_per_second": 1.174, + "eval_loss": 0.3320732116699219, + "eval_runtime": 3.4012, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, "step": 116 }, { "epoch": 6.157894736842105, - "grad_norm": 0.5682271718978882, + "grad_norm": 0.9117230772972107, "learning_rate": 0.00030469887969276877, - "loss": 0.0992, + "loss": 0.3133, "step": 117 }, { "epoch": 6.157894736842105, - "eval_loss": 0.0927642211318016, - "eval_runtime": 3.3986, - "eval_samples_per_second": 8.827, - "eval_steps_per_second": 1.177, + "eval_loss": 0.328256756067276, + "eval_runtime": 3.4045, + "eval_samples_per_second": 8.812, + "eval_steps_per_second": 1.175, "step": 117 }, { "epoch": 6.2105263157894735, - "grad_norm": 0.6567280888557434, + "grad_norm": 0.8745028972625732, "learning_rate": 0.00030143294638608487, - "loss": 0.0722, + "loss": 0.2972, "step": 118 }, { "epoch": 6.2105263157894735, - "eval_loss": 0.08658146113157272, - "eval_runtime": 3.3927, - "eval_samples_per_second": 8.843, - "eval_steps_per_second": 1.179, + "eval_loss": 0.3161332905292511, + "eval_runtime": 3.3982, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, "step": 118 }, { "epoch": 6.2631578947368425, - "grad_norm": 0.515227198600769, + "grad_norm": 0.893980085849762, "learning_rate": 0.00029815782131346137, - "loss": 0.0772, + "loss": 0.3135, "step": 119 }, { "epoch": 6.2631578947368425, - "eval_loss": 0.08781375735998154, - "eval_runtime": 3.3931, - "eval_samples_per_second": 8.841, - "eval_steps_per_second": 1.179, + "eval_loss": 0.3072938621044159, + "eval_runtime": 3.3977, + "eval_samples_per_second": 8.83, + "eval_steps_per_second": 1.177, "step": 119 }, { "epoch": 6.315789473684211, - "grad_norm": 0.8050438761711121, + "grad_norm": 0.8993279337882996, "learning_rate": 0.0002948740897842223, - "loss": 0.1098, + "loss": 0.2965, "step": 120 }, { "epoch": 6.315789473684211, - "eval_loss": 0.08503618091344833, - "eval_runtime": 3.3911, - "eval_samples_per_second": 8.847, + "eval_loss": 0.3108386695384979, + "eval_runtime": 3.3904, + "eval_samples_per_second": 8.849, "eval_steps_per_second": 1.18, "step": 120 }, { "epoch": 6.368421052631579, - "grad_norm": 0.5926359295845032, + "grad_norm": 0.8741037845611572, "learning_rate": 0.00029158233864578256, - "loss": 0.073, + "loss": 0.2753, "step": 121 }, { "epoch": 6.368421052631579, - "eval_loss": 0.08083069324493408, - "eval_runtime": 3.3784, - "eval_samples_per_second": 8.88, - "eval_steps_per_second": 1.184, + "eval_loss": 0.31585294008255005, + "eval_runtime": 3.3843, + "eval_samples_per_second": 8.865, + "eval_steps_per_second": 1.182, "step": 121 }, { "epoch": 6.421052631578947, - "grad_norm": 0.6040995121002197, + "grad_norm": 0.8745630979537964, "learning_rate": 0.00028828315617877, - "loss": 0.0605, + "loss": 0.305, "step": 122 }, { "epoch": 6.421052631578947, - "eval_loss": 0.07534124702215195, - "eval_runtime": 3.3976, - "eval_samples_per_second": 8.83, - "eval_steps_per_second": 1.177, + "eval_loss": 0.31079187989234924, + "eval_runtime": 3.3933, + "eval_samples_per_second": 8.841, + "eval_steps_per_second": 1.179, "step": 122 }, { "epoch": 6.473684210526316, - "grad_norm": 0.7090303301811218, + "grad_norm": 0.8834717869758606, "learning_rate": 0.0002849771319918922, - "loss": 0.0738, + "loss": 0.3354, "step": 123 }, { "epoch": 6.473684210526316, - "eval_loss": 0.07838510721921921, - "eval_runtime": 3.4089, - "eval_samples_per_second": 8.801, - "eval_steps_per_second": 1.173, + "eval_loss": 0.30564117431640625, + "eval_runtime": 3.3954, + "eval_samples_per_second": 8.836, + "eval_steps_per_second": 1.178, "step": 123 }, { "epoch": 6.526315789473684, - "grad_norm": 0.7555331587791443, + "grad_norm": 0.8826112151145935, "learning_rate": 0.00028166485691656423, - "loss": 0.0938, + "loss": 0.301, "step": 124 }, { "epoch": 6.526315789473684, - "eval_loss": 0.07789373397827148, - "eval_runtime": 3.4254, - "eval_samples_per_second": 8.758, - "eval_steps_per_second": 1.168, + "eval_loss": 0.2981402277946472, + "eval_runtime": 3.3997, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.177, "step": 124 }, { "epoch": 6.578947368421053, - "grad_norm": 0.5972513556480408, + "grad_norm": 0.7558391094207764, "learning_rate": 0.00027834692290132053, - "loss": 0.076, + "loss": 0.2935, "step": 125 }, { "epoch": 6.578947368421053, - "eval_loss": 0.07299390435218811, - "eval_runtime": 3.4358, - "eval_samples_per_second": 8.732, - "eval_steps_per_second": 1.164, + "eval_loss": 0.29539814591407776, + "eval_runtime": 3.3982, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, "step": 125 }, { "epoch": 6.631578947368421, - "grad_norm": 0.6388722062110901, + "grad_norm": 1.2316842079162598, "learning_rate": 0.0002750239229060246, - "loss": 0.0801, + "loss": 0.2295, "step": 126 }, { "epoch": 6.631578947368421, - "eval_loss": 0.06956001371145248, - "eval_runtime": 3.4214, - "eval_samples_per_second": 8.768, - "eval_steps_per_second": 1.169, + "eval_loss": 0.29493311047554016, + "eval_runtime": 3.3983, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, "step": 126 }, { "epoch": 6.684210526315789, - "grad_norm": 0.7171911001205444, + "grad_norm": 0.872908353805542, "learning_rate": 0.0002716964507958994, - "loss": 0.1169, + "loss": 0.3214, "step": 127 }, { "epoch": 6.684210526315789, - "eval_loss": 0.052916835993528366, - "eval_runtime": 3.4099, - "eval_samples_per_second": 8.798, - "eval_steps_per_second": 1.173, + "eval_loss": 0.2816743552684784, + "eval_runtime": 3.3969, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.178, "step": 127 }, { "epoch": 6.7368421052631575, - "grad_norm": 0.5959410667419434, + "grad_norm": 1.1845930814743042, "learning_rate": 0.0002683651012353955, - "loss": 0.0777, + "loss": 0.3108, "step": 128 }, { "epoch": 6.7368421052631575, - "eval_loss": 0.04862135648727417, - "eval_runtime": 3.3998, - "eval_samples_per_second": 8.824, - "eval_steps_per_second": 1.177, + "eval_loss": 0.27218949794769287, + "eval_runtime": 3.4046, + "eval_samples_per_second": 8.812, + "eval_steps_per_second": 1.175, "step": 128 }, { "epoch": 6.7894736842105265, - "grad_norm": 0.5596298575401306, + "grad_norm": 0.8063351511955261, "learning_rate": 0.0002650304695819168, - "loss": 0.0732, + "loss": 0.2863, "step": 129 }, { "epoch": 6.7894736842105265, - "eval_loss": 0.04528380557894707, - "eval_runtime": 3.389, - "eval_samples_per_second": 8.852, - "eval_steps_per_second": 1.18, + "eval_loss": 0.26498475670814514, + "eval_runtime": 3.4002, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 1.176, "step": 129 }, { "epoch": 6.842105263157895, - "grad_norm": 0.6277874112129211, + "grad_norm": 0.8428151607513428, "learning_rate": 0.00026169315177942135, - "loss": 0.1186, + "loss": 0.3621, "step": 130 }, { "epoch": 6.842105263157895, - "eval_loss": 0.03978530690073967, - "eval_runtime": 3.3863, - "eval_samples_per_second": 8.859, - "eval_steps_per_second": 1.181, + "eval_loss": 0.26111218333244324, + "eval_runtime": 3.4007, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, "step": 130 }, { "epoch": 6.894736842105263, - "grad_norm": 0.603584349155426, + "grad_norm": 0.7834460139274597, "learning_rate": 0.0002583537442519187, - "loss": 0.0886, + "loss": 0.3314, "step": 131 }, { "epoch": 6.894736842105263, - "eval_loss": 0.03136228770017624, - "eval_runtime": 3.3778, - "eval_samples_per_second": 8.882, - "eval_steps_per_second": 1.184, + "eval_loss": 0.2560313940048218, + "eval_runtime": 3.3993, + "eval_samples_per_second": 8.825, + "eval_steps_per_second": 1.177, "step": 131 }, { "epoch": 6.947368421052632, - "grad_norm": 0.5069896578788757, + "grad_norm": 0.7809928059577942, "learning_rate": 0.00025501284379688067, - "loss": 0.0738, + "loss": 0.3259, "step": 132 }, { "epoch": 6.947368421052632, - "eval_loss": 0.035538725554943085, - "eval_runtime": 3.3926, - "eval_samples_per_second": 8.843, - "eval_steps_per_second": 1.179, + "eval_loss": 0.24695177376270294, + "eval_runtime": 3.3903, + "eval_samples_per_second": 8.849, + "eval_steps_per_second": 1.18, "step": 132 }, { "epoch": 7.0, - "grad_norm": 0.5902580618858337, + "grad_norm": 0.679124116897583, "learning_rate": 0.0002516710474785856, - "loss": 0.0666, + "loss": 0.2897, "step": 133 }, { "epoch": 7.0, - "eval_loss": 0.03617199510335922, - "eval_runtime": 3.4108, - "eval_samples_per_second": 8.796, - "eval_steps_per_second": 1.173, + "eval_loss": 0.23945002257823944, + "eval_runtime": 3.393, + "eval_samples_per_second": 8.842, + "eval_steps_per_second": 1.179, "step": 133 }, { "epoch": 7.052631578947368, - "grad_norm": 0.38795194029808044, + "grad_norm": 0.7083767056465149, "learning_rate": 0.0002483289525214145, - "loss": 0.043, + "loss": 0.2112, "step": 134 }, { "epoch": 7.052631578947368, - "eval_loss": 0.030730299651622772, - "eval_runtime": 3.4158, - "eval_samples_per_second": 8.783, - "eval_steps_per_second": 1.171, + "eval_loss": 0.22805434465408325, + "eval_runtime": 3.3927, + "eval_samples_per_second": 8.843, + "eval_steps_per_second": 1.179, "step": 134 }, { "epoch": 7.105263157894737, - "grad_norm": 0.37176647782325745, + "grad_norm": 0.7295684218406677, "learning_rate": 0.00024498715620311935, - "loss": 0.0234, + "loss": 0.1686, "step": 135 }, { "epoch": 7.105263157894737, - "eval_loss": 0.028628086671233177, - "eval_runtime": 3.4136, - "eval_samples_per_second": 8.788, - "eval_steps_per_second": 1.172, + "eval_loss": 0.22213517129421234, + "eval_runtime": 3.3986, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, "step": 135 }, { "epoch": 7.157894736842105, - "grad_norm": 0.3922203779220581, + "grad_norm": 0.9701097011566162, "learning_rate": 0.00024164625574808144, - "loss": 0.0276, + "loss": 0.192, "step": 136 }, { "epoch": 7.157894736842105, - "eval_loss": 0.02898269146680832, - "eval_runtime": 3.4054, - "eval_samples_per_second": 8.81, - "eval_steps_per_second": 1.175, + "eval_loss": 0.21580030024051666, + "eval_runtime": 3.3985, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, "step": 136 }, { "epoch": 7.2105263157894735, - "grad_norm": 0.49020880460739136, + "grad_norm": 0.9494478702545166, "learning_rate": 0.00023830684822057877, - "loss": 0.0543, + "loss": 0.205, "step": 137 }, { "epoch": 7.2105263157894735, - "eval_loss": 0.029869915917515755, - "eval_runtime": 3.4009, - "eval_samples_per_second": 8.821, + "eval_loss": 0.21216638386249542, + "eval_runtime": 3.4006, + "eval_samples_per_second": 8.822, "eval_steps_per_second": 1.176, "step": 137 }, { "epoch": 7.2631578947368425, - "grad_norm": 0.4029361307621002, + "grad_norm": 0.6897704005241394, "learning_rate": 0.00023496953041808325, - "loss": 0.0244, + "loss": 0.1542, "step": 138 }, { "epoch": 7.2631578947368425, - "eval_loss": 0.03184913843870163, - "eval_runtime": 3.3998, - "eval_samples_per_second": 8.824, - "eval_steps_per_second": 1.177, + "eval_loss": 0.21432656049728394, + "eval_runtime": 3.3968, + "eval_samples_per_second": 8.832, + "eval_steps_per_second": 1.178, "step": 138 }, { "epoch": 7.315789473684211, - "grad_norm": 0.5920937061309814, + "grad_norm": 0.7690937519073486, "learning_rate": 0.0002316348987646045, - "loss": 0.0561, + "loss": 0.213, "step": 139 }, { "epoch": 7.315789473684211, - "eval_loss": 0.0323193185031414, - "eval_runtime": 3.3965, - "eval_samples_per_second": 8.833, - "eval_steps_per_second": 1.178, + "eval_loss": 0.21909914910793304, + "eval_runtime": 3.3981, + "eval_samples_per_second": 8.829, + "eval_steps_per_second": 1.177, "step": 139 }, { "epoch": 7.368421052631579, - "grad_norm": 0.475091814994812, + "grad_norm": 0.9047114253044128, "learning_rate": 0.00022830354920410064, - "loss": 0.0382, + "loss": 0.2302, "step": 140 }, { "epoch": 7.368421052631579, - "eval_loss": 0.03044699877500534, - "eval_runtime": 3.3918, - "eval_samples_per_second": 8.845, - "eval_steps_per_second": 1.179, + "eval_loss": 0.2153581976890564, + "eval_runtime": 3.399, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 1.177, "step": 140 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 0.7724714279174805, + "learning_rate": 0.0002249760770939754, + "loss": 0.1825, + "step": 141 + }, + { + "epoch": 7.421052631578947, + "eval_loss": 0.20969410240650177, + "eval_runtime": 3.3776, + "eval_samples_per_second": 8.882, + "eval_steps_per_second": 1.184, + "step": 141 + }, + { + "epoch": 7.473684210526316, + "grad_norm": 0.7683383822441101, + "learning_rate": 0.0002216530770986795, + "loss": 0.1793, + "step": 142 + }, + { + "epoch": 7.473684210526316, + "eval_loss": 0.204229936003685, + "eval_runtime": 3.398, + "eval_samples_per_second": 8.829, + "eval_steps_per_second": 1.177, + "step": 142 + }, + { + "epoch": 7.526315789473684, + "grad_norm": 0.8928307890892029, + "learning_rate": 0.0002183351430834358, + "loss": 0.2218, + "step": 143 + }, + { + "epoch": 7.526315789473684, + "eval_loss": 0.197996586561203, + "eval_runtime": 3.3992, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 1.177, + "step": 143 + }, + { + "epoch": 7.578947368421053, + "grad_norm": 0.683783769607544, + "learning_rate": 0.0002150228680081079, + "loss": 0.1496, + "step": 144 + }, + { + "epoch": 7.578947368421053, + "eval_loss": 0.19135157763957977, + "eval_runtime": 3.4077, + "eval_samples_per_second": 8.804, + "eval_steps_per_second": 1.174, + "step": 144 + }, + { + "epoch": 7.631578947368421, + "grad_norm": 0.7701078653335571, + "learning_rate": 0.00021171684382123, + "loss": 0.2014, + "step": 145 + }, + { + "epoch": 7.631578947368421, + "eval_loss": 0.1854608803987503, + "eval_runtime": 3.4029, + "eval_samples_per_second": 8.816, + "eval_steps_per_second": 1.175, + "step": 145 + }, + { + "epoch": 7.684210526315789, + "grad_norm": 0.9109010696411133, + "learning_rate": 0.0002084176613542175, + "loss": 0.191, + "step": 146 + }, + { + "epoch": 7.684210526315789, + "eval_loss": 0.1755831390619278, + "eval_runtime": 3.4062, + "eval_samples_per_second": 8.807, + "eval_steps_per_second": 1.174, + "step": 146 + }, + { + "epoch": 7.7368421052631575, + "grad_norm": 0.9243440628051758, + "learning_rate": 0.00020512591021577773, + "loss": 0.1477, + "step": 147 + }, + { + "epoch": 7.7368421052631575, + "eval_loss": 0.17130498588085175, + "eval_runtime": 3.3849, + "eval_samples_per_second": 8.863, + "eval_steps_per_second": 1.182, + "step": 147 + }, + { + "epoch": 7.7894736842105265, + "grad_norm": 0.6701480746269226, + "learning_rate": 0.00020184217868653867, + "loss": 0.1978, + "step": 148 + }, + { + "epoch": 7.7894736842105265, + "eval_loss": 0.16958914697170258, + "eval_runtime": 3.3829, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, + "step": 148 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 0.6767657399177551, + "learning_rate": 0.0001985670536139151, + "loss": 0.2179, + "step": 149 + }, + { + "epoch": 7.842105263157895, + "eval_loss": 0.16378562152385712, + "eval_runtime": 3.3828, + "eval_samples_per_second": 8.868, + "eval_steps_per_second": 1.182, + "step": 149 + }, + { + "epoch": 7.894736842105263, + "grad_norm": 0.6448670625686646, + "learning_rate": 0.0001953011203072312, + "loss": 0.2025, + "step": 150 + }, + { + "epoch": 7.894736842105263, + "eval_loss": 0.15805380046367645, + "eval_runtime": 3.3987, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, + "step": 150 + }, + { + "epoch": 7.947368421052632, + "grad_norm": 0.87026047706604, + "learning_rate": 0.00019204496243311792, + "loss": 0.2653, + "step": 151 + }, + { + "epoch": 7.947368421052632, + "eval_loss": 0.14828962087631226, + "eval_runtime": 3.3946, + "eval_samples_per_second": 8.838, + "eval_steps_per_second": 1.178, + "step": 151 + }, + { + "epoch": 8.0, + "grad_norm": 0.746687114238739, + "learning_rate": 0.00018879916191120349, + "loss": 0.2052, + "step": 152 + }, + { + "epoch": 8.0, + "eval_loss": 0.1411527693271637, + "eval_runtime": 3.3931, + "eval_samples_per_second": 8.841, + "eval_steps_per_second": 1.179, + "step": 152 + }, + { + "epoch": 8.052631578947368, + "grad_norm": 0.47239571809768677, + "learning_rate": 0.00018556429881011656, + "loss": 0.1007, + "step": 153 + }, + { + "epoch": 8.052631578947368, + "eval_loss": 0.13516879081726074, + "eval_runtime": 3.3993, + "eval_samples_per_second": 8.825, + "eval_steps_per_second": 1.177, + "step": 153 + }, + { + "epoch": 8.105263157894736, + "grad_norm": 0.8439627289772034, + "learning_rate": 0.0001823409512438203, + "loss": 0.1246, + "step": 154 + }, + { + "epoch": 8.105263157894736, + "eval_loss": 0.13015992939472198, + "eval_runtime": 3.3942, + "eval_samples_per_second": 8.839, + "eval_steps_per_second": 1.178, + "step": 154 + }, + { + "epoch": 8.157894736842104, + "grad_norm": 0.6233652830123901, + "learning_rate": 0.00017912969526829559, + "loss": 0.0809, + "step": 155 + }, + { + "epoch": 8.157894736842104, + "eval_loss": 0.12686298787593842, + "eval_runtime": 3.3971, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, + "step": 155 + }, + { + "epoch": 8.210526315789474, + "grad_norm": 0.7331376075744629, + "learning_rate": 0.00017593110477859153, + "loss": 0.0948, + "step": 156 + }, + { + "epoch": 8.210526315789474, + "eval_loss": 0.12066776305437088, + "eval_runtime": 3.3964, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 1.178, + "step": 156 + }, + { + "epoch": 8.263157894736842, + "grad_norm": 0.7566715478897095, + "learning_rate": 0.00017274575140626317, + "loss": 0.1052, + "step": 157 + }, + { + "epoch": 8.263157894736842, + "eval_loss": 0.1153416633605957, + "eval_runtime": 3.3936, + "eval_samples_per_second": 8.84, + "eval_steps_per_second": 1.179, + "step": 157 + }, + { + "epoch": 8.31578947368421, + "grad_norm": 0.5211192965507507, + "learning_rate": 0.00016957420441721284, + "loss": 0.0584, + "step": 158 + }, + { + "epoch": 8.31578947368421, + "eval_loss": 0.10957438498735428, + "eval_runtime": 3.3951, + "eval_samples_per_second": 8.836, + "eval_steps_per_second": 1.178, + "step": 158 + }, + { + "epoch": 8.368421052631579, + "grad_norm": 0.7941140532493591, + "learning_rate": 0.00016641703060995457, + "loss": 0.1393, + "step": 159 + }, + { + "epoch": 8.368421052631579, + "eval_loss": 0.1009925901889801, + "eval_runtime": 3.3908, + "eval_samples_per_second": 8.847, + "eval_steps_per_second": 1.18, + "step": 159 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 0.7772736549377441, + "learning_rate": 0.00016327479421431983, + "loss": 0.1284, + "step": 160 + }, + { + "epoch": 8.421052631578947, + "eval_loss": 0.094593845307827, + "eval_runtime": 3.3985, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, + "step": 160 + }, + { + "epoch": 8.473684210526315, + "grad_norm": 0.8724604845046997, + "learning_rate": 0.00016014805679062183, + "loss": 0.1518, + "step": 161 + }, + { + "epoch": 8.473684210526315, + "eval_loss": 0.0894516333937645, + "eval_runtime": 3.3955, + "eval_samples_per_second": 8.835, + "eval_steps_per_second": 1.178, + "step": 161 + }, + { + "epoch": 8.526315789473685, + "grad_norm": 0.7179498672485352, + "learning_rate": 0.0001570373771292967, + "loss": 0.1107, + "step": 162 + }, + { + "epoch": 8.526315789473685, + "eval_loss": 0.0845918357372284, + "eval_runtime": 3.4033, + "eval_samples_per_second": 8.815, + "eval_steps_per_second": 1.175, + "step": 162 + }, + { + "epoch": 8.578947368421053, + "grad_norm": 0.6780802607536316, + "learning_rate": 0.00015394331115104075, + "loss": 0.0997, + "step": 163 + }, + { + "epoch": 8.578947368421053, + "eval_loss": 0.08240295946598053, + "eval_runtime": 3.402, + "eval_samples_per_second": 8.818, + "eval_steps_per_second": 1.176, + "step": 163 + }, + { + "epoch": 8.631578947368421, + "grad_norm": 0.6817135810852051, + "learning_rate": 0.00015086641180745932, + "loss": 0.1156, + "step": 164 + }, + { + "epoch": 8.631578947368421, + "eval_loss": 0.07952894270420074, + "eval_runtime": 3.4067, + "eval_samples_per_second": 8.806, + "eval_steps_per_second": 1.174, + "step": 164 + }, + { + "epoch": 8.68421052631579, + "grad_norm": 0.7739869356155396, + "learning_rate": 0.00014780722898224708, + "loss": 0.1247, + "step": 165 + }, + { + "epoch": 8.68421052631579, + "eval_loss": 0.07561580091714859, + "eval_runtime": 3.4045, + "eval_samples_per_second": 8.812, + "eval_steps_per_second": 1.175, + "step": 165 + }, + { + "epoch": 8.736842105263158, + "grad_norm": 0.8046780228614807, + "learning_rate": 0.0001447663093929163, + "loss": 0.1085, + "step": 166 + }, + { + "epoch": 8.736842105263158, + "eval_loss": 0.07319317758083344, + "eval_runtime": 3.3994, + "eval_samples_per_second": 8.825, + "eval_steps_per_second": 1.177, + "step": 166 + }, + { + "epoch": 8.789473684210526, + "grad_norm": 0.6103046536445618, + "learning_rate": 0.00014174419649309089, + "loss": 0.0832, + "step": 167 + }, + { + "epoch": 8.789473684210526, + "eval_loss": 0.07252493500709534, + "eval_runtime": 3.4001, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 1.176, + "step": 167 + }, + { + "epoch": 8.842105263157894, + "grad_norm": 0.6907472610473633, + "learning_rate": 0.00013874143037538418, + "loss": 0.1031, + "step": 168 + }, + { + "epoch": 8.842105263157894, + "eval_loss": 0.07177206873893738, + "eval_runtime": 3.4044, + "eval_samples_per_second": 8.812, + "eval_steps_per_second": 1.175, + "step": 168 + }, + { + "epoch": 8.894736842105264, + "grad_norm": 0.6837093830108643, + "learning_rate": 0.0001357585476748766, + "loss": 0.1074, + "step": 169 + }, + { + "epoch": 8.894736842105264, + "eval_loss": 0.06924725323915482, + "eval_runtime": 3.4019, + "eval_samples_per_second": 8.819, + "eval_steps_per_second": 1.176, + "step": 169 + }, + { + "epoch": 8.947368421052632, + "grad_norm": 0.5226811766624451, + "learning_rate": 0.00013279608147321223, + "loss": 0.0467, + "step": 170 + }, + { + "epoch": 8.947368421052632, + "eval_loss": 0.06760647892951965, + "eval_runtime": 3.3966, + "eval_samples_per_second": 8.832, + "eval_steps_per_second": 1.178, + "step": 170 + }, + { + "epoch": 9.0, + "grad_norm": 0.8332634568214417, + "learning_rate": 0.00012985456120332905, + "loss": 0.1137, + "step": 171 + }, + { + "epoch": 9.0, + "eval_loss": 0.06686952710151672, + "eval_runtime": 3.3907, + "eval_samples_per_second": 8.848, + "eval_steps_per_second": 1.18, + "step": 171 + }, + { + "epoch": 9.052631578947368, + "grad_norm": 0.4633868932723999, + "learning_rate": 0.00012693451255484312, + "loss": 0.0353, + "step": 172 + }, + { + "epoch": 9.052631578947368, + "eval_loss": 0.06244245544075966, + "eval_runtime": 3.3982, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, + "step": 172 + }, + { + "epoch": 9.105263157894736, + "grad_norm": 0.7089731693267822, + "learning_rate": 0.00012403645738009997, + "loss": 0.059, + "step": 173 + }, + { + "epoch": 9.105263157894736, + "eval_loss": 0.05555792525410652, + "eval_runtime": 3.3985, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, + "step": 173 + }, + { + "epoch": 9.157894736842104, + "grad_norm": 0.570846438407898, + "learning_rate": 0.00012116091360091261, + "loss": 0.0545, + "step": 174 + }, + { + "epoch": 9.157894736842104, + "eval_loss": 0.052096955478191376, + "eval_runtime": 3.3964, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 1.178, + "step": 174 + }, + { + "epoch": 9.210526315789474, + "grad_norm": 0.4484975337982178, + "learning_rate": 0.00011830839511600211, + "loss": 0.0334, + "step": 175 + }, + { + "epoch": 9.210526315789474, + "eval_loss": 0.05176297202706337, + "eval_runtime": 3.4033, + "eval_samples_per_second": 8.815, + "eval_steps_per_second": 1.175, + "step": 175 + }, + { + "epoch": 9.263157894736842, + "grad_norm": 0.677650511264801, + "learning_rate": 0.00011547941170915685, + "loss": 0.0503, + "step": 176 + }, + { + "epoch": 9.263157894736842, + "eval_loss": 0.05027133598923683, + "eval_runtime": 3.4064, + "eval_samples_per_second": 8.807, + "eval_steps_per_second": 1.174, + "step": 176 + }, + { + "epoch": 9.31578947368421, + "grad_norm": 0.5817425847053528, + "learning_rate": 0.00011267446895812702, + "loss": 0.0293, + "step": 177 + }, + { + "epoch": 9.31578947368421, + "eval_loss": 0.049430813640356064, + "eval_runtime": 3.4086, + "eval_samples_per_second": 8.801, + "eval_steps_per_second": 1.174, + "step": 177 + }, + { + "epoch": 9.368421052631579, + "grad_norm": 0.970379114151001, + "learning_rate": 0.0001098940681442713, + "loss": 0.0679, + "step": 178 + }, + { + "epoch": 9.368421052631579, + "eval_loss": 0.04337286949157715, + "eval_runtime": 3.4, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.176, + "step": 178 + }, + { + "epoch": 9.421052631578947, + "grad_norm": 0.4084687829017639, + "learning_rate": 0.00010713870616297092, + "loss": 0.0262, + "step": 179 + }, + { + "epoch": 9.421052631578947, + "eval_loss": 0.03992774710059166, + "eval_runtime": 3.4026, + "eval_samples_per_second": 8.817, + "eval_steps_per_second": 1.176, + "step": 179 + }, + { + "epoch": 9.473684210526315, + "grad_norm": 0.650490939617157, + "learning_rate": 0.00010440887543482746, + "loss": 0.0407, + "step": 180 + }, + { + "epoch": 9.473684210526315, + "eval_loss": 0.037015657871961594, + "eval_runtime": 3.4003, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 1.176, + "step": 180 + }, + { + "epoch": 9.526315789473685, + "grad_norm": 0.5609657764434814, + "learning_rate": 0.0001017050638176612, + "loss": 0.0328, + "step": 181 + }, + { + "epoch": 9.526315789473685, + "eval_loss": 0.03608579561114311, + "eval_runtime": 3.3972, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, + "step": 181 + }, + { + "epoch": 9.578947368421053, + "grad_norm": 0.48078685998916626, + "learning_rate": 9.902775451932386e-05, + "loss": 0.0216, + "step": 182 + }, + { + "epoch": 9.578947368421053, + "eval_loss": 0.0358748622238636, + "eval_runtime": 3.3946, + "eval_samples_per_second": 8.837, + "eval_steps_per_second": 1.178, + "step": 182 + }, + { + "epoch": 9.631578947368421, + "grad_norm": 0.5188214182853699, + "learning_rate": 9.637742601134286e-05, + "loss": 0.0438, + "step": 183 + }, + { + "epoch": 9.631578947368421, + "eval_loss": 0.03486837074160576, + "eval_runtime": 3.3974, + "eval_samples_per_second": 8.83, + "eval_steps_per_second": 1.177, + "step": 183 + }, + { + "epoch": 9.68421052631579, + "grad_norm": 0.7200556993484497, + "learning_rate": 9.375455194341214e-05, + "loss": 0.0663, + "step": 184 + }, + { + "epoch": 9.68421052631579, + "eval_loss": 0.03245267644524574, + "eval_runtime": 3.4008, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, + "step": 184 + }, + { + "epoch": 9.736842105263158, + "grad_norm": 0.6560045480728149, + "learning_rate": 9.11596010587441e-05, + "loss": 0.064, + "step": 185 + }, + { + "epoch": 9.736842105263158, + "eval_loss": 0.029578620567917824, + "eval_runtime": 3.4019, + "eval_samples_per_second": 8.819, + "eval_steps_per_second": 1.176, + "step": 185 + }, + { + "epoch": 9.789473684210526, + "grad_norm": 0.5027221441268921, + "learning_rate": 8.85930371102994e-05, + "loss": 0.0416, + "step": 186 + }, + { + "epoch": 9.789473684210526, + "eval_loss": 0.026809442788362503, + "eval_runtime": 3.4005, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, + "step": 186 + }, + { + "epoch": 9.842105263157894, + "grad_norm": 0.4124845564365387, + "learning_rate": 8.605531877790762e-05, + "loss": 0.0335, + "step": 187 + }, + { + "epoch": 9.842105263157894, + "eval_loss": 0.02500898391008377, + "eval_runtime": 3.4021, + "eval_samples_per_second": 8.818, + "eval_steps_per_second": 1.176, + "step": 187 + }, + { + "epoch": 9.894736842105264, + "grad_norm": 0.5714792013168335, + "learning_rate": 8.354689958629513e-05, + "loss": 0.0491, + "step": 188 + }, + { + "epoch": 9.894736842105264, + "eval_loss": 0.022844497114419937, + "eval_runtime": 3.401, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, + "step": 188 + }, + { + "epoch": 9.947368421052632, + "grad_norm": 0.48736098408699036, + "learning_rate": 8.106822782403376e-05, + "loss": 0.018, + "step": 189 + }, + { + "epoch": 9.947368421052632, + "eval_loss": 0.021435970440506935, + "eval_runtime": 3.4008, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, + "step": 189 + }, + { + "epoch": 10.0, + "grad_norm": 0.5927891731262207, + "learning_rate": 7.861974646342596e-05, + "loss": 0.0388, + "step": 190 + }, + { + "epoch": 10.0, + "eval_loss": 0.019742580130696297, + "eval_runtime": 3.4006, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, + "step": 190 + }, + { + "epoch": 10.052631578947368, + "grad_norm": 0.3376651108264923, + "learning_rate": 7.620189308133943e-05, + "loss": 0.0196, + "step": 191 + }, + { + "epoch": 10.052631578947368, + "eval_loss": 0.018559806048870087, + "eval_runtime": 3.388, + "eval_samples_per_second": 8.855, + "eval_steps_per_second": 1.181, + "step": 191 + }, + { + "epoch": 10.105263157894736, + "grad_norm": 0.3613579273223877, + "learning_rate": 7.381509978100626e-05, + "loss": 0.0172, + "step": 192 + }, + { + "epoch": 10.105263157894736, + "eval_loss": 0.017322294414043427, + "eval_runtime": 3.3891, + "eval_samples_per_second": 8.852, + "eval_steps_per_second": 1.18, + "step": 192 + }, + { + "epoch": 10.157894736842104, + "grad_norm": 0.2621256411075592, + "learning_rate": 7.145979311479986e-05, + "loss": 0.0159, + "step": 193 + }, + { + "epoch": 10.157894736842104, + "eval_loss": 0.016333211213350296, + "eval_runtime": 3.4014, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, + "step": 193 + }, + { + "epoch": 10.210526315789474, + "grad_norm": 0.24995078146457672, + "learning_rate": 6.913639400800489e-05, + "loss": 0.0132, + "step": 194 + }, + { + "epoch": 10.210526315789474, + "eval_loss": 0.015769897028803825, + "eval_runtime": 3.401, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, + "step": 194 + }, + { + "epoch": 10.263157894736842, + "grad_norm": 0.38419196009635925, + "learning_rate": 6.684531768359173e-05, + "loss": 0.0196, + "step": 195 + }, + { + "epoch": 10.263157894736842, + "eval_loss": 0.015028283931314945, + "eval_runtime": 3.4, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.176, + "step": 195 + }, + { + "epoch": 10.31578947368421, + "grad_norm": 0.23766584694385529, + "learning_rate": 6.458697358801061e-05, + "loss": 0.009, + "step": 196 + }, + { + "epoch": 10.31578947368421, + "eval_loss": 0.014445771463215351, + "eval_runtime": 3.3979, + "eval_samples_per_second": 8.829, + "eval_steps_per_second": 1.177, + "step": 196 + }, + { + "epoch": 10.368421052631579, + "grad_norm": 0.2710660398006439, + "learning_rate": 6.236176531801813e-05, + "loss": 0.0096, + "step": 197 + }, + { + "epoch": 10.368421052631579, + "eval_loss": 0.01395699568092823, + "eval_runtime": 3.3981, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, + "step": 197 + }, + { + "epoch": 10.421052631578947, + "grad_norm": 0.20278970897197723, + "learning_rate": 6.017009054854858e-05, + "loss": 0.0087, + "step": 198 + }, + { + "epoch": 10.421052631578947, + "eval_loss": 0.013656516559422016, + "eval_runtime": 3.4043, + "eval_samples_per_second": 8.812, + "eval_steps_per_second": 1.175, + "step": 198 + }, + { + "epoch": 10.473684210526315, + "grad_norm": 0.3319687247276306, + "learning_rate": 5.801234096164468e-05, + "loss": 0.016, + "step": 199 + }, + { + "epoch": 10.473684210526315, + "eval_loss": 0.012863567098975182, + "eval_runtime": 3.403, + "eval_samples_per_second": 8.816, + "eval_steps_per_second": 1.175, + "step": 199 + }, + { + "epoch": 10.526315789473685, + "grad_norm": 0.25473591685295105, + "learning_rate": 5.58889021764582e-05, + "loss": 0.0105, + "step": 200 + }, + { + "epoch": 10.526315789473685, + "eval_loss": 0.012198278680443764, + "eval_runtime": 3.3999, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.177, + "step": 200 + }, + { + "epoch": 10.578947368421053, + "grad_norm": 0.3705623745918274, + "learning_rate": 5.3800153680334754e-05, + "loss": 0.0134, + "step": 201 + }, + { + "epoch": 10.578947368421053, + "eval_loss": 0.011488989926874638, + "eval_runtime": 3.3917, + "eval_samples_per_second": 8.845, + "eval_steps_per_second": 1.179, + "step": 201 + }, + { + "epoch": 10.631578947368421, + "grad_norm": 0.24455586075782776, + "learning_rate": 5.17464687609942e-05, + "loss": 0.0112, + "step": 202 + }, + { + "epoch": 10.631578947368421, + "eval_loss": 0.010651330463588238, + "eval_runtime": 3.3998, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.177, + "step": 202 + }, + { + "epoch": 10.68421052631579, + "grad_norm": 0.2879987955093384, + "learning_rate": 4.97282144398192e-05, + "loss": 0.0108, + "step": 203 + }, + { + "epoch": 10.68421052631579, + "eval_loss": 0.010258635506033897, + "eval_runtime": 3.4041, + "eval_samples_per_second": 8.813, + "eval_steps_per_second": 1.175, + "step": 203 + }, + { + "epoch": 10.736842105263158, + "grad_norm": 0.2595934569835663, + "learning_rate": 4.7745751406263163e-05, + "loss": 0.0116, + "step": 204 + }, + { + "epoch": 10.736842105263158, + "eval_loss": 0.009770309552550316, + "eval_runtime": 3.4083, + "eval_samples_per_second": 8.802, + "eval_steps_per_second": 1.174, + "step": 204 + }, + { + "epoch": 10.789473684210526, + "grad_norm": 0.3026018738746643, + "learning_rate": 4.5799433953390616e-05, + "loss": 0.0116, + "step": 205 + }, + { + "epoch": 10.789473684210526, + "eval_loss": 0.00936987716704607, + "eval_runtime": 3.4054, + "eval_samples_per_second": 8.81, + "eval_steps_per_second": 1.175, + "step": 205 + }, + { + "epoch": 10.842105263157894, + "grad_norm": 0.4068312644958496, + "learning_rate": 4.388960991455998e-05, + "loss": 0.0109, + "step": 206 + }, + { + "epoch": 10.842105263157894, + "eval_loss": 0.008922109380364418, + "eval_runtime": 3.4117, + "eval_samples_per_second": 8.793, + "eval_steps_per_second": 1.172, + "step": 206 + }, + { + "epoch": 10.894736842105264, + "grad_norm": 0.3379729688167572, + "learning_rate": 4.2016620601260796e-05, + "loss": 0.015, + "step": 207 + }, + { + "epoch": 10.894736842105264, + "eval_loss": 0.008320866152644157, + "eval_runtime": 3.4039, + "eval_samples_per_second": 8.813, + "eval_steps_per_second": 1.175, + "step": 207 + }, + { + "epoch": 10.947368421052632, + "grad_norm": 0.2505350410938263, + "learning_rate": 4.0180800742117244e-05, + "loss": 0.008, + "step": 208 + }, + { + "epoch": 10.947368421052632, + "eval_loss": 0.007898358628153801, + "eval_runtime": 3.3962, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 1.178, + "step": 208 + }, + { + "epoch": 11.0, + "grad_norm": 0.36052215099334717, + "learning_rate": 3.838247842306716e-05, + "loss": 0.0133, + "step": 209 + }, + { + "epoch": 11.0, + "eval_loss": 0.007371651474386454, + "eval_runtime": 3.3973, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, + "step": 209 + }, + { + "epoch": 11.052631578947368, + "grad_norm": 0.12308855354785919, + "learning_rate": 3.662197502872885e-05, + "loss": 0.0051, + "step": 210 + }, + { + "epoch": 11.052631578947368, + "eval_loss": 0.006998243276029825, + "eval_runtime": 3.4004, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, + "step": 210 + }, + { + "epoch": 11.105263157894736, + "grad_norm": 0.12299831211566925, + "learning_rate": 3.489960518496521e-05, + "loss": 0.0065, + "step": 211 + }, + { + "epoch": 11.105263157894736, + "eval_loss": 0.006782620679587126, + "eval_runtime": 3.4059, + "eval_samples_per_second": 8.808, + "eval_steps_per_second": 1.174, + "step": 211 + }, + { + "epoch": 11.157894736842104, + "grad_norm": 0.12273000180721283, + "learning_rate": 3.321567670265568e-05, + "loss": 0.0059, + "step": 212 + }, + { + "epoch": 11.157894736842104, + "eval_loss": 0.006513877771794796, + "eval_runtime": 3.3943, + "eval_samples_per_second": 8.838, + "eval_steps_per_second": 1.178, + "step": 212 + }, + { + "epoch": 11.210526315789474, + "grad_norm": 0.11980213969945908, + "learning_rate": 3.157049052268662e-05, + "loss": 0.0051, + "step": 213 + }, + { + "epoch": 11.210526315789474, + "eval_loss": 0.006208530627191067, + "eval_runtime": 3.4058, + "eval_samples_per_second": 8.809, + "eval_steps_per_second": 1.174, + "step": 213 + }, + { + "epoch": 11.263157894736842, + "grad_norm": 0.14820842444896698, + "learning_rate": 2.9964340662168772e-05, + "loss": 0.005, + "step": 214 + }, + { + "epoch": 11.263157894736842, + "eval_loss": 0.006144699640572071, + "eval_runtime": 3.4009, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, + "step": 214 + }, + { + "epoch": 11.31578947368421, + "grad_norm": 0.09703250229358673, + "learning_rate": 2.8397514161892484e-05, + "loss": 0.0047, + "step": 215 + }, + { + "epoch": 11.31578947368421, + "eval_loss": 0.00596656883135438, + "eval_runtime": 3.4079, + "eval_samples_per_second": 8.803, + "eval_steps_per_second": 1.174, + "step": 215 + }, + { + "epoch": 11.368421052631579, + "grad_norm": 0.1398313045501709, + "learning_rate": 2.687029103502972e-05, + "loss": 0.0058, + "step": 216 + }, + { + "epoch": 11.368421052631579, + "eval_loss": 0.0058633070439100266, + "eval_runtime": 3.403, + "eval_samples_per_second": 8.816, + "eval_steps_per_second": 1.175, + "step": 216 + }, + { + "epoch": 11.421052631578947, + "grad_norm": 0.12219510972499847, + "learning_rate": 2.5382944217091723e-05, + "loss": 0.0059, + "step": 217 + }, + { + "epoch": 11.421052631578947, + "eval_loss": 0.0056641846895217896, + "eval_runtime": 3.4055, + "eval_samples_per_second": 8.809, + "eval_steps_per_second": 1.175, + "step": 217 + }, + { + "epoch": 11.473684210526315, + "grad_norm": 0.10808281600475311, + "learning_rate": 2.3935739517151916e-05, + "loss": 0.005, + "step": 218 + }, + { + "epoch": 11.473684210526315, + "eval_loss": 0.005585065111517906, + "eval_runtime": 3.3987, + "eval_samples_per_second": 8.827, + "eval_steps_per_second": 1.177, + "step": 218 + }, + { + "epoch": 11.526315789473685, + "grad_norm": 0.19032533466815948, + "learning_rate": 2.2528935570342164e-05, + "loss": 0.0063, + "step": 219 + }, + { + "epoch": 11.526315789473685, + "eval_loss": 0.005458400584757328, + "eval_runtime": 3.4008, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, + "step": 219 + }, + { + "epoch": 11.578947368421053, + "grad_norm": 0.09316842257976532, + "learning_rate": 2.1162783791631057e-05, + "loss": 0.004, + "step": 220 + }, + { + "epoch": 11.578947368421053, + "eval_loss": 0.0053214430809021, + "eval_runtime": 3.3972, + "eval_samples_per_second": 8.831, + "eval_steps_per_second": 1.177, + "step": 220 + }, + { + "epoch": 11.631578947368421, + "grad_norm": 0.13419128954410553, + "learning_rate": 1.9837528330892778e-05, + "loss": 0.0053, + "step": 221 + }, + { + "epoch": 11.631578947368421, + "eval_loss": 0.00523610832169652, + "eval_runtime": 3.3774, + "eval_samples_per_second": 8.883, + "eval_steps_per_second": 1.184, + "step": 221 + }, + { + "epoch": 11.68421052631579, + "grad_norm": 0.1483260989189148, + "learning_rate": 1.8553406029274188e-05, + "loss": 0.0063, + "step": 222 + }, + { + "epoch": 11.68421052631579, + "eval_loss": 0.0051864017732441425, + "eval_runtime": 3.3864, + "eval_samples_per_second": 8.859, + "eval_steps_per_second": 1.181, + "step": 222 + }, + { + "epoch": 11.736842105263158, + "grad_norm": 0.15016067028045654, + "learning_rate": 1.7310646376867885e-05, + "loss": 0.0067, + "step": 223 + }, + { + "epoch": 11.736842105263158, + "eval_loss": 0.0051628886722028255, + "eval_runtime": 3.399, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 1.177, + "step": 223 + }, + { + "epoch": 11.789473684210526, + "grad_norm": 0.0965675637125969, + "learning_rate": 1.6109471471699556e-05, + "loss": 0.0052, + "step": 224 + }, + { + "epoch": 11.789473684210526, + "eval_loss": 0.005002335179597139, + "eval_runtime": 3.4012, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, + "step": 224 + }, + { + "epoch": 11.842105263157894, + "grad_norm": 0.1401059329509735, + "learning_rate": 1.4950095980035772e-05, + "loss": 0.0055, + "step": 225 + }, + { + "epoch": 11.842105263157894, + "eval_loss": 0.004974076058715582, + "eval_runtime": 3.4045, + "eval_samples_per_second": 8.812, + "eval_steps_per_second": 1.175, + "step": 225 + }, + { + "epoch": 11.894736842105264, + "grad_norm": 0.08175503462553024, + "learning_rate": 1.3832727098020331e-05, + "loss": 0.0037, + "step": 226 + }, + { + "epoch": 11.894736842105264, + "eval_loss": 0.004897472448647022, + "eval_runtime": 3.4065, + "eval_samples_per_second": 8.807, + "eval_steps_per_second": 1.174, + "step": 226 + }, + { + "epoch": 11.947368421052632, + "grad_norm": 0.14667555689811707, + "learning_rate": 1.2757564514645492e-05, + "loss": 0.0047, + "step": 227 + }, + { + "epoch": 11.947368421052632, + "eval_loss": 0.004857571795582771, + "eval_runtime": 3.4021, + "eval_samples_per_second": 8.818, + "eval_steps_per_second": 1.176, + "step": 227 + }, + { + "epoch": 12.0, + "grad_norm": 0.07701026648283005, + "learning_rate": 1.1724800376064798e-05, + "loss": 0.0036, + "step": 228 + }, + { + "epoch": 12.0, + "eval_loss": 0.004770983941853046, + "eval_runtime": 3.4001, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 1.176, + "step": 228 + }, + { + "epoch": 12.052631578947368, + "grad_norm": 0.11114013940095901, + "learning_rate": 1.0734619251253963e-05, + "loss": 0.0057, + "step": 229 + }, + { + "epoch": 12.052631578947368, + "eval_loss": 0.004740286152809858, + "eval_runtime": 3.4009, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, + "step": 229 + }, + { + "epoch": 12.105263157894736, + "grad_norm": 0.07092595100402832, + "learning_rate": 9.78719809902598e-06, + "loss": 0.0035, + "step": 230 + }, + { + "epoch": 12.105263157894736, + "eval_loss": 0.004716214258223772, + "eval_runtime": 3.4053, + "eval_samples_per_second": 8.81, + "eval_steps_per_second": 1.175, + "step": 230 + }, + { + "epoch": 12.157894736842104, + "grad_norm": 0.12435787171125412, + "learning_rate": 8.882706236405884e-06, + "loss": 0.0054, + "step": 231 + }, + { + "epoch": 12.157894736842104, + "eval_loss": 0.004733518231660128, + "eval_runtime": 3.3993, + "eval_samples_per_second": 8.825, + "eval_steps_per_second": 1.177, + "step": 231 + }, + { + "epoch": 12.210526315789474, + "grad_norm": 0.12049361318349838, + "learning_rate": 8.02130530837189e-06, + "loss": 0.0053, + "step": 232 + }, + { + "epoch": 12.210526315789474, + "eval_loss": 0.004637454636394978, + "eval_runtime": 3.4013, + "eval_samples_per_second": 8.82, + "eval_steps_per_second": 1.176, + "step": 232 + }, + { + "epoch": 12.263157894736842, + "grad_norm": 0.06943191587924957, + "learning_rate": 7.203149258967034e-06, + "loss": 0.0039, + "step": 233 + }, + { + "epoch": 12.263157894736842, + "eval_loss": 0.004599397070705891, + "eval_runtime": 3.4029, + "eval_samples_per_second": 8.816, + "eval_steps_per_second": 1.175, + "step": 233 + }, + { + "epoch": 12.31578947368421, + "grad_norm": 0.10378482937812805, + "learning_rate": 6.428384303787282e-06, + "loss": 0.0053, + "step": 234 + }, + { + "epoch": 12.31578947368421, + "eval_loss": 0.0046176365576684475, + "eval_runtime": 3.4039, + "eval_samples_per_second": 8.813, + "eval_steps_per_second": 1.175, + "step": 234 + }, + { + "epoch": 12.368421052631579, + "grad_norm": 0.08170512318611145, + "learning_rate": 5.697148903850868e-06, + "loss": 0.0046, + "step": 235 + }, + { + "epoch": 12.368421052631579, + "eval_loss": 0.00459822965785861, + "eval_runtime": 3.404, + "eval_samples_per_second": 8.813, + "eval_steps_per_second": 1.175, + "step": 235 + }, + { + "epoch": 12.421052631578947, + "grad_norm": 0.09477739036083221, + "learning_rate": 5.009573740853312e-06, + "loss": 0.0047, + "step": 236 + }, + { + "epoch": 12.421052631578947, + "eval_loss": 0.004573486745357513, + "eval_runtime": 3.4032, + "eval_samples_per_second": 8.815, + "eval_steps_per_second": 1.175, + "step": 236 + }, + { + "epoch": 12.473684210526315, + "grad_norm": 0.0745476633310318, + "learning_rate": 4.365781693813048e-06, + "loss": 0.004, + "step": 237 + }, + { + "epoch": 12.473684210526315, + "eval_loss": 0.004487224388867617, + "eval_runtime": 3.4067, + "eval_samples_per_second": 8.806, + "eval_steps_per_second": 1.174, + "step": 237 + }, + { + "epoch": 12.526315789473685, + "grad_norm": 0.13931944966316223, + "learning_rate": 3.765887817111069e-06, + "loss": 0.0065, + "step": 238 + }, + { + "epoch": 12.526315789473685, + "eval_loss": 0.004524969030171633, + "eval_runtime": 3.4058, + "eval_samples_per_second": 8.808, + "eval_steps_per_second": 1.174, + "step": 238 + }, + { + "epoch": 12.578947368421053, + "grad_norm": 0.056376032531261444, + "learning_rate": 3.2099993199292688e-06, + "loss": 0.0026, + "step": 239 + }, + { + "epoch": 12.578947368421053, + "eval_loss": 0.0044847470708191395, + "eval_runtime": 3.3996, + "eval_samples_per_second": 8.825, + "eval_steps_per_second": 1.177, + "step": 239 + }, + { + "epoch": 12.631578947368421, + "grad_norm": 0.07375714182853699, + "learning_rate": 2.698215547090599e-06, + "loss": 0.004, + "step": 240 + }, + { + "epoch": 12.631578947368421, + "eval_loss": 0.004458704963326454, + "eval_runtime": 3.3998, + "eval_samples_per_second": 8.824, + "eval_steps_per_second": 1.177, + "step": 240 + }, + { + "epoch": 12.68421052631579, + "grad_norm": 0.06447097659111023, + "learning_rate": 2.230627961304993e-06, + "loss": 0.0032, + "step": 241 + }, + { + "epoch": 12.68421052631579, + "eval_loss": 0.0044786701910197735, + "eval_runtime": 3.3934, + "eval_samples_per_second": 8.841, + "eval_steps_per_second": 1.179, + "step": 241 + }, + { + "epoch": 12.736842105263158, + "grad_norm": 0.1086612269282341, + "learning_rate": 1.807320126823414e-06, + "loss": 0.0042, + "step": 242 + }, + { + "epoch": 12.736842105263158, + "eval_loss": 0.004519260488450527, + "eval_runtime": 3.3977, + "eval_samples_per_second": 8.83, + "eval_steps_per_second": 1.177, + "step": 242 + }, + { + "epoch": 12.789473684210526, + "grad_norm": 0.052398040890693665, + "learning_rate": 1.4283676945041346e-06, + "loss": 0.0024, + "step": 243 + }, + { + "epoch": 12.789473684210526, + "eval_loss": 0.004430453758686781, + "eval_runtime": 3.4008, + "eval_samples_per_second": 8.821, + "eval_steps_per_second": 1.176, + "step": 243 + }, + { + "epoch": 12.842105263157894, + "grad_norm": 0.10231564193964005, + "learning_rate": 1.0938383882926617e-06, + "loss": 0.003, + "step": 244 + }, + { + "epoch": 12.842105263157894, + "eval_loss": 0.0044572907499969006, + "eval_runtime": 3.4036, + "eval_samples_per_second": 8.814, + "eval_steps_per_second": 1.175, + "step": 244 + }, + { + "epoch": 12.894736842105264, + "grad_norm": 0.1136302798986435, + "learning_rate": 8.037919931187243e-07, + "loss": 0.0028, + "step": 245 + }, + { + "epoch": 12.894736842105264, + "eval_loss": 0.0044529978185892105, + "eval_runtime": 3.4025, + "eval_samples_per_second": 8.817, + "eval_steps_per_second": 1.176, + "step": 245 + }, + { + "epoch": 12.947368421052632, + "grad_norm": 0.08841534703969955, + "learning_rate": 5.582803442117091e-07, + "loss": 0.0034, + "step": 246 + }, + { + "epoch": 12.947368421052632, + "eval_loss": 0.004437682218849659, + "eval_runtime": 3.3982, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 1.177, + "step": 246 + }, + { + "epoch": 13.0, + "grad_norm": 0.09434516727924347, + "learning_rate": 3.5734731783715333e-07, + "loss": 0.0051, + "step": 247 + }, + { + "epoch": 13.0, + "eval_loss": 0.0044091795571148396, + "eval_runtime": 3.4027, + "eval_samples_per_second": 8.817, + "eval_steps_per_second": 1.176, + "step": 247 + }, + { + "epoch": 13.052631578947368, + "grad_norm": 0.11519359052181244, + "learning_rate": 2.0102882345540696e-07, + "loss": 0.0041, + "step": 248 + }, + { + "epoch": 13.052631578947368, + "eval_loss": 0.004471189342439175, + "eval_runtime": 3.3961, + "eval_samples_per_second": 8.834, + "eval_steps_per_second": 1.178, + "step": 248 + }, + { + "epoch": 13.105263157894736, + "grad_norm": 0.054617173969745636, + "learning_rate": 8.935279730407086e-08, + "loss": 0.0026, + "step": 249 + }, + { + "epoch": 13.105263157894736, + "eval_loss": 0.004416502080857754, + "eval_runtime": 3.4007, + "eval_samples_per_second": 8.822, + "eval_steps_per_second": 1.176, + "step": 249 + }, + { + "epoch": 13.157894736842104, + "grad_norm": 0.0668402761220932, + "learning_rate": 2.2339197405490953e-08, + "loss": 0.0035, + "step": 250 + }, + { + "epoch": 13.157894736842104, + "eval_loss": 0.004414246417582035, + "eval_runtime": 3.3991, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 1.177, + "step": 250 } ], "logging_steps": 1, @@ -2122,12 +3772,12 @@ "should_evaluate": false, "should_log": false, "should_save": true, - "should_training_stop": false + "should_training_stop": true }, "attributes": {} } }, - "total_flos": 5968350472955904.0, + "total_flos": 1.0147096033671168e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null