diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3992 @@ +{ + "best_metric": 1.8658331632614136, + "best_model_checkpoint": "/nfs/production/literature/amina-mardiyyah/new_data/OT-Entity-Extraction-Pipeline/model_outputs/Continued_pretraining/TAPT/bioformers/bioformer-16L/Mardiyyah/TAPT_data_V2_split/tapt_base_LR-2e-05/checkpoint-255", + "epoch": 49.94117647058823, + "eval_steps": 1, + "global_step": 450, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.10457516339869281, + "eval_loss": 2.221620798110962, + "eval_runtime": 2.4071, + "eval_samples_per_second": 808.035, + "eval_steps_per_second": 12.879, + "step": 1 + }, + { + "epoch": 0.20915032679738563, + "eval_loss": 2.1887502670288086, + "eval_runtime": 2.4126, + "eval_samples_per_second": 806.196, + "eval_steps_per_second": 12.849, + "step": 2 + }, + { + "epoch": 0.3137254901960784, + "eval_loss": 2.173266887664795, + "eval_runtime": 2.4488, + "eval_samples_per_second": 794.257, + "eval_steps_per_second": 12.659, + "step": 3 + }, + { + "epoch": 0.41830065359477125, + "eval_loss": 2.1297478675842285, + "eval_runtime": 2.3778, + "eval_samples_per_second": 817.994, + "eval_steps_per_second": 13.037, + "step": 4 + }, + { + "epoch": 0.5228758169934641, + "eval_loss": 2.192237377166748, + "eval_runtime": 2.3912, + "eval_samples_per_second": 813.415, + "eval_steps_per_second": 12.964, + "step": 5 + }, + { + "epoch": 0.6274509803921569, + "eval_loss": 2.1468276977539062, + "eval_runtime": 2.3902, + "eval_samples_per_second": 813.739, + "eval_steps_per_second": 12.97, + "step": 6 + }, + { + "epoch": 0.7320261437908496, + "eval_loss": 2.1432690620422363, + "eval_runtime": 2.3678, + "eval_samples_per_second": 821.446, + "eval_steps_per_second": 13.092, + "step": 7 + }, + { + "epoch": 0.8366013071895425, + "eval_loss": 2.0927038192749023, + "eval_runtime": 2.3704, + "eval_samples_per_second": 820.552, + "eval_steps_per_second": 13.078, + "step": 8 + }, + { + "epoch": 0.9411764705882353, + "eval_loss": 2.124303102493286, + "eval_runtime": 2.4224, + "eval_samples_per_second": 802.925, + "eval_steps_per_second": 12.797, + "step": 9 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 2.930349588394165, + "learning_rate": 6.666666666666667e-06, + "loss": 2.4847, + "step": 9 + }, + { + "epoch": 1.1045751633986929, + "eval_loss": 2.1266961097717285, + "eval_runtime": 2.3993, + "eval_samples_per_second": 810.647, + "eval_steps_per_second": 12.92, + "step": 10 + }, + { + "epoch": 1.2091503267973855, + "eval_loss": 2.081995964050293, + "eval_runtime": 2.4023, + "eval_samples_per_second": 809.627, + "eval_steps_per_second": 12.904, + "step": 11 + }, + { + "epoch": 1.3137254901960784, + "eval_loss": 2.0737693309783936, + "eval_runtime": 2.4011, + "eval_samples_per_second": 810.042, + "eval_steps_per_second": 12.911, + "step": 12 + }, + { + "epoch": 1.4183006535947713, + "eval_loss": 2.0427086353302, + "eval_runtime": 2.4076, + "eval_samples_per_second": 807.874, + "eval_steps_per_second": 12.876, + "step": 13 + }, + { + "epoch": 1.522875816993464, + "eval_loss": 2.056819200515747, + "eval_runtime": 2.3991, + "eval_samples_per_second": 810.727, + "eval_steps_per_second": 12.922, + "step": 14 + }, + { + "epoch": 1.6274509803921569, + "eval_loss": 2.083451747894287, + "eval_runtime": 2.4323, + "eval_samples_per_second": 799.665, + "eval_steps_per_second": 12.745, + "step": 15 + }, + { + "epoch": 1.7320261437908497, + "eval_loss": 2.078913450241089, + "eval_runtime": 2.4756, + "eval_samples_per_second": 785.669, + "eval_steps_per_second": 12.522, + "step": 16 + }, + { + "epoch": 1.8366013071895426, + "eval_loss": 2.067417621612549, + "eval_runtime": 2.438, + "eval_samples_per_second": 797.798, + "eval_steps_per_second": 12.716, + "step": 17 + }, + { + "epoch": 1.9411764705882353, + "eval_loss": 2.0401482582092285, + "eval_runtime": 2.4764, + "eval_samples_per_second": 785.419, + "eval_steps_per_second": 12.518, + "step": 18 + }, + { + "epoch": 1.9411764705882353, + "grad_norm": 1.9967031478881836, + "learning_rate": 1.3333333333333333e-05, + "loss": 2.4101, + "step": 18 + }, + { + "epoch": 2.104575163398693, + "eval_loss": 2.0421407222747803, + "eval_runtime": 2.4395, + "eval_samples_per_second": 797.31, + "eval_steps_per_second": 12.708, + "step": 19 + }, + { + "epoch": 2.2091503267973858, + "eval_loss": 2.0762155055999756, + "eval_runtime": 2.4888, + "eval_samples_per_second": 781.507, + "eval_steps_per_second": 12.456, + "step": 20 + }, + { + "epoch": 2.313725490196078, + "eval_loss": 2.006462574005127, + "eval_runtime": 2.5161, + "eval_samples_per_second": 773.026, + "eval_steps_per_second": 12.321, + "step": 21 + }, + { + "epoch": 2.418300653594771, + "eval_loss": 2.0763015747070312, + "eval_runtime": 2.4587, + "eval_samples_per_second": 791.08, + "eval_steps_per_second": 12.608, + "step": 22 + }, + { + "epoch": 2.522875816993464, + "eval_loss": 2.0424351692199707, + "eval_runtime": 2.4605, + "eval_samples_per_second": 790.5, + "eval_steps_per_second": 12.599, + "step": 23 + }, + { + "epoch": 2.627450980392157, + "eval_loss": 2.031003952026367, + "eval_runtime": 2.4119, + "eval_samples_per_second": 806.408, + "eval_steps_per_second": 12.853, + "step": 24 + }, + { + "epoch": 2.7320261437908497, + "eval_loss": 2.0873942375183105, + "eval_runtime": 2.4077, + "eval_samples_per_second": 807.816, + "eval_steps_per_second": 12.875, + "step": 25 + }, + { + "epoch": 2.8366013071895426, + "eval_loss": 2.0235297679901123, + "eval_runtime": 2.3945, + "eval_samples_per_second": 812.27, + "eval_steps_per_second": 12.946, + "step": 26 + }, + { + "epoch": 2.9411764705882355, + "eval_loss": 2.059739589691162, + "eval_runtime": 2.4114, + "eval_samples_per_second": 806.602, + "eval_steps_per_second": 12.856, + "step": 27 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 2.4962875843048096, + "learning_rate": 2e-05, + "loss": 2.3677, + "step": 27 + }, + { + "epoch": 3.104575163398693, + "eval_loss": 1.986527442932129, + "eval_runtime": 2.3999, + "eval_samples_per_second": 810.462, + "eval_steps_per_second": 12.917, + "step": 28 + }, + { + "epoch": 3.2091503267973858, + "eval_loss": 2.0295257568359375, + "eval_runtime": 2.4114, + "eval_samples_per_second": 806.573, + "eval_steps_per_second": 12.855, + "step": 29 + }, + { + "epoch": 3.313725490196078, + "eval_loss": 2.029600143432617, + "eval_runtime": 2.3867, + "eval_samples_per_second": 814.924, + "eval_steps_per_second": 12.989, + "step": 30 + }, + { + "epoch": 3.418300653594771, + "eval_loss": 2.0018720626831055, + "eval_runtime": 2.3894, + "eval_samples_per_second": 814.014, + "eval_steps_per_second": 12.974, + "step": 31 + }, + { + "epoch": 3.522875816993464, + "eval_loss": 1.9695795774459839, + "eval_runtime": 2.3919, + "eval_samples_per_second": 813.157, + "eval_steps_per_second": 12.96, + "step": 32 + }, + { + "epoch": 3.627450980392157, + "eval_loss": 2.0265488624572754, + "eval_runtime": 2.426, + "eval_samples_per_second": 801.734, + "eval_steps_per_second": 12.778, + "step": 33 + }, + { + "epoch": 3.7320261437908497, + "eval_loss": 2.010695457458496, + "eval_runtime": 2.429, + "eval_samples_per_second": 800.731, + "eval_steps_per_second": 12.762, + "step": 34 + }, + { + "epoch": 3.8366013071895426, + "eval_loss": 2.034428119659424, + "eval_runtime": 2.4086, + "eval_samples_per_second": 807.51, + "eval_steps_per_second": 12.87, + "step": 35 + }, + { + "epoch": 3.9411764705882355, + "eval_loss": 2.0281381607055664, + "eval_runtime": 2.4118, + "eval_samples_per_second": 806.449, + "eval_steps_per_second": 12.853, + "step": 36 + }, + { + "epoch": 3.9411764705882355, + "grad_norm": 1.8768209218978882, + "learning_rate": 1.9574468085106384e-05, + "loss": 2.2639, + "step": 36 + }, + { + "epoch": 4.104575163398692, + "eval_loss": 2.0171053409576416, + "eval_runtime": 2.4083, + "eval_samples_per_second": 807.623, + "eval_steps_per_second": 12.872, + "step": 37 + }, + { + "epoch": 4.209150326797386, + "eval_loss": 2.0344126224517822, + "eval_runtime": 2.4108, + "eval_samples_per_second": 806.785, + "eval_steps_per_second": 12.859, + "step": 38 + }, + { + "epoch": 4.313725490196078, + "eval_loss": 1.9913954734802246, + "eval_runtime": 2.3858, + "eval_samples_per_second": 815.248, + "eval_steps_per_second": 12.994, + "step": 39 + }, + { + "epoch": 4.4183006535947715, + "eval_loss": 1.9855905771255493, + "eval_runtime": 2.3822, + "eval_samples_per_second": 816.455, + "eval_steps_per_second": 13.013, + "step": 40 + }, + { + "epoch": 4.522875816993464, + "eval_loss": 2.0357260704040527, + "eval_runtime": 2.3873, + "eval_samples_per_second": 814.74, + "eval_steps_per_second": 12.986, + "step": 41 + }, + { + "epoch": 4.627450980392156, + "eval_loss": 2.028900384902954, + "eval_runtime": 2.4177, + "eval_samples_per_second": 804.49, + "eval_steps_per_second": 12.822, + "step": 42 + }, + { + "epoch": 4.73202614379085, + "eval_loss": 1.9714045524597168, + "eval_runtime": 2.3849, + "eval_samples_per_second": 815.537, + "eval_steps_per_second": 12.998, + "step": 43 + }, + { + "epoch": 4.836601307189542, + "eval_loss": 1.9895257949829102, + "eval_runtime": 2.4769, + "eval_samples_per_second": 785.249, + "eval_steps_per_second": 12.516, + "step": 44 + }, + { + "epoch": 4.9411764705882355, + "eval_loss": 1.990486741065979, + "eval_runtime": 2.4263, + "eval_samples_per_second": 801.627, + "eval_steps_per_second": 12.777, + "step": 45 + }, + { + "epoch": 4.9411764705882355, + "grad_norm": 1.8783236742019653, + "learning_rate": 1.914893617021277e-05, + "loss": 2.2037, + "step": 45 + }, + { + "epoch": 5.104575163398692, + "eval_loss": 1.9589457511901855, + "eval_runtime": 2.4045, + "eval_samples_per_second": 808.887, + "eval_steps_per_second": 12.892, + "step": 46 + }, + { + "epoch": 5.209150326797386, + "eval_loss": 1.9864917993545532, + "eval_runtime": 2.4112, + "eval_samples_per_second": 806.648, + "eval_steps_per_second": 12.857, + "step": 47 + }, + { + "epoch": 5.313725490196078, + "eval_loss": 2.0113699436187744, + "eval_runtime": 2.4027, + "eval_samples_per_second": 809.512, + "eval_steps_per_second": 12.902, + "step": 48 + }, + { + "epoch": 5.4183006535947715, + "eval_loss": 2.0007834434509277, + "eval_runtime": 2.4133, + "eval_samples_per_second": 805.942, + "eval_steps_per_second": 12.845, + "step": 49 + }, + { + "epoch": 5.522875816993464, + "eval_loss": 1.9577592611312866, + "eval_runtime": 2.3791, + "eval_samples_per_second": 817.533, + "eval_steps_per_second": 13.03, + "step": 50 + }, + { + "epoch": 5.627450980392156, + "eval_loss": 2.029423236846924, + "eval_runtime": 2.3902, + "eval_samples_per_second": 813.74, + "eval_steps_per_second": 12.97, + "step": 51 + }, + { + "epoch": 5.73202614379085, + "eval_loss": 1.9585332870483398, + "eval_runtime": 2.4118, + "eval_samples_per_second": 806.436, + "eval_steps_per_second": 12.853, + "step": 52 + }, + { + "epoch": 5.836601307189542, + "eval_loss": 1.9783401489257812, + "eval_runtime": 2.4264, + "eval_samples_per_second": 801.594, + "eval_steps_per_second": 12.776, + "step": 53 + }, + { + "epoch": 5.9411764705882355, + "eval_loss": 1.9880473613739014, + "eval_runtime": 2.488, + "eval_samples_per_second": 781.753, + "eval_steps_per_second": 12.46, + "step": 54 + }, + { + "epoch": 5.9411764705882355, + "grad_norm": 1.8594753742218018, + "learning_rate": 1.872340425531915e-05, + "loss": 2.16, + "step": 54 + }, + { + "epoch": 6.104575163398692, + "eval_loss": 2.0060460567474365, + "eval_runtime": 2.4336, + "eval_samples_per_second": 799.234, + "eval_steps_per_second": 12.738, + "step": 55 + }, + { + "epoch": 6.209150326797386, + "eval_loss": 1.9557570219039917, + "eval_runtime": 2.4097, + "eval_samples_per_second": 807.143, + "eval_steps_per_second": 12.864, + "step": 56 + }, + { + "epoch": 6.313725490196078, + "eval_loss": 1.9664386510849, + "eval_runtime": 2.4059, + "eval_samples_per_second": 808.441, + "eval_steps_per_second": 12.885, + "step": 57 + }, + { + "epoch": 6.4183006535947715, + "eval_loss": 1.920135498046875, + "eval_runtime": 2.4056, + "eval_samples_per_second": 808.538, + "eval_steps_per_second": 12.887, + "step": 58 + }, + { + "epoch": 6.522875816993464, + "eval_loss": 1.9815952777862549, + "eval_runtime": 2.4122, + "eval_samples_per_second": 806.325, + "eval_steps_per_second": 12.851, + "step": 59 + }, + { + "epoch": 6.627450980392156, + "eval_loss": 1.9681768417358398, + "eval_runtime": 2.3867, + "eval_samples_per_second": 814.941, + "eval_steps_per_second": 12.989, + "step": 60 + }, + { + "epoch": 6.73202614379085, + "eval_loss": 1.9605098962783813, + "eval_runtime": 2.3875, + "eval_samples_per_second": 814.648, + "eval_steps_per_second": 12.984, + "step": 61 + }, + { + "epoch": 6.836601307189542, + "eval_loss": 1.9233237504959106, + "eval_runtime": 2.385, + "eval_samples_per_second": 815.51, + "eval_steps_per_second": 12.998, + "step": 62 + }, + { + "epoch": 6.9411764705882355, + "eval_loss": 1.9687212705612183, + "eval_runtime": 2.3854, + "eval_samples_per_second": 815.361, + "eval_steps_per_second": 12.995, + "step": 63 + }, + { + "epoch": 6.9411764705882355, + "grad_norm": 1.886400818824768, + "learning_rate": 1.8297872340425533e-05, + "loss": 2.1108, + "step": 63 + }, + { + "epoch": 7.104575163398692, + "eval_loss": 1.9986543655395508, + "eval_runtime": 2.4144, + "eval_samples_per_second": 805.581, + "eval_steps_per_second": 12.84, + "step": 64 + }, + { + "epoch": 7.209150326797386, + "eval_loss": 2.002251386642456, + "eval_runtime": 2.4255, + "eval_samples_per_second": 801.892, + "eval_steps_per_second": 12.781, + "step": 65 + }, + { + "epoch": 7.313725490196078, + "eval_loss": 1.9626870155334473, + "eval_runtime": 2.4235, + "eval_samples_per_second": 802.565, + "eval_steps_per_second": 12.792, + "step": 66 + }, + { + "epoch": 7.4183006535947715, + "eval_loss": 2.0214684009552, + "eval_runtime": 2.4109, + "eval_samples_per_second": 806.764, + "eval_steps_per_second": 12.858, + "step": 67 + }, + { + "epoch": 7.522875816993464, + "eval_loss": 1.961344599723816, + "eval_runtime": 2.4317, + "eval_samples_per_second": 799.86, + "eval_steps_per_second": 12.748, + "step": 68 + }, + { + "epoch": 7.627450980392156, + "eval_loss": 2.026102066040039, + "eval_runtime": 2.4027, + "eval_samples_per_second": 809.502, + "eval_steps_per_second": 12.902, + "step": 69 + }, + { + "epoch": 7.73202614379085, + "eval_loss": 1.9625698328018188, + "eval_runtime": 2.3794, + "eval_samples_per_second": 817.425, + "eval_steps_per_second": 13.028, + "step": 70 + }, + { + "epoch": 7.836601307189542, + "eval_loss": 2.000683546066284, + "eval_runtime": 2.3828, + "eval_samples_per_second": 816.278, + "eval_steps_per_second": 13.01, + "step": 71 + }, + { + "epoch": 7.9411764705882355, + "eval_loss": 1.9403586387634277, + "eval_runtime": 2.383, + "eval_samples_per_second": 816.21, + "eval_steps_per_second": 13.009, + "step": 72 + }, + { + "epoch": 7.9411764705882355, + "grad_norm": 3.798304319381714, + "learning_rate": 1.7872340425531915e-05, + "loss": 2.0949, + "step": 72 + }, + { + "epoch": 8.104575163398692, + "eval_loss": 1.994275450706482, + "eval_runtime": 2.3862, + "eval_samples_per_second": 815.09, + "eval_steps_per_second": 12.991, + "step": 73 + }, + { + "epoch": 8.209150326797385, + "eval_loss": 2.0442616939544678, + "eval_runtime": 2.3871, + "eval_samples_per_second": 814.81, + "eval_steps_per_second": 12.987, + "step": 74 + }, + { + "epoch": 8.313725490196079, + "eval_loss": 1.99091637134552, + "eval_runtime": 2.3849, + "eval_samples_per_second": 815.556, + "eval_steps_per_second": 12.999, + "step": 75 + }, + { + "epoch": 8.418300653594772, + "eval_loss": 1.9789609909057617, + "eval_runtime": 2.4032, + "eval_samples_per_second": 809.347, + "eval_steps_per_second": 12.9, + "step": 76 + }, + { + "epoch": 8.522875816993464, + "eval_loss": 1.9505332708358765, + "eval_runtime": 2.457, + "eval_samples_per_second": 791.608, + "eval_steps_per_second": 12.617, + "step": 77 + }, + { + "epoch": 8.627450980392156, + "eval_loss": 1.9477442502975464, + "eval_runtime": 2.4164, + "eval_samples_per_second": 804.91, + "eval_steps_per_second": 12.829, + "step": 78 + }, + { + "epoch": 8.732026143790849, + "eval_loss": 2.027162790298462, + "eval_runtime": 2.4059, + "eval_samples_per_second": 808.413, + "eval_steps_per_second": 12.885, + "step": 79 + }, + { + "epoch": 8.836601307189543, + "eval_loss": 1.954852819442749, + "eval_runtime": 2.4078, + "eval_samples_per_second": 807.8, + "eval_steps_per_second": 12.875, + "step": 80 + }, + { + "epoch": 8.941176470588236, + "eval_loss": 1.9641313552856445, + "eval_runtime": 2.4106, + "eval_samples_per_second": 806.865, + "eval_steps_per_second": 12.86, + "step": 81 + }, + { + "epoch": 8.941176470588236, + "grad_norm": 1.9192023277282715, + "learning_rate": 1.74468085106383e-05, + "loss": 2.0617, + "step": 81 + }, + { + "epoch": 9.104575163398692, + "eval_loss": 1.9859141111373901, + "eval_runtime": 2.3909, + "eval_samples_per_second": 813.488, + "eval_steps_per_second": 12.966, + "step": 82 + }, + { + "epoch": 9.209150326797385, + "eval_loss": 1.937601089477539, + "eval_runtime": 2.3889, + "eval_samples_per_second": 814.196, + "eval_steps_per_second": 12.977, + "step": 83 + }, + { + "epoch": 9.313725490196079, + "eval_loss": 1.9699262380599976, + "eval_runtime": 2.3832, + "eval_samples_per_second": 816.116, + "eval_steps_per_second": 13.008, + "step": 84 + }, + { + "epoch": 9.418300653594772, + "eval_loss": 1.9334497451782227, + "eval_runtime": 2.4366, + "eval_samples_per_second": 798.247, + "eval_steps_per_second": 12.723, + "step": 85 + }, + { + "epoch": 9.522875816993464, + "eval_loss": 1.9708276987075806, + "eval_runtime": 2.3835, + "eval_samples_per_second": 816.03, + "eval_steps_per_second": 13.006, + "step": 86 + }, + { + "epoch": 9.627450980392156, + "eval_loss": 1.970037817955017, + "eval_runtime": 2.4674, + "eval_samples_per_second": 788.282, + "eval_steps_per_second": 12.564, + "step": 87 + }, + { + "epoch": 9.732026143790849, + "eval_loss": 1.9634466171264648, + "eval_runtime": 2.4497, + "eval_samples_per_second": 793.975, + "eval_steps_per_second": 12.655, + "step": 88 + }, + { + "epoch": 9.836601307189543, + "eval_loss": 1.92203688621521, + "eval_runtime": 2.4109, + "eval_samples_per_second": 806.761, + "eval_steps_per_second": 12.858, + "step": 89 + }, + { + "epoch": 9.941176470588236, + "eval_loss": 1.966900110244751, + "eval_runtime": 2.4107, + "eval_samples_per_second": 806.822, + "eval_steps_per_second": 12.859, + "step": 90 + }, + { + "epoch": 9.941176470588236, + "grad_norm": 2.050672769546509, + "learning_rate": 1.7021276595744682e-05, + "loss": 2.0509, + "step": 90 + }, + { + "epoch": 10.104575163398692, + "eval_loss": 1.956833004951477, + "eval_runtime": 2.3947, + "eval_samples_per_second": 812.203, + "eval_steps_per_second": 12.945, + "step": 91 + }, + { + "epoch": 10.209150326797385, + "eval_loss": 1.9699444770812988, + "eval_runtime": 2.3837, + "eval_samples_per_second": 815.943, + "eval_steps_per_second": 13.005, + "step": 92 + }, + { + "epoch": 10.313725490196079, + "eval_loss": 2.0316123962402344, + "eval_runtime": 2.3823, + "eval_samples_per_second": 816.431, + "eval_steps_per_second": 13.013, + "step": 93 + }, + { + "epoch": 10.418300653594772, + "eval_loss": 1.912984848022461, + "eval_runtime": 2.3869, + "eval_samples_per_second": 814.86, + "eval_steps_per_second": 12.987, + "step": 94 + }, + { + "epoch": 10.522875816993464, + "eval_loss": 1.9707229137420654, + "eval_runtime": 2.3837, + "eval_samples_per_second": 815.942, + "eval_steps_per_second": 13.005, + "step": 95 + }, + { + "epoch": 10.627450980392156, + "eval_loss": 1.9623687267303467, + "eval_runtime": 2.455, + "eval_samples_per_second": 792.269, + "eval_steps_per_second": 12.627, + "step": 96 + }, + { + "epoch": 10.732026143790849, + "eval_loss": 1.9515836238861084, + "eval_runtime": 2.4477, + "eval_samples_per_second": 794.62, + "eval_steps_per_second": 12.665, + "step": 97 + }, + { + "epoch": 10.836601307189543, + "eval_loss": 1.9508367776870728, + "eval_runtime": 2.407, + "eval_samples_per_second": 808.071, + "eval_steps_per_second": 12.879, + "step": 98 + }, + { + "epoch": 10.941176470588236, + "eval_loss": 1.9166395664215088, + "eval_runtime": 2.4193, + "eval_samples_per_second": 803.965, + "eval_steps_per_second": 12.814, + "step": 99 + }, + { + "epoch": 10.941176470588236, + "grad_norm": 1.911039113998413, + "learning_rate": 1.6595744680851064e-05, + "loss": 1.9835, + "step": 99 + }, + { + "epoch": 11.104575163398692, + "eval_loss": 1.9469496011734009, + "eval_runtime": 2.4171, + "eval_samples_per_second": 804.693, + "eval_steps_per_second": 12.825, + "step": 100 + }, + { + "epoch": 11.209150326797385, + "eval_loss": 1.962018609046936, + "eval_runtime": 2.3932, + "eval_samples_per_second": 812.727, + "eval_steps_per_second": 12.953, + "step": 101 + }, + { + "epoch": 11.313725490196079, + "eval_loss": 1.94699227809906, + "eval_runtime": 2.3789, + "eval_samples_per_second": 817.61, + "eval_steps_per_second": 13.031, + "step": 102 + }, + { + "epoch": 11.418300653594772, + "eval_loss": 1.945833444595337, + "eval_runtime": 2.3834, + "eval_samples_per_second": 816.059, + "eval_steps_per_second": 13.007, + "step": 103 + }, + { + "epoch": 11.522875816993464, + "eval_loss": 1.9585113525390625, + "eval_runtime": 2.3811, + "eval_samples_per_second": 816.833, + "eval_steps_per_second": 13.019, + "step": 104 + }, + { + "epoch": 11.627450980392156, + "eval_loss": 1.9450502395629883, + "eval_runtime": 2.381, + "eval_samples_per_second": 816.892, + "eval_steps_per_second": 13.02, + "step": 105 + }, + { + "epoch": 11.732026143790849, + "eval_loss": 1.9202919006347656, + "eval_runtime": 2.4348, + "eval_samples_per_second": 798.83, + "eval_steps_per_second": 12.732, + "step": 106 + }, + { + "epoch": 11.836601307189543, + "eval_loss": 1.9322612285614014, + "eval_runtime": 2.4503, + "eval_samples_per_second": 793.773, + "eval_steps_per_second": 12.651, + "step": 107 + }, + { + "epoch": 11.941176470588236, + "eval_loss": 1.9641361236572266, + "eval_runtime": 2.4059, + "eval_samples_per_second": 808.427, + "eval_steps_per_second": 12.885, + "step": 108 + }, + { + "epoch": 11.941176470588236, + "grad_norm": 1.9469199180603027, + "learning_rate": 1.6170212765957446e-05, + "loss": 1.9719, + "step": 108 + }, + { + "epoch": 12.104575163398692, + "eval_loss": 1.9262347221374512, + "eval_runtime": 2.4058, + "eval_samples_per_second": 808.471, + "eval_steps_per_second": 12.886, + "step": 109 + }, + { + "epoch": 12.209150326797385, + "eval_loss": 1.9799877405166626, + "eval_runtime": 2.4101, + "eval_samples_per_second": 807.019, + "eval_steps_per_second": 12.863, + "step": 110 + }, + { + "epoch": 12.313725490196079, + "eval_loss": 1.9421709775924683, + "eval_runtime": 2.3763, + "eval_samples_per_second": 818.512, + "eval_steps_per_second": 13.046, + "step": 111 + }, + { + "epoch": 12.418300653594772, + "eval_loss": 1.9286293983459473, + "eval_runtime": 2.4023, + "eval_samples_per_second": 809.639, + "eval_steps_per_second": 12.904, + "step": 112 + }, + { + "epoch": 12.522875816993464, + "eval_loss": 1.9933801889419556, + "eval_runtime": 2.3795, + "eval_samples_per_second": 817.406, + "eval_steps_per_second": 13.028, + "step": 113 + }, + { + "epoch": 12.627450980392156, + "eval_loss": 1.9704465866088867, + "eval_runtime": 2.3792, + "eval_samples_per_second": 817.502, + "eval_steps_per_second": 13.03, + "step": 114 + }, + { + "epoch": 12.732026143790849, + "eval_loss": 1.939013957977295, + "eval_runtime": 2.4295, + "eval_samples_per_second": 800.592, + "eval_steps_per_second": 12.76, + "step": 115 + }, + { + "epoch": 12.836601307189543, + "eval_loss": 1.916093349456787, + "eval_runtime": 2.3862, + "eval_samples_per_second": 815.104, + "eval_steps_per_second": 12.991, + "step": 116 + }, + { + "epoch": 12.941176470588236, + "eval_loss": 1.94829523563385, + "eval_runtime": 2.4403, + "eval_samples_per_second": 797.034, + "eval_steps_per_second": 12.703, + "step": 117 + }, + { + "epoch": 12.941176470588236, + "grad_norm": 1.835829734802246, + "learning_rate": 1.5744680851063832e-05, + "loss": 1.9663, + "step": 117 + }, + { + "epoch": 13.104575163398692, + "eval_loss": 1.9584107398986816, + "eval_runtime": 2.4351, + "eval_samples_per_second": 798.732, + "eval_steps_per_second": 12.73, + "step": 118 + }, + { + "epoch": 13.209150326797385, + "eval_loss": 1.9641852378845215, + "eval_runtime": 2.4617, + "eval_samples_per_second": 790.102, + "eval_steps_per_second": 12.593, + "step": 119 + }, + { + "epoch": 13.313725490196079, + "eval_loss": 1.9446567296981812, + "eval_runtime": 2.4215, + "eval_samples_per_second": 803.226, + "eval_steps_per_second": 12.802, + "step": 120 + }, + { + "epoch": 13.418300653594772, + "eval_loss": 2.001385450363159, + "eval_runtime": 2.416, + "eval_samples_per_second": 805.044, + "eval_steps_per_second": 12.831, + "step": 121 + }, + { + "epoch": 13.522875816993464, + "eval_loss": 1.8805845975875854, + "eval_runtime": 2.4069, + "eval_samples_per_second": 808.102, + "eval_steps_per_second": 12.88, + "step": 122 + }, + { + "epoch": 13.627450980392156, + "eval_loss": 1.9486974477767944, + "eval_runtime": 2.4072, + "eval_samples_per_second": 807.987, + "eval_steps_per_second": 12.878, + "step": 123 + }, + { + "epoch": 13.732026143790849, + "eval_loss": 1.9180878400802612, + "eval_runtime": 2.4102, + "eval_samples_per_second": 806.973, + "eval_steps_per_second": 12.862, + "step": 124 + }, + { + "epoch": 13.836601307189543, + "eval_loss": 1.9238054752349854, + "eval_runtime": 2.3928, + "eval_samples_per_second": 812.868, + "eval_steps_per_second": 12.956, + "step": 125 + }, + { + "epoch": 13.941176470588236, + "eval_loss": 1.9513754844665527, + "eval_runtime": 2.3862, + "eval_samples_per_second": 815.11, + "eval_steps_per_second": 12.991, + "step": 126 + }, + { + "epoch": 13.941176470588236, + "grad_norm": 1.8433274030685425, + "learning_rate": 1.5319148936170214e-05, + "loss": 1.9785, + "step": 126 + }, + { + "epoch": 14.104575163398692, + "eval_loss": 1.9426443576812744, + "eval_runtime": 2.3889, + "eval_samples_per_second": 814.174, + "eval_steps_per_second": 12.977, + "step": 127 + }, + { + "epoch": 14.209150326797385, + "eval_loss": 1.9765559434890747, + "eval_runtime": 2.3875, + "eval_samples_per_second": 814.644, + "eval_steps_per_second": 12.984, + "step": 128 + }, + { + "epoch": 14.313725490196079, + "eval_loss": 1.9118081331253052, + "eval_runtime": 2.4268, + "eval_samples_per_second": 801.463, + "eval_steps_per_second": 12.774, + "step": 129 + }, + { + "epoch": 14.418300653594772, + "eval_loss": 1.9367104768753052, + "eval_runtime": 2.4183, + "eval_samples_per_second": 804.282, + "eval_steps_per_second": 12.819, + "step": 130 + }, + { + "epoch": 14.522875816993464, + "eval_loss": 1.9372411966323853, + "eval_runtime": 2.4201, + "eval_samples_per_second": 803.68, + "eval_steps_per_second": 12.809, + "step": 131 + }, + { + "epoch": 14.627450980392156, + "eval_loss": 1.923244595527649, + "eval_runtime": 2.4324, + "eval_samples_per_second": 799.606, + "eval_steps_per_second": 12.744, + "step": 132 + }, + { + "epoch": 14.732026143790849, + "eval_loss": 1.999928593635559, + "eval_runtime": 2.4161, + "eval_samples_per_second": 805.026, + "eval_steps_per_second": 12.831, + "step": 133 + }, + { + "epoch": 14.836601307189543, + "eval_loss": 1.9354963302612305, + "eval_runtime": 2.3965, + "eval_samples_per_second": 811.59, + "eval_steps_per_second": 12.935, + "step": 134 + }, + { + "epoch": 14.941176470588236, + "eval_loss": 1.965717077255249, + "eval_runtime": 2.4083, + "eval_samples_per_second": 807.639, + "eval_steps_per_second": 12.872, + "step": 135 + }, + { + "epoch": 14.941176470588236, + "grad_norm": 1.9256954193115234, + "learning_rate": 1.4893617021276596e-05, + "loss": 1.9329, + "step": 135 + }, + { + "epoch": 15.104575163398692, + "eval_loss": 1.9451290369033813, + "eval_runtime": 2.3926, + "eval_samples_per_second": 812.94, + "eval_steps_per_second": 12.957, + "step": 136 + }, + { + "epoch": 15.209150326797385, + "eval_loss": 1.9596805572509766, + "eval_runtime": 2.4003, + "eval_samples_per_second": 810.326, + "eval_steps_per_second": 12.915, + "step": 137 + }, + { + "epoch": 15.313725490196079, + "eval_loss": 1.9179918766021729, + "eval_runtime": 2.388, + "eval_samples_per_second": 814.49, + "eval_steps_per_second": 12.982, + "step": 138 + }, + { + "epoch": 15.418300653594772, + "eval_loss": 1.9344438314437866, + "eval_runtime": 2.4394, + "eval_samples_per_second": 797.34, + "eval_steps_per_second": 12.708, + "step": 139 + }, + { + "epoch": 15.522875816993464, + "eval_loss": 1.9772499799728394, + "eval_runtime": 2.4332, + "eval_samples_per_second": 799.368, + "eval_steps_per_second": 12.741, + "step": 140 + }, + { + "epoch": 15.627450980392156, + "eval_loss": 1.9796696901321411, + "eval_runtime": 2.4159, + "eval_samples_per_second": 805.096, + "eval_steps_per_second": 12.832, + "step": 141 + }, + { + "epoch": 15.732026143790849, + "eval_loss": 1.9060624837875366, + "eval_runtime": 2.4134, + "eval_samples_per_second": 805.928, + "eval_steps_per_second": 12.845, + "step": 142 + }, + { + "epoch": 15.836601307189543, + "eval_loss": 1.8885753154754639, + "eval_runtime": 2.4138, + "eval_samples_per_second": 805.794, + "eval_steps_per_second": 12.843, + "step": 143 + }, + { + "epoch": 15.941176470588236, + "eval_loss": 1.9685148000717163, + "eval_runtime": 2.4145, + "eval_samples_per_second": 805.548, + "eval_steps_per_second": 12.839, + "step": 144 + }, + { + "epoch": 15.941176470588236, + "grad_norm": 1.8362805843353271, + "learning_rate": 1.4468085106382981e-05, + "loss": 1.9144, + "step": 144 + }, + { + "epoch": 16.104575163398692, + "eval_loss": 1.9797979593276978, + "eval_runtime": 2.3896, + "eval_samples_per_second": 813.952, + "eval_steps_per_second": 12.973, + "step": 145 + }, + { + "epoch": 16.209150326797385, + "eval_loss": 1.9587923288345337, + "eval_runtime": 2.4182, + "eval_samples_per_second": 804.331, + "eval_steps_per_second": 12.82, + "step": 146 + }, + { + "epoch": 16.313725490196077, + "eval_loss": 1.9274431467056274, + "eval_runtime": 2.3878, + "eval_samples_per_second": 814.56, + "eval_steps_per_second": 12.983, + "step": 147 + }, + { + "epoch": 16.41830065359477, + "eval_loss": 1.958984375, + "eval_runtime": 2.4353, + "eval_samples_per_second": 798.68, + "eval_steps_per_second": 12.73, + "step": 148 + }, + { + "epoch": 16.522875816993466, + "eval_loss": 1.9552897214889526, + "eval_runtime": 2.4367, + "eval_samples_per_second": 798.2, + "eval_steps_per_second": 12.722, + "step": 149 + }, + { + "epoch": 16.627450980392158, + "eval_loss": 1.9142913818359375, + "eval_runtime": 2.4118, + "eval_samples_per_second": 806.44, + "eval_steps_per_second": 12.853, + "step": 150 + }, + { + "epoch": 16.73202614379085, + "eval_loss": 1.9268592596054077, + "eval_runtime": 2.4143, + "eval_samples_per_second": 805.633, + "eval_steps_per_second": 12.84, + "step": 151 + }, + { + "epoch": 16.836601307189543, + "eval_loss": 1.965384840965271, + "eval_runtime": 2.4177, + "eval_samples_per_second": 804.492, + "eval_steps_per_second": 12.822, + "step": 152 + }, + { + "epoch": 16.941176470588236, + "eval_loss": 1.9789389371871948, + "eval_runtime": 2.4157, + "eval_samples_per_second": 805.134, + "eval_steps_per_second": 12.832, + "step": 153 + }, + { + "epoch": 16.941176470588236, + "grad_norm": 1.918270230293274, + "learning_rate": 1.4042553191489363e-05, + "loss": 1.9103, + "step": 153 + }, + { + "epoch": 17.104575163398692, + "eval_loss": 1.9568538665771484, + "eval_runtime": 2.4112, + "eval_samples_per_second": 806.642, + "eval_steps_per_second": 12.857, + "step": 154 + }, + { + "epoch": 17.209150326797385, + "eval_loss": 1.9652351140975952, + "eval_runtime": 2.3867, + "eval_samples_per_second": 814.935, + "eval_steps_per_second": 12.989, + "step": 155 + }, + { + "epoch": 17.313725490196077, + "eval_loss": 1.9810242652893066, + "eval_runtime": 2.3904, + "eval_samples_per_second": 813.655, + "eval_steps_per_second": 12.968, + "step": 156 + }, + { + "epoch": 17.41830065359477, + "eval_loss": 1.928475260734558, + "eval_runtime": 2.3938, + "eval_samples_per_second": 812.526, + "eval_steps_per_second": 12.95, + "step": 157 + }, + { + "epoch": 17.522875816993466, + "eval_loss": 1.937834620475769, + "eval_runtime": 2.4231, + "eval_samples_per_second": 802.677, + "eval_steps_per_second": 12.793, + "step": 158 + }, + { + "epoch": 17.627450980392158, + "eval_loss": 1.9520132541656494, + "eval_runtime": 2.4525, + "eval_samples_per_second": 793.081, + "eval_steps_per_second": 12.64, + "step": 159 + }, + { + "epoch": 17.73202614379085, + "eval_loss": 1.9782063961029053, + "eval_runtime": 2.4273, + "eval_samples_per_second": 801.302, + "eval_steps_per_second": 12.771, + "step": 160 + }, + { + "epoch": 17.836601307189543, + "eval_loss": 1.9681016206741333, + "eval_runtime": 2.4116, + "eval_samples_per_second": 806.515, + "eval_steps_per_second": 12.854, + "step": 161 + }, + { + "epoch": 17.941176470588236, + "eval_loss": 1.8925799131393433, + "eval_runtime": 2.4098, + "eval_samples_per_second": 807.121, + "eval_steps_per_second": 12.864, + "step": 162 + }, + { + "epoch": 17.941176470588236, + "grad_norm": 1.8396626710891724, + "learning_rate": 1.3617021276595745e-05, + "loss": 1.887, + "step": 162 + }, + { + "epoch": 18.104575163398692, + "eval_loss": 1.9333585500717163, + "eval_runtime": 2.4106, + "eval_samples_per_second": 806.84, + "eval_steps_per_second": 12.86, + "step": 163 + }, + { + "epoch": 18.209150326797385, + "eval_loss": 1.925223469734192, + "eval_runtime": 2.3936, + "eval_samples_per_second": 812.57, + "eval_steps_per_second": 12.951, + "step": 164 + }, + { + "epoch": 18.313725490196077, + "eval_loss": 1.9398906230926514, + "eval_runtime": 2.4263, + "eval_samples_per_second": 801.632, + "eval_steps_per_second": 12.777, + "step": 165 + }, + { + "epoch": 18.41830065359477, + "eval_loss": 1.9518330097198486, + "eval_runtime": 2.3924, + "eval_samples_per_second": 812.992, + "eval_steps_per_second": 12.958, + "step": 166 + }, + { + "epoch": 18.522875816993466, + "eval_loss": 1.992385983467102, + "eval_runtime": 2.3882, + "eval_samples_per_second": 814.425, + "eval_steps_per_second": 12.981, + "step": 167 + }, + { + "epoch": 18.627450980392158, + "eval_loss": 1.905411720275879, + "eval_runtime": 2.3878, + "eval_samples_per_second": 814.555, + "eval_steps_per_second": 12.983, + "step": 168 + }, + { + "epoch": 18.73202614379085, + "eval_loss": 1.9480212926864624, + "eval_runtime": 2.4462, + "eval_samples_per_second": 795.096, + "eval_steps_per_second": 12.672, + "step": 169 + }, + { + "epoch": 18.836601307189543, + "eval_loss": 1.9308433532714844, + "eval_runtime": 2.445, + "eval_samples_per_second": 795.492, + "eval_steps_per_second": 12.679, + "step": 170 + }, + { + "epoch": 18.941176470588236, + "eval_loss": 1.9342797994613647, + "eval_runtime": 2.414, + "eval_samples_per_second": 805.712, + "eval_steps_per_second": 12.842, + "step": 171 + }, + { + "epoch": 18.941176470588236, + "grad_norm": 1.8954132795333862, + "learning_rate": 1.3191489361702127e-05, + "loss": 1.8644, + "step": 171 + }, + { + "epoch": 19.104575163398692, + "eval_loss": 1.9860589504241943, + "eval_runtime": 2.4977, + "eval_samples_per_second": 778.731, + "eval_steps_per_second": 12.412, + "step": 172 + }, + { + "epoch": 19.209150326797385, + "eval_loss": 1.9452682733535767, + "eval_runtime": 2.413, + "eval_samples_per_second": 806.043, + "eval_steps_per_second": 12.847, + "step": 173 + }, + { + "epoch": 19.313725490196077, + "eval_loss": 1.8998777866363525, + "eval_runtime": 2.3895, + "eval_samples_per_second": 813.974, + "eval_steps_per_second": 12.973, + "step": 174 + }, + { + "epoch": 19.41830065359477, + "eval_loss": 1.93086838722229, + "eval_runtime": 2.383, + "eval_samples_per_second": 816.195, + "eval_steps_per_second": 13.009, + "step": 175 + }, + { + "epoch": 19.522875816993466, + "eval_loss": 1.954423189163208, + "eval_runtime": 2.393, + "eval_samples_per_second": 812.774, + "eval_steps_per_second": 12.954, + "step": 176 + }, + { + "epoch": 19.627450980392158, + "eval_loss": 1.9435521364212036, + "eval_runtime": 2.4095, + "eval_samples_per_second": 807.227, + "eval_steps_per_second": 12.866, + "step": 177 + }, + { + "epoch": 19.73202614379085, + "eval_loss": 1.9165093898773193, + "eval_runtime": 2.3936, + "eval_samples_per_second": 812.598, + "eval_steps_per_second": 12.951, + "step": 178 + }, + { + "epoch": 19.836601307189543, + "eval_loss": 1.9695576429367065, + "eval_runtime": 2.4194, + "eval_samples_per_second": 803.908, + "eval_steps_per_second": 12.813, + "step": 179 + }, + { + "epoch": 19.941176470588236, + "eval_loss": 1.9247905015945435, + "eval_runtime": 2.4477, + "eval_samples_per_second": 794.61, + "eval_steps_per_second": 12.665, + "step": 180 + }, + { + "epoch": 19.941176470588236, + "grad_norm": 1.8007246255874634, + "learning_rate": 1.2765957446808513e-05, + "loss": 1.8687, + "step": 180 + }, + { + "epoch": 20.104575163398692, + "eval_loss": 1.9517226219177246, + "eval_runtime": 2.4132, + "eval_samples_per_second": 805.981, + "eval_steps_per_second": 12.846, + "step": 181 + }, + { + "epoch": 20.209150326797385, + "eval_loss": 1.9041943550109863, + "eval_runtime": 2.4088, + "eval_samples_per_second": 807.45, + "eval_steps_per_second": 12.869, + "step": 182 + }, + { + "epoch": 20.313725490196077, + "eval_loss": 1.992538571357727, + "eval_runtime": 2.4151, + "eval_samples_per_second": 805.334, + "eval_steps_per_second": 12.836, + "step": 183 + }, + { + "epoch": 20.41830065359477, + "eval_loss": 1.8842642307281494, + "eval_runtime": 2.4235, + "eval_samples_per_second": 802.553, + "eval_steps_per_second": 12.791, + "step": 184 + }, + { + "epoch": 20.522875816993466, + "eval_loss": 1.979435682296753, + "eval_runtime": 2.3874, + "eval_samples_per_second": 814.695, + "eval_steps_per_second": 12.985, + "step": 185 + }, + { + "epoch": 20.627450980392158, + "eval_loss": 1.9789183139801025, + "eval_runtime": 2.3863, + "eval_samples_per_second": 815.07, + "eval_steps_per_second": 12.991, + "step": 186 + }, + { + "epoch": 20.73202614379085, + "eval_loss": 1.9192243814468384, + "eval_runtime": 2.3878, + "eval_samples_per_second": 814.573, + "eval_steps_per_second": 12.983, + "step": 187 + }, + { + "epoch": 20.836601307189543, + "eval_loss": 1.9174364805221558, + "eval_runtime": 2.3935, + "eval_samples_per_second": 812.633, + "eval_steps_per_second": 12.952, + "step": 188 + }, + { + "epoch": 20.941176470588236, + "eval_loss": 1.9568063020706177, + "eval_runtime": 2.4211, + "eval_samples_per_second": 803.354, + "eval_steps_per_second": 12.804, + "step": 189 + }, + { + "epoch": 20.941176470588236, + "grad_norm": 1.8035422563552856, + "learning_rate": 1.2340425531914895e-05, + "loss": 1.8361, + "step": 189 + }, + { + "epoch": 21.104575163398692, + "eval_loss": 1.9128376245498657, + "eval_runtime": 2.4627, + "eval_samples_per_second": 789.793, + "eval_steps_per_second": 12.588, + "step": 190 + }, + { + "epoch": 21.209150326797385, + "eval_loss": 1.9428894519805908, + "eval_runtime": 2.4145, + "eval_samples_per_second": 805.537, + "eval_steps_per_second": 12.839, + "step": 191 + }, + { + "epoch": 21.313725490196077, + "eval_loss": 1.95577073097229, + "eval_runtime": 2.4665, + "eval_samples_per_second": 788.556, + "eval_steps_per_second": 12.568, + "step": 192 + }, + { + "epoch": 21.41830065359477, + "eval_loss": 1.9128402471542358, + "eval_runtime": 2.4073, + "eval_samples_per_second": 807.972, + "eval_steps_per_second": 12.878, + "step": 193 + }, + { + "epoch": 21.522875816993466, + "eval_loss": 1.9588518142700195, + "eval_runtime": 2.3872, + "eval_samples_per_second": 814.75, + "eval_steps_per_second": 12.986, + "step": 194 + }, + { + "epoch": 21.627450980392158, + "eval_loss": 1.9744739532470703, + "eval_runtime": 2.4193, + "eval_samples_per_second": 803.936, + "eval_steps_per_second": 12.813, + "step": 195 + }, + { + "epoch": 21.73202614379085, + "eval_loss": 1.9993598461151123, + "eval_runtime": 2.3876, + "eval_samples_per_second": 814.634, + "eval_steps_per_second": 12.984, + "step": 196 + }, + { + "epoch": 21.836601307189543, + "eval_loss": 1.959428071975708, + "eval_runtime": 2.3868, + "eval_samples_per_second": 814.91, + "eval_steps_per_second": 12.988, + "step": 197 + }, + { + "epoch": 21.941176470588236, + "eval_loss": 1.9063607454299927, + "eval_runtime": 2.3998, + "eval_samples_per_second": 810.501, + "eval_steps_per_second": 12.918, + "step": 198 + }, + { + "epoch": 21.941176470588236, + "grad_norm": 1.7296489477157593, + "learning_rate": 1.1914893617021277e-05, + "loss": 1.8461, + "step": 198 + }, + { + "epoch": 22.104575163398692, + "eval_loss": 1.9475386142730713, + "eval_runtime": 2.4105, + "eval_samples_per_second": 806.877, + "eval_steps_per_second": 12.86, + "step": 199 + }, + { + "epoch": 22.209150326797385, + "eval_loss": 1.9637689590454102, + "eval_runtime": 2.4134, + "eval_samples_per_second": 805.906, + "eval_steps_per_second": 12.845, + "step": 200 + }, + { + "epoch": 22.313725490196077, + "eval_loss": 1.9350510835647583, + "eval_runtime": 2.4228, + "eval_samples_per_second": 802.787, + "eval_steps_per_second": 12.795, + "step": 201 + }, + { + "epoch": 22.41830065359477, + "eval_loss": 1.9184238910675049, + "eval_runtime": 2.4127, + "eval_samples_per_second": 806.144, + "eval_steps_per_second": 12.849, + "step": 202 + }, + { + "epoch": 22.522875816993466, + "eval_loss": 1.9656862020492554, + "eval_runtime": 2.4145, + "eval_samples_per_second": 805.559, + "eval_steps_per_second": 12.839, + "step": 203 + }, + { + "epoch": 22.627450980392158, + "eval_loss": 1.9108870029449463, + "eval_runtime": 2.3915, + "eval_samples_per_second": 813.292, + "eval_steps_per_second": 12.962, + "step": 204 + }, + { + "epoch": 22.73202614379085, + "eval_loss": 1.9319818019866943, + "eval_runtime": 2.3918, + "eval_samples_per_second": 813.179, + "eval_steps_per_second": 12.961, + "step": 205 + }, + { + "epoch": 22.836601307189543, + "eval_loss": 1.9680215120315552, + "eval_runtime": 2.4001, + "eval_samples_per_second": 810.398, + "eval_steps_per_second": 12.916, + "step": 206 + }, + { + "epoch": 22.941176470588236, + "eval_loss": 1.9628697633743286, + "eval_runtime": 2.3906, + "eval_samples_per_second": 813.608, + "eval_steps_per_second": 12.968, + "step": 207 + }, + { + "epoch": 22.941176470588236, + "grad_norm": 1.8930681943893433, + "learning_rate": 1.1489361702127662e-05, + "loss": 1.8246, + "step": 207 + }, + { + "epoch": 23.104575163398692, + "eval_loss": 1.9429619312286377, + "eval_runtime": 2.3877, + "eval_samples_per_second": 814.582, + "eval_steps_per_second": 12.983, + "step": 208 + }, + { + "epoch": 23.209150326797385, + "eval_loss": 1.9262027740478516, + "eval_runtime": 2.5188, + "eval_samples_per_second": 772.187, + "eval_steps_per_second": 12.307, + "step": 209 + }, + { + "epoch": 23.313725490196077, + "eval_loss": 1.9614677429199219, + "eval_runtime": 2.4709, + "eval_samples_per_second": 787.175, + "eval_steps_per_second": 12.546, + "step": 210 + }, + { + "epoch": 23.41830065359477, + "eval_loss": 1.9559693336486816, + "eval_runtime": 2.4152, + "eval_samples_per_second": 805.308, + "eval_steps_per_second": 12.835, + "step": 211 + }, + { + "epoch": 23.522875816993466, + "eval_loss": 1.966059923171997, + "eval_runtime": 2.4122, + "eval_samples_per_second": 806.317, + "eval_steps_per_second": 12.851, + "step": 212 + }, + { + "epoch": 23.627450980392158, + "eval_loss": 1.9780749082565308, + "eval_runtime": 2.437, + "eval_samples_per_second": 798.101, + "eval_steps_per_second": 12.72, + "step": 213 + }, + { + "epoch": 23.73202614379085, + "eval_loss": 1.980626106262207, + "eval_runtime": 2.404, + "eval_samples_per_second": 809.055, + "eval_steps_per_second": 12.895, + "step": 214 + }, + { + "epoch": 23.836601307189543, + "eval_loss": 1.9735476970672607, + "eval_runtime": 2.3933, + "eval_samples_per_second": 812.67, + "eval_steps_per_second": 12.953, + "step": 215 + }, + { + "epoch": 23.941176470588236, + "eval_loss": 1.9582773447036743, + "eval_runtime": 2.4128, + "eval_samples_per_second": 806.128, + "eval_steps_per_second": 12.848, + "step": 216 + }, + { + "epoch": 23.941176470588236, + "grad_norm": 1.8361761569976807, + "learning_rate": 1.1063829787234044e-05, + "loss": 1.8181, + "step": 216 + }, + { + "epoch": 24.104575163398692, + "eval_loss": 1.9554569721221924, + "eval_runtime": 2.387, + "eval_samples_per_second": 814.839, + "eval_steps_per_second": 12.987, + "step": 217 + }, + { + "epoch": 24.209150326797385, + "eval_loss": 1.9165290594100952, + "eval_runtime": 2.3919, + "eval_samples_per_second": 813.148, + "eval_steps_per_second": 12.96, + "step": 218 + }, + { + "epoch": 24.313725490196077, + "eval_loss": 1.9637575149536133, + "eval_runtime": 2.4007, + "eval_samples_per_second": 810.191, + "eval_steps_per_second": 12.913, + "step": 219 + }, + { + "epoch": 24.41830065359477, + "eval_loss": 2.000793218612671, + "eval_runtime": 2.4153, + "eval_samples_per_second": 805.272, + "eval_steps_per_second": 12.835, + "step": 220 + }, + { + "epoch": 24.522875816993466, + "eval_loss": 1.9246618747711182, + "eval_runtime": 2.4113, + "eval_samples_per_second": 806.61, + "eval_steps_per_second": 12.856, + "step": 221 + }, + { + "epoch": 24.627450980392158, + "eval_loss": 1.9719598293304443, + "eval_runtime": 2.4103, + "eval_samples_per_second": 806.944, + "eval_steps_per_second": 12.861, + "step": 222 + }, + { + "epoch": 24.73202614379085, + "eval_loss": 2.008406400680542, + "eval_runtime": 2.4142, + "eval_samples_per_second": 805.666, + "eval_steps_per_second": 12.841, + "step": 223 + }, + { + "epoch": 24.836601307189543, + "eval_loss": 1.942387580871582, + "eval_runtime": 2.4335, + "eval_samples_per_second": 799.261, + "eval_steps_per_second": 12.739, + "step": 224 + }, + { + "epoch": 24.941176470588236, + "eval_loss": 1.9110654592514038, + "eval_runtime": 2.3897, + "eval_samples_per_second": 813.923, + "eval_steps_per_second": 12.973, + "step": 225 + }, + { + "epoch": 24.941176470588236, + "grad_norm": 1.8342725038528442, + "learning_rate": 1.0638297872340426e-05, + "loss": 1.797, + "step": 225 + }, + { + "epoch": 25.104575163398692, + "eval_loss": 1.9787415266036987, + "eval_runtime": 2.4748, + "eval_samples_per_second": 785.908, + "eval_steps_per_second": 12.526, + "step": 226 + }, + { + "epoch": 25.209150326797385, + "eval_loss": 1.9613263607025146, + "eval_runtime": 2.3841, + "eval_samples_per_second": 815.829, + "eval_steps_per_second": 13.003, + "step": 227 + }, + { + "epoch": 25.313725490196077, + "eval_loss": 1.8806324005126953, + "eval_runtime": 2.4198, + "eval_samples_per_second": 803.779, + "eval_steps_per_second": 12.811, + "step": 228 + }, + { + "epoch": 25.41830065359477, + "eval_loss": 1.9231013059616089, + "eval_runtime": 2.3851, + "eval_samples_per_second": 815.485, + "eval_steps_per_second": 12.997, + "step": 229 + }, + { + "epoch": 25.522875816993466, + "eval_loss": 1.9021631479263306, + "eval_runtime": 2.3853, + "eval_samples_per_second": 815.428, + "eval_steps_per_second": 12.997, + "step": 230 + }, + { + "epoch": 25.627450980392158, + "eval_loss": 1.9682537317276, + "eval_runtime": 2.4444, + "eval_samples_per_second": 795.691, + "eval_steps_per_second": 12.682, + "step": 231 + }, + { + "epoch": 25.73202614379085, + "eval_loss": 1.9824862480163574, + "eval_runtime": 2.4349, + "eval_samples_per_second": 798.799, + "eval_steps_per_second": 12.732, + "step": 232 + }, + { + "epoch": 25.836601307189543, + "eval_loss": 1.962891936302185, + "eval_runtime": 2.411, + "eval_samples_per_second": 806.729, + "eval_steps_per_second": 12.858, + "step": 233 + }, + { + "epoch": 25.941176470588236, + "eval_loss": 1.9116088151931763, + "eval_runtime": 2.4247, + "eval_samples_per_second": 802.172, + "eval_steps_per_second": 12.785, + "step": 234 + }, + { + "epoch": 25.941176470588236, + "grad_norm": 1.9028220176696777, + "learning_rate": 1.0212765957446808e-05, + "loss": 1.7749, + "step": 234 + }, + { + "epoch": 26.104575163398692, + "eval_loss": 1.9699651002883911, + "eval_runtime": 2.4102, + "eval_samples_per_second": 806.995, + "eval_steps_per_second": 12.862, + "step": 235 + }, + { + "epoch": 26.209150326797385, + "eval_loss": 1.9811697006225586, + "eval_runtime": 2.386, + "eval_samples_per_second": 815.174, + "eval_steps_per_second": 12.992, + "step": 236 + }, + { + "epoch": 26.313725490196077, + "eval_loss": 1.9248907566070557, + "eval_runtime": 2.3932, + "eval_samples_per_second": 812.727, + "eval_steps_per_second": 12.953, + "step": 237 + }, + { + "epoch": 26.41830065359477, + "eval_loss": 1.9683917760849, + "eval_runtime": 2.3996, + "eval_samples_per_second": 810.537, + "eval_steps_per_second": 12.919, + "step": 238 + }, + { + "epoch": 26.522875816993466, + "eval_loss": 1.9604750871658325, + "eval_runtime": 2.3993, + "eval_samples_per_second": 810.666, + "eval_steps_per_second": 12.921, + "step": 239 + }, + { + "epoch": 26.627450980392158, + "eval_loss": 1.8918408155441284, + "eval_runtime": 2.4162, + "eval_samples_per_second": 804.968, + "eval_steps_per_second": 12.83, + "step": 240 + }, + { + "epoch": 26.73202614379085, + "eval_loss": 1.9443118572235107, + "eval_runtime": 2.3904, + "eval_samples_per_second": 813.679, + "eval_steps_per_second": 12.969, + "step": 241 + }, + { + "epoch": 26.836601307189543, + "eval_loss": 1.9147528409957886, + "eval_runtime": 2.4305, + "eval_samples_per_second": 800.261, + "eval_steps_per_second": 12.755, + "step": 242 + }, + { + "epoch": 26.941176470588236, + "eval_loss": 1.8974157571792603, + "eval_runtime": 2.4514, + "eval_samples_per_second": 793.417, + "eval_steps_per_second": 12.646, + "step": 243 + }, + { + "epoch": 26.941176470588236, + "grad_norm": 1.7782148122787476, + "learning_rate": 9.787234042553192e-06, + "loss": 1.8022, + "step": 243 + }, + { + "epoch": 27.104575163398692, + "eval_loss": 1.9711873531341553, + "eval_runtime": 2.4377, + "eval_samples_per_second": 797.883, + "eval_steps_per_second": 12.717, + "step": 244 + }, + { + "epoch": 27.209150326797385, + "eval_loss": 1.9718581438064575, + "eval_runtime": 2.404, + "eval_samples_per_second": 809.082, + "eval_steps_per_second": 12.895, + "step": 245 + }, + { + "epoch": 27.313725490196077, + "eval_loss": 1.9540036916732788, + "eval_runtime": 2.411, + "eval_samples_per_second": 806.731, + "eval_steps_per_second": 12.858, + "step": 246 + }, + { + "epoch": 27.41830065359477, + "eval_loss": 1.8907063007354736, + "eval_runtime": 2.3875, + "eval_samples_per_second": 814.658, + "eval_steps_per_second": 12.984, + "step": 247 + }, + { + "epoch": 27.522875816993466, + "eval_loss": 1.9907869100570679, + "eval_runtime": 2.3956, + "eval_samples_per_second": 811.916, + "eval_steps_per_second": 12.941, + "step": 248 + }, + { + "epoch": 27.627450980392158, + "eval_loss": 1.9273970127105713, + "eval_runtime": 2.3922, + "eval_samples_per_second": 813.044, + "eval_steps_per_second": 12.959, + "step": 249 + }, + { + "epoch": 27.73202614379085, + "eval_loss": 1.9233652353286743, + "eval_runtime": 2.3912, + "eval_samples_per_second": 813.399, + "eval_steps_per_second": 12.964, + "step": 250 + }, + { + "epoch": 27.836601307189543, + "eval_loss": 1.9580994844436646, + "eval_runtime": 2.3875, + "eval_samples_per_second": 814.648, + "eval_steps_per_second": 12.984, + "step": 251 + }, + { + "epoch": 27.941176470588236, + "eval_loss": 1.9409220218658447, + "eval_runtime": 2.4743, + "eval_samples_per_second": 786.095, + "eval_steps_per_second": 12.529, + "step": 252 + }, + { + "epoch": 27.941176470588236, + "grad_norm": 1.8487893342971802, + "learning_rate": 9.361702127659576e-06, + "loss": 1.7879, + "step": 252 + }, + { + "epoch": 28.104575163398692, + "eval_loss": 1.8715720176696777, + "eval_runtime": 2.4867, + "eval_samples_per_second": 782.152, + "eval_steps_per_second": 12.466, + "step": 253 + }, + { + "epoch": 28.209150326797385, + "eval_loss": 1.9945265054702759, + "eval_runtime": 2.4913, + "eval_samples_per_second": 780.713, + "eval_steps_per_second": 12.443, + "step": 254 + }, + { + "epoch": 28.313725490196077, + "eval_loss": 1.8658331632614136, + "eval_runtime": 2.4682, + "eval_samples_per_second": 788.013, + "eval_steps_per_second": 12.56, + "step": 255 + }, + { + "epoch": 28.41830065359477, + "eval_loss": 1.946846604347229, + "eval_runtime": 2.4685, + "eval_samples_per_second": 787.912, + "eval_steps_per_second": 12.558, + "step": 256 + }, + { + "epoch": 28.522875816993466, + "eval_loss": 1.9456650018692017, + "eval_runtime": 2.4963, + "eval_samples_per_second": 779.142, + "eval_steps_per_second": 12.418, + "step": 257 + }, + { + "epoch": 28.627450980392158, + "eval_loss": 1.9555299282073975, + "eval_runtime": 2.467, + "eval_samples_per_second": 788.417, + "eval_steps_per_second": 12.566, + "step": 258 + }, + { + "epoch": 28.73202614379085, + "eval_loss": 1.9544572830200195, + "eval_runtime": 2.4342, + "eval_samples_per_second": 799.038, + "eval_steps_per_second": 12.735, + "step": 259 + }, + { + "epoch": 28.836601307189543, + "eval_loss": 1.9225515127182007, + "eval_runtime": 2.3903, + "eval_samples_per_second": 813.707, + "eval_steps_per_second": 12.969, + "step": 260 + }, + { + "epoch": 28.941176470588236, + "eval_loss": 1.9331358671188354, + "eval_runtime": 2.3878, + "eval_samples_per_second": 814.568, + "eval_steps_per_second": 12.983, + "step": 261 + }, + { + "epoch": 28.941176470588236, + "grad_norm": 1.77451491355896, + "learning_rate": 8.936170212765958e-06, + "loss": 1.8019, + "step": 261 + }, + { + "epoch": 29.104575163398692, + "eval_loss": 1.9785720109939575, + "eval_runtime": 2.4363, + "eval_samples_per_second": 798.34, + "eval_steps_per_second": 12.724, + "step": 262 + }, + { + "epoch": 29.209150326797385, + "eval_loss": 1.9767541885375977, + "eval_runtime": 2.4358, + "eval_samples_per_second": 798.515, + "eval_steps_per_second": 12.727, + "step": 263 + }, + { + "epoch": 29.313725490196077, + "eval_loss": 1.9601216316223145, + "eval_runtime": 2.4048, + "eval_samples_per_second": 808.808, + "eval_steps_per_second": 12.891, + "step": 264 + }, + { + "epoch": 29.41830065359477, + "eval_loss": 1.917155385017395, + "eval_runtime": 2.4141, + "eval_samples_per_second": 805.697, + "eval_steps_per_second": 12.841, + "step": 265 + }, + { + "epoch": 29.522875816993466, + "eval_loss": 1.922187328338623, + "eval_runtime": 2.4093, + "eval_samples_per_second": 807.305, + "eval_steps_per_second": 12.867, + "step": 266 + }, + { + "epoch": 29.627450980392158, + "eval_loss": 1.918397307395935, + "eval_runtime": 2.3948, + "eval_samples_per_second": 812.187, + "eval_steps_per_second": 12.945, + "step": 267 + }, + { + "epoch": 29.73202614379085, + "eval_loss": 1.882236123085022, + "eval_runtime": 2.3912, + "eval_samples_per_second": 813.4, + "eval_steps_per_second": 12.964, + "step": 268 + }, + { + "epoch": 29.836601307189543, + "eval_loss": 1.916178822517395, + "eval_runtime": 2.3869, + "eval_samples_per_second": 814.874, + "eval_steps_per_second": 12.988, + "step": 269 + }, + { + "epoch": 29.941176470588236, + "eval_loss": 1.9769715070724487, + "eval_runtime": 2.3858, + "eval_samples_per_second": 815.249, + "eval_steps_per_second": 12.994, + "step": 270 + }, + { + "epoch": 29.941176470588236, + "grad_norm": 2.04988431930542, + "learning_rate": 8.510638297872341e-06, + "loss": 1.7614, + "step": 270 + }, + { + "epoch": 30.104575163398692, + "eval_loss": 1.9033125638961792, + "eval_runtime": 2.4701, + "eval_samples_per_second": 787.431, + "eval_steps_per_second": 12.55, + "step": 271 + }, + { + "epoch": 30.209150326797385, + "eval_loss": 1.9454644918441772, + "eval_runtime": 2.4111, + "eval_samples_per_second": 806.698, + "eval_steps_per_second": 12.857, + "step": 272 + }, + { + "epoch": 30.313725490196077, + "eval_loss": 1.9106584787368774, + "eval_runtime": 2.4243, + "eval_samples_per_second": 802.304, + "eval_steps_per_second": 12.787, + "step": 273 + }, + { + "epoch": 30.41830065359477, + "eval_loss": 1.9812813997268677, + "eval_runtime": 2.4148, + "eval_samples_per_second": 805.466, + "eval_steps_per_second": 12.838, + "step": 274 + }, + { + "epoch": 30.522875816993466, + "eval_loss": 1.9426772594451904, + "eval_runtime": 2.3997, + "eval_samples_per_second": 810.526, + "eval_steps_per_second": 12.918, + "step": 275 + }, + { + "epoch": 30.627450980392158, + "eval_loss": 1.9499095678329468, + "eval_runtime": 2.3957, + "eval_samples_per_second": 811.863, + "eval_steps_per_second": 12.94, + "step": 276 + }, + { + "epoch": 30.73202614379085, + "eval_loss": 1.961235523223877, + "eval_runtime": 2.3821, + "eval_samples_per_second": 816.509, + "eval_steps_per_second": 13.014, + "step": 277 + }, + { + "epoch": 30.836601307189543, + "eval_loss": 1.9450849294662476, + "eval_runtime": 2.3942, + "eval_samples_per_second": 812.387, + "eval_steps_per_second": 12.948, + "step": 278 + }, + { + "epoch": 30.941176470588236, + "eval_loss": 1.9132739305496216, + "eval_runtime": 2.4136, + "eval_samples_per_second": 805.836, + "eval_steps_per_second": 12.844, + "step": 279 + }, + { + "epoch": 30.941176470588236, + "grad_norm": 1.8763809204101562, + "learning_rate": 8.085106382978723e-06, + "loss": 1.7619, + "step": 279 + }, + { + "epoch": 31.104575163398692, + "eval_loss": 1.9205107688903809, + "eval_runtime": 2.4332, + "eval_samples_per_second": 799.374, + "eval_steps_per_second": 12.741, + "step": 280 + }, + { + "epoch": 31.209150326797385, + "eval_loss": 1.946841835975647, + "eval_runtime": 2.464, + "eval_samples_per_second": 789.356, + "eval_steps_per_second": 12.581, + "step": 281 + }, + { + "epoch": 31.313725490196077, + "eval_loss": 1.9464671611785889, + "eval_runtime": 2.4074, + "eval_samples_per_second": 807.917, + "eval_steps_per_second": 12.877, + "step": 282 + }, + { + "epoch": 31.41830065359477, + "eval_loss": 1.8833441734313965, + "eval_runtime": 2.4082, + "eval_samples_per_second": 807.64, + "eval_steps_per_second": 12.872, + "step": 283 + }, + { + "epoch": 31.522875816993466, + "eval_loss": 1.9414160251617432, + "eval_runtime": 2.407, + "eval_samples_per_second": 808.072, + "eval_steps_per_second": 12.879, + "step": 284 + }, + { + "epoch": 31.627450980392158, + "eval_loss": 1.9485697746276855, + "eval_runtime": 2.4025, + "eval_samples_per_second": 809.577, + "eval_steps_per_second": 12.903, + "step": 285 + }, + { + "epoch": 31.73202614379085, + "eval_loss": 1.9184815883636475, + "eval_runtime": 2.3828, + "eval_samples_per_second": 816.275, + "eval_steps_per_second": 13.01, + "step": 286 + }, + { + "epoch": 31.836601307189543, + "eval_loss": 1.9518897533416748, + "eval_runtime": 2.4096, + "eval_samples_per_second": 807.189, + "eval_steps_per_second": 12.865, + "step": 287 + }, + { + "epoch": 31.941176470588236, + "eval_loss": 1.9385578632354736, + "eval_runtime": 2.4088, + "eval_samples_per_second": 807.447, + "eval_steps_per_second": 12.869, + "step": 288 + }, + { + "epoch": 31.941176470588236, + "grad_norm": 1.8053061962127686, + "learning_rate": 7.659574468085107e-06, + "loss": 1.7713, + "step": 288 + }, + { + "epoch": 32.10457516339869, + "eval_loss": 1.8966560363769531, + "eval_runtime": 2.4403, + "eval_samples_per_second": 797.048, + "eval_steps_per_second": 12.704, + "step": 289 + }, + { + "epoch": 32.209150326797385, + "eval_loss": 1.9649851322174072, + "eval_runtime": 2.4092, + "eval_samples_per_second": 807.307, + "eval_steps_per_second": 12.867, + "step": 290 + }, + { + "epoch": 32.31372549019608, + "eval_loss": 1.919927954673767, + "eval_runtime": 2.41, + "eval_samples_per_second": 807.057, + "eval_steps_per_second": 12.863, + "step": 291 + }, + { + "epoch": 32.41830065359477, + "eval_loss": 1.9147096872329712, + "eval_runtime": 2.4127, + "eval_samples_per_second": 806.134, + "eval_steps_per_second": 12.848, + "step": 292 + }, + { + "epoch": 32.52287581699346, + "eval_loss": 1.9159774780273438, + "eval_runtime": 2.4052, + "eval_samples_per_second": 808.673, + "eval_steps_per_second": 12.889, + "step": 293 + }, + { + "epoch": 32.627450980392155, + "eval_loss": 1.9953843355178833, + "eval_runtime": 2.3906, + "eval_samples_per_second": 813.616, + "eval_steps_per_second": 12.968, + "step": 294 + }, + { + "epoch": 32.73202614379085, + "eval_loss": 1.92180597782135, + "eval_runtime": 2.4249, + "eval_samples_per_second": 802.081, + "eval_steps_per_second": 12.784, + "step": 295 + }, + { + "epoch": 32.83660130718954, + "eval_loss": 1.9786967039108276, + "eval_runtime": 2.3846, + "eval_samples_per_second": 815.658, + "eval_steps_per_second": 13.0, + "step": 296 + }, + { + "epoch": 32.94117647058823, + "eval_loss": 1.9362424612045288, + "eval_runtime": 2.3879, + "eval_samples_per_second": 814.534, + "eval_steps_per_second": 12.982, + "step": 297 + }, + { + "epoch": 32.94117647058823, + "grad_norm": 2.0353338718414307, + "learning_rate": 7.234042553191491e-06, + "loss": 1.7635, + "step": 297 + }, + { + "epoch": 33.10457516339869, + "eval_loss": 1.9281338453292847, + "eval_runtime": 2.4396, + "eval_samples_per_second": 797.251, + "eval_steps_per_second": 12.707, + "step": 298 + }, + { + "epoch": 33.209150326797385, + "eval_loss": 1.921140193939209, + "eval_runtime": 2.4118, + "eval_samples_per_second": 806.459, + "eval_steps_per_second": 12.854, + "step": 299 + }, + { + "epoch": 33.31372549019608, + "eval_loss": 1.9680968523025513, + "eval_runtime": 2.4046, + "eval_samples_per_second": 808.854, + "eval_steps_per_second": 12.892, + "step": 300 + }, + { + "epoch": 33.41830065359477, + "eval_loss": 1.9094316959381104, + "eval_runtime": 2.4065, + "eval_samples_per_second": 808.218, + "eval_steps_per_second": 12.882, + "step": 301 + }, + { + "epoch": 33.52287581699346, + "eval_loss": 1.9845983982086182, + "eval_runtime": 2.4045, + "eval_samples_per_second": 808.905, + "eval_steps_per_second": 12.893, + "step": 302 + }, + { + "epoch": 33.627450980392155, + "eval_loss": 1.9461405277252197, + "eval_runtime": 2.4098, + "eval_samples_per_second": 807.107, + "eval_steps_per_second": 12.864, + "step": 303 + }, + { + "epoch": 33.73202614379085, + "eval_loss": 1.8947722911834717, + "eval_runtime": 2.392, + "eval_samples_per_second": 813.124, + "eval_steps_per_second": 12.96, + "step": 304 + }, + { + "epoch": 33.83660130718954, + "eval_loss": 1.9371235370635986, + "eval_runtime": 2.3855, + "eval_samples_per_second": 815.347, + "eval_steps_per_second": 12.995, + "step": 305 + }, + { + "epoch": 33.94117647058823, + "eval_loss": 2.006459951400757, + "eval_runtime": 2.4252, + "eval_samples_per_second": 802.01, + "eval_steps_per_second": 12.783, + "step": 306 + }, + { + "epoch": 33.94117647058823, + "grad_norm": 1.884189486503601, + "learning_rate": 6.808510638297873e-06, + "loss": 1.7394, + "step": 306 + }, + { + "epoch": 34.10457516339869, + "eval_loss": 1.9282273054122925, + "eval_runtime": 2.4222, + "eval_samples_per_second": 802.994, + "eval_steps_per_second": 12.798, + "step": 307 + }, + { + "epoch": 34.209150326797385, + "eval_loss": 1.9412920475006104, + "eval_runtime": 2.5082, + "eval_samples_per_second": 775.463, + "eval_steps_per_second": 12.36, + "step": 308 + }, + { + "epoch": 34.31372549019608, + "eval_loss": 1.9883979558944702, + "eval_runtime": 2.5353, + "eval_samples_per_second": 767.18, + "eval_steps_per_second": 12.228, + "step": 309 + }, + { + "epoch": 34.41830065359477, + "eval_loss": 1.917364239692688, + "eval_runtime": 2.4621, + "eval_samples_per_second": 789.978, + "eval_steps_per_second": 12.591, + "step": 310 + }, + { + "epoch": 34.52287581699346, + "eval_loss": 1.9594651460647583, + "eval_runtime": 2.5176, + "eval_samples_per_second": 772.567, + "eval_steps_per_second": 12.313, + "step": 311 + }, + { + "epoch": 34.627450980392155, + "eval_loss": 1.9423621892929077, + "eval_runtime": 2.5063, + "eval_samples_per_second": 776.037, + "eval_steps_per_second": 12.369, + "step": 312 + }, + { + "epoch": 34.73202614379085, + "eval_loss": 1.9494574069976807, + "eval_runtime": 2.4032, + "eval_samples_per_second": 809.321, + "eval_steps_per_second": 12.899, + "step": 313 + }, + { + "epoch": 34.83660130718954, + "eval_loss": 1.9160590171813965, + "eval_runtime": 2.3866, + "eval_samples_per_second": 814.959, + "eval_steps_per_second": 12.989, + "step": 314 + }, + { + "epoch": 34.94117647058823, + "eval_loss": 1.963183879852295, + "eval_runtime": 2.3866, + "eval_samples_per_second": 814.96, + "eval_steps_per_second": 12.989, + "step": 315 + }, + { + "epoch": 34.94117647058823, + "grad_norm": 1.7952407598495483, + "learning_rate": 6.382978723404256e-06, + "loss": 1.7434, + "step": 315 + }, + { + "epoch": 35.10457516339869, + "eval_loss": 1.9129880666732788, + "eval_runtime": 2.3856, + "eval_samples_per_second": 815.294, + "eval_steps_per_second": 12.994, + "step": 316 + }, + { + "epoch": 35.209150326797385, + "eval_loss": 1.9850044250488281, + "eval_runtime": 2.4394, + "eval_samples_per_second": 797.311, + "eval_steps_per_second": 12.708, + "step": 317 + }, + { + "epoch": 35.31372549019608, + "eval_loss": 1.9291285276412964, + "eval_runtime": 2.3856, + "eval_samples_per_second": 815.319, + "eval_steps_per_second": 12.995, + "step": 318 + }, + { + "epoch": 35.41830065359477, + "eval_loss": 1.9300141334533691, + "eval_runtime": 2.5153, + "eval_samples_per_second": 773.266, + "eval_steps_per_second": 12.325, + "step": 319 + }, + { + "epoch": 35.52287581699346, + "eval_loss": 1.9399768114089966, + "eval_runtime": 2.4758, + "eval_samples_per_second": 785.618, + "eval_steps_per_second": 12.521, + "step": 320 + }, + { + "epoch": 35.627450980392155, + "eval_loss": 1.9735783338546753, + "eval_runtime": 2.4325, + "eval_samples_per_second": 799.598, + "eval_steps_per_second": 12.744, + "step": 321 + }, + { + "epoch": 35.73202614379085, + "eval_loss": 1.9033024311065674, + "eval_runtime": 2.4152, + "eval_samples_per_second": 805.309, + "eval_steps_per_second": 12.835, + "step": 322 + }, + { + "epoch": 35.83660130718954, + "eval_loss": 1.9249202013015747, + "eval_runtime": 2.4104, + "eval_samples_per_second": 806.927, + "eval_steps_per_second": 12.861, + "step": 323 + }, + { + "epoch": 35.94117647058823, + "eval_loss": 1.9796316623687744, + "eval_runtime": 2.4374, + "eval_samples_per_second": 797.996, + "eval_steps_per_second": 12.719, + "step": 324 + }, + { + "epoch": 35.94117647058823, + "grad_norm": 1.8199615478515625, + "learning_rate": 5.957446808510638e-06, + "loss": 1.7578, + "step": 324 + }, + { + "epoch": 36.10457516339869, + "eval_loss": 1.9595942497253418, + "eval_runtime": 2.3871, + "eval_samples_per_second": 814.78, + "eval_steps_per_second": 12.986, + "step": 325 + }, + { + "epoch": 36.209150326797385, + "eval_loss": 1.9293735027313232, + "eval_runtime": 2.3875, + "eval_samples_per_second": 814.674, + "eval_steps_per_second": 12.985, + "step": 326 + }, + { + "epoch": 36.31372549019608, + "eval_loss": 1.957201600074768, + "eval_runtime": 2.3892, + "eval_samples_per_second": 814.083, + "eval_steps_per_second": 12.975, + "step": 327 + }, + { + "epoch": 36.41830065359477, + "eval_loss": 1.9536631107330322, + "eval_runtime": 2.4048, + "eval_samples_per_second": 808.796, + "eval_steps_per_second": 12.891, + "step": 328 + }, + { + "epoch": 36.52287581699346, + "eval_loss": 1.974502682685852, + "eval_runtime": 2.3928, + "eval_samples_per_second": 812.843, + "eval_steps_per_second": 12.955, + "step": 329 + }, + { + "epoch": 36.627450980392155, + "eval_loss": 1.9568116664886475, + "eval_runtime": 2.4183, + "eval_samples_per_second": 804.288, + "eval_steps_per_second": 12.819, + "step": 330 + }, + { + "epoch": 36.73202614379085, + "eval_loss": 1.9689034223556519, + "eval_runtime": 2.4676, + "eval_samples_per_second": 788.229, + "eval_steps_per_second": 12.563, + "step": 331 + }, + { + "epoch": 36.83660130718954, + "eval_loss": 1.9140371084213257, + "eval_runtime": 2.4088, + "eval_samples_per_second": 807.459, + "eval_steps_per_second": 12.87, + "step": 332 + }, + { + "epoch": 36.94117647058823, + "eval_loss": 1.929794192314148, + "eval_runtime": 2.4119, + "eval_samples_per_second": 806.412, + "eval_steps_per_second": 12.853, + "step": 333 + }, + { + "epoch": 36.94117647058823, + "grad_norm": 1.8074049949645996, + "learning_rate": 5.531914893617022e-06, + "loss": 1.7497, + "step": 333 + }, + { + "epoch": 37.10457516339869, + "eval_loss": 1.9698741436004639, + "eval_runtime": 2.4082, + "eval_samples_per_second": 807.67, + "eval_steps_per_second": 12.873, + "step": 334 + }, + { + "epoch": 37.209150326797385, + "eval_loss": 1.90766441822052, + "eval_runtime": 2.3974, + "eval_samples_per_second": 811.298, + "eval_steps_per_second": 12.931, + "step": 335 + }, + { + "epoch": 37.31372549019608, + "eval_loss": 1.9559139013290405, + "eval_runtime": 2.3918, + "eval_samples_per_second": 813.182, + "eval_steps_per_second": 12.961, + "step": 336 + }, + { + "epoch": 37.41830065359477, + "eval_loss": 1.9621520042419434, + "eval_runtime": 2.4126, + "eval_samples_per_second": 806.189, + "eval_steps_per_second": 12.849, + "step": 337 + }, + { + "epoch": 37.52287581699346, + "eval_loss": 1.9238826036453247, + "eval_runtime": 2.3925, + "eval_samples_per_second": 812.943, + "eval_steps_per_second": 12.957, + "step": 338 + }, + { + "epoch": 37.627450980392155, + "eval_loss": 1.9738985300064087, + "eval_runtime": 2.4262, + "eval_samples_per_second": 801.666, + "eval_steps_per_second": 12.777, + "step": 339 + }, + { + "epoch": 37.73202614379085, + "eval_loss": 1.936599612236023, + "eval_runtime": 2.446, + "eval_samples_per_second": 795.165, + "eval_steps_per_second": 12.674, + "step": 340 + }, + { + "epoch": 37.83660130718954, + "eval_loss": 1.9857661724090576, + "eval_runtime": 2.4387, + "eval_samples_per_second": 797.57, + "eval_steps_per_second": 12.712, + "step": 341 + }, + { + "epoch": 37.94117647058823, + "eval_loss": 1.9602775573730469, + "eval_runtime": 2.4623, + "eval_samples_per_second": 789.898, + "eval_steps_per_second": 12.59, + "step": 342 + }, + { + "epoch": 37.94117647058823, + "grad_norm": 1.8968150615692139, + "learning_rate": 5.106382978723404e-06, + "loss": 1.7378, + "step": 342 + }, + { + "epoch": 38.10457516339869, + "eval_loss": 1.9392098188400269, + "eval_runtime": 2.5445, + "eval_samples_per_second": 764.389, + "eval_steps_per_second": 12.183, + "step": 343 + }, + { + "epoch": 38.209150326797385, + "eval_loss": 1.9554734230041504, + "eval_runtime": 2.4153, + "eval_samples_per_second": 805.294, + "eval_steps_per_second": 12.835, + "step": 344 + }, + { + "epoch": 38.31372549019608, + "eval_loss": 1.9802982807159424, + "eval_runtime": 2.4043, + "eval_samples_per_second": 808.971, + "eval_steps_per_second": 12.894, + "step": 345 + }, + { + "epoch": 38.41830065359477, + "eval_loss": 1.950205683708191, + "eval_runtime": 2.408, + "eval_samples_per_second": 807.739, + "eval_steps_per_second": 12.874, + "step": 346 + }, + { + "epoch": 38.52287581699346, + "eval_loss": 1.959083914756775, + "eval_runtime": 2.3857, + "eval_samples_per_second": 815.264, + "eval_steps_per_second": 12.994, + "step": 347 + }, + { + "epoch": 38.627450980392155, + "eval_loss": 1.9582518339157104, + "eval_runtime": 2.4306, + "eval_samples_per_second": 800.213, + "eval_steps_per_second": 12.754, + "step": 348 + }, + { + "epoch": 38.73202614379085, + "eval_loss": 1.9507373571395874, + "eval_runtime": 2.3863, + "eval_samples_per_second": 815.084, + "eval_steps_per_second": 12.991, + "step": 349 + }, + { + "epoch": 38.83660130718954, + "eval_loss": 1.9410823583602905, + "eval_runtime": 2.4092, + "eval_samples_per_second": 807.311, + "eval_steps_per_second": 12.867, + "step": 350 + }, + { + "epoch": 38.94117647058823, + "eval_loss": 1.922089695930481, + "eval_runtime": 2.4113, + "eval_samples_per_second": 806.625, + "eval_steps_per_second": 12.856, + "step": 351 + }, + { + "epoch": 38.94117647058823, + "grad_norm": 1.8066309690475464, + "learning_rate": 4.680851063829788e-06, + "loss": 1.7324, + "step": 351 + }, + { + "epoch": 39.10457516339869, + "eval_loss": 1.9468454122543335, + "eval_runtime": 2.3833, + "eval_samples_per_second": 816.093, + "eval_steps_per_second": 13.007, + "step": 352 + }, + { + "epoch": 39.209150326797385, + "eval_loss": 1.9370498657226562, + "eval_runtime": 2.3853, + "eval_samples_per_second": 815.41, + "eval_steps_per_second": 12.996, + "step": 353 + }, + { + "epoch": 39.31372549019608, + "eval_loss": 1.9278494119644165, + "eval_runtime": 2.4316, + "eval_samples_per_second": 799.901, + "eval_steps_per_second": 12.749, + "step": 354 + }, + { + "epoch": 39.41830065359477, + "eval_loss": 1.9604259729385376, + "eval_runtime": 2.4996, + "eval_samples_per_second": 778.116, + "eval_steps_per_second": 12.402, + "step": 355 + }, + { + "epoch": 39.52287581699346, + "eval_loss": 1.9375855922698975, + "eval_runtime": 2.4112, + "eval_samples_per_second": 806.646, + "eval_steps_per_second": 12.857, + "step": 356 + }, + { + "epoch": 39.627450980392155, + "eval_loss": 1.9473201036453247, + "eval_runtime": 2.4652, + "eval_samples_per_second": 788.988, + "eval_steps_per_second": 12.575, + "step": 357 + }, + { + "epoch": 39.73202614379085, + "eval_loss": 1.9490294456481934, + "eval_runtime": 2.4617, + "eval_samples_per_second": 790.102, + "eval_steps_per_second": 12.593, + "step": 358 + }, + { + "epoch": 39.83660130718954, + "eval_loss": 1.913360834121704, + "eval_runtime": 2.445, + "eval_samples_per_second": 795.514, + "eval_steps_per_second": 12.679, + "step": 359 + }, + { + "epoch": 39.94117647058823, + "eval_loss": 1.9323056936264038, + "eval_runtime": 2.4383, + "eval_samples_per_second": 797.685, + "eval_steps_per_second": 12.714, + "step": 360 + }, + { + "epoch": 39.94117647058823, + "grad_norm": 1.7927449941635132, + "learning_rate": 4.255319148936171e-06, + "loss": 1.7195, + "step": 360 + }, + { + "epoch": 40.10457516339869, + "eval_loss": 1.9119060039520264, + "eval_runtime": 2.4209, + "eval_samples_per_second": 803.426, + "eval_steps_per_second": 12.805, + "step": 361 + }, + { + "epoch": 40.209150326797385, + "eval_loss": 1.939374327659607, + "eval_runtime": 2.4652, + "eval_samples_per_second": 788.974, + "eval_steps_per_second": 12.575, + "step": 362 + }, + { + "epoch": 40.31372549019608, + "eval_loss": 1.9959666728973389, + "eval_runtime": 2.3847, + "eval_samples_per_second": 815.601, + "eval_steps_per_second": 12.999, + "step": 363 + }, + { + "epoch": 40.41830065359477, + "eval_loss": 1.9789413213729858, + "eval_runtime": 2.3893, + "eval_samples_per_second": 814.046, + "eval_steps_per_second": 12.975, + "step": 364 + }, + { + "epoch": 40.52287581699346, + "eval_loss": 1.9750434160232544, + "eval_runtime": 2.3871, + "eval_samples_per_second": 814.781, + "eval_steps_per_second": 12.986, + "step": 365 + }, + { + "epoch": 40.627450980392155, + "eval_loss": 1.9399486780166626, + "eval_runtime": 2.4366, + "eval_samples_per_second": 798.229, + "eval_steps_per_second": 12.722, + "step": 366 + }, + { + "epoch": 40.73202614379085, + "eval_loss": 1.9516425132751465, + "eval_runtime": 2.4136, + "eval_samples_per_second": 805.85, + "eval_steps_per_second": 12.844, + "step": 367 + }, + { + "epoch": 40.83660130718954, + "eval_loss": 1.941023826599121, + "eval_runtime": 2.4094, + "eval_samples_per_second": 807.251, + "eval_steps_per_second": 12.866, + "step": 368 + }, + { + "epoch": 40.94117647058823, + "eval_loss": 1.9317693710327148, + "eval_runtime": 2.4091, + "eval_samples_per_second": 807.345, + "eval_steps_per_second": 12.868, + "step": 369 + }, + { + "epoch": 40.94117647058823, + "grad_norm": 1.7872660160064697, + "learning_rate": 3.8297872340425535e-06, + "loss": 1.7043, + "step": 369 + }, + { + "epoch": 41.10457516339869, + "eval_loss": 1.9890044927597046, + "eval_runtime": 2.414, + "eval_samples_per_second": 805.708, + "eval_steps_per_second": 12.842, + "step": 370 + }, + { + "epoch": 41.209150326797385, + "eval_loss": 1.9840960502624512, + "eval_runtime": 2.3959, + "eval_samples_per_second": 811.809, + "eval_steps_per_second": 12.939, + "step": 371 + }, + { + "epoch": 41.31372549019608, + "eval_loss": 1.9188443422317505, + "eval_runtime": 2.4069, + "eval_samples_per_second": 808.088, + "eval_steps_per_second": 12.88, + "step": 372 + }, + { + "epoch": 41.41830065359477, + "eval_loss": 1.9614779949188232, + "eval_runtime": 2.3886, + "eval_samples_per_second": 814.274, + "eval_steps_per_second": 12.978, + "step": 373 + }, + { + "epoch": 41.52287581699346, + "eval_loss": 1.9061365127563477, + "eval_runtime": 2.3924, + "eval_samples_per_second": 812.981, + "eval_steps_per_second": 12.958, + "step": 374 + }, + { + "epoch": 41.627450980392155, + "eval_loss": 1.909993052482605, + "eval_runtime": 2.3858, + "eval_samples_per_second": 815.236, + "eval_steps_per_second": 12.993, + "step": 375 + }, + { + "epoch": 41.73202614379085, + "eval_loss": 1.9422426223754883, + "eval_runtime": 2.3888, + "eval_samples_per_second": 814.201, + "eval_steps_per_second": 12.977, + "step": 376 + }, + { + "epoch": 41.83660130718954, + "eval_loss": 1.9640315771102905, + "eval_runtime": 2.4376, + "eval_samples_per_second": 797.911, + "eval_steps_per_second": 12.717, + "step": 377 + }, + { + "epoch": 41.94117647058823, + "eval_loss": 1.917662262916565, + "eval_runtime": 2.4538, + "eval_samples_per_second": 792.647, + "eval_steps_per_second": 12.633, + "step": 378 + }, + { + "epoch": 41.94117647058823, + "grad_norm": 1.7721134424209595, + "learning_rate": 3.4042553191489363e-06, + "loss": 1.7169, + "step": 378 + }, + { + "epoch": 42.10457516339869, + "eval_loss": 1.9163570404052734, + "eval_runtime": 2.4281, + "eval_samples_per_second": 801.034, + "eval_steps_per_second": 12.767, + "step": 379 + }, + { + "epoch": 42.209150326797385, + "eval_loss": 1.9374709129333496, + "eval_runtime": 2.4192, + "eval_samples_per_second": 803.999, + "eval_steps_per_second": 12.814, + "step": 380 + }, + { + "epoch": 42.31372549019608, + "eval_loss": 1.9525771141052246, + "eval_runtime": 2.4174, + "eval_samples_per_second": 804.581, + "eval_steps_per_second": 12.824, + "step": 381 + }, + { + "epoch": 42.41830065359477, + "eval_loss": 1.938783884048462, + "eval_runtime": 2.3891, + "eval_samples_per_second": 814.119, + "eval_steps_per_second": 12.976, + "step": 382 + }, + { + "epoch": 42.52287581699346, + "eval_loss": 1.9378857612609863, + "eval_runtime": 2.3879, + "eval_samples_per_second": 814.515, + "eval_steps_per_second": 12.982, + "step": 383 + }, + { + "epoch": 42.627450980392155, + "eval_loss": 1.931535243988037, + "eval_runtime": 2.3924, + "eval_samples_per_second": 812.994, + "eval_steps_per_second": 12.958, + "step": 384 + }, + { + "epoch": 42.73202614379085, + "eval_loss": 1.9418144226074219, + "eval_runtime": 2.3917, + "eval_samples_per_second": 813.219, + "eval_steps_per_second": 12.961, + "step": 385 + }, + { + "epoch": 42.83660130718954, + "eval_loss": 1.9460214376449585, + "eval_runtime": 2.417, + "eval_samples_per_second": 804.722, + "eval_steps_per_second": 12.826, + "step": 386 + }, + { + "epoch": 42.94117647058823, + "eval_loss": 1.9129729270935059, + "eval_runtime": 2.3889, + "eval_samples_per_second": 814.19, + "eval_steps_per_second": 12.977, + "step": 387 + }, + { + "epoch": 42.94117647058823, + "grad_norm": 1.8166015148162842, + "learning_rate": 2.978723404255319e-06, + "loss": 1.7315, + "step": 387 + }, + { + "epoch": 43.10457516339869, + "eval_loss": 1.9539881944656372, + "eval_runtime": 2.4601, + "eval_samples_per_second": 790.624, + "eval_steps_per_second": 12.601, + "step": 388 + }, + { + "epoch": 43.209150326797385, + "eval_loss": 1.951253890991211, + "eval_runtime": 2.4131, + "eval_samples_per_second": 806.019, + "eval_steps_per_second": 12.847, + "step": 389 + }, + { + "epoch": 43.31372549019608, + "eval_loss": 2.0078840255737305, + "eval_runtime": 2.4151, + "eval_samples_per_second": 805.365, + "eval_steps_per_second": 12.836, + "step": 390 + }, + { + "epoch": 43.41830065359477, + "eval_loss": 1.9754467010498047, + "eval_runtime": 2.409, + "eval_samples_per_second": 807.392, + "eval_steps_per_second": 12.868, + "step": 391 + }, + { + "epoch": 43.52287581699346, + "eval_loss": 1.972512125968933, + "eval_runtime": 2.4172, + "eval_samples_per_second": 804.635, + "eval_steps_per_second": 12.825, + "step": 392 + }, + { + "epoch": 43.627450980392155, + "eval_loss": 1.9601085186004639, + "eval_runtime": 2.4555, + "eval_samples_per_second": 792.113, + "eval_steps_per_second": 12.625, + "step": 393 + }, + { + "epoch": 43.73202614379085, + "eval_loss": 1.9266124963760376, + "eval_runtime": 2.4604, + "eval_samples_per_second": 790.516, + "eval_steps_per_second": 12.599, + "step": 394 + }, + { + "epoch": 43.83660130718954, + "eval_loss": 1.9546335935592651, + "eval_runtime": 2.467, + "eval_samples_per_second": 788.405, + "eval_steps_per_second": 12.566, + "step": 395 + }, + { + "epoch": 43.94117647058823, + "eval_loss": 1.918619990348816, + "eval_runtime": 2.4607, + "eval_samples_per_second": 790.428, + "eval_steps_per_second": 12.598, + "step": 396 + }, + { + "epoch": 43.94117647058823, + "grad_norm": 1.8490442037582397, + "learning_rate": 2.553191489361702e-06, + "loss": 1.7095, + "step": 396 + }, + { + "epoch": 44.10457516339869, + "eval_loss": 1.9544674158096313, + "eval_runtime": 2.5483, + "eval_samples_per_second": 763.254, + "eval_steps_per_second": 12.165, + "step": 397 + }, + { + "epoch": 44.209150326797385, + "eval_loss": 2.0218536853790283, + "eval_runtime": 2.578, + "eval_samples_per_second": 754.467, + "eval_steps_per_second": 12.025, + "step": 398 + }, + { + "epoch": 44.31372549019608, + "eval_loss": 1.9499460458755493, + "eval_runtime": 2.4731, + "eval_samples_per_second": 786.477, + "eval_steps_per_second": 12.535, + "step": 399 + }, + { + "epoch": 44.41830065359477, + "eval_loss": 1.9414080381393433, + "eval_runtime": 2.4805, + "eval_samples_per_second": 784.104, + "eval_steps_per_second": 12.497, + "step": 400 + }, + { + "epoch": 44.52287581699346, + "eval_loss": 1.961714506149292, + "eval_runtime": 2.4673, + "eval_samples_per_second": 788.301, + "eval_steps_per_second": 12.564, + "step": 401 + }, + { + "epoch": 44.627450980392155, + "eval_loss": 1.9939833879470825, + "eval_runtime": 2.4722, + "eval_samples_per_second": 786.751, + "eval_steps_per_second": 12.539, + "step": 402 + }, + { + "epoch": 44.73202614379085, + "eval_loss": 1.9617350101470947, + "eval_runtime": 2.4683, + "eval_samples_per_second": 787.977, + "eval_steps_per_second": 12.559, + "step": 403 + }, + { + "epoch": 44.83660130718954, + "eval_loss": 1.9692201614379883, + "eval_runtime": 2.4374, + "eval_samples_per_second": 797.97, + "eval_steps_per_second": 12.718, + "step": 404 + }, + { + "epoch": 44.94117647058823, + "eval_loss": 1.9219003915786743, + "eval_runtime": 2.435, + "eval_samples_per_second": 798.78, + "eval_steps_per_second": 12.731, + "step": 405 + }, + { + "epoch": 44.94117647058823, + "grad_norm": 1.796848177909851, + "learning_rate": 2.1276595744680853e-06, + "loss": 1.7071, + "step": 405 + }, + { + "epoch": 45.10457516339869, + "eval_loss": 1.9611177444458008, + "eval_runtime": 2.4195, + "eval_samples_per_second": 803.881, + "eval_steps_per_second": 12.812, + "step": 406 + }, + { + "epoch": 45.209150326797385, + "eval_loss": 1.9778918027877808, + "eval_runtime": 2.4197, + "eval_samples_per_second": 803.821, + "eval_steps_per_second": 12.812, + "step": 407 + }, + { + "epoch": 45.31372549019608, + "eval_loss": 1.9238309860229492, + "eval_runtime": 2.4635, + "eval_samples_per_second": 789.519, + "eval_steps_per_second": 12.584, + "step": 408 + }, + { + "epoch": 45.41830065359477, + "eval_loss": 1.9089758396148682, + "eval_runtime": 2.4247, + "eval_samples_per_second": 802.146, + "eval_steps_per_second": 12.785, + "step": 409 + }, + { + "epoch": 45.52287581699346, + "eval_loss": 1.9342485666275024, + "eval_runtime": 2.4113, + "eval_samples_per_second": 806.609, + "eval_steps_per_second": 12.856, + "step": 410 + }, + { + "epoch": 45.627450980392155, + "eval_loss": 1.9936097860336304, + "eval_runtime": 2.4133, + "eval_samples_per_second": 805.937, + "eval_steps_per_second": 12.845, + "step": 411 + }, + { + "epoch": 45.73202614379085, + "eval_loss": 1.897844672203064, + "eval_runtime": 2.4034, + "eval_samples_per_second": 809.278, + "eval_steps_per_second": 12.899, + "step": 412 + }, + { + "epoch": 45.83660130718954, + "eval_loss": 1.9208406209945679, + "eval_runtime": 2.3918, + "eval_samples_per_second": 813.195, + "eval_steps_per_second": 12.961, + "step": 413 + }, + { + "epoch": 45.94117647058823, + "eval_loss": 1.9177494049072266, + "eval_runtime": 2.4571, + "eval_samples_per_second": 791.578, + "eval_steps_per_second": 12.616, + "step": 414 + }, + { + "epoch": 45.94117647058823, + "grad_norm": 1.7152032852172852, + "learning_rate": 1.7021276595744682e-06, + "loss": 1.7116, + "step": 414 + }, + { + "epoch": 46.10457516339869, + "eval_loss": 1.957858681678772, + "eval_runtime": 2.4339, + "eval_samples_per_second": 799.125, + "eval_steps_per_second": 12.737, + "step": 415 + }, + { + "epoch": 46.209150326797385, + "eval_loss": 1.9422305822372437, + "eval_runtime": 2.4497, + "eval_samples_per_second": 793.973, + "eval_steps_per_second": 12.655, + "step": 416 + }, + { + "epoch": 46.31372549019608, + "eval_loss": 1.9287089109420776, + "eval_runtime": 2.423, + "eval_samples_per_second": 802.723, + "eval_steps_per_second": 12.794, + "step": 417 + }, + { + "epoch": 46.41830065359477, + "eval_loss": 1.9444739818572998, + "eval_runtime": 2.4956, + "eval_samples_per_second": 779.376, + "eval_steps_per_second": 12.422, + "step": 418 + }, + { + "epoch": 46.52287581699346, + "eval_loss": 1.923707127571106, + "eval_runtime": 2.4651, + "eval_samples_per_second": 789.011, + "eval_steps_per_second": 12.575, + "step": 419 + }, + { + "epoch": 46.627450980392155, + "eval_loss": 1.9269739389419556, + "eval_runtime": 2.4648, + "eval_samples_per_second": 789.121, + "eval_steps_per_second": 12.577, + "step": 420 + }, + { + "epoch": 46.73202614379085, + "eval_loss": 1.9492802619934082, + "eval_runtime": 2.4582, + "eval_samples_per_second": 791.229, + "eval_steps_per_second": 12.611, + "step": 421 + }, + { + "epoch": 46.83660130718954, + "eval_loss": 1.9743090867996216, + "eval_runtime": 2.4551, + "eval_samples_per_second": 792.232, + "eval_steps_per_second": 12.627, + "step": 422 + }, + { + "epoch": 46.94117647058823, + "eval_loss": 1.957751750946045, + "eval_runtime": 2.434, + "eval_samples_per_second": 799.105, + "eval_steps_per_second": 12.736, + "step": 423 + }, + { + "epoch": 46.94117647058823, + "grad_norm": 1.9101431369781494, + "learning_rate": 1.276595744680851e-06, + "loss": 1.733, + "step": 423 + }, + { + "epoch": 47.10457516339869, + "eval_loss": 1.9019426107406616, + "eval_runtime": 2.4414, + "eval_samples_per_second": 796.664, + "eval_steps_per_second": 12.697, + "step": 424 + }, + { + "epoch": 47.209150326797385, + "eval_loss": 1.944284200668335, + "eval_runtime": 2.4514, + "eval_samples_per_second": 793.435, + "eval_steps_per_second": 12.646, + "step": 425 + }, + { + "epoch": 47.31372549019608, + "eval_loss": 1.9661508798599243, + "eval_runtime": 2.4442, + "eval_samples_per_second": 795.747, + "eval_steps_per_second": 12.683, + "step": 426 + }, + { + "epoch": 47.41830065359477, + "eval_loss": 1.9728316068649292, + "eval_runtime": 2.4822, + "eval_samples_per_second": 783.586, + "eval_steps_per_second": 12.489, + "step": 427 + }, + { + "epoch": 47.52287581699346, + "eval_loss": 1.923363208770752, + "eval_runtime": 2.4644, + "eval_samples_per_second": 789.251, + "eval_steps_per_second": 12.579, + "step": 428 + }, + { + "epoch": 47.627450980392155, + "eval_loss": 1.9165805578231812, + "eval_runtime": 2.3965, + "eval_samples_per_second": 811.601, + "eval_steps_per_second": 12.936, + "step": 429 + }, + { + "epoch": 47.73202614379085, + "eval_loss": 1.9413442611694336, + "eval_runtime": 2.4408, + "eval_samples_per_second": 796.869, + "eval_steps_per_second": 12.701, + "step": 430 + }, + { + "epoch": 47.83660130718954, + "eval_loss": 1.8855735063552856, + "eval_runtime": 2.4507, + "eval_samples_per_second": 793.659, + "eval_steps_per_second": 12.65, + "step": 431 + }, + { + "epoch": 47.94117647058823, + "eval_loss": 1.952731728553772, + "eval_runtime": 2.5241, + "eval_samples_per_second": 770.567, + "eval_steps_per_second": 12.282, + "step": 432 + }, + { + "epoch": 47.94117647058823, + "grad_norm": 1.7582765817642212, + "learning_rate": 8.510638297872341e-07, + "loss": 1.7065, + "step": 432 + }, + { + "epoch": 48.10457516339869, + "eval_loss": 1.9524160623550415, + "eval_runtime": 2.4138, + "eval_samples_per_second": 805.783, + "eval_steps_per_second": 12.843, + "step": 433 + }, + { + "epoch": 48.209150326797385, + "eval_loss": 1.9682825803756714, + "eval_runtime": 2.5144, + "eval_samples_per_second": 773.543, + "eval_steps_per_second": 12.329, + "step": 434 + }, + { + "epoch": 48.31372549019608, + "eval_loss": 1.9489309787750244, + "eval_runtime": 2.5744, + "eval_samples_per_second": 755.51, + "eval_steps_per_second": 12.042, + "step": 435 + }, + { + "epoch": 48.41830065359477, + "eval_loss": 1.9564448595046997, + "eval_runtime": 2.4521, + "eval_samples_per_second": 793.202, + "eval_steps_per_second": 12.642, + "step": 436 + }, + { + "epoch": 48.52287581699346, + "eval_loss": 1.9767297506332397, + "eval_runtime": 2.4404, + "eval_samples_per_second": 796.999, + "eval_steps_per_second": 12.703, + "step": 437 + }, + { + "epoch": 48.627450980392155, + "eval_loss": 1.9058864116668701, + "eval_runtime": 2.4851, + "eval_samples_per_second": 782.665, + "eval_steps_per_second": 12.474, + "step": 438 + }, + { + "epoch": 48.73202614379085, + "eval_loss": 1.9629017114639282, + "eval_runtime": 2.4371, + "eval_samples_per_second": 798.081, + "eval_steps_per_second": 12.72, + "step": 439 + }, + { + "epoch": 48.83660130718954, + "eval_loss": 1.937988519668579, + "eval_runtime": 2.4379, + "eval_samples_per_second": 797.822, + "eval_steps_per_second": 12.716, + "step": 440 + }, + { + "epoch": 48.94117647058823, + "eval_loss": 1.9694868326187134, + "eval_runtime": 2.4378, + "eval_samples_per_second": 797.855, + "eval_steps_per_second": 12.716, + "step": 441 + }, + { + "epoch": 48.94117647058823, + "grad_norm": 1.9079190492630005, + "learning_rate": 4.2553191489361704e-07, + "loss": 1.6997, + "step": 441 + }, + { + "epoch": 49.10457516339869, + "eval_loss": 1.9250315427780151, + "eval_runtime": 2.4353, + "eval_samples_per_second": 798.679, + "eval_steps_per_second": 12.73, + "step": 442 + }, + { + "epoch": 49.209150326797385, + "eval_loss": 1.9391655921936035, + "eval_runtime": 2.4099, + "eval_samples_per_second": 807.094, + "eval_steps_per_second": 12.864, + "step": 443 + }, + { + "epoch": 49.31372549019608, + "eval_loss": 1.9522807598114014, + "eval_runtime": 2.4344, + "eval_samples_per_second": 798.963, + "eval_steps_per_second": 12.734, + "step": 444 + }, + { + "epoch": 49.41830065359477, + "eval_loss": 1.9459158182144165, + "eval_runtime": 2.4156, + "eval_samples_per_second": 805.199, + "eval_steps_per_second": 12.834, + "step": 445 + }, + { + "epoch": 49.52287581699346, + "eval_loss": 1.9365217685699463, + "eval_runtime": 2.4071, + "eval_samples_per_second": 808.028, + "eval_steps_per_second": 12.879, + "step": 446 + }, + { + "epoch": 49.627450980392155, + "eval_loss": 1.9133816957473755, + "eval_runtime": 2.3906, + "eval_samples_per_second": 813.594, + "eval_steps_per_second": 12.967, + "step": 447 + }, + { + "epoch": 49.73202614379085, + "eval_loss": 1.9433826208114624, + "eval_runtime": 2.4005, + "eval_samples_per_second": 810.237, + "eval_steps_per_second": 12.914, + "step": 448 + }, + { + "epoch": 49.83660130718954, + "eval_loss": 1.9771692752838135, + "eval_runtime": 2.3888, + "eval_samples_per_second": 814.201, + "eval_steps_per_second": 12.977, + "step": 449 + }, + { + "epoch": 49.94117647058823, + "eval_loss": 1.9776495695114136, + "eval_runtime": 2.3848, + "eval_samples_per_second": 815.59, + "eval_steps_per_second": 12.999, + "step": 450 + }, + { + "epoch": 49.94117647058823, + "grad_norm": 1.8996864557266235, + "learning_rate": 0.0, + "loss": 1.6668, + "step": 450 + }, + { + "epoch": 49.94117647058823, + "step": 450, + "total_flos": 1.2548402868338688e+16, + "train_loss": 1.8820014402601455, + "train_runtime": 2628.924, + "train_samples_per_second": 185.114, + "train_steps_per_second": 0.171 + } + ], + "logging_steps": 500, + "max_steps": 450, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.2548402868338688e+16, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}