{ "best_metric": 1.8658331632614136, "best_model_checkpoint": "/nfs/production/literature/amina-mardiyyah/new_data/OT-Entity-Extraction-Pipeline/model_outputs/Continued_pretraining/TAPT/bioformers/bioformer-16L/Mardiyyah/TAPT_data_V2_split/tapt_base_LR-2e-05/checkpoint-255", "epoch": 49.94117647058823, "eval_steps": 1, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10457516339869281, "eval_loss": 2.221620798110962, "eval_runtime": 2.4071, "eval_samples_per_second": 808.035, "eval_steps_per_second": 12.879, "step": 1 }, { "epoch": 0.20915032679738563, "eval_loss": 2.1887502670288086, "eval_runtime": 2.4126, "eval_samples_per_second": 806.196, "eval_steps_per_second": 12.849, "step": 2 }, { "epoch": 0.3137254901960784, "eval_loss": 2.173266887664795, "eval_runtime": 2.4488, "eval_samples_per_second": 794.257, "eval_steps_per_second": 12.659, "step": 3 }, { "epoch": 0.41830065359477125, "eval_loss": 2.1297478675842285, "eval_runtime": 2.3778, "eval_samples_per_second": 817.994, "eval_steps_per_second": 13.037, "step": 4 }, { "epoch": 0.5228758169934641, "eval_loss": 2.192237377166748, "eval_runtime": 2.3912, "eval_samples_per_second": 813.415, "eval_steps_per_second": 12.964, "step": 5 }, { "epoch": 0.6274509803921569, "eval_loss": 2.1468276977539062, "eval_runtime": 2.3902, "eval_samples_per_second": 813.739, "eval_steps_per_second": 12.97, "step": 6 }, { "epoch": 0.7320261437908496, "eval_loss": 2.1432690620422363, "eval_runtime": 2.3678, "eval_samples_per_second": 821.446, "eval_steps_per_second": 13.092, "step": 7 }, { "epoch": 0.8366013071895425, "eval_loss": 2.0927038192749023, "eval_runtime": 2.3704, "eval_samples_per_second": 820.552, "eval_steps_per_second": 13.078, "step": 8 }, { "epoch": 0.9411764705882353, "eval_loss": 2.124303102493286, "eval_runtime": 2.4224, "eval_samples_per_second": 802.925, "eval_steps_per_second": 12.797, "step": 9 }, { "epoch": 0.9411764705882353, "grad_norm": 2.930349588394165, "learning_rate": 6.666666666666667e-06, "loss": 2.4847, "step": 9 }, { "epoch": 1.1045751633986929, "eval_loss": 2.1266961097717285, "eval_runtime": 2.3993, "eval_samples_per_second": 810.647, "eval_steps_per_second": 12.92, "step": 10 }, { "epoch": 1.2091503267973855, "eval_loss": 2.081995964050293, "eval_runtime": 2.4023, "eval_samples_per_second": 809.627, "eval_steps_per_second": 12.904, "step": 11 }, { "epoch": 1.3137254901960784, "eval_loss": 2.0737693309783936, "eval_runtime": 2.4011, "eval_samples_per_second": 810.042, "eval_steps_per_second": 12.911, "step": 12 }, { "epoch": 1.4183006535947713, "eval_loss": 2.0427086353302, "eval_runtime": 2.4076, "eval_samples_per_second": 807.874, "eval_steps_per_second": 12.876, "step": 13 }, { "epoch": 1.522875816993464, "eval_loss": 2.056819200515747, "eval_runtime": 2.3991, "eval_samples_per_second": 810.727, "eval_steps_per_second": 12.922, "step": 14 }, { "epoch": 1.6274509803921569, "eval_loss": 2.083451747894287, "eval_runtime": 2.4323, "eval_samples_per_second": 799.665, "eval_steps_per_second": 12.745, "step": 15 }, { "epoch": 1.7320261437908497, "eval_loss": 2.078913450241089, "eval_runtime": 2.4756, "eval_samples_per_second": 785.669, "eval_steps_per_second": 12.522, "step": 16 }, { "epoch": 1.8366013071895426, "eval_loss": 2.067417621612549, "eval_runtime": 2.438, "eval_samples_per_second": 797.798, "eval_steps_per_second": 12.716, "step": 17 }, { "epoch": 1.9411764705882353, "eval_loss": 2.0401482582092285, "eval_runtime": 2.4764, "eval_samples_per_second": 785.419, "eval_steps_per_second": 12.518, "step": 18 }, { "epoch": 1.9411764705882353, "grad_norm": 1.9967031478881836, "learning_rate": 1.3333333333333333e-05, "loss": 2.4101, "step": 18 }, { "epoch": 2.104575163398693, "eval_loss": 2.0421407222747803, "eval_runtime": 2.4395, "eval_samples_per_second": 797.31, "eval_steps_per_second": 12.708, "step": 19 }, { "epoch": 2.2091503267973858, "eval_loss": 2.0762155055999756, "eval_runtime": 2.4888, "eval_samples_per_second": 781.507, "eval_steps_per_second": 12.456, "step": 20 }, { "epoch": 2.313725490196078, "eval_loss": 2.006462574005127, "eval_runtime": 2.5161, "eval_samples_per_second": 773.026, "eval_steps_per_second": 12.321, "step": 21 }, { "epoch": 2.418300653594771, "eval_loss": 2.0763015747070312, "eval_runtime": 2.4587, "eval_samples_per_second": 791.08, "eval_steps_per_second": 12.608, "step": 22 }, { "epoch": 2.522875816993464, "eval_loss": 2.0424351692199707, "eval_runtime": 2.4605, "eval_samples_per_second": 790.5, "eval_steps_per_second": 12.599, "step": 23 }, { "epoch": 2.627450980392157, "eval_loss": 2.031003952026367, "eval_runtime": 2.4119, "eval_samples_per_second": 806.408, "eval_steps_per_second": 12.853, "step": 24 }, { "epoch": 2.7320261437908497, "eval_loss": 2.0873942375183105, "eval_runtime": 2.4077, "eval_samples_per_second": 807.816, "eval_steps_per_second": 12.875, "step": 25 }, { "epoch": 2.8366013071895426, "eval_loss": 2.0235297679901123, "eval_runtime": 2.3945, "eval_samples_per_second": 812.27, "eval_steps_per_second": 12.946, "step": 26 }, { "epoch": 2.9411764705882355, "eval_loss": 2.059739589691162, "eval_runtime": 2.4114, "eval_samples_per_second": 806.602, "eval_steps_per_second": 12.856, "step": 27 }, { "epoch": 2.9411764705882355, "grad_norm": 2.4962875843048096, "learning_rate": 2e-05, "loss": 2.3677, "step": 27 }, { "epoch": 3.104575163398693, "eval_loss": 1.986527442932129, "eval_runtime": 2.3999, "eval_samples_per_second": 810.462, "eval_steps_per_second": 12.917, "step": 28 }, { "epoch": 3.2091503267973858, "eval_loss": 2.0295257568359375, "eval_runtime": 2.4114, "eval_samples_per_second": 806.573, "eval_steps_per_second": 12.855, "step": 29 }, { "epoch": 3.313725490196078, "eval_loss": 2.029600143432617, "eval_runtime": 2.3867, "eval_samples_per_second": 814.924, "eval_steps_per_second": 12.989, "step": 30 }, { "epoch": 3.418300653594771, "eval_loss": 2.0018720626831055, "eval_runtime": 2.3894, "eval_samples_per_second": 814.014, "eval_steps_per_second": 12.974, "step": 31 }, { "epoch": 3.522875816993464, "eval_loss": 1.9695795774459839, "eval_runtime": 2.3919, "eval_samples_per_second": 813.157, "eval_steps_per_second": 12.96, "step": 32 }, { "epoch": 3.627450980392157, "eval_loss": 2.0265488624572754, "eval_runtime": 2.426, "eval_samples_per_second": 801.734, "eval_steps_per_second": 12.778, "step": 33 }, { "epoch": 3.7320261437908497, "eval_loss": 2.010695457458496, "eval_runtime": 2.429, "eval_samples_per_second": 800.731, "eval_steps_per_second": 12.762, "step": 34 }, { "epoch": 3.8366013071895426, "eval_loss": 2.034428119659424, "eval_runtime": 2.4086, "eval_samples_per_second": 807.51, "eval_steps_per_second": 12.87, "step": 35 }, { "epoch": 3.9411764705882355, "eval_loss": 2.0281381607055664, "eval_runtime": 2.4118, "eval_samples_per_second": 806.449, "eval_steps_per_second": 12.853, "step": 36 }, { "epoch": 3.9411764705882355, "grad_norm": 1.8768209218978882, "learning_rate": 1.9574468085106384e-05, "loss": 2.2639, "step": 36 }, { "epoch": 4.104575163398692, "eval_loss": 2.0171053409576416, "eval_runtime": 2.4083, "eval_samples_per_second": 807.623, "eval_steps_per_second": 12.872, "step": 37 }, { "epoch": 4.209150326797386, "eval_loss": 2.0344126224517822, "eval_runtime": 2.4108, "eval_samples_per_second": 806.785, "eval_steps_per_second": 12.859, "step": 38 }, { "epoch": 4.313725490196078, "eval_loss": 1.9913954734802246, "eval_runtime": 2.3858, "eval_samples_per_second": 815.248, "eval_steps_per_second": 12.994, "step": 39 }, { "epoch": 4.4183006535947715, "eval_loss": 1.9855905771255493, "eval_runtime": 2.3822, "eval_samples_per_second": 816.455, "eval_steps_per_second": 13.013, "step": 40 }, { "epoch": 4.522875816993464, "eval_loss": 2.0357260704040527, "eval_runtime": 2.3873, "eval_samples_per_second": 814.74, "eval_steps_per_second": 12.986, "step": 41 }, { "epoch": 4.627450980392156, "eval_loss": 2.028900384902954, "eval_runtime": 2.4177, "eval_samples_per_second": 804.49, "eval_steps_per_second": 12.822, "step": 42 }, { "epoch": 4.73202614379085, "eval_loss": 1.9714045524597168, "eval_runtime": 2.3849, "eval_samples_per_second": 815.537, "eval_steps_per_second": 12.998, "step": 43 }, { "epoch": 4.836601307189542, "eval_loss": 1.9895257949829102, "eval_runtime": 2.4769, "eval_samples_per_second": 785.249, "eval_steps_per_second": 12.516, "step": 44 }, { "epoch": 4.9411764705882355, "eval_loss": 1.990486741065979, "eval_runtime": 2.4263, "eval_samples_per_second": 801.627, "eval_steps_per_second": 12.777, "step": 45 }, { "epoch": 4.9411764705882355, "grad_norm": 1.8783236742019653, "learning_rate": 1.914893617021277e-05, "loss": 2.2037, "step": 45 }, { "epoch": 5.104575163398692, "eval_loss": 1.9589457511901855, "eval_runtime": 2.4045, "eval_samples_per_second": 808.887, "eval_steps_per_second": 12.892, "step": 46 }, { "epoch": 5.209150326797386, "eval_loss": 1.9864917993545532, "eval_runtime": 2.4112, "eval_samples_per_second": 806.648, "eval_steps_per_second": 12.857, "step": 47 }, { "epoch": 5.313725490196078, "eval_loss": 2.0113699436187744, "eval_runtime": 2.4027, "eval_samples_per_second": 809.512, "eval_steps_per_second": 12.902, "step": 48 }, { "epoch": 5.4183006535947715, "eval_loss": 2.0007834434509277, "eval_runtime": 2.4133, "eval_samples_per_second": 805.942, "eval_steps_per_second": 12.845, "step": 49 }, { "epoch": 5.522875816993464, "eval_loss": 1.9577592611312866, "eval_runtime": 2.3791, "eval_samples_per_second": 817.533, "eval_steps_per_second": 13.03, "step": 50 }, { "epoch": 5.627450980392156, "eval_loss": 2.029423236846924, "eval_runtime": 2.3902, "eval_samples_per_second": 813.74, "eval_steps_per_second": 12.97, "step": 51 }, { "epoch": 5.73202614379085, "eval_loss": 1.9585332870483398, "eval_runtime": 2.4118, "eval_samples_per_second": 806.436, "eval_steps_per_second": 12.853, "step": 52 }, { "epoch": 5.836601307189542, "eval_loss": 1.9783401489257812, "eval_runtime": 2.4264, "eval_samples_per_second": 801.594, "eval_steps_per_second": 12.776, "step": 53 }, { "epoch": 5.9411764705882355, "eval_loss": 1.9880473613739014, "eval_runtime": 2.488, "eval_samples_per_second": 781.753, "eval_steps_per_second": 12.46, "step": 54 }, { "epoch": 5.9411764705882355, "grad_norm": 1.8594753742218018, "learning_rate": 1.872340425531915e-05, "loss": 2.16, "step": 54 }, { "epoch": 6.104575163398692, "eval_loss": 2.0060460567474365, "eval_runtime": 2.4336, "eval_samples_per_second": 799.234, "eval_steps_per_second": 12.738, "step": 55 }, { "epoch": 6.209150326797386, "eval_loss": 1.9557570219039917, "eval_runtime": 2.4097, "eval_samples_per_second": 807.143, "eval_steps_per_second": 12.864, "step": 56 }, { "epoch": 6.313725490196078, "eval_loss": 1.9664386510849, "eval_runtime": 2.4059, "eval_samples_per_second": 808.441, "eval_steps_per_second": 12.885, "step": 57 }, { "epoch": 6.4183006535947715, "eval_loss": 1.920135498046875, "eval_runtime": 2.4056, "eval_samples_per_second": 808.538, "eval_steps_per_second": 12.887, "step": 58 }, { "epoch": 6.522875816993464, "eval_loss": 1.9815952777862549, "eval_runtime": 2.4122, "eval_samples_per_second": 806.325, "eval_steps_per_second": 12.851, "step": 59 }, { "epoch": 6.627450980392156, "eval_loss": 1.9681768417358398, "eval_runtime": 2.3867, "eval_samples_per_second": 814.941, "eval_steps_per_second": 12.989, "step": 60 }, { "epoch": 6.73202614379085, "eval_loss": 1.9605098962783813, "eval_runtime": 2.3875, "eval_samples_per_second": 814.648, "eval_steps_per_second": 12.984, "step": 61 }, { "epoch": 6.836601307189542, "eval_loss": 1.9233237504959106, "eval_runtime": 2.385, "eval_samples_per_second": 815.51, "eval_steps_per_second": 12.998, "step": 62 }, { "epoch": 6.9411764705882355, "eval_loss": 1.9687212705612183, "eval_runtime": 2.3854, "eval_samples_per_second": 815.361, "eval_steps_per_second": 12.995, "step": 63 }, { "epoch": 6.9411764705882355, "grad_norm": 1.886400818824768, "learning_rate": 1.8297872340425533e-05, "loss": 2.1108, "step": 63 }, { "epoch": 7.104575163398692, "eval_loss": 1.9986543655395508, "eval_runtime": 2.4144, "eval_samples_per_second": 805.581, "eval_steps_per_second": 12.84, "step": 64 }, { "epoch": 7.209150326797386, "eval_loss": 2.002251386642456, "eval_runtime": 2.4255, "eval_samples_per_second": 801.892, "eval_steps_per_second": 12.781, "step": 65 }, { "epoch": 7.313725490196078, "eval_loss": 1.9626870155334473, "eval_runtime": 2.4235, "eval_samples_per_second": 802.565, "eval_steps_per_second": 12.792, "step": 66 }, { "epoch": 7.4183006535947715, "eval_loss": 2.0214684009552, "eval_runtime": 2.4109, "eval_samples_per_second": 806.764, "eval_steps_per_second": 12.858, "step": 67 }, { "epoch": 7.522875816993464, "eval_loss": 1.961344599723816, "eval_runtime": 2.4317, "eval_samples_per_second": 799.86, "eval_steps_per_second": 12.748, "step": 68 }, { "epoch": 7.627450980392156, "eval_loss": 2.026102066040039, "eval_runtime": 2.4027, "eval_samples_per_second": 809.502, "eval_steps_per_second": 12.902, "step": 69 }, { "epoch": 7.73202614379085, "eval_loss": 1.9625698328018188, "eval_runtime": 2.3794, "eval_samples_per_second": 817.425, "eval_steps_per_second": 13.028, "step": 70 }, { "epoch": 7.836601307189542, "eval_loss": 2.000683546066284, "eval_runtime": 2.3828, "eval_samples_per_second": 816.278, "eval_steps_per_second": 13.01, "step": 71 }, { "epoch": 7.9411764705882355, "eval_loss": 1.9403586387634277, "eval_runtime": 2.383, "eval_samples_per_second": 816.21, "eval_steps_per_second": 13.009, "step": 72 }, { "epoch": 7.9411764705882355, "grad_norm": 3.798304319381714, "learning_rate": 1.7872340425531915e-05, "loss": 2.0949, "step": 72 }, { "epoch": 8.104575163398692, "eval_loss": 1.994275450706482, "eval_runtime": 2.3862, "eval_samples_per_second": 815.09, "eval_steps_per_second": 12.991, "step": 73 }, { "epoch": 8.209150326797385, "eval_loss": 2.0442616939544678, "eval_runtime": 2.3871, "eval_samples_per_second": 814.81, "eval_steps_per_second": 12.987, "step": 74 }, { "epoch": 8.313725490196079, "eval_loss": 1.99091637134552, "eval_runtime": 2.3849, "eval_samples_per_second": 815.556, "eval_steps_per_second": 12.999, "step": 75 }, { "epoch": 8.418300653594772, "eval_loss": 1.9789609909057617, "eval_runtime": 2.4032, "eval_samples_per_second": 809.347, "eval_steps_per_second": 12.9, "step": 76 }, { "epoch": 8.522875816993464, "eval_loss": 1.9505332708358765, "eval_runtime": 2.457, "eval_samples_per_second": 791.608, "eval_steps_per_second": 12.617, "step": 77 }, { "epoch": 8.627450980392156, "eval_loss": 1.9477442502975464, "eval_runtime": 2.4164, "eval_samples_per_second": 804.91, "eval_steps_per_second": 12.829, "step": 78 }, { "epoch": 8.732026143790849, "eval_loss": 2.027162790298462, "eval_runtime": 2.4059, "eval_samples_per_second": 808.413, "eval_steps_per_second": 12.885, "step": 79 }, { "epoch": 8.836601307189543, "eval_loss": 1.954852819442749, "eval_runtime": 2.4078, "eval_samples_per_second": 807.8, "eval_steps_per_second": 12.875, "step": 80 }, { "epoch": 8.941176470588236, "eval_loss": 1.9641313552856445, "eval_runtime": 2.4106, "eval_samples_per_second": 806.865, "eval_steps_per_second": 12.86, "step": 81 }, { "epoch": 8.941176470588236, "grad_norm": 1.9192023277282715, "learning_rate": 1.74468085106383e-05, "loss": 2.0617, "step": 81 }, { "epoch": 9.104575163398692, "eval_loss": 1.9859141111373901, "eval_runtime": 2.3909, "eval_samples_per_second": 813.488, "eval_steps_per_second": 12.966, "step": 82 }, { "epoch": 9.209150326797385, "eval_loss": 1.937601089477539, "eval_runtime": 2.3889, "eval_samples_per_second": 814.196, "eval_steps_per_second": 12.977, "step": 83 }, { "epoch": 9.313725490196079, "eval_loss": 1.9699262380599976, "eval_runtime": 2.3832, "eval_samples_per_second": 816.116, "eval_steps_per_second": 13.008, "step": 84 }, { "epoch": 9.418300653594772, "eval_loss": 1.9334497451782227, "eval_runtime": 2.4366, "eval_samples_per_second": 798.247, "eval_steps_per_second": 12.723, "step": 85 }, { "epoch": 9.522875816993464, "eval_loss": 1.9708276987075806, "eval_runtime": 2.3835, "eval_samples_per_second": 816.03, "eval_steps_per_second": 13.006, "step": 86 }, { "epoch": 9.627450980392156, "eval_loss": 1.970037817955017, "eval_runtime": 2.4674, "eval_samples_per_second": 788.282, "eval_steps_per_second": 12.564, "step": 87 }, { "epoch": 9.732026143790849, "eval_loss": 1.9634466171264648, "eval_runtime": 2.4497, "eval_samples_per_second": 793.975, "eval_steps_per_second": 12.655, "step": 88 }, { "epoch": 9.836601307189543, "eval_loss": 1.92203688621521, "eval_runtime": 2.4109, "eval_samples_per_second": 806.761, "eval_steps_per_second": 12.858, "step": 89 }, { "epoch": 9.941176470588236, "eval_loss": 1.966900110244751, "eval_runtime": 2.4107, "eval_samples_per_second": 806.822, "eval_steps_per_second": 12.859, "step": 90 }, { "epoch": 9.941176470588236, "grad_norm": 2.050672769546509, "learning_rate": 1.7021276595744682e-05, "loss": 2.0509, "step": 90 }, { "epoch": 10.104575163398692, "eval_loss": 1.956833004951477, "eval_runtime": 2.3947, "eval_samples_per_second": 812.203, "eval_steps_per_second": 12.945, "step": 91 }, { "epoch": 10.209150326797385, "eval_loss": 1.9699444770812988, "eval_runtime": 2.3837, "eval_samples_per_second": 815.943, "eval_steps_per_second": 13.005, "step": 92 }, { "epoch": 10.313725490196079, "eval_loss": 2.0316123962402344, "eval_runtime": 2.3823, "eval_samples_per_second": 816.431, "eval_steps_per_second": 13.013, "step": 93 }, { "epoch": 10.418300653594772, "eval_loss": 1.912984848022461, "eval_runtime": 2.3869, "eval_samples_per_second": 814.86, "eval_steps_per_second": 12.987, "step": 94 }, { "epoch": 10.522875816993464, "eval_loss": 1.9707229137420654, "eval_runtime": 2.3837, "eval_samples_per_second": 815.942, "eval_steps_per_second": 13.005, "step": 95 }, { "epoch": 10.627450980392156, "eval_loss": 1.9623687267303467, "eval_runtime": 2.455, "eval_samples_per_second": 792.269, "eval_steps_per_second": 12.627, "step": 96 }, { "epoch": 10.732026143790849, "eval_loss": 1.9515836238861084, "eval_runtime": 2.4477, "eval_samples_per_second": 794.62, "eval_steps_per_second": 12.665, "step": 97 }, { "epoch": 10.836601307189543, "eval_loss": 1.9508367776870728, "eval_runtime": 2.407, "eval_samples_per_second": 808.071, "eval_steps_per_second": 12.879, "step": 98 }, { "epoch": 10.941176470588236, "eval_loss": 1.9166395664215088, "eval_runtime": 2.4193, "eval_samples_per_second": 803.965, "eval_steps_per_second": 12.814, "step": 99 }, { "epoch": 10.941176470588236, "grad_norm": 1.911039113998413, "learning_rate": 1.6595744680851064e-05, "loss": 1.9835, "step": 99 }, { "epoch": 11.104575163398692, "eval_loss": 1.9469496011734009, "eval_runtime": 2.4171, "eval_samples_per_second": 804.693, "eval_steps_per_second": 12.825, "step": 100 }, { "epoch": 11.209150326797385, "eval_loss": 1.962018609046936, "eval_runtime": 2.3932, "eval_samples_per_second": 812.727, "eval_steps_per_second": 12.953, "step": 101 }, { "epoch": 11.313725490196079, "eval_loss": 1.94699227809906, "eval_runtime": 2.3789, "eval_samples_per_second": 817.61, "eval_steps_per_second": 13.031, "step": 102 }, { "epoch": 11.418300653594772, "eval_loss": 1.945833444595337, "eval_runtime": 2.3834, "eval_samples_per_second": 816.059, "eval_steps_per_second": 13.007, "step": 103 }, { "epoch": 11.522875816993464, "eval_loss": 1.9585113525390625, "eval_runtime": 2.3811, "eval_samples_per_second": 816.833, "eval_steps_per_second": 13.019, "step": 104 }, { "epoch": 11.627450980392156, "eval_loss": 1.9450502395629883, "eval_runtime": 2.381, "eval_samples_per_second": 816.892, "eval_steps_per_second": 13.02, "step": 105 }, { "epoch": 11.732026143790849, "eval_loss": 1.9202919006347656, "eval_runtime": 2.4348, "eval_samples_per_second": 798.83, "eval_steps_per_second": 12.732, "step": 106 }, { "epoch": 11.836601307189543, "eval_loss": 1.9322612285614014, "eval_runtime": 2.4503, "eval_samples_per_second": 793.773, "eval_steps_per_second": 12.651, "step": 107 }, { "epoch": 11.941176470588236, "eval_loss": 1.9641361236572266, "eval_runtime": 2.4059, "eval_samples_per_second": 808.427, "eval_steps_per_second": 12.885, "step": 108 }, { "epoch": 11.941176470588236, "grad_norm": 1.9469199180603027, "learning_rate": 1.6170212765957446e-05, "loss": 1.9719, "step": 108 }, { "epoch": 12.104575163398692, "eval_loss": 1.9262347221374512, "eval_runtime": 2.4058, "eval_samples_per_second": 808.471, "eval_steps_per_second": 12.886, "step": 109 }, { "epoch": 12.209150326797385, "eval_loss": 1.9799877405166626, "eval_runtime": 2.4101, "eval_samples_per_second": 807.019, "eval_steps_per_second": 12.863, "step": 110 }, { "epoch": 12.313725490196079, "eval_loss": 1.9421709775924683, "eval_runtime": 2.3763, "eval_samples_per_second": 818.512, "eval_steps_per_second": 13.046, "step": 111 }, { "epoch": 12.418300653594772, "eval_loss": 1.9286293983459473, "eval_runtime": 2.4023, "eval_samples_per_second": 809.639, "eval_steps_per_second": 12.904, "step": 112 }, { "epoch": 12.522875816993464, "eval_loss": 1.9933801889419556, "eval_runtime": 2.3795, "eval_samples_per_second": 817.406, "eval_steps_per_second": 13.028, "step": 113 }, { "epoch": 12.627450980392156, "eval_loss": 1.9704465866088867, "eval_runtime": 2.3792, "eval_samples_per_second": 817.502, "eval_steps_per_second": 13.03, "step": 114 }, { "epoch": 12.732026143790849, "eval_loss": 1.939013957977295, "eval_runtime": 2.4295, "eval_samples_per_second": 800.592, "eval_steps_per_second": 12.76, "step": 115 }, { "epoch": 12.836601307189543, "eval_loss": 1.916093349456787, "eval_runtime": 2.3862, "eval_samples_per_second": 815.104, "eval_steps_per_second": 12.991, "step": 116 }, { "epoch": 12.941176470588236, "eval_loss": 1.94829523563385, "eval_runtime": 2.4403, "eval_samples_per_second": 797.034, "eval_steps_per_second": 12.703, "step": 117 }, { "epoch": 12.941176470588236, "grad_norm": 1.835829734802246, "learning_rate": 1.5744680851063832e-05, "loss": 1.9663, "step": 117 }, { "epoch": 13.104575163398692, "eval_loss": 1.9584107398986816, "eval_runtime": 2.4351, "eval_samples_per_second": 798.732, "eval_steps_per_second": 12.73, "step": 118 }, { "epoch": 13.209150326797385, "eval_loss": 1.9641852378845215, "eval_runtime": 2.4617, "eval_samples_per_second": 790.102, "eval_steps_per_second": 12.593, "step": 119 }, { "epoch": 13.313725490196079, "eval_loss": 1.9446567296981812, "eval_runtime": 2.4215, "eval_samples_per_second": 803.226, "eval_steps_per_second": 12.802, "step": 120 }, { "epoch": 13.418300653594772, "eval_loss": 2.001385450363159, "eval_runtime": 2.416, "eval_samples_per_second": 805.044, "eval_steps_per_second": 12.831, "step": 121 }, { "epoch": 13.522875816993464, "eval_loss": 1.8805845975875854, "eval_runtime": 2.4069, "eval_samples_per_second": 808.102, "eval_steps_per_second": 12.88, "step": 122 }, { "epoch": 13.627450980392156, "eval_loss": 1.9486974477767944, "eval_runtime": 2.4072, "eval_samples_per_second": 807.987, "eval_steps_per_second": 12.878, "step": 123 }, { "epoch": 13.732026143790849, "eval_loss": 1.9180878400802612, "eval_runtime": 2.4102, "eval_samples_per_second": 806.973, "eval_steps_per_second": 12.862, "step": 124 }, { "epoch": 13.836601307189543, "eval_loss": 1.9238054752349854, "eval_runtime": 2.3928, "eval_samples_per_second": 812.868, "eval_steps_per_second": 12.956, "step": 125 }, { "epoch": 13.941176470588236, "eval_loss": 1.9513754844665527, "eval_runtime": 2.3862, "eval_samples_per_second": 815.11, "eval_steps_per_second": 12.991, "step": 126 }, { "epoch": 13.941176470588236, "grad_norm": 1.8433274030685425, "learning_rate": 1.5319148936170214e-05, "loss": 1.9785, "step": 126 }, { "epoch": 14.104575163398692, "eval_loss": 1.9426443576812744, "eval_runtime": 2.3889, "eval_samples_per_second": 814.174, "eval_steps_per_second": 12.977, "step": 127 }, { "epoch": 14.209150326797385, "eval_loss": 1.9765559434890747, "eval_runtime": 2.3875, "eval_samples_per_second": 814.644, "eval_steps_per_second": 12.984, "step": 128 }, { "epoch": 14.313725490196079, "eval_loss": 1.9118081331253052, "eval_runtime": 2.4268, "eval_samples_per_second": 801.463, "eval_steps_per_second": 12.774, "step": 129 }, { "epoch": 14.418300653594772, "eval_loss": 1.9367104768753052, "eval_runtime": 2.4183, "eval_samples_per_second": 804.282, "eval_steps_per_second": 12.819, "step": 130 }, { "epoch": 14.522875816993464, "eval_loss": 1.9372411966323853, "eval_runtime": 2.4201, "eval_samples_per_second": 803.68, "eval_steps_per_second": 12.809, "step": 131 }, { "epoch": 14.627450980392156, "eval_loss": 1.923244595527649, "eval_runtime": 2.4324, "eval_samples_per_second": 799.606, "eval_steps_per_second": 12.744, "step": 132 }, { "epoch": 14.732026143790849, "eval_loss": 1.999928593635559, "eval_runtime": 2.4161, "eval_samples_per_second": 805.026, "eval_steps_per_second": 12.831, "step": 133 }, { "epoch": 14.836601307189543, "eval_loss": 1.9354963302612305, "eval_runtime": 2.3965, "eval_samples_per_second": 811.59, "eval_steps_per_second": 12.935, "step": 134 }, { "epoch": 14.941176470588236, "eval_loss": 1.965717077255249, "eval_runtime": 2.4083, "eval_samples_per_second": 807.639, "eval_steps_per_second": 12.872, "step": 135 }, { "epoch": 14.941176470588236, "grad_norm": 1.9256954193115234, "learning_rate": 1.4893617021276596e-05, "loss": 1.9329, "step": 135 }, { "epoch": 15.104575163398692, "eval_loss": 1.9451290369033813, "eval_runtime": 2.3926, "eval_samples_per_second": 812.94, "eval_steps_per_second": 12.957, "step": 136 }, { "epoch": 15.209150326797385, "eval_loss": 1.9596805572509766, "eval_runtime": 2.4003, "eval_samples_per_second": 810.326, "eval_steps_per_second": 12.915, "step": 137 }, { "epoch": 15.313725490196079, "eval_loss": 1.9179918766021729, "eval_runtime": 2.388, "eval_samples_per_second": 814.49, "eval_steps_per_second": 12.982, "step": 138 }, { "epoch": 15.418300653594772, "eval_loss": 1.9344438314437866, "eval_runtime": 2.4394, "eval_samples_per_second": 797.34, "eval_steps_per_second": 12.708, "step": 139 }, { "epoch": 15.522875816993464, "eval_loss": 1.9772499799728394, "eval_runtime": 2.4332, "eval_samples_per_second": 799.368, "eval_steps_per_second": 12.741, "step": 140 }, { "epoch": 15.627450980392156, "eval_loss": 1.9796696901321411, "eval_runtime": 2.4159, "eval_samples_per_second": 805.096, "eval_steps_per_second": 12.832, "step": 141 }, { "epoch": 15.732026143790849, "eval_loss": 1.9060624837875366, "eval_runtime": 2.4134, "eval_samples_per_second": 805.928, "eval_steps_per_second": 12.845, "step": 142 }, { "epoch": 15.836601307189543, "eval_loss": 1.8885753154754639, "eval_runtime": 2.4138, "eval_samples_per_second": 805.794, "eval_steps_per_second": 12.843, "step": 143 }, { "epoch": 15.941176470588236, "eval_loss": 1.9685148000717163, "eval_runtime": 2.4145, "eval_samples_per_second": 805.548, "eval_steps_per_second": 12.839, "step": 144 }, { "epoch": 15.941176470588236, "grad_norm": 1.8362805843353271, "learning_rate": 1.4468085106382981e-05, "loss": 1.9144, "step": 144 }, { "epoch": 16.104575163398692, "eval_loss": 1.9797979593276978, "eval_runtime": 2.3896, "eval_samples_per_second": 813.952, "eval_steps_per_second": 12.973, "step": 145 }, { "epoch": 16.209150326797385, "eval_loss": 1.9587923288345337, "eval_runtime": 2.4182, "eval_samples_per_second": 804.331, "eval_steps_per_second": 12.82, "step": 146 }, { "epoch": 16.313725490196077, "eval_loss": 1.9274431467056274, "eval_runtime": 2.3878, "eval_samples_per_second": 814.56, "eval_steps_per_second": 12.983, "step": 147 }, { "epoch": 16.41830065359477, "eval_loss": 1.958984375, "eval_runtime": 2.4353, "eval_samples_per_second": 798.68, "eval_steps_per_second": 12.73, "step": 148 }, { "epoch": 16.522875816993466, "eval_loss": 1.9552897214889526, "eval_runtime": 2.4367, "eval_samples_per_second": 798.2, "eval_steps_per_second": 12.722, "step": 149 }, { "epoch": 16.627450980392158, "eval_loss": 1.9142913818359375, "eval_runtime": 2.4118, "eval_samples_per_second": 806.44, "eval_steps_per_second": 12.853, "step": 150 }, { "epoch": 16.73202614379085, "eval_loss": 1.9268592596054077, "eval_runtime": 2.4143, "eval_samples_per_second": 805.633, "eval_steps_per_second": 12.84, "step": 151 }, { "epoch": 16.836601307189543, "eval_loss": 1.965384840965271, "eval_runtime": 2.4177, "eval_samples_per_second": 804.492, "eval_steps_per_second": 12.822, "step": 152 }, { "epoch": 16.941176470588236, "eval_loss": 1.9789389371871948, "eval_runtime": 2.4157, "eval_samples_per_second": 805.134, "eval_steps_per_second": 12.832, "step": 153 }, { "epoch": 16.941176470588236, "grad_norm": 1.918270230293274, "learning_rate": 1.4042553191489363e-05, "loss": 1.9103, "step": 153 }, { "epoch": 17.104575163398692, "eval_loss": 1.9568538665771484, "eval_runtime": 2.4112, "eval_samples_per_second": 806.642, "eval_steps_per_second": 12.857, "step": 154 }, { "epoch": 17.209150326797385, "eval_loss": 1.9652351140975952, "eval_runtime": 2.3867, "eval_samples_per_second": 814.935, "eval_steps_per_second": 12.989, "step": 155 }, { "epoch": 17.313725490196077, "eval_loss": 1.9810242652893066, "eval_runtime": 2.3904, "eval_samples_per_second": 813.655, "eval_steps_per_second": 12.968, "step": 156 }, { "epoch": 17.41830065359477, "eval_loss": 1.928475260734558, "eval_runtime": 2.3938, "eval_samples_per_second": 812.526, "eval_steps_per_second": 12.95, "step": 157 }, { "epoch": 17.522875816993466, "eval_loss": 1.937834620475769, "eval_runtime": 2.4231, "eval_samples_per_second": 802.677, "eval_steps_per_second": 12.793, "step": 158 }, { "epoch": 17.627450980392158, "eval_loss": 1.9520132541656494, "eval_runtime": 2.4525, "eval_samples_per_second": 793.081, "eval_steps_per_second": 12.64, "step": 159 }, { "epoch": 17.73202614379085, "eval_loss": 1.9782063961029053, "eval_runtime": 2.4273, "eval_samples_per_second": 801.302, "eval_steps_per_second": 12.771, "step": 160 }, { "epoch": 17.836601307189543, "eval_loss": 1.9681016206741333, "eval_runtime": 2.4116, "eval_samples_per_second": 806.515, "eval_steps_per_second": 12.854, "step": 161 }, { "epoch": 17.941176470588236, "eval_loss": 1.8925799131393433, "eval_runtime": 2.4098, "eval_samples_per_second": 807.121, "eval_steps_per_second": 12.864, "step": 162 }, { "epoch": 17.941176470588236, "grad_norm": 1.8396626710891724, "learning_rate": 1.3617021276595745e-05, "loss": 1.887, "step": 162 }, { "epoch": 18.104575163398692, "eval_loss": 1.9333585500717163, "eval_runtime": 2.4106, "eval_samples_per_second": 806.84, "eval_steps_per_second": 12.86, "step": 163 }, { "epoch": 18.209150326797385, "eval_loss": 1.925223469734192, "eval_runtime": 2.3936, "eval_samples_per_second": 812.57, "eval_steps_per_second": 12.951, "step": 164 }, { "epoch": 18.313725490196077, "eval_loss": 1.9398906230926514, "eval_runtime": 2.4263, "eval_samples_per_second": 801.632, "eval_steps_per_second": 12.777, "step": 165 }, { "epoch": 18.41830065359477, "eval_loss": 1.9518330097198486, "eval_runtime": 2.3924, "eval_samples_per_second": 812.992, "eval_steps_per_second": 12.958, "step": 166 }, { "epoch": 18.522875816993466, "eval_loss": 1.992385983467102, "eval_runtime": 2.3882, "eval_samples_per_second": 814.425, "eval_steps_per_second": 12.981, "step": 167 }, { "epoch": 18.627450980392158, "eval_loss": 1.905411720275879, "eval_runtime": 2.3878, "eval_samples_per_second": 814.555, "eval_steps_per_second": 12.983, "step": 168 }, { "epoch": 18.73202614379085, "eval_loss": 1.9480212926864624, "eval_runtime": 2.4462, "eval_samples_per_second": 795.096, "eval_steps_per_second": 12.672, "step": 169 }, { "epoch": 18.836601307189543, "eval_loss": 1.9308433532714844, "eval_runtime": 2.445, "eval_samples_per_second": 795.492, "eval_steps_per_second": 12.679, "step": 170 }, { "epoch": 18.941176470588236, "eval_loss": 1.9342797994613647, "eval_runtime": 2.414, "eval_samples_per_second": 805.712, "eval_steps_per_second": 12.842, "step": 171 }, { "epoch": 18.941176470588236, "grad_norm": 1.8954132795333862, "learning_rate": 1.3191489361702127e-05, "loss": 1.8644, "step": 171 }, { "epoch": 19.104575163398692, "eval_loss": 1.9860589504241943, "eval_runtime": 2.4977, "eval_samples_per_second": 778.731, "eval_steps_per_second": 12.412, "step": 172 }, { "epoch": 19.209150326797385, "eval_loss": 1.9452682733535767, "eval_runtime": 2.413, "eval_samples_per_second": 806.043, "eval_steps_per_second": 12.847, "step": 173 }, { "epoch": 19.313725490196077, "eval_loss": 1.8998777866363525, "eval_runtime": 2.3895, "eval_samples_per_second": 813.974, "eval_steps_per_second": 12.973, "step": 174 }, { "epoch": 19.41830065359477, "eval_loss": 1.93086838722229, "eval_runtime": 2.383, "eval_samples_per_second": 816.195, "eval_steps_per_second": 13.009, "step": 175 }, { "epoch": 19.522875816993466, "eval_loss": 1.954423189163208, "eval_runtime": 2.393, "eval_samples_per_second": 812.774, "eval_steps_per_second": 12.954, "step": 176 }, { "epoch": 19.627450980392158, "eval_loss": 1.9435521364212036, "eval_runtime": 2.4095, "eval_samples_per_second": 807.227, "eval_steps_per_second": 12.866, "step": 177 }, { "epoch": 19.73202614379085, "eval_loss": 1.9165093898773193, "eval_runtime": 2.3936, "eval_samples_per_second": 812.598, "eval_steps_per_second": 12.951, "step": 178 }, { "epoch": 19.836601307189543, "eval_loss": 1.9695576429367065, "eval_runtime": 2.4194, "eval_samples_per_second": 803.908, "eval_steps_per_second": 12.813, "step": 179 }, { "epoch": 19.941176470588236, "eval_loss": 1.9247905015945435, "eval_runtime": 2.4477, "eval_samples_per_second": 794.61, "eval_steps_per_second": 12.665, "step": 180 }, { "epoch": 19.941176470588236, "grad_norm": 1.8007246255874634, "learning_rate": 1.2765957446808513e-05, "loss": 1.8687, "step": 180 }, { "epoch": 20.104575163398692, "eval_loss": 1.9517226219177246, "eval_runtime": 2.4132, "eval_samples_per_second": 805.981, "eval_steps_per_second": 12.846, "step": 181 }, { "epoch": 20.209150326797385, "eval_loss": 1.9041943550109863, "eval_runtime": 2.4088, "eval_samples_per_second": 807.45, "eval_steps_per_second": 12.869, "step": 182 }, { "epoch": 20.313725490196077, "eval_loss": 1.992538571357727, "eval_runtime": 2.4151, "eval_samples_per_second": 805.334, "eval_steps_per_second": 12.836, "step": 183 }, { "epoch": 20.41830065359477, "eval_loss": 1.8842642307281494, "eval_runtime": 2.4235, "eval_samples_per_second": 802.553, "eval_steps_per_second": 12.791, "step": 184 }, { "epoch": 20.522875816993466, "eval_loss": 1.979435682296753, "eval_runtime": 2.3874, "eval_samples_per_second": 814.695, "eval_steps_per_second": 12.985, "step": 185 }, { "epoch": 20.627450980392158, "eval_loss": 1.9789183139801025, "eval_runtime": 2.3863, "eval_samples_per_second": 815.07, "eval_steps_per_second": 12.991, "step": 186 }, { "epoch": 20.73202614379085, "eval_loss": 1.9192243814468384, "eval_runtime": 2.3878, "eval_samples_per_second": 814.573, "eval_steps_per_second": 12.983, "step": 187 }, { "epoch": 20.836601307189543, "eval_loss": 1.9174364805221558, "eval_runtime": 2.3935, "eval_samples_per_second": 812.633, "eval_steps_per_second": 12.952, "step": 188 }, { "epoch": 20.941176470588236, "eval_loss": 1.9568063020706177, "eval_runtime": 2.4211, "eval_samples_per_second": 803.354, "eval_steps_per_second": 12.804, "step": 189 }, { "epoch": 20.941176470588236, "grad_norm": 1.8035422563552856, "learning_rate": 1.2340425531914895e-05, "loss": 1.8361, "step": 189 }, { "epoch": 21.104575163398692, "eval_loss": 1.9128376245498657, "eval_runtime": 2.4627, "eval_samples_per_second": 789.793, "eval_steps_per_second": 12.588, "step": 190 }, { "epoch": 21.209150326797385, "eval_loss": 1.9428894519805908, "eval_runtime": 2.4145, "eval_samples_per_second": 805.537, "eval_steps_per_second": 12.839, "step": 191 }, { "epoch": 21.313725490196077, "eval_loss": 1.95577073097229, "eval_runtime": 2.4665, "eval_samples_per_second": 788.556, "eval_steps_per_second": 12.568, "step": 192 }, { "epoch": 21.41830065359477, "eval_loss": 1.9128402471542358, "eval_runtime": 2.4073, "eval_samples_per_second": 807.972, "eval_steps_per_second": 12.878, "step": 193 }, { "epoch": 21.522875816993466, "eval_loss": 1.9588518142700195, "eval_runtime": 2.3872, "eval_samples_per_second": 814.75, "eval_steps_per_second": 12.986, "step": 194 }, { "epoch": 21.627450980392158, "eval_loss": 1.9744739532470703, "eval_runtime": 2.4193, "eval_samples_per_second": 803.936, "eval_steps_per_second": 12.813, "step": 195 }, { "epoch": 21.73202614379085, "eval_loss": 1.9993598461151123, "eval_runtime": 2.3876, "eval_samples_per_second": 814.634, "eval_steps_per_second": 12.984, "step": 196 }, { "epoch": 21.836601307189543, "eval_loss": 1.959428071975708, "eval_runtime": 2.3868, "eval_samples_per_second": 814.91, "eval_steps_per_second": 12.988, "step": 197 }, { "epoch": 21.941176470588236, "eval_loss": 1.9063607454299927, "eval_runtime": 2.3998, "eval_samples_per_second": 810.501, "eval_steps_per_second": 12.918, "step": 198 }, { "epoch": 21.941176470588236, "grad_norm": 1.7296489477157593, "learning_rate": 1.1914893617021277e-05, "loss": 1.8461, "step": 198 }, { "epoch": 22.104575163398692, "eval_loss": 1.9475386142730713, "eval_runtime": 2.4105, "eval_samples_per_second": 806.877, "eval_steps_per_second": 12.86, "step": 199 }, { "epoch": 22.209150326797385, "eval_loss": 1.9637689590454102, "eval_runtime": 2.4134, "eval_samples_per_second": 805.906, "eval_steps_per_second": 12.845, "step": 200 }, { "epoch": 22.313725490196077, "eval_loss": 1.9350510835647583, "eval_runtime": 2.4228, "eval_samples_per_second": 802.787, "eval_steps_per_second": 12.795, "step": 201 }, { "epoch": 22.41830065359477, "eval_loss": 1.9184238910675049, "eval_runtime": 2.4127, "eval_samples_per_second": 806.144, "eval_steps_per_second": 12.849, "step": 202 }, { "epoch": 22.522875816993466, "eval_loss": 1.9656862020492554, "eval_runtime": 2.4145, "eval_samples_per_second": 805.559, "eval_steps_per_second": 12.839, "step": 203 }, { "epoch": 22.627450980392158, "eval_loss": 1.9108870029449463, "eval_runtime": 2.3915, "eval_samples_per_second": 813.292, "eval_steps_per_second": 12.962, "step": 204 }, { "epoch": 22.73202614379085, "eval_loss": 1.9319818019866943, "eval_runtime": 2.3918, "eval_samples_per_second": 813.179, "eval_steps_per_second": 12.961, "step": 205 }, { "epoch": 22.836601307189543, "eval_loss": 1.9680215120315552, "eval_runtime": 2.4001, "eval_samples_per_second": 810.398, "eval_steps_per_second": 12.916, "step": 206 }, { "epoch": 22.941176470588236, "eval_loss": 1.9628697633743286, "eval_runtime": 2.3906, "eval_samples_per_second": 813.608, "eval_steps_per_second": 12.968, "step": 207 }, { "epoch": 22.941176470588236, "grad_norm": 1.8930681943893433, "learning_rate": 1.1489361702127662e-05, "loss": 1.8246, "step": 207 }, { "epoch": 23.104575163398692, "eval_loss": 1.9429619312286377, "eval_runtime": 2.3877, "eval_samples_per_second": 814.582, "eval_steps_per_second": 12.983, "step": 208 }, { "epoch": 23.209150326797385, "eval_loss": 1.9262027740478516, "eval_runtime": 2.5188, "eval_samples_per_second": 772.187, "eval_steps_per_second": 12.307, "step": 209 }, { "epoch": 23.313725490196077, "eval_loss": 1.9614677429199219, "eval_runtime": 2.4709, "eval_samples_per_second": 787.175, "eval_steps_per_second": 12.546, "step": 210 }, { "epoch": 23.41830065359477, "eval_loss": 1.9559693336486816, "eval_runtime": 2.4152, "eval_samples_per_second": 805.308, "eval_steps_per_second": 12.835, "step": 211 }, { "epoch": 23.522875816993466, "eval_loss": 1.966059923171997, "eval_runtime": 2.4122, "eval_samples_per_second": 806.317, "eval_steps_per_second": 12.851, "step": 212 }, { "epoch": 23.627450980392158, "eval_loss": 1.9780749082565308, "eval_runtime": 2.437, "eval_samples_per_second": 798.101, "eval_steps_per_second": 12.72, "step": 213 }, { "epoch": 23.73202614379085, "eval_loss": 1.980626106262207, "eval_runtime": 2.404, "eval_samples_per_second": 809.055, "eval_steps_per_second": 12.895, "step": 214 }, { "epoch": 23.836601307189543, "eval_loss": 1.9735476970672607, "eval_runtime": 2.3933, "eval_samples_per_second": 812.67, "eval_steps_per_second": 12.953, "step": 215 }, { "epoch": 23.941176470588236, "eval_loss": 1.9582773447036743, "eval_runtime": 2.4128, "eval_samples_per_second": 806.128, "eval_steps_per_second": 12.848, "step": 216 }, { "epoch": 23.941176470588236, "grad_norm": 1.8361761569976807, "learning_rate": 1.1063829787234044e-05, "loss": 1.8181, "step": 216 }, { "epoch": 24.104575163398692, "eval_loss": 1.9554569721221924, "eval_runtime": 2.387, "eval_samples_per_second": 814.839, "eval_steps_per_second": 12.987, "step": 217 }, { "epoch": 24.209150326797385, "eval_loss": 1.9165290594100952, "eval_runtime": 2.3919, "eval_samples_per_second": 813.148, "eval_steps_per_second": 12.96, "step": 218 }, { "epoch": 24.313725490196077, "eval_loss": 1.9637575149536133, "eval_runtime": 2.4007, "eval_samples_per_second": 810.191, "eval_steps_per_second": 12.913, "step": 219 }, { "epoch": 24.41830065359477, "eval_loss": 2.000793218612671, "eval_runtime": 2.4153, "eval_samples_per_second": 805.272, "eval_steps_per_second": 12.835, "step": 220 }, { "epoch": 24.522875816993466, "eval_loss": 1.9246618747711182, "eval_runtime": 2.4113, "eval_samples_per_second": 806.61, "eval_steps_per_second": 12.856, "step": 221 }, { "epoch": 24.627450980392158, "eval_loss": 1.9719598293304443, "eval_runtime": 2.4103, "eval_samples_per_second": 806.944, "eval_steps_per_second": 12.861, "step": 222 }, { "epoch": 24.73202614379085, "eval_loss": 2.008406400680542, "eval_runtime": 2.4142, "eval_samples_per_second": 805.666, "eval_steps_per_second": 12.841, "step": 223 }, { "epoch": 24.836601307189543, "eval_loss": 1.942387580871582, "eval_runtime": 2.4335, "eval_samples_per_second": 799.261, "eval_steps_per_second": 12.739, "step": 224 }, { "epoch": 24.941176470588236, "eval_loss": 1.9110654592514038, "eval_runtime": 2.3897, "eval_samples_per_second": 813.923, "eval_steps_per_second": 12.973, "step": 225 }, { "epoch": 24.941176470588236, "grad_norm": 1.8342725038528442, "learning_rate": 1.0638297872340426e-05, "loss": 1.797, "step": 225 }, { "epoch": 25.104575163398692, "eval_loss": 1.9787415266036987, "eval_runtime": 2.4748, "eval_samples_per_second": 785.908, "eval_steps_per_second": 12.526, "step": 226 }, { "epoch": 25.209150326797385, "eval_loss": 1.9613263607025146, "eval_runtime": 2.3841, "eval_samples_per_second": 815.829, "eval_steps_per_second": 13.003, "step": 227 }, { "epoch": 25.313725490196077, "eval_loss": 1.8806324005126953, "eval_runtime": 2.4198, "eval_samples_per_second": 803.779, "eval_steps_per_second": 12.811, "step": 228 }, { "epoch": 25.41830065359477, "eval_loss": 1.9231013059616089, "eval_runtime": 2.3851, "eval_samples_per_second": 815.485, "eval_steps_per_second": 12.997, "step": 229 }, { "epoch": 25.522875816993466, "eval_loss": 1.9021631479263306, "eval_runtime": 2.3853, "eval_samples_per_second": 815.428, "eval_steps_per_second": 12.997, "step": 230 }, { "epoch": 25.627450980392158, "eval_loss": 1.9682537317276, "eval_runtime": 2.4444, "eval_samples_per_second": 795.691, "eval_steps_per_second": 12.682, "step": 231 }, { "epoch": 25.73202614379085, "eval_loss": 1.9824862480163574, "eval_runtime": 2.4349, "eval_samples_per_second": 798.799, "eval_steps_per_second": 12.732, "step": 232 }, { "epoch": 25.836601307189543, "eval_loss": 1.962891936302185, "eval_runtime": 2.411, "eval_samples_per_second": 806.729, "eval_steps_per_second": 12.858, "step": 233 }, { "epoch": 25.941176470588236, "eval_loss": 1.9116088151931763, "eval_runtime": 2.4247, "eval_samples_per_second": 802.172, "eval_steps_per_second": 12.785, "step": 234 }, { "epoch": 25.941176470588236, "grad_norm": 1.9028220176696777, "learning_rate": 1.0212765957446808e-05, "loss": 1.7749, "step": 234 }, { "epoch": 26.104575163398692, "eval_loss": 1.9699651002883911, "eval_runtime": 2.4102, "eval_samples_per_second": 806.995, "eval_steps_per_second": 12.862, "step": 235 }, { "epoch": 26.209150326797385, "eval_loss": 1.9811697006225586, "eval_runtime": 2.386, "eval_samples_per_second": 815.174, "eval_steps_per_second": 12.992, "step": 236 }, { "epoch": 26.313725490196077, "eval_loss": 1.9248907566070557, "eval_runtime": 2.3932, "eval_samples_per_second": 812.727, "eval_steps_per_second": 12.953, "step": 237 }, { "epoch": 26.41830065359477, "eval_loss": 1.9683917760849, "eval_runtime": 2.3996, "eval_samples_per_second": 810.537, "eval_steps_per_second": 12.919, "step": 238 }, { "epoch": 26.522875816993466, "eval_loss": 1.9604750871658325, "eval_runtime": 2.3993, "eval_samples_per_second": 810.666, "eval_steps_per_second": 12.921, "step": 239 }, { "epoch": 26.627450980392158, "eval_loss": 1.8918408155441284, "eval_runtime": 2.4162, "eval_samples_per_second": 804.968, "eval_steps_per_second": 12.83, "step": 240 }, { "epoch": 26.73202614379085, "eval_loss": 1.9443118572235107, "eval_runtime": 2.3904, "eval_samples_per_second": 813.679, "eval_steps_per_second": 12.969, "step": 241 }, { "epoch": 26.836601307189543, "eval_loss": 1.9147528409957886, "eval_runtime": 2.4305, "eval_samples_per_second": 800.261, "eval_steps_per_second": 12.755, "step": 242 }, { "epoch": 26.941176470588236, "eval_loss": 1.8974157571792603, "eval_runtime": 2.4514, "eval_samples_per_second": 793.417, "eval_steps_per_second": 12.646, "step": 243 }, { "epoch": 26.941176470588236, "grad_norm": 1.7782148122787476, "learning_rate": 9.787234042553192e-06, "loss": 1.8022, "step": 243 }, { "epoch": 27.104575163398692, "eval_loss": 1.9711873531341553, "eval_runtime": 2.4377, "eval_samples_per_second": 797.883, "eval_steps_per_second": 12.717, "step": 244 }, { "epoch": 27.209150326797385, "eval_loss": 1.9718581438064575, "eval_runtime": 2.404, "eval_samples_per_second": 809.082, "eval_steps_per_second": 12.895, "step": 245 }, { "epoch": 27.313725490196077, "eval_loss": 1.9540036916732788, "eval_runtime": 2.411, "eval_samples_per_second": 806.731, "eval_steps_per_second": 12.858, "step": 246 }, { "epoch": 27.41830065359477, "eval_loss": 1.8907063007354736, "eval_runtime": 2.3875, "eval_samples_per_second": 814.658, "eval_steps_per_second": 12.984, "step": 247 }, { "epoch": 27.522875816993466, "eval_loss": 1.9907869100570679, "eval_runtime": 2.3956, "eval_samples_per_second": 811.916, "eval_steps_per_second": 12.941, "step": 248 }, { "epoch": 27.627450980392158, "eval_loss": 1.9273970127105713, "eval_runtime": 2.3922, "eval_samples_per_second": 813.044, "eval_steps_per_second": 12.959, "step": 249 }, { "epoch": 27.73202614379085, "eval_loss": 1.9233652353286743, "eval_runtime": 2.3912, "eval_samples_per_second": 813.399, "eval_steps_per_second": 12.964, "step": 250 }, { "epoch": 27.836601307189543, "eval_loss": 1.9580994844436646, "eval_runtime": 2.3875, "eval_samples_per_second": 814.648, "eval_steps_per_second": 12.984, "step": 251 }, { "epoch": 27.941176470588236, "eval_loss": 1.9409220218658447, "eval_runtime": 2.4743, "eval_samples_per_second": 786.095, "eval_steps_per_second": 12.529, "step": 252 }, { "epoch": 27.941176470588236, "grad_norm": 1.8487893342971802, "learning_rate": 9.361702127659576e-06, "loss": 1.7879, "step": 252 }, { "epoch": 28.104575163398692, "eval_loss": 1.8715720176696777, "eval_runtime": 2.4867, "eval_samples_per_second": 782.152, "eval_steps_per_second": 12.466, "step": 253 }, { "epoch": 28.209150326797385, "eval_loss": 1.9945265054702759, "eval_runtime": 2.4913, "eval_samples_per_second": 780.713, "eval_steps_per_second": 12.443, "step": 254 }, { "epoch": 28.313725490196077, "eval_loss": 1.8658331632614136, "eval_runtime": 2.4682, "eval_samples_per_second": 788.013, "eval_steps_per_second": 12.56, "step": 255 }, { "epoch": 28.41830065359477, "eval_loss": 1.946846604347229, "eval_runtime": 2.4685, "eval_samples_per_second": 787.912, "eval_steps_per_second": 12.558, "step": 256 }, { "epoch": 28.522875816993466, "eval_loss": 1.9456650018692017, "eval_runtime": 2.4963, "eval_samples_per_second": 779.142, "eval_steps_per_second": 12.418, "step": 257 }, { "epoch": 28.627450980392158, "eval_loss": 1.9555299282073975, "eval_runtime": 2.467, "eval_samples_per_second": 788.417, "eval_steps_per_second": 12.566, "step": 258 }, { "epoch": 28.73202614379085, "eval_loss": 1.9544572830200195, "eval_runtime": 2.4342, "eval_samples_per_second": 799.038, "eval_steps_per_second": 12.735, "step": 259 }, { "epoch": 28.836601307189543, "eval_loss": 1.9225515127182007, "eval_runtime": 2.3903, "eval_samples_per_second": 813.707, "eval_steps_per_second": 12.969, "step": 260 }, { "epoch": 28.941176470588236, "eval_loss": 1.9331358671188354, "eval_runtime": 2.3878, "eval_samples_per_second": 814.568, "eval_steps_per_second": 12.983, "step": 261 }, { "epoch": 28.941176470588236, "grad_norm": 1.77451491355896, "learning_rate": 8.936170212765958e-06, "loss": 1.8019, "step": 261 }, { "epoch": 29.104575163398692, "eval_loss": 1.9785720109939575, "eval_runtime": 2.4363, "eval_samples_per_second": 798.34, "eval_steps_per_second": 12.724, "step": 262 }, { "epoch": 29.209150326797385, "eval_loss": 1.9767541885375977, "eval_runtime": 2.4358, "eval_samples_per_second": 798.515, "eval_steps_per_second": 12.727, "step": 263 }, { "epoch": 29.313725490196077, "eval_loss": 1.9601216316223145, "eval_runtime": 2.4048, "eval_samples_per_second": 808.808, "eval_steps_per_second": 12.891, "step": 264 }, { "epoch": 29.41830065359477, "eval_loss": 1.917155385017395, "eval_runtime": 2.4141, "eval_samples_per_second": 805.697, "eval_steps_per_second": 12.841, "step": 265 }, { "epoch": 29.522875816993466, "eval_loss": 1.922187328338623, "eval_runtime": 2.4093, "eval_samples_per_second": 807.305, "eval_steps_per_second": 12.867, "step": 266 }, { "epoch": 29.627450980392158, "eval_loss": 1.918397307395935, "eval_runtime": 2.3948, "eval_samples_per_second": 812.187, "eval_steps_per_second": 12.945, "step": 267 }, { "epoch": 29.73202614379085, "eval_loss": 1.882236123085022, "eval_runtime": 2.3912, "eval_samples_per_second": 813.4, "eval_steps_per_second": 12.964, "step": 268 }, { "epoch": 29.836601307189543, "eval_loss": 1.916178822517395, "eval_runtime": 2.3869, "eval_samples_per_second": 814.874, "eval_steps_per_second": 12.988, "step": 269 }, { "epoch": 29.941176470588236, "eval_loss": 1.9769715070724487, "eval_runtime": 2.3858, "eval_samples_per_second": 815.249, "eval_steps_per_second": 12.994, "step": 270 }, { "epoch": 29.941176470588236, "grad_norm": 2.04988431930542, "learning_rate": 8.510638297872341e-06, "loss": 1.7614, "step": 270 }, { "epoch": 30.104575163398692, "eval_loss": 1.9033125638961792, "eval_runtime": 2.4701, "eval_samples_per_second": 787.431, "eval_steps_per_second": 12.55, "step": 271 }, { "epoch": 30.209150326797385, "eval_loss": 1.9454644918441772, "eval_runtime": 2.4111, "eval_samples_per_second": 806.698, "eval_steps_per_second": 12.857, "step": 272 }, { "epoch": 30.313725490196077, "eval_loss": 1.9106584787368774, "eval_runtime": 2.4243, "eval_samples_per_second": 802.304, "eval_steps_per_second": 12.787, "step": 273 }, { "epoch": 30.41830065359477, "eval_loss": 1.9812813997268677, "eval_runtime": 2.4148, "eval_samples_per_second": 805.466, "eval_steps_per_second": 12.838, "step": 274 }, { "epoch": 30.522875816993466, "eval_loss": 1.9426772594451904, "eval_runtime": 2.3997, "eval_samples_per_second": 810.526, "eval_steps_per_second": 12.918, "step": 275 }, { "epoch": 30.627450980392158, "eval_loss": 1.9499095678329468, "eval_runtime": 2.3957, "eval_samples_per_second": 811.863, "eval_steps_per_second": 12.94, "step": 276 }, { "epoch": 30.73202614379085, "eval_loss": 1.961235523223877, "eval_runtime": 2.3821, "eval_samples_per_second": 816.509, "eval_steps_per_second": 13.014, "step": 277 }, { "epoch": 30.836601307189543, "eval_loss": 1.9450849294662476, "eval_runtime": 2.3942, "eval_samples_per_second": 812.387, "eval_steps_per_second": 12.948, "step": 278 }, { "epoch": 30.941176470588236, "eval_loss": 1.9132739305496216, "eval_runtime": 2.4136, "eval_samples_per_second": 805.836, "eval_steps_per_second": 12.844, "step": 279 }, { "epoch": 30.941176470588236, "grad_norm": 1.8763809204101562, "learning_rate": 8.085106382978723e-06, "loss": 1.7619, "step": 279 }, { "epoch": 31.104575163398692, "eval_loss": 1.9205107688903809, "eval_runtime": 2.4332, "eval_samples_per_second": 799.374, "eval_steps_per_second": 12.741, "step": 280 }, { "epoch": 31.209150326797385, "eval_loss": 1.946841835975647, "eval_runtime": 2.464, "eval_samples_per_second": 789.356, "eval_steps_per_second": 12.581, "step": 281 }, { "epoch": 31.313725490196077, "eval_loss": 1.9464671611785889, "eval_runtime": 2.4074, "eval_samples_per_second": 807.917, "eval_steps_per_second": 12.877, "step": 282 }, { "epoch": 31.41830065359477, "eval_loss": 1.8833441734313965, "eval_runtime": 2.4082, "eval_samples_per_second": 807.64, "eval_steps_per_second": 12.872, "step": 283 }, { "epoch": 31.522875816993466, "eval_loss": 1.9414160251617432, "eval_runtime": 2.407, "eval_samples_per_second": 808.072, "eval_steps_per_second": 12.879, "step": 284 }, { "epoch": 31.627450980392158, "eval_loss": 1.9485697746276855, "eval_runtime": 2.4025, "eval_samples_per_second": 809.577, "eval_steps_per_second": 12.903, "step": 285 }, { "epoch": 31.73202614379085, "eval_loss": 1.9184815883636475, "eval_runtime": 2.3828, "eval_samples_per_second": 816.275, "eval_steps_per_second": 13.01, "step": 286 }, { "epoch": 31.836601307189543, "eval_loss": 1.9518897533416748, "eval_runtime": 2.4096, "eval_samples_per_second": 807.189, "eval_steps_per_second": 12.865, "step": 287 }, { "epoch": 31.941176470588236, "eval_loss": 1.9385578632354736, "eval_runtime": 2.4088, "eval_samples_per_second": 807.447, "eval_steps_per_second": 12.869, "step": 288 }, { "epoch": 31.941176470588236, "grad_norm": 1.8053061962127686, "learning_rate": 7.659574468085107e-06, "loss": 1.7713, "step": 288 }, { "epoch": 32.10457516339869, "eval_loss": 1.8966560363769531, "eval_runtime": 2.4403, "eval_samples_per_second": 797.048, "eval_steps_per_second": 12.704, "step": 289 }, { "epoch": 32.209150326797385, "eval_loss": 1.9649851322174072, "eval_runtime": 2.4092, "eval_samples_per_second": 807.307, "eval_steps_per_second": 12.867, "step": 290 }, { "epoch": 32.31372549019608, "eval_loss": 1.919927954673767, "eval_runtime": 2.41, "eval_samples_per_second": 807.057, "eval_steps_per_second": 12.863, "step": 291 }, { "epoch": 32.41830065359477, "eval_loss": 1.9147096872329712, "eval_runtime": 2.4127, "eval_samples_per_second": 806.134, "eval_steps_per_second": 12.848, "step": 292 }, { "epoch": 32.52287581699346, "eval_loss": 1.9159774780273438, "eval_runtime": 2.4052, "eval_samples_per_second": 808.673, "eval_steps_per_second": 12.889, "step": 293 }, { "epoch": 32.627450980392155, "eval_loss": 1.9953843355178833, "eval_runtime": 2.3906, "eval_samples_per_second": 813.616, "eval_steps_per_second": 12.968, "step": 294 }, { "epoch": 32.73202614379085, "eval_loss": 1.92180597782135, "eval_runtime": 2.4249, "eval_samples_per_second": 802.081, "eval_steps_per_second": 12.784, "step": 295 }, { "epoch": 32.83660130718954, "eval_loss": 1.9786967039108276, "eval_runtime": 2.3846, "eval_samples_per_second": 815.658, "eval_steps_per_second": 13.0, "step": 296 }, { "epoch": 32.94117647058823, "eval_loss": 1.9362424612045288, "eval_runtime": 2.3879, "eval_samples_per_second": 814.534, "eval_steps_per_second": 12.982, "step": 297 }, { "epoch": 32.94117647058823, "grad_norm": 2.0353338718414307, "learning_rate": 7.234042553191491e-06, "loss": 1.7635, "step": 297 }, { "epoch": 33.10457516339869, "eval_loss": 1.9281338453292847, "eval_runtime": 2.4396, "eval_samples_per_second": 797.251, "eval_steps_per_second": 12.707, "step": 298 }, { "epoch": 33.209150326797385, "eval_loss": 1.921140193939209, "eval_runtime": 2.4118, "eval_samples_per_second": 806.459, "eval_steps_per_second": 12.854, "step": 299 }, { "epoch": 33.31372549019608, "eval_loss": 1.9680968523025513, "eval_runtime": 2.4046, "eval_samples_per_second": 808.854, "eval_steps_per_second": 12.892, "step": 300 }, { "epoch": 33.41830065359477, "eval_loss": 1.9094316959381104, "eval_runtime": 2.4065, "eval_samples_per_second": 808.218, "eval_steps_per_second": 12.882, "step": 301 }, { "epoch": 33.52287581699346, "eval_loss": 1.9845983982086182, "eval_runtime": 2.4045, "eval_samples_per_second": 808.905, "eval_steps_per_second": 12.893, "step": 302 }, { "epoch": 33.627450980392155, "eval_loss": 1.9461405277252197, "eval_runtime": 2.4098, "eval_samples_per_second": 807.107, "eval_steps_per_second": 12.864, "step": 303 }, { "epoch": 33.73202614379085, "eval_loss": 1.8947722911834717, "eval_runtime": 2.392, "eval_samples_per_second": 813.124, "eval_steps_per_second": 12.96, "step": 304 }, { "epoch": 33.83660130718954, "eval_loss": 1.9371235370635986, "eval_runtime": 2.3855, "eval_samples_per_second": 815.347, "eval_steps_per_second": 12.995, "step": 305 }, { "epoch": 33.94117647058823, "eval_loss": 2.006459951400757, "eval_runtime": 2.4252, "eval_samples_per_second": 802.01, "eval_steps_per_second": 12.783, "step": 306 }, { "epoch": 33.94117647058823, "grad_norm": 1.884189486503601, "learning_rate": 6.808510638297873e-06, "loss": 1.7394, "step": 306 }, { "epoch": 34.10457516339869, "eval_loss": 1.9282273054122925, "eval_runtime": 2.4222, "eval_samples_per_second": 802.994, "eval_steps_per_second": 12.798, "step": 307 }, { "epoch": 34.209150326797385, "eval_loss": 1.9412920475006104, "eval_runtime": 2.5082, "eval_samples_per_second": 775.463, "eval_steps_per_second": 12.36, "step": 308 }, { "epoch": 34.31372549019608, "eval_loss": 1.9883979558944702, "eval_runtime": 2.5353, "eval_samples_per_second": 767.18, "eval_steps_per_second": 12.228, "step": 309 }, { "epoch": 34.41830065359477, "eval_loss": 1.917364239692688, "eval_runtime": 2.4621, "eval_samples_per_second": 789.978, "eval_steps_per_second": 12.591, "step": 310 }, { "epoch": 34.52287581699346, "eval_loss": 1.9594651460647583, "eval_runtime": 2.5176, "eval_samples_per_second": 772.567, "eval_steps_per_second": 12.313, "step": 311 }, { "epoch": 34.627450980392155, "eval_loss": 1.9423621892929077, "eval_runtime": 2.5063, "eval_samples_per_second": 776.037, "eval_steps_per_second": 12.369, "step": 312 }, { "epoch": 34.73202614379085, "eval_loss": 1.9494574069976807, "eval_runtime": 2.4032, "eval_samples_per_second": 809.321, "eval_steps_per_second": 12.899, "step": 313 }, { "epoch": 34.83660130718954, "eval_loss": 1.9160590171813965, "eval_runtime": 2.3866, "eval_samples_per_second": 814.959, "eval_steps_per_second": 12.989, "step": 314 }, { "epoch": 34.94117647058823, "eval_loss": 1.963183879852295, "eval_runtime": 2.3866, "eval_samples_per_second": 814.96, "eval_steps_per_second": 12.989, "step": 315 }, { "epoch": 34.94117647058823, "grad_norm": 1.7952407598495483, "learning_rate": 6.382978723404256e-06, "loss": 1.7434, "step": 315 }, { "epoch": 35.10457516339869, "eval_loss": 1.9129880666732788, "eval_runtime": 2.3856, "eval_samples_per_second": 815.294, "eval_steps_per_second": 12.994, "step": 316 }, { "epoch": 35.209150326797385, "eval_loss": 1.9850044250488281, "eval_runtime": 2.4394, "eval_samples_per_second": 797.311, "eval_steps_per_second": 12.708, "step": 317 }, { "epoch": 35.31372549019608, "eval_loss": 1.9291285276412964, "eval_runtime": 2.3856, "eval_samples_per_second": 815.319, "eval_steps_per_second": 12.995, "step": 318 }, { "epoch": 35.41830065359477, "eval_loss": 1.9300141334533691, "eval_runtime": 2.5153, "eval_samples_per_second": 773.266, "eval_steps_per_second": 12.325, "step": 319 }, { "epoch": 35.52287581699346, "eval_loss": 1.9399768114089966, "eval_runtime": 2.4758, "eval_samples_per_second": 785.618, "eval_steps_per_second": 12.521, "step": 320 }, { "epoch": 35.627450980392155, "eval_loss": 1.9735783338546753, "eval_runtime": 2.4325, "eval_samples_per_second": 799.598, "eval_steps_per_second": 12.744, "step": 321 }, { "epoch": 35.73202614379085, "eval_loss": 1.9033024311065674, "eval_runtime": 2.4152, "eval_samples_per_second": 805.309, "eval_steps_per_second": 12.835, "step": 322 }, { "epoch": 35.83660130718954, "eval_loss": 1.9249202013015747, "eval_runtime": 2.4104, "eval_samples_per_second": 806.927, "eval_steps_per_second": 12.861, "step": 323 }, { "epoch": 35.94117647058823, "eval_loss": 1.9796316623687744, "eval_runtime": 2.4374, "eval_samples_per_second": 797.996, "eval_steps_per_second": 12.719, "step": 324 }, { "epoch": 35.94117647058823, "grad_norm": 1.8199615478515625, "learning_rate": 5.957446808510638e-06, "loss": 1.7578, "step": 324 }, { "epoch": 36.10457516339869, "eval_loss": 1.9595942497253418, "eval_runtime": 2.3871, "eval_samples_per_second": 814.78, "eval_steps_per_second": 12.986, "step": 325 }, { "epoch": 36.209150326797385, "eval_loss": 1.9293735027313232, "eval_runtime": 2.3875, "eval_samples_per_second": 814.674, "eval_steps_per_second": 12.985, "step": 326 }, { "epoch": 36.31372549019608, "eval_loss": 1.957201600074768, "eval_runtime": 2.3892, "eval_samples_per_second": 814.083, "eval_steps_per_second": 12.975, "step": 327 }, { "epoch": 36.41830065359477, "eval_loss": 1.9536631107330322, "eval_runtime": 2.4048, "eval_samples_per_second": 808.796, "eval_steps_per_second": 12.891, "step": 328 }, { "epoch": 36.52287581699346, "eval_loss": 1.974502682685852, "eval_runtime": 2.3928, "eval_samples_per_second": 812.843, "eval_steps_per_second": 12.955, "step": 329 }, { "epoch": 36.627450980392155, "eval_loss": 1.9568116664886475, "eval_runtime": 2.4183, "eval_samples_per_second": 804.288, "eval_steps_per_second": 12.819, "step": 330 }, { "epoch": 36.73202614379085, "eval_loss": 1.9689034223556519, "eval_runtime": 2.4676, "eval_samples_per_second": 788.229, "eval_steps_per_second": 12.563, "step": 331 }, { "epoch": 36.83660130718954, "eval_loss": 1.9140371084213257, "eval_runtime": 2.4088, "eval_samples_per_second": 807.459, "eval_steps_per_second": 12.87, "step": 332 }, { "epoch": 36.94117647058823, "eval_loss": 1.929794192314148, "eval_runtime": 2.4119, "eval_samples_per_second": 806.412, "eval_steps_per_second": 12.853, "step": 333 }, { "epoch": 36.94117647058823, "grad_norm": 1.8074049949645996, "learning_rate": 5.531914893617022e-06, "loss": 1.7497, "step": 333 }, { "epoch": 37.10457516339869, "eval_loss": 1.9698741436004639, "eval_runtime": 2.4082, "eval_samples_per_second": 807.67, "eval_steps_per_second": 12.873, "step": 334 }, { "epoch": 37.209150326797385, "eval_loss": 1.90766441822052, "eval_runtime": 2.3974, "eval_samples_per_second": 811.298, "eval_steps_per_second": 12.931, "step": 335 }, { "epoch": 37.31372549019608, "eval_loss": 1.9559139013290405, "eval_runtime": 2.3918, "eval_samples_per_second": 813.182, "eval_steps_per_second": 12.961, "step": 336 }, { "epoch": 37.41830065359477, "eval_loss": 1.9621520042419434, "eval_runtime": 2.4126, "eval_samples_per_second": 806.189, "eval_steps_per_second": 12.849, "step": 337 }, { "epoch": 37.52287581699346, "eval_loss": 1.9238826036453247, "eval_runtime": 2.3925, "eval_samples_per_second": 812.943, "eval_steps_per_second": 12.957, "step": 338 }, { "epoch": 37.627450980392155, "eval_loss": 1.9738985300064087, "eval_runtime": 2.4262, "eval_samples_per_second": 801.666, "eval_steps_per_second": 12.777, "step": 339 }, { "epoch": 37.73202614379085, "eval_loss": 1.936599612236023, "eval_runtime": 2.446, "eval_samples_per_second": 795.165, "eval_steps_per_second": 12.674, "step": 340 }, { "epoch": 37.83660130718954, "eval_loss": 1.9857661724090576, "eval_runtime": 2.4387, "eval_samples_per_second": 797.57, "eval_steps_per_second": 12.712, "step": 341 }, { "epoch": 37.94117647058823, "eval_loss": 1.9602775573730469, "eval_runtime": 2.4623, "eval_samples_per_second": 789.898, "eval_steps_per_second": 12.59, "step": 342 }, { "epoch": 37.94117647058823, "grad_norm": 1.8968150615692139, "learning_rate": 5.106382978723404e-06, "loss": 1.7378, "step": 342 }, { "epoch": 38.10457516339869, "eval_loss": 1.9392098188400269, "eval_runtime": 2.5445, "eval_samples_per_second": 764.389, "eval_steps_per_second": 12.183, "step": 343 }, { "epoch": 38.209150326797385, "eval_loss": 1.9554734230041504, "eval_runtime": 2.4153, "eval_samples_per_second": 805.294, "eval_steps_per_second": 12.835, "step": 344 }, { "epoch": 38.31372549019608, "eval_loss": 1.9802982807159424, "eval_runtime": 2.4043, "eval_samples_per_second": 808.971, "eval_steps_per_second": 12.894, "step": 345 }, { "epoch": 38.41830065359477, "eval_loss": 1.950205683708191, "eval_runtime": 2.408, "eval_samples_per_second": 807.739, "eval_steps_per_second": 12.874, "step": 346 }, { "epoch": 38.52287581699346, "eval_loss": 1.959083914756775, "eval_runtime": 2.3857, "eval_samples_per_second": 815.264, "eval_steps_per_second": 12.994, "step": 347 }, { "epoch": 38.627450980392155, "eval_loss": 1.9582518339157104, "eval_runtime": 2.4306, "eval_samples_per_second": 800.213, "eval_steps_per_second": 12.754, "step": 348 }, { "epoch": 38.73202614379085, "eval_loss": 1.9507373571395874, "eval_runtime": 2.3863, "eval_samples_per_second": 815.084, "eval_steps_per_second": 12.991, "step": 349 }, { "epoch": 38.83660130718954, "eval_loss": 1.9410823583602905, "eval_runtime": 2.4092, "eval_samples_per_second": 807.311, "eval_steps_per_second": 12.867, "step": 350 }, { "epoch": 38.94117647058823, "eval_loss": 1.922089695930481, "eval_runtime": 2.4113, "eval_samples_per_second": 806.625, "eval_steps_per_second": 12.856, "step": 351 }, { "epoch": 38.94117647058823, "grad_norm": 1.8066309690475464, "learning_rate": 4.680851063829788e-06, "loss": 1.7324, "step": 351 }, { "epoch": 39.10457516339869, "eval_loss": 1.9468454122543335, "eval_runtime": 2.3833, "eval_samples_per_second": 816.093, "eval_steps_per_second": 13.007, "step": 352 }, { "epoch": 39.209150326797385, "eval_loss": 1.9370498657226562, "eval_runtime": 2.3853, "eval_samples_per_second": 815.41, "eval_steps_per_second": 12.996, "step": 353 }, { "epoch": 39.31372549019608, "eval_loss": 1.9278494119644165, "eval_runtime": 2.4316, "eval_samples_per_second": 799.901, "eval_steps_per_second": 12.749, "step": 354 }, { "epoch": 39.41830065359477, "eval_loss": 1.9604259729385376, "eval_runtime": 2.4996, "eval_samples_per_second": 778.116, "eval_steps_per_second": 12.402, "step": 355 }, { "epoch": 39.52287581699346, "eval_loss": 1.9375855922698975, "eval_runtime": 2.4112, "eval_samples_per_second": 806.646, "eval_steps_per_second": 12.857, "step": 356 }, { "epoch": 39.627450980392155, "eval_loss": 1.9473201036453247, "eval_runtime": 2.4652, "eval_samples_per_second": 788.988, "eval_steps_per_second": 12.575, "step": 357 }, { "epoch": 39.73202614379085, "eval_loss": 1.9490294456481934, "eval_runtime": 2.4617, "eval_samples_per_second": 790.102, "eval_steps_per_second": 12.593, "step": 358 }, { "epoch": 39.83660130718954, "eval_loss": 1.913360834121704, "eval_runtime": 2.445, "eval_samples_per_second": 795.514, "eval_steps_per_second": 12.679, "step": 359 }, { "epoch": 39.94117647058823, "eval_loss": 1.9323056936264038, "eval_runtime": 2.4383, "eval_samples_per_second": 797.685, "eval_steps_per_second": 12.714, "step": 360 }, { "epoch": 39.94117647058823, "grad_norm": 1.7927449941635132, "learning_rate": 4.255319148936171e-06, "loss": 1.7195, "step": 360 }, { "epoch": 40.10457516339869, "eval_loss": 1.9119060039520264, "eval_runtime": 2.4209, "eval_samples_per_second": 803.426, "eval_steps_per_second": 12.805, "step": 361 }, { "epoch": 40.209150326797385, "eval_loss": 1.939374327659607, "eval_runtime": 2.4652, "eval_samples_per_second": 788.974, "eval_steps_per_second": 12.575, "step": 362 }, { "epoch": 40.31372549019608, "eval_loss": 1.9959666728973389, "eval_runtime": 2.3847, "eval_samples_per_second": 815.601, "eval_steps_per_second": 12.999, "step": 363 }, { "epoch": 40.41830065359477, "eval_loss": 1.9789413213729858, "eval_runtime": 2.3893, "eval_samples_per_second": 814.046, "eval_steps_per_second": 12.975, "step": 364 }, { "epoch": 40.52287581699346, "eval_loss": 1.9750434160232544, "eval_runtime": 2.3871, "eval_samples_per_second": 814.781, "eval_steps_per_second": 12.986, "step": 365 }, { "epoch": 40.627450980392155, "eval_loss": 1.9399486780166626, "eval_runtime": 2.4366, "eval_samples_per_second": 798.229, "eval_steps_per_second": 12.722, "step": 366 }, { "epoch": 40.73202614379085, "eval_loss": 1.9516425132751465, "eval_runtime": 2.4136, "eval_samples_per_second": 805.85, "eval_steps_per_second": 12.844, "step": 367 }, { "epoch": 40.83660130718954, "eval_loss": 1.941023826599121, "eval_runtime": 2.4094, "eval_samples_per_second": 807.251, "eval_steps_per_second": 12.866, "step": 368 }, { "epoch": 40.94117647058823, "eval_loss": 1.9317693710327148, "eval_runtime": 2.4091, "eval_samples_per_second": 807.345, "eval_steps_per_second": 12.868, "step": 369 }, { "epoch": 40.94117647058823, "grad_norm": 1.7872660160064697, "learning_rate": 3.8297872340425535e-06, "loss": 1.7043, "step": 369 }, { "epoch": 41.10457516339869, "eval_loss": 1.9890044927597046, "eval_runtime": 2.414, "eval_samples_per_second": 805.708, "eval_steps_per_second": 12.842, "step": 370 }, { "epoch": 41.209150326797385, "eval_loss": 1.9840960502624512, "eval_runtime": 2.3959, "eval_samples_per_second": 811.809, "eval_steps_per_second": 12.939, "step": 371 }, { "epoch": 41.31372549019608, "eval_loss": 1.9188443422317505, "eval_runtime": 2.4069, "eval_samples_per_second": 808.088, "eval_steps_per_second": 12.88, "step": 372 }, { "epoch": 41.41830065359477, "eval_loss": 1.9614779949188232, "eval_runtime": 2.3886, "eval_samples_per_second": 814.274, "eval_steps_per_second": 12.978, "step": 373 }, { "epoch": 41.52287581699346, "eval_loss": 1.9061365127563477, "eval_runtime": 2.3924, "eval_samples_per_second": 812.981, "eval_steps_per_second": 12.958, "step": 374 }, { "epoch": 41.627450980392155, "eval_loss": 1.909993052482605, "eval_runtime": 2.3858, "eval_samples_per_second": 815.236, "eval_steps_per_second": 12.993, "step": 375 }, { "epoch": 41.73202614379085, "eval_loss": 1.9422426223754883, "eval_runtime": 2.3888, "eval_samples_per_second": 814.201, "eval_steps_per_second": 12.977, "step": 376 }, { "epoch": 41.83660130718954, "eval_loss": 1.9640315771102905, "eval_runtime": 2.4376, "eval_samples_per_second": 797.911, "eval_steps_per_second": 12.717, "step": 377 }, { "epoch": 41.94117647058823, "eval_loss": 1.917662262916565, "eval_runtime": 2.4538, "eval_samples_per_second": 792.647, "eval_steps_per_second": 12.633, "step": 378 }, { "epoch": 41.94117647058823, "grad_norm": 1.7721134424209595, "learning_rate": 3.4042553191489363e-06, "loss": 1.7169, "step": 378 }, { "epoch": 42.10457516339869, "eval_loss": 1.9163570404052734, "eval_runtime": 2.4281, "eval_samples_per_second": 801.034, "eval_steps_per_second": 12.767, "step": 379 }, { "epoch": 42.209150326797385, "eval_loss": 1.9374709129333496, "eval_runtime": 2.4192, "eval_samples_per_second": 803.999, "eval_steps_per_second": 12.814, "step": 380 }, { "epoch": 42.31372549019608, "eval_loss": 1.9525771141052246, "eval_runtime": 2.4174, "eval_samples_per_second": 804.581, "eval_steps_per_second": 12.824, "step": 381 }, { "epoch": 42.41830065359477, "eval_loss": 1.938783884048462, "eval_runtime": 2.3891, "eval_samples_per_second": 814.119, "eval_steps_per_second": 12.976, "step": 382 }, { "epoch": 42.52287581699346, "eval_loss": 1.9378857612609863, "eval_runtime": 2.3879, "eval_samples_per_second": 814.515, "eval_steps_per_second": 12.982, "step": 383 }, { "epoch": 42.627450980392155, "eval_loss": 1.931535243988037, "eval_runtime": 2.3924, "eval_samples_per_second": 812.994, "eval_steps_per_second": 12.958, "step": 384 }, { "epoch": 42.73202614379085, "eval_loss": 1.9418144226074219, "eval_runtime": 2.3917, "eval_samples_per_second": 813.219, "eval_steps_per_second": 12.961, "step": 385 }, { "epoch": 42.83660130718954, "eval_loss": 1.9460214376449585, "eval_runtime": 2.417, "eval_samples_per_second": 804.722, "eval_steps_per_second": 12.826, "step": 386 }, { "epoch": 42.94117647058823, "eval_loss": 1.9129729270935059, "eval_runtime": 2.3889, "eval_samples_per_second": 814.19, "eval_steps_per_second": 12.977, "step": 387 }, { "epoch": 42.94117647058823, "grad_norm": 1.8166015148162842, "learning_rate": 2.978723404255319e-06, "loss": 1.7315, "step": 387 }, { "epoch": 43.10457516339869, "eval_loss": 1.9539881944656372, "eval_runtime": 2.4601, "eval_samples_per_second": 790.624, "eval_steps_per_second": 12.601, "step": 388 }, { "epoch": 43.209150326797385, "eval_loss": 1.951253890991211, "eval_runtime": 2.4131, "eval_samples_per_second": 806.019, "eval_steps_per_second": 12.847, "step": 389 }, { "epoch": 43.31372549019608, "eval_loss": 2.0078840255737305, "eval_runtime": 2.4151, "eval_samples_per_second": 805.365, "eval_steps_per_second": 12.836, "step": 390 }, { "epoch": 43.41830065359477, "eval_loss": 1.9754467010498047, "eval_runtime": 2.409, "eval_samples_per_second": 807.392, "eval_steps_per_second": 12.868, "step": 391 }, { "epoch": 43.52287581699346, "eval_loss": 1.972512125968933, "eval_runtime": 2.4172, "eval_samples_per_second": 804.635, "eval_steps_per_second": 12.825, "step": 392 }, { "epoch": 43.627450980392155, "eval_loss": 1.9601085186004639, "eval_runtime": 2.4555, "eval_samples_per_second": 792.113, "eval_steps_per_second": 12.625, "step": 393 }, { "epoch": 43.73202614379085, "eval_loss": 1.9266124963760376, "eval_runtime": 2.4604, "eval_samples_per_second": 790.516, "eval_steps_per_second": 12.599, "step": 394 }, { "epoch": 43.83660130718954, "eval_loss": 1.9546335935592651, "eval_runtime": 2.467, "eval_samples_per_second": 788.405, "eval_steps_per_second": 12.566, "step": 395 }, { "epoch": 43.94117647058823, "eval_loss": 1.918619990348816, "eval_runtime": 2.4607, "eval_samples_per_second": 790.428, "eval_steps_per_second": 12.598, "step": 396 }, { "epoch": 43.94117647058823, "grad_norm": 1.8490442037582397, "learning_rate": 2.553191489361702e-06, "loss": 1.7095, "step": 396 }, { "epoch": 44.10457516339869, "eval_loss": 1.9544674158096313, "eval_runtime": 2.5483, "eval_samples_per_second": 763.254, "eval_steps_per_second": 12.165, "step": 397 }, { "epoch": 44.209150326797385, "eval_loss": 2.0218536853790283, "eval_runtime": 2.578, "eval_samples_per_second": 754.467, "eval_steps_per_second": 12.025, "step": 398 }, { "epoch": 44.31372549019608, "eval_loss": 1.9499460458755493, "eval_runtime": 2.4731, "eval_samples_per_second": 786.477, "eval_steps_per_second": 12.535, "step": 399 }, { "epoch": 44.41830065359477, "eval_loss": 1.9414080381393433, "eval_runtime": 2.4805, "eval_samples_per_second": 784.104, "eval_steps_per_second": 12.497, "step": 400 }, { "epoch": 44.52287581699346, "eval_loss": 1.961714506149292, "eval_runtime": 2.4673, "eval_samples_per_second": 788.301, "eval_steps_per_second": 12.564, "step": 401 }, { "epoch": 44.627450980392155, "eval_loss": 1.9939833879470825, "eval_runtime": 2.4722, "eval_samples_per_second": 786.751, "eval_steps_per_second": 12.539, "step": 402 }, { "epoch": 44.73202614379085, "eval_loss": 1.9617350101470947, "eval_runtime": 2.4683, "eval_samples_per_second": 787.977, "eval_steps_per_second": 12.559, "step": 403 }, { "epoch": 44.83660130718954, "eval_loss": 1.9692201614379883, "eval_runtime": 2.4374, "eval_samples_per_second": 797.97, "eval_steps_per_second": 12.718, "step": 404 }, { "epoch": 44.94117647058823, "eval_loss": 1.9219003915786743, "eval_runtime": 2.435, "eval_samples_per_second": 798.78, "eval_steps_per_second": 12.731, "step": 405 }, { "epoch": 44.94117647058823, "grad_norm": 1.796848177909851, "learning_rate": 2.1276595744680853e-06, "loss": 1.7071, "step": 405 }, { "epoch": 45.10457516339869, "eval_loss": 1.9611177444458008, "eval_runtime": 2.4195, "eval_samples_per_second": 803.881, "eval_steps_per_second": 12.812, "step": 406 }, { "epoch": 45.209150326797385, "eval_loss": 1.9778918027877808, "eval_runtime": 2.4197, "eval_samples_per_second": 803.821, "eval_steps_per_second": 12.812, "step": 407 }, { "epoch": 45.31372549019608, "eval_loss": 1.9238309860229492, "eval_runtime": 2.4635, "eval_samples_per_second": 789.519, "eval_steps_per_second": 12.584, "step": 408 }, { "epoch": 45.41830065359477, "eval_loss": 1.9089758396148682, "eval_runtime": 2.4247, "eval_samples_per_second": 802.146, "eval_steps_per_second": 12.785, "step": 409 }, { "epoch": 45.52287581699346, "eval_loss": 1.9342485666275024, "eval_runtime": 2.4113, "eval_samples_per_second": 806.609, "eval_steps_per_second": 12.856, "step": 410 }, { "epoch": 45.627450980392155, "eval_loss": 1.9936097860336304, "eval_runtime": 2.4133, "eval_samples_per_second": 805.937, "eval_steps_per_second": 12.845, "step": 411 }, { "epoch": 45.73202614379085, "eval_loss": 1.897844672203064, "eval_runtime": 2.4034, "eval_samples_per_second": 809.278, "eval_steps_per_second": 12.899, "step": 412 }, { "epoch": 45.83660130718954, "eval_loss": 1.9208406209945679, "eval_runtime": 2.3918, "eval_samples_per_second": 813.195, "eval_steps_per_second": 12.961, "step": 413 }, { "epoch": 45.94117647058823, "eval_loss": 1.9177494049072266, "eval_runtime": 2.4571, "eval_samples_per_second": 791.578, "eval_steps_per_second": 12.616, "step": 414 }, { "epoch": 45.94117647058823, "grad_norm": 1.7152032852172852, "learning_rate": 1.7021276595744682e-06, "loss": 1.7116, "step": 414 }, { "epoch": 46.10457516339869, "eval_loss": 1.957858681678772, "eval_runtime": 2.4339, "eval_samples_per_second": 799.125, "eval_steps_per_second": 12.737, "step": 415 }, { "epoch": 46.209150326797385, "eval_loss": 1.9422305822372437, "eval_runtime": 2.4497, "eval_samples_per_second": 793.973, "eval_steps_per_second": 12.655, "step": 416 }, { "epoch": 46.31372549019608, "eval_loss": 1.9287089109420776, "eval_runtime": 2.423, "eval_samples_per_second": 802.723, "eval_steps_per_second": 12.794, "step": 417 }, { "epoch": 46.41830065359477, "eval_loss": 1.9444739818572998, "eval_runtime": 2.4956, "eval_samples_per_second": 779.376, "eval_steps_per_second": 12.422, "step": 418 }, { "epoch": 46.52287581699346, "eval_loss": 1.923707127571106, "eval_runtime": 2.4651, "eval_samples_per_second": 789.011, "eval_steps_per_second": 12.575, "step": 419 }, { "epoch": 46.627450980392155, "eval_loss": 1.9269739389419556, "eval_runtime": 2.4648, "eval_samples_per_second": 789.121, "eval_steps_per_second": 12.577, "step": 420 }, { "epoch": 46.73202614379085, "eval_loss": 1.9492802619934082, "eval_runtime": 2.4582, "eval_samples_per_second": 791.229, "eval_steps_per_second": 12.611, "step": 421 }, { "epoch": 46.83660130718954, "eval_loss": 1.9743090867996216, "eval_runtime": 2.4551, "eval_samples_per_second": 792.232, "eval_steps_per_second": 12.627, "step": 422 }, { "epoch": 46.94117647058823, "eval_loss": 1.957751750946045, "eval_runtime": 2.434, "eval_samples_per_second": 799.105, "eval_steps_per_second": 12.736, "step": 423 }, { "epoch": 46.94117647058823, "grad_norm": 1.9101431369781494, "learning_rate": 1.276595744680851e-06, "loss": 1.733, "step": 423 }, { "epoch": 47.10457516339869, "eval_loss": 1.9019426107406616, "eval_runtime": 2.4414, "eval_samples_per_second": 796.664, "eval_steps_per_second": 12.697, "step": 424 }, { "epoch": 47.209150326797385, "eval_loss": 1.944284200668335, "eval_runtime": 2.4514, "eval_samples_per_second": 793.435, "eval_steps_per_second": 12.646, "step": 425 }, { "epoch": 47.31372549019608, "eval_loss": 1.9661508798599243, "eval_runtime": 2.4442, "eval_samples_per_second": 795.747, "eval_steps_per_second": 12.683, "step": 426 }, { "epoch": 47.41830065359477, "eval_loss": 1.9728316068649292, "eval_runtime": 2.4822, "eval_samples_per_second": 783.586, "eval_steps_per_second": 12.489, "step": 427 }, { "epoch": 47.52287581699346, "eval_loss": 1.923363208770752, "eval_runtime": 2.4644, "eval_samples_per_second": 789.251, "eval_steps_per_second": 12.579, "step": 428 }, { "epoch": 47.627450980392155, "eval_loss": 1.9165805578231812, "eval_runtime": 2.3965, "eval_samples_per_second": 811.601, "eval_steps_per_second": 12.936, "step": 429 }, { "epoch": 47.73202614379085, "eval_loss": 1.9413442611694336, "eval_runtime": 2.4408, "eval_samples_per_second": 796.869, "eval_steps_per_second": 12.701, "step": 430 }, { "epoch": 47.83660130718954, "eval_loss": 1.8855735063552856, "eval_runtime": 2.4507, "eval_samples_per_second": 793.659, "eval_steps_per_second": 12.65, "step": 431 }, { "epoch": 47.94117647058823, "eval_loss": 1.952731728553772, "eval_runtime": 2.5241, "eval_samples_per_second": 770.567, "eval_steps_per_second": 12.282, "step": 432 }, { "epoch": 47.94117647058823, "grad_norm": 1.7582765817642212, "learning_rate": 8.510638297872341e-07, "loss": 1.7065, "step": 432 }, { "epoch": 48.10457516339869, "eval_loss": 1.9524160623550415, "eval_runtime": 2.4138, "eval_samples_per_second": 805.783, "eval_steps_per_second": 12.843, "step": 433 }, { "epoch": 48.209150326797385, "eval_loss": 1.9682825803756714, "eval_runtime": 2.5144, "eval_samples_per_second": 773.543, "eval_steps_per_second": 12.329, "step": 434 }, { "epoch": 48.31372549019608, "eval_loss": 1.9489309787750244, "eval_runtime": 2.5744, "eval_samples_per_second": 755.51, "eval_steps_per_second": 12.042, "step": 435 }, { "epoch": 48.41830065359477, "eval_loss": 1.9564448595046997, "eval_runtime": 2.4521, "eval_samples_per_second": 793.202, "eval_steps_per_second": 12.642, "step": 436 }, { "epoch": 48.52287581699346, "eval_loss": 1.9767297506332397, "eval_runtime": 2.4404, "eval_samples_per_second": 796.999, "eval_steps_per_second": 12.703, "step": 437 }, { "epoch": 48.627450980392155, "eval_loss": 1.9058864116668701, "eval_runtime": 2.4851, "eval_samples_per_second": 782.665, "eval_steps_per_second": 12.474, "step": 438 }, { "epoch": 48.73202614379085, "eval_loss": 1.9629017114639282, "eval_runtime": 2.4371, "eval_samples_per_second": 798.081, "eval_steps_per_second": 12.72, "step": 439 }, { "epoch": 48.83660130718954, "eval_loss": 1.937988519668579, "eval_runtime": 2.4379, "eval_samples_per_second": 797.822, "eval_steps_per_second": 12.716, "step": 440 }, { "epoch": 48.94117647058823, "eval_loss": 1.9694868326187134, "eval_runtime": 2.4378, "eval_samples_per_second": 797.855, "eval_steps_per_second": 12.716, "step": 441 }, { "epoch": 48.94117647058823, "grad_norm": 1.9079190492630005, "learning_rate": 4.2553191489361704e-07, "loss": 1.6997, "step": 441 }, { "epoch": 49.10457516339869, "eval_loss": 1.9250315427780151, "eval_runtime": 2.4353, "eval_samples_per_second": 798.679, "eval_steps_per_second": 12.73, "step": 442 }, { "epoch": 49.209150326797385, "eval_loss": 1.9391655921936035, "eval_runtime": 2.4099, "eval_samples_per_second": 807.094, "eval_steps_per_second": 12.864, "step": 443 }, { "epoch": 49.31372549019608, "eval_loss": 1.9522807598114014, "eval_runtime": 2.4344, "eval_samples_per_second": 798.963, "eval_steps_per_second": 12.734, "step": 444 }, { "epoch": 49.41830065359477, "eval_loss": 1.9459158182144165, "eval_runtime": 2.4156, "eval_samples_per_second": 805.199, "eval_steps_per_second": 12.834, "step": 445 }, { "epoch": 49.52287581699346, "eval_loss": 1.9365217685699463, "eval_runtime": 2.4071, "eval_samples_per_second": 808.028, "eval_steps_per_second": 12.879, "step": 446 }, { "epoch": 49.627450980392155, "eval_loss": 1.9133816957473755, "eval_runtime": 2.3906, "eval_samples_per_second": 813.594, "eval_steps_per_second": 12.967, "step": 447 }, { "epoch": 49.73202614379085, "eval_loss": 1.9433826208114624, "eval_runtime": 2.4005, "eval_samples_per_second": 810.237, "eval_steps_per_second": 12.914, "step": 448 }, { "epoch": 49.83660130718954, "eval_loss": 1.9771692752838135, "eval_runtime": 2.3888, "eval_samples_per_second": 814.201, "eval_steps_per_second": 12.977, "step": 449 }, { "epoch": 49.94117647058823, "eval_loss": 1.9776495695114136, "eval_runtime": 2.3848, "eval_samples_per_second": 815.59, "eval_steps_per_second": 12.999, "step": 450 }, { "epoch": 49.94117647058823, "grad_norm": 1.8996864557266235, "learning_rate": 0.0, "loss": 1.6668, "step": 450 }, { "epoch": 49.94117647058823, "step": 450, "total_flos": 1.2548402868338688e+16, "train_loss": 1.8820014402601455, "train_runtime": 2628.924, "train_samples_per_second": 185.114, "train_steps_per_second": 0.171 } ], "logging_steps": 500, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2548402868338688e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }