{ "best_metric": 0.0003887661441694945, "best_model_checkpoint": "Models/t5-base-class-gen/checkpoint-3200", "epoch": 4.662379421221865, "eval_steps": 100, "global_step": 5800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08038585209003216, "grad_norm": 1.386023998260498, "learning_rate": 3.936334405144695e-05, "loss": 0.6335, "step": 100 }, { "epoch": 0.08038585209003216, "eval_loss": 0.08590172976255417, "eval_runtime": 0.1746, "eval_samples_per_second": 171.859, "eval_steps_per_second": 22.915, "step": 100 }, { "epoch": 0.1607717041800643, "grad_norm": 0.95208740234375, "learning_rate": 3.872025723472669e-05, "loss": 0.1377, "step": 200 }, { "epoch": 0.1607717041800643, "eval_loss": 0.04498327895998955, "eval_runtime": 0.1731, "eval_samples_per_second": 173.336, "eval_steps_per_second": 23.112, "step": 200 }, { "epoch": 0.24115755627009647, "grad_norm": 1.106736660003662, "learning_rate": 3.8077170418006436e-05, "loss": 0.0849, "step": 300 }, { "epoch": 0.24115755627009647, "eval_loss": 0.03304059058427811, "eval_runtime": 0.2272, "eval_samples_per_second": 132.057, "eval_steps_per_second": 17.608, "step": 300 }, { "epoch": 0.3215434083601286, "grad_norm": 0.9386335015296936, "learning_rate": 3.743408360128617e-05, "loss": 0.059, "step": 400 }, { "epoch": 0.3215434083601286, "eval_loss": 0.016175953671336174, "eval_runtime": 0.2536, "eval_samples_per_second": 118.301, "eval_steps_per_second": 15.774, "step": 400 }, { "epoch": 0.40192926045016075, "grad_norm": 0.7505399584770203, "learning_rate": 3.679099678456592e-05, "loss": 0.0471, "step": 500 }, { "epoch": 0.40192926045016075, "eval_loss": 0.010730231180787086, "eval_runtime": 0.2933, "eval_samples_per_second": 102.3, "eval_steps_per_second": 13.64, "step": 500 }, { "epoch": 0.48231511254019294, "grad_norm": 0.05818118155002594, "learning_rate": 3.614790996784566e-05, "loss": 0.0368, "step": 600 }, { "epoch": 0.48231511254019294, "eval_loss": 0.008968004025518894, "eval_runtime": 0.1746, "eval_samples_per_second": 171.788, "eval_steps_per_second": 22.905, "step": 600 }, { "epoch": 0.5627009646302251, "grad_norm": 0.0749644786119461, "learning_rate": 3.5504823151125405e-05, "loss": 0.0232, "step": 700 }, { "epoch": 0.5627009646302251, "eval_loss": 0.007880235090851784, "eval_runtime": 0.1711, "eval_samples_per_second": 175.365, "eval_steps_per_second": 23.382, "step": 700 }, { "epoch": 0.6430868167202572, "grad_norm": 0.26040318608283997, "learning_rate": 3.486173633440515e-05, "loss": 0.0244, "step": 800 }, { "epoch": 0.6430868167202572, "eval_loss": 0.007417692337185144, "eval_runtime": 0.1745, "eval_samples_per_second": 171.912, "eval_steps_per_second": 22.922, "step": 800 }, { "epoch": 0.7234726688102894, "grad_norm": 0.19101421535015106, "learning_rate": 3.421864951768489e-05, "loss": 0.0201, "step": 900 }, { "epoch": 0.7234726688102894, "eval_loss": 0.007583172060549259, "eval_runtime": 0.1833, "eval_samples_per_second": 163.635, "eval_steps_per_second": 21.818, "step": 900 }, { "epoch": 0.8038585209003215, "grad_norm": 0.18456044793128967, "learning_rate": 3.3575562700964637e-05, "loss": 0.0206, "step": 1000 }, { "epoch": 0.8038585209003215, "eval_loss": 0.006357308477163315, "eval_runtime": 0.1846, "eval_samples_per_second": 162.547, "eval_steps_per_second": 21.673, "step": 1000 }, { "epoch": 0.8842443729903537, "grad_norm": 0.34773826599121094, "learning_rate": 3.2932475884244374e-05, "loss": 0.0228, "step": 1100 }, { "epoch": 0.8842443729903537, "eval_loss": 0.007683805655688047, "eval_runtime": 0.1718, "eval_samples_per_second": 174.574, "eval_steps_per_second": 23.277, "step": 1100 }, { "epoch": 0.9646302250803859, "grad_norm": 0.20785485208034515, "learning_rate": 3.228938906752412e-05, "loss": 0.0212, "step": 1200 }, { "epoch": 0.9646302250803859, "eval_loss": 0.006169555243104696, "eval_runtime": 0.1672, "eval_samples_per_second": 179.46, "eval_steps_per_second": 23.928, "step": 1200 }, { "epoch": 1.045016077170418, "grad_norm": 0.6145943999290466, "learning_rate": 3.164630225080386e-05, "loss": 0.0144, "step": 1300 }, { "epoch": 1.045016077170418, "eval_loss": 0.006269162520766258, "eval_runtime": 0.1787, "eval_samples_per_second": 167.852, "eval_steps_per_second": 22.38, "step": 1300 }, { "epoch": 1.1254019292604502, "grad_norm": 0.07819650322198868, "learning_rate": 3.1003215434083605e-05, "loss": 0.0108, "step": 1400 }, { "epoch": 1.1254019292604502, "eval_loss": 0.007830055430531502, "eval_runtime": 0.1706, "eval_samples_per_second": 175.836, "eval_steps_per_second": 23.445, "step": 1400 }, { "epoch": 1.2057877813504823, "grad_norm": 0.22531314194202423, "learning_rate": 3.0360128617363346e-05, "loss": 0.0127, "step": 1500 }, { "epoch": 1.2057877813504823, "eval_loss": 0.005049354862421751, "eval_runtime": 0.1729, "eval_samples_per_second": 173.534, "eval_steps_per_second": 23.138, "step": 1500 }, { "epoch": 1.2861736334405145, "grad_norm": 0.025266777724027634, "learning_rate": 2.971704180064309e-05, "loss": 0.0119, "step": 1600 }, { "epoch": 1.2861736334405145, "eval_loss": 0.005167185328900814, "eval_runtime": 0.1769, "eval_samples_per_second": 169.631, "eval_steps_per_second": 22.617, "step": 1600 }, { "epoch": 1.3665594855305465, "grad_norm": 0.08020277321338654, "learning_rate": 2.9073954983922834e-05, "loss": 0.0099, "step": 1700 }, { "epoch": 1.3665594855305465, "eval_loss": 0.002216967288404703, "eval_runtime": 0.1741, "eval_samples_per_second": 172.331, "eval_steps_per_second": 22.977, "step": 1700 }, { "epoch": 1.4469453376205788, "grad_norm": 0.07968125492334366, "learning_rate": 2.8430868167202574e-05, "loss": 0.0123, "step": 1800 }, { "epoch": 1.4469453376205788, "eval_loss": 0.0026903103571385145, "eval_runtime": 0.1673, "eval_samples_per_second": 179.365, "eval_steps_per_second": 23.915, "step": 1800 }, { "epoch": 1.527331189710611, "grad_norm": 0.20680592954158783, "learning_rate": 2.7787781350482318e-05, "loss": 0.0108, "step": 1900 }, { "epoch": 1.527331189710611, "eval_loss": 0.0027133109979331493, "eval_runtime": 0.2477, "eval_samples_per_second": 121.116, "eval_steps_per_second": 16.149, "step": 1900 }, { "epoch": 1.607717041800643, "grad_norm": 0.7855786085128784, "learning_rate": 2.714469453376206e-05, "loss": 0.014, "step": 2000 }, { "epoch": 1.607717041800643, "eval_loss": 0.0028839909937232733, "eval_runtime": 0.1692, "eval_samples_per_second": 177.293, "eval_steps_per_second": 23.639, "step": 2000 }, { "epoch": 1.6881028938906752, "grad_norm": 0.12237449735403061, "learning_rate": 2.6501607717041802e-05, "loss": 0.0095, "step": 2100 }, { "epoch": 1.6881028938906752, "eval_loss": 0.0036916760727763176, "eval_runtime": 0.2775, "eval_samples_per_second": 108.096, "eval_steps_per_second": 14.413, "step": 2100 }, { "epoch": 1.7684887459807075, "grad_norm": 0.37831100821495056, "learning_rate": 2.5858520900321543e-05, "loss": 0.011, "step": 2200 }, { "epoch": 1.7684887459807075, "eval_loss": 0.0021040281280875206, "eval_runtime": 0.1705, "eval_samples_per_second": 175.999, "eval_steps_per_second": 23.467, "step": 2200 }, { "epoch": 1.8488745980707395, "grad_norm": 0.049582913517951965, "learning_rate": 2.521543408360129e-05, "loss": 0.0102, "step": 2300 }, { "epoch": 1.8488745980707395, "eval_loss": 0.0018795446958392859, "eval_runtime": 0.1653, "eval_samples_per_second": 181.47, "eval_steps_per_second": 24.196, "step": 2300 }, { "epoch": 1.9292604501607717, "grad_norm": 0.06450924277305603, "learning_rate": 2.457234726688103e-05, "loss": 0.0101, "step": 2400 }, { "epoch": 1.9292604501607717, "eval_loss": 0.001975016202777624, "eval_runtime": 0.1716, "eval_samples_per_second": 174.862, "eval_steps_per_second": 23.315, "step": 2400 }, { "epoch": 2.009646302250804, "grad_norm": 0.06440392136573792, "learning_rate": 2.3929260450160775e-05, "loss": 0.01, "step": 2500 }, { "epoch": 2.009646302250804, "eval_loss": 0.0010833271080628037, "eval_runtime": 0.1729, "eval_samples_per_second": 173.502, "eval_steps_per_second": 23.134, "step": 2500 }, { "epoch": 2.090032154340836, "grad_norm": 0.06326356530189514, "learning_rate": 2.3286173633440515e-05, "loss": 0.0077, "step": 2600 }, { "epoch": 2.090032154340836, "eval_loss": 0.001661359565332532, "eval_runtime": 0.1798, "eval_samples_per_second": 166.856, "eval_steps_per_second": 22.248, "step": 2600 }, { "epoch": 2.170418006430868, "grad_norm": 0.19675737619400024, "learning_rate": 2.264308681672026e-05, "loss": 0.0072, "step": 2700 }, { "epoch": 2.170418006430868, "eval_loss": 0.0009880892466753721, "eval_runtime": 0.2313, "eval_samples_per_second": 129.68, "eval_steps_per_second": 17.291, "step": 2700 }, { "epoch": 2.2508038585209005, "grad_norm": 0.5421108603477478, "learning_rate": 2.2000000000000003e-05, "loss": 0.0074, "step": 2800 }, { "epoch": 2.2508038585209005, "eval_loss": 0.002171145286411047, "eval_runtime": 0.1872, "eval_samples_per_second": 160.223, "eval_steps_per_second": 21.363, "step": 2800 }, { "epoch": 2.3311897106109325, "grad_norm": 0.5133712291717529, "learning_rate": 2.1356913183279743e-05, "loss": 0.0086, "step": 2900 }, { "epoch": 2.3311897106109325, "eval_loss": 0.001456312253139913, "eval_runtime": 0.1745, "eval_samples_per_second": 171.895, "eval_steps_per_second": 22.919, "step": 2900 }, { "epoch": 2.4115755627009645, "grad_norm": 0.4056571125984192, "learning_rate": 2.0713826366559487e-05, "loss": 0.0065, "step": 3000 }, { "epoch": 2.4115755627009645, "eval_loss": 0.0018532021204009652, "eval_runtime": 0.1763, "eval_samples_per_second": 170.132, "eval_steps_per_second": 22.684, "step": 3000 }, { "epoch": 2.491961414790997, "grad_norm": 0.29752230644226074, "learning_rate": 2.0070739549839228e-05, "loss": 0.0072, "step": 3100 }, { "epoch": 2.491961414790997, "eval_loss": 0.001285334350541234, "eval_runtime": 0.1764, "eval_samples_per_second": 170.115, "eval_steps_per_second": 22.682, "step": 3100 }, { "epoch": 2.572347266881029, "grad_norm": 0.2506803572177887, "learning_rate": 1.9427652733118975e-05, "loss": 0.0078, "step": 3200 }, { "epoch": 2.572347266881029, "eval_loss": 0.0003887661441694945, "eval_runtime": 0.2322, "eval_samples_per_second": 129.197, "eval_steps_per_second": 17.226, "step": 3200 }, { "epoch": 2.652733118971061, "grad_norm": 0.8498000502586365, "learning_rate": 1.8784565916398715e-05, "loss": 0.0073, "step": 3300 }, { "epoch": 2.652733118971061, "eval_loss": 0.0005741061177104712, "eval_runtime": 0.1749, "eval_samples_per_second": 171.507, "eval_steps_per_second": 22.868, "step": 3300 }, { "epoch": 2.733118971061093, "grad_norm": 0.056631457060575485, "learning_rate": 1.814147909967846e-05, "loss": 0.0066, "step": 3400 }, { "epoch": 2.733118971061093, "eval_loss": 0.000957026903051883, "eval_runtime": 0.1744, "eval_samples_per_second": 171.987, "eval_steps_per_second": 22.932, "step": 3400 }, { "epoch": 2.8135048231511255, "grad_norm": 0.017297716811299324, "learning_rate": 1.74983922829582e-05, "loss": 0.0072, "step": 3500 }, { "epoch": 2.8135048231511255, "eval_loss": 0.0007342658936977386, "eval_runtime": 0.1696, "eval_samples_per_second": 176.903, "eval_steps_per_second": 23.587, "step": 3500 }, { "epoch": 2.8938906752411575, "grad_norm": 0.6115002036094666, "learning_rate": 1.6855305466237944e-05, "loss": 0.008, "step": 3600 }, { "epoch": 2.8938906752411575, "eval_loss": 0.0007112031453289092, "eval_runtime": 0.1781, "eval_samples_per_second": 168.482, "eval_steps_per_second": 22.464, "step": 3600 }, { "epoch": 2.97427652733119, "grad_norm": 0.158920019865036, "learning_rate": 1.6212218649517684e-05, "loss": 0.0076, "step": 3700 }, { "epoch": 2.97427652733119, "eval_loss": 0.0015783592825755477, "eval_runtime": 0.208, "eval_samples_per_second": 144.199, "eval_steps_per_second": 19.226, "step": 3700 }, { "epoch": 3.054662379421222, "grad_norm": 0.2712903916835785, "learning_rate": 1.5569131832797428e-05, "loss": 0.0073, "step": 3800 }, { "epoch": 3.054662379421222, "eval_loss": 0.0012706245761364698, "eval_runtime": 0.1692, "eval_samples_per_second": 177.289, "eval_steps_per_second": 23.639, "step": 3800 }, { "epoch": 3.135048231511254, "grad_norm": 0.4465363323688507, "learning_rate": 1.492604501607717e-05, "loss": 0.0057, "step": 3900 }, { "epoch": 3.135048231511254, "eval_loss": 0.0017842828528955579, "eval_runtime": 0.1697, "eval_samples_per_second": 176.806, "eval_steps_per_second": 23.574, "step": 3900 }, { "epoch": 3.215434083601286, "grad_norm": 0.25834837555885315, "learning_rate": 1.4282958199356913e-05, "loss": 0.0054, "step": 4000 }, { "epoch": 3.215434083601286, "eval_loss": 0.001442342414520681, "eval_runtime": 0.1703, "eval_samples_per_second": 176.201, "eval_steps_per_second": 23.494, "step": 4000 }, { "epoch": 3.2958199356913185, "grad_norm": 0.1956845223903656, "learning_rate": 1.3639871382636658e-05, "loss": 0.0054, "step": 4100 }, { "epoch": 3.2958199356913185, "eval_loss": 0.001215717988088727, "eval_runtime": 0.182, "eval_samples_per_second": 164.818, "eval_steps_per_second": 21.976, "step": 4100 }, { "epoch": 3.3762057877813505, "grad_norm": 0.06578990817070007, "learning_rate": 1.29967845659164e-05, "loss": 0.0062, "step": 4200 }, { "epoch": 3.3762057877813505, "eval_loss": 0.002128337509930134, "eval_runtime": 0.1734, "eval_samples_per_second": 173.009, "eval_steps_per_second": 23.068, "step": 4200 }, { "epoch": 3.4565916398713825, "grad_norm": 0.017272261902689934, "learning_rate": 1.2360128617363345e-05, "loss": 0.0048, "step": 4300 }, { "epoch": 3.4565916398713825, "eval_loss": 0.0026242188178002834, "eval_runtime": 0.218, "eval_samples_per_second": 137.619, "eval_steps_per_second": 18.349, "step": 4300 }, { "epoch": 3.536977491961415, "grad_norm": 0.008806917816400528, "learning_rate": 1.1717041800643088e-05, "loss": 0.0062, "step": 4400 }, { "epoch": 3.536977491961415, "eval_loss": 0.002198620932176709, "eval_runtime": 0.2168, "eval_samples_per_second": 138.401, "eval_steps_per_second": 18.453, "step": 4400 }, { "epoch": 3.617363344051447, "grad_norm": 0.013797425664961338, "learning_rate": 1.107395498392283e-05, "loss": 0.0059, "step": 4500 }, { "epoch": 3.617363344051447, "eval_loss": 0.002305293455719948, "eval_runtime": 0.1711, "eval_samples_per_second": 175.368, "eval_steps_per_second": 23.382, "step": 4500 }, { "epoch": 3.697749196141479, "grad_norm": 0.07389205694198608, "learning_rate": 1.0430868167202572e-05, "loss": 0.0044, "step": 4600 }, { "epoch": 3.697749196141479, "eval_loss": 0.002617767546325922, "eval_runtime": 0.2272, "eval_samples_per_second": 132.034, "eval_steps_per_second": 17.605, "step": 4600 }, { "epoch": 3.778135048231511, "grad_norm": 0.41877493262290955, "learning_rate": 9.787781350482316e-06, "loss": 0.0049, "step": 4700 }, { "epoch": 3.778135048231511, "eval_loss": 0.0019422216573730111, "eval_runtime": 0.2456, "eval_samples_per_second": 122.127, "eval_steps_per_second": 16.284, "step": 4700 }, { "epoch": 3.8585209003215435, "grad_norm": 0.2227245718240738, "learning_rate": 9.144694533762058e-06, "loss": 0.0062, "step": 4800 }, { "epoch": 3.8585209003215435, "eval_loss": 0.001634993706829846, "eval_runtime": 0.1871, "eval_samples_per_second": 160.368, "eval_steps_per_second": 21.382, "step": 4800 }, { "epoch": 3.9389067524115755, "grad_norm": 0.24094010889530182, "learning_rate": 8.5016077170418e-06, "loss": 0.0055, "step": 4900 }, { "epoch": 3.9389067524115755, "eval_loss": 0.0020372075960040092, "eval_runtime": 0.1737, "eval_samples_per_second": 172.716, "eval_steps_per_second": 23.029, "step": 4900 }, { "epoch": 4.019292604501608, "grad_norm": 0.0924796536564827, "learning_rate": 7.858520900321544e-06, "loss": 0.0051, "step": 5000 }, { "epoch": 4.019292604501608, "eval_loss": 0.0011331220157444477, "eval_runtime": 0.1743, "eval_samples_per_second": 172.114, "eval_steps_per_second": 22.949, "step": 5000 }, { "epoch": 4.09967845659164, "grad_norm": 0.045692551881074905, "learning_rate": 7.215434083601287e-06, "loss": 0.004, "step": 5100 }, { "epoch": 4.09967845659164, "eval_loss": 0.0016930572455748916, "eval_runtime": 0.1768, "eval_samples_per_second": 169.716, "eval_steps_per_second": 22.629, "step": 5100 }, { "epoch": 4.180064308681672, "grad_norm": 0.012550954706966877, "learning_rate": 6.572347266881029e-06, "loss": 0.0049, "step": 5200 }, { "epoch": 4.180064308681672, "eval_loss": 0.0016865974757820368, "eval_runtime": 0.1688, "eval_samples_per_second": 177.694, "eval_steps_per_second": 23.693, "step": 5200 }, { "epoch": 4.260450160771704, "grad_norm": 0.11495041847229004, "learning_rate": 5.929260450160772e-06, "loss": 0.0034, "step": 5300 }, { "epoch": 4.260450160771704, "eval_loss": 0.0018789003370329738, "eval_runtime": 0.177, "eval_samples_per_second": 169.461, "eval_steps_per_second": 22.595, "step": 5300 }, { "epoch": 4.340836012861736, "grad_norm": 0.31747967004776, "learning_rate": 5.286173633440515e-06, "loss": 0.0039, "step": 5400 }, { "epoch": 4.340836012861736, "eval_loss": 0.0019645672291517258, "eval_runtime": 0.1686, "eval_samples_per_second": 177.95, "eval_steps_per_second": 23.727, "step": 5400 }, { "epoch": 4.421221864951768, "grad_norm": 0.03923821821808815, "learning_rate": 4.643086816720258e-06, "loss": 0.0037, "step": 5500 }, { "epoch": 4.421221864951768, "eval_loss": 0.001890690764412284, "eval_runtime": 0.1768, "eval_samples_per_second": 169.698, "eval_steps_per_second": 22.626, "step": 5500 }, { "epoch": 4.501607717041801, "grad_norm": 0.0037739709950983524, "learning_rate": 4.000000000000001e-06, "loss": 0.0046, "step": 5600 }, { "epoch": 4.501607717041801, "eval_loss": 0.00146665854845196, "eval_runtime": 0.1675, "eval_samples_per_second": 179.094, "eval_steps_per_second": 23.879, "step": 5600 }, { "epoch": 4.581993569131833, "grad_norm": 0.2434912621974945, "learning_rate": 3.356913183279743e-06, "loss": 0.0056, "step": 5700 }, { "epoch": 4.581993569131833, "eval_loss": 0.0013827175134792924, "eval_runtime": 0.1747, "eval_samples_per_second": 171.749, "eval_steps_per_second": 22.9, "step": 5700 }, { "epoch": 4.662379421221865, "grad_norm": 0.26639288663864136, "learning_rate": 2.7138263665594855e-06, "loss": 0.0035, "step": 5800 }, { "epoch": 4.662379421221865, "eval_loss": 0.0014188647037371993, "eval_runtime": 0.2919, "eval_samples_per_second": 102.775, "eval_steps_per_second": 13.703, "step": 5800 } ], "logging_steps": 100, "max_steps": 6220, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2292205513512960.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }