{ "best_global_step": 115, "best_metric": 0.09458151459693909, "best_model_checkpoint": "/content/drive/MyDrive/lora_model/outputs/task15_microsoft/Phi-4-mini-instruct/checkpoint-115", "epoch": 6.052631578947368, "eval_steps": 1, "global_step": 115, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05263157894736842, "grad_norm": 9.795289039611816, "learning_rate": 0.0, "loss": 3.2204, "step": 1 }, { "epoch": 0.05263157894736842, "eval_loss": 3.1565215587615967, "eval_runtime": 0.9831, "eval_samples_per_second": 30.517, "eval_steps_per_second": 4.069, "step": 1 }, { "epoch": 0.10526315789473684, "grad_norm": 10.048436164855957, "learning_rate": 3.3333333333333335e-05, "loss": 3.1604, "step": 2 }, { "epoch": 0.10526315789473684, "eval_loss": 2.4775681495666504, "eval_runtime": 0.8971, "eval_samples_per_second": 33.44, "eval_steps_per_second": 4.459, "step": 2 }, { "epoch": 0.15789473684210525, "grad_norm": 5.148971080780029, "learning_rate": 6.666666666666667e-05, "loss": 2.3511, "step": 3 }, { "epoch": 0.15789473684210525, "eval_loss": 2.0030856132507324, "eval_runtime": 0.8926, "eval_samples_per_second": 33.611, "eval_steps_per_second": 4.481, "step": 3 }, { "epoch": 0.21052631578947367, "grad_norm": 4.8437819480896, "learning_rate": 0.0001, "loss": 2.0198, "step": 4 }, { "epoch": 0.21052631578947367, "eval_loss": 1.6053706407546997, "eval_runtime": 0.8924, "eval_samples_per_second": 33.618, "eval_steps_per_second": 4.482, "step": 4 }, { "epoch": 0.2631578947368421, "grad_norm": 4.386927604675293, "learning_rate": 0.00013333333333333334, "loss": 1.6969, "step": 5 }, { "epoch": 0.2631578947368421, "eval_loss": 1.4053733348846436, "eval_runtime": 0.8951, "eval_samples_per_second": 33.517, "eval_steps_per_second": 4.469, "step": 5 }, { "epoch": 0.3157894736842105, "grad_norm": 3.955519676208496, "learning_rate": 0.00016666666666666666, "loss": 1.4825, "step": 6 }, { "epoch": 0.3157894736842105, "eval_loss": 1.3105080127716064, "eval_runtime": 0.893, "eval_samples_per_second": 33.593, "eval_steps_per_second": 4.479, "step": 6 }, { "epoch": 0.3684210526315789, "grad_norm": 3.6086604595184326, "learning_rate": 0.0002, "loss": 1.3404, "step": 7 }, { "epoch": 0.3684210526315789, "eval_loss": 1.2445138692855835, "eval_runtime": 0.8942, "eval_samples_per_second": 33.549, "eval_steps_per_second": 4.473, "step": 7 }, { "epoch": 0.42105263157894735, "grad_norm": 2.691216230392456, "learning_rate": 0.00023333333333333333, "loss": 1.2627, "step": 8 }, { "epoch": 0.42105263157894735, "eval_loss": 1.1471664905548096, "eval_runtime": 0.8927, "eval_samples_per_second": 33.606, "eval_steps_per_second": 4.481, "step": 8 }, { "epoch": 0.47368421052631576, "grad_norm": 2.5174126625061035, "learning_rate": 0.0002666666666666667, "loss": 1.2037, "step": 9 }, { "epoch": 0.47368421052631576, "eval_loss": 1.1372406482696533, "eval_runtime": 0.8947, "eval_samples_per_second": 33.529, "eval_steps_per_second": 4.471, "step": 9 }, { "epoch": 0.5263157894736842, "grad_norm": 2.893831253051758, "learning_rate": 0.0003, "loss": 1.1793, "step": 10 }, { "epoch": 0.5263157894736842, "eval_loss": 1.0686627626419067, "eval_runtime": 0.8921, "eval_samples_per_second": 33.628, "eval_steps_per_second": 4.484, "step": 10 }, { "epoch": 0.5789473684210527, "grad_norm": 2.5055713653564453, "learning_rate": 0.0003333333333333333, "loss": 1.201, "step": 11 }, { "epoch": 0.5789473684210527, "eval_loss": 1.0994912385940552, "eval_runtime": 0.8951, "eval_samples_per_second": 33.517, "eval_steps_per_second": 4.469, "step": 11 }, { "epoch": 0.631578947368421, "grad_norm": 2.297982931137085, "learning_rate": 0.00036666666666666667, "loss": 1.177, "step": 12 }, { "epoch": 0.631578947368421, "eval_loss": 1.0981471538543701, "eval_runtime": 0.8926, "eval_samples_per_second": 33.608, "eval_steps_per_second": 4.481, "step": 12 }, { "epoch": 0.6842105263157895, "grad_norm": 2.8536081314086914, "learning_rate": 0.0004, "loss": 1.2106, "step": 13 }, { "epoch": 0.6842105263157895, "eval_loss": 1.0119823217391968, "eval_runtime": 0.8936, "eval_samples_per_second": 33.574, "eval_steps_per_second": 4.477, "step": 13 }, { "epoch": 0.7368421052631579, "grad_norm": 1.8637670278549194, "learning_rate": 0.00043333333333333337, "loss": 1.0688, "step": 14 }, { "epoch": 0.7368421052631579, "eval_loss": 1.0545283555984497, "eval_runtime": 0.893, "eval_samples_per_second": 33.596, "eval_steps_per_second": 4.479, "step": 14 }, { "epoch": 0.7894736842105263, "grad_norm": 2.6284332275390625, "learning_rate": 0.00046666666666666666, "loss": 1.1661, "step": 15 }, { "epoch": 0.7894736842105263, "eval_loss": 1.031855821609497, "eval_runtime": 0.8928, "eval_samples_per_second": 33.603, "eval_steps_per_second": 4.48, "step": 15 }, { "epoch": 0.8421052631578947, "grad_norm": 1.9439812898635864, "learning_rate": 0.0005, "loss": 1.1859, "step": 16 }, { "epoch": 0.8421052631578947, "eval_loss": 0.9958587884902954, "eval_runtime": 0.8982, "eval_samples_per_second": 33.401, "eval_steps_per_second": 4.453, "step": 16 }, { "epoch": 0.8947368421052632, "grad_norm": 1.7199311256408691, "learning_rate": 0.0004999776608025946, "loss": 1.1636, "step": 17 }, { "epoch": 0.8947368421052632, "eval_loss": 0.9928242564201355, "eval_runtime": 0.8982, "eval_samples_per_second": 33.398, "eval_steps_per_second": 4.453, "step": 17 }, { "epoch": 0.9473684210526315, "grad_norm": 5.788880825042725, "learning_rate": 0.000499910647202696, "loss": 1.0348, "step": 18 }, { "epoch": 0.9473684210526315, "eval_loss": 1.0362129211425781, "eval_runtime": 0.8954, "eval_samples_per_second": 33.504, "eval_steps_per_second": 4.467, "step": 18 }, { "epoch": 1.0, "grad_norm": 2.156782388687134, "learning_rate": 0.0004997989711765446, "loss": 1.201, "step": 19 }, { "epoch": 1.0, "eval_loss": 0.9807829260826111, "eval_runtime": 0.8926, "eval_samples_per_second": 33.611, "eval_steps_per_second": 4.481, "step": 19 }, { "epoch": 1.0526315789473684, "grad_norm": 1.9558554887771606, "learning_rate": 0.0004996426526821629, "loss": 0.8535, "step": 20 }, { "epoch": 1.0526315789473684, "eval_loss": 0.9379722476005554, "eval_runtime": 0.8935, "eval_samples_per_second": 33.576, "eval_steps_per_second": 4.477, "step": 20 }, { "epoch": 1.1052631578947367, "grad_norm": 1.772550106048584, "learning_rate": 0.0004994417196557883, "loss": 0.968, "step": 21 }, { "epoch": 1.1052631578947367, "eval_loss": 0.9845291376113892, "eval_runtime": 0.8928, "eval_samples_per_second": 33.603, "eval_steps_per_second": 4.48, "step": 21 }, { "epoch": 1.1578947368421053, "grad_norm": 2.108396291732788, "learning_rate": 0.0004991962080068813, "loss": 1.0552, "step": 22 }, { "epoch": 1.1578947368421053, "eval_loss": 0.9239175319671631, "eval_runtime": 0.893, "eval_samples_per_second": 33.594, "eval_steps_per_second": 4.479, "step": 22 }, { "epoch": 1.2105263157894737, "grad_norm": 1.8215439319610596, "learning_rate": 0.0004989061616117073, "loss": 0.9825, "step": 23 }, { "epoch": 1.2105263157894737, "eval_loss": 0.980516791343689, "eval_runtime": 0.8952, "eval_samples_per_second": 33.513, "eval_steps_per_second": 4.468, "step": 23 }, { "epoch": 1.263157894736842, "grad_norm": 20.904949188232422, "learning_rate": 0.0004985716323054959, "loss": 0.9563, "step": 24 }, { "epoch": 1.263157894736842, "eval_loss": 1.112138271331787, "eval_runtime": 0.8954, "eval_samples_per_second": 33.505, "eval_steps_per_second": 4.467, "step": 24 }, { "epoch": 1.3157894736842106, "grad_norm": 2.785473585128784, "learning_rate": 0.0004981926798731766, "loss": 1.048, "step": 25 }, { "epoch": 1.3157894736842106, "eval_loss": 0.9919915795326233, "eval_runtime": 0.8932, "eval_samples_per_second": 33.586, "eval_steps_per_second": 4.478, "step": 25 }, { "epoch": 1.368421052631579, "grad_norm": 1.8656138181686401, "learning_rate": 0.000497769372038695, "loss": 1.0315, "step": 26 }, { "epoch": 1.368421052631579, "eval_loss": 0.9384483098983765, "eval_runtime": 0.8931, "eval_samples_per_second": 33.589, "eval_steps_per_second": 4.479, "step": 26 }, { "epoch": 1.4210526315789473, "grad_norm": 1.697496771812439, "learning_rate": 0.0004973017844529094, "loss": 1.0063, "step": 27 }, { "epoch": 1.4210526315789473, "eval_loss": 0.904453694820404, "eval_runtime": 0.8918, "eval_samples_per_second": 33.64, "eval_steps_per_second": 4.485, "step": 27 }, { "epoch": 1.4736842105263157, "grad_norm": 1.7305934429168701, "learning_rate": 0.0004967900006800708, "loss": 0.8483, "step": 28 }, { "epoch": 1.4736842105263157, "eval_loss": 0.876754879951477, "eval_runtime": 0.8933, "eval_samples_per_second": 33.584, "eval_steps_per_second": 4.478, "step": 28 }, { "epoch": 1.526315789473684, "grad_norm": 1.7766728401184082, "learning_rate": 0.000496234112182889, "loss": 1.0118, "step": 29 }, { "epoch": 1.526315789473684, "eval_loss": 0.9041274785995483, "eval_runtime": 0.8949, "eval_samples_per_second": 33.524, "eval_steps_per_second": 4.47, "step": 29 }, { "epoch": 1.5789473684210527, "grad_norm": 1.9015165567398071, "learning_rate": 0.000495634218306187, "loss": 0.8917, "step": 30 }, { "epoch": 1.5789473684210527, "eval_loss": 0.8897702693939209, "eval_runtime": 0.8926, "eval_samples_per_second": 33.611, "eval_steps_per_second": 4.481, "step": 30 }, { "epoch": 1.631578947368421, "grad_norm": 1.4804080724716187, "learning_rate": 0.0004949904262591467, "loss": 1.0084, "step": 31 }, { "epoch": 1.631578947368421, "eval_loss": 0.885962188243866, "eval_runtime": 0.8988, "eval_samples_per_second": 33.378, "eval_steps_per_second": 4.45, "step": 31 }, { "epoch": 1.6842105263157894, "grad_norm": 1.819899320602417, "learning_rate": 0.0004943028510961491, "loss": 0.969, "step": 32 }, { "epoch": 1.6842105263157894, "eval_loss": 0.8608292937278748, "eval_runtime": 0.8958, "eval_samples_per_second": 33.489, "eval_steps_per_second": 4.465, "step": 32 }, { "epoch": 1.736842105263158, "grad_norm": 2.8180196285247803, "learning_rate": 0.0004935716156962127, "loss": 1.1318, "step": 33 }, { "epoch": 1.736842105263158, "eval_loss": 0.875141978263855, "eval_runtime": 0.8971, "eval_samples_per_second": 33.441, "eval_steps_per_second": 4.459, "step": 33 }, { "epoch": 1.7894736842105263, "grad_norm": 1.8047230243682861, "learning_rate": 0.000492796850741033, "loss": 1.0002, "step": 34 }, { "epoch": 1.7894736842105263, "eval_loss": 0.89467453956604, "eval_runtime": 0.8966, "eval_samples_per_second": 33.46, "eval_steps_per_second": 4.461, "step": 34 }, { "epoch": 1.8421052631578947, "grad_norm": 2.6305246353149414, "learning_rate": 0.0004919786946916281, "loss": 1.1024, "step": 35 }, { "epoch": 1.8421052631578947, "eval_loss": 0.8359136581420898, "eval_runtime": 0.8971, "eval_samples_per_second": 33.44, "eval_steps_per_second": 4.459, "step": 35 }, { "epoch": 1.8947368421052633, "grad_norm": 2.4130873680114746, "learning_rate": 0.0004911172937635942, "loss": 0.9314, "step": 36 }, { "epoch": 1.8947368421052633, "eval_loss": 0.8058050274848938, "eval_runtime": 0.8959, "eval_samples_per_second": 33.487, "eval_steps_per_second": 4.465, "step": 36 }, { "epoch": 1.9473684210526314, "grad_norm": 1.580320119857788, "learning_rate": 0.0004902128019009741, "loss": 1.0036, "step": 37 }, { "epoch": 1.9473684210526314, "eval_loss": 0.7546663880348206, "eval_runtime": 0.8967, "eval_samples_per_second": 33.457, "eval_steps_per_second": 4.461, "step": 37 }, { "epoch": 2.0, "grad_norm": 1.6066155433654785, "learning_rate": 0.000489265380748746, "loss": 1.094, "step": 38 }, { "epoch": 2.0, "eval_loss": 0.8417730331420898, "eval_runtime": 0.895, "eval_samples_per_second": 33.519, "eval_steps_per_second": 4.469, "step": 38 }, { "epoch": 2.0526315789473686, "grad_norm": 2.4847571849823, "learning_rate": 0.0004882751996239352, "loss": 0.9106, "step": 39 }, { "epoch": 2.0526315789473686, "eval_loss": 0.805930495262146, "eval_runtime": 0.8985, "eval_samples_per_second": 33.388, "eval_steps_per_second": 4.452, "step": 39 }, { "epoch": 2.1052631578947367, "grad_norm": 2.144543409347534, "learning_rate": 0.0004872424354853545, "loss": 0.8542, "step": 40 }, { "epoch": 2.1052631578947367, "eval_loss": 0.7550076842308044, "eval_runtime": 0.8977, "eval_samples_per_second": 33.42, "eval_steps_per_second": 4.456, "step": 40 }, { "epoch": 2.1578947368421053, "grad_norm": 1.2767819166183472, "learning_rate": 0.0004861672729019797, "loss": 0.7569, "step": 41 }, { "epoch": 2.1578947368421053, "eval_loss": 0.720465362071991, "eval_runtime": 0.9013, "eval_samples_per_second": 33.285, "eval_steps_per_second": 4.438, "step": 41 }, { "epoch": 2.2105263157894735, "grad_norm": 1.4606373310089111, "learning_rate": 0.0004850499040199643, "loss": 0.6198, "step": 42 }, { "epoch": 2.2105263157894735, "eval_loss": 0.7800072431564331, "eval_runtime": 0.8938, "eval_samples_per_second": 33.564, "eval_steps_per_second": 4.475, "step": 42 }, { "epoch": 2.263157894736842, "grad_norm": 4.208314895629883, "learning_rate": 0.0004838905285283005, "loss": 0.8454, "step": 43 }, { "epoch": 2.263157894736842, "eval_loss": 0.7882384657859802, "eval_runtime": 0.8955, "eval_samples_per_second": 33.502, "eval_steps_per_second": 4.467, "step": 43 }, { "epoch": 2.3157894736842106, "grad_norm": 2.8906519412994385, "learning_rate": 0.00048268935362313215, "loss": 0.8786, "step": 44 }, { "epoch": 2.3157894736842106, "eval_loss": 0.7504675388336182, "eval_runtime": 0.8973, "eval_samples_per_second": 33.435, "eval_steps_per_second": 4.458, "step": 44 }, { "epoch": 2.3684210526315788, "grad_norm": 2.5608749389648438, "learning_rate": 0.00048144659397072586, "loss": 0.7165, "step": 45 }, { "epoch": 2.3684210526315788, "eval_loss": 0.7160356640815735, "eval_runtime": 0.8985, "eval_samples_per_second": 33.389, "eval_steps_per_second": 4.452, "step": 45 }, { "epoch": 2.4210526315789473, "grad_norm": 2.237501621246338, "learning_rate": 0.0004801624716691072, "loss": 0.9232, "step": 46 }, { "epoch": 2.4210526315789473, "eval_loss": 0.7007637619972229, "eval_runtime": 0.8986, "eval_samples_per_second": 33.387, "eval_steps_per_second": 4.452, "step": 46 }, { "epoch": 2.473684210526316, "grad_norm": 2.166039228439331, "learning_rate": 0.00047883721620836894, "loss": 0.782, "step": 47 }, { "epoch": 2.473684210526316, "eval_loss": 0.6951841711997986, "eval_runtime": 0.9007, "eval_samples_per_second": 33.308, "eval_steps_per_second": 4.441, "step": 47 }, { "epoch": 2.526315789473684, "grad_norm": 1.6499485969543457, "learning_rate": 0.0004774710644296578, "loss": 0.7387, "step": 48 }, { "epoch": 2.526315789473684, "eval_loss": 0.7041357755661011, "eval_runtime": 0.8999, "eval_samples_per_second": 33.337, "eval_steps_per_second": 4.445, "step": 48 }, { "epoch": 2.5789473684210527, "grad_norm": 2.833061456680298, "learning_rate": 0.00047606426048284813, "loss": 0.8343, "step": 49 }, { "epoch": 2.5789473684210527, "eval_loss": 0.6822550296783447, "eval_runtime": 0.9005, "eval_samples_per_second": 33.316, "eval_steps_per_second": 4.442, "step": 49 }, { "epoch": 2.6315789473684212, "grad_norm": 2.0135650634765625, "learning_rate": 0.00047461705578290833, "loss": 0.7768, "step": 50 }, { "epoch": 2.6315789473684212, "eval_loss": 0.6283606886863708, "eval_runtime": 0.8974, "eval_samples_per_second": 33.428, "eval_steps_per_second": 4.457, "step": 50 }, { "epoch": 2.6842105263157894, "grad_norm": 1.5658601522445679, "learning_rate": 0.0004731297089649703, "loss": 0.7418, "step": 51 }, { "epoch": 2.6842105263157894, "eval_loss": 0.6374291181564331, "eval_runtime": 0.8918, "eval_samples_per_second": 33.641, "eval_steps_per_second": 4.485, "step": 51 }, { "epoch": 2.736842105263158, "grad_norm": 1.7403415441513062, "learning_rate": 0.0004716024858381075, "loss": 0.7866, "step": 52 }, { "epoch": 2.736842105263158, "eval_loss": 0.6586597561836243, "eval_runtime": 0.8957, "eval_samples_per_second": 33.495, "eval_steps_per_second": 4.466, "step": 52 }, { "epoch": 2.7894736842105265, "grad_norm": 1.519404411315918, "learning_rate": 0.00047003565933783123, "loss": 0.8354, "step": 53 }, { "epoch": 2.7894736842105265, "eval_loss": 0.691727340221405, "eval_runtime": 0.8923, "eval_samples_per_second": 33.62, "eval_steps_per_second": 4.483, "step": 53 }, { "epoch": 2.8421052631578947, "grad_norm": 1.5139788389205933, "learning_rate": 0.0004684295094773134, "loss": 0.7804, "step": 54 }, { "epoch": 2.8421052631578947, "eval_loss": 0.6508743762969971, "eval_runtime": 0.8929, "eval_samples_per_second": 33.598, "eval_steps_per_second": 4.48, "step": 54 }, { "epoch": 2.8947368421052633, "grad_norm": 1.5480479001998901, "learning_rate": 0.00046678432329734434, "loss": 0.7253, "step": 55 }, { "epoch": 2.8947368421052633, "eval_loss": 0.6439611911773682, "eval_runtime": 0.894, "eval_samples_per_second": 33.557, "eval_steps_per_second": 4.474, "step": 55 }, { "epoch": 2.9473684210526314, "grad_norm": 1.5994068384170532, "learning_rate": 0.00046510039481503486, "loss": 0.842, "step": 56 }, { "epoch": 2.9473684210526314, "eval_loss": 0.6327024698257446, "eval_runtime": 0.9041, "eval_samples_per_second": 33.184, "eval_steps_per_second": 4.424, "step": 56 }, { "epoch": 3.0, "grad_norm": 1.6054733991622925, "learning_rate": 0.00046337802497127117, "loss": 0.8073, "step": 57 }, { "epoch": 3.0, "eval_loss": 0.6213096976280212, "eval_runtime": 0.8992, "eval_samples_per_second": 33.362, "eval_steps_per_second": 4.448, "step": 57 }, { "epoch": 3.0526315789473686, "grad_norm": 2.5787405967712402, "learning_rate": 0.00046161752157693284, "loss": 0.6017, "step": 58 }, { "epoch": 3.0526315789473686, "eval_loss": 0.5892248749732971, "eval_runtime": 0.8922, "eval_samples_per_second": 33.624, "eval_steps_per_second": 4.483, "step": 58 }, { "epoch": 3.1052631578947367, "grad_norm": 1.7601501941680908, "learning_rate": 0.0004598191992578828, "loss": 0.6071, "step": 59 }, { "epoch": 3.1052631578947367, "eval_loss": 0.5735067129135132, "eval_runtime": 0.8924, "eval_samples_per_second": 33.618, "eval_steps_per_second": 4.482, "step": 59 }, { "epoch": 3.1578947368421053, "grad_norm": 1.7480543851852417, "learning_rate": 0.00045798337939873923, "loss": 0.6597, "step": 60 }, { "epoch": 3.1578947368421053, "eval_loss": 0.5306870341300964, "eval_runtime": 0.8938, "eval_samples_per_second": 33.566, "eval_steps_per_second": 4.475, "step": 60 }, { "epoch": 3.2105263157894735, "grad_norm": 2.3808937072753906, "learning_rate": 0.0004561103900854401, "loss": 0.5372, "step": 61 }, { "epoch": 3.2105263157894735, "eval_loss": 0.535223662853241, "eval_runtime": 0.8966, "eval_samples_per_second": 33.459, "eval_steps_per_second": 4.461, "step": 61 }, { "epoch": 3.263157894736842, "grad_norm": 1.8272178173065186, "learning_rate": 0.0004542005660466094, "loss": 0.5399, "step": 62 }, { "epoch": 3.263157894736842, "eval_loss": 0.5316082239151001, "eval_runtime": 0.8994, "eval_samples_per_second": 33.354, "eval_steps_per_second": 4.447, "step": 62 }, { "epoch": 3.3157894736842106, "grad_norm": 2.0635435581207275, "learning_rate": 0.0004522542485937369, "loss": 0.5531, "step": 63 }, { "epoch": 3.3157894736842106, "eval_loss": 0.5134085416793823, "eval_runtime": 0.8937, "eval_samples_per_second": 33.567, "eval_steps_per_second": 4.476, "step": 63 }, { "epoch": 3.3684210526315788, "grad_norm": 2.268183708190918, "learning_rate": 0.0004502717855601809, "loss": 0.5291, "step": 64 }, { "epoch": 3.3684210526315788, "eval_loss": 0.5419598817825317, "eval_runtime": 0.8959, "eval_samples_per_second": 33.486, "eval_steps_per_second": 4.465, "step": 64 }, { "epoch": 3.4210526315789473, "grad_norm": 1.8800358772277832, "learning_rate": 0.0004482535312390058, "loss": 0.5501, "step": 65 }, { "epoch": 3.4210526315789473, "eval_loss": 0.5209227800369263, "eval_runtime": 0.8927, "eval_samples_per_second": 33.606, "eval_steps_per_second": 4.481, "step": 65 }, { "epoch": 3.473684210526316, "grad_norm": 3.1507558822631836, "learning_rate": 0.00044619984631966527, "loss": 0.5309, "step": 66 }, { "epoch": 3.473684210526316, "eval_loss": 0.536996603012085, "eval_runtime": 0.8951, "eval_samples_per_second": 33.517, "eval_steps_per_second": 4.469, "step": 66 }, { "epoch": 3.526315789473684, "grad_norm": 3.5700478553771973, "learning_rate": 0.0004441110978235418, "loss": 0.7223, "step": 67 }, { "epoch": 3.526315789473684, "eval_loss": 0.5140640139579773, "eval_runtime": 0.8962, "eval_samples_per_second": 33.474, "eval_steps_per_second": 4.463, "step": 67 }, { "epoch": 3.5789473684210527, "grad_norm": 1.758971929550171, "learning_rate": 0.0004419876590383554, "loss": 0.6927, "step": 68 }, { "epoch": 3.5789473684210527, "eval_loss": 0.47072505950927734, "eval_runtime": 0.9127, "eval_samples_per_second": 32.87, "eval_steps_per_second": 4.383, "step": 68 }, { "epoch": 3.6315789473684212, "grad_norm": 1.5274709463119507, "learning_rate": 0.00043982990945145146, "loss": 0.4762, "step": 69 }, { "epoch": 3.6315789473684212, "eval_loss": 0.4518219828605652, "eval_runtime": 0.8967, "eval_samples_per_second": 33.456, "eval_steps_per_second": 4.461, "step": 69 }, { "epoch": 3.6842105263157894, "grad_norm": 1.7685797214508057, "learning_rate": 0.0004376382346819819, "loss": 0.5629, "step": 70 }, { "epoch": 3.6842105263157894, "eval_loss": 0.40707579255104065, "eval_runtime": 0.8934, "eval_samples_per_second": 33.581, "eval_steps_per_second": 4.478, "step": 70 }, { "epoch": 3.736842105263158, "grad_norm": 1.6618574857711792, "learning_rate": 0.00043541302641198946, "loss": 0.5877, "step": 71 }, { "epoch": 3.736842105263158, "eval_loss": 0.3780651390552521, "eval_runtime": 0.9024, "eval_samples_per_second": 33.246, "eval_steps_per_second": 4.433, "step": 71 }, { "epoch": 3.7894736842105265, "grad_norm": 1.542702317237854, "learning_rate": 0.00043315468231640834, "loss": 0.5222, "step": 72 }, { "epoch": 3.7894736842105265, "eval_loss": 0.3732970356941223, "eval_runtime": 0.9166, "eval_samples_per_second": 32.73, "eval_steps_per_second": 4.364, "step": 72 }, { "epoch": 3.8421052631578947, "grad_norm": 1.8039391040802002, "learning_rate": 0.00043086360599199516, "loss": 0.5238, "step": 73 }, { "epoch": 3.8421052631578947, "eval_loss": 0.3568810820579529, "eval_runtime": 0.9031, "eval_samples_per_second": 33.218, "eval_steps_per_second": 4.429, "step": 73 }, { "epoch": 3.8947368421052633, "grad_norm": 1.6215863227844238, "learning_rate": 0.0004285402068852002, "loss": 0.6504, "step": 74 }, { "epoch": 3.8947368421052633, "eval_loss": 0.3885921835899353, "eval_runtime": 0.896, "eval_samples_per_second": 33.483, "eval_steps_per_second": 4.464, "step": 74 }, { "epoch": 3.9473684210526314, "grad_norm": 1.5152952671051025, "learning_rate": 0.00042618490021899383, "loss": 0.5694, "step": 75 }, { "epoch": 3.9473684210526314, "eval_loss": 0.38745489716529846, "eval_runtime": 0.8939, "eval_samples_per_second": 33.562, "eval_steps_per_second": 4.475, "step": 75 }, { "epoch": 4.0, "grad_norm": 2.6989200115203857, "learning_rate": 0.00042379810691866064, "loss": 0.5849, "step": 76 }, { "epoch": 4.0, "eval_loss": 0.42535698413848877, "eval_runtime": 0.9073, "eval_samples_per_second": 33.066, "eval_steps_per_second": 4.409, "step": 76 }, { "epoch": 4.052631578947368, "grad_norm": 1.7381691932678223, "learning_rate": 0.00042138025353657407, "loss": 0.3779, "step": 77 }, { "epoch": 4.052631578947368, "eval_loss": 0.37115439772605896, "eval_runtime": 0.9112, "eval_samples_per_second": 32.922, "eval_steps_per_second": 4.39, "step": 77 }, { "epoch": 4.105263157894737, "grad_norm": 2.188385248184204, "learning_rate": 0.00041893177217596633, "loss": 0.44, "step": 78 }, { "epoch": 4.105263157894737, "eval_loss": 0.2926563322544098, "eval_runtime": 0.8982, "eval_samples_per_second": 33.401, "eval_steps_per_second": 4.453, "step": 78 }, { "epoch": 4.157894736842105, "grad_norm": 2.3652961254119873, "learning_rate": 0.0004164531004137049, "loss": 0.3639, "step": 79 }, { "epoch": 4.157894736842105, "eval_loss": 0.2751067876815796, "eval_runtime": 0.9146, "eval_samples_per_second": 32.8, "eval_steps_per_second": 4.373, "step": 79 }, { "epoch": 4.2105263157894735, "grad_norm": 2.165874719619751, "learning_rate": 0.0004139446812220924, "loss": 0.2683, "step": 80 }, { "epoch": 4.2105263157894735, "eval_loss": 0.2685202360153198, "eval_runtime": 0.9124, "eval_samples_per_second": 32.881, "eval_steps_per_second": 4.384, "step": 80 }, { "epoch": 4.2631578947368425, "grad_norm": 1.7391912937164307, "learning_rate": 0.0004114069628897006, "loss": 0.2993, "step": 81 }, { "epoch": 4.2631578947368425, "eval_loss": 0.33646491169929504, "eval_runtime": 0.8952, "eval_samples_per_second": 33.51, "eval_steps_per_second": 4.468, "step": 81 }, { "epoch": 4.315789473684211, "grad_norm": 3.65714693069458, "learning_rate": 0.0004088403989412559, "loss": 0.4252, "step": 82 }, { "epoch": 4.315789473684211, "eval_loss": 0.2839888632297516, "eval_runtime": 0.9057, "eval_samples_per_second": 33.123, "eval_steps_per_second": 4.416, "step": 82 }, { "epoch": 4.368421052631579, "grad_norm": 2.1762771606445312, "learning_rate": 0.00040624544805658794, "loss": 0.3304, "step": 83 }, { "epoch": 4.368421052631579, "eval_loss": 0.27002134919166565, "eval_runtime": 0.8939, "eval_samples_per_second": 33.562, "eval_steps_per_second": 4.475, "step": 83 }, { "epoch": 4.421052631578947, "grad_norm": 2.1018354892730713, "learning_rate": 0.00040362257398865713, "loss": 0.4506, "step": 84 }, { "epoch": 4.421052631578947, "eval_loss": 0.2557659149169922, "eval_runtime": 0.8969, "eval_samples_per_second": 33.45, "eval_steps_per_second": 4.46, "step": 84 }, { "epoch": 4.473684210526316, "grad_norm": 1.7509180307388306, "learning_rate": 0.00040097224548067613, "loss": 0.3731, "step": 85 }, { "epoch": 4.473684210526316, "eval_loss": 0.26859304308891296, "eval_runtime": 0.9009, "eval_samples_per_second": 33.299, "eval_steps_per_second": 4.44, "step": 85 }, { "epoch": 4.526315789473684, "grad_norm": 1.971816897392273, "learning_rate": 0.0003982949361823388, "loss": 0.38, "step": 86 }, { "epoch": 4.526315789473684, "eval_loss": 0.2624681293964386, "eval_runtime": 0.8949, "eval_samples_per_second": 33.524, "eval_steps_per_second": 4.47, "step": 86 }, { "epoch": 4.578947368421053, "grad_norm": 1.4714068174362183, "learning_rate": 0.0003955911245651726, "loss": 0.3944, "step": 87 }, { "epoch": 4.578947368421053, "eval_loss": 0.23652420938014984, "eval_runtime": 0.8952, "eval_samples_per_second": 33.511, "eval_steps_per_second": 4.468, "step": 87 }, { "epoch": 4.631578947368421, "grad_norm": 2.6970834732055664, "learning_rate": 0.0003928612938370292, "loss": 0.3374, "step": 88 }, { "epoch": 4.631578947368421, "eval_loss": 0.2716277241706848, "eval_runtime": 0.8932, "eval_samples_per_second": 33.588, "eval_steps_per_second": 4.478, "step": 88 }, { "epoch": 4.684210526315789, "grad_norm": 1.9066615104675293, "learning_rate": 0.00039010593185572867, "loss": 0.2442, "step": 89 }, { "epoch": 4.684210526315789, "eval_loss": 0.2999991476535797, "eval_runtime": 0.8939, "eval_samples_per_second": 33.559, "eval_steps_per_second": 4.475, "step": 89 }, { "epoch": 4.7368421052631575, "grad_norm": 2.6232354640960693, "learning_rate": 0.00038732553104187296, "loss": 0.2857, "step": 90 }, { "epoch": 4.7368421052631575, "eval_loss": 0.2302989959716797, "eval_runtime": 0.8938, "eval_samples_per_second": 33.564, "eval_steps_per_second": 4.475, "step": 90 }, { "epoch": 4.7894736842105265, "grad_norm": 2.0710129737854004, "learning_rate": 0.0003845205882908432, "loss": 0.4195, "step": 91 }, { "epoch": 4.7894736842105265, "eval_loss": 0.21816590428352356, "eval_runtime": 0.9251, "eval_samples_per_second": 32.429, "eval_steps_per_second": 4.324, "step": 91 }, { "epoch": 4.842105263157895, "grad_norm": 1.8006062507629395, "learning_rate": 0.0003816916048839979, "loss": 0.2859, "step": 92 }, { "epoch": 4.842105263157895, "eval_loss": 0.21071405708789825, "eval_runtime": 0.8965, "eval_samples_per_second": 33.462, "eval_steps_per_second": 4.462, "step": 92 }, { "epoch": 4.894736842105263, "grad_norm": 1.6352888345718384, "learning_rate": 0.0003788390863990875, "loss": 0.4275, "step": 93 }, { "epoch": 4.894736842105263, "eval_loss": 0.20206846296787262, "eval_runtime": 0.9052, "eval_samples_per_second": 33.144, "eval_steps_per_second": 4.419, "step": 93 }, { "epoch": 4.947368421052632, "grad_norm": 1.6399378776550293, "learning_rate": 0.00037596354261990007, "loss": 0.389, "step": 94 }, { "epoch": 4.947368421052632, "eval_loss": 0.19467315077781677, "eval_runtime": 0.8973, "eval_samples_per_second": 33.435, "eval_steps_per_second": 4.458, "step": 94 }, { "epoch": 5.0, "grad_norm": 1.5680173635482788, "learning_rate": 0.0003730654874451569, "loss": 0.395, "step": 95 }, { "epoch": 5.0, "eval_loss": 0.19546455144882202, "eval_runtime": 0.91, "eval_samples_per_second": 32.968, "eval_steps_per_second": 4.396, "step": 95 }, { "epoch": 5.052631578947368, "grad_norm": 1.0308386087417603, "learning_rate": 0.00037014543879667093, "loss": 0.1384, "step": 96 }, { "epoch": 5.052631578947368, "eval_loss": 0.18969732522964478, "eval_runtime": 0.9021, "eval_samples_per_second": 33.258, "eval_steps_per_second": 4.434, "step": 96 }, { "epoch": 5.105263157894737, "grad_norm": 1.4042502641677856, "learning_rate": 0.0003672039185267878, "loss": 0.2291, "step": 97 }, { "epoch": 5.105263157894737, "eval_loss": 0.16800740361213684, "eval_runtime": 0.8938, "eval_samples_per_second": 33.563, "eval_steps_per_second": 4.475, "step": 97 }, { "epoch": 5.157894736842105, "grad_norm": 1.6313552856445312, "learning_rate": 0.00036424145232512333, "loss": 0.1736, "step": 98 }, { "epoch": 5.157894736842105, "eval_loss": 0.16714099049568176, "eval_runtime": 0.9009, "eval_samples_per_second": 33.301, "eval_steps_per_second": 4.44, "step": 98 }, { "epoch": 5.2105263157894735, "grad_norm": 1.8922698497772217, "learning_rate": 0.0003612585696246158, "loss": 0.1677, "step": 99 }, { "epoch": 5.2105263157894735, "eval_loss": 0.179762065410614, "eval_runtime": 0.9039, "eval_samples_per_second": 33.188, "eval_steps_per_second": 4.425, "step": 99 }, { "epoch": 5.2631578947368425, "grad_norm": 2.409526824951172, "learning_rate": 0.0003582558035069091, "loss": 0.2379, "step": 100 }, { "epoch": 5.2631578947368425, "eval_loss": 0.1902371197938919, "eval_runtime": 0.9097, "eval_samples_per_second": 32.98, "eval_steps_per_second": 4.397, "step": 100 }, { "epoch": 5.315789473684211, "grad_norm": 2.084869146347046, "learning_rate": 0.0003552336906070838, "loss": 0.2165, "step": 101 }, { "epoch": 5.315789473684211, "eval_loss": 0.17252177000045776, "eval_runtime": 0.8948, "eval_samples_per_second": 33.528, "eval_steps_per_second": 4.47, "step": 101 }, { "epoch": 5.368421052631579, "grad_norm": 1.655718207359314, "learning_rate": 0.000352192771017753, "loss": 0.223, "step": 102 }, { "epoch": 5.368421052631579, "eval_loss": 0.18867380917072296, "eval_runtime": 0.8956, "eval_samples_per_second": 33.495, "eval_steps_per_second": 4.466, "step": 102 }, { "epoch": 5.421052631578947, "grad_norm": 2.672633409500122, "learning_rate": 0.0003491335881925407, "loss": 0.161, "step": 103 }, { "epoch": 5.421052631578947, "eval_loss": 0.1944020837545395, "eval_runtime": 0.8924, "eval_samples_per_second": 33.616, "eval_steps_per_second": 4.482, "step": 103 }, { "epoch": 5.473684210526316, "grad_norm": 1.9712008237838745, "learning_rate": 0.0003460566888489593, "loss": 0.2525, "step": 104 }, { "epoch": 5.473684210526316, "eval_loss": 0.17671068012714386, "eval_runtime": 0.897, "eval_samples_per_second": 33.446, "eval_steps_per_second": 4.459, "step": 104 }, { "epoch": 5.526315789473684, "grad_norm": 2.2153072357177734, "learning_rate": 0.00034296262287070335, "loss": 0.2105, "step": 105 }, { "epoch": 5.526315789473684, "eval_loss": 0.1715732216835022, "eval_runtime": 0.8951, "eval_samples_per_second": 33.514, "eval_steps_per_second": 4.469, "step": 105 }, { "epoch": 5.578947368421053, "grad_norm": 1.8106168508529663, "learning_rate": 0.0003398519432093782, "loss": 0.259, "step": 106 }, { "epoch": 5.578947368421053, "eval_loss": 0.1465868353843689, "eval_runtime": 0.9077, "eval_samples_per_second": 33.051, "eval_steps_per_second": 4.407, "step": 106 }, { "epoch": 5.631578947368421, "grad_norm": 2.1159439086914062, "learning_rate": 0.0003367252057856802, "loss": 0.2065, "step": 107 }, { "epoch": 5.631578947368421, "eval_loss": 0.14219093322753906, "eval_runtime": 0.9049, "eval_samples_per_second": 33.154, "eval_steps_per_second": 4.42, "step": 107 }, { "epoch": 5.684210526315789, "grad_norm": 1.4467761516571045, "learning_rate": 0.00033358296939004547, "loss": 0.2083, "step": 108 }, { "epoch": 5.684210526315789, "eval_loss": 0.1406753957271576, "eval_runtime": 0.8954, "eval_samples_per_second": 33.505, "eval_steps_per_second": 4.467, "step": 108 }, { "epoch": 5.7368421052631575, "grad_norm": 1.3671239614486694, "learning_rate": 0.00033042579558278717, "loss": 0.1825, "step": 109 }, { "epoch": 5.7368421052631575, "eval_loss": 0.13007155060768127, "eval_runtime": 0.8998, "eval_samples_per_second": 33.342, "eval_steps_per_second": 4.446, "step": 109 }, { "epoch": 5.7894736842105265, "grad_norm": 1.479944109916687, "learning_rate": 0.00032725424859373687, "loss": 0.2244, "step": 110 }, { "epoch": 5.7894736842105265, "eval_loss": 0.12692232429981232, "eval_runtime": 0.901, "eval_samples_per_second": 33.298, "eval_steps_per_second": 4.44, "step": 110 }, { "epoch": 5.842105263157895, "grad_norm": 1.5173969268798828, "learning_rate": 0.0003240688952214085, "loss": 0.2273, "step": 111 }, { "epoch": 5.842105263157895, "eval_loss": 0.12454597651958466, "eval_runtime": 0.8987, "eval_samples_per_second": 33.382, "eval_steps_per_second": 4.451, "step": 111 }, { "epoch": 5.894736842105263, "grad_norm": 2.7870988845825195, "learning_rate": 0.00032087030473170445, "loss": 0.2101, "step": 112 }, { "epoch": 5.894736842105263, "eval_loss": 0.12002909928560257, "eval_runtime": 0.893, "eval_samples_per_second": 33.593, "eval_steps_per_second": 4.479, "step": 112 }, { "epoch": 5.947368421052632, "grad_norm": 1.3659342527389526, "learning_rate": 0.00031765904875617973, "loss": 0.1882, "step": 113 }, { "epoch": 5.947368421052632, "eval_loss": 0.10573837906122208, "eval_runtime": 0.8956, "eval_samples_per_second": 33.496, "eval_steps_per_second": 4.466, "step": 113 }, { "epoch": 6.0, "grad_norm": 1.8464044332504272, "learning_rate": 0.00031443570118988356, "loss": 0.2285, "step": 114 }, { "epoch": 6.0, "eval_loss": 0.10221625119447708, "eval_runtime": 0.8955, "eval_samples_per_second": 33.501, "eval_steps_per_second": 4.467, "step": 114 }, { "epoch": 6.052631578947368, "grad_norm": 1.3894392251968384, "learning_rate": 0.00031120083808879663, "loss": 0.1115, "step": 115 }, { "epoch": 6.052631578947368, "eval_loss": 0.09458151459693909, "eval_runtime": 0.8981, "eval_samples_per_second": 33.405, "eval_steps_per_second": 4.454, "step": 115 } ], "logging_steps": 1, "max_steps": 250, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4901149662148608.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }