| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.13541489903275072, | |
| "eval_steps": 500, | |
| "global_step": 133, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.001018157135584592, | |
| "grad_norm": 11.6351900100708, | |
| "learning_rate": 0.0, | |
| "loss": 9.5408, | |
| "num_input_tokens_seen": 1572864, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.002036314271169184, | |
| "grad_norm": 12.06759262084961, | |
| "learning_rate": 3.7037037037037037e-06, | |
| "loss": 9.2981, | |
| "num_input_tokens_seen": 3145728, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0030544714067537756, | |
| "grad_norm": 5.20039701461792, | |
| "learning_rate": 7.4074074074074075e-06, | |
| "loss": 9.1643, | |
| "num_input_tokens_seen": 4718592, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.004072628542338368, | |
| "grad_norm": 5.776241779327393, | |
| "learning_rate": 1.1111111111111112e-05, | |
| "loss": 9.155, | |
| "num_input_tokens_seen": 6291456, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0050907856779229595, | |
| "grad_norm": 5.6493048667907715, | |
| "learning_rate": 1.4814814814814815e-05, | |
| "loss": 9.9883, | |
| "num_input_tokens_seen": 7864320, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.006108942813507551, | |
| "grad_norm": 5.06437873840332, | |
| "learning_rate": 1.8518518518518518e-05, | |
| "loss": 9.0083, | |
| "num_input_tokens_seen": 9437184, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.007127099949092143, | |
| "grad_norm": 8.688281059265137, | |
| "learning_rate": 2.2222222222222223e-05, | |
| "loss": 8.7441, | |
| "num_input_tokens_seen": 11010048, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.008145257084676736, | |
| "grad_norm": 4.987055778503418, | |
| "learning_rate": 2.5925925925925925e-05, | |
| "loss": 9.578, | |
| "num_input_tokens_seen": 12582912, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.009163414220261326, | |
| "grad_norm": 6.269039154052734, | |
| "learning_rate": 2.962962962962963e-05, | |
| "loss": 9.2803, | |
| "num_input_tokens_seen": 14155776, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.010181571355845919, | |
| "grad_norm": 3.3577678203582764, | |
| "learning_rate": 3.3333333333333335e-05, | |
| "loss": 8.7893, | |
| "num_input_tokens_seen": 15728640, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01119972849143051, | |
| "grad_norm": 4.074950218200684, | |
| "learning_rate": 3.7037037037037037e-05, | |
| "loss": 8.8306, | |
| "num_input_tokens_seen": 17301504, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.012217885627015103, | |
| "grad_norm": 2.5604281425476074, | |
| "learning_rate": 4.074074074074074e-05, | |
| "loss": 9.9509, | |
| "num_input_tokens_seen": 18874368, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.013236042762599695, | |
| "grad_norm": 5.522690773010254, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 10.0199, | |
| "num_input_tokens_seen": 20447232, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.014254199898184286, | |
| "grad_norm": 3.264158010482788, | |
| "learning_rate": 4.814814814814815e-05, | |
| "loss": 9.2761, | |
| "num_input_tokens_seen": 22020096, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.015272357033768879, | |
| "grad_norm": 3.1376729011535645, | |
| "learning_rate": 5.185185185185185e-05, | |
| "loss": 9.7989, | |
| "num_input_tokens_seen": 23592960, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.01629051416935347, | |
| "grad_norm": 2.6666500568389893, | |
| "learning_rate": 5.555555555555556e-05, | |
| "loss": 9.2875, | |
| "num_input_tokens_seen": 25165824, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.017308671304938062, | |
| "grad_norm": 3.044588565826416, | |
| "learning_rate": 5.925925925925926e-05, | |
| "loss": 8.9129, | |
| "num_input_tokens_seen": 26738688, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.018326828440522653, | |
| "grad_norm": 3.131470203399658, | |
| "learning_rate": 6.296296296296296e-05, | |
| "loss": 9.2967, | |
| "num_input_tokens_seen": 28311552, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.019344985576107247, | |
| "grad_norm": 3.9975271224975586, | |
| "learning_rate": 6.666666666666667e-05, | |
| "loss": 9.2564, | |
| "num_input_tokens_seen": 29884416, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.020363142711691838, | |
| "grad_norm": 4.00247049331665, | |
| "learning_rate": 7.037037037037038e-05, | |
| "loss": 9.7382, | |
| "num_input_tokens_seen": 31457280, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.02138129984727643, | |
| "grad_norm": 2.88885235786438, | |
| "learning_rate": 7.407407407407407e-05, | |
| "loss": 9.8961, | |
| "num_input_tokens_seen": 33030144, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.02239945698286102, | |
| "grad_norm": 3.772406578063965, | |
| "learning_rate": 7.777777777777778e-05, | |
| "loss": 10.2719, | |
| "num_input_tokens_seen": 34603008, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.023417614118445614, | |
| "grad_norm": 2.989847421646118, | |
| "learning_rate": 8.148148148148148e-05, | |
| "loss": 8.0849, | |
| "num_input_tokens_seen": 36175872, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.024435771254030205, | |
| "grad_norm": 3.9388954639434814, | |
| "learning_rate": 8.518518518518518e-05, | |
| "loss": 9.0817, | |
| "num_input_tokens_seen": 37748736, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.025453928389614796, | |
| "grad_norm": 3.857853889465332, | |
| "learning_rate": 8.888888888888889e-05, | |
| "loss": 9.2902, | |
| "num_input_tokens_seen": 39321600, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.02647208552519939, | |
| "grad_norm": 4.174322605133057, | |
| "learning_rate": 9.25925925925926e-05, | |
| "loss": 8.8552, | |
| "num_input_tokens_seen": 40894464, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.02749024266078398, | |
| "grad_norm": 2.5443918704986572, | |
| "learning_rate": 9.62962962962963e-05, | |
| "loss": 8.4878, | |
| "num_input_tokens_seen": 42467328, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.028508399796368572, | |
| "grad_norm": 3.078213691711426, | |
| "learning_rate": 0.0001, | |
| "loss": 10.4723, | |
| "num_input_tokens_seen": 44040192, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.029526556931953166, | |
| "grad_norm": 4.266736030578613, | |
| "learning_rate": 9.999568045802217e-05, | |
| "loss": 8.8477, | |
| "num_input_tokens_seen": 45613056, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.030544714067537757, | |
| "grad_norm": 6.914696216583252, | |
| "learning_rate": 9.998272257842641e-05, | |
| "loss": 8.4572, | |
| "num_input_tokens_seen": 47185920, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03156287120312235, | |
| "grad_norm": 5.477447032928467, | |
| "learning_rate": 9.996112860009688e-05, | |
| "loss": 10.2007, | |
| "num_input_tokens_seen": 48758784, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.03258102833870694, | |
| "grad_norm": 4.215938091278076, | |
| "learning_rate": 9.993090225407743e-05, | |
| "loss": 9.4179, | |
| "num_input_tokens_seen": 50331648, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.03359918547429153, | |
| "grad_norm": 2.894822597503662, | |
| "learning_rate": 9.989204876292688e-05, | |
| "loss": 9.0953, | |
| "num_input_tokens_seen": 51904512, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.034617342609876124, | |
| "grad_norm": 6.472196102142334, | |
| "learning_rate": 9.984457483981669e-05, | |
| "loss": 9.91, | |
| "num_input_tokens_seen": 53477376, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.035635499745460715, | |
| "grad_norm": 2.529986619949341, | |
| "learning_rate": 9.978848868737098e-05, | |
| "loss": 9.3412, | |
| "num_input_tokens_seen": 55050240, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.036653656881045306, | |
| "grad_norm": 6.255418300628662, | |
| "learning_rate": 9.972379999624936e-05, | |
| "loss": 9.6125, | |
| "num_input_tokens_seen": 56623104, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.0376718140166299, | |
| "grad_norm": 4.211091041564941, | |
| "learning_rate": 9.96505199434725e-05, | |
| "loss": 9.9629, | |
| "num_input_tokens_seen": 58195968, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.038689971152214495, | |
| "grad_norm": 3.872631549835205, | |
| "learning_rate": 9.956866119049095e-05, | |
| "loss": 8.2777, | |
| "num_input_tokens_seen": 59768832, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.039708128287799085, | |
| "grad_norm": 2.6533520221710205, | |
| "learning_rate": 9.947823788099753e-05, | |
| "loss": 9.6421, | |
| "num_input_tokens_seen": 61341696, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.040726285423383676, | |
| "grad_norm": 2.9233040809631348, | |
| "learning_rate": 9.937926563848346e-05, | |
| "loss": 9.1002, | |
| "num_input_tokens_seen": 62914560, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04174444255896827, | |
| "grad_norm": 2.9860687255859375, | |
| "learning_rate": 9.927176156353899e-05, | |
| "loss": 9.3179, | |
| "num_input_tokens_seen": 64487424, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.04276259969455286, | |
| "grad_norm": 8.309036254882812, | |
| "learning_rate": 9.91557442308987e-05, | |
| "loss": 8.7919, | |
| "num_input_tokens_seen": 66060288, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.04378075683013745, | |
| "grad_norm": 3.492337226867676, | |
| "learning_rate": 9.903123368623216e-05, | |
| "loss": 9.2796, | |
| "num_input_tokens_seen": 67633152, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.04479891396572204, | |
| "grad_norm": 4.4715352058410645, | |
| "learning_rate": 9.889825144268029e-05, | |
| "loss": 7.8785, | |
| "num_input_tokens_seen": 69206016, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.04581707110130664, | |
| "grad_norm": 3.341895341873169, | |
| "learning_rate": 9.875682047713846e-05, | |
| "loss": 8.9495, | |
| "num_input_tokens_seen": 70778880, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.04683522823689123, | |
| "grad_norm": 3.275592803955078, | |
| "learning_rate": 9.860696522628639e-05, | |
| "loss": 10.1828, | |
| "num_input_tokens_seen": 72351744, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.04785338537247582, | |
| "grad_norm": 3.430386781692505, | |
| "learning_rate": 9.844871158236591e-05, | |
| "loss": 8.193, | |
| "num_input_tokens_seen": 73924608, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.04887154250806041, | |
| "grad_norm": 2.59080171585083, | |
| "learning_rate": 9.828208688870735e-05, | |
| "loss": 8.7955, | |
| "num_input_tokens_seen": 75497472, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.049889699643645, | |
| "grad_norm": 3.031977653503418, | |
| "learning_rate": 9.810711993500507e-05, | |
| "loss": 10.1191, | |
| "num_input_tokens_seen": 77070336, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.05090785677922959, | |
| "grad_norm": 10.469562530517578, | |
| "learning_rate": 9.792384095234313e-05, | |
| "loss": 9.2337, | |
| "num_input_tokens_seen": 78643200, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.05192601391481419, | |
| "grad_norm": 2.8945353031158447, | |
| "learning_rate": 9.773228160797188e-05, | |
| "loss": 8.936, | |
| "num_input_tokens_seen": 80216064, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.05294417105039878, | |
| "grad_norm": 2.5451819896698, | |
| "learning_rate": 9.753247499983649e-05, | |
| "loss": 9.4338, | |
| "num_input_tokens_seen": 81788928, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.05396232818598337, | |
| "grad_norm": 2.6489815711975098, | |
| "learning_rate": 9.732445565085824e-05, | |
| "loss": 8.2926, | |
| "num_input_tokens_seen": 83361792, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.05498048532156796, | |
| "grad_norm": 2.7393391132354736, | |
| "learning_rate": 9.71082595029695e-05, | |
| "loss": 9.2477, | |
| "num_input_tokens_seen": 84934656, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.05599864245715255, | |
| "grad_norm": 2.3519155979156494, | |
| "learning_rate": 9.688392391090373e-05, | |
| "loss": 9.2921, | |
| "num_input_tokens_seen": 86507520, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.057016799592737144, | |
| "grad_norm": 3.0327858924865723, | |
| "learning_rate": 9.665148763574123e-05, | |
| "loss": 8.9777, | |
| "num_input_tokens_seen": 88080384, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.058034956728321735, | |
| "grad_norm": 3.5062544345855713, | |
| "learning_rate": 9.64109908382119e-05, | |
| "loss": 9.9211, | |
| "num_input_tokens_seen": 89653248, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.05905311386390633, | |
| "grad_norm": 2.559021472930908, | |
| "learning_rate": 9.616247507175623e-05, | |
| "loss": 8.9003, | |
| "num_input_tokens_seen": 91226112, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.060071270999490924, | |
| "grad_norm": 2.288188934326172, | |
| "learning_rate": 9.590598327534564e-05, | |
| "loss": 10.0134, | |
| "num_input_tokens_seen": 92798976, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.061089428135075514, | |
| "grad_norm": 3.029099702835083, | |
| "learning_rate": 9.564155976606339e-05, | |
| "loss": 9.0596, | |
| "num_input_tokens_seen": 94371840, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.062107585270660105, | |
| "grad_norm": 5.305948734283447, | |
| "learning_rate": 9.536925023144742e-05, | |
| "loss": 8.9666, | |
| "num_input_tokens_seen": 95944704, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.0631257424062447, | |
| "grad_norm": 2.6959710121154785, | |
| "learning_rate": 9.508910172159635e-05, | |
| "loss": 9.7026, | |
| "num_input_tokens_seen": 97517568, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.0641438995418293, | |
| "grad_norm": 2.45168399810791, | |
| "learning_rate": 9.480116264104011e-05, | |
| "loss": 8.1744, | |
| "num_input_tokens_seen": 99090432, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.06516205667741388, | |
| "grad_norm": 2.5155069828033447, | |
| "learning_rate": 9.450548274037653e-05, | |
| "loss": 9.145, | |
| "num_input_tokens_seen": 100663296, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.06618021381299848, | |
| "grad_norm": 3.035719394683838, | |
| "learning_rate": 9.420211310767533e-05, | |
| "loss": 8.8996, | |
| "num_input_tokens_seen": 102236160, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.06719837094858307, | |
| "grad_norm": 2.5634443759918213, | |
| "learning_rate": 9.389110615965102e-05, | |
| "loss": 8.8526, | |
| "num_input_tokens_seen": 103809024, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.06821652808416766, | |
| "grad_norm": 2.726071834564209, | |
| "learning_rate": 9.35725156326063e-05, | |
| "loss": 8.6106, | |
| "num_input_tokens_seen": 105381888, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.06923468521975225, | |
| "grad_norm": 7.38182258605957, | |
| "learning_rate": 9.324639657314742e-05, | |
| "loss": 9.3592, | |
| "num_input_tokens_seen": 106954752, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.07025284235533684, | |
| "grad_norm": 2.8825085163116455, | |
| "learning_rate": 9.291280532867302e-05, | |
| "loss": 9.407, | |
| "num_input_tokens_seen": 108527616, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.07127099949092143, | |
| "grad_norm": 3.041630506515503, | |
| "learning_rate": 9.257179953763845e-05, | |
| "loss": 9.2032, | |
| "num_input_tokens_seen": 110100480, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.07228915662650602, | |
| "grad_norm": 2.5680899620056152, | |
| "learning_rate": 9.222343811959693e-05, | |
| "loss": 10.1544, | |
| "num_input_tokens_seen": 111673344, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.07330731376209061, | |
| "grad_norm": 3.0244016647338867, | |
| "learning_rate": 9.186778126501916e-05, | |
| "loss": 8.8151, | |
| "num_input_tokens_seen": 113246208, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.0743254708976752, | |
| "grad_norm": 3.2452633380889893, | |
| "learning_rate": 9.150489042489367e-05, | |
| "loss": 8.2984, | |
| "num_input_tokens_seen": 114819072, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.0753436280332598, | |
| "grad_norm": 3.0332398414611816, | |
| "learning_rate": 9.113482830010918e-05, | |
| "loss": 8.9962, | |
| "num_input_tokens_seen": 116391936, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.0763617851688444, | |
| "grad_norm": 2.4632184505462646, | |
| "learning_rate": 9.075765883062093e-05, | |
| "loss": 10.0713, | |
| "num_input_tokens_seen": 117964800, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.07737994230442899, | |
| "grad_norm": 2.421849012374878, | |
| "learning_rate": 9.037344718440322e-05, | |
| "loss": 9.5697, | |
| "num_input_tokens_seen": 119537664, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.07839809944001358, | |
| "grad_norm": 2.8447134494781494, | |
| "learning_rate": 8.99822597461894e-05, | |
| "loss": 10.0348, | |
| "num_input_tokens_seen": 121110528, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.07941625657559817, | |
| "grad_norm": 3.023327350616455, | |
| "learning_rate": 8.958416410600187e-05, | |
| "loss": 8.2245, | |
| "num_input_tokens_seen": 122683392, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.08043441371118276, | |
| "grad_norm": 2.6515488624572754, | |
| "learning_rate": 8.917922904747384e-05, | |
| "loss": 9.5885, | |
| "num_input_tokens_seen": 124256256, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.08145257084676735, | |
| "grad_norm": 2.6909425258636475, | |
| "learning_rate": 8.876752453596462e-05, | |
| "loss": 7.7732, | |
| "num_input_tokens_seen": 125829120, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.08247072798235194, | |
| "grad_norm": 2.8426523208618164, | |
| "learning_rate": 8.834912170647101e-05, | |
| "loss": 9.0941, | |
| "num_input_tokens_seen": 127401984, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.08348888511793653, | |
| "grad_norm": 23.567190170288086, | |
| "learning_rate": 8.792409285133642e-05, | |
| "loss": 9.6, | |
| "num_input_tokens_seen": 128974848, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.08450704225352113, | |
| "grad_norm": 2.7843549251556396, | |
| "learning_rate": 8.749251140776016e-05, | |
| "loss": 7.8768, | |
| "num_input_tokens_seen": 130547712, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.08552519938910572, | |
| "grad_norm": 2.600492477416992, | |
| "learning_rate": 8.705445194510868e-05, | |
| "loss": 9.2542, | |
| "num_input_tokens_seen": 132120576, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.0865433565246903, | |
| "grad_norm": 2.8346362113952637, | |
| "learning_rate": 8.66099901520315e-05, | |
| "loss": 9.481, | |
| "num_input_tokens_seen": 133693440, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.0875615136602749, | |
| "grad_norm": 2.405304193496704, | |
| "learning_rate": 8.615920282338355e-05, | |
| "loss": 8.7034, | |
| "num_input_tokens_seen": 135266304, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.08857967079585949, | |
| "grad_norm": 3.806959390640259, | |
| "learning_rate": 8.570216784695637e-05, | |
| "loss": 8.5569, | |
| "num_input_tokens_seen": 136839168, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.08959782793144408, | |
| "grad_norm": 3.082869052886963, | |
| "learning_rate": 8.52389641900206e-05, | |
| "loss": 9.3447, | |
| "num_input_tokens_seen": 138412032, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.09061598506702868, | |
| "grad_norm": 3.056046724319458, | |
| "learning_rate": 8.476967188568188e-05, | |
| "loss": 8.0853, | |
| "num_input_tokens_seen": 139984896, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.09163414220261328, | |
| "grad_norm": 2.4092164039611816, | |
| "learning_rate": 8.429437201905254e-05, | |
| "loss": 9.6728, | |
| "num_input_tokens_seen": 141557760, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.09265229933819787, | |
| "grad_norm": 2.415492057800293, | |
| "learning_rate": 8.381314671324159e-05, | |
| "loss": 9.8961, | |
| "num_input_tokens_seen": 143130624, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.09367045647378246, | |
| "grad_norm": 2.674098253250122, | |
| "learning_rate": 8.332607911516545e-05, | |
| "loss": 9.4862, | |
| "num_input_tokens_seen": 144703488, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.09468861360936705, | |
| "grad_norm": 3.606937885284424, | |
| "learning_rate": 8.283325338118153e-05, | |
| "loss": 7.985, | |
| "num_input_tokens_seen": 146276352, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.09570677074495164, | |
| "grad_norm": 2.8389623165130615, | |
| "learning_rate": 8.233475466254765e-05, | |
| "loss": 8.734, | |
| "num_input_tokens_seen": 147849216, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.09672492788053623, | |
| "grad_norm": 25.968826293945312, | |
| "learning_rate": 8.183066909070947e-05, | |
| "loss": 9.4758, | |
| "num_input_tokens_seen": 149422080, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.09774308501612082, | |
| "grad_norm": 3.1141164302825928, | |
| "learning_rate": 8.132108376241849e-05, | |
| "loss": 7.9685, | |
| "num_input_tokens_seen": 150994944, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.09876124215170541, | |
| "grad_norm": 2.62211275100708, | |
| "learning_rate": 8.08060867246834e-05, | |
| "loss": 8.5738, | |
| "num_input_tokens_seen": 152567808, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.09977939928729, | |
| "grad_norm": 3.2089600563049316, | |
| "learning_rate": 8.028576695955711e-05, | |
| "loss": 8.872, | |
| "num_input_tokens_seen": 154140672, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.10079755642287459, | |
| "grad_norm": 2.3852243423461914, | |
| "learning_rate": 7.97602143687623e-05, | |
| "loss": 8.9487, | |
| "num_input_tokens_seen": 155713536, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.10181571355845918, | |
| "grad_norm": 1.760627269744873, | |
| "learning_rate": 7.922951975815811e-05, | |
| "loss": 8.0331, | |
| "num_input_tokens_seen": 157286400, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.10283387069404377, | |
| "grad_norm": 2.6850216388702393, | |
| "learning_rate": 7.869377482205042e-05, | |
| "loss": 8.7944, | |
| "num_input_tokens_seen": 158859264, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.10385202782962838, | |
| "grad_norm": 2.7767252922058105, | |
| "learning_rate": 7.815307212734888e-05, | |
| "loss": 8.2323, | |
| "num_input_tokens_seen": 160432128, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.10487018496521297, | |
| "grad_norm": 2.437563896179199, | |
| "learning_rate": 7.760750509757298e-05, | |
| "loss": 9.859, | |
| "num_input_tokens_seen": 162004992, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.10588834210079756, | |
| "grad_norm": 2.759857177734375, | |
| "learning_rate": 7.705716799671019e-05, | |
| "loss": 8.3508, | |
| "num_input_tokens_seen": 163577856, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.10690649923638215, | |
| "grad_norm": 2.97436785697937, | |
| "learning_rate": 7.650215591292888e-05, | |
| "loss": 8.4575, | |
| "num_input_tokens_seen": 165150720, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.10792465637196674, | |
| "grad_norm": 2.894146203994751, | |
| "learning_rate": 7.594256474214882e-05, | |
| "loss": 9.5371, | |
| "num_input_tokens_seen": 166723584, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.10894281350755133, | |
| "grad_norm": 2.664741039276123, | |
| "learning_rate": 7.537849117147212e-05, | |
| "loss": 8.1767, | |
| "num_input_tokens_seen": 168296448, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.10996097064313592, | |
| "grad_norm": 2.5504794120788574, | |
| "learning_rate": 7.481003266247744e-05, | |
| "loss": 8.7513, | |
| "num_input_tokens_seen": 169869312, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.11097912777872052, | |
| "grad_norm": 2.3367724418640137, | |
| "learning_rate": 7.423728743438048e-05, | |
| "loss": 9.5805, | |
| "num_input_tokens_seen": 171442176, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.1119972849143051, | |
| "grad_norm": 2.3378474712371826, | |
| "learning_rate": 7.366035444706347e-05, | |
| "loss": 9.8007, | |
| "num_input_tokens_seen": 173015040, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1130154420498897, | |
| "grad_norm": 3.5393240451812744, | |
| "learning_rate": 7.307933338397667e-05, | |
| "loss": 9.0731, | |
| "num_input_tokens_seen": 174587904, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.11403359918547429, | |
| "grad_norm": 2.569704532623291, | |
| "learning_rate": 7.249432463491498e-05, | |
| "loss": 8.8012, | |
| "num_input_tokens_seen": 176160768, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.11505175632105888, | |
| "grad_norm": 2.5539438724517822, | |
| "learning_rate": 7.190542927867234e-05, | |
| "loss": 8.9157, | |
| "num_input_tokens_seen": 177733632, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.11606991345664347, | |
| "grad_norm": 3.5465245246887207, | |
| "learning_rate": 7.131274906557725e-05, | |
| "loss": 8.9368, | |
| "num_input_tokens_seen": 179306496, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.11708807059222806, | |
| "grad_norm": 2.584242105484009, | |
| "learning_rate": 7.071638639991207e-05, | |
| "loss": 9.3932, | |
| "num_input_tokens_seen": 180879360, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.11810622772781267, | |
| "grad_norm": 2.462211847305298, | |
| "learning_rate": 7.011644432221958e-05, | |
| "loss": 9.9608, | |
| "num_input_tokens_seen": 182452224, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.11912438486339726, | |
| "grad_norm": 2.8939926624298096, | |
| "learning_rate": 6.95130264914993e-05, | |
| "loss": 8.8036, | |
| "num_input_tokens_seen": 184025088, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.12014254199898185, | |
| "grad_norm": 2.7085700035095215, | |
| "learning_rate": 6.890623716729724e-05, | |
| "loss": 9.6046, | |
| "num_input_tokens_seen": 185597952, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.12116069913456644, | |
| "grad_norm": 2.4249041080474854, | |
| "learning_rate": 6.82961811916917e-05, | |
| "loss": 8.4684, | |
| "num_input_tokens_seen": 187170816, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.12217885627015103, | |
| "grad_norm": 2.345521926879883, | |
| "learning_rate": 6.768296397117848e-05, | |
| "loss": 8.4042, | |
| "num_input_tokens_seen": 188743680, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.12319701340573562, | |
| "grad_norm": 4.278283596038818, | |
| "learning_rate": 6.706669145845863e-05, | |
| "loss": 9.243, | |
| "num_input_tokens_seen": 190316544, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.12421517054132021, | |
| "grad_norm": 3.101922035217285, | |
| "learning_rate": 6.644747013413168e-05, | |
| "loss": 8.6733, | |
| "num_input_tokens_seen": 191889408, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.12523332767690482, | |
| "grad_norm": 3.8298826217651367, | |
| "learning_rate": 6.582540698829781e-05, | |
| "loss": 9.3599, | |
| "num_input_tokens_seen": 193462272, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.1262514848124894, | |
| "grad_norm": 2.876701831817627, | |
| "learning_rate": 6.520060950207185e-05, | |
| "loss": 9.2528, | |
| "num_input_tokens_seen": 195035136, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.127269641948074, | |
| "grad_norm": 2.6711254119873047, | |
| "learning_rate": 6.457318562901256e-05, | |
| "loss": 7.5999, | |
| "num_input_tokens_seen": 196608000, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.1282877990836586, | |
| "grad_norm": 2.2993829250335693, | |
| "learning_rate": 6.394324377647028e-05, | |
| "loss": 9.1343, | |
| "num_input_tokens_seen": 198180864, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.12930595621924318, | |
| "grad_norm": 3.1544792652130127, | |
| "learning_rate": 6.331089278685599e-05, | |
| "loss": 8.7503, | |
| "num_input_tokens_seen": 199753728, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.13032411335482777, | |
| "grad_norm": 2.3709182739257812, | |
| "learning_rate": 6.26762419188355e-05, | |
| "loss": 8.1339, | |
| "num_input_tokens_seen": 201326592, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.13134227049041236, | |
| "grad_norm": 2.0676522254943848, | |
| "learning_rate": 6.203940082845144e-05, | |
| "loss": 8.6629, | |
| "num_input_tokens_seen": 202899456, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.13236042762599695, | |
| "grad_norm": 3.4315857887268066, | |
| "learning_rate": 6.140047955017671e-05, | |
| "loss": 8.5242, | |
| "num_input_tokens_seen": 204472320, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.13337858476158154, | |
| "grad_norm": 2.058745861053467, | |
| "learning_rate": 6.075958847790262e-05, | |
| "loss": 8.8911, | |
| "num_input_tokens_seen": 206045184, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.13439674189716613, | |
| "grad_norm": 2.168302297592163, | |
| "learning_rate": 6.011683834586473e-05, | |
| "loss": 8.767, | |
| "num_input_tokens_seen": 207618048, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.13541489903275072, | |
| "grad_norm": 2.312222480773926, | |
| "learning_rate": 5.947234020951015e-05, | |
| "loss": 8.8736, | |
| "num_input_tokens_seen": 209190912, | |
| "step": 133 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 266, | |
| "num_input_tokens_seen": 209190912, | |
| "num_train_epochs": 1, | |
| "save_steps": 133, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 333751171153920.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |