| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 199, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005037783375314861, |
| "grad_norm": 45.14028549194336, |
| "learning_rate": 8.333333333333333e-07, |
| "loss": 1.8127, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.010075566750629723, |
| "grad_norm": 41.275089263916016, |
| "learning_rate": 1.6666666666666667e-06, |
| "loss": 1.8862, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.015113350125944584, |
| "grad_norm": 45.339202880859375, |
| "learning_rate": 2.5e-06, |
| "loss": 1.8741, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.020151133501259445, |
| "grad_norm": 38.83050537109375, |
| "learning_rate": 3.3333333333333333e-06, |
| "loss": 1.7667, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.02518891687657431, |
| "grad_norm": 33.30706024169922, |
| "learning_rate": 4.166666666666667e-06, |
| "loss": 1.5813, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.030226700251889168, |
| "grad_norm": 26.171358108520508, |
| "learning_rate": 5e-06, |
| "loss": 1.2841, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03526448362720403, |
| "grad_norm": 19.258102416992188, |
| "learning_rate": 4.99966880364306e-06, |
| "loss": 1.1433, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.04030226700251889, |
| "grad_norm": 9.452253341674805, |
| "learning_rate": 4.998675302325061e-06, |
| "loss": 1.042, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.04534005037783375, |
| "grad_norm": 11.662043571472168, |
| "learning_rate": 4.997019759281217e-06, |
| "loss": 1.0348, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.05037783375314862, |
| "grad_norm": 15.509767532348633, |
| "learning_rate": 4.994702613159386e-06, |
| "loss": 1.0352, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.055415617128463476, |
| "grad_norm": 9.309309959411621, |
| "learning_rate": 4.991724477903854e-06, |
| "loss": 1.0574, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.060453400503778336, |
| "grad_norm": 6.886810779571533, |
| "learning_rate": 4.988086142592658e-06, |
| "loss": 0.9625, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.0654911838790932, |
| "grad_norm": 6.271539211273193, |
| "learning_rate": 4.983788571228516e-06, |
| "loss": 0.944, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.07052896725440806, |
| "grad_norm": 6.019953727722168, |
| "learning_rate": 4.978832902483415e-06, |
| "loss": 0.9427, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.07556675062972293, |
| "grad_norm": 4.751285552978516, |
| "learning_rate": 4.9732204493969e-06, |
| "loss": 0.9923, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.08060453400503778, |
| "grad_norm": 7.569280624389648, |
| "learning_rate": 4.9669526990281855e-06, |
| "loss": 1.0161, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.08564231738035265, |
| "grad_norm": 8.2361421585083, |
| "learning_rate": 4.960031312062141e-06, |
| "loss": 0.8856, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.0906801007556675, |
| "grad_norm": 8.086812973022461, |
| "learning_rate": 4.952458122369286e-06, |
| "loss": 1.0242, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.09571788413098237, |
| "grad_norm": 4.911264896392822, |
| "learning_rate": 4.944235136519888e-06, |
| "loss": 0.88, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.10075566750629723, |
| "grad_norm": 5.140178203582764, |
| "learning_rate": 4.935364533252314e-06, |
| "loss": 0.9321, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10579345088161209, |
| "grad_norm": 4.421811580657959, |
| "learning_rate": 4.925848662895753e-06, |
| "loss": 0.9492, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.11083123425692695, |
| "grad_norm": 5.252143859863281, |
| "learning_rate": 4.9156900467474785e-06, |
| "loss": 0.8854, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.11586901763224182, |
| "grad_norm": 5.303826808929443, |
| "learning_rate": 4.904891376404822e-06, |
| "loss": 0.9771, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.12090680100755667, |
| "grad_norm": 4.818070888519287, |
| "learning_rate": 4.893455513052003e-06, |
| "loss": 0.9085, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.12594458438287154, |
| "grad_norm": 4.050323486328125, |
| "learning_rate": 4.881385486702047e-06, |
| "loss": 0.8741, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.1309823677581864, |
| "grad_norm": 4.825730800628662, |
| "learning_rate": 4.868684495393958e-06, |
| "loss": 0.8568, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.13602015113350127, |
| "grad_norm": 4.85552978515625, |
| "learning_rate": 4.855355904345377e-06, |
| "loss": 0.9123, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.14105793450881612, |
| "grad_norm": 3.755789041519165, |
| "learning_rate": 4.841403245060943e-06, |
| "loss": 0.9049, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.14609571788413098, |
| "grad_norm": 4.334013938903809, |
| "learning_rate": 4.826830214396594e-06, |
| "loss": 0.8393, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.15113350125944586, |
| "grad_norm": 4.264854907989502, |
| "learning_rate": 4.8116406735800645e-06, |
| "loss": 0.8636, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1561712846347607, |
| "grad_norm": 4.067491054534912, |
| "learning_rate": 4.7958386471878185e-06, |
| "loss": 0.9626, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.16120906801007556, |
| "grad_norm": 3.913526773452759, |
| "learning_rate": 4.779428322078716e-06, |
| "loss": 0.8366, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.16624685138539042, |
| "grad_norm": 4.512166500091553, |
| "learning_rate": 4.76241404628467e-06, |
| "loss": 0.8766, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.1712846347607053, |
| "grad_norm": 3.469944477081299, |
| "learning_rate": 4.744800327858608e-06, |
| "loss": 0.8931, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.17632241813602015, |
| "grad_norm": 3.761395215988159, |
| "learning_rate": 4.726591833680031e-06, |
| "loss": 0.9317, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.181360201511335, |
| "grad_norm": 3.780844211578369, |
| "learning_rate": 4.7077933882184864e-06, |
| "loss": 0.8701, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.18639798488664988, |
| "grad_norm": 3.842653751373291, |
| "learning_rate": 4.688409972255299e-06, |
| "loss": 0.8502, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.19143576826196473, |
| "grad_norm": 3.1387386322021484, |
| "learning_rate": 4.6684467215638694e-06, |
| "loss": 0.8606, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.1964735516372796, |
| "grad_norm": 5.945749282836914, |
| "learning_rate": 4.647908925548918e-06, |
| "loss": 0.8836, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.20151133501259447, |
| "grad_norm": 3.8087077140808105, |
| "learning_rate": 4.626802025845023e-06, |
| "loss": 0.8729, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.20654911838790932, |
| "grad_norm": 4.801278591156006, |
| "learning_rate": 4.605131614874813e-06, |
| "loss": 0.8895, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.21158690176322417, |
| "grad_norm": 3.3816072940826416, |
| "learning_rate": 4.582903434367222e-06, |
| "loss": 0.875, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.21662468513853905, |
| "grad_norm": 3.7405855655670166, |
| "learning_rate": 4.560123373836174e-06, |
| "loss": 0.8679, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.2216624685138539, |
| "grad_norm": 3.2052128314971924, |
| "learning_rate": 4.536797469020116e-06, |
| "loss": 0.8453, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.22670025188916876, |
| "grad_norm": 3.2078239917755127, |
| "learning_rate": 4.512931900282807e-06, |
| "loss": 0.8965, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.23173803526448364, |
| "grad_norm": 3.318209409713745, |
| "learning_rate": 4.4885329909757836e-06, |
| "loss": 0.7962, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.2367758186397985, |
| "grad_norm": 4.799398899078369, |
| "learning_rate": 4.463607205762948e-06, |
| "loss": 0.8804, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.24181360201511334, |
| "grad_norm": 2.9974491596221924, |
| "learning_rate": 4.438161148907703e-06, |
| "loss": 0.8118, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.24685138539042822, |
| "grad_norm": 2.7640256881713867, |
| "learning_rate": 4.4122015625231125e-06, |
| "loss": 0.8798, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.2518891687657431, |
| "grad_norm": 3.4179089069366455, |
| "learning_rate": 4.385735324785528e-06, |
| "loss": 0.8466, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.25692695214105793, |
| "grad_norm": 3.160508632659912, |
| "learning_rate": 4.3587694481121664e-06, |
| "loss": 0.8293, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.2619647355163728, |
| "grad_norm": 3.2087128162384033, |
| "learning_rate": 4.331311077303119e-06, |
| "loss": 0.8523, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.26700251889168763, |
| "grad_norm": 2.897143840789795, |
| "learning_rate": 4.303367487648289e-06, |
| "loss": 0.8541, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.27204030226700254, |
| "grad_norm": 3.211749792098999, |
| "learning_rate": 4.274946082999753e-06, |
| "loss": 0.8348, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.2770780856423174, |
| "grad_norm": 3.2269697189331055, |
| "learning_rate": 4.246054393810053e-06, |
| "loss": 0.8849, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.28211586901763225, |
| "grad_norm": 3.219303607940674, |
| "learning_rate": 4.2167000751369535e-06, |
| "loss": 0.7562, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.2871536523929471, |
| "grad_norm": 2.986050844192505, |
| "learning_rate": 4.186890904615178e-06, |
| "loss": 0.7716, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.29219143576826195, |
| "grad_norm": 3.107534885406494, |
| "learning_rate": 4.156634780395672e-06, |
| "loss": 0.8897, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.2972292191435768, |
| "grad_norm": 3.1411337852478027, |
| "learning_rate": 4.125939719052927e-06, |
| "loss": 0.8865, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.3022670025188917, |
| "grad_norm": 3.5057530403137207, |
| "learning_rate": 4.094813853460938e-06, |
| "loss": 0.8619, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.30730478589420657, |
| "grad_norm": 3.040756940841675, |
| "learning_rate": 4.063265430638338e-06, |
| "loss": 0.8592, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.3123425692695214, |
| "grad_norm": 2.7552523612976074, |
| "learning_rate": 4.031302809563292e-06, |
| "loss": 0.7954, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.31738035264483627, |
| "grad_norm": 2.9611120223999023, |
| "learning_rate": 3.998934458958726e-06, |
| "loss": 0.8734, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.3224181360201511, |
| "grad_norm": 4.978094577789307, |
| "learning_rate": 3.96616895504848e-06, |
| "loss": 0.8818, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.327455919395466, |
| "grad_norm": 4.14245080947876, |
| "learning_rate": 3.933014979284978e-06, |
| "loss": 0.8783, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.33249370277078083, |
| "grad_norm": 2.922802209854126, |
| "learning_rate": 3.899481316049012e-06, |
| "loss": 0.8348, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.33753148614609574, |
| "grad_norm": 2.9351940155029297, |
| "learning_rate": 3.86557685032226e-06, |
| "loss": 0.7839, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.3425692695214106, |
| "grad_norm": 3.6585469245910645, |
| "learning_rate": 3.83131056533315e-06, |
| "loss": 0.9167, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.34760705289672544, |
| "grad_norm": 2.93361496925354, |
| "learning_rate": 3.7966915401766845e-06, |
| "loss": 0.8308, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.3526448362720403, |
| "grad_norm": 3.2033188343048096, |
| "learning_rate": 3.7617289474088725e-06, |
| "loss": 0.8435, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.35768261964735515, |
| "grad_norm": 3.1813461780548096, |
| "learning_rate": 3.726432050616399e-06, |
| "loss": 0.9147, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.36272040302267, |
| "grad_norm": 13.234034538269043, |
| "learning_rate": 3.6908102019621667e-06, |
| "loss": 0.8469, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.3677581863979849, |
| "grad_norm": 4.6746392250061035, |
| "learning_rate": 3.6548728397073756e-06, |
| "loss": 0.8532, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.37279596977329976, |
| "grad_norm": 2.943939447402954, |
| "learning_rate": 3.6186294857107933e-06, |
| "loss": 0.8466, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.3778337531486146, |
| "grad_norm": 3.160301923751831, |
| "learning_rate": 3.582089742905864e-06, |
| "loss": 0.9207, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.38287153652392947, |
| "grad_norm": 3.0454745292663574, |
| "learning_rate": 3.545263292756348e-06, |
| "loss": 0.8529, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.3879093198992443, |
| "grad_norm": 2.6905884742736816, |
| "learning_rate": 3.5081598926911487e-06, |
| "loss": 0.823, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.3929471032745592, |
| "grad_norm": 3.0475916862487793, |
| "learning_rate": 3.470789373519012e-06, |
| "loss": 0.8753, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.3979848866498741, |
| "grad_norm": 3.0917882919311523, |
| "learning_rate": 3.433161636823782e-06, |
| "loss": 0.8456, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.40302267002518893, |
| "grad_norm": 3.0129034519195557, |
| "learning_rate": 3.39528665234091e-06, |
| "loss": 0.8902, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.4080604534005038, |
| "grad_norm": 2.6000654697418213, |
| "learning_rate": 3.3571744553159e-06, |
| "loss": 0.8188, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.41309823677581864, |
| "grad_norm": 3.156890392303467, |
| "learning_rate": 3.3188351438454e-06, |
| "loss": 0.7446, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.4181360201511335, |
| "grad_norm": 3.192892074584961, |
| "learning_rate": 3.2802788762016385e-06, |
| "loss": 0.9118, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.42317380352644834, |
| "grad_norm": 5.0082621574401855, |
| "learning_rate": 3.2415158681409215e-06, |
| "loss": 0.832, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.4282115869017632, |
| "grad_norm": 3.4209702014923096, |
| "learning_rate": 3.2025563901968903e-06, |
| "loss": 0.8107, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.4332493702770781, |
| "grad_norm": 2.90742564201355, |
| "learning_rate": 3.1634107649592772e-06, |
| "loss": 0.8065, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.43828715365239296, |
| "grad_norm": 3.0818216800689697, |
| "learning_rate": 3.1240893643388558e-06, |
| "loss": 0.8992, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.4433249370277078, |
| "grad_norm": 2.713526725769043, |
| "learning_rate": 3.0846026068193354e-06, |
| "loss": 0.8297, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.44836272040302266, |
| "grad_norm": 2.930645704269409, |
| "learning_rate": 3.044960954696906e-06, |
| "loss": 0.8479, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.4534005037783375, |
| "grad_norm": 2.680250644683838, |
| "learning_rate": 3.00517491130818e-06, |
| "loss": 0.8159, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.45843828715365237, |
| "grad_norm": 2.9555230140686035, |
| "learning_rate": 2.96525501824726e-06, |
| "loss": 0.9478, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.4634760705289673, |
| "grad_norm": 4.101602554321289, |
| "learning_rate": 2.925211852572667e-06, |
| "loss": 0.8625, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.46851385390428213, |
| "grad_norm": 5.067833423614502, |
| "learning_rate": 2.8850560240048737e-06, |
| "loss": 0.8574, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.473551637279597, |
| "grad_norm": 2.771819591522217, |
| "learning_rate": 2.844798172115185e-06, |
| "loss": 0.871, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.47858942065491183, |
| "grad_norm": 2.993462085723877, |
| "learning_rate": 2.80444896350671e-06, |
| "loss": 0.9423, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.4836272040302267, |
| "grad_norm": 2.8279411792755127, |
| "learning_rate": 2.764019088988165e-06, |
| "loss": 0.8155, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.48866498740554154, |
| "grad_norm": 2.840853452682495, |
| "learning_rate": 2.723519260741271e-06, |
| "loss": 0.8262, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.49370277078085645, |
| "grad_norm": 3.424471378326416, |
| "learning_rate": 2.6829602094824864e-06, |
| "loss": 0.8527, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.4987405541561713, |
| "grad_norm": 3.408869981765747, |
| "learning_rate": 2.6423526816198253e-06, |
| "loss": 0.8253, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.5037783375314862, |
| "grad_norm": 5.029043674468994, |
| "learning_rate": 2.601707436405521e-06, |
| "loss": 0.8504, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.5088161209068011, |
| "grad_norm": 3.400862216949463, |
| "learning_rate": 2.5610352430852888e-06, |
| "loss": 0.8509, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.5138539042821159, |
| "grad_norm": 3.2282321453094482, |
| "learning_rate": 2.5203468780449324e-06, |
| "loss": 0.8145, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.5188916876574308, |
| "grad_norm": 3.0265862941741943, |
| "learning_rate": 2.4796531219550684e-06, |
| "loss": 0.8479, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.5239294710327456, |
| "grad_norm": 3.553727388381958, |
| "learning_rate": 2.438964756914712e-06, |
| "loss": 0.8581, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.5289672544080605, |
| "grad_norm": 3.089478015899658, |
| "learning_rate": 2.398292563594479e-06, |
| "loss": 0.8849, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.5340050377833753, |
| "grad_norm": 2.7626805305480957, |
| "learning_rate": 2.357647318380176e-06, |
| "loss": 0.8332, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.5390428211586902, |
| "grad_norm": 3.013871192932129, |
| "learning_rate": 2.3170397905175144e-06, |
| "loss": 0.8032, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.5440806045340051, |
| "grad_norm": 3.2517004013061523, |
| "learning_rate": 2.2764807392587303e-06, |
| "loss": 0.798, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.5491183879093199, |
| "grad_norm": 14.13714599609375, |
| "learning_rate": 2.2359809110118358e-06, |
| "loss": 0.8714, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.5541561712846348, |
| "grad_norm": 3.096505641937256, |
| "learning_rate": 2.1955510364932904e-06, |
| "loss": 0.7631, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5591939546599496, |
| "grad_norm": 2.820859432220459, |
| "learning_rate": 2.1552018278848145e-06, |
| "loss": 0.8005, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.5642317380352645, |
| "grad_norm": 6.947901248931885, |
| "learning_rate": 2.1149439759951276e-06, |
| "loss": 0.799, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.5692695214105793, |
| "grad_norm": 2.9445698261260986, |
| "learning_rate": 2.074788147427334e-06, |
| "loss": 0.806, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.5743073047858942, |
| "grad_norm": 2.744227409362793, |
| "learning_rate": 2.034744981752741e-06, |
| "loss": 0.8305, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.5793450881612091, |
| "grad_norm": 2.6451659202575684, |
| "learning_rate": 1.9948250886918204e-06, |
| "loss": 0.7354, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.5843828715365239, |
| "grad_norm": 2.812647819519043, |
| "learning_rate": 1.955039045303095e-06, |
| "loss": 0.7877, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.5894206549118388, |
| "grad_norm": 3.6654205322265625, |
| "learning_rate": 1.915397393180665e-06, |
| "loss": 0.8502, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.5944584382871536, |
| "grad_norm": 3.1642322540283203, |
| "learning_rate": 1.8759106356611453e-06, |
| "loss": 0.8214, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.5994962216624685, |
| "grad_norm": 2.7508504390716553, |
| "learning_rate": 1.8365892350407238e-06, |
| "loss": 0.8778, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.6045340050377834, |
| "grad_norm": 2.832481622695923, |
| "learning_rate": 1.7974436098031105e-06, |
| "loss": 0.7971, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.6095717884130982, |
| "grad_norm": 3.0751426219940186, |
| "learning_rate": 1.7584841318590796e-06, |
| "loss": 0.8394, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.6146095717884131, |
| "grad_norm": 2.66140079498291, |
| "learning_rate": 1.719721123798362e-06, |
| "loss": 0.7793, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.6196473551637279, |
| "grad_norm": 3.033729314804077, |
| "learning_rate": 1.6811648561546003e-06, |
| "loss": 0.7719, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.6246851385390428, |
| "grad_norm": 24.4138240814209, |
| "learning_rate": 1.642825544684101e-06, |
| "loss": 0.8482, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.6297229219143576, |
| "grad_norm": 3.2902424335479736, |
| "learning_rate": 1.6047133476590908e-06, |
| "loss": 0.8713, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.6347607052896725, |
| "grad_norm": 2.7968010902404785, |
| "learning_rate": 1.566838363176219e-06, |
| "loss": 0.7406, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.6397984886649875, |
| "grad_norm": 3.296938419342041, |
| "learning_rate": 1.5292106264809888e-06, |
| "loss": 0.7567, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.6448362720403022, |
| "grad_norm": 2.7469353675842285, |
| "learning_rate": 1.4918401073088517e-06, |
| "loss": 0.8542, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.6498740554156172, |
| "grad_norm": 2.775825262069702, |
| "learning_rate": 1.4547367072436519e-06, |
| "loss": 0.7611, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.654911838790932, |
| "grad_norm": 3.243407726287842, |
| "learning_rate": 1.4179102570941368e-06, |
| "loss": 0.8393, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.6599496221662469, |
| "grad_norm": 2.9354608058929443, |
| "learning_rate": 1.3813705142892082e-06, |
| "loss": 0.8897, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.6649874055415617, |
| "grad_norm": 2.8261704444885254, |
| "learning_rate": 1.3451271602926248e-06, |
| "loss": 0.8167, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.6700251889168766, |
| "grad_norm": 2.76542067527771, |
| "learning_rate": 1.309189798037834e-06, |
| "loss": 0.8355, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.6750629722921915, |
| "grad_norm": 2.8934812545776367, |
| "learning_rate": 1.273567949383601e-06, |
| "loss": 0.8766, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.6801007556675063, |
| "grad_norm": 3.0498292446136475, |
| "learning_rate": 1.238271052591127e-06, |
| "loss": 0.861, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.6851385390428212, |
| "grad_norm": 2.881106376647949, |
| "learning_rate": 1.2033084598233163e-06, |
| "loss": 0.7946, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.690176322418136, |
| "grad_norm": 2.83734130859375, |
| "learning_rate": 1.1686894346668512e-06, |
| "loss": 0.7484, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.6952141057934509, |
| "grad_norm": 2.4155306816101074, |
| "learning_rate": 1.1344231496777406e-06, |
| "loss": 0.8025, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.7002518891687658, |
| "grad_norm": 2.696739435195923, |
| "learning_rate": 1.1005186839509887e-06, |
| "loss": 0.7984, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.7052896725440806, |
| "grad_norm": 2.9532501697540283, |
| "learning_rate": 1.066985020715022e-06, |
| "loss": 0.8436, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.7103274559193955, |
| "grad_norm": 2.7825965881347656, |
| "learning_rate": 1.0338310449515197e-06, |
| "loss": 0.8718, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.7153652392947103, |
| "grad_norm": 2.916924238204956, |
| "learning_rate": 1.0010655410412745e-06, |
| "loss": 0.7749, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.7204030226700252, |
| "grad_norm": 2.610424757003784, |
| "learning_rate": 9.68697190436709e-07, |
| "loss": 0.7614, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.72544080604534, |
| "grad_norm": 2.67132830619812, |
| "learning_rate": 9.367345693616625e-07, |
| "loss": 0.821, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.7304785894206549, |
| "grad_norm": 2.876641035079956, |
| "learning_rate": 9.051861465390624e-07, |
| "loss": 0.7234, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.7355163727959698, |
| "grad_norm": 2.9514031410217285, |
| "learning_rate": 8.740602809470736e-07, |
| "loss": 0.7861, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.7405541561712846, |
| "grad_norm": 2.6026699542999268, |
| "learning_rate": 8.433652196043288e-07, |
| "loss": 0.8253, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.7455919395465995, |
| "grad_norm": 2.879054069519043, |
| "learning_rate": 8.131090953848228e-07, |
| "loss": 0.7749, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.7506297229219143, |
| "grad_norm": 2.8631882667541504, |
| "learning_rate": 7.832999248630479e-07, |
| "loss": 0.9273, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.7556675062972292, |
| "grad_norm": 2.6118288040161133, |
| "learning_rate": 7.53945606189948e-07, |
| "loss": 0.8117, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.760705289672544, |
| "grad_norm": 2.8277623653411865, |
| "learning_rate": 7.250539170002477e-07, |
| "loss": 0.7862, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.7657430730478589, |
| "grad_norm": 5.133010387420654, |
| "learning_rate": 6.96632512351711e-07, |
| "loss": 0.8225, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.7707808564231738, |
| "grad_norm": 2.7790756225585938, |
| "learning_rate": 6.686889226968815e-07, |
| "loss": 0.7539, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.7758186397984886, |
| "grad_norm": 2.755314350128174, |
| "learning_rate": 6.412305518878343e-07, |
| "loss": 0.8238, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.7808564231738035, |
| "grad_norm": 2.6783859729766846, |
| "learning_rate": 6.142646752144724e-07, |
| "loss": 0.7837, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.7858942065491183, |
| "grad_norm": 2.760148763656616, |
| "learning_rate": 5.877984374768878e-07, |
| "loss": 0.8798, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.7909319899244333, |
| "grad_norm": 2.8512697219848633, |
| "learning_rate": 5.618388510922979e-07, |
| "loss": 0.8237, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.7959697732997482, |
| "grad_norm": 2.719089984893799, |
| "learning_rate": 5.363927942370528e-07, |
| "loss": 0.8031, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.801007556675063, |
| "grad_norm": 7.449169158935547, |
| "learning_rate": 5.11467009024216e-07, |
| "loss": 0.7838, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.8060453400503779, |
| "grad_norm": 3.0624005794525146, |
| "learning_rate": 4.870680997171934e-07, |
| "loss": 0.7585, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.8110831234256927, |
| "grad_norm": 2.621262788772583, |
| "learning_rate": 4.6320253097988486e-07, |
| "loss": 0.8421, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.8161209068010076, |
| "grad_norm": 3.4929068088531494, |
| "learning_rate": 4.398766261638271e-07, |
| "loss": 0.7814, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.8211586901763224, |
| "grad_norm": 3.319396734237671, |
| "learning_rate": 4.170965656327791e-07, |
| "loss": 0.8786, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.8261964735516373, |
| "grad_norm": 3.4810092449188232, |
| "learning_rate": 3.9486838512518777e-07, |
| "loss": 0.7662, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.8312342569269522, |
| "grad_norm": 2.9049723148345947, |
| "learning_rate": 3.7319797415497737e-07, |
| "loss": 0.801, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.836272040302267, |
| "grad_norm": 3.931922674179077, |
| "learning_rate": 3.5209107445108195e-07, |
| "loss": 0.8094, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.8413098236775819, |
| "grad_norm": 2.9314591884613037, |
| "learning_rate": 3.3155327843613166e-07, |
| "loss": 0.7802, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.8463476070528967, |
| "grad_norm": 2.7644076347351074, |
| "learning_rate": 3.1159002774470146e-07, |
| "loss": 0.8319, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.8513853904282116, |
| "grad_norm": 2.9492383003234863, |
| "learning_rate": 2.9220661178151366e-07, |
| "loss": 0.7787, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.8564231738035264, |
| "grad_norm": 2.547168254852295, |
| "learning_rate": 2.734081663199695e-07, |
| "loss": 0.7846, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.8614609571788413, |
| "grad_norm": 3.1694071292877197, |
| "learning_rate": 2.551996721413916e-07, |
| "loss": 0.8104, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.8664987405541562, |
| "grad_norm": 3.047084331512451, |
| "learning_rate": 2.375859537153302e-07, |
| "loss": 0.7466, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.871536523929471, |
| "grad_norm": 2.832005262374878, |
| "learning_rate": 2.2057167792128493e-07, |
| "loss": 0.8075, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.8765743073047859, |
| "grad_norm": 2.647271156311035, |
| "learning_rate": 2.0416135281218218e-07, |
| "loss": 0.8217, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.8816120906801007, |
| "grad_norm": 2.717848539352417, |
| "learning_rate": 1.8835932641993627e-07, |
| "loss": 0.7855, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.8866498740554156, |
| "grad_norm": 2.9493443965911865, |
| "learning_rate": 1.7316978560340647e-07, |
| "loss": 0.8201, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.8916876574307305, |
| "grad_norm": 5.114314079284668, |
| "learning_rate": 1.5859675493905769e-07, |
| "loss": 0.7904, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.8967254408060453, |
| "grad_norm": 2.8451216220855713, |
| "learning_rate": 1.4464409565462328e-07, |
| "loss": 0.805, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.9017632241813602, |
| "grad_norm": 2.497908592224121, |
| "learning_rate": 1.3131550460604242e-07, |
| "loss": 0.7827, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.906801007556675, |
| "grad_norm": 3.690392017364502, |
| "learning_rate": 1.1861451329795326e-07, |
| "loss": 0.7521, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.9118387909319899, |
| "grad_norm": 3.2087209224700928, |
| "learning_rate": 1.065444869479973e-07, |
| "loss": 0.7203, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.9168765743073047, |
| "grad_norm": 2.854456901550293, |
| "learning_rate": 9.510862359517815e-08, |
| "loss": 0.819, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.9219143576826196, |
| "grad_norm": 2.9602978229522705, |
| "learning_rate": 8.430995325252128e-08, |
| "loss": 0.798, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.9269521410579346, |
| "grad_norm": 2.870439052581787, |
| "learning_rate": 7.415133710424794e-08, |
| "loss": 0.79, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.9319899244332494, |
| "grad_norm": 2.710660219192505, |
| "learning_rate": 6.463546674768644e-08, |
| "loss": 0.7406, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.9370277078085643, |
| "grad_norm": 3.1572258472442627, |
| "learning_rate": 5.5764863480112233e-08, |
| "loss": 0.8467, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.9420654911838791, |
| "grad_norm": 2.8733699321746826, |
| "learning_rate": 4.754187763071488e-08, |
| "loss": 0.7988, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.947103274559194, |
| "grad_norm": 3.7968437671661377, |
| "learning_rate": 3.996868793785913e-08, |
| "loss": 0.7472, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.9521410579345088, |
| "grad_norm": 2.6690640449523926, |
| "learning_rate": 3.304730097181463e-08, |
| "loss": 0.7358, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.9571788413098237, |
| "grad_norm": 2.642486333847046, |
| "learning_rate": 2.6779550603100168e-08, |
| "loss": 0.865, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.9622166246851386, |
| "grad_norm": 3.4199740886688232, |
| "learning_rate": 2.116709751658591e-08, |
| "loss": 0.7681, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.9672544080604534, |
| "grad_norm": 2.9348366260528564, |
| "learning_rate": 1.6211428771484295e-08, |
| "loss": 0.9291, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.9722921914357683, |
| "grad_norm": 2.5914723873138428, |
| "learning_rate": 1.1913857407343244e-08, |
| "loss": 0.7852, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.9773299748110831, |
| "grad_norm": 2.9415173530578613, |
| "learning_rate": 8.275522096146404e-09, |
| "loss": 0.7209, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.982367758186398, |
| "grad_norm": 2.919996976852417, |
| "learning_rate": 5.297386840614205e-09, |
| "loss": 0.8159, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.9874055415617129, |
| "grad_norm": 3.5275845527648926, |
| "learning_rate": 2.9802407187842773e-09, |
| "loss": 0.7429, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.9924433249370277, |
| "grad_norm": 2.8735129833221436, |
| "learning_rate": 1.3246976749395346e-09, |
| "loss": 0.8319, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.9974811083123426, |
| "grad_norm": 2.514472246170044, |
| "learning_rate": 3.3119635694023324e-10, |
| "loss": 0.7523, |
| "step": 198 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 3.6349916458129883, |
| "learning_rate": 0.0, |
| "loss": 0.8407, |
| "step": 199 |
| }, |
| { |
| "epoch": 1.0, |
| "step": 199, |
| "total_flos": 258447595339776.0, |
| "train_loss": 0.8690116378530186, |
| "train_runtime": 2505.9387, |
| "train_samples_per_second": 2.532, |
| "train_steps_per_second": 0.079 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 199, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 400, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 258447595339776.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|