{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9815303430079156,
  "eval_steps": 500,
  "global_step": 93,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0316622691292876,
      "grad_norm": 2.5990071296691895,
      "learning_rate": 7.000000000000001e-07,
      "loss": 1.176,
      "step": 1
    },
    {
      "epoch": 0.0633245382585752,
      "grad_norm": 2.6270251274108887,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 1.1885,
      "step": 2
    },
    {
      "epoch": 0.09498680738786279,
      "grad_norm": 2.6077260971069336,
      "learning_rate": 2.1e-06,
      "loss": 1.1853,
      "step": 3
    },
    {
      "epoch": 0.1266490765171504,
      "grad_norm": 2.4010000228881836,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 1.1441,
      "step": 4
    },
    {
      "epoch": 0.158311345646438,
      "grad_norm": 1.6012561321258545,
      "learning_rate": 3.5e-06,
      "loss": 1.1434,
      "step": 5
    },
    {
      "epoch": 0.18997361477572558,
      "grad_norm": 1.4890481233596802,
      "learning_rate": 4.2e-06,
      "loss": 1.1287,
      "step": 6
    },
    {
      "epoch": 0.22163588390501318,
      "grad_norm": 2.0511271953582764,
      "learning_rate": 4.9e-06,
      "loss": 1.1233,
      "step": 7
    },
    {
      "epoch": 0.2532981530343008,
      "grad_norm": 1.9991264343261719,
      "learning_rate": 5.600000000000001e-06,
      "loss": 1.1195,
      "step": 8
    },
    {
      "epoch": 0.2849604221635884,
      "grad_norm": 2.4904370307922363,
      "learning_rate": 6.3e-06,
      "loss": 1.0933,
      "step": 9
    },
    {
      "epoch": 0.316622691292876,
      "grad_norm": 2.41306471824646,
      "learning_rate": 7e-06,
      "loss": 1.1054,
      "step": 10
    },
    {
      "epoch": 0.3482849604221636,
      "grad_norm": 1.916558861732483,
      "learning_rate": 6.99749314185716e-06,
      "loss": 1.0968,
      "step": 11
    },
    {
      "epoch": 0.37994722955145116,
      "grad_norm": 1.6149336099624634,
      "learning_rate": 6.989976158478782e-06,
      "loss": 1.0736,
      "step": 12
    },
    {
      "epoch": 0.41160949868073876,
      "grad_norm": 1.5957881212234497,
      "learning_rate": 6.977459817871147e-06,
      "loss": 1.0864,
      "step": 13
    },
    {
      "epoch": 0.44327176781002636,
      "grad_norm": 1.3141591548919678,
      "learning_rate": 6.959962049571609e-06,
      "loss": 1.0507,
      "step": 14
    },
    {
      "epoch": 0.47493403693931396,
      "grad_norm": 1.0635184049606323,
      "learning_rate": 6.937507918964709e-06,
      "loss": 1.0483,
      "step": 15
    },
    {
      "epoch": 0.5065963060686016,
      "grad_norm": 0.8129880428314209,
      "learning_rate": 6.9101295913762465e-06,
      "loss": 1.0232,
      "step": 16
    },
    {
      "epoch": 0.5382585751978892,
      "grad_norm": 1.096314549446106,
      "learning_rate": 6.877866285996766e-06,
      "loss": 1.0304,
      "step": 17
    },
    {
      "epoch": 0.5699208443271768,
      "grad_norm": 0.8672236204147339,
      "learning_rate": 6.840764219700443e-06,
      "loss": 1.0308,
      "step": 18
    },
    {
      "epoch": 0.6015831134564644,
      "grad_norm": 0.9555258750915527,
      "learning_rate": 6.798876540839855e-06,
      "loss": 1.0285,
      "step": 19
    },
    {
      "epoch": 0.633245382585752,
      "grad_norm": 1.5928786993026733,
      "learning_rate": 6.752263253111479e-06,
      "loss": 1.0189,
      "step": 20
    },
    {
      "epoch": 0.6649076517150396,
      "grad_norm": 1.2837454080581665,
      "learning_rate": 6.700991129600976e-06,
      "loss": 1.021,
      "step": 21
    },
    {
      "epoch": 0.6965699208443272,
      "grad_norm": 0.7489935755729675,
      "learning_rate": 6.645133617131388e-06,
      "loss": 1.0013,
      "step": 22
    },
    {
      "epoch": 0.7282321899736148,
      "grad_norm": 0.729206383228302,
      "learning_rate": 6.584770731051271e-06,
      "loss": 0.986,
      "step": 23
    },
    {
      "epoch": 0.7598944591029023,
      "grad_norm": 0.8594318628311157,
      "learning_rate": 6.51998894061348e-06,
      "loss": 0.9853,
      "step": 24
    },
    {
      "epoch": 0.7915567282321899,
      "grad_norm": 0.6937134861946106,
      "learning_rate": 6.4508810451087956e-06,
      "loss": 0.9969,
      "step": 25
    },
    {
      "epoch": 0.8232189973614775,
      "grad_norm": 0.6618143916130066,
      "learning_rate": 6.377546040931835e-06,
      "loss": 0.9784,
      "step": 26
    },
    {
      "epoch": 0.8548812664907651,
      "grad_norm": 0.7150548100471497,
      "learning_rate": 6.300088979769671e-06,
      "loss": 0.9775,
      "step": 27
    },
    {
      "epoch": 0.8865435356200527,
      "grad_norm": 0.8135938048362732,
      "learning_rate": 6.218620818116299e-06,
      "loss": 0.9555,
      "step": 28
    },
    {
      "epoch": 0.9182058047493403,
      "grad_norm": 0.7386907339096069,
      "learning_rate": 6.133258258328535e-06,
      "loss": 0.9536,
      "step": 29
    },
    {
      "epoch": 0.9498680738786279,
      "grad_norm": 0.6299906969070435,
      "learning_rate": 6.044123581451003e-06,
      "loss": 0.9513,
      "step": 30
    },
    {
      "epoch": 0.9815303430079155,
      "grad_norm": 0.6849163174629211,
      "learning_rate": 5.951344472049728e-06,
      "loss": 0.9491,
      "step": 31
    },
    {
      "epoch": 1.0316622691292876,
      "grad_norm": 0.5353211760520935,
      "learning_rate": 5.855053835305216e-06,
      "loss": 1.2099,
      "step": 32
    },
    {
      "epoch": 1.0633245382585752,
      "grad_norm": 0.5629773736000061,
      "learning_rate": 5.755389606627069e-06,
      "loss": 0.8727,
      "step": 33
    },
    {
      "epoch": 1.0949868073878628,
      "grad_norm": 0.5262039303779602,
      "learning_rate": 5.652494554062838e-06,
      "loss": 0.8777,
      "step": 34
    },
    {
      "epoch": 1.1266490765171504,
      "grad_norm": 0.9279022216796875,
      "learning_rate": 5.546516073784165e-06,
      "loss": 0.8661,
      "step": 35
    },
    {
      "epoch": 1.158311345646438,
      "grad_norm": 1.0550135374069214,
      "learning_rate": 5.4376059789431955e-06,
      "loss": 0.8665,
      "step": 36
    },
    {
      "epoch": 1.1899736147757256,
      "grad_norm": 0.4825997054576874,
      "learning_rate": 5.325920282201696e-06,
      "loss": 0.8584,
      "step": 37
    },
    {
      "epoch": 1.2216358839050132,
      "grad_norm": 0.5052205920219421,
      "learning_rate": 5.2116189722444164e-06,
      "loss": 0.8677,
      "step": 38
    },
    {
      "epoch": 1.2532981530343008,
      "grad_norm": 0.4654822051525116,
      "learning_rate": 5.094865784596845e-06,
      "loss": 0.8515,
      "step": 39
    },
    {
      "epoch": 1.2849604221635884,
      "grad_norm": 0.45359933376312256,
      "learning_rate": 4.975827967075644e-06,
      "loss": 0.851,
      "step": 40
    },
    {
      "epoch": 1.316622691292876,
      "grad_norm": 0.968210756778717,
      "learning_rate": 4.854676040207761e-06,
      "loss": 0.8733,
      "step": 41
    },
    {
      "epoch": 1.3482849604221636,
      "grad_norm": 0.5169340372085571,
      "learning_rate": 4.731583552961416e-06,
      "loss": 0.8698,
      "step": 42
    },
    {
      "epoch": 1.3799472295514512,
      "grad_norm": 0.8193064332008362,
      "learning_rate": 4.606726834138884e-06,
      "loss": 0.8464,
      "step": 43
    },
    {
      "epoch": 1.4116094986807388,
      "grad_norm": 0.589012622833252,
      "learning_rate": 4.480284739787175e-06,
      "loss": 0.8465,
      "step": 44
    },
    {
      "epoch": 1.4432717678100264,
      "grad_norm": 0.5069411993026733,
      "learning_rate": 4.352438396988471e-06,
      "loss": 0.8415,
      "step": 45
    },
    {
      "epoch": 1.474934036939314,
      "grad_norm": 0.4803805947303772,
      "learning_rate": 4.223370944397335e-06,
      "loss": 0.8089,
      "step": 46
    },
    {
      "epoch": 1.5065963060686016,
      "grad_norm": 0.44742244482040405,
      "learning_rate": 4.093267269896339e-06,
      "loss": 0.8141,
      "step": 47
    },
    {
      "epoch": 1.5382585751978892,
      "grad_norm": 0.5128947496414185,
      "learning_rate": 3.9623137457459586e-06,
      "loss": 0.8334,
      "step": 48
    },
    {
      "epoch": 1.5699208443271768,
      "grad_norm": 0.43568509817123413,
      "learning_rate": 3.83069796160811e-06,
      "loss": 0.7996,
      "step": 49
    },
    {
      "epoch": 1.6015831134564644,
      "grad_norm": 0.5734230279922485,
      "learning_rate": 3.6986084558257596e-06,
      "loss": 0.8206,
      "step": 50
    },
    {
      "epoch": 1.633245382585752,
      "grad_norm": 0.4465356469154358,
      "learning_rate": 3.5662344453435665e-06,
      "loss": 0.8036,
      "step": 51
    },
    {
      "epoch": 1.6649076517150396,
      "grad_norm": 0.9534016251564026,
      "learning_rate": 3.4337655546564343e-06,
      "loss": 0.849,
      "step": 52
    },
    {
      "epoch": 1.6965699208443272,
      "grad_norm": 0.5999336838722229,
      "learning_rate": 3.301391544174241e-06,
      "loss": 0.7874,
      "step": 53
    },
    {
      "epoch": 1.7282321899736148,
      "grad_norm": 0.4919511079788208,
      "learning_rate": 3.1693020383918907e-06,
      "loss": 0.8012,
      "step": 54
    },
    {
      "epoch": 1.7598944591029024,
      "grad_norm": 0.4128033518791199,
      "learning_rate": 3.0376862542540426e-06,
      "loss": 0.8056,
      "step": 55
    },
    {
      "epoch": 1.79155672823219,
      "grad_norm": 0.4979248642921448,
      "learning_rate": 2.9067327301036616e-06,
      "loss": 0.7769,
      "step": 56
    },
    {
      "epoch": 1.8232189973614776,
      "grad_norm": 0.46139153838157654,
      "learning_rate": 2.7766290556026646e-06,
      "loss": 0.7984,
      "step": 57
    },
    {
      "epoch": 1.8548812664907652,
      "grad_norm": 0.46961304545402527,
      "learning_rate": 2.6475616030115286e-06,
      "loss": 0.7917,
      "step": 58
    },
    {
      "epoch": 1.8865435356200528,
      "grad_norm": 0.4321727752685547,
      "learning_rate": 2.5197152602128256e-06,
      "loss": 0.7852,
      "step": 59
    },
    {
      "epoch": 1.9182058047493404,
      "grad_norm": 0.4466552138328552,
      "learning_rate": 2.393273165861116e-06,
      "loss": 0.8023,
      "step": 60
    },
    {
      "epoch": 1.949868073878628,
      "grad_norm": 0.4448838233947754,
      "learning_rate": 2.2684164470385843e-06,
      "loss": 0.7616,
      "step": 61
    },
    {
      "epoch": 1.9815303430079156,
      "grad_norm": 0.434826523065567,
      "learning_rate": 2.14532395979224e-06,
      "loss": 0.8165,
      "step": 62
    },
    {
      "epoch": 2.0316622691292876,
      "grad_norm": 0.6250742673873901,
      "learning_rate": 2.0241720329243563e-06,
      "loss": 1.0035,
      "step": 63
    },
    {
      "epoch": 2.063324538258575,
      "grad_norm": 0.41192013025283813,
      "learning_rate": 1.905134215403155e-06,
      "loss": 0.7532,
      "step": 64
    },
    {
      "epoch": 2.094986807387863,
      "grad_norm": 0.4167821407318115,
      "learning_rate": 1.7883810277555837e-06,
      "loss": 0.7628,
      "step": 65
    },
    {
      "epoch": 2.1266490765171504,
      "grad_norm": 0.4210691452026367,
      "learning_rate": 1.6740797177983044e-06,
      "loss": 0.7511,
      "step": 66
    },
    {
      "epoch": 2.158311345646438,
      "grad_norm": 0.44451218843460083,
      "learning_rate": 1.5623940210568042e-06,
      "loss": 0.7302,
      "step": 67
    },
    {
      "epoch": 2.1899736147757256,
      "grad_norm": 0.40137404203414917,
      "learning_rate": 1.453483926215835e-06,
      "loss": 0.7315,
      "step": 68
    },
    {
      "epoch": 2.221635883905013,
      "grad_norm": 0.6961039900779724,
      "learning_rate": 1.3475054459371625e-06,
      "loss": 0.7478,
      "step": 69
    },
    {
      "epoch": 2.253298153034301,
      "grad_norm": 0.37633243203163147,
      "learning_rate": 1.2446103933729302e-06,
      "loss": 0.7288,
      "step": 70
    },
    {
      "epoch": 2.2849604221635884,
      "grad_norm": 0.43067774176597595,
      "learning_rate": 1.1449461646947839e-06,
      "loss": 0.7628,
      "step": 71
    },
    {
      "epoch": 2.316622691292876,
      "grad_norm": 0.3962654769420624,
      "learning_rate": 1.048655527950273e-06,
      "loss": 0.7178,
      "step": 72
    },
    {
      "epoch": 2.3482849604221636,
      "grad_norm": 0.9008962512016296,
      "learning_rate": 9.558764185489968e-07,
      "loss": 0.7287,
      "step": 73
    },
    {
      "epoch": 2.379947229551451,
      "grad_norm": 0.4029214084148407,
      "learning_rate": 8.667417416714656e-07,
      "loss": 0.7135,
      "step": 74
    },
    {
      "epoch": 2.411609498680739,
      "grad_norm": 0.37933027744293213,
      "learning_rate": 7.813791818837012e-07,
      "loss": 0.7242,
      "step": 75
    },
    {
      "epoch": 2.4432717678100264,
      "grad_norm": 0.4242548644542694,
      "learning_rate": 6.999110202303293e-07,
      "loss": 0.7201,
      "step": 76
    },
    {
      "epoch": 2.474934036939314,
      "grad_norm": 0.5838273167610168,
      "learning_rate": 6.22453959068165e-07,
      "loss": 0.6838,
      "step": 77
    },
    {
      "epoch": 2.5065963060686016,
      "grad_norm": 0.3837423026561737,
      "learning_rate": 5.491189548912051e-07,
      "loss": 0.717,
      "step": 78
    },
    {
      "epoch": 2.538258575197889,
      "grad_norm": 1.0475167036056519,
      "learning_rate": 4.800110593865199e-07,
      "loss": 0.7152,
      "step": 79
    },
    {
      "epoch": 2.569920844327177,
      "grad_norm": 0.4407210946083069,
      "learning_rate": 4.15229268948729e-07,
      "loss": 0.7368,
      "step": 80
    },
    {
      "epoch": 2.6015831134564644,
      "grad_norm": 0.39730748534202576,
      "learning_rate": 3.5486638286861297e-07,
      "loss": 0.715,
      "step": 81
    },
    {
      "epoch": 2.633245382585752,
      "grad_norm": 0.5303342938423157,
      "learning_rate": 2.990088703990245e-07,
      "loss": 0.74,
      "step": 82
    },
    {
      "epoch": 2.6649076517150396,
      "grad_norm": 0.6698248386383057,
      "learning_rate": 2.4773674688852197e-07,
      "loss": 0.7142,
      "step": 83
    },
    {
      "epoch": 2.6965699208443272,
      "grad_norm": 0.43907734751701355,
      "learning_rate": 2.0112345916014578e-07,
      "loss": 0.7372,
      "step": 84
    },
    {
      "epoch": 2.728232189973615,
      "grad_norm": 0.37293171882629395,
      "learning_rate": 1.592357802995572e-07,
      "loss": 0.7241,
      "step": 85
    },
    {
      "epoch": 2.7598944591029024,
      "grad_norm": 0.394404798746109,
      "learning_rate": 1.2213371400323352e-07,
      "loss": 0.7095,
      "step": 86
    },
    {
      "epoch": 2.79155672823219,
      "grad_norm": 0.7737839818000793,
      "learning_rate": 8.987040862375339e-08,
      "loss": 0.6919,
      "step": 87
    },
    {
      "epoch": 2.8232189973614776,
      "grad_norm": 0.3485806882381439,
      "learning_rate": 6.249208103529092e-08,
      "loss": 0.7263,
      "step": 88
    },
    {
      "epoch": 2.8548812664907652,
      "grad_norm": 0.39798641204833984,
      "learning_rate": 4.0037950428390144e-08,
      "loss": 0.7415,
      "step": 89
    },
    {
      "epoch": 2.886543535620053,
      "grad_norm": 0.38925713300704956,
      "learning_rate": 2.254018212885278e-08,
      "loss": 0.7267,
      "step": 90
    },
    {
      "epoch": 2.9182058047493404,
      "grad_norm": 0.361558198928833,
      "learning_rate": 1.0023841521217825e-08,
      "loss": 0.7175,
      "step": 91
    },
    {
      "epoch": 2.949868073878628,
      "grad_norm": 0.5748413801193237,
      "learning_rate": 2.506858142839852e-09,
      "loss": 0.7126,
      "step": 92
    },
    {
      "epoch": 2.9815303430079156,
      "grad_norm": 0.3813362717628479,
      "learning_rate": 0.0,
      "loss": 0.7082,
      "step": 93
    },
    {
      "epoch": 2.9815303430079156,
      "step": 93,
      "total_flos": 2930651586101248.0,
      "train_loss": 0.8763364265041966,
      "train_runtime": 71308.8652,
      "train_samples_per_second": 0.382,
      "train_steps_per_second": 0.001
    }
  ],
  "logging_steps": 1,
  "max_steps": 93,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2930651586101248.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}