{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.26628895184136,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0226628895184136,
      "grad_norm": 5.715946698612309,
      "learning_rate": 3.7037037037037036e-08,
      "loss": 1.039,
      "step": 1
    },
    {
      "epoch": 0.0453257790368272,
      "grad_norm": 5.918098634610158,
      "learning_rate": 7.407407407407407e-08,
      "loss": 1.0345,
      "step": 2
    },
    {
      "epoch": 0.0679886685552408,
      "grad_norm": 5.967358491879423,
      "learning_rate": 1.111111111111111e-07,
      "loss": 1.0568,
      "step": 3
    },
    {
      "epoch": 0.0906515580736544,
      "grad_norm": 6.076151471056227,
      "learning_rate": 1.4814814814814815e-07,
      "loss": 1.0407,
      "step": 4
    },
    {
      "epoch": 0.11331444759206799,
      "grad_norm": 5.698276915195162,
      "learning_rate": 1.8518518518518516e-07,
      "loss": 1.0355,
      "step": 5
    },
    {
      "epoch": 0.1359773371104816,
      "grad_norm": 5.524873495595531,
      "learning_rate": 2.222222222222222e-07,
      "loss": 1.0329,
      "step": 6
    },
    {
      "epoch": 0.15864022662889518,
      "grad_norm": 5.663139068043792,
      "learning_rate": 2.5925925925925923e-07,
      "loss": 1.013,
      "step": 7
    },
    {
      "epoch": 0.1813031161473088,
      "grad_norm": 5.483842003291619,
      "learning_rate": 2.962962962962963e-07,
      "loss": 1.0285,
      "step": 8
    },
    {
      "epoch": 0.20396600566572237,
      "grad_norm": 5.501921058157795,
      "learning_rate": 3.333333333333333e-07,
      "loss": 1.0181,
      "step": 9
    },
    {
      "epoch": 0.22662889518413598,
      "grad_norm": 5.691661611678567,
      "learning_rate": 3.703703703703703e-07,
      "loss": 1.0046,
      "step": 10
    },
    {
      "epoch": 0.24929178470254956,
      "grad_norm": 5.490524973688248,
      "learning_rate": 4.0740740740740737e-07,
      "loss": 1.0291,
      "step": 11
    },
    {
      "epoch": 0.2719546742209632,
      "grad_norm": 4.885236117260528,
      "learning_rate": 4.444444444444444e-07,
      "loss": 1.0084,
      "step": 12
    },
    {
      "epoch": 0.29461756373937675,
      "grad_norm": 5.256688897749667,
      "learning_rate": 4.814814814814814e-07,
      "loss": 0.9945,
      "step": 13
    },
    {
      "epoch": 0.31728045325779036,
      "grad_norm": 5.026023661790397,
      "learning_rate": 5.185185185185185e-07,
      "loss": 0.9936,
      "step": 14
    },
    {
      "epoch": 0.33994334277620397,
      "grad_norm": 4.979666180740075,
      "learning_rate": 5.555555555555555e-07,
      "loss": 0.9997,
      "step": 15
    },
    {
      "epoch": 0.3626062322946176,
      "grad_norm": 4.741351636847691,
      "learning_rate": 5.925925925925926e-07,
      "loss": 0.9904,
      "step": 16
    },
    {
      "epoch": 0.38526912181303113,
      "grad_norm": 4.429638197959212,
      "learning_rate": 6.296296296296296e-07,
      "loss": 0.9779,
      "step": 17
    },
    {
      "epoch": 0.40793201133144474,
      "grad_norm": 4.2702651723674006,
      "learning_rate": 6.666666666666666e-07,
      "loss": 0.9373,
      "step": 18
    },
    {
      "epoch": 0.43059490084985835,
      "grad_norm": 4.371215055008036,
      "learning_rate": 7.037037037037037e-07,
      "loss": 0.9616,
      "step": 19
    },
    {
      "epoch": 0.45325779036827196,
      "grad_norm": 4.300078040900759,
      "learning_rate": 7.407407407407406e-07,
      "loss": 0.9581,
      "step": 20
    },
    {
      "epoch": 0.47592067988668557,
      "grad_norm": 4.242855799180736,
      "learning_rate": 7.777777777777778e-07,
      "loss": 0.9454,
      "step": 21
    },
    {
      "epoch": 0.4985835694050991,
      "grad_norm": 3.4536592234259555,
      "learning_rate": 8.148148148148147e-07,
      "loss": 0.9274,
      "step": 22
    },
    {
      "epoch": 0.5212464589235127,
      "grad_norm": 3.3525795982748203,
      "learning_rate": 8.518518518518518e-07,
      "loss": 0.8833,
      "step": 23
    },
    {
      "epoch": 0.5439093484419264,
      "grad_norm": 3.110575381958802,
      "learning_rate": 8.888888888888888e-07,
      "loss": 0.9066,
      "step": 24
    },
    {
      "epoch": 0.56657223796034,
      "grad_norm": 3.18785930927135,
      "learning_rate": 9.259259259259259e-07,
      "loss": 0.8896,
      "step": 25
    },
    {
      "epoch": 0.5892351274787535,
      "grad_norm": 3.0188412291205684,
      "learning_rate": 9.629629629629628e-07,
      "loss": 0.9068,
      "step": 26
    },
    {
      "epoch": 0.6118980169971672,
      "grad_norm": 3.0072699515749344,
      "learning_rate": 1e-06,
      "loss": 0.8959,
      "step": 27
    },
    {
      "epoch": 0.6345609065155807,
      "grad_norm": 3.050779999599616,
      "learning_rate": 9.999560724782173e-07,
      "loss": 0.8648,
      "step": 28
    },
    {
      "epoch": 0.6572237960339944,
      "grad_norm": 3.034749793056673,
      "learning_rate": 9.998242976313776e-07,
      "loss": 0.8763,
      "step": 29
    },
    {
      "epoch": 0.6798866855524079,
      "grad_norm": 2.6230160618361897,
      "learning_rate": 9.996046986136508e-07,
      "loss": 0.8439,
      "step": 30
    },
    {
      "epoch": 0.7025495750708215,
      "grad_norm": 2.619746810094255,
      "learning_rate": 9.992973140107996e-07,
      "loss": 0.8395,
      "step": 31
    },
    {
      "epoch": 0.7252124645892352,
      "grad_norm": 2.2660982887250496,
      "learning_rate": 9.989021978333994e-07,
      "loss": 0.8407,
      "step": 32
    },
    {
      "epoch": 0.7478753541076487,
      "grad_norm": 1.92948640709938,
      "learning_rate": 9.984194195073478e-07,
      "loss": 0.8175,
      "step": 33
    },
    {
      "epoch": 0.7705382436260623,
      "grad_norm": 1.8673042037436878,
      "learning_rate": 9.97849063861667e-07,
      "loss": 0.7963,
      "step": 34
    },
    {
      "epoch": 0.7932011331444759,
      "grad_norm": 1.841378707582655,
      "learning_rate": 9.971912311135967e-07,
      "loss": 0.8177,
      "step": 35
    },
    {
      "epoch": 0.8158640226628895,
      "grad_norm": 1.6212101538356403,
      "learning_rate": 9.964460368509865e-07,
      "loss": 0.8036,
      "step": 36
    },
    {
      "epoch": 0.8385269121813032,
      "grad_norm": 1.6148282593388759,
      "learning_rate": 9.956136120119856e-07,
      "loss": 0.7945,
      "step": 37
    },
    {
      "epoch": 0.8611898016997167,
      "grad_norm": 1.5660870386151309,
      "learning_rate": 9.946941028620347e-07,
      "loss": 0.7919,
      "step": 38
    },
    {
      "epoch": 0.8838526912181303,
      "grad_norm": 1.5162976532167538,
      "learning_rate": 9.936876709681666e-07,
      "loss": 0.7965,
      "step": 39
    },
    {
      "epoch": 0.9065155807365439,
      "grad_norm": 1.4779616090178773,
      "learning_rate": 9.92594493170617e-07,
      "loss": 0.7872,
      "step": 40
    },
    {
      "epoch": 0.9291784702549575,
      "grad_norm": 1.4588545367417372,
      "learning_rate": 9.914147615517526e-07,
      "loss": 0.7933,
      "step": 41
    },
    {
      "epoch": 0.9518413597733711,
      "grad_norm": 1.2450088034935203,
      "learning_rate": 9.901486834023181e-07,
      "loss": 0.7401,
      "step": 42
    },
    {
      "epoch": 0.9745042492917847,
      "grad_norm": 1.1159548060929454,
      "learning_rate": 9.887964811850157e-07,
      "loss": 0.7496,
      "step": 43
    },
    {
      "epoch": 0.9971671388101983,
      "grad_norm": 1.0418410473138606,
      "learning_rate": 9.87358392495415e-07,
      "loss": 0.7568,
      "step": 44
    },
    {
      "epoch": 1.019830028328612,
      "grad_norm": 2.1594760368768195,
      "learning_rate": 9.858346700202048e-07,
      "loss": 1.3469,
      "step": 45
    },
    {
      "epoch": 1.0424929178470255,
      "grad_norm": 0.9706954495224399,
      "learning_rate": 9.842255814927944e-07,
      "loss": 0.7412,
      "step": 46
    },
    {
      "epoch": 1.065155807365439,
      "grad_norm": 0.9479843401943371,
      "learning_rate": 9.825314096462684e-07,
      "loss": 0.712,
      "step": 47
    },
    {
      "epoch": 1.0878186968838528,
      "grad_norm": 0.8785518016295425,
      "learning_rate": 9.807524521637102e-07,
      "loss": 0.721,
      "step": 48
    },
    {
      "epoch": 1.1104815864022664,
      "grad_norm": 0.9083971698155864,
      "learning_rate": 9.788890216258938e-07,
      "loss": 0.7405,
      "step": 49
    },
    {
      "epoch": 1.13314447592068,
      "grad_norm": 0.9052818651846114,
      "learning_rate": 9.769414454563615e-07,
      "loss": 0.7223,
      "step": 50
    },
    {
      "epoch": 1.1558073654390935,
      "grad_norm": 0.8244297426454674,
      "learning_rate": 9.749100658638914e-07,
      "loss": 0.7113,
      "step": 51
    },
    {
      "epoch": 1.178470254957507,
      "grad_norm": 0.7448472800310213,
      "learning_rate": 9.72795239782369e-07,
      "loss": 0.7001,
      "step": 52
    },
    {
      "epoch": 1.2011331444759206,
      "grad_norm": 0.8936397991398377,
      "learning_rate": 9.705973388080692e-07,
      "loss": 0.6924,
      "step": 53
    },
    {
      "epoch": 1.2237960339943343,
      "grad_norm": 0.7188466048624885,
      "learning_rate": 9.68316749134364e-07,
      "loss": 0.7005,
      "step": 54
    },
    {
      "epoch": 1.246458923512748,
      "grad_norm": 0.6923178573722074,
      "learning_rate": 9.659538714838633e-07,
      "loss": 0.6983,
      "step": 55
    },
    {
      "epoch": 1.2691218130311614,
      "grad_norm": 0.6963394168232236,
      "learning_rate": 9.63509121038005e-07,
      "loss": 0.6932,
      "step": 56
    },
    {
      "epoch": 1.291784702549575,
      "grad_norm": 0.6743675615821408,
      "learning_rate": 9.609829273641032e-07,
      "loss": 0.6789,
      "step": 57
    },
    {
      "epoch": 1.3144475920679888,
      "grad_norm": 0.6786035246894967,
      "learning_rate": 9.583757343398684e-07,
      "loss": 0.6628,
      "step": 58
    },
    {
      "epoch": 1.3371104815864023,
      "grad_norm": 0.7270460673039131,
      "learning_rate": 9.55688000075414e-07,
      "loss": 0.6831,
      "step": 59
    },
    {
      "epoch": 1.3597733711048159,
      "grad_norm": 0.6841455902480504,
      "learning_rate": 9.529201968327616e-07,
      "loss": 0.6951,
      "step": 60
    },
    {
      "epoch": 1.3824362606232294,
      "grad_norm": 0.6153616879449294,
      "learning_rate": 9.500728109428603e-07,
      "loss": 0.676,
      "step": 61
    },
    {
      "epoch": 1.405099150141643,
      "grad_norm": 0.6177487537567523,
      "learning_rate": 9.47146342720133e-07,
      "loss": 0.6842,
      "step": 62
    },
    {
      "epoch": 1.4277620396600565,
      "grad_norm": 0.5753559089127149,
      "learning_rate": 9.441413063745659e-07,
      "loss": 0.6408,
      "step": 63
    },
    {
      "epoch": 1.4504249291784703,
      "grad_norm": 0.620464077741966,
      "learning_rate": 9.410582299213572e-07,
      "loss": 0.6952,
      "step": 64
    },
    {
      "epoch": 1.4730878186968839,
      "grad_norm": 0.587732312757755,
      "learning_rate": 9.378976550881392e-07,
      "loss": 0.6897,
      "step": 65
    },
    {
      "epoch": 1.4957507082152974,
      "grad_norm": 0.6133303288545134,
      "learning_rate": 9.346601372197913e-07,
      "loss": 0.6319,
      "step": 66
    },
    {
      "epoch": 1.5184135977337112,
      "grad_norm": 0.5975684805854956,
      "learning_rate": 9.313462451808599e-07,
      "loss": 0.7085,
      "step": 67
    },
    {
      "epoch": 1.5410764872521248,
      "grad_norm": 0.5691716789827311,
      "learning_rate": 9.279565612556042e-07,
      "loss": 0.6799,
      "step": 68
    },
    {
      "epoch": 1.5637393767705383,
      "grad_norm": 0.5623581760482004,
      "learning_rate": 9.24491681045682e-07,
      "loss": 0.6627,
      "step": 69
    },
    {
      "epoch": 1.5864022662889519,
      "grad_norm": 0.5545018113642449,
      "learning_rate": 9.209522133654968e-07,
      "loss": 0.6673,
      "step": 70
    },
    {
      "epoch": 1.6090651558073654,
      "grad_norm": 0.6223379664208608,
      "learning_rate": 9.17338780135223e-07,
      "loss": 0.6682,
      "step": 71
    },
    {
      "epoch": 1.631728045325779,
      "grad_norm": 0.5484348938274137,
      "learning_rate": 9.136520162715286e-07,
      "loss": 0.6459,
      "step": 72
    },
    {
      "epoch": 1.6543909348441925,
      "grad_norm": 0.598633459691356,
      "learning_rate": 9.098925695760131e-07,
      "loss": 0.6663,
      "step": 73
    },
    {
      "epoch": 1.677053824362606,
      "grad_norm": 0.6063642708751795,
      "learning_rate": 9.060611006213832e-07,
      "loss": 0.6471,
      "step": 74
    },
    {
      "epoch": 1.6997167138810199,
      "grad_norm": 0.5310843433827631,
      "learning_rate": 9.021582826353824e-07,
      "loss": 0.6422,
      "step": 75
    },
    {
      "epoch": 1.7223796033994334,
      "grad_norm": 0.5899701772442509,
      "learning_rate": 8.981848013824993e-07,
      "loss": 0.6616,
      "step": 76
    },
    {
      "epoch": 1.7450424929178472,
      "grad_norm": 0.6774981304086599,
      "learning_rate": 8.94141355043471e-07,
      "loss": 0.6442,
      "step": 77
    },
    {
      "epoch": 1.7677053824362607,
      "grad_norm": 0.5555862881849043,
      "learning_rate": 8.90028654092606e-07,
      "loss": 0.6427,
      "step": 78
    },
    {
      "epoch": 1.7903682719546743,
      "grad_norm": 0.5521769324318557,
      "learning_rate": 8.858474211729469e-07,
      "loss": 0.6308,
      "step": 79
    },
    {
      "epoch": 1.8130311614730878,
      "grad_norm": 0.5094008328024741,
      "learning_rate": 8.815983909692941e-07,
      "loss": 0.6375,
      "step": 80
    },
    {
      "epoch": 1.8356940509915014,
      "grad_norm": 0.47949684902186096,
      "learning_rate": 8.77282310079115e-07,
      "loss": 0.6124,
      "step": 81
    },
    {
      "epoch": 1.858356940509915,
      "grad_norm": 0.5457213358478963,
      "learning_rate": 8.72899936881359e-07,
      "loss": 0.676,
      "step": 82
    },
    {
      "epoch": 1.8810198300283285,
      "grad_norm": 0.5475114660934921,
      "learning_rate": 8.684520414032023e-07,
      "loss": 0.6462,
      "step": 83
    },
    {
      "epoch": 1.903682719546742,
      "grad_norm": 0.5780771596548755,
      "learning_rate": 8.639394051847471e-07,
      "loss": 0.629,
      "step": 84
    },
    {
      "epoch": 1.9263456090651558,
      "grad_norm": 0.5153044368837152,
      "learning_rate": 8.593628211416963e-07,
      "loss": 0.6607,
      "step": 85
    },
    {
      "epoch": 1.9490084985835694,
      "grad_norm": 0.5078347714748787,
      "learning_rate": 8.547230934260311e-07,
      "loss": 0.653,
      "step": 86
    },
    {
      "epoch": 1.9716713881019832,
      "grad_norm": 0.5090369208403657,
      "learning_rate": 8.500210372847126e-07,
      "loss": 0.6555,
      "step": 87
    },
    {
      "epoch": 1.9943342776203967,
      "grad_norm": 0.521639825746896,
      "learning_rate": 8.45257478916435e-07,
      "loss": 0.6187,
      "step": 88
    },
    {
      "epoch": 2.0169971671388103,
      "grad_norm": 1.4410319682327064,
      "learning_rate": 8.404332553264546e-07,
      "loss": 1.1825,
      "step": 89
    },
    {
      "epoch": 2.039660056657224,
      "grad_norm": 0.5362038209234066,
      "learning_rate": 8.355492141795184e-07,
      "loss": 0.6046,
      "step": 90
    },
    {
      "epoch": 2.0623229461756374,
      "grad_norm": 0.5295224430525873,
      "learning_rate": 8.306062136509219e-07,
      "loss": 0.607,
      "step": 91
    },
    {
      "epoch": 2.084985835694051,
      "grad_norm": 0.5438204610049183,
      "learning_rate": 8.256051222757187e-07,
      "loss": 0.6425,
      "step": 92
    },
    {
      "epoch": 2.1076487252124645,
      "grad_norm": 0.5637438849056178,
      "learning_rate": 8.2054681879611e-07,
      "loss": 0.6472,
      "step": 93
    },
    {
      "epoch": 2.130311614730878,
      "grad_norm": 0.4915626737833171,
      "learning_rate": 8.154321920070412e-07,
      "loss": 0.6366,
      "step": 94
    },
    {
      "epoch": 2.1529745042492916,
      "grad_norm": 0.5445000714826581,
      "learning_rate": 8.102621406000308e-07,
      "loss": 0.6302,
      "step": 95
    },
    {
      "epoch": 2.1756373937677056,
      "grad_norm": 0.544017574639994,
      "learning_rate": 8.050375730052621e-07,
      "loss": 0.6016,
      "step": 96
    },
    {
      "epoch": 2.198300283286119,
      "grad_norm": 0.7667138278664033,
      "learning_rate": 7.997594072319625e-07,
      "loss": 0.6476,
      "step": 97
    },
    {
      "epoch": 2.2209631728045327,
      "grad_norm": 0.5723261101431134,
      "learning_rate": 7.944285707070997e-07,
      "loss": 0.5982,
      "step": 98
    },
    {
      "epoch": 2.2436260623229463,
      "grad_norm": 0.5198427284810859,
      "learning_rate": 7.890460001124241e-07,
      "loss": 0.6373,
      "step": 99
    },
    {
      "epoch": 2.26628895184136,
      "grad_norm": 0.5082652201383684,
      "learning_rate": 7.83612641219884e-07,
      "loss": 0.5894,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 264,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 208143843852288.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}