{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.26628895184136, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0226628895184136, "grad_norm": 5.715946698612309, "learning_rate": 3.7037037037037036e-08, "loss": 1.039, "step": 1 }, { "epoch": 0.0453257790368272, "grad_norm": 5.918098634610158, "learning_rate": 7.407407407407407e-08, "loss": 1.0345, "step": 2 }, { "epoch": 0.0679886685552408, "grad_norm": 5.967358491879423, "learning_rate": 1.111111111111111e-07, "loss": 1.0568, "step": 3 }, { "epoch": 0.0906515580736544, "grad_norm": 6.076151471056227, "learning_rate": 1.4814814814814815e-07, "loss": 1.0407, "step": 4 }, { "epoch": 0.11331444759206799, "grad_norm": 5.698276915195162, "learning_rate": 1.8518518518518516e-07, "loss": 1.0355, "step": 5 }, { "epoch": 0.1359773371104816, "grad_norm": 5.524873495595531, "learning_rate": 2.222222222222222e-07, "loss": 1.0329, "step": 6 }, { "epoch": 0.15864022662889518, "grad_norm": 5.663139068043792, "learning_rate": 2.5925925925925923e-07, "loss": 1.013, "step": 7 }, { "epoch": 0.1813031161473088, "grad_norm": 5.483842003291619, "learning_rate": 2.962962962962963e-07, "loss": 1.0285, "step": 8 }, { "epoch": 0.20396600566572237, "grad_norm": 5.501921058157795, "learning_rate": 3.333333333333333e-07, "loss": 1.0181, "step": 9 }, { "epoch": 0.22662889518413598, "grad_norm": 5.691661611678567, "learning_rate": 3.703703703703703e-07, "loss": 1.0046, "step": 10 }, { "epoch": 0.24929178470254956, "grad_norm": 5.490524973688248, "learning_rate": 4.0740740740740737e-07, "loss": 1.0291, "step": 11 }, { "epoch": 0.2719546742209632, "grad_norm": 4.885236117260528, "learning_rate": 4.444444444444444e-07, "loss": 1.0084, "step": 12 }, { "epoch": 0.29461756373937675, "grad_norm": 5.256688897749667, "learning_rate": 4.814814814814814e-07, "loss": 0.9945, "step": 13 }, { "epoch": 0.31728045325779036, "grad_norm": 5.026023661790397, "learning_rate": 5.185185185185185e-07, "loss": 0.9936, "step": 14 }, { "epoch": 0.33994334277620397, "grad_norm": 4.979666180740075, "learning_rate": 5.555555555555555e-07, "loss": 0.9997, "step": 15 }, { "epoch": 0.3626062322946176, "grad_norm": 4.741351636847691, "learning_rate": 5.925925925925926e-07, "loss": 0.9904, "step": 16 }, { "epoch": 0.38526912181303113, "grad_norm": 4.429638197959212, "learning_rate": 6.296296296296296e-07, "loss": 0.9779, "step": 17 }, { "epoch": 0.40793201133144474, "grad_norm": 4.2702651723674006, "learning_rate": 6.666666666666666e-07, "loss": 0.9373, "step": 18 }, { "epoch": 0.43059490084985835, "grad_norm": 4.371215055008036, "learning_rate": 7.037037037037037e-07, "loss": 0.9616, "step": 19 }, { "epoch": 0.45325779036827196, "grad_norm": 4.300078040900759, "learning_rate": 7.407407407407406e-07, "loss": 0.9581, "step": 20 }, { "epoch": 0.47592067988668557, "grad_norm": 4.242855799180736, "learning_rate": 7.777777777777778e-07, "loss": 0.9454, "step": 21 }, { "epoch": 0.4985835694050991, "grad_norm": 3.4536592234259555, "learning_rate": 8.148148148148147e-07, "loss": 0.9274, "step": 22 }, { "epoch": 0.5212464589235127, "grad_norm": 3.3525795982748203, "learning_rate": 8.518518518518518e-07, "loss": 0.8833, "step": 23 }, { "epoch": 0.5439093484419264, "grad_norm": 3.110575381958802, "learning_rate": 8.888888888888888e-07, "loss": 0.9066, "step": 24 }, { "epoch": 0.56657223796034, "grad_norm": 3.18785930927135, "learning_rate": 9.259259259259259e-07, "loss": 0.8896, "step": 25 }, { "epoch": 0.5892351274787535, "grad_norm": 3.0188412291205684, "learning_rate": 9.629629629629628e-07, "loss": 0.9068, "step": 26 }, { "epoch": 0.6118980169971672, "grad_norm": 3.0072699515749344, "learning_rate": 1e-06, "loss": 0.8959, "step": 27 }, { "epoch": 0.6345609065155807, "grad_norm": 3.050779999599616, "learning_rate": 9.999560724782173e-07, "loss": 0.8648, "step": 28 }, { "epoch": 0.6572237960339944, "grad_norm": 3.034749793056673, "learning_rate": 9.998242976313776e-07, "loss": 0.8763, "step": 29 }, { "epoch": 0.6798866855524079, "grad_norm": 2.6230160618361897, "learning_rate": 9.996046986136508e-07, "loss": 0.8439, "step": 30 }, { "epoch": 0.7025495750708215, "grad_norm": 2.619746810094255, "learning_rate": 9.992973140107996e-07, "loss": 0.8395, "step": 31 }, { "epoch": 0.7252124645892352, "grad_norm": 2.2660982887250496, "learning_rate": 9.989021978333994e-07, "loss": 0.8407, "step": 32 }, { "epoch": 0.7478753541076487, "grad_norm": 1.92948640709938, "learning_rate": 9.984194195073478e-07, "loss": 0.8175, "step": 33 }, { "epoch": 0.7705382436260623, "grad_norm": 1.8673042037436878, "learning_rate": 9.97849063861667e-07, "loss": 0.7963, "step": 34 }, { "epoch": 0.7932011331444759, "grad_norm": 1.841378707582655, "learning_rate": 9.971912311135967e-07, "loss": 0.8177, "step": 35 }, { "epoch": 0.8158640226628895, "grad_norm": 1.6212101538356403, "learning_rate": 9.964460368509865e-07, "loss": 0.8036, "step": 36 }, { "epoch": 0.8385269121813032, "grad_norm": 1.6148282593388759, "learning_rate": 9.956136120119856e-07, "loss": 0.7945, "step": 37 }, { "epoch": 0.8611898016997167, "grad_norm": 1.5660870386151309, "learning_rate": 9.946941028620347e-07, "loss": 0.7919, "step": 38 }, { "epoch": 0.8838526912181303, "grad_norm": 1.5162976532167538, "learning_rate": 9.936876709681666e-07, "loss": 0.7965, "step": 39 }, { "epoch": 0.9065155807365439, "grad_norm": 1.4779616090178773, "learning_rate": 9.92594493170617e-07, "loss": 0.7872, "step": 40 }, { "epoch": 0.9291784702549575, "grad_norm": 1.4588545367417372, "learning_rate": 9.914147615517526e-07, "loss": 0.7933, "step": 41 }, { "epoch": 0.9518413597733711, "grad_norm": 1.2450088034935203, "learning_rate": 9.901486834023181e-07, "loss": 0.7401, "step": 42 }, { "epoch": 0.9745042492917847, "grad_norm": 1.1159548060929454, "learning_rate": 9.887964811850157e-07, "loss": 0.7496, "step": 43 }, { "epoch": 0.9971671388101983, "grad_norm": 1.0418410473138606, "learning_rate": 9.87358392495415e-07, "loss": 0.7568, "step": 44 }, { "epoch": 1.019830028328612, "grad_norm": 2.1594760368768195, "learning_rate": 9.858346700202048e-07, "loss": 1.3469, "step": 45 }, { "epoch": 1.0424929178470255, "grad_norm": 0.9706954495224399, "learning_rate": 9.842255814927944e-07, "loss": 0.7412, "step": 46 }, { "epoch": 1.065155807365439, "grad_norm": 0.9479843401943371, "learning_rate": 9.825314096462684e-07, "loss": 0.712, "step": 47 }, { "epoch": 1.0878186968838528, "grad_norm": 0.8785518016295425, "learning_rate": 9.807524521637102e-07, "loss": 0.721, "step": 48 }, { "epoch": 1.1104815864022664, "grad_norm": 0.9083971698155864, "learning_rate": 9.788890216258938e-07, "loss": 0.7405, "step": 49 }, { "epoch": 1.13314447592068, "grad_norm": 0.9052818651846114, "learning_rate": 9.769414454563615e-07, "loss": 0.7223, "step": 50 }, { "epoch": 1.1558073654390935, "grad_norm": 0.8244297426454674, "learning_rate": 9.749100658638914e-07, "loss": 0.7113, "step": 51 }, { "epoch": 1.178470254957507, "grad_norm": 0.7448472800310213, "learning_rate": 9.72795239782369e-07, "loss": 0.7001, "step": 52 }, { "epoch": 1.2011331444759206, "grad_norm": 0.8936397991398377, "learning_rate": 9.705973388080692e-07, "loss": 0.6924, "step": 53 }, { "epoch": 1.2237960339943343, "grad_norm": 0.7188466048624885, "learning_rate": 9.68316749134364e-07, "loss": 0.7005, "step": 54 }, { "epoch": 1.246458923512748, "grad_norm": 0.6923178573722074, "learning_rate": 9.659538714838633e-07, "loss": 0.6983, "step": 55 }, { "epoch": 1.2691218130311614, "grad_norm": 0.6963394168232236, "learning_rate": 9.63509121038005e-07, "loss": 0.6932, "step": 56 }, { "epoch": 1.291784702549575, "grad_norm": 0.6743675615821408, "learning_rate": 9.609829273641032e-07, "loss": 0.6789, "step": 57 }, { "epoch": 1.3144475920679888, "grad_norm": 0.6786035246894967, "learning_rate": 9.583757343398684e-07, "loss": 0.6628, "step": 58 }, { "epoch": 1.3371104815864023, "grad_norm": 0.7270460673039131, "learning_rate": 9.55688000075414e-07, "loss": 0.6831, "step": 59 }, { "epoch": 1.3597733711048159, "grad_norm": 0.6841455902480504, "learning_rate": 9.529201968327616e-07, "loss": 0.6951, "step": 60 }, { "epoch": 1.3824362606232294, "grad_norm": 0.6153616879449294, "learning_rate": 9.500728109428603e-07, "loss": 0.676, "step": 61 }, { "epoch": 1.405099150141643, "grad_norm": 0.6177487537567523, "learning_rate": 9.47146342720133e-07, "loss": 0.6842, "step": 62 }, { "epoch": 1.4277620396600565, "grad_norm": 0.5753559089127149, "learning_rate": 9.441413063745659e-07, "loss": 0.6408, "step": 63 }, { "epoch": 1.4504249291784703, "grad_norm": 0.620464077741966, "learning_rate": 9.410582299213572e-07, "loss": 0.6952, "step": 64 }, { "epoch": 1.4730878186968839, "grad_norm": 0.587732312757755, "learning_rate": 9.378976550881392e-07, "loss": 0.6897, "step": 65 }, { "epoch": 1.4957507082152974, "grad_norm": 0.6133303288545134, "learning_rate": 9.346601372197913e-07, "loss": 0.6319, "step": 66 }, { "epoch": 1.5184135977337112, "grad_norm": 0.5975684805854956, "learning_rate": 9.313462451808599e-07, "loss": 0.7085, "step": 67 }, { "epoch": 1.5410764872521248, "grad_norm": 0.5691716789827311, "learning_rate": 9.279565612556042e-07, "loss": 0.6799, "step": 68 }, { "epoch": 1.5637393767705383, "grad_norm": 0.5623581760482004, "learning_rate": 9.24491681045682e-07, "loss": 0.6627, "step": 69 }, { "epoch": 1.5864022662889519, "grad_norm": 0.5545018113642449, "learning_rate": 9.209522133654968e-07, "loss": 0.6673, "step": 70 }, { "epoch": 1.6090651558073654, "grad_norm": 0.6223379664208608, "learning_rate": 9.17338780135223e-07, "loss": 0.6682, "step": 71 }, { "epoch": 1.631728045325779, "grad_norm": 0.5484348938274137, "learning_rate": 9.136520162715286e-07, "loss": 0.6459, "step": 72 }, { "epoch": 1.6543909348441925, "grad_norm": 0.598633459691356, "learning_rate": 9.098925695760131e-07, "loss": 0.6663, "step": 73 }, { "epoch": 1.677053824362606, "grad_norm": 0.6063642708751795, "learning_rate": 9.060611006213832e-07, "loss": 0.6471, "step": 74 }, { "epoch": 1.6997167138810199, "grad_norm": 0.5310843433827631, "learning_rate": 9.021582826353824e-07, "loss": 0.6422, "step": 75 }, { "epoch": 1.7223796033994334, "grad_norm": 0.5899701772442509, "learning_rate": 8.981848013824993e-07, "loss": 0.6616, "step": 76 }, { "epoch": 1.7450424929178472, "grad_norm": 0.6774981304086599, "learning_rate": 8.94141355043471e-07, "loss": 0.6442, "step": 77 }, { "epoch": 1.7677053824362607, "grad_norm": 0.5555862881849043, "learning_rate": 8.90028654092606e-07, "loss": 0.6427, "step": 78 }, { "epoch": 1.7903682719546743, "grad_norm": 0.5521769324318557, "learning_rate": 8.858474211729469e-07, "loss": 0.6308, "step": 79 }, { "epoch": 1.8130311614730878, "grad_norm": 0.5094008328024741, "learning_rate": 8.815983909692941e-07, "loss": 0.6375, "step": 80 }, { "epoch": 1.8356940509915014, "grad_norm": 0.47949684902186096, "learning_rate": 8.77282310079115e-07, "loss": 0.6124, "step": 81 }, { "epoch": 1.858356940509915, "grad_norm": 0.5457213358478963, "learning_rate": 8.72899936881359e-07, "loss": 0.676, "step": 82 }, { "epoch": 1.8810198300283285, "grad_norm": 0.5475114660934921, "learning_rate": 8.684520414032023e-07, "loss": 0.6462, "step": 83 }, { "epoch": 1.903682719546742, "grad_norm": 0.5780771596548755, "learning_rate": 8.639394051847471e-07, "loss": 0.629, "step": 84 }, { "epoch": 1.9263456090651558, "grad_norm": 0.5153044368837152, "learning_rate": 8.593628211416963e-07, "loss": 0.6607, "step": 85 }, { "epoch": 1.9490084985835694, "grad_norm": 0.5078347714748787, "learning_rate": 8.547230934260311e-07, "loss": 0.653, "step": 86 }, { "epoch": 1.9716713881019832, "grad_norm": 0.5090369208403657, "learning_rate": 8.500210372847126e-07, "loss": 0.6555, "step": 87 }, { "epoch": 1.9943342776203967, "grad_norm": 0.521639825746896, "learning_rate": 8.45257478916435e-07, "loss": 0.6187, "step": 88 }, { "epoch": 2.0169971671388103, "grad_norm": 1.4410319682327064, "learning_rate": 8.404332553264546e-07, "loss": 1.1825, "step": 89 }, { "epoch": 2.039660056657224, "grad_norm": 0.5362038209234066, "learning_rate": 8.355492141795184e-07, "loss": 0.6046, "step": 90 }, { "epoch": 2.0623229461756374, "grad_norm": 0.5295224430525873, "learning_rate": 8.306062136509219e-07, "loss": 0.607, "step": 91 }, { "epoch": 2.084985835694051, "grad_norm": 0.5438204610049183, "learning_rate": 8.256051222757187e-07, "loss": 0.6425, "step": 92 }, { "epoch": 2.1076487252124645, "grad_norm": 0.5637438849056178, "learning_rate": 8.2054681879611e-07, "loss": 0.6472, "step": 93 }, { "epoch": 2.130311614730878, "grad_norm": 0.4915626737833171, "learning_rate": 8.154321920070412e-07, "loss": 0.6366, "step": 94 }, { "epoch": 2.1529745042492916, "grad_norm": 0.5445000714826581, "learning_rate": 8.102621406000308e-07, "loss": 0.6302, "step": 95 }, { "epoch": 2.1756373937677056, "grad_norm": 0.544017574639994, "learning_rate": 8.050375730052621e-07, "loss": 0.6016, "step": 96 }, { "epoch": 2.198300283286119, "grad_norm": 0.7667138278664033, "learning_rate": 7.997594072319625e-07, "loss": 0.6476, "step": 97 }, { "epoch": 2.2209631728045327, "grad_norm": 0.5723261101431134, "learning_rate": 7.944285707070997e-07, "loss": 0.5982, "step": 98 }, { "epoch": 2.2436260623229463, "grad_norm": 0.5198427284810859, "learning_rate": 7.890460001124241e-07, "loss": 0.6373, "step": 99 }, { "epoch": 2.26628895184136, "grad_norm": 0.5082652201383684, "learning_rate": 7.83612641219884e-07, "loss": 0.5894, "step": 100 } ], "logging_steps": 1, "max_steps": 264, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 208143843852288.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }