{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 200.0, "global_step": 394, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005097164702134438, "grad_norm": 12.5, "learning_rate": 1.6666666666666667e-06, "loss": 0.7101277709007263, "step": 1, "token_acc": 0.8745318352059925 }, { "epoch": 0.02548582351067219, "grad_norm": 12.1875, "learning_rate": 8.333333333333334e-06, "loss": 0.8278377652168274, "step": 5, "token_acc": 0.8552833964573375 }, { "epoch": 0.05097164702134438, "grad_norm": 2.421875, "learning_rate": 1.6666666666666667e-05, "loss": 0.5761711120605468, "step": 10, "token_acc": 0.8653274252066853 }, { "epoch": 0.07645747053201657, "grad_norm": 1.7109375, "learning_rate": 1.9996956568698325e-05, "loss": 0.39079511165618896, "step": 15, "token_acc": 0.8887680715510904 }, { "epoch": 0.10194329404268876, "grad_norm": 1.8359375, "learning_rate": 1.9978364530054465e-05, "loss": 0.34750330448150635, "step": 20, "token_acc": 0.8949520766773162 }, { "epoch": 0.12742911755336095, "grad_norm": 1.734375, "learning_rate": 1.9942902642308737e-05, "loss": 0.26970508098602297, "step": 25, "token_acc": 0.9119090968203394 }, { "epoch": 0.15291494106403314, "grad_norm": 1.5859375, "learning_rate": 1.9890630858842614e-05, "loss": 0.31259732246398925, "step": 30, "token_acc": 0.8999557705203384 }, { "epoch": 0.17840076457470533, "grad_norm": 1.625, "learning_rate": 1.9821637552564973e-05, "loss": 0.2847127437591553, "step": 35, "token_acc": 0.9077454674132511 }, { "epoch": 0.20388658808537752, "grad_norm": 1.7734375, "learning_rate": 1.9736039366505087e-05, "loss": 0.26165449619293213, "step": 40, "token_acc": 0.913668239925095 }, { "epoch": 0.2293724115960497, "grad_norm": 1.421875, "learning_rate": 1.9633981016610926e-05, "loss": 0.24728527069091796, "step": 45, "token_acc": 0.9214681226749867 }, { "epoch": 0.2548582351067219, "grad_norm": 1.640625, "learning_rate": 1.951563504708622e-05, "loss": 0.21780962944030763, "step": 50, "token_acc": 0.9288334437567668 }, { "epoch": 0.2803440586173941, "grad_norm": 2.25, "learning_rate": 1.938120153867983e-05, "loss": 0.24094951152801514, "step": 55, "token_acc": 0.9214724125087046 }, { "epoch": 0.3058298821280663, "grad_norm": 2.15625, "learning_rate": 1.9230907770420737e-05, "loss": 0.2268754005432129, "step": 60, "token_acc": 0.9281276558333089 }, { "epoch": 0.33131570563873847, "grad_norm": 1.4375, "learning_rate": 1.9065007835370358e-05, "loss": 0.22875683307647704, "step": 65, "token_acc": 0.9230186030186031 }, { "epoch": 0.35680152914941066, "grad_norm": 1.5859375, "learning_rate": 1.888378221104201e-05, "loss": 0.239988374710083, "step": 70, "token_acc": 0.9226348493246594 }, { "epoch": 0.38228735266008285, "grad_norm": 1.8046875, "learning_rate": 1.8687537285213627e-05, "loss": 0.1711668848991394, "step": 75, "token_acc": 0.9440668926242624 }, { "epoch": 0.40777317617075504, "grad_norm": 1.6640625, "learning_rate": 1.8476604837935515e-05, "loss": 0.2378025770187378, "step": 80, "token_acc": 0.9246084180832901 }, { "epoch": 0.4332589996814272, "grad_norm": 1.7109375, "learning_rate": 1.8251341480608823e-05, "loss": 0.18637551069259645, "step": 85, "token_acc": 0.9384391968682915 }, { "epoch": 0.4587448231920994, "grad_norm": 1.8828125, "learning_rate": 1.8012128053083097e-05, "loss": 0.1798395037651062, "step": 90, "token_acc": 0.9428771275355561 }, { "epoch": 0.4842306467027716, "grad_norm": 1.890625, "learning_rate": 1.7759368979792145e-05, "loss": 0.19358837604522705, "step": 95, "token_acc": 0.9375457743950599 }, { "epoch": 0.5097164702134438, "grad_norm": 1.8515625, "learning_rate": 1.749349158601686e-05, "loss": 0.18399842977523803, "step": 100, "token_acc": 0.9422957113270134 }, { "epoch": 0.535202293724116, "grad_norm": 1.8203125, "learning_rate": 1.7214945375430816e-05, "loss": 0.17546424865722657, "step": 105, "token_acc": 0.9426513393021628 }, { "epoch": 0.5606881172347882, "grad_norm": 1.625, "learning_rate": 1.6924201270150194e-05, "loss": 0.15674128532409667, "step": 110, "token_acc": 0.948707565634069 }, { "epoch": 0.5861739407454604, "grad_norm": 2.375, "learning_rate": 1.6621750814572728e-05, "loss": 0.17290754318237306, "step": 115, "token_acc": 0.9423056713490034 }, { "epoch": 0.6116597642561326, "grad_norm": 1.8515625, "learning_rate": 1.6308105344351776e-05, "loss": 0.1858464241027832, "step": 120, "token_acc": 0.9447680047665152 }, { "epoch": 0.6371455877668047, "grad_norm": 1.8125, "learning_rate": 1.598379512191042e-05, "loss": 0.13930436372756957, "step": 125, "token_acc": 0.9549336664078587 }, { "epoch": 0.6626314112774769, "grad_norm": 1.7734375, "learning_rate": 1.5649368439957182e-05, "loss": 0.1962134599685669, "step": 130, "token_acc": 0.9364768586501332 }, { "epoch": 0.6881172347881491, "grad_norm": 2.0, "learning_rate": 1.5305390694518953e-05, "loss": 0.17399163246154786, "step": 135, "token_acc": 0.9454186101668116 }, { "epoch": 0.7136030582988213, "grad_norm": 1.6171875, "learning_rate": 1.4952443429058334e-05, "loss": 0.116390061378479, "step": 140, "token_acc": 0.9623854999025531 }, { "epoch": 0.7390888818094935, "grad_norm": 2.125, "learning_rate": 1.459112335129144e-05, "loss": 0.20297460556030272, "step": 145, "token_acc": 0.9371412765368923 }, { "epoch": 0.7645747053201657, "grad_norm": 1.9375, "learning_rate": 1.4222041324368347e-05, "loss": 0.14657198190689086, "step": 150, "token_acc": 0.9517101943288719 }, { "epoch": 0.7900605288308379, "grad_norm": 1.6875, "learning_rate": 1.3845821334121763e-05, "loss": 0.1491287589073181, "step": 155, "token_acc": 0.9500473678670425 }, { "epoch": 0.8155463523415101, "grad_norm": 1.8203125, "learning_rate": 1.346309943412995e-05, "loss": 0.10045711994171143, "step": 160, "token_acc": 0.9683162116626162 }, { "epoch": 0.8410321758521823, "grad_norm": 2.125, "learning_rate": 1.3074522670377392e-05, "loss": 0.12388412952423096, "step": 165, "token_acc": 0.9603866049550775 }, { "epoch": 0.8665179993628545, "grad_norm": 2.1875, "learning_rate": 1.2680747987331215e-05, "loss": 0.11760580539703369, "step": 170, "token_acc": 0.9641480008911587 }, { "epoch": 0.8920038228735266, "grad_norm": 2.265625, "learning_rate": 1.2282441117282831e-05, "loss": 0.09457792043685913, "step": 175, "token_acc": 0.9685485409116917 }, { "epoch": 0.9174896463841988, "grad_norm": 2.125, "learning_rate": 1.1880275454832493e-05, "loss": 0.1546500325202942, "step": 180, "token_acc": 0.9522128782763489 }, { "epoch": 0.942975469894871, "grad_norm": 1.8203125, "learning_rate": 1.147493091841965e-05, "loss": 0.17746270895004274, "step": 185, "token_acc": 0.9467413034786085 }, { "epoch": 0.9684612934055432, "grad_norm": 1.796875, "learning_rate": 1.1067092800823798e-05, "loss": 0.17491767406463624, "step": 190, "token_acc": 0.9474711941210667 }, { "epoch": 0.9939471169162154, "grad_norm": 2.609375, "learning_rate": 1.0657450610579225e-05, "loss": 0.10773472785949707, "step": 195, "token_acc": 0.9669074948340096 }, { "epoch": 1.0152914941064033, "grad_norm": 1.7890625, "learning_rate": 1.0246696906262484e-05, "loss": 0.1128343939781189, "step": 200, "token_acc": 0.9617505882352941 }, { "epoch": 1.0407773176170756, "grad_norm": 1.796875, "learning_rate": 9.835526125623262e-06, "loss": 0.08277679681777954, "step": 205, "token_acc": 0.9738211210471551 }, { "epoch": 1.0662631411277477, "grad_norm": 2.203125, "learning_rate": 9.424633411538289e-06, "loss": 0.08777509927749634, "step": 210, "token_acc": 0.9702302847395355 }, { "epoch": 1.09174896463842, "grad_norm": 2.078125, "learning_rate": 9.014713436773114e-06, "loss": 0.1303470253944397, "step": 215, "token_acc": 0.958813470139437 }, { "epoch": 1.117234788149092, "grad_norm": 2.234375, "learning_rate": 8.606459229538645e-06, "loss": 0.12297415733337402, "step": 220, "token_acc": 0.9640833060584658 }, { "epoch": 1.1427206116597644, "grad_norm": 1.6328125, "learning_rate": 8.200561001828093e-06, "loss": 0.11149228811264038, "step": 225, "token_acc": 0.9653502592232877 }, { "epoch": 1.1682064351704364, "grad_norm": 2.484375, "learning_rate": 7.797704982515094e-06, "loss": 0.13481587171554565, "step": 230, "token_acc": 0.954266376601402 }, { "epoch": 1.1936922586811085, "grad_norm": 1.4453125, "learning_rate": 7.398572257185879e-06, "loss": 0.08835281133651733, "step": 235, "token_acc": 0.9707970452128424 }, { "epoch": 1.2191780821917808, "grad_norm": 1.8046875, "learning_rate": 7.003837616666906e-06, "loss": 0.10161161422729492, "step": 240, "token_acc": 0.9656063115927788 }, { "epoch": 1.244663905702453, "grad_norm": 2.34375, "learning_rate": 6.614168416194674e-06, "loss": 0.07140190601348877, "step": 245, "token_acc": 0.9799972242462094 }, { "epoch": 1.2701497292131252, "grad_norm": 2.953125, "learning_rate": 6.230223447156469e-06, "loss": 0.11119414567947387, "step": 250, "token_acc": 0.9668460399436599 }, { "epoch": 1.2956355527237973, "grad_norm": 2.1875, "learning_rate": 5.852651823309521e-06, "loss": 0.1017767071723938, "step": 255, "token_acc": 0.9658570417142279 }, { "epoch": 1.3211213762344696, "grad_norm": 1.7421875, "learning_rate": 5.482091883361571e-06, "loss": 0.12420098781585694, "step": 260, "token_acc": 0.9606861625830626 }, { "epoch": 1.3466071997451419, "grad_norm": 2.953125, "learning_rate": 5.1191701117681815e-06, "loss": 0.08732576370239258, "step": 265, "token_acc": 0.9742369029426746 }, { "epoch": 1.372093023255814, "grad_norm": 3.0625, "learning_rate": 4.764500079571403e-06, "loss": 0.08145667314529419, "step": 270, "token_acc": 0.9741474896414886 }, { "epoch": 1.397578846766486, "grad_norm": 1.65625, "learning_rate": 4.418681407070339e-06, "loss": 0.07903674840927125, "step": 275, "token_acc": 0.9746860907612613 }, { "epoch": 1.4230646702771583, "grad_norm": 2.34375, "learning_rate": 4.082298750077485e-06, "loss": 0.10514881610870361, "step": 280, "token_acc": 0.9671857823378527 }, { "epoch": 1.4485504937878306, "grad_norm": 2.296875, "learning_rate": 3.755920811474647e-06, "loss": 0.06307402849197388, "step": 285, "token_acc": 0.978928398142372 }, { "epoch": 1.4740363172985027, "grad_norm": 2.671875, "learning_rate": 3.4400993797395664e-06, "loss": 0.10954618453979492, "step": 290, "token_acc": 0.9647058823529412 }, { "epoch": 1.4995221408091748, "grad_norm": 1.359375, "learning_rate": 3.135368396068771e-06, "loss": 0.07190140485763549, "step": 295, "token_acc": 0.975735236452043 }, { "epoch": 1.525007964319847, "grad_norm": 1.640625, "learning_rate": 2.8422430516737733e-06, "loss": 0.08197641372680664, "step": 300, "token_acc": 0.9747953097358987 }, { "epoch": 1.5504937878305194, "grad_norm": 1.421875, "learning_rate": 2.561218916776823e-06, "loss": 0.0815430760383606, "step": 305, "token_acc": 0.9747565568486412 }, { "epoch": 1.5759796113411915, "grad_norm": 2.125, "learning_rate": 2.292771102778739e-06, "loss": 0.09041160345077515, "step": 310, "token_acc": 0.9739717810291317 }, { "epoch": 1.6014654348518635, "grad_norm": 2.515625, "learning_rate": 2.037353459015272e-06, "loss": 0.07603458166122437, "step": 315, "token_acc": 0.9746314985454959 }, { "epoch": 1.6269512583625358, "grad_norm": 1.6875, "learning_rate": 1.795397805460053e-06, "loss": 0.09144850373268128, "step": 320, "token_acc": 0.9713049218807306 }, { "epoch": 1.6524370818732081, "grad_norm": 1.84375, "learning_rate": 1.5673132026713046e-06, "loss": 0.08237828016281128, "step": 325, "token_acc": 0.9731688670646006 }, { "epoch": 1.6779229053838802, "grad_norm": 1.9140625, "learning_rate": 1.353485260216596e-06, "loss": 0.07584392428398132, "step": 330, "token_acc": 0.9767055819044145 }, { "epoch": 1.7034087288945523, "grad_norm": 2.328125, "learning_rate": 1.1542754847448544e-06, "loss": 0.05938386917114258, "step": 335, "token_acc": 0.9827799662352279 }, { "epoch": 1.7288945524052246, "grad_norm": 1.6796875, "learning_rate": 9.700206688077707e-07, "loss": 0.08313992023468017, "step": 340, "token_acc": 0.9746641292924703 }, { "epoch": 1.754380375915897, "grad_norm": 1.3671875, "learning_rate": 8.010323214639492e-07, "loss": 0.10048294067382812, "step": 345, "token_acc": 0.9692962175159531 }, { "epoch": 1.779866199426569, "grad_norm": 2.265625, "learning_rate": 6.475961416283838e-07, "loss": 0.07593317031860351, "step": 350, "token_acc": 0.9778483525208416 }, { "epoch": 1.805352022937241, "grad_norm": 4.4375, "learning_rate": 5.099715350576817e-07, "loss": 0.09315924048423767, "step": 355, "token_acc": 0.9696224141393854 }, { "epoch": 1.8308378464479134, "grad_norm": 2.0625, "learning_rate": 3.883911757876058e-07, "loss": 0.08427355885505676, "step": 360, "token_acc": 0.9726720482046076 }, { "epoch": 1.8563236699585857, "grad_norm": 2.453125, "learning_rate": 2.8306061276442753e-07, "loss": 0.09181307554244995, "step": 365, "token_acc": 0.97152072887367 }, { "epoch": 1.8818094934692577, "grad_norm": 1.546875, "learning_rate": 1.941579223350898e-07, "loss": 0.08087407350540161, "step": 370, "token_acc": 0.9741133571604277 }, { "epoch": 1.9072953169799298, "grad_norm": 1.9609375, "learning_rate": 1.218334071837468e-07, "loss": 0.10078444480895996, "step": 375, "token_acc": 0.9704466920176089 }, { "epoch": 1.9327811404906021, "grad_norm": 1.9609375, "learning_rate": 6.62093422236132e-08, "loss": 0.09156302809715271, "step": 380, "token_acc": 0.9694673981909602 }, { "epoch": 1.9582669640012744, "grad_norm": 1.71875, "learning_rate": 2.73797678737886e-08, "loss": 0.07732056379318238, "step": 385, "token_acc": 0.9762223117196939 }, { "epoch": 1.9837527875119465, "grad_norm": 1.953125, "learning_rate": 5.410331070498931e-09, "loss": 0.07045769095420837, "step": 390, "token_acc": 0.9786481253300475 } ], "logging_steps": 5, "max_steps": 394, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8765180828971213e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }