{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 200.0,
  "global_step": 394,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005097164702134438,
      "grad_norm": 12.5,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.7101277709007263,
      "step": 1,
      "token_acc": 0.8745318352059925
    },
    {
      "epoch": 0.02548582351067219,
      "grad_norm": 12.1875,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.8278377652168274,
      "step": 5,
      "token_acc": 0.8552833964573375
    },
    {
      "epoch": 0.05097164702134438,
      "grad_norm": 2.421875,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 0.5761711120605468,
      "step": 10,
      "token_acc": 0.8653274252066853
    },
    {
      "epoch": 0.07645747053201657,
      "grad_norm": 1.7109375,
      "learning_rate": 1.9996956568698325e-05,
      "loss": 0.39079511165618896,
      "step": 15,
      "token_acc": 0.8887680715510904
    },
    {
      "epoch": 0.10194329404268876,
      "grad_norm": 1.8359375,
      "learning_rate": 1.9978364530054465e-05,
      "loss": 0.34750330448150635,
      "step": 20,
      "token_acc": 0.8949520766773162
    },
    {
      "epoch": 0.12742911755336095,
      "grad_norm": 1.734375,
      "learning_rate": 1.9942902642308737e-05,
      "loss": 0.26970508098602297,
      "step": 25,
      "token_acc": 0.9119090968203394
    },
    {
      "epoch": 0.15291494106403314,
      "grad_norm": 1.5859375,
      "learning_rate": 1.9890630858842614e-05,
      "loss": 0.31259732246398925,
      "step": 30,
      "token_acc": 0.8999557705203384
    },
    {
      "epoch": 0.17840076457470533,
      "grad_norm": 1.625,
      "learning_rate": 1.9821637552564973e-05,
      "loss": 0.2847127437591553,
      "step": 35,
      "token_acc": 0.9077454674132511
    },
    {
      "epoch": 0.20388658808537752,
      "grad_norm": 1.7734375,
      "learning_rate": 1.9736039366505087e-05,
      "loss": 0.26165449619293213,
      "step": 40,
      "token_acc": 0.913668239925095
    },
    {
      "epoch": 0.2293724115960497,
      "grad_norm": 1.421875,
      "learning_rate": 1.9633981016610926e-05,
      "loss": 0.24728527069091796,
      "step": 45,
      "token_acc": 0.9214681226749867
    },
    {
      "epoch": 0.2548582351067219,
      "grad_norm": 1.640625,
      "learning_rate": 1.951563504708622e-05,
      "loss": 0.21780962944030763,
      "step": 50,
      "token_acc": 0.9288334437567668
    },
    {
      "epoch": 0.2803440586173941,
      "grad_norm": 2.25,
      "learning_rate": 1.938120153867983e-05,
      "loss": 0.24094951152801514,
      "step": 55,
      "token_acc": 0.9214724125087046
    },
    {
      "epoch": 0.3058298821280663,
      "grad_norm": 2.15625,
      "learning_rate": 1.9230907770420737e-05,
      "loss": 0.2268754005432129,
      "step": 60,
      "token_acc": 0.9281276558333089
    },
    {
      "epoch": 0.33131570563873847,
      "grad_norm": 1.4375,
      "learning_rate": 1.9065007835370358e-05,
      "loss": 0.22875683307647704,
      "step": 65,
      "token_acc": 0.9230186030186031
    },
    {
      "epoch": 0.35680152914941066,
      "grad_norm": 1.5859375,
      "learning_rate": 1.888378221104201e-05,
      "loss": 0.239988374710083,
      "step": 70,
      "token_acc": 0.9226348493246594
    },
    {
      "epoch": 0.38228735266008285,
      "grad_norm": 1.8046875,
      "learning_rate": 1.8687537285213627e-05,
      "loss": 0.1711668848991394,
      "step": 75,
      "token_acc": 0.9440668926242624
    },
    {
      "epoch": 0.40777317617075504,
      "grad_norm": 1.6640625,
      "learning_rate": 1.8476604837935515e-05,
      "loss": 0.2378025770187378,
      "step": 80,
      "token_acc": 0.9246084180832901
    },
    {
      "epoch": 0.4332589996814272,
      "grad_norm": 1.7109375,
      "learning_rate": 1.8251341480608823e-05,
      "loss": 0.18637551069259645,
      "step": 85,
      "token_acc": 0.9384391968682915
    },
    {
      "epoch": 0.4587448231920994,
      "grad_norm": 1.8828125,
      "learning_rate": 1.8012128053083097e-05,
      "loss": 0.1798395037651062,
      "step": 90,
      "token_acc": 0.9428771275355561
    },
    {
      "epoch": 0.4842306467027716,
      "grad_norm": 1.890625,
      "learning_rate": 1.7759368979792145e-05,
      "loss": 0.19358837604522705,
      "step": 95,
      "token_acc": 0.9375457743950599
    },
    {
      "epoch": 0.5097164702134438,
      "grad_norm": 1.8515625,
      "learning_rate": 1.749349158601686e-05,
      "loss": 0.18399842977523803,
      "step": 100,
      "token_acc": 0.9422957113270134
    },
    {
      "epoch": 0.535202293724116,
      "grad_norm": 1.8203125,
      "learning_rate": 1.7214945375430816e-05,
      "loss": 0.17546424865722657,
      "step": 105,
      "token_acc": 0.9426513393021628
    },
    {
      "epoch": 0.5606881172347882,
      "grad_norm": 1.625,
      "learning_rate": 1.6924201270150194e-05,
      "loss": 0.15674128532409667,
      "step": 110,
      "token_acc": 0.948707565634069
    },
    {
      "epoch": 0.5861739407454604,
      "grad_norm": 2.375,
      "learning_rate": 1.6621750814572728e-05,
      "loss": 0.17290754318237306,
      "step": 115,
      "token_acc": 0.9423056713490034
    },
    {
      "epoch": 0.6116597642561326,
      "grad_norm": 1.8515625,
      "learning_rate": 1.6308105344351776e-05,
      "loss": 0.1858464241027832,
      "step": 120,
      "token_acc": 0.9447680047665152
    },
    {
      "epoch": 0.6371455877668047,
      "grad_norm": 1.8125,
      "learning_rate": 1.598379512191042e-05,
      "loss": 0.13930436372756957,
      "step": 125,
      "token_acc": 0.9549336664078587
    },
    {
      "epoch": 0.6626314112774769,
      "grad_norm": 1.7734375,
      "learning_rate": 1.5649368439957182e-05,
      "loss": 0.1962134599685669,
      "step": 130,
      "token_acc": 0.9364768586501332
    },
    {
      "epoch": 0.6881172347881491,
      "grad_norm": 2.0,
      "learning_rate": 1.5305390694518953e-05,
      "loss": 0.17399163246154786,
      "step": 135,
      "token_acc": 0.9454186101668116
    },
    {
      "epoch": 0.7136030582988213,
      "grad_norm": 1.6171875,
      "learning_rate": 1.4952443429058334e-05,
      "loss": 0.116390061378479,
      "step": 140,
      "token_acc": 0.9623854999025531
    },
    {
      "epoch": 0.7390888818094935,
      "grad_norm": 2.125,
      "learning_rate": 1.459112335129144e-05,
      "loss": 0.20297460556030272,
      "step": 145,
      "token_acc": 0.9371412765368923
    },
    {
      "epoch": 0.7645747053201657,
      "grad_norm": 1.9375,
      "learning_rate": 1.4222041324368347e-05,
      "loss": 0.14657198190689086,
      "step": 150,
      "token_acc": 0.9517101943288719
    },
    {
      "epoch": 0.7900605288308379,
      "grad_norm": 1.6875,
      "learning_rate": 1.3845821334121763e-05,
      "loss": 0.1491287589073181,
      "step": 155,
      "token_acc": 0.9500473678670425
    },
    {
      "epoch": 0.8155463523415101,
      "grad_norm": 1.8203125,
      "learning_rate": 1.346309943412995e-05,
      "loss": 0.10045711994171143,
      "step": 160,
      "token_acc": 0.9683162116626162
    },
    {
      "epoch": 0.8410321758521823,
      "grad_norm": 2.125,
      "learning_rate": 1.3074522670377392e-05,
      "loss": 0.12388412952423096,
      "step": 165,
      "token_acc": 0.9603866049550775
    },
    {
      "epoch": 0.8665179993628545,
      "grad_norm": 2.1875,
      "learning_rate": 1.2680747987331215e-05,
      "loss": 0.11760580539703369,
      "step": 170,
      "token_acc": 0.9641480008911587
    },
    {
      "epoch": 0.8920038228735266,
      "grad_norm": 2.265625,
      "learning_rate": 1.2282441117282831e-05,
      "loss": 0.09457792043685913,
      "step": 175,
      "token_acc": 0.9685485409116917
    },
    {
      "epoch": 0.9174896463841988,
      "grad_norm": 2.125,
      "learning_rate": 1.1880275454832493e-05,
      "loss": 0.1546500325202942,
      "step": 180,
      "token_acc": 0.9522128782763489
    },
    {
      "epoch": 0.942975469894871,
      "grad_norm": 1.8203125,
      "learning_rate": 1.147493091841965e-05,
      "loss": 0.17746270895004274,
      "step": 185,
      "token_acc": 0.9467413034786085
    },
    {
      "epoch": 0.9684612934055432,
      "grad_norm": 1.796875,
      "learning_rate": 1.1067092800823798e-05,
      "loss": 0.17491767406463624,
      "step": 190,
      "token_acc": 0.9474711941210667
    },
    {
      "epoch": 0.9939471169162154,
      "grad_norm": 2.609375,
      "learning_rate": 1.0657450610579225e-05,
      "loss": 0.10773472785949707,
      "step": 195,
      "token_acc": 0.9669074948340096
    },
    {
      "epoch": 1.0152914941064033,
      "grad_norm": 1.7890625,
      "learning_rate": 1.0246696906262484e-05,
      "loss": 0.1128343939781189,
      "step": 200,
      "token_acc": 0.9617505882352941
    },
    {
      "epoch": 1.0407773176170756,
      "grad_norm": 1.796875,
      "learning_rate": 9.835526125623262e-06,
      "loss": 0.08277679681777954,
      "step": 205,
      "token_acc": 0.9738211210471551
    },
    {
      "epoch": 1.0662631411277477,
      "grad_norm": 2.203125,
      "learning_rate": 9.424633411538289e-06,
      "loss": 0.08777509927749634,
      "step": 210,
      "token_acc": 0.9702302847395355
    },
    {
      "epoch": 1.09174896463842,
      "grad_norm": 2.078125,
      "learning_rate": 9.014713436773114e-06,
      "loss": 0.1303470253944397,
      "step": 215,
      "token_acc": 0.958813470139437
    },
    {
      "epoch": 1.117234788149092,
      "grad_norm": 2.234375,
      "learning_rate": 8.606459229538645e-06,
      "loss": 0.12297415733337402,
      "step": 220,
      "token_acc": 0.9640833060584658
    },
    {
      "epoch": 1.1427206116597644,
      "grad_norm": 1.6328125,
      "learning_rate": 8.200561001828093e-06,
      "loss": 0.11149228811264038,
      "step": 225,
      "token_acc": 0.9653502592232877
    },
    {
      "epoch": 1.1682064351704364,
      "grad_norm": 2.484375,
      "learning_rate": 7.797704982515094e-06,
      "loss": 0.13481587171554565,
      "step": 230,
      "token_acc": 0.954266376601402
    },
    {
      "epoch": 1.1936922586811085,
      "grad_norm": 1.4453125,
      "learning_rate": 7.398572257185879e-06,
      "loss": 0.08835281133651733,
      "step": 235,
      "token_acc": 0.9707970452128424
    },
    {
      "epoch": 1.2191780821917808,
      "grad_norm": 1.8046875,
      "learning_rate": 7.003837616666906e-06,
      "loss": 0.10161161422729492,
      "step": 240,
      "token_acc": 0.9656063115927788
    },
    {
      "epoch": 1.244663905702453,
      "grad_norm": 2.34375,
      "learning_rate": 6.614168416194674e-06,
      "loss": 0.07140190601348877,
      "step": 245,
      "token_acc": 0.9799972242462094
    },
    {
      "epoch": 1.2701497292131252,
      "grad_norm": 2.953125,
      "learning_rate": 6.230223447156469e-06,
      "loss": 0.11119414567947387,
      "step": 250,
      "token_acc": 0.9668460399436599
    },
    {
      "epoch": 1.2956355527237973,
      "grad_norm": 2.1875,
      "learning_rate": 5.852651823309521e-06,
      "loss": 0.1017767071723938,
      "step": 255,
      "token_acc": 0.9658570417142279
    },
    {
      "epoch": 1.3211213762344696,
      "grad_norm": 1.7421875,
      "learning_rate": 5.482091883361571e-06,
      "loss": 0.12420098781585694,
      "step": 260,
      "token_acc": 0.9606861625830626
    },
    {
      "epoch": 1.3466071997451419,
      "grad_norm": 2.953125,
      "learning_rate": 5.1191701117681815e-06,
      "loss": 0.08732576370239258,
      "step": 265,
      "token_acc": 0.9742369029426746
    },
    {
      "epoch": 1.372093023255814,
      "grad_norm": 3.0625,
      "learning_rate": 4.764500079571403e-06,
      "loss": 0.08145667314529419,
      "step": 270,
      "token_acc": 0.9741474896414886
    },
    {
      "epoch": 1.397578846766486,
      "grad_norm": 1.65625,
      "learning_rate": 4.418681407070339e-06,
      "loss": 0.07903674840927125,
      "step": 275,
      "token_acc": 0.9746860907612613
    },
    {
      "epoch": 1.4230646702771583,
      "grad_norm": 2.34375,
      "learning_rate": 4.082298750077485e-06,
      "loss": 0.10514881610870361,
      "step": 280,
      "token_acc": 0.9671857823378527
    },
    {
      "epoch": 1.4485504937878306,
      "grad_norm": 2.296875,
      "learning_rate": 3.755920811474647e-06,
      "loss": 0.06307402849197388,
      "step": 285,
      "token_acc": 0.978928398142372
    },
    {
      "epoch": 1.4740363172985027,
      "grad_norm": 2.671875,
      "learning_rate": 3.4400993797395664e-06,
      "loss": 0.10954618453979492,
      "step": 290,
      "token_acc": 0.9647058823529412
    },
    {
      "epoch": 1.4995221408091748,
      "grad_norm": 1.359375,
      "learning_rate": 3.135368396068771e-06,
      "loss": 0.07190140485763549,
      "step": 295,
      "token_acc": 0.975735236452043
    },
    {
      "epoch": 1.525007964319847,
      "grad_norm": 1.640625,
      "learning_rate": 2.8422430516737733e-06,
      "loss": 0.08197641372680664,
      "step": 300,
      "token_acc": 0.9747953097358987
    },
    {
      "epoch": 1.5504937878305194,
      "grad_norm": 1.421875,
      "learning_rate": 2.561218916776823e-06,
      "loss": 0.0815430760383606,
      "step": 305,
      "token_acc": 0.9747565568486412
    },
    {
      "epoch": 1.5759796113411915,
      "grad_norm": 2.125,
      "learning_rate": 2.292771102778739e-06,
      "loss": 0.09041160345077515,
      "step": 310,
      "token_acc": 0.9739717810291317
    },
    {
      "epoch": 1.6014654348518635,
      "grad_norm": 2.515625,
      "learning_rate": 2.037353459015272e-06,
      "loss": 0.07603458166122437,
      "step": 315,
      "token_acc": 0.9746314985454959
    },
    {
      "epoch": 1.6269512583625358,
      "grad_norm": 1.6875,
      "learning_rate": 1.795397805460053e-06,
      "loss": 0.09144850373268128,
      "step": 320,
      "token_acc": 0.9713049218807306
    },
    {
      "epoch": 1.6524370818732081,
      "grad_norm": 1.84375,
      "learning_rate": 1.5673132026713046e-06,
      "loss": 0.08237828016281128,
      "step": 325,
      "token_acc": 0.9731688670646006
    },
    {
      "epoch": 1.6779229053838802,
      "grad_norm": 1.9140625,
      "learning_rate": 1.353485260216596e-06,
      "loss": 0.07584392428398132,
      "step": 330,
      "token_acc": 0.9767055819044145
    },
    {
      "epoch": 1.7034087288945523,
      "grad_norm": 2.328125,
      "learning_rate": 1.1542754847448544e-06,
      "loss": 0.05938386917114258,
      "step": 335,
      "token_acc": 0.9827799662352279
    },
    {
      "epoch": 1.7288945524052246,
      "grad_norm": 1.6796875,
      "learning_rate": 9.700206688077707e-07,
      "loss": 0.08313992023468017,
      "step": 340,
      "token_acc": 0.9746641292924703
    },
    {
      "epoch": 1.754380375915897,
      "grad_norm": 1.3671875,
      "learning_rate": 8.010323214639492e-07,
      "loss": 0.10048294067382812,
      "step": 345,
      "token_acc": 0.9692962175159531
    },
    {
      "epoch": 1.779866199426569,
      "grad_norm": 2.265625,
      "learning_rate": 6.475961416283838e-07,
      "loss": 0.07593317031860351,
      "step": 350,
      "token_acc": 0.9778483525208416
    },
    {
      "epoch": 1.805352022937241,
      "grad_norm": 4.4375,
      "learning_rate": 5.099715350576817e-07,
      "loss": 0.09315924048423767,
      "step": 355,
      "token_acc": 0.9696224141393854
    },
    {
      "epoch": 1.8308378464479134,
      "grad_norm": 2.0625,
      "learning_rate": 3.883911757876058e-07,
      "loss": 0.08427355885505676,
      "step": 360,
      "token_acc": 0.9726720482046076
    },
    {
      "epoch": 1.8563236699585857,
      "grad_norm": 2.453125,
      "learning_rate": 2.8306061276442753e-07,
      "loss": 0.09181307554244995,
      "step": 365,
      "token_acc": 0.97152072887367
    },
    {
      "epoch": 1.8818094934692577,
      "grad_norm": 1.546875,
      "learning_rate": 1.941579223350898e-07,
      "loss": 0.08087407350540161,
      "step": 370,
      "token_acc": 0.9741133571604277
    },
    {
      "epoch": 1.9072953169799298,
      "grad_norm": 1.9609375,
      "learning_rate": 1.218334071837468e-07,
      "loss": 0.10078444480895996,
      "step": 375,
      "token_acc": 0.9704466920176089
    },
    {
      "epoch": 1.9327811404906021,
      "grad_norm": 1.9609375,
      "learning_rate": 6.62093422236132e-08,
      "loss": 0.09156302809715271,
      "step": 380,
      "token_acc": 0.9694673981909602
    },
    {
      "epoch": 1.9582669640012744,
      "grad_norm": 1.71875,
      "learning_rate": 2.73797678737886e-08,
      "loss": 0.07732056379318238,
      "step": 385,
      "token_acc": 0.9762223117196939
    },
    {
      "epoch": 1.9837527875119465,
      "grad_norm": 1.953125,
      "learning_rate": 5.410331070498931e-09,
      "loss": 0.07045769095420837,
      "step": 390,
      "token_acc": 0.9786481253300475
    }
  ],
  "logging_steps": 5,
  "max_steps": 394,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8765180828971213e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}