{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 2075, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.060350030175015085, "grad_norm": 0.8775522708892822, "learning_rate": 8.18181818181818e-05, "loss": 1.5879, "mean_token_accuracy": 0.6690774387121201, "num_tokens": 155085.0, "step": 25 }, { "epoch": 0.12070006035003017, "grad_norm": 0.9928436279296875, "learning_rate": 0.00016704545454545452, "loss": 0.882, "mean_token_accuracy": 0.7819808393716812, "num_tokens": 272168.0, "step": 50 }, { "epoch": 0.18105009052504525, "grad_norm": 0.6491063237190247, "learning_rate": 0.0002522727272727273, "loss": 0.5779, "mean_token_accuracy": 0.8428267538547516, "num_tokens": 424672.0, "step": 75 }, { "epoch": 0.24140012070006034, "grad_norm": 0.8907634615898132, "learning_rate": 0.0002999887132933212, "loss": 0.5317, "mean_token_accuracy": 0.8538330507278442, "num_tokens": 538458.0, "step": 100 }, { "epoch": 0.30175015087507545, "grad_norm": 0.9097657203674316, "learning_rate": 0.0002998791256978121, "loss": 0.3574, "mean_token_accuracy": 0.8985000151395798, "num_tokens": 689411.0, "step": 125 }, { "epoch": 0.3621001810500905, "grad_norm": 0.6912802457809448, "learning_rate": 0.0002996530399366737, "loss": 0.3448, "mean_token_accuracy": 0.9023130792379379, "num_tokens": 804866.0, "step": 150 }, { "epoch": 0.4224502112251056, "grad_norm": 0.5329523682594299, "learning_rate": 0.00029931063174202567, "loss": 0.2315, "mean_token_accuracy": 0.9339979404211044, "num_tokens": 957115.0, "step": 175 }, { "epoch": 0.4828002414001207, "grad_norm": 0.759782075881958, "learning_rate": 0.00029885216726118104, "loss": 0.2336, "mean_token_accuracy": 0.9350438743829728, "num_tokens": 1073977.0, "step": 200 }, { "epoch": 0.5431502715751357, "grad_norm": 0.3837663531303406, "learning_rate": 0.00029827800284977474, "loss": 0.189, "mean_token_accuracy": 0.9478910142183303, "num_tokens": 1225658.0, "step": 225 }, { "epoch": 0.6035003017501509, "grad_norm": 0.7154819369316101, "learning_rate": 0.00029758858479477575, "loss": 0.192, "mean_token_accuracy": 0.9470360428094864, "num_tokens": 1340933.0, "step": 250 }, { "epoch": 0.663850331925166, "grad_norm": 0.32724520564079285, "learning_rate": 0.0002967844489675963, "loss": 0.1544, "mean_token_accuracy": 0.9575115633010864, "num_tokens": 1492525.0, "step": 275 }, { "epoch": 0.724200362100181, "grad_norm": 0.7199254035949707, "learning_rate": 0.00029586622040756957, "loss": 0.1512, "mean_token_accuracy": 0.9590712755918502, "num_tokens": 1609549.0, "step": 300 }, { "epoch": 0.7845503922751962, "grad_norm": 0.5064759850502014, "learning_rate": 0.0002948346128361186, "loss": 0.127, "mean_token_accuracy": 0.9652698183059693, "num_tokens": 1760812.0, "step": 325 }, { "epoch": 0.8449004224502112, "grad_norm": 0.9060899615287781, "learning_rate": 0.00029369042810199416, "loss": 0.1295, "mean_token_accuracy": 0.9658441722393036, "num_tokens": 1878082.0, "step": 350 }, { "epoch": 0.9052504526252263, "grad_norm": 0.39403802156448364, "learning_rate": 0.0002924345555580135, "loss": 0.124, "mean_token_accuracy": 0.96674849152565, "num_tokens": 2030379.0, "step": 375 }, { "epoch": 0.9656004828002414, "grad_norm": 0.510175347328186, "learning_rate": 0.000291067971369783, "loss": 0.1175, "mean_token_accuracy": 0.9700243002176285, "num_tokens": 2145568.0, "step": 400 }, { "epoch": 1.0, "eval_loss": 0.12158163636922836, "eval_mean_token_accuracy": 0.9679943664653882, "eval_num_tokens": 2223513.0, "eval_runtime": 60.3991, "eval_samples_per_second": 6.109, "eval_steps_per_second": 3.063, "step": 415 }, { "epoch": 1.024140012070006, "grad_norm": 0.406086802482605, "learning_rate": 0.0002895917377569438, "loss": 0.1273, "mean_token_accuracy": 0.9658442354693855, "num_tokens": 2291474.0, "step": 425 }, { "epoch": 1.0844900422450212, "grad_norm": 0.5659676790237427, "learning_rate": 0.00028800700216752875, "loss": 0.0838, "mean_token_accuracy": 0.9784896957874298, "num_tokens": 2426520.0, "step": 450 }, { "epoch": 1.1448400724200363, "grad_norm": 0.35416120290756226, "learning_rate": 0.00028631499638607285, "loss": 0.1106, "mean_token_accuracy": 0.9712809121608734, "num_tokens": 2559490.0, "step": 475 }, { "epoch": 1.2051901025950513, "grad_norm": 0.4320279657840729, "learning_rate": 0.0002845170355761712, "loss": 0.0828, "mean_token_accuracy": 0.9781204652786255, "num_tokens": 2692836.0, "step": 500 }, { "epoch": 1.2655401327700664, "grad_norm": 0.2252548485994339, "learning_rate": 0.0002826145172582274, "loss": 0.1078, "mean_token_accuracy": 0.972182622551918, "num_tokens": 2824220.0, "step": 525 }, { "epoch": 1.3258901629450814, "grad_norm": 0.5526081323623657, "learning_rate": 0.00028060892022318764, "loss": 0.0859, "mean_token_accuracy": 0.9773634171485901, "num_tokens": 2957350.0, "step": 550 }, { "epoch": 1.3862401931200965, "grad_norm": 0.28977397084236145, "learning_rate": 0.0002785018033831051, "loss": 0.1003, "mean_token_accuracy": 0.9744341260194779, "num_tokens": 3089488.0, "step": 575 }, { "epoch": 1.4465902232951118, "grad_norm": 0.3141675591468811, "learning_rate": 0.0002762948045594276, "loss": 0.0791, "mean_token_accuracy": 0.9794832402467728, "num_tokens": 3223359.0, "step": 600 }, { "epoch": 1.5069402534701268, "grad_norm": 0.2956194579601288, "learning_rate": 0.0002739896392099502, "loss": 0.0973, "mean_token_accuracy": 0.9747027105093002, "num_tokens": 3356289.0, "step": 625 }, { "epoch": 1.567290283645142, "grad_norm": 0.20789697766304016, "learning_rate": 0.00027158809909542307, "loss": 0.0799, "mean_token_accuracy": 0.9793938374519349, "num_tokens": 3492136.0, "step": 650 }, { "epoch": 1.627640313820157, "grad_norm": 0.2433897703886032, "learning_rate": 0.00026909205088685, "loss": 0.1014, "mean_token_accuracy": 0.9734487825632095, "num_tokens": 3626915.0, "step": 675 }, { "epoch": 1.687990343995172, "grad_norm": 0.7531152367591858, "learning_rate": 0.0002665034347145612, "loss": 0.0712, "mean_token_accuracy": 0.9815128433704376, "num_tokens": 3763997.0, "step": 700 }, { "epoch": 1.748340374170187, "grad_norm": 0.2832512855529785, "learning_rate": 0.000263824262660187, "loss": 0.0906, "mean_token_accuracy": 0.9767874735593796, "num_tokens": 3899136.0, "step": 725 }, { "epoch": 1.8086904043452021, "grad_norm": 0.25026917457580566, "learning_rate": 0.0002610566171927056, "loss": 0.0738, "mean_token_accuracy": 0.9808795565366745, "num_tokens": 4035090.0, "step": 750 }, { "epoch": 1.8690404345202172, "grad_norm": 0.2247888296842575, "learning_rate": 0.00025820264954977976, "loss": 0.0924, "mean_token_accuracy": 0.9755205953121185, "num_tokens": 4170738.0, "step": 775 }, { "epoch": 1.9293904646952322, "grad_norm": 0.2288103550672531, "learning_rate": 0.00025526457806564136, "loss": 0.0689, "mean_token_accuracy": 0.9819435960054398, "num_tokens": 4304690.0, "step": 800 }, { "epoch": 1.9897404948702473, "grad_norm": 0.3063240647315979, "learning_rate": 0.00025224468644682245, "loss": 0.0848, "mean_token_accuracy": 0.9782917034626007, "num_tokens": 4428594.0, "step": 825 }, { "epoch": 2.0, "eval_loss": 0.08532727509737015, "eval_mean_token_accuracy": 0.978298093499364, "eval_num_tokens": 4447026.0, "eval_runtime": 60.4972, "eval_samples_per_second": 6.099, "eval_steps_per_second": 3.058, "step": 830 }, { "epoch": 2.048280024140012, "grad_norm": 0.292191743850708, "learning_rate": 0.00024914532199707444, "loss": 0.0742, "mean_token_accuracy": 0.9807861385886202, "num_tokens": 4570802.0, "step": 850 }, { "epoch": 2.1086300543150274, "grad_norm": 0.12266981601715088, "learning_rate": 0.00024596889379285353, "loss": 0.0583, "mean_token_accuracy": 0.9852461409568787, "num_tokens": 4693406.0, "step": 875 }, { "epoch": 2.1689800844900424, "grad_norm": 0.2619573473930359, "learning_rate": 0.00024271787081079228, "loss": 0.0675, "mean_token_accuracy": 0.9823181647062301, "num_tokens": 4840920.0, "step": 900 }, { "epoch": 2.2293301146650575, "grad_norm": 0.3071349859237671, "learning_rate": 0.00023939478000861117, "loss": 0.0618, "mean_token_accuracy": 0.9843161147832871, "num_tokens": 4963388.0, "step": 925 }, { "epoch": 2.2896801448400725, "grad_norm": 0.19022433459758759, "learning_rate": 0.00023600220436096318, "loss": 0.073, "mean_token_accuracy": 0.9802478235960007, "num_tokens": 5109242.0, "step": 950 }, { "epoch": 2.3500301750150876, "grad_norm": 0.21214531362056732, "learning_rate": 0.00023254278085173684, "loss": 0.058, "mean_token_accuracy": 0.985277818441391, "num_tokens": 5233880.0, "step": 975 }, { "epoch": 2.4103802051901027, "grad_norm": 0.27143731713294983, "learning_rate": 0.00022901919842437972, "loss": 0.0696, "mean_token_accuracy": 0.9813511747121811, "num_tokens": 5379872.0, "step": 1000 }, { "epoch": 2.4707302353651177, "grad_norm": 0.22756575047969818, "learning_rate": 0.00022543419589183397, "loss": 0.0557, "mean_token_accuracy": 0.9860703033208847, "num_tokens": 5503533.0, "step": 1025 }, { "epoch": 2.5310802655401328, "grad_norm": 0.20393149554729462, "learning_rate": 0.00022179055980770993, "loss": 0.0714, "mean_token_accuracy": 0.9812053245306015, "num_tokens": 5648709.0, "step": 1050 }, { "epoch": 2.591430295715148, "grad_norm": 0.1682002693414688, "learning_rate": 0.0002180911223003513, "loss": 0.0573, "mean_token_accuracy": 0.9854411727190018, "num_tokens": 5771477.0, "step": 1075 }, { "epoch": 2.651780325890163, "grad_norm": 0.17297177016735077, "learning_rate": 0.00021433875887147627, "loss": 0.0684, "mean_token_accuracy": 0.9818347871303559, "num_tokens": 5917022.0, "step": 1100 }, { "epoch": 2.712130356065178, "grad_norm": 0.3354227840900421, "learning_rate": 0.00021053638616110525, "loss": 0.056, "mean_token_accuracy": 0.9859032183885574, "num_tokens": 6040041.0, "step": 1125 }, { "epoch": 2.772480386240193, "grad_norm": 0.17890885472297668, "learning_rate": 0.00020668695968051274, "loss": 0.0644, "mean_token_accuracy": 0.9828311365842819, "num_tokens": 6183440.0, "step": 1150 }, { "epoch": 2.832830416415208, "grad_norm": 0.19668501615524292, "learning_rate": 0.00020279347151496482, "loss": 0.0572, "mean_token_accuracy": 0.9856607836484909, "num_tokens": 6307206.0, "step": 1175 }, { "epoch": 2.8931804465902236, "grad_norm": 0.12178179621696472, "learning_rate": 0.00019885894799802922, "loss": 0.0626, "mean_token_accuracy": 0.9839641106128693, "num_tokens": 6451936.0, "step": 1200 }, { "epoch": 2.9535304767652386, "grad_norm": 0.14492283761501312, "learning_rate": 0.00019488644735926396, "loss": 0.0533, "mean_token_accuracy": 0.9867338234186173, "num_tokens": 6574944.0, "step": 1225 }, { "epoch": 3.0, "eval_loss": 0.06949137151241302, "eval_mean_token_accuracy": 0.9829865713377257, "eval_num_tokens": 6670539.0, "eval_runtime": 60.3936, "eval_samples_per_second": 6.11, "eval_steps_per_second": 3.063, "step": 1245 }, { "epoch": 3.012070006035003, "grad_norm": 0.18783801794052124, "learning_rate": 0.00019087905734711452, "loss": 0.0609, "mean_token_accuracy": 0.984156120683729, "num_tokens": 6706398.0, "step": 1250 }, { "epoch": 3.0724200362100182, "grad_norm": 0.2086067646741867, "learning_rate": 0.00018683989282886613, "loss": 0.046, "mean_token_accuracy": 0.9881435281038284, "num_tokens": 6848066.0, "step": 1275 }, { "epoch": 3.1327700663850333, "grad_norm": 0.2123444378376007, "learning_rate": 0.0001827720933695173, "loss": 0.0552, "mean_token_accuracy": 0.9860709112882614, "num_tokens": 6975273.0, "step": 1300 }, { "epoch": 3.1931200965600484, "grad_norm": 0.11279409378767014, "learning_rate": 0.00017867882079145627, "loss": 0.0455, "mean_token_accuracy": 0.9881318390369416, "num_tokens": 7117546.0, "step": 1325 }, { "epoch": 3.2534701267350634, "grad_norm": 0.177442729473114, "learning_rate": 0.00017456325671683724, "loss": 0.0546, "mean_token_accuracy": 0.9862780523300171, "num_tokens": 7244543.0, "step": 1350 }, { "epoch": 3.3138201569100785, "grad_norm": 0.10142289847135544, "learning_rate": 0.00017042860009456638, "loss": 0.0445, "mean_token_accuracy": 0.988534786105156, "num_tokens": 7384967.0, "step": 1375 }, { "epoch": 3.3741701870850935, "grad_norm": 0.15665322542190552, "learning_rate": 0.00016627806471382066, "loss": 0.0532, "mean_token_accuracy": 0.9868290704488755, "num_tokens": 7509226.0, "step": 1400 }, { "epoch": 3.4345202172601086, "grad_norm": 0.15646246075630188, "learning_rate": 0.00016211487670603078, "loss": 0.0456, "mean_token_accuracy": 0.9882669430971146, "num_tokens": 7648911.0, "step": 1425 }, { "epoch": 3.4948702474351236, "grad_norm": 0.16921687126159668, "learning_rate": 0.0001579422720372715, "loss": 0.0568, "mean_token_accuracy": 0.9850279080867768, "num_tokens": 7777925.0, "step": 1450 }, { "epoch": 3.5552202776101387, "grad_norm": 0.10589733719825745, "learning_rate": 0.00015376349399300745, "loss": 0.0446, "mean_token_accuracy": 0.9881089746952056, "num_tokens": 7922053.0, "step": 1475 }, { "epoch": 3.6155703077851538, "grad_norm": 0.2955225110054016, "learning_rate": 0.0001495817906571492, "loss": 0.0544, "mean_token_accuracy": 0.9862786346673965, "num_tokens": 8049528.0, "step": 1500 }, { "epoch": 3.675920337960169, "grad_norm": 0.0742836520075798, "learning_rate": 0.00014540041238738055, "loss": 0.0456, "mean_token_accuracy": 0.9878502124547959, "num_tokens": 8190743.0, "step": 1525 }, { "epoch": 3.736270368135184, "grad_norm": 0.12437008321285248, "learning_rate": 0.00014122260928871734, "loss": 0.0535, "mean_token_accuracy": 0.9863007247447968, "num_tokens": 8316676.0, "step": 1550 }, { "epoch": 3.796620398310199, "grad_norm": 0.08342117071151733, "learning_rate": 0.00013705162868726396, "loss": 0.0447, "mean_token_accuracy": 0.9884078222513198, "num_tokens": 8457403.0, "step": 1575 }, { "epoch": 3.856970428485214, "grad_norm": 0.17910127341747284, "learning_rate": 0.00013289071260612855, "loss": 0.052, "mean_token_accuracy": 0.9868350481986999, "num_tokens": 8583976.0, "step": 1600 }, { "epoch": 3.9173204586602295, "grad_norm": 0.06760319322347641, "learning_rate": 0.00012874309524546083, "loss": 0.045, "mean_token_accuracy": 0.9878969532251358, "num_tokens": 8727702.0, "step": 1625 }, { "epoch": 3.9776704888352445, "grad_norm": 0.15349148213863373, "learning_rate": 0.00012461200046857084, "loss": 0.0508, "mean_token_accuracy": 0.987232791185379, "num_tokens": 8849683.0, "step": 1650 }, { "epoch": 4.0, "eval_loss": 0.0635647177696228, "eval_mean_token_accuracy": 0.9845431846541327, "eval_num_tokens": 8894052.0, "eval_runtime": 60.4472, "eval_samples_per_second": 6.105, "eval_steps_per_second": 3.061, "step": 1660 }, { "epoch": 4.036210018105009, "grad_norm": 0.1430218666791916, "learning_rate": 0.00012050063929608123, "loss": 0.0441, "mean_token_accuracy": 0.9887785659622901, "num_tokens": 8992417.0, "step": 1675 }, { "epoch": 4.096560048280024, "grad_norm": 0.08819396048784256, "learning_rate": 0.0001164122074100633, "loss": 0.0406, "mean_token_accuracy": 0.9898108446598053, "num_tokens": 9122219.0, "step": 1700 }, { "epoch": 4.15691007845504, "grad_norm": 0.08506345748901367, "learning_rate": 0.00011234988267009415, "loss": 0.0457, "mean_token_accuracy": 0.9884944796562195, "num_tokens": 9262178.0, "step": 1725 }, { "epoch": 4.217260108630055, "grad_norm": 0.08111756294965744, "learning_rate": 0.00010831682264316787, "loss": 0.0416, "mean_token_accuracy": 0.989647062420845, "num_tokens": 9391026.0, "step": 1750 }, { "epoch": 4.27761013880507, "grad_norm": 0.07644202560186386, "learning_rate": 0.00010431616214937911, "loss": 0.046, "mean_token_accuracy": 0.9881355553865433, "num_tokens": 9531921.0, "step": 1775 }, { "epoch": 4.337960168980085, "grad_norm": 0.03971414268016815, "learning_rate": 0.00010035101082528777, "loss": 0.0413, "mean_token_accuracy": 0.9894955778121948, "num_tokens": 9661664.0, "step": 1800 }, { "epoch": 4.3983101991551, "grad_norm": 0.07305199652910233, "learning_rate": 9.642445070685809e-05, "loss": 0.0435, "mean_token_accuracy": 0.9884718316793442, "num_tokens": 9801214.0, "step": 1825 }, { "epoch": 4.458660229330115, "grad_norm": 0.07561074942350388, "learning_rate": 9.253953383385157e-05, "loss": 0.0411, "mean_token_accuracy": 0.9890134006738662, "num_tokens": 9929511.0, "step": 1850 }, { "epoch": 4.51901025950513, "grad_norm": 0.12152893096208572, "learning_rate": 8.869927987753459e-05, "loss": 0.0442, "mean_token_accuracy": 0.9886808878183365, "num_tokens": 10069489.0, "step": 1875 }, { "epoch": 4.579360289680145, "grad_norm": 0.05834532529115677, "learning_rate": 8.490667379354661e-05, "loss": 0.0417, "mean_token_accuracy": 0.9894009435176849, "num_tokens": 10197956.0, "step": 1900 }, { "epoch": 4.63971031985516, "grad_norm": 0.08154677599668503, "learning_rate": 8.116466350175079e-05, "loss": 0.0437, "mean_token_accuracy": 0.9884669744968414, "num_tokens": 10335860.0, "step": 1925 }, { "epoch": 4.700060350030175, "grad_norm": 0.07408758997917175, "learning_rate": 7.747615759487304e-05, "loss": 0.0411, "mean_token_accuracy": 0.989289864897728, "num_tokens": 10464303.0, "step": 1950 }, { "epoch": 4.76041038020519, "grad_norm": 0.0779808983206749, "learning_rate": 7.38440230777085e-05, "loss": 0.0453, "mean_token_accuracy": 0.9880505865812301, "num_tokens": 10601511.0, "step": 1975 }, { "epoch": 4.820760410380205, "grad_norm": 0.046579256653785706, "learning_rate": 7.027108313865378e-05, "loss": 0.0401, "mean_token_accuracy": 0.9896253395080566, "num_tokens": 10732156.0, "step": 2000 }, { "epoch": 4.88111044055522, "grad_norm": 0.07918152213096619, "learning_rate": 6.676011495529687e-05, "loss": 0.042, "mean_token_accuracy": 0.9889346659183502, "num_tokens": 10873235.0, "step": 2025 }, { "epoch": 4.941460470730235, "grad_norm": 0.04908803850412369, "learning_rate": 6.331384753577056e-05, "loss": 0.0408, "mean_token_accuracy": 0.9895875388383866, "num_tokens": 11001827.0, "step": 2050 }, { "epoch": 5.0, "grad_norm": 0.22838693857192993, "learning_rate": 5.993495959754631e-05, "loss": 0.0452, "mean_token_accuracy": 0.9885991817897128, "num_tokens": 11117565.0, "step": 2075 }, { "epoch": 5.0, "eval_loss": 0.06391309201717377, "eval_mean_token_accuracy": 0.9852847150854163, "eval_num_tokens": 11117565.0, "eval_runtime": 60.3576, "eval_samples_per_second": 6.114, "eval_steps_per_second": 3.065, "step": 2075 } ], "logging_steps": 25, "max_steps": 2905, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.8080321362273075e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }