{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 2075,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.060350030175015085,
      "grad_norm": 0.8775522708892822,
      "learning_rate": 8.18181818181818e-05,
      "loss": 1.5879,
      "mean_token_accuracy": 0.6690774387121201,
      "num_tokens": 155085.0,
      "step": 25
    },
    {
      "epoch": 0.12070006035003017,
      "grad_norm": 0.9928436279296875,
      "learning_rate": 0.00016704545454545452,
      "loss": 0.882,
      "mean_token_accuracy": 0.7819808393716812,
      "num_tokens": 272168.0,
      "step": 50
    },
    {
      "epoch": 0.18105009052504525,
      "grad_norm": 0.6491063237190247,
      "learning_rate": 0.0002522727272727273,
      "loss": 0.5779,
      "mean_token_accuracy": 0.8428267538547516,
      "num_tokens": 424672.0,
      "step": 75
    },
    {
      "epoch": 0.24140012070006034,
      "grad_norm": 0.8907634615898132,
      "learning_rate": 0.0002999887132933212,
      "loss": 0.5317,
      "mean_token_accuracy": 0.8538330507278442,
      "num_tokens": 538458.0,
      "step": 100
    },
    {
      "epoch": 0.30175015087507545,
      "grad_norm": 0.9097657203674316,
      "learning_rate": 0.0002998791256978121,
      "loss": 0.3574,
      "mean_token_accuracy": 0.8985000151395798,
      "num_tokens": 689411.0,
      "step": 125
    },
    {
      "epoch": 0.3621001810500905,
      "grad_norm": 0.6912802457809448,
      "learning_rate": 0.0002996530399366737,
      "loss": 0.3448,
      "mean_token_accuracy": 0.9023130792379379,
      "num_tokens": 804866.0,
      "step": 150
    },
    {
      "epoch": 0.4224502112251056,
      "grad_norm": 0.5329523682594299,
      "learning_rate": 0.00029931063174202567,
      "loss": 0.2315,
      "mean_token_accuracy": 0.9339979404211044,
      "num_tokens": 957115.0,
      "step": 175
    },
    {
      "epoch": 0.4828002414001207,
      "grad_norm": 0.759782075881958,
      "learning_rate": 0.00029885216726118104,
      "loss": 0.2336,
      "mean_token_accuracy": 0.9350438743829728,
      "num_tokens": 1073977.0,
      "step": 200
    },
    {
      "epoch": 0.5431502715751357,
      "grad_norm": 0.3837663531303406,
      "learning_rate": 0.00029827800284977474,
      "loss": 0.189,
      "mean_token_accuracy": 0.9478910142183303,
      "num_tokens": 1225658.0,
      "step": 225
    },
    {
      "epoch": 0.6035003017501509,
      "grad_norm": 0.7154819369316101,
      "learning_rate": 0.00029758858479477575,
      "loss": 0.192,
      "mean_token_accuracy": 0.9470360428094864,
      "num_tokens": 1340933.0,
      "step": 250
    },
    {
      "epoch": 0.663850331925166,
      "grad_norm": 0.32724520564079285,
      "learning_rate": 0.0002967844489675963,
      "loss": 0.1544,
      "mean_token_accuracy": 0.9575115633010864,
      "num_tokens": 1492525.0,
      "step": 275
    },
    {
      "epoch": 0.724200362100181,
      "grad_norm": 0.7199254035949707,
      "learning_rate": 0.00029586622040756957,
      "loss": 0.1512,
      "mean_token_accuracy": 0.9590712755918502,
      "num_tokens": 1609549.0,
      "step": 300
    },
    {
      "epoch": 0.7845503922751962,
      "grad_norm": 0.5064759850502014,
      "learning_rate": 0.0002948346128361186,
      "loss": 0.127,
      "mean_token_accuracy": 0.9652698183059693,
      "num_tokens": 1760812.0,
      "step": 325
    },
    {
      "epoch": 0.8449004224502112,
      "grad_norm": 0.9060899615287781,
      "learning_rate": 0.00029369042810199416,
      "loss": 0.1295,
      "mean_token_accuracy": 0.9658441722393036,
      "num_tokens": 1878082.0,
      "step": 350
    },
    {
      "epoch": 0.9052504526252263,
      "grad_norm": 0.39403802156448364,
      "learning_rate": 0.0002924345555580135,
      "loss": 0.124,
      "mean_token_accuracy": 0.96674849152565,
      "num_tokens": 2030379.0,
      "step": 375
    },
    {
      "epoch": 0.9656004828002414,
      "grad_norm": 0.510175347328186,
      "learning_rate": 0.000291067971369783,
      "loss": 0.1175,
      "mean_token_accuracy": 0.9700243002176285,
      "num_tokens": 2145568.0,
      "step": 400
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.12158163636922836,
      "eval_mean_token_accuracy": 0.9679943664653882,
      "eval_num_tokens": 2223513.0,
      "eval_runtime": 60.3991,
      "eval_samples_per_second": 6.109,
      "eval_steps_per_second": 3.063,
      "step": 415
    },
    {
      "epoch": 1.024140012070006,
      "grad_norm": 0.406086802482605,
      "learning_rate": 0.0002895917377569438,
      "loss": 0.1273,
      "mean_token_accuracy": 0.9658442354693855,
      "num_tokens": 2291474.0,
      "step": 425
    },
    {
      "epoch": 1.0844900422450212,
      "grad_norm": 0.5659676790237427,
      "learning_rate": 0.00028800700216752875,
      "loss": 0.0838,
      "mean_token_accuracy": 0.9784896957874298,
      "num_tokens": 2426520.0,
      "step": 450
    },
    {
      "epoch": 1.1448400724200363,
      "grad_norm": 0.35416120290756226,
      "learning_rate": 0.00028631499638607285,
      "loss": 0.1106,
      "mean_token_accuracy": 0.9712809121608734,
      "num_tokens": 2559490.0,
      "step": 475
    },
    {
      "epoch": 1.2051901025950513,
      "grad_norm": 0.4320279657840729,
      "learning_rate": 0.0002845170355761712,
      "loss": 0.0828,
      "mean_token_accuracy": 0.9781204652786255,
      "num_tokens": 2692836.0,
      "step": 500
    },
    {
      "epoch": 1.2655401327700664,
      "grad_norm": 0.2252548485994339,
      "learning_rate": 0.0002826145172582274,
      "loss": 0.1078,
      "mean_token_accuracy": 0.972182622551918,
      "num_tokens": 2824220.0,
      "step": 525
    },
    {
      "epoch": 1.3258901629450814,
      "grad_norm": 0.5526081323623657,
      "learning_rate": 0.00028060892022318764,
      "loss": 0.0859,
      "mean_token_accuracy": 0.9773634171485901,
      "num_tokens": 2957350.0,
      "step": 550
    },
    {
      "epoch": 1.3862401931200965,
      "grad_norm": 0.28977397084236145,
      "learning_rate": 0.0002785018033831051,
      "loss": 0.1003,
      "mean_token_accuracy": 0.9744341260194779,
      "num_tokens": 3089488.0,
      "step": 575
    },
    {
      "epoch": 1.4465902232951118,
      "grad_norm": 0.3141675591468811,
      "learning_rate": 0.0002762948045594276,
      "loss": 0.0791,
      "mean_token_accuracy": 0.9794832402467728,
      "num_tokens": 3223359.0,
      "step": 600
    },
    {
      "epoch": 1.5069402534701268,
      "grad_norm": 0.2956194579601288,
      "learning_rate": 0.0002739896392099502,
      "loss": 0.0973,
      "mean_token_accuracy": 0.9747027105093002,
      "num_tokens": 3356289.0,
      "step": 625
    },
    {
      "epoch": 1.567290283645142,
      "grad_norm": 0.20789697766304016,
      "learning_rate": 0.00027158809909542307,
      "loss": 0.0799,
      "mean_token_accuracy": 0.9793938374519349,
      "num_tokens": 3492136.0,
      "step": 650
    },
    {
      "epoch": 1.627640313820157,
      "grad_norm": 0.2433897703886032,
      "learning_rate": 0.00026909205088685,
      "loss": 0.1014,
      "mean_token_accuracy": 0.9734487825632095,
      "num_tokens": 3626915.0,
      "step": 675
    },
    {
      "epoch": 1.687990343995172,
      "grad_norm": 0.7531152367591858,
      "learning_rate": 0.0002665034347145612,
      "loss": 0.0712,
      "mean_token_accuracy": 0.9815128433704376,
      "num_tokens": 3763997.0,
      "step": 700
    },
    {
      "epoch": 1.748340374170187,
      "grad_norm": 0.2832512855529785,
      "learning_rate": 0.000263824262660187,
      "loss": 0.0906,
      "mean_token_accuracy": 0.9767874735593796,
      "num_tokens": 3899136.0,
      "step": 725
    },
    {
      "epoch": 1.8086904043452021,
      "grad_norm": 0.25026917457580566,
      "learning_rate": 0.0002610566171927056,
      "loss": 0.0738,
      "mean_token_accuracy": 0.9808795565366745,
      "num_tokens": 4035090.0,
      "step": 750
    },
    {
      "epoch": 1.8690404345202172,
      "grad_norm": 0.2247888296842575,
      "learning_rate": 0.00025820264954977976,
      "loss": 0.0924,
      "mean_token_accuracy": 0.9755205953121185,
      "num_tokens": 4170738.0,
      "step": 775
    },
    {
      "epoch": 1.9293904646952322,
      "grad_norm": 0.2288103550672531,
      "learning_rate": 0.00025526457806564136,
      "loss": 0.0689,
      "mean_token_accuracy": 0.9819435960054398,
      "num_tokens": 4304690.0,
      "step": 800
    },
    {
      "epoch": 1.9897404948702473,
      "grad_norm": 0.3063240647315979,
      "learning_rate": 0.00025224468644682245,
      "loss": 0.0848,
      "mean_token_accuracy": 0.9782917034626007,
      "num_tokens": 4428594.0,
      "step": 825
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.08532727509737015,
      "eval_mean_token_accuracy": 0.978298093499364,
      "eval_num_tokens": 4447026.0,
      "eval_runtime": 60.4972,
      "eval_samples_per_second": 6.099,
      "eval_steps_per_second": 3.058,
      "step": 830
    },
    {
      "epoch": 2.048280024140012,
      "grad_norm": 0.292191743850708,
      "learning_rate": 0.00024914532199707444,
      "loss": 0.0742,
      "mean_token_accuracy": 0.9807861385886202,
      "num_tokens": 4570802.0,
      "step": 850
    },
    {
      "epoch": 2.1086300543150274,
      "grad_norm": 0.12266981601715088,
      "learning_rate": 0.00024596889379285353,
      "loss": 0.0583,
      "mean_token_accuracy": 0.9852461409568787,
      "num_tokens": 4693406.0,
      "step": 875
    },
    {
      "epoch": 2.1689800844900424,
      "grad_norm": 0.2619573473930359,
      "learning_rate": 0.00024271787081079228,
      "loss": 0.0675,
      "mean_token_accuracy": 0.9823181647062301,
      "num_tokens": 4840920.0,
      "step": 900
    },
    {
      "epoch": 2.2293301146650575,
      "grad_norm": 0.3071349859237671,
      "learning_rate": 0.00023939478000861117,
      "loss": 0.0618,
      "mean_token_accuracy": 0.9843161147832871,
      "num_tokens": 4963388.0,
      "step": 925
    },
    {
      "epoch": 2.2896801448400725,
      "grad_norm": 0.19022433459758759,
      "learning_rate": 0.00023600220436096318,
      "loss": 0.073,
      "mean_token_accuracy": 0.9802478235960007,
      "num_tokens": 5109242.0,
      "step": 950
    },
    {
      "epoch": 2.3500301750150876,
      "grad_norm": 0.21214531362056732,
      "learning_rate": 0.00023254278085173684,
      "loss": 0.058,
      "mean_token_accuracy": 0.985277818441391,
      "num_tokens": 5233880.0,
      "step": 975
    },
    {
      "epoch": 2.4103802051901027,
      "grad_norm": 0.27143731713294983,
      "learning_rate": 0.00022901919842437972,
      "loss": 0.0696,
      "mean_token_accuracy": 0.9813511747121811,
      "num_tokens": 5379872.0,
      "step": 1000
    },
    {
      "epoch": 2.4707302353651177,
      "grad_norm": 0.22756575047969818,
      "learning_rate": 0.00022543419589183397,
      "loss": 0.0557,
      "mean_token_accuracy": 0.9860703033208847,
      "num_tokens": 5503533.0,
      "step": 1025
    },
    {
      "epoch": 2.5310802655401328,
      "grad_norm": 0.20393149554729462,
      "learning_rate": 0.00022179055980770993,
      "loss": 0.0714,
      "mean_token_accuracy": 0.9812053245306015,
      "num_tokens": 5648709.0,
      "step": 1050
    },
    {
      "epoch": 2.591430295715148,
      "grad_norm": 0.1682002693414688,
      "learning_rate": 0.0002180911223003513,
      "loss": 0.0573,
      "mean_token_accuracy": 0.9854411727190018,
      "num_tokens": 5771477.0,
      "step": 1075
    },
    {
      "epoch": 2.651780325890163,
      "grad_norm": 0.17297177016735077,
      "learning_rate": 0.00021433875887147627,
      "loss": 0.0684,
      "mean_token_accuracy": 0.9818347871303559,
      "num_tokens": 5917022.0,
      "step": 1100
    },
    {
      "epoch": 2.712130356065178,
      "grad_norm": 0.3354227840900421,
      "learning_rate": 0.00021053638616110525,
      "loss": 0.056,
      "mean_token_accuracy": 0.9859032183885574,
      "num_tokens": 6040041.0,
      "step": 1125
    },
    {
      "epoch": 2.772480386240193,
      "grad_norm": 0.17890885472297668,
      "learning_rate": 0.00020668695968051274,
      "loss": 0.0644,
      "mean_token_accuracy": 0.9828311365842819,
      "num_tokens": 6183440.0,
      "step": 1150
    },
    {
      "epoch": 2.832830416415208,
      "grad_norm": 0.19668501615524292,
      "learning_rate": 0.00020279347151496482,
      "loss": 0.0572,
      "mean_token_accuracy": 0.9856607836484909,
      "num_tokens": 6307206.0,
      "step": 1175
    },
    {
      "epoch": 2.8931804465902236,
      "grad_norm": 0.12178179621696472,
      "learning_rate": 0.00019885894799802922,
      "loss": 0.0626,
      "mean_token_accuracy": 0.9839641106128693,
      "num_tokens": 6451936.0,
      "step": 1200
    },
    {
      "epoch": 2.9535304767652386,
      "grad_norm": 0.14492283761501312,
      "learning_rate": 0.00019488644735926396,
      "loss": 0.0533,
      "mean_token_accuracy": 0.9867338234186173,
      "num_tokens": 6574944.0,
      "step": 1225
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.06949137151241302,
      "eval_mean_token_accuracy": 0.9829865713377257,
      "eval_num_tokens": 6670539.0,
      "eval_runtime": 60.3936,
      "eval_samples_per_second": 6.11,
      "eval_steps_per_second": 3.063,
      "step": 1245
    },
    {
      "epoch": 3.012070006035003,
      "grad_norm": 0.18783801794052124,
      "learning_rate": 0.00019087905734711452,
      "loss": 0.0609,
      "mean_token_accuracy": 0.984156120683729,
      "num_tokens": 6706398.0,
      "step": 1250
    },
    {
      "epoch": 3.0724200362100182,
      "grad_norm": 0.2086067646741867,
      "learning_rate": 0.00018683989282886613,
      "loss": 0.046,
      "mean_token_accuracy": 0.9881435281038284,
      "num_tokens": 6848066.0,
      "step": 1275
    },
    {
      "epoch": 3.1327700663850333,
      "grad_norm": 0.2123444378376007,
      "learning_rate": 0.0001827720933695173,
      "loss": 0.0552,
      "mean_token_accuracy": 0.9860709112882614,
      "num_tokens": 6975273.0,
      "step": 1300
    },
    {
      "epoch": 3.1931200965600484,
      "grad_norm": 0.11279409378767014,
      "learning_rate": 0.00017867882079145627,
      "loss": 0.0455,
      "mean_token_accuracy": 0.9881318390369416,
      "num_tokens": 7117546.0,
      "step": 1325
    },
    {
      "epoch": 3.2534701267350634,
      "grad_norm": 0.177442729473114,
      "learning_rate": 0.00017456325671683724,
      "loss": 0.0546,
      "mean_token_accuracy": 0.9862780523300171,
      "num_tokens": 7244543.0,
      "step": 1350
    },
    {
      "epoch": 3.3138201569100785,
      "grad_norm": 0.10142289847135544,
      "learning_rate": 0.00017042860009456638,
      "loss": 0.0445,
      "mean_token_accuracy": 0.988534786105156,
      "num_tokens": 7384967.0,
      "step": 1375
    },
    {
      "epoch": 3.3741701870850935,
      "grad_norm": 0.15665322542190552,
      "learning_rate": 0.00016627806471382066,
      "loss": 0.0532,
      "mean_token_accuracy": 0.9868290704488755,
      "num_tokens": 7509226.0,
      "step": 1400
    },
    {
      "epoch": 3.4345202172601086,
      "grad_norm": 0.15646246075630188,
      "learning_rate": 0.00016211487670603078,
      "loss": 0.0456,
      "mean_token_accuracy": 0.9882669430971146,
      "num_tokens": 7648911.0,
      "step": 1425
    },
    {
      "epoch": 3.4948702474351236,
      "grad_norm": 0.16921687126159668,
      "learning_rate": 0.0001579422720372715,
      "loss": 0.0568,
      "mean_token_accuracy": 0.9850279080867768,
      "num_tokens": 7777925.0,
      "step": 1450
    },
    {
      "epoch": 3.5552202776101387,
      "grad_norm": 0.10589733719825745,
      "learning_rate": 0.00015376349399300745,
      "loss": 0.0446,
      "mean_token_accuracy": 0.9881089746952056,
      "num_tokens": 7922053.0,
      "step": 1475
    },
    {
      "epoch": 3.6155703077851538,
      "grad_norm": 0.2955225110054016,
      "learning_rate": 0.0001495817906571492,
      "loss": 0.0544,
      "mean_token_accuracy": 0.9862786346673965,
      "num_tokens": 8049528.0,
      "step": 1500
    },
    {
      "epoch": 3.675920337960169,
      "grad_norm": 0.0742836520075798,
      "learning_rate": 0.00014540041238738055,
      "loss": 0.0456,
      "mean_token_accuracy": 0.9878502124547959,
      "num_tokens": 8190743.0,
      "step": 1525
    },
    {
      "epoch": 3.736270368135184,
      "grad_norm": 0.12437008321285248,
      "learning_rate": 0.00014122260928871734,
      "loss": 0.0535,
      "mean_token_accuracy": 0.9863007247447968,
      "num_tokens": 8316676.0,
      "step": 1550
    },
    {
      "epoch": 3.796620398310199,
      "grad_norm": 0.08342117071151733,
      "learning_rate": 0.00013705162868726396,
      "loss": 0.0447,
      "mean_token_accuracy": 0.9884078222513198,
      "num_tokens": 8457403.0,
      "step": 1575
    },
    {
      "epoch": 3.856970428485214,
      "grad_norm": 0.17910127341747284,
      "learning_rate": 0.00013289071260612855,
      "loss": 0.052,
      "mean_token_accuracy": 0.9868350481986999,
      "num_tokens": 8583976.0,
      "step": 1600
    },
    {
      "epoch": 3.9173204586602295,
      "grad_norm": 0.06760319322347641,
      "learning_rate": 0.00012874309524546083,
      "loss": 0.045,
      "mean_token_accuracy": 0.9878969532251358,
      "num_tokens": 8727702.0,
      "step": 1625
    },
    {
      "epoch": 3.9776704888352445,
      "grad_norm": 0.15349148213863373,
      "learning_rate": 0.00012461200046857084,
      "loss": 0.0508,
      "mean_token_accuracy": 0.987232791185379,
      "num_tokens": 8849683.0,
      "step": 1650
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.0635647177696228,
      "eval_mean_token_accuracy": 0.9845431846541327,
      "eval_num_tokens": 8894052.0,
      "eval_runtime": 60.4472,
      "eval_samples_per_second": 6.105,
      "eval_steps_per_second": 3.061,
      "step": 1660
    },
    {
      "epoch": 4.036210018105009,
      "grad_norm": 0.1430218666791916,
      "learning_rate": 0.00012050063929608123,
      "loss": 0.0441,
      "mean_token_accuracy": 0.9887785659622901,
      "num_tokens": 8992417.0,
      "step": 1675
    },
    {
      "epoch": 4.096560048280024,
      "grad_norm": 0.08819396048784256,
      "learning_rate": 0.0001164122074100633,
      "loss": 0.0406,
      "mean_token_accuracy": 0.9898108446598053,
      "num_tokens": 9122219.0,
      "step": 1700
    },
    {
      "epoch": 4.15691007845504,
      "grad_norm": 0.08506345748901367,
      "learning_rate": 0.00011234988267009415,
      "loss": 0.0457,
      "mean_token_accuracy": 0.9884944796562195,
      "num_tokens": 9262178.0,
      "step": 1725
    },
    {
      "epoch": 4.217260108630055,
      "grad_norm": 0.08111756294965744,
      "learning_rate": 0.00010831682264316787,
      "loss": 0.0416,
      "mean_token_accuracy": 0.989647062420845,
      "num_tokens": 9391026.0,
      "step": 1750
    },
    {
      "epoch": 4.27761013880507,
      "grad_norm": 0.07644202560186386,
      "learning_rate": 0.00010431616214937911,
      "loss": 0.046,
      "mean_token_accuracy": 0.9881355553865433,
      "num_tokens": 9531921.0,
      "step": 1775
    },
    {
      "epoch": 4.337960168980085,
      "grad_norm": 0.03971414268016815,
      "learning_rate": 0.00010035101082528777,
      "loss": 0.0413,
      "mean_token_accuracy": 0.9894955778121948,
      "num_tokens": 9661664.0,
      "step": 1800
    },
    {
      "epoch": 4.3983101991551,
      "grad_norm": 0.07305199652910233,
      "learning_rate": 9.642445070685809e-05,
      "loss": 0.0435,
      "mean_token_accuracy": 0.9884718316793442,
      "num_tokens": 9801214.0,
      "step": 1825
    },
    {
      "epoch": 4.458660229330115,
      "grad_norm": 0.07561074942350388,
      "learning_rate": 9.253953383385157e-05,
      "loss": 0.0411,
      "mean_token_accuracy": 0.9890134006738662,
      "num_tokens": 9929511.0,
      "step": 1850
    },
    {
      "epoch": 4.51901025950513,
      "grad_norm": 0.12152893096208572,
      "learning_rate": 8.869927987753459e-05,
      "loss": 0.0442,
      "mean_token_accuracy": 0.9886808878183365,
      "num_tokens": 10069489.0,
      "step": 1875
    },
    {
      "epoch": 4.579360289680145,
      "grad_norm": 0.05834532529115677,
      "learning_rate": 8.490667379354661e-05,
      "loss": 0.0417,
      "mean_token_accuracy": 0.9894009435176849,
      "num_tokens": 10197956.0,
      "step": 1900
    },
    {
      "epoch": 4.63971031985516,
      "grad_norm": 0.08154677599668503,
      "learning_rate": 8.116466350175079e-05,
      "loss": 0.0437,
      "mean_token_accuracy": 0.9884669744968414,
      "num_tokens": 10335860.0,
      "step": 1925
    },
    {
      "epoch": 4.700060350030175,
      "grad_norm": 0.07408758997917175,
      "learning_rate": 7.747615759487304e-05,
      "loss": 0.0411,
      "mean_token_accuracy": 0.989289864897728,
      "num_tokens": 10464303.0,
      "step": 1950
    },
    {
      "epoch": 4.76041038020519,
      "grad_norm": 0.0779808983206749,
      "learning_rate": 7.38440230777085e-05,
      "loss": 0.0453,
      "mean_token_accuracy": 0.9880505865812301,
      "num_tokens": 10601511.0,
      "step": 1975
    },
    {
      "epoch": 4.820760410380205,
      "grad_norm": 0.046579256653785706,
      "learning_rate": 7.027108313865378e-05,
      "loss": 0.0401,
      "mean_token_accuracy": 0.9896253395080566,
      "num_tokens": 10732156.0,
      "step": 2000
    },
    {
      "epoch": 4.88111044055522,
      "grad_norm": 0.07918152213096619,
      "learning_rate": 6.676011495529687e-05,
      "loss": 0.042,
      "mean_token_accuracy": 0.9889346659183502,
      "num_tokens": 10873235.0,
      "step": 2025
    },
    {
      "epoch": 4.941460470730235,
      "grad_norm": 0.04908803850412369,
      "learning_rate": 6.331384753577056e-05,
      "loss": 0.0408,
      "mean_token_accuracy": 0.9895875388383866,
      "num_tokens": 11001827.0,
      "step": 2050
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.22838693857192993,
      "learning_rate": 5.993495959754631e-05,
      "loss": 0.0452,
      "mean_token_accuracy": 0.9885991817897128,
      "num_tokens": 11117565.0,
      "step": 2075
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.06391309201717377,
      "eval_mean_token_accuracy": 0.9852847150854163,
      "eval_num_tokens": 11117565.0,
      "eval_runtime": 60.3576,
      "eval_samples_per_second": 6.114,
      "eval_steps_per_second": 3.065,
      "step": 2075
    }
  ],
  "logging_steps": 25,
  "max_steps": 2905,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 7,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.8080321362273075e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}