{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.694126339120798, "eval_steps": 1000, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03694126339120798, "grad_norm": 1.4473156929016113, "learning_rate": 6.6e-05, "loss": 2.0357, "mean_token_accuracy": 0.4905380755662918, "num_tokens": 246180.0, "step": 100 }, { "epoch": 0.07388252678241596, "grad_norm": 1.4902719259262085, "learning_rate": 0.00013266666666666667, "loss": 1.3937, "mean_token_accuracy": 0.5916463854908943, "num_tokens": 492915.0, "step": 200 }, { "epoch": 0.11082379017362394, "grad_norm": 1.3383102416992188, "learning_rate": 0.00019933333333333334, "loss": 1.3205, "mean_token_accuracy": 0.6068769115209579, "num_tokens": 740398.0, "step": 300 }, { "epoch": 0.1477650535648319, "grad_norm": 1.2365469932556152, "learning_rate": 0.000173421993904824, "loss": 1.3026, "mean_token_accuracy": 0.6126913416385651, "num_tokens": 988197.0, "step": 400 }, { "epoch": 0.1847063169560399, "grad_norm": 1.0308854579925537, "learning_rate": 0.0001550744859491231, "loss": 1.2537, "mean_token_accuracy": 0.6190469121932983, "num_tokens": 1237334.0, "step": 500 }, { "epoch": 0.22164758034724788, "grad_norm": 1.2729270458221436, "learning_rate": 0.00014153935488632152, "loss": 1.2353, "mean_token_accuracy": 0.6254772353172302, "num_tokens": 1483308.0, "step": 600 }, { "epoch": 0.25858884373845586, "grad_norm": 0.9841827750205994, "learning_rate": 0.00013102435641608367, "loss": 1.2182, "mean_token_accuracy": 0.6275931853055954, "num_tokens": 1730192.0, "step": 700 }, { "epoch": 0.2955301071296638, "grad_norm": 0.9242544174194336, "learning_rate": 0.00012255110553085002, "loss": 1.2049, "mean_token_accuracy": 0.6283232820034027, "num_tokens": 1980054.0, "step": 800 }, { "epoch": 0.33247137052087183, "grad_norm": 0.8931549787521362, "learning_rate": 0.00011553425737574005, "loss": 1.2017, "mean_token_accuracy": 0.6302745240926743, "num_tokens": 2228605.0, "step": 900 }, { "epoch": 0.3694126339120798, "grad_norm": 1.044004201889038, "learning_rate": 0.0001095993248702382, "loss": 1.2137, "step": 1000 }, { "epoch": 0.3694126339120798, "eval_loss": 1.1836973428726196, "eval_mean_token_accuracy": 0.6324496693611145, "eval_num_tokens": 2475392.0, "eval_runtime": 5.3895, "eval_samples_per_second": 185.547, "eval_steps_per_second": 23.193, "step": 1000 }, { "epoch": 0.40635389730328775, "grad_norm": 0.8837220072746277, "learning_rate": 0.00010449410169212441, "loss": 1.1854, "mean_token_accuracy": 0.6299630090594291, "num_tokens": 2724102.0, "step": 1100 }, { "epoch": 0.44329516069449576, "grad_norm": 0.8813680410385132, "learning_rate": 0.00010004169272643103, "loss": 1.1776, "mean_token_accuracy": 0.6349628627300262, "num_tokens": 2972072.0, "step": 1200 }, { "epoch": 0.4802364240857037, "grad_norm": 0.9930716753005981, "learning_rate": 9.611386626644256e-05, "loss": 1.1683, "mean_token_accuracy": 0.6352091038227081, "num_tokens": 3217529.0, "step": 1300 }, { "epoch": 0.5171776874769117, "grad_norm": 0.8394394516944885, "learning_rate": 9.261509270476351e-05, "loss": 1.1687, "mean_token_accuracy": 0.6363370817899704, "num_tokens": 3467819.0, "step": 1400 }, { "epoch": 0.5541189508681197, "grad_norm": 1.129971981048584, "learning_rate": 8.9472548255098e-05, "loss": 1.1541, "mean_token_accuracy": 0.6417357540130615, "num_tokens": 3712960.0, "step": 1500 }, { "epoch": 0.5910602142593276, "grad_norm": 1.100881576538086, "learning_rate": 8.662961636484199e-05, "loss": 1.1602, "mean_token_accuracy": 0.6412180256843567, "num_tokens": 3956425.0, "step": 1600 }, { "epoch": 0.6280014776505356, "grad_norm": 0.9382134079933167, "learning_rate": 8.40415267738742e-05, "loss": 1.1604, "mean_token_accuracy": 0.6395090478658676, "num_tokens": 4203009.0, "step": 1700 }, { "epoch": 0.6649427410417437, "grad_norm": 1.084293246269226, "learning_rate": 8.167234800792304e-05, "loss": 1.1352, "mean_token_accuracy": 0.646253719329834, "num_tokens": 4449448.0, "step": 1800 }, { "epoch": 0.7018840044329516, "grad_norm": 0.9637444019317627, "learning_rate": 7.949286335171643e-05, "loss": 1.1562, "mean_token_accuracy": 0.6406971418857574, "num_tokens": 4694282.0, "step": 1900 }, { "epoch": 0.7388252678241596, "grad_norm": 0.9237338304519653, "learning_rate": 7.747903910575024e-05, "loss": 1.1424, "step": 2000 }, { "epoch": 0.7388252678241596, "eval_loss": 1.1218078136444092, "eval_mean_token_accuracy": 0.6506093912124634, "eval_num_tokens": 4940833.0, "eval_runtime": 5.3952, "eval_samples_per_second": 185.351, "eval_steps_per_second": 23.169, "step": 2000 }, { "epoch": 0.7757665312153675, "grad_norm": 0.777606725692749, "learning_rate": 7.561089934060305e-05, "loss": 1.1313, "mean_token_accuracy": 0.6467883923649788, "num_tokens": 5190358.0, "step": 2100 }, { "epoch": 0.8127077946065755, "grad_norm": 0.8855065703392029, "learning_rate": 7.387168551531178e-05, "loss": 1.1309, "mean_token_accuracy": 0.6449691706895828, "num_tokens": 5438772.0, "step": 2200 }, { "epoch": 0.8496490579977836, "grad_norm": 1.2018849849700928, "learning_rate": 7.224721947627513e-05, "loss": 1.1246, "mean_token_accuracy": 0.6473777782917023, "num_tokens": 5682900.0, "step": 2300 }, { "epoch": 0.8865903213889915, "grad_norm": 0.9426067471504211, "learning_rate": 7.07254141150883e-05, "loss": 1.1241, "mean_token_accuracy": 0.647763032913208, "num_tokens": 5931817.0, "step": 2400 }, { "epoch": 0.9235315847801995, "grad_norm": 0.7986142039299011, "learning_rate": 6.929589286752371e-05, "loss": 1.1258, "mean_token_accuracy": 0.6497471231222153, "num_tokens": 6179818.0, "step": 2500 }, { "epoch": 0.9604728481714074, "grad_norm": 1.1682002544403076, "learning_rate": 6.794969055356698e-05, "loss": 1.1286, "mean_token_accuracy": 0.6473524701595307, "num_tokens": 6426250.0, "step": 2600 }, { "epoch": 0.9974141115626154, "grad_norm": 0.9333537220954895, "learning_rate": 6.667901577609308e-05, "loss": 1.1239, "mean_token_accuracy": 0.6484623271226883, "num_tokens": 6672929.0, "step": 2700 }, { "epoch": 1.0343553749538235, "grad_norm": 0.5627142190933228, "learning_rate": 6.547706044716512e-05, "loss": 1.0508, "mean_token_accuracy": 0.6667241591215134, "num_tokens": 6925125.0, "step": 2800 }, { "epoch": 1.0712966383450313, "grad_norm": 0.5715717077255249, "learning_rate": 6.433784577873342e-05, "loss": 1.0615, "mean_token_accuracy": 0.6637595742940903, "num_tokens": 7171997.0, "step": 2900 }, { "epoch": 1.1082379017362394, "grad_norm": 0.5389025211334229, "learning_rate": 6.325609676486509e-05, "loss": 1.0709, "step": 3000 }, { "epoch": 1.1082379017362394, "eval_loss": 1.0916837453842163, "eval_mean_token_accuracy": 0.6582915830612183, "eval_num_tokens": 7419279.0, "eval_runtime": 5.3894, "eval_samples_per_second": 185.548, "eval_steps_per_second": 23.193, "step": 3000 }, { "epoch": 1.1451791651274474, "grad_norm": 0.56490159034729, "learning_rate": 6.22271391287055e-05, "loss": 1.0581, "mean_token_accuracy": 0.662086527645588, "num_tokens": 7664383.0, "step": 3100 }, { "epoch": 1.1821204285186553, "grad_norm": 0.543954074382782, "learning_rate": 6.12468141320462e-05, "loss": 1.0615, "mean_token_accuracy": 0.6631740409135819, "num_tokens": 7912840.0, "step": 3200 }, { "epoch": 1.2190616919098634, "grad_norm": 0.5596346855163574, "learning_rate": 6.03114077000131e-05, "loss": 1.0395, "mean_token_accuracy": 0.671640704870224, "num_tokens": 8160289.0, "step": 3300 }, { "epoch": 1.2560029553010712, "grad_norm": 0.5955942869186401, "learning_rate": 5.9417591102230663e-05, "loss": 1.0567, "mean_token_accuracy": 0.6638083755970001, "num_tokens": 8407857.0, "step": 3400 }, { "epoch": 1.2929442186922793, "grad_norm": 0.5933428406715393, "learning_rate": 5.856237102757652e-05, "loss": 1.0636, "mean_token_accuracy": 0.6627275788784027, "num_tokens": 8655624.0, "step": 3500 }, { "epoch": 1.3298854820834873, "grad_norm": 0.5572307109832764, "learning_rate": 5.7743047343661814e-05, "loss": 1.0652, "mean_token_accuracy": 0.6624369341135025, "num_tokens": 8902821.0, "step": 3600 }, { "epoch": 1.3668267454746952, "grad_norm": 0.596443235874176, "learning_rate": 5.6957177181117404e-05, "loss": 1.0622, "mean_token_accuracy": 0.6634978985786438, "num_tokens": 9149878.0, "step": 3700 }, { "epoch": 1.4037680088659032, "grad_norm": 0.5873645544052124, "learning_rate": 5.620254425309578e-05, "loss": 1.0493, "mean_token_accuracy": 0.6654049742221833, "num_tokens": 9394858.0, "step": 3800 }, { "epoch": 1.440709272257111, "grad_norm": 0.59478759765625, "learning_rate": 5.547713253139649e-05, "loss": 1.0456, "mean_token_accuracy": 0.6683453869819641, "num_tokens": 9641008.0, "step": 3900 }, { "epoch": 1.4776505356483192, "grad_norm": 0.5986542701721191, "learning_rate": 5.477910356647767e-05, "loss": 1.043, "step": 4000 }, { "epoch": 1.4776505356483192, "eval_loss": 1.083065152168274, "eval_mean_token_accuracy": 0.6582373585700989, "eval_num_tokens": 9888284.0, "eval_runtime": 5.3783, "eval_samples_per_second": 185.933, "eval_steps_per_second": 23.242, "step": 4000 }, { "epoch": 1.5145917990395272, "grad_norm": 0.5704376697540283, "learning_rate": 5.410677686985887e-05, "loss": 1.0321, "mean_token_accuracy": 0.6689175629615783, "num_tokens": 10134094.0, "step": 4100 }, { "epoch": 1.551533062430735, "grad_norm": 0.5602062344551086, "learning_rate": 5.345861288192786e-05, "loss": 1.0441, "mean_token_accuracy": 0.6680737626552582, "num_tokens": 10381316.0, "step": 4200 }, { "epoch": 1.5884743258219431, "grad_norm": 0.5552584528923035, "learning_rate": 5.283319813188472e-05, "loss": 1.0369, "mean_token_accuracy": 0.6698204201459884, "num_tokens": 10628471.0, "step": 4300 }, { "epoch": 1.625415589213151, "grad_norm": 0.6024323105812073, "learning_rate": 5.222923226400155e-05, "loss": 1.0403, "mean_token_accuracy": 0.6691047704219818, "num_tokens": 10874430.0, "step": 4400 }, { "epoch": 1.662356852604359, "grad_norm": 0.5967562794685364, "learning_rate": 5.164551665900703e-05, "loss": 1.0483, "mean_token_accuracy": 0.6648873990774155, "num_tokens": 11123128.0, "step": 4500 }, { "epoch": 1.6992981159955671, "grad_norm": 0.5668358206748962, "learning_rate": 5.1080944423879696e-05, "loss": 1.0392, "mean_token_accuracy": 0.6674597597122193, "num_tokens": 11368003.0, "step": 4600 }, { "epoch": 1.736239379386775, "grad_norm": 0.6373595595359802, "learning_rate": 5.053449155971992e-05, "loss": 1.0404, "mean_token_accuracy": 0.6689798641204834, "num_tokens": 11615194.0, "step": 4700 }, { "epoch": 1.773180642777983, "grad_norm": 0.6193637847900391, "learning_rate": 5.0005209147276734e-05, "loss": 1.0355, "mean_token_accuracy": 0.6675721609592438, "num_tokens": 11863548.0, "step": 4800 }, { "epoch": 1.8101219061691909, "grad_norm": 0.5770505666732788, "learning_rate": 4.949221641439499e-05, "loss": 1.0316, "mean_token_accuracy": 0.6724146312475204, "num_tokens": 12111554.0, "step": 4900 }, { "epoch": 1.847063169560399, "grad_norm": 0.6453244686126709, "learning_rate": 4.899469457011854e-05, "loss": 1.0348, "step": 5000 }, { "epoch": 1.847063169560399, "eval_loss": 1.0698254108428955, "eval_mean_token_accuracy": 0.66366588306427, "eval_num_tokens": 12359520.0, "eval_runtime": 5.3865, "eval_samples_per_second": 185.649, "eval_steps_per_second": 23.206, "step": 5000 }, { "epoch": 1.884004432951607, "grad_norm": 0.6129611730575562, "learning_rate": 4.851188130722481e-05, "loss": 1.0374, "mean_token_accuracy": 0.6697911691665649, "num_tokens": 12607238.0, "step": 5100 }, { "epoch": 1.920945696342815, "grad_norm": 0.6002670526504517, "learning_rate": 4.804306588920635e-05, "loss": 1.035, "mean_token_accuracy": 0.6703519684076309, "num_tokens": 12855037.0, "step": 5200 }, { "epoch": 1.957886959734023, "grad_norm": 0.5875000953674316, "learning_rate": 4.758758474966023e-05, "loss": 1.0348, "mean_token_accuracy": 0.6686445927619934, "num_tokens": 13100596.0, "step": 5300 }, { "epoch": 1.9948282231252308, "grad_norm": 0.587979257106781, "learning_rate": 4.7144817542100825e-05, "loss": 1.0375, "mean_token_accuracy": 0.6698001223802567, "num_tokens": 13349667.0, "step": 5400 }, { "epoch": 2.031769486516439, "grad_norm": 0.5400444269180298, "learning_rate": 4.671418358670517e-05, "loss": 1.0064, "mean_token_accuracy": 0.6771922719478607, "num_tokens": 13599712.0, "step": 5500 }, { "epoch": 2.068710749907647, "grad_norm": 0.5831236839294434, "learning_rate": 4.6295138667698956e-05, "loss": 0.9874, "mean_token_accuracy": 0.6801465088129044, "num_tokens": 13845238.0, "step": 5600 }, { "epoch": 2.105652013298855, "grad_norm": 0.559648334980011, "learning_rate": 4.5887172141209994e-05, "loss": 0.9824, "mean_token_accuracy": 0.6835214233398438, "num_tokens": 14094078.0, "step": 5700 }, { "epoch": 2.1425932766900626, "grad_norm": 0.5524799227714539, "learning_rate": 4.548980431863551e-05, "loss": 0.9852, "mean_token_accuracy": 0.6822834074497223, "num_tokens": 14342112.0, "step": 5800 }, { "epoch": 2.1795345400812707, "grad_norm": 0.586271345615387, "learning_rate": 4.510258409503273e-05, "loss": 0.9807, "mean_token_accuracy": 0.6823082774877548, "num_tokens": 14587875.0, "step": 5900 }, { "epoch": 2.2164758034724787, "grad_norm": 0.599862277507782, "learning_rate": 4.472508679587051e-05, "loss": 0.9741, "step": 6000 }, { "epoch": 2.2164758034724787, "eval_loss": 1.071539282798767, "eval_mean_token_accuracy": 0.6636041073799134, "eval_num_tokens": 14833253.0, "eval_runtime": 5.3837, "eval_samples_per_second": 185.745, "eval_steps_per_second": 23.218, "step": 6000 }, { "epoch": 2.253417066863687, "grad_norm": 0.5849953293800354, "learning_rate": 4.435691221877225e-05, "loss": 0.9784, "mean_token_accuracy": 0.6845134419202804, "num_tokens": 15079437.0, "step": 6100 }, { "epoch": 2.290358330254895, "grad_norm": 0.5756722688674927, "learning_rate": 4.399768284971994e-05, "loss": 0.9843, "mean_token_accuracy": 0.6839412766695022, "num_tokens": 15326743.0, "step": 6200 }, { "epoch": 2.3272995936461025, "grad_norm": 0.5707868933677673, "learning_rate": 4.364704223564281e-05, "loss": 0.9901, "mean_token_accuracy": 0.6811071854829788, "num_tokens": 15572363.0, "step": 6300 }, { "epoch": 2.3642408570373106, "grad_norm": 0.6192522048950195, "learning_rate": 4.330465349744206e-05, "loss": 0.9762, "mean_token_accuracy": 0.6834132850170136, "num_tokens": 15818229.0, "step": 6400 }, { "epoch": 2.4011821204285186, "grad_norm": 0.5887159109115601, "learning_rate": 4.2970197969350315e-05, "loss": 0.9797, "mean_token_accuracy": 0.6834959721565247, "num_tokens": 16066092.0, "step": 6500 }, { "epoch": 2.4381233838197267, "grad_norm": 0.6107765436172485, "learning_rate": 4.264337395213374e-05, "loss": 0.97, "mean_token_accuracy": 0.6855223393440246, "num_tokens": 16314390.0, "step": 6600 }, { "epoch": 2.4750646472109348, "grad_norm": 0.5866128206253052, "learning_rate": 4.232389556904849e-05, "loss": 0.9794, "mean_token_accuracy": 0.6824937015771866, "num_tokens": 16560430.0, "step": 6700 }, { "epoch": 2.5120059106021424, "grad_norm": 0.5723136067390442, "learning_rate": 4.201149171469091e-05, "loss": 0.9805, "mean_token_accuracy": 0.6831120592355728, "num_tokens": 16807172.0, "step": 6800 }, { "epoch": 2.5489471739933505, "grad_norm": 0.5534746050834656, "learning_rate": 4.170590508795705e-05, "loss": 0.9853, "mean_token_accuracy": 0.679190359711647, "num_tokens": 17054725.0, "step": 6900 }, { "epoch": 2.5858884373845585, "grad_norm": 0.6190086007118225, "learning_rate": 4.1406891301271574e-05, "loss": 0.979, "step": 7000 }, { "epoch": 2.5858884373845585, "eval_loss": 1.0672271251678467, "eval_mean_token_accuracy": 0.665556743144989, "eval_num_tokens": 17301619.0, "eval_runtime": 5.3864, "eval_samples_per_second": 185.654, "eval_steps_per_second": 23.207, "step": 7000 }, { "epoch": 2.6228297007757666, "grad_norm": 0.604921281337738, "learning_rate": 4.111421805907759e-05, "loss": 0.9844, "mean_token_accuracy": 0.6844497114419937, "num_tokens": 17546632.0, "step": 7100 }, { "epoch": 2.6597709641669747, "grad_norm": 0.6087679862976074, "learning_rate": 4.082766439931165e-05, "loss": 0.9871, "mean_token_accuracy": 0.6810534721612931, "num_tokens": 17797482.0, "step": 7200 }, { "epoch": 2.6967122275581827, "grad_norm": 0.6020961999893188, "learning_rate": 4.054701999223518e-05, "loss": 0.9839, "mean_token_accuracy": 0.6829093122482299, "num_tokens": 18043599.0, "step": 7300 }, { "epoch": 2.7336534909493904, "grad_norm": 0.6339052319526672, "learning_rate": 4.0272084491566247e-05, "loss": 0.9863, "mean_token_accuracy": 0.6820144325494766, "num_tokens": 18287793.0, "step": 7400 }, { "epoch": 2.7705947543405984, "grad_norm": 0.6206592321395874, "learning_rate": 4.000266693336297e-05, "loss": 0.9709, "mean_token_accuracy": 0.6874477595090867, "num_tokens": 18535420.0, "step": 7500 }, { "epoch": 2.8075360177318065, "grad_norm": 0.6235191822052002, "learning_rate": 3.973858517856019e-05, "loss": 0.9734, "mean_token_accuracy": 0.6847814846038819, "num_tokens": 18784286.0, "step": 7600 }, { "epoch": 2.8444772811230146, "grad_norm": 0.6308836340904236, "learning_rate": 3.947966539546186e-05, "loss": 0.9813, "mean_token_accuracy": 0.6831617254018784, "num_tokens": 19035813.0, "step": 7700 }, { "epoch": 2.881418544514222, "grad_norm": 0.6001960039138794, "learning_rate": 3.922574157884801e-05, "loss": 0.987, "mean_token_accuracy": 0.6807804244756699, "num_tokens": 19282122.0, "step": 7800 }, { "epoch": 2.9183598079054303, "grad_norm": 0.6059972643852234, "learning_rate": 3.8976655102673755e-05, "loss": 0.9859, "mean_token_accuracy": 0.6820109623670578, "num_tokens": 19529782.0, "step": 7900 }, { "epoch": 2.9553010712966383, "grad_norm": 0.640872061252594, "learning_rate": 3.873225430362181e-05, "loss": 0.9761, "step": 8000 }, { "epoch": 2.9553010712966383, "eval_loss": 1.0601364374160767, "eval_mean_token_accuracy": 0.6661491613388062, "eval_num_tokens": 19777841.0, "eval_runtime": 5.4066, "eval_samples_per_second": 184.959, "eval_steps_per_second": 23.12, "step": 8000 }, { "epoch": 2.9922423346878464, "grad_norm": 0.6132466793060303, "learning_rate": 3.8492394093024636e-05, "loss": 0.9711, "mean_token_accuracy": 0.6846305218338966, "num_tokens": 20023858.0, "step": 8100 }, { "epoch": 3.0291835980790545, "grad_norm": 0.5972515940666199, "learning_rate": 3.825693559490006e-05, "loss": 0.9376, "mean_token_accuracy": 0.692512179017067, "num_tokens": 20270581.0, "step": 8200 }, { "epoch": 3.066124861470262, "grad_norm": 0.6202205419540405, "learning_rate": 3.8025745808048846e-05, "loss": 0.9307, "mean_token_accuracy": 0.6948988193273544, "num_tokens": 20516819.0, "step": 8300 }, { "epoch": 3.10306612486147, "grad_norm": 0.585482120513916, "learning_rate": 3.779869729034645e-05, "loss": 0.935, "mean_token_accuracy": 0.6954682809114456, "num_tokens": 20763559.0, "step": 8400 }, { "epoch": 3.1400073882526782, "grad_norm": 0.6358840465545654, "learning_rate": 3.7575667863526335e-05, "loss": 0.9292, "mean_token_accuracy": 0.6980463570356369, "num_tokens": 21011357.0, "step": 8500 }, { "epoch": 3.1769486516438863, "grad_norm": 0.6181186437606812, "learning_rate": 3.735654033690154e-05, "loss": 0.9229, "mean_token_accuracy": 0.696455385684967, "num_tokens": 21259566.0, "step": 8600 }, { "epoch": 3.2138899150350944, "grad_norm": 0.6604560017585754, "learning_rate": 3.7141202248604964e-05, "loss": 0.9285, "mean_token_accuracy": 0.6962138444185257, "num_tokens": 21506213.0, "step": 8700 }, { "epoch": 3.250831178426302, "grad_norm": 0.5987362265586853, "learning_rate": 3.6929545623050815e-05, "loss": 0.929, "mean_token_accuracy": 0.6957518076896667, "num_tokens": 21754908.0, "step": 8800 }, { "epoch": 3.28777244181751, "grad_norm": 0.5980191230773926, "learning_rate": 3.6721466743428706e-05, "loss": 0.938, "mean_token_accuracy": 0.6953296983242034, "num_tokens": 22001522.0, "step": 8900 }, { "epoch": 3.324713705208718, "grad_norm": 0.6171393990516663, "learning_rate": 3.6516865938141736e-05, "loss": 0.9364, "step": 9000 }, { "epoch": 3.324713705208718, "eval_loss": 1.0706262588500977, "eval_mean_token_accuracy": 0.6642319107055664, "eval_num_tokens": 22248126.0, "eval_runtime": 5.3921, "eval_samples_per_second": 185.455, "eval_steps_per_second": 23.182, "step": 9000 }, { "epoch": 3.361654968599926, "grad_norm": 0.6272869110107422, "learning_rate": 3.6315647380189556e-05, "loss": 0.919, "mean_token_accuracy": 0.6962304222583771, "num_tokens": 22494098.0, "step": 9100 }, { "epoch": 3.3985962319911343, "grad_norm": 0.6461876034736633, "learning_rate": 3.611771889857922e-05, "loss": 0.9331, "mean_token_accuracy": 0.6959864324331284, "num_tokens": 22741819.0, "step": 9200 }, { "epoch": 3.4355374953823423, "grad_norm": 0.5855452418327332, "learning_rate": 3.592299180092082e-05, "loss": 0.9283, "mean_token_accuracy": 0.6968681657314301, "num_tokens": 22988621.0, "step": 9300 }, { "epoch": 3.47247875877355, "grad_norm": 0.6252483129501343, "learning_rate": 3.573138070643225e-05, "loss": 0.9271, "mean_token_accuracy": 0.6972170048952102, "num_tokens": 23235818.0, "step": 9400 }, { "epoch": 3.509420022164758, "grad_norm": 0.6152076125144958, "learning_rate": 3.554280338863896e-05, "loss": 0.9244, "mean_token_accuracy": 0.699801824092865, "num_tokens": 23483243.0, "step": 9500 }, { "epoch": 3.546361285555966, "grad_norm": 0.6049486398696899, "learning_rate": 3.535718062711045e-05, "loss": 0.9365, "mean_token_accuracy": 0.6951554995775223, "num_tokens": 23730501.0, "step": 9600 }, { "epoch": 3.583302548947174, "grad_norm": 0.6222932934761047, "learning_rate": 3.517443606762636e-05, "loss": 0.9374, "mean_token_accuracy": 0.6915060871839523, "num_tokens": 23978205.0, "step": 9700 }, { "epoch": 3.6202438123383818, "grad_norm": 0.6344577670097351, "learning_rate": 3.499449609021135e-05, "loss": 0.9252, "mean_token_accuracy": 0.6969369679689408, "num_tokens": 24224944.0, "step": 9800 }, { "epoch": 3.65718507572959, "grad_norm": 0.600951611995697, "learning_rate": 3.4817289684521056e-05, "loss": 0.9226, "mean_token_accuracy": 0.6991156005859375, "num_tokens": 24470936.0, "step": 9900 }, { "epoch": 3.694126339120798, "grad_norm": 0.6917585134506226, "learning_rate": 3.4642748332099756e-05, "loss": 0.935, "step": 10000 }, { "epoch": 3.694126339120798, "eval_loss": 1.0667781829833984, "eval_mean_token_accuracy": 0.6657101097106933, "eval_num_tokens": 24717605.0, "eval_runtime": 5.4034, "eval_samples_per_second": 185.07, "eval_steps_per_second": 23.134, "step": 10000 } ], "logging_steps": 100, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9870782814052352e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }