{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.0,
  "eval_steps": 500,
  "global_step": 1832,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.39035615921020506,
      "epoch": 0.04381161007667032,
      "grad_norm": 19.073434829711914,
      "learning_rate": 2e-05,
      "loss": 4.3018,
      "mean_token_accuracy": 0.5682853847742081,
      "num_tokens": 29485.0,
      "step": 10
    },
    {
      "entropy": 0.7484827220439911,
      "epoch": 0.08762322015334063,
      "grad_norm": 3.970487356185913,
      "learning_rate": 2e-05,
      "loss": 1.9588,
      "mean_token_accuracy": 0.6895880490541458,
      "num_tokens": 58977.0,
      "step": 20
    },
    {
      "entropy": 1.1801635682582856,
      "epoch": 0.13143483023001096,
      "grad_norm": 2.532576322555542,
      "learning_rate": 2e-05,
      "loss": 1.431,
      "mean_token_accuracy": 0.7343599200248718,
      "num_tokens": 88402.0,
      "step": 30
    },
    {
      "entropy": 1.1498825669288635,
      "epoch": 0.17524644030668127,
      "grad_norm": 1.982933521270752,
      "learning_rate": 2e-05,
      "loss": 1.1355,
      "mean_token_accuracy": 0.7867384225130081,
      "num_tokens": 117810.0,
      "step": 40
    },
    {
      "entropy": 0.8834485694766044,
      "epoch": 0.21905805038335158,
      "grad_norm": 1.8390016555786133,
      "learning_rate": 2e-05,
      "loss": 0.8543,
      "mean_token_accuracy": 0.8341205582022667,
      "num_tokens": 147327.0,
      "step": 50
    },
    {
      "entropy": 0.5984191231429576,
      "epoch": 0.2628696604600219,
      "grad_norm": 1.747591257095337,
      "learning_rate": 2e-05,
      "loss": 0.5852,
      "mean_token_accuracy": 0.8857637628912925,
      "num_tokens": 176791.0,
      "step": 60
    },
    {
      "entropy": 0.3483880817890167,
      "epoch": 0.3066812705366922,
      "grad_norm": 1.5721803903579712,
      "learning_rate": 2e-05,
      "loss": 0.3545,
      "mean_token_accuracy": 0.9292098119854927,
      "num_tokens": 206271.0,
      "step": 70
    },
    {
      "entropy": 0.20057316161692143,
      "epoch": 0.35049288061336253,
      "grad_norm": 1.2334290742874146,
      "learning_rate": 2e-05,
      "loss": 0.1877,
      "mean_token_accuracy": 0.9710112065076828,
      "num_tokens": 235692.0,
      "step": 80
    },
    {
      "entropy": 0.12095212489366532,
      "epoch": 0.39430449069003287,
      "grad_norm": 0.8091076016426086,
      "learning_rate": 2e-05,
      "loss": 0.1073,
      "mean_token_accuracy": 0.9890251606702805,
      "num_tokens": 265133.0,
      "step": 90
    },
    {
      "entropy": 0.10107735879719257,
      "epoch": 0.43811610076670315,
      "grad_norm": 0.6297779679298401,
      "learning_rate": 2e-05,
      "loss": 0.085,
      "mean_token_accuracy": 0.9887343898415566,
      "num_tokens": 294684.0,
      "step": 100
    },
    {
      "entropy": 0.08415136393159628,
      "epoch": 0.4819277108433735,
      "grad_norm": 0.47038641571998596,
      "learning_rate": 2e-05,
      "loss": 0.0702,
      "mean_token_accuracy": 0.9906241714954376,
      "num_tokens": 324140.0,
      "step": 110
    },
    {
      "entropy": 0.07522545410320162,
      "epoch": 0.5257393209200438,
      "grad_norm": 0.44743451476097107,
      "learning_rate": 2e-05,
      "loss": 0.0682,
      "mean_token_accuracy": 0.9909884691238403,
      "num_tokens": 353646.0,
      "step": 120
    },
    {
      "entropy": 0.07190036196261644,
      "epoch": 0.5695509309967142,
      "grad_norm": 0.4326375722885132,
      "learning_rate": 2e-05,
      "loss": 0.0679,
      "mean_token_accuracy": 0.990379473567009,
      "num_tokens": 383164.0,
      "step": 130
    },
    {
      "entropy": 0.06936099929735065,
      "epoch": 0.6133625410733844,
      "grad_norm": 0.3178843557834625,
      "learning_rate": 2e-05,
      "loss": 0.0638,
      "mean_token_accuracy": 0.9907173082232476,
      "num_tokens": 412692.0,
      "step": 140
    },
    {
      "entropy": 0.06100328806787729,
      "epoch": 0.6571741511500547,
      "grad_norm": 0.3546409606933594,
      "learning_rate": 2e-05,
      "loss": 0.056,
      "mean_token_accuracy": 0.9925005912780762,
      "num_tokens": 442159.0,
      "step": 150
    },
    {
      "entropy": 0.060952140018343925,
      "epoch": 0.7009857612267251,
      "grad_norm": 0.34292343258857727,
      "learning_rate": 2e-05,
      "loss": 0.0551,
      "mean_token_accuracy": 0.991845327615738,
      "num_tokens": 471592.0,
      "step": 160
    },
    {
      "entropy": 0.06124872919172049,
      "epoch": 0.7447973713033954,
      "grad_norm": 0.3005734384059906,
      "learning_rate": 2e-05,
      "loss": 0.0552,
      "mean_token_accuracy": 0.9918943449854851,
      "num_tokens": 501090.0,
      "step": 170
    },
    {
      "entropy": 0.05876323413103819,
      "epoch": 0.7886089813800657,
      "grad_norm": 0.24807888269424438,
      "learning_rate": 2e-05,
      "loss": 0.0517,
      "mean_token_accuracy": 0.9916606426239014,
      "num_tokens": 530595.0,
      "step": 180
    },
    {
      "entropy": 0.058617806807160375,
      "epoch": 0.8324205914567361,
      "grad_norm": 0.4288617968559265,
      "learning_rate": 2e-05,
      "loss": 0.0572,
      "mean_token_accuracy": 0.9911757484078407,
      "num_tokens": 560059.0,
      "step": 190
    },
    {
      "entropy": 0.05736046200618148,
      "epoch": 0.8762322015334063,
      "grad_norm": 0.3320342004299164,
      "learning_rate": 2e-05,
      "loss": 0.05,
      "mean_token_accuracy": 0.9926259219646454,
      "num_tokens": 589375.0,
      "step": 200
    },
    {
      "entropy": 0.05618324866518378,
      "epoch": 0.9200438116100766,
      "grad_norm": 0.33638137578964233,
      "learning_rate": 2e-05,
      "loss": 0.0525,
      "mean_token_accuracy": 0.9919258087873459,
      "num_tokens": 618850.0,
      "step": 210
    },
    {
      "entropy": 0.05968485539779067,
      "epoch": 0.963855421686747,
      "grad_norm": 0.5651094317436218,
      "learning_rate": 2e-05,
      "loss": 0.052,
      "mean_token_accuracy": 0.9916668817400932,
      "num_tokens": 648391.0,
      "step": 220
    },
    {
      "entropy": 0.060051426794883365,
      "epoch": 1.004381161007667,
      "grad_norm": 0.31502196192741394,
      "learning_rate": 2e-05,
      "loss": 0.0486,
      "mean_token_accuracy": 0.9919279153282577,
      "num_tokens": 675667.0,
      "step": 230
    },
    {
      "entropy": 0.05292966021224856,
      "epoch": 1.0481927710843373,
      "grad_norm": 0.40586429834365845,
      "learning_rate": 2e-05,
      "loss": 0.0475,
      "mean_token_accuracy": 0.9920881032943726,
      "num_tokens": 705116.0,
      "step": 240
    },
    {
      "entropy": 0.052853992022573945,
      "epoch": 1.0920043811610076,
      "grad_norm": 0.2613174021244049,
      "learning_rate": 2e-05,
      "loss": 0.045,
      "mean_token_accuracy": 0.9926601052284241,
      "num_tokens": 734545.0,
      "step": 250
    },
    {
      "entropy": 0.052618366107344626,
      "epoch": 1.135815991237678,
      "grad_norm": 0.3835960030555725,
      "learning_rate": 2e-05,
      "loss": 0.044,
      "mean_token_accuracy": 0.9924288704991341,
      "num_tokens": 764013.0,
      "step": 260
    },
    {
      "entropy": 0.052085249312222005,
      "epoch": 1.1796276013143483,
      "grad_norm": 0.35782596468925476,
      "learning_rate": 2e-05,
      "loss": 0.046,
      "mean_token_accuracy": 0.9919404909014702,
      "num_tokens": 793540.0,
      "step": 270
    },
    {
      "entropy": 0.05476817348971963,
      "epoch": 1.2234392113910186,
      "grad_norm": 0.30708274245262146,
      "learning_rate": 2e-05,
      "loss": 0.0474,
      "mean_token_accuracy": 0.9918309196829795,
      "num_tokens": 823042.0,
      "step": 280
    },
    {
      "entropy": 0.05322778979316354,
      "epoch": 1.267250821467689,
      "grad_norm": 0.2604309022426605,
      "learning_rate": 2e-05,
      "loss": 0.0424,
      "mean_token_accuracy": 0.9924165293574333,
      "num_tokens": 852457.0,
      "step": 290
    },
    {
      "entropy": 0.04979227380827069,
      "epoch": 1.3110624315443593,
      "grad_norm": 0.334300696849823,
      "learning_rate": 2e-05,
      "loss": 0.0404,
      "mean_token_accuracy": 0.9923597663640976,
      "num_tokens": 881910.0,
      "step": 300
    },
    {
      "entropy": 0.05080299507826567,
      "epoch": 1.3548740416210296,
      "grad_norm": 0.3369467854499817,
      "learning_rate": 2e-05,
      "loss": 0.0429,
      "mean_token_accuracy": 0.9918962031602859,
      "num_tokens": 911418.0,
      "step": 310
    },
    {
      "entropy": 0.052211628574877975,
      "epoch": 1.3986856516976998,
      "grad_norm": 0.49774906039237976,
      "learning_rate": 2e-05,
      "loss": 0.0442,
      "mean_token_accuracy": 0.9922201976180076,
      "num_tokens": 940867.0,
      "step": 320
    },
    {
      "entropy": 0.05023647788912058,
      "epoch": 1.44249726177437,
      "grad_norm": 0.3457210659980774,
      "learning_rate": 2e-05,
      "loss": 0.0406,
      "mean_token_accuracy": 0.9923922121524811,
      "num_tokens": 970331.0,
      "step": 330
    },
    {
      "entropy": 0.047794731613248584,
      "epoch": 1.4863088718510404,
      "grad_norm": 0.35947972536087036,
      "learning_rate": 2e-05,
      "loss": 0.0417,
      "mean_token_accuracy": 0.992406377196312,
      "num_tokens": 999841.0,
      "step": 340
    },
    {
      "entropy": 0.047655423637479544,
      "epoch": 1.5301204819277108,
      "grad_norm": 0.37163057923316956,
      "learning_rate": 2e-05,
      "loss": 0.0405,
      "mean_token_accuracy": 0.9928415760397911,
      "num_tokens": 1029329.0,
      "step": 350
    },
    {
      "entropy": 0.04932913850061595,
      "epoch": 1.5739320920043811,
      "grad_norm": 0.34155628085136414,
      "learning_rate": 2e-05,
      "loss": 0.0409,
      "mean_token_accuracy": 0.992286778986454,
      "num_tokens": 1058772.0,
      "step": 360
    },
    {
      "entropy": 0.05013784933835268,
      "epoch": 1.6177437020810514,
      "grad_norm": 0.3228084444999695,
      "learning_rate": 2e-05,
      "loss": 0.0385,
      "mean_token_accuracy": 0.9925702095031739,
      "num_tokens": 1088250.0,
      "step": 370
    },
    {
      "entropy": 0.04742059959098697,
      "epoch": 1.6615553121577218,
      "grad_norm": 0.2427096664905548,
      "learning_rate": 2e-05,
      "loss": 0.0348,
      "mean_token_accuracy": 0.9927995279431343,
      "num_tokens": 1117693.0,
      "step": 380
    },
    {
      "entropy": 0.04431099114008248,
      "epoch": 1.7053669222343921,
      "grad_norm": 0.2118023931980133,
      "learning_rate": 2e-05,
      "loss": 0.0349,
      "mean_token_accuracy": 0.9929380178451538,
      "num_tokens": 1147140.0,
      "step": 390
    },
    {
      "entropy": 0.04604467884637416,
      "epoch": 1.7491785323110625,
      "grad_norm": 0.25765758752822876,
      "learning_rate": 2e-05,
      "loss": 0.037,
      "mean_token_accuracy": 0.9926380544900895,
      "num_tokens": 1176644.0,
      "step": 400
    },
    {
      "entropy": 0.04647672027349472,
      "epoch": 1.7929901423877328,
      "grad_norm": 0.2914351522922516,
      "learning_rate": 2e-05,
      "loss": 0.0362,
      "mean_token_accuracy": 0.99278933852911,
      "num_tokens": 1206205.0,
      "step": 410
    },
    {
      "entropy": 0.04514645580202341,
      "epoch": 1.8368017524644031,
      "grad_norm": 0.23769572377204895,
      "learning_rate": 2e-05,
      "loss": 0.0375,
      "mean_token_accuracy": 0.992934164404869,
      "num_tokens": 1235651.0,
      "step": 420
    },
    {
      "entropy": 0.044267228711396454,
      "epoch": 1.8806133625410735,
      "grad_norm": 0.22526511549949646,
      "learning_rate": 2e-05,
      "loss": 0.0331,
      "mean_token_accuracy": 0.9943611547350883,
      "num_tokens": 1265105.0,
      "step": 430
    },
    {
      "entropy": 0.04463120717555284,
      "epoch": 1.9244249726177438,
      "grad_norm": 0.27114230394363403,
      "learning_rate": 2e-05,
      "loss": 0.0348,
      "mean_token_accuracy": 0.99424988925457,
      "num_tokens": 1294507.0,
      "step": 440
    },
    {
      "entropy": 0.045189128536731,
      "epoch": 1.9682365826944141,
      "grad_norm": 0.3721632957458496,
      "learning_rate": 2e-05,
      "loss": 0.0348,
      "mean_token_accuracy": 0.9939030453562736,
      "num_tokens": 1324016.0,
      "step": 450
    },
    {
      "entropy": 0.04366534243564348,
      "epoch": 2.008762322015334,
      "grad_norm": 0.2200087010860443,
      "learning_rate": 2e-05,
      "loss": 0.0344,
      "mean_token_accuracy": 0.9943878634555919,
      "num_tokens": 1351286.0,
      "step": 460
    },
    {
      "entropy": 0.04503620527684689,
      "epoch": 2.0525739320920042,
      "grad_norm": 0.2603790760040283,
      "learning_rate": 2e-05,
      "loss": 0.0321,
      "mean_token_accuracy": 0.9944434046745301,
      "num_tokens": 1380791.0,
      "step": 470
    },
    {
      "entropy": 0.04118769532069564,
      "epoch": 2.0963855421686746,
      "grad_norm": 0.29507240653038025,
      "learning_rate": 2e-05,
      "loss": 0.0314,
      "mean_token_accuracy": 0.994631552696228,
      "num_tokens": 1410230.0,
      "step": 480
    },
    {
      "entropy": 0.04305135570466519,
      "epoch": 2.140197152245345,
      "grad_norm": 0.2874581515789032,
      "learning_rate": 2e-05,
      "loss": 0.0325,
      "mean_token_accuracy": 0.9944758623838424,
      "num_tokens": 1439719.0,
      "step": 490
    },
    {
      "entropy": 0.0408366883173585,
      "epoch": 2.1840087623220152,
      "grad_norm": 0.3915460407733917,
      "learning_rate": 2e-05,
      "loss": 0.0304,
      "mean_token_accuracy": 0.9943425431847572,
      "num_tokens": 1469243.0,
      "step": 500
    },
    {
      "entropy": 0.041507516894489525,
      "epoch": 2.2278203723986856,
      "grad_norm": 0.2981090545654297,
      "learning_rate": 2e-05,
      "loss": 0.0299,
      "mean_token_accuracy": 0.9945579648017884,
      "num_tokens": 1498662.0,
      "step": 510
    },
    {
      "entropy": 0.03967566112987697,
      "epoch": 2.271631982475356,
      "grad_norm": 0.38977038860321045,
      "learning_rate": 2e-05,
      "loss": 0.0322,
      "mean_token_accuracy": 0.9944061517715455,
      "num_tokens": 1528141.0,
      "step": 520
    },
    {
      "entropy": 0.039826968545094134,
      "epoch": 2.3154435925520263,
      "grad_norm": 0.3456617295742035,
      "learning_rate": 2e-05,
      "loss": 0.0285,
      "mean_token_accuracy": 0.9949691370129585,
      "num_tokens": 1557578.0,
      "step": 530
    },
    {
      "entropy": 0.0421893454156816,
      "epoch": 2.3592552026286966,
      "grad_norm": 0.3418237566947937,
      "learning_rate": 2e-05,
      "loss": 0.0313,
      "mean_token_accuracy": 0.9939802244305611,
      "num_tokens": 1587144.0,
      "step": 540
    },
    {
      "entropy": 0.0395780008751899,
      "epoch": 2.403066812705367,
      "grad_norm": 0.30387938022613525,
      "learning_rate": 2e-05,
      "loss": 0.0292,
      "mean_token_accuracy": 0.9946425825357437,
      "num_tokens": 1616658.0,
      "step": 550
    },
    {
      "entropy": 0.03764454615302384,
      "epoch": 2.4468784227820373,
      "grad_norm": 0.33084434270858765,
      "learning_rate": 2e-05,
      "loss": 0.0296,
      "mean_token_accuracy": 0.9947469428181648,
      "num_tokens": 1646176.0,
      "step": 560
    },
    {
      "entropy": 0.037794701755046844,
      "epoch": 2.4906900328587076,
      "grad_norm": 0.2900339365005493,
      "learning_rate": 2e-05,
      "loss": 0.0294,
      "mean_token_accuracy": 0.9943759799003601,
      "num_tokens": 1675707.0,
      "step": 570
    },
    {
      "entropy": 0.03779870173893869,
      "epoch": 2.534501642935378,
      "grad_norm": 0.2534691095352173,
      "learning_rate": 2e-05,
      "loss": 0.0296,
      "mean_token_accuracy": 0.9940598547458649,
      "num_tokens": 1705162.0,
      "step": 580
    },
    {
      "entropy": 0.036255474342033266,
      "epoch": 2.5783132530120483,
      "grad_norm": 0.35891082882881165,
      "learning_rate": 2e-05,
      "loss": 0.0276,
      "mean_token_accuracy": 0.9944613263010978,
      "num_tokens": 1734615.0,
      "step": 590
    },
    {
      "entropy": 0.03256695491727442,
      "epoch": 2.6221248630887186,
      "grad_norm": 0.2845175266265869,
      "learning_rate": 2e-05,
      "loss": 0.0278,
      "mean_token_accuracy": 0.994592797756195,
      "num_tokens": 1764014.0,
      "step": 600
    },
    {
      "entropy": 0.03345188507810235,
      "epoch": 2.665936473165389,
      "grad_norm": 0.3167726397514343,
      "learning_rate": 2e-05,
      "loss": 0.0266,
      "mean_token_accuracy": 0.9946425467729568,
      "num_tokens": 1793503.0,
      "step": 610
    },
    {
      "entropy": 0.03313743188045919,
      "epoch": 2.7097480832420593,
      "grad_norm": 0.21710677444934845,
      "learning_rate": 2e-05,
      "loss": 0.0272,
      "mean_token_accuracy": 0.9945120304822922,
      "num_tokens": 1823006.0,
      "step": 620
    },
    {
      "entropy": 0.030652716779150067,
      "epoch": 2.7535596933187296,
      "grad_norm": 0.3033951222896576,
      "learning_rate": 2e-05,
      "loss": 0.0253,
      "mean_token_accuracy": 0.9948923841118813,
      "num_tokens": 1852394.0,
      "step": 630
    },
    {
      "entropy": 0.03487149984575808,
      "epoch": 2.7973713033953995,
      "grad_norm": 0.2704299986362457,
      "learning_rate": 2e-05,
      "loss": 0.029,
      "mean_token_accuracy": 0.9941735804080963,
      "num_tokens": 1881893.0,
      "step": 640
    },
    {
      "entropy": 0.031937569426372645,
      "epoch": 2.8411829134720703,
      "grad_norm": 0.3431037962436676,
      "learning_rate": 2e-05,
      "loss": 0.028,
      "mean_token_accuracy": 0.9944406807422638,
      "num_tokens": 1911381.0,
      "step": 650
    },
    {
      "entropy": 0.031133335269987582,
      "epoch": 2.88499452354874,
      "grad_norm": 0.35892975330352783,
      "learning_rate": 2e-05,
      "loss": 0.0268,
      "mean_token_accuracy": 0.9947311311960221,
      "num_tokens": 1940830.0,
      "step": 660
    },
    {
      "entropy": 0.030122329480946064,
      "epoch": 2.928806133625411,
      "grad_norm": 0.25242748856544495,
      "learning_rate": 2e-05,
      "loss": 0.0276,
      "mean_token_accuracy": 0.9946604892611504,
      "num_tokens": 1970238.0,
      "step": 670
    },
    {
      "entropy": 0.03375864648260176,
      "epoch": 2.972617743702081,
      "grad_norm": 0.22651821374893188,
      "learning_rate": 2e-05,
      "loss": 0.0284,
      "mean_token_accuracy": 0.9947338193655014,
      "num_tokens": 1999673.0,
      "step": 680
    },
    {
      "entropy": 0.02985721411233818,
      "epoch": 3.013143483023001,
      "grad_norm": 0.2214677631855011,
      "learning_rate": 2e-05,
      "loss": 0.0253,
      "mean_token_accuracy": 0.9945424746822666,
      "num_tokens": 2026976.0,
      "step": 690
    },
    {
      "entropy": 0.02816909532994032,
      "epoch": 3.0569550930996714,
      "grad_norm": 0.19869959354400635,
      "learning_rate": 2e-05,
      "loss": 0.0234,
      "mean_token_accuracy": 0.9953982174396515,
      "num_tokens": 2056329.0,
      "step": 700
    },
    {
      "entropy": 0.027280686935409904,
      "epoch": 3.1007667031763417,
      "grad_norm": 0.22260268032550812,
      "learning_rate": 2e-05,
      "loss": 0.0232,
      "mean_token_accuracy": 0.9950461372733116,
      "num_tokens": 2085798.0,
      "step": 710
    },
    {
      "entropy": 0.028381537599489092,
      "epoch": 3.144578313253012,
      "grad_norm": 0.3607195019721985,
      "learning_rate": 2e-05,
      "loss": 0.0267,
      "mean_token_accuracy": 0.9944693371653557,
      "num_tokens": 2115279.0,
      "step": 720
    },
    {
      "entropy": 0.029735187301412225,
      "epoch": 3.1883899233296824,
      "grad_norm": 0.25269201397895813,
      "learning_rate": 2e-05,
      "loss": 0.0273,
      "mean_token_accuracy": 0.9947093442082405,
      "num_tokens": 2144765.0,
      "step": 730
    },
    {
      "entropy": 0.027832537749782206,
      "epoch": 3.2322015334063527,
      "grad_norm": 0.25261572003364563,
      "learning_rate": 2e-05,
      "loss": 0.0226,
      "mean_token_accuracy": 0.9951098829507827,
      "num_tokens": 2174228.0,
      "step": 740
    },
    {
      "entropy": 0.02715804339386523,
      "epoch": 3.276013143483023,
      "grad_norm": 0.27859893441200256,
      "learning_rate": 2e-05,
      "loss": 0.0266,
      "mean_token_accuracy": 0.9944431126117707,
      "num_tokens": 2203757.0,
      "step": 750
    },
    {
      "entropy": 0.028143454669043423,
      "epoch": 3.3198247535596934,
      "grad_norm": 0.2973248064517975,
      "learning_rate": 2e-05,
      "loss": 0.0262,
      "mean_token_accuracy": 0.9948585823178291,
      "num_tokens": 2233150.0,
      "step": 760
    },
    {
      "entropy": 0.030359381577000022,
      "epoch": 3.3636363636363638,
      "grad_norm": 0.2367042601108551,
      "learning_rate": 2e-05,
      "loss": 0.0268,
      "mean_token_accuracy": 0.9942718967795372,
      "num_tokens": 2262643.0,
      "step": 770
    },
    {
      "entropy": 0.026462095649912955,
      "epoch": 3.407447973713034,
      "grad_norm": 0.34042081236839294,
      "learning_rate": 2e-05,
      "loss": 0.0243,
      "mean_token_accuracy": 0.9947688922286033,
      "num_tokens": 2292090.0,
      "step": 780
    },
    {
      "entropy": 0.026516364049166442,
      "epoch": 3.4512595837897044,
      "grad_norm": 0.29167208075523376,
      "learning_rate": 2e-05,
      "loss": 0.0228,
      "mean_token_accuracy": 0.995012117922306,
      "num_tokens": 2321564.0,
      "step": 790
    },
    {
      "entropy": 0.026681744982488452,
      "epoch": 3.4950711938663748,
      "grad_norm": 0.2620503604412079,
      "learning_rate": 2e-05,
      "loss": 0.0265,
      "mean_token_accuracy": 0.9948214083909989,
      "num_tokens": 2351115.0,
      "step": 800
    },
    {
      "entropy": 0.027950569428503512,
      "epoch": 3.5388828039430447,
      "grad_norm": 0.3914905786514282,
      "learning_rate": 2e-05,
      "loss": 0.0264,
      "mean_token_accuracy": 0.994230942428112,
      "num_tokens": 2380589.0,
      "step": 810
    },
    {
      "entropy": 0.029146080673672258,
      "epoch": 3.5826944140197154,
      "grad_norm": 0.2935093343257904,
      "learning_rate": 2e-05,
      "loss": 0.0263,
      "mean_token_accuracy": 0.9945803046226501,
      "num_tokens": 2410081.0,
      "step": 820
    },
    {
      "entropy": 0.029389529721811414,
      "epoch": 3.6265060240963853,
      "grad_norm": 0.20426620543003082,
      "learning_rate": 2e-05,
      "loss": 0.0251,
      "mean_token_accuracy": 0.9949777990579605,
      "num_tokens": 2439552.0,
      "step": 830
    },
    {
      "entropy": 0.02622994459234178,
      "epoch": 3.670317634173056,
      "grad_norm": 0.37980544567108154,
      "learning_rate": 2e-05,
      "loss": 0.0254,
      "mean_token_accuracy": 0.9946694761514664,
      "num_tokens": 2469015.0,
      "step": 840
    },
    {
      "entropy": 0.027297466271556915,
      "epoch": 3.714129244249726,
      "grad_norm": 0.2875664234161377,
      "learning_rate": 2e-05,
      "loss": 0.0245,
      "mean_token_accuracy": 0.9948430895805359,
      "num_tokens": 2498494.0,
      "step": 850
    },
    {
      "entropy": 0.026779479440301658,
      "epoch": 3.7579408543263964,
      "grad_norm": 0.24819296598434448,
      "learning_rate": 2e-05,
      "loss": 0.0269,
      "mean_token_accuracy": 0.994810126721859,
      "num_tokens": 2527986.0,
      "step": 860
    },
    {
      "entropy": 0.02729630549438298,
      "epoch": 3.8017524644030667,
      "grad_norm": 0.24704840779304504,
      "learning_rate": 2e-05,
      "loss": 0.0228,
      "mean_token_accuracy": 0.9947375372052193,
      "num_tokens": 2557430.0,
      "step": 870
    },
    {
      "entropy": 0.02764119957573712,
      "epoch": 3.845564074479737,
      "grad_norm": 0.3414749503135681,
      "learning_rate": 2e-05,
      "loss": 0.027,
      "mean_token_accuracy": 0.9947045534849167,
      "num_tokens": 2586894.0,
      "step": 880
    },
    {
      "entropy": 0.02578033790923655,
      "epoch": 3.8893756845564074,
      "grad_norm": 0.37503501772880554,
      "learning_rate": 2e-05,
      "loss": 0.0248,
      "mean_token_accuracy": 0.9950520157814026,
      "num_tokens": 2616380.0,
      "step": 890
    },
    {
      "entropy": 0.027033341675996782,
      "epoch": 3.9331872946330777,
      "grad_norm": 0.2305474430322647,
      "learning_rate": 2e-05,
      "loss": 0.0256,
      "mean_token_accuracy": 0.9948741808533669,
      "num_tokens": 2645835.0,
      "step": 900
    },
    {
      "entropy": 0.027415974205359815,
      "epoch": 3.976998904709748,
      "grad_norm": 0.2407739907503128,
      "learning_rate": 2e-05,
      "loss": 0.0246,
      "mean_token_accuracy": 0.9946468979120254,
      "num_tokens": 2675356.0,
      "step": 910
    },
    {
      "entropy": 0.02766735479235649,
      "epoch": 4.017524644030668,
      "grad_norm": 0.3133447766304016,
      "learning_rate": 2e-05,
      "loss": 0.0266,
      "mean_token_accuracy": 0.9944937438578219,
      "num_tokens": 2702602.0,
      "step": 920
    },
    {
      "entropy": 0.028550118627026676,
      "epoch": 4.061336254107339,
      "grad_norm": 0.25123274326324463,
      "learning_rate": 2e-05,
      "loss": 0.0262,
      "mean_token_accuracy": 0.9943874076008796,
      "num_tokens": 2732020.0,
      "step": 930
    },
    {
      "entropy": 0.024479676922783255,
      "epoch": 4.1051478641840085,
      "grad_norm": 0.45999675989151,
      "learning_rate": 2e-05,
      "loss": 0.0244,
      "mean_token_accuracy": 0.9946165904402733,
      "num_tokens": 2761530.0,
      "step": 940
    },
    {
      "entropy": 0.025697663542814553,
      "epoch": 4.148959474260679,
      "grad_norm": 0.23499128222465515,
      "learning_rate": 2e-05,
      "loss": 0.0235,
      "mean_token_accuracy": 0.9952784523367881,
      "num_tokens": 2790969.0,
      "step": 950
    },
    {
      "entropy": 0.024972660979256035,
      "epoch": 4.192771084337349,
      "grad_norm": 0.27967652678489685,
      "learning_rate": 2e-05,
      "loss": 0.0219,
      "mean_token_accuracy": 0.9953790575265884,
      "num_tokens": 2820393.0,
      "step": 960
    },
    {
      "entropy": 0.02475374243222177,
      "epoch": 4.23658269441402,
      "grad_norm": 0.25424709916114807,
      "learning_rate": 2e-05,
      "loss": 0.0226,
      "mean_token_accuracy": 0.9946705937385559,
      "num_tokens": 2849863.0,
      "step": 970
    },
    {
      "entropy": 0.02558194790035486,
      "epoch": 4.28039430449069,
      "grad_norm": 0.28516727685928345,
      "learning_rate": 2e-05,
      "loss": 0.0229,
      "mean_token_accuracy": 0.9950990095734596,
      "num_tokens": 2879274.0,
      "step": 980
    },
    {
      "entropy": 0.024959080247208477,
      "epoch": 4.324205914567361,
      "grad_norm": 0.20538896322250366,
      "learning_rate": 2e-05,
      "loss": 0.024,
      "mean_token_accuracy": 0.9950569152832032,
      "num_tokens": 2908801.0,
      "step": 990
    },
    {
      "entropy": 0.025352832465432584,
      "epoch": 4.3680175246440305,
      "grad_norm": 0.2579675018787384,
      "learning_rate": 2e-05,
      "loss": 0.0245,
      "mean_token_accuracy": 0.9950070321559906,
      "num_tokens": 2938257.0,
      "step": 1000
    },
    {
      "entropy": 0.02450204298365861,
      "epoch": 4.411829134720701,
      "grad_norm": 0.2892380952835083,
      "learning_rate": 2e-05,
      "loss": 0.0232,
      "mean_token_accuracy": 0.9949192464351654,
      "num_tokens": 2967762.0,
      "step": 1010
    },
    {
      "entropy": 0.02369625587016344,
      "epoch": 4.455640744797371,
      "grad_norm": 0.26433488726615906,
      "learning_rate": 2e-05,
      "loss": 0.0231,
      "mean_token_accuracy": 0.9950045317411422,
      "num_tokens": 2997222.0,
      "step": 1020
    },
    {
      "entropy": 0.027335537271574138,
      "epoch": 4.499452354874042,
      "grad_norm": 0.37580329179763794,
      "learning_rate": 2e-05,
      "loss": 0.0252,
      "mean_token_accuracy": 0.9948343947529793,
      "num_tokens": 3026671.0,
      "step": 1030
    },
    {
      "entropy": 0.025261221639811992,
      "epoch": 4.543263964950712,
      "grad_norm": 0.2803178131580353,
      "learning_rate": 2e-05,
      "loss": 0.0231,
      "mean_token_accuracy": 0.9950425833463669,
      "num_tokens": 3056097.0,
      "step": 1040
    },
    {
      "entropy": 0.026201538788154722,
      "epoch": 4.587075575027383,
      "grad_norm": 0.3461656868457794,
      "learning_rate": 2e-05,
      "loss": 0.0235,
      "mean_token_accuracy": 0.9950102731585503,
      "num_tokens": 3085560.0,
      "step": 1050
    },
    {
      "entropy": 0.024683090089820326,
      "epoch": 4.6308871851040525,
      "grad_norm": 0.22958162426948547,
      "learning_rate": 2e-05,
      "loss": 0.0245,
      "mean_token_accuracy": 0.9952207550406456,
      "num_tokens": 3115048.0,
      "step": 1060
    },
    {
      "entropy": 0.024839983228594063,
      "epoch": 4.674698795180722,
      "grad_norm": 0.3140292763710022,
      "learning_rate": 2e-05,
      "loss": 0.0239,
      "mean_token_accuracy": 0.9952376142144204,
      "num_tokens": 3144452.0,
      "step": 1070
    },
    {
      "entropy": 0.026491868868470193,
      "epoch": 4.718510405257393,
      "grad_norm": 0.2557467818260193,
      "learning_rate": 2e-05,
      "loss": 0.0239,
      "mean_token_accuracy": 0.9949898362159729,
      "num_tokens": 3173986.0,
      "step": 1080
    },
    {
      "entropy": 0.026718211872503162,
      "epoch": 4.762322015334064,
      "grad_norm": 0.3381554186344147,
      "learning_rate": 2e-05,
      "loss": 0.0228,
      "mean_token_accuracy": 0.9950517997145653,
      "num_tokens": 3203519.0,
      "step": 1090
    },
    {
      "entropy": 0.02580249684397131,
      "epoch": 4.806133625410734,
      "grad_norm": 0.2958730161190033,
      "learning_rate": 2e-05,
      "loss": 0.0225,
      "mean_token_accuracy": 0.9952193647623062,
      "num_tokens": 3233021.0,
      "step": 1100
    },
    {
      "entropy": 0.02244551058392972,
      "epoch": 4.849945235487404,
      "grad_norm": 0.2638761103153229,
      "learning_rate": 2e-05,
      "loss": 0.0219,
      "mean_token_accuracy": 0.9949140697717667,
      "num_tokens": 3262501.0,
      "step": 1110
    },
    {
      "entropy": 0.023888250160962344,
      "epoch": 4.8937568455640745,
      "grad_norm": 0.3221336007118225,
      "learning_rate": 2e-05,
      "loss": 0.0228,
      "mean_token_accuracy": 0.9950117602944374,
      "num_tokens": 3291991.0,
      "step": 1120
    },
    {
      "entropy": 0.023271886515431106,
      "epoch": 4.937568455640744,
      "grad_norm": 0.17379634082317352,
      "learning_rate": 2e-05,
      "loss": 0.0227,
      "mean_token_accuracy": 0.9950850084424019,
      "num_tokens": 3321471.0,
      "step": 1130
    },
    {
      "entropy": 0.025164074124768378,
      "epoch": 4.981380065717415,
      "grad_norm": 0.3978523313999176,
      "learning_rate": 2e-05,
      "loss": 0.0242,
      "mean_token_accuracy": 0.9946789160370827,
      "num_tokens": 3350989.0,
      "step": 1140
    },
    {
      "entropy": 0.02359090921645229,
      "epoch": 5.021905805038335,
      "grad_norm": 0.2893853187561035,
      "learning_rate": 2e-05,
      "loss": 0.0209,
      "mean_token_accuracy": 0.9951927516911481,
      "num_tokens": 3378231.0,
      "step": 1150
    },
    {
      "entropy": 0.025032577151432634,
      "epoch": 5.065717415115006,
      "grad_norm": 0.3284905254840851,
      "learning_rate": 2e-05,
      "loss": 0.0248,
      "mean_token_accuracy": 0.9947940185666084,
      "num_tokens": 3407821.0,
      "step": 1160
    },
    {
      "entropy": 0.023946191160939635,
      "epoch": 5.109529025191676,
      "grad_norm": 0.2103855162858963,
      "learning_rate": 2e-05,
      "loss": 0.023,
      "mean_token_accuracy": 0.995547603070736,
      "num_tokens": 3437275.0,
      "step": 1170
    },
    {
      "entropy": 0.025330464565195145,
      "epoch": 5.153340635268346,
      "grad_norm": 0.24447986483573914,
      "learning_rate": 2e-05,
      "loss": 0.0218,
      "mean_token_accuracy": 0.9955911099910736,
      "num_tokens": 3466744.0,
      "step": 1180
    },
    {
      "entropy": 0.02319594454020262,
      "epoch": 5.197152245345016,
      "grad_norm": 0.2545948922634125,
      "learning_rate": 2e-05,
      "loss": 0.0228,
      "mean_token_accuracy": 0.9949484288692474,
      "num_tokens": 3496227.0,
      "step": 1190
    },
    {
      "entropy": 0.024116577091626824,
      "epoch": 5.240963855421687,
      "grad_norm": 0.3212072551250458,
      "learning_rate": 2e-05,
      "loss": 0.0221,
      "mean_token_accuracy": 0.9953089833259583,
      "num_tokens": 3525658.0,
      "step": 1200
    },
    {
      "entropy": 0.024795983172953128,
      "epoch": 5.284775465498357,
      "grad_norm": 0.3668806254863739,
      "learning_rate": 2e-05,
      "loss": 0.0211,
      "mean_token_accuracy": 0.994914136826992,
      "num_tokens": 3555128.0,
      "step": 1210
    },
    {
      "entropy": 0.02205991530790925,
      "epoch": 5.328587075575028,
      "grad_norm": 0.2271159589290619,
      "learning_rate": 2e-05,
      "loss": 0.0226,
      "mean_token_accuracy": 0.9950777113437652,
      "num_tokens": 3584595.0,
      "step": 1220
    },
    {
      "entropy": 0.02077565067447722,
      "epoch": 5.372398685651698,
      "grad_norm": 0.2420947551727295,
      "learning_rate": 2e-05,
      "loss": 0.0208,
      "mean_token_accuracy": 0.9958102077245712,
      "num_tokens": 3613973.0,
      "step": 1230
    },
    {
      "entropy": 0.02363784979097545,
      "epoch": 5.416210295728368,
      "grad_norm": 0.1963256448507309,
      "learning_rate": 2e-05,
      "loss": 0.02,
      "mean_token_accuracy": 0.9953210085630417,
      "num_tokens": 3643480.0,
      "step": 1240
    },
    {
      "entropy": 0.021049648011103272,
      "epoch": 5.460021905805038,
      "grad_norm": 0.31181660294532776,
      "learning_rate": 2e-05,
      "loss": 0.0228,
      "mean_token_accuracy": 0.9951788693666458,
      "num_tokens": 3672937.0,
      "step": 1250
    },
    {
      "entropy": 0.023798084165900946,
      "epoch": 5.503833515881709,
      "grad_norm": 0.21456217765808105,
      "learning_rate": 2e-05,
      "loss": 0.023,
      "mean_token_accuracy": 0.994881397485733,
      "num_tokens": 3702446.0,
      "step": 1260
    },
    {
      "entropy": 0.02499352244194597,
      "epoch": 5.547645125958379,
      "grad_norm": 0.28611499071121216,
      "learning_rate": 2e-05,
      "loss": 0.0221,
      "mean_token_accuracy": 0.995249642431736,
      "num_tokens": 3731934.0,
      "step": 1270
    },
    {
      "entropy": 0.022820547095034272,
      "epoch": 5.591456736035049,
      "grad_norm": 0.25996023416519165,
      "learning_rate": 2e-05,
      "loss": 0.0222,
      "mean_token_accuracy": 0.9951062291860581,
      "num_tokens": 3761366.0,
      "step": 1280
    },
    {
      "entropy": 0.02497695847414434,
      "epoch": 5.63526834611172,
      "grad_norm": 0.309165894985199,
      "learning_rate": 2e-05,
      "loss": 0.0213,
      "mean_token_accuracy": 0.9950066149234772,
      "num_tokens": 3790826.0,
      "step": 1290
    },
    {
      "entropy": 0.023493261518888177,
      "epoch": 5.67907995618839,
      "grad_norm": 0.24247996509075165,
      "learning_rate": 2e-05,
      "loss": 0.023,
      "mean_token_accuracy": 0.9947842925786972,
      "num_tokens": 3820342.0,
      "step": 1300
    },
    {
      "entropy": 0.023278132849372923,
      "epoch": 5.72289156626506,
      "grad_norm": 0.358727365732193,
      "learning_rate": 2e-05,
      "loss": 0.0212,
      "mean_token_accuracy": 0.9951738849282264,
      "num_tokens": 3849780.0,
      "step": 1310
    },
    {
      "entropy": 0.020839552069082855,
      "epoch": 5.76670317634173,
      "grad_norm": 0.4976311922073364,
      "learning_rate": 2e-05,
      "loss": 0.023,
      "mean_token_accuracy": 0.9948423251509666,
      "num_tokens": 3879265.0,
      "step": 1320
    },
    {
      "entropy": 0.022906254278495908,
      "epoch": 5.810514786418401,
      "grad_norm": 0.3438393175601959,
      "learning_rate": 2e-05,
      "loss": 0.0208,
      "mean_token_accuracy": 0.9953504383563996,
      "num_tokens": 3908719.0,
      "step": 1330
    },
    {
      "entropy": 0.024066617572680117,
      "epoch": 5.854326396495071,
      "grad_norm": 0.22441036999225616,
      "learning_rate": 2e-05,
      "loss": 0.0237,
      "mean_token_accuracy": 0.9947593525052071,
      "num_tokens": 3938284.0,
      "step": 1340
    },
    {
      "entropy": 0.023073240509256722,
      "epoch": 5.898138006571742,
      "grad_norm": 0.3324027359485626,
      "learning_rate": 2e-05,
      "loss": 0.0226,
      "mean_token_accuracy": 0.995069320499897,
      "num_tokens": 3967715.0,
      "step": 1350
    },
    {
      "entropy": 0.02433232117909938,
      "epoch": 5.941949616648412,
      "grad_norm": 0.3299965262413025,
      "learning_rate": 2e-05,
      "loss": 0.0213,
      "mean_token_accuracy": 0.9954777494072914,
      "num_tokens": 3997146.0,
      "step": 1360
    },
    {
      "entropy": 0.02257502309512347,
      "epoch": 5.985761226725082,
      "grad_norm": 0.2526472806930542,
      "learning_rate": 2e-05,
      "loss": 0.0232,
      "mean_token_accuracy": 0.9951533004641533,
      "num_tokens": 4026648.0,
      "step": 1370
    },
    {
      "entropy": 0.02323322096285788,
      "epoch": 6.026286966046002,
      "grad_norm": 0.23763661086559296,
      "learning_rate": 2e-05,
      "loss": 0.0224,
      "mean_token_accuracy": 0.9952652873219671,
      "num_tokens": 4053900.0,
      "step": 1380
    },
    {
      "entropy": 0.021967137791216375,
      "epoch": 6.070098576122673,
      "grad_norm": 0.2572341859340668,
      "learning_rate": 2e-05,
      "loss": 0.0202,
      "mean_token_accuracy": 0.9953828155994415,
      "num_tokens": 4083360.0,
      "step": 1390
    },
    {
      "entropy": 0.02138982171891257,
      "epoch": 6.113910186199343,
      "grad_norm": 0.3185424506664276,
      "learning_rate": 2e-05,
      "loss": 0.0208,
      "mean_token_accuracy": 0.995489752292633,
      "num_tokens": 4112847.0,
      "step": 1400
    },
    {
      "entropy": 0.020864816126413645,
      "epoch": 6.157721796276014,
      "grad_norm": 0.3312909007072449,
      "learning_rate": 2e-05,
      "loss": 0.0207,
      "mean_token_accuracy": 0.9954506799578666,
      "num_tokens": 4142296.0,
      "step": 1410
    },
    {
      "entropy": 0.02233279454521835,
      "epoch": 6.2015334063526835,
      "grad_norm": 0.17053531110286713,
      "learning_rate": 2e-05,
      "loss": 0.0197,
      "mean_token_accuracy": 0.9955132082104683,
      "num_tokens": 4171692.0,
      "step": 1420
    },
    {
      "entropy": 0.024310170486569405,
      "epoch": 6.245345016429353,
      "grad_norm": 0.2670746147632599,
      "learning_rate": 2e-05,
      "loss": 0.0205,
      "mean_token_accuracy": 0.9951695516705513,
      "num_tokens": 4201089.0,
      "step": 1430
    },
    {
      "entropy": 0.020984722138382494,
      "epoch": 6.289156626506024,
      "grad_norm": 0.25105613470077515,
      "learning_rate": 2e-05,
      "loss": 0.0206,
      "mean_token_accuracy": 0.9953115940093994,
      "num_tokens": 4230559.0,
      "step": 1440
    },
    {
      "entropy": 0.02055248327087611,
      "epoch": 6.332968236582694,
      "grad_norm": 0.24033121764659882,
      "learning_rate": 2e-05,
      "loss": 0.0204,
      "mean_token_accuracy": 0.9958187058568001,
      "num_tokens": 4259996.0,
      "step": 1450
    },
    {
      "entropy": 0.022047754074446858,
      "epoch": 6.376779846659365,
      "grad_norm": 0.22444288432598114,
      "learning_rate": 2e-05,
      "loss": 0.0215,
      "mean_token_accuracy": 0.9950413301587104,
      "num_tokens": 4289454.0,
      "step": 1460
    },
    {
      "entropy": 0.022383451019413768,
      "epoch": 6.420591456736035,
      "grad_norm": 0.23084090650081635,
      "learning_rate": 2e-05,
      "loss": 0.0225,
      "mean_token_accuracy": 0.995081527531147,
      "num_tokens": 4318939.0,
      "step": 1470
    },
    {
      "entropy": 0.022706201835535466,
      "epoch": 6.4644030668127055,
      "grad_norm": 0.3231358528137207,
      "learning_rate": 2e-05,
      "loss": 0.0213,
      "mean_token_accuracy": 0.9955555170774459,
      "num_tokens": 4348417.0,
      "step": 1480
    },
    {
      "entropy": 0.023630459420382978,
      "epoch": 6.508214676889375,
      "grad_norm": 0.481501966714859,
      "learning_rate": 2e-05,
      "loss": 0.0217,
      "mean_token_accuracy": 0.9954674571752549,
      "num_tokens": 4377986.0,
      "step": 1490
    },
    {
      "entropy": 0.020876687136478722,
      "epoch": 6.552026286966046,
      "grad_norm": 0.35793283581733704,
      "learning_rate": 2e-05,
      "loss": 0.0213,
      "mean_token_accuracy": 0.9948447868227959,
      "num_tokens": 4407470.0,
      "step": 1500
    },
    {
      "entropy": 0.02328007298056036,
      "epoch": 6.595837897042716,
      "grad_norm": 0.20913559198379517,
      "learning_rate": 2e-05,
      "loss": 0.0204,
      "mean_token_accuracy": 0.9952251821756363,
      "num_tokens": 4437011.0,
      "step": 1510
    },
    {
      "entropy": 0.02375670406036079,
      "epoch": 6.639649507119387,
      "grad_norm": 0.2503627836704254,
      "learning_rate": 2e-05,
      "loss": 0.0209,
      "mean_token_accuracy": 0.9950839385390282,
      "num_tokens": 4466530.0,
      "step": 1520
    },
    {
      "entropy": 0.021242060861550272,
      "epoch": 6.683461117196057,
      "grad_norm": 0.27485519647598267,
      "learning_rate": 2e-05,
      "loss": 0.0203,
      "mean_token_accuracy": 0.9954615503549575,
      "num_tokens": 4496057.0,
      "step": 1530
    },
    {
      "entropy": 0.021616383735090495,
      "epoch": 6.7272727272727275,
      "grad_norm": 0.19043618440628052,
      "learning_rate": 2e-05,
      "loss": 0.0239,
      "mean_token_accuracy": 0.9952506437897682,
      "num_tokens": 4525543.0,
      "step": 1540
    },
    {
      "entropy": 0.023494412051513792,
      "epoch": 6.771084337349397,
      "grad_norm": 0.19691585004329681,
      "learning_rate": 2e-05,
      "loss": 0.0211,
      "mean_token_accuracy": 0.99548791795969,
      "num_tokens": 4555013.0,
      "step": 1550
    },
    {
      "entropy": 0.02191384304314852,
      "epoch": 6.814895947426068,
      "grad_norm": 0.29944664239883423,
      "learning_rate": 2e-05,
      "loss": 0.0219,
      "mean_token_accuracy": 0.9951483547687531,
      "num_tokens": 4584494.0,
      "step": 1560
    },
    {
      "entropy": 0.02303541658911854,
      "epoch": 6.858707557502738,
      "grad_norm": 0.2113044410943985,
      "learning_rate": 2e-05,
      "loss": 0.0222,
      "mean_token_accuracy": 0.9948215037584305,
      "num_tokens": 4614045.0,
      "step": 1570
    },
    {
      "entropy": 0.021106354193761945,
      "epoch": 6.902519167579409,
      "grad_norm": 0.21171341836452484,
      "learning_rate": 2e-05,
      "loss": 0.0204,
      "mean_token_accuracy": 0.9955798342823983,
      "num_tokens": 4643457.0,
      "step": 1580
    },
    {
      "entropy": 0.02150448861066252,
      "epoch": 6.946330777656079,
      "grad_norm": 0.24946355819702148,
      "learning_rate": 2e-05,
      "loss": 0.0201,
      "mean_token_accuracy": 0.9956050664186478,
      "num_tokens": 4672821.0,
      "step": 1590
    },
    {
      "entropy": 0.021063918247818947,
      "epoch": 6.9901423877327495,
      "grad_norm": 0.3385717570781708,
      "learning_rate": 2e-05,
      "loss": 0.0204,
      "mean_token_accuracy": 0.9951394200325012,
      "num_tokens": 4702271.0,
      "step": 1600
    },
    {
      "entropy": 0.02021917436473273,
      "epoch": 7.030668127053669,
      "grad_norm": 0.2704486548900604,
      "learning_rate": 2e-05,
      "loss": 0.0188,
      "mean_token_accuracy": 0.9957452045904623,
      "num_tokens": 4729525.0,
      "step": 1610
    },
    {
      "entropy": 0.021311909216456114,
      "epoch": 7.074479737130339,
      "grad_norm": 0.320721298456192,
      "learning_rate": 2e-05,
      "loss": 0.0197,
      "mean_token_accuracy": 0.9953247964382171,
      "num_tokens": 4759055.0,
      "step": 1620
    },
    {
      "entropy": 0.020557987270876765,
      "epoch": 7.11829134720701,
      "grad_norm": 0.32253098487854004,
      "learning_rate": 2e-05,
      "loss": 0.0203,
      "mean_token_accuracy": 0.9955166086554528,
      "num_tokens": 4788511.0,
      "step": 1630
    },
    {
      "entropy": 0.019870327040553092,
      "epoch": 7.16210295728368,
      "grad_norm": 0.2977895140647888,
      "learning_rate": 2e-05,
      "loss": 0.0199,
      "mean_token_accuracy": 0.9952198579907418,
      "num_tokens": 4818030.0,
      "step": 1640
    },
    {
      "entropy": 0.022079354885499926,
      "epoch": 7.205914567360351,
      "grad_norm": 0.19379644095897675,
      "learning_rate": 2e-05,
      "loss": 0.0192,
      "mean_token_accuracy": 0.9956528976559639,
      "num_tokens": 4847471.0,
      "step": 1650
    },
    {
      "entropy": 0.022374613489955664,
      "epoch": 7.2497261774370205,
      "grad_norm": 0.2583862245082855,
      "learning_rate": 2e-05,
      "loss": 0.02,
      "mean_token_accuracy": 0.9954133436083794,
      "num_tokens": 4876921.0,
      "step": 1660
    },
    {
      "entropy": 0.019138680095784365,
      "epoch": 7.293537787513691,
      "grad_norm": 0.35674577951431274,
      "learning_rate": 2e-05,
      "loss": 0.0196,
      "mean_token_accuracy": 0.9955156922340394,
      "num_tokens": 4906393.0,
      "step": 1670
    },
    {
      "entropy": 0.020489802700467408,
      "epoch": 7.337349397590361,
      "grad_norm": 0.2793170213699341,
      "learning_rate": 2e-05,
      "loss": 0.0201,
      "mean_token_accuracy": 0.9954265221953392,
      "num_tokens": 4935899.0,
      "step": 1680
    },
    {
      "entropy": 0.019693873031064867,
      "epoch": 7.381161007667032,
      "grad_norm": 0.27242857217788696,
      "learning_rate": 2e-05,
      "loss": 0.0198,
      "mean_token_accuracy": 0.9957166820764541,
      "num_tokens": 4965311.0,
      "step": 1690
    },
    {
      "entropy": 0.021065186546184122,
      "epoch": 7.424972617743702,
      "grad_norm": 0.33647313714027405,
      "learning_rate": 2e-05,
      "loss": 0.0189,
      "mean_token_accuracy": 0.9956855684518814,
      "num_tokens": 4994761.0,
      "step": 1700
    },
    {
      "entropy": 0.020657581137493254,
      "epoch": 7.468784227820373,
      "grad_norm": 0.2646650969982147,
      "learning_rate": 2e-05,
      "loss": 0.0197,
      "mean_token_accuracy": 0.9955179765820503,
      "num_tokens": 5024232.0,
      "step": 1710
    },
    {
      "entropy": 0.021019189269281924,
      "epoch": 7.5125958378970425,
      "grad_norm": 0.35804569721221924,
      "learning_rate": 2e-05,
      "loss": 0.0214,
      "mean_token_accuracy": 0.9950902581214904,
      "num_tokens": 5053767.0,
      "step": 1720
    },
    {
      "entropy": 0.023458714364096522,
      "epoch": 7.556407447973713,
      "grad_norm": 0.2962614595890045,
      "learning_rate": 2e-05,
      "loss": 0.0211,
      "mean_token_accuracy": 0.9953871309757233,
      "num_tokens": 5083257.0,
      "step": 1730
    },
    {
      "entropy": 0.020619858335703612,
      "epoch": 7.600219058050383,
      "grad_norm": 0.4548715353012085,
      "learning_rate": 2e-05,
      "loss": 0.0205,
      "mean_token_accuracy": 0.9953182607889175,
      "num_tokens": 5112709.0,
      "step": 1740
    },
    {
      "entropy": 0.018665279296692462,
      "epoch": 7.644030668127054,
      "grad_norm": 0.2674151062965393,
      "learning_rate": 2e-05,
      "loss": 0.0187,
      "mean_token_accuracy": 0.9960940897464752,
      "num_tokens": 5142149.0,
      "step": 1750
    },
    {
      "entropy": 0.020423237257637082,
      "epoch": 7.687842278203724,
      "grad_norm": 0.26541629433631897,
      "learning_rate": 2e-05,
      "loss": 0.021,
      "mean_token_accuracy": 0.9951240062713623,
      "num_tokens": 5171658.0,
      "step": 1760
    },
    {
      "entropy": 0.02253671985818073,
      "epoch": 7.731653888280395,
      "grad_norm": 0.3901165723800659,
      "learning_rate": 2e-05,
      "loss": 0.0204,
      "mean_token_accuracy": 0.9952122256159782,
      "num_tokens": 5201113.0,
      "step": 1770
    },
    {
      "entropy": 0.021983299939893185,
      "epoch": 7.775465498357065,
      "grad_norm": 0.2226109355688095,
      "learning_rate": 2e-05,
      "loss": 0.0214,
      "mean_token_accuracy": 0.9953880906105042,
      "num_tokens": 5230603.0,
      "step": 1780
    },
    {
      "entropy": 0.019754715240560473,
      "epoch": 7.8192771084337345,
      "grad_norm": 0.3183552026748657,
      "learning_rate": 2e-05,
      "loss": 0.0197,
      "mean_token_accuracy": 0.9956612795591354,
      "num_tokens": 5260108.0,
      "step": 1790
    },
    {
      "entropy": 0.021397983329370616,
      "epoch": 7.863088718510405,
      "grad_norm": 0.18899278342723846,
      "learning_rate": 2e-05,
      "loss": 0.0201,
      "mean_token_accuracy": 0.9951773226261139,
      "num_tokens": 5289555.0,
      "step": 1800
    },
    {
      "entropy": 0.022004493232816457,
      "epoch": 7.906900328587076,
      "grad_norm": 0.19192738831043243,
      "learning_rate": 2e-05,
      "loss": 0.0201,
      "mean_token_accuracy": 0.9956512361764908,
      "num_tokens": 5318990.0,
      "step": 1810
    },
    {
      "entropy": 0.020234017237089574,
      "epoch": 7.950711938663746,
      "grad_norm": 0.19193395972251892,
      "learning_rate": 2e-05,
      "loss": 0.0182,
      "mean_token_accuracy": 0.9957286223769188,
      "num_tokens": 5348497.0,
      "step": 1820
    },
    {
      "entropy": 0.020024944259785116,
      "epoch": 7.994523548740416,
      "grad_norm": 0.19811798632144928,
      "learning_rate": 2e-05,
      "loss": 0.0202,
      "mean_token_accuracy": 0.9952393263578415,
      "num_tokens": 5377914.0,
      "step": 1830
    }
  ],
  "logging_steps": 10,
  "max_steps": 1832,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 8,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3362085281212416.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}