| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.694126339120798, |
| "eval_steps": 1000, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03694126339120798, |
| "grad_norm": 1.4473156929016113, |
| "learning_rate": 6.6e-05, |
| "loss": 2.0357, |
| "mean_token_accuracy": 0.4905380755662918, |
| "num_tokens": 246180.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07388252678241596, |
| "grad_norm": 1.4902719259262085, |
| "learning_rate": 0.00013266666666666667, |
| "loss": 1.3937, |
| "mean_token_accuracy": 0.5916463854908943, |
| "num_tokens": 492915.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.11082379017362394, |
| "grad_norm": 1.3383102416992188, |
| "learning_rate": 0.00019933333333333334, |
| "loss": 1.3205, |
| "mean_token_accuracy": 0.6068769115209579, |
| "num_tokens": 740398.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.1477650535648319, |
| "grad_norm": 1.2365469932556152, |
| "learning_rate": 0.000173421993904824, |
| "loss": 1.3026, |
| "mean_token_accuracy": 0.6126913416385651, |
| "num_tokens": 988197.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1847063169560399, |
| "grad_norm": 1.0308854579925537, |
| "learning_rate": 0.0001550744859491231, |
| "loss": 1.2537, |
| "mean_token_accuracy": 0.6190469121932983, |
| "num_tokens": 1237334.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.22164758034724788, |
| "grad_norm": 1.2729270458221436, |
| "learning_rate": 0.00014153935488632152, |
| "loss": 1.2353, |
| "mean_token_accuracy": 0.6254772353172302, |
| "num_tokens": 1483308.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.25858884373845586, |
| "grad_norm": 0.9841827750205994, |
| "learning_rate": 0.00013102435641608367, |
| "loss": 1.2182, |
| "mean_token_accuracy": 0.6275931853055954, |
| "num_tokens": 1730192.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2955301071296638, |
| "grad_norm": 0.9242544174194336, |
| "learning_rate": 0.00012255110553085002, |
| "loss": 1.2049, |
| "mean_token_accuracy": 0.6283232820034027, |
| "num_tokens": 1980054.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.33247137052087183, |
| "grad_norm": 0.8931549787521362, |
| "learning_rate": 0.00011553425737574005, |
| "loss": 1.2017, |
| "mean_token_accuracy": 0.6302745240926743, |
| "num_tokens": 2228605.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.3694126339120798, |
| "grad_norm": 1.044004201889038, |
| "learning_rate": 0.0001095993248702382, |
| "loss": 1.2137, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.3694126339120798, |
| "eval_loss": 1.1836973428726196, |
| "eval_mean_token_accuracy": 0.6324496693611145, |
| "eval_num_tokens": 2475392.0, |
| "eval_runtime": 5.3895, |
| "eval_samples_per_second": 185.547, |
| "eval_steps_per_second": 23.193, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.40635389730328775, |
| "grad_norm": 0.8837220072746277, |
| "learning_rate": 0.00010449410169212441, |
| "loss": 1.1854, |
| "mean_token_accuracy": 0.6299630090594291, |
| "num_tokens": 2724102.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.44329516069449576, |
| "grad_norm": 0.8813680410385132, |
| "learning_rate": 0.00010004169272643103, |
| "loss": 1.1776, |
| "mean_token_accuracy": 0.6349628627300262, |
| "num_tokens": 2972072.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.4802364240857037, |
| "grad_norm": 0.9930716753005981, |
| "learning_rate": 9.611386626644256e-05, |
| "loss": 1.1683, |
| "mean_token_accuracy": 0.6352091038227081, |
| "num_tokens": 3217529.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.5171776874769117, |
| "grad_norm": 0.8394394516944885, |
| "learning_rate": 9.261509270476351e-05, |
| "loss": 1.1687, |
| "mean_token_accuracy": 0.6363370817899704, |
| "num_tokens": 3467819.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.5541189508681197, |
| "grad_norm": 1.129971981048584, |
| "learning_rate": 8.9472548255098e-05, |
| "loss": 1.1541, |
| "mean_token_accuracy": 0.6417357540130615, |
| "num_tokens": 3712960.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.5910602142593276, |
| "grad_norm": 1.100881576538086, |
| "learning_rate": 8.662961636484199e-05, |
| "loss": 1.1602, |
| "mean_token_accuracy": 0.6412180256843567, |
| "num_tokens": 3956425.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.6280014776505356, |
| "grad_norm": 0.9382134079933167, |
| "learning_rate": 8.40415267738742e-05, |
| "loss": 1.1604, |
| "mean_token_accuracy": 0.6395090478658676, |
| "num_tokens": 4203009.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.6649427410417437, |
| "grad_norm": 1.084293246269226, |
| "learning_rate": 8.167234800792304e-05, |
| "loss": 1.1352, |
| "mean_token_accuracy": 0.646253719329834, |
| "num_tokens": 4449448.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.7018840044329516, |
| "grad_norm": 0.9637444019317627, |
| "learning_rate": 7.949286335171643e-05, |
| "loss": 1.1562, |
| "mean_token_accuracy": 0.6406971418857574, |
| "num_tokens": 4694282.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.7388252678241596, |
| "grad_norm": 0.9237338304519653, |
| "learning_rate": 7.747903910575024e-05, |
| "loss": 1.1424, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7388252678241596, |
| "eval_loss": 1.1218078136444092, |
| "eval_mean_token_accuracy": 0.6506093912124634, |
| "eval_num_tokens": 4940833.0, |
| "eval_runtime": 5.3952, |
| "eval_samples_per_second": 185.351, |
| "eval_steps_per_second": 23.169, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.7757665312153675, |
| "grad_norm": 0.777606725692749, |
| "learning_rate": 7.561089934060305e-05, |
| "loss": 1.1313, |
| "mean_token_accuracy": 0.6467883923649788, |
| "num_tokens": 5190358.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.8127077946065755, |
| "grad_norm": 0.8855065703392029, |
| "learning_rate": 7.387168551531178e-05, |
| "loss": 1.1309, |
| "mean_token_accuracy": 0.6449691706895828, |
| "num_tokens": 5438772.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.8496490579977836, |
| "grad_norm": 1.2018849849700928, |
| "learning_rate": 7.224721947627513e-05, |
| "loss": 1.1246, |
| "mean_token_accuracy": 0.6473777782917023, |
| "num_tokens": 5682900.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.8865903213889915, |
| "grad_norm": 0.9426067471504211, |
| "learning_rate": 7.07254141150883e-05, |
| "loss": 1.1241, |
| "mean_token_accuracy": 0.647763032913208, |
| "num_tokens": 5931817.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.9235315847801995, |
| "grad_norm": 0.7986142039299011, |
| "learning_rate": 6.929589286752371e-05, |
| "loss": 1.1258, |
| "mean_token_accuracy": 0.6497471231222153, |
| "num_tokens": 6179818.0, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.9604728481714074, |
| "grad_norm": 1.1682002544403076, |
| "learning_rate": 6.794969055356698e-05, |
| "loss": 1.1286, |
| "mean_token_accuracy": 0.6473524701595307, |
| "num_tokens": 6426250.0, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.9974141115626154, |
| "grad_norm": 0.9333537220954895, |
| "learning_rate": 6.667901577609308e-05, |
| "loss": 1.1239, |
| "mean_token_accuracy": 0.6484623271226883, |
| "num_tokens": 6672929.0, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.0343553749538235, |
| "grad_norm": 0.5627142190933228, |
| "learning_rate": 6.547706044716512e-05, |
| "loss": 1.0508, |
| "mean_token_accuracy": 0.6667241591215134, |
| "num_tokens": 6925125.0, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.0712966383450313, |
| "grad_norm": 0.5715717077255249, |
| "learning_rate": 6.433784577873342e-05, |
| "loss": 1.0615, |
| "mean_token_accuracy": 0.6637595742940903, |
| "num_tokens": 7171997.0, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.1082379017362394, |
| "grad_norm": 0.5389025211334229, |
| "learning_rate": 6.325609676486509e-05, |
| "loss": 1.0709, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.1082379017362394, |
| "eval_loss": 1.0916837453842163, |
| "eval_mean_token_accuracy": 0.6582915830612183, |
| "eval_num_tokens": 7419279.0, |
| "eval_runtime": 5.3894, |
| "eval_samples_per_second": 185.548, |
| "eval_steps_per_second": 23.193, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.1451791651274474, |
| "grad_norm": 0.56490159034729, |
| "learning_rate": 6.22271391287055e-05, |
| "loss": 1.0581, |
| "mean_token_accuracy": 0.662086527645588, |
| "num_tokens": 7664383.0, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.1821204285186553, |
| "grad_norm": 0.543954074382782, |
| "learning_rate": 6.12468141320462e-05, |
| "loss": 1.0615, |
| "mean_token_accuracy": 0.6631740409135819, |
| "num_tokens": 7912840.0, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.2190616919098634, |
| "grad_norm": 0.5596346855163574, |
| "learning_rate": 6.03114077000131e-05, |
| "loss": 1.0395, |
| "mean_token_accuracy": 0.671640704870224, |
| "num_tokens": 8160289.0, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.2560029553010712, |
| "grad_norm": 0.5955942869186401, |
| "learning_rate": 5.9417591102230663e-05, |
| "loss": 1.0567, |
| "mean_token_accuracy": 0.6638083755970001, |
| "num_tokens": 8407857.0, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.2929442186922793, |
| "grad_norm": 0.5933428406715393, |
| "learning_rate": 5.856237102757652e-05, |
| "loss": 1.0636, |
| "mean_token_accuracy": 0.6627275788784027, |
| "num_tokens": 8655624.0, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.3298854820834873, |
| "grad_norm": 0.5572307109832764, |
| "learning_rate": 5.7743047343661814e-05, |
| "loss": 1.0652, |
| "mean_token_accuracy": 0.6624369341135025, |
| "num_tokens": 8902821.0, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.3668267454746952, |
| "grad_norm": 0.596443235874176, |
| "learning_rate": 5.6957177181117404e-05, |
| "loss": 1.0622, |
| "mean_token_accuracy": 0.6634978985786438, |
| "num_tokens": 9149878.0, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.4037680088659032, |
| "grad_norm": 0.5873645544052124, |
| "learning_rate": 5.620254425309578e-05, |
| "loss": 1.0493, |
| "mean_token_accuracy": 0.6654049742221833, |
| "num_tokens": 9394858.0, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.440709272257111, |
| "grad_norm": 0.59478759765625, |
| "learning_rate": 5.547713253139649e-05, |
| "loss": 1.0456, |
| "mean_token_accuracy": 0.6683453869819641, |
| "num_tokens": 9641008.0, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.4776505356483192, |
| "grad_norm": 0.5986542701721191, |
| "learning_rate": 5.477910356647767e-05, |
| "loss": 1.043, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.4776505356483192, |
| "eval_loss": 1.083065152168274, |
| "eval_mean_token_accuracy": 0.6582373585700989, |
| "eval_num_tokens": 9888284.0, |
| "eval_runtime": 5.3783, |
| "eval_samples_per_second": 185.933, |
| "eval_steps_per_second": 23.242, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.5145917990395272, |
| "grad_norm": 0.5704376697540283, |
| "learning_rate": 5.410677686985887e-05, |
| "loss": 1.0321, |
| "mean_token_accuracy": 0.6689175629615783, |
| "num_tokens": 10134094.0, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.551533062430735, |
| "grad_norm": 0.5602062344551086, |
| "learning_rate": 5.345861288192786e-05, |
| "loss": 1.0441, |
| "mean_token_accuracy": 0.6680737626552582, |
| "num_tokens": 10381316.0, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.5884743258219431, |
| "grad_norm": 0.5552584528923035, |
| "learning_rate": 5.283319813188472e-05, |
| "loss": 1.0369, |
| "mean_token_accuracy": 0.6698204201459884, |
| "num_tokens": 10628471.0, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.625415589213151, |
| "grad_norm": 0.6024323105812073, |
| "learning_rate": 5.222923226400155e-05, |
| "loss": 1.0403, |
| "mean_token_accuracy": 0.6691047704219818, |
| "num_tokens": 10874430.0, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.662356852604359, |
| "grad_norm": 0.5967562794685364, |
| "learning_rate": 5.164551665900703e-05, |
| "loss": 1.0483, |
| "mean_token_accuracy": 0.6648873990774155, |
| "num_tokens": 11123128.0, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.6992981159955671, |
| "grad_norm": 0.5668358206748962, |
| "learning_rate": 5.1080944423879696e-05, |
| "loss": 1.0392, |
| "mean_token_accuracy": 0.6674597597122193, |
| "num_tokens": 11368003.0, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.736239379386775, |
| "grad_norm": 0.6373595595359802, |
| "learning_rate": 5.053449155971992e-05, |
| "loss": 1.0404, |
| "mean_token_accuracy": 0.6689798641204834, |
| "num_tokens": 11615194.0, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.773180642777983, |
| "grad_norm": 0.6193637847900391, |
| "learning_rate": 5.0005209147276734e-05, |
| "loss": 1.0355, |
| "mean_token_accuracy": 0.6675721609592438, |
| "num_tokens": 11863548.0, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.8101219061691909, |
| "grad_norm": 0.5770505666732788, |
| "learning_rate": 4.949221641439499e-05, |
| "loss": 1.0316, |
| "mean_token_accuracy": 0.6724146312475204, |
| "num_tokens": 12111554.0, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.847063169560399, |
| "grad_norm": 0.6453244686126709, |
| "learning_rate": 4.899469457011854e-05, |
| "loss": 1.0348, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.847063169560399, |
| "eval_loss": 1.0698254108428955, |
| "eval_mean_token_accuracy": 0.66366588306427, |
| "eval_num_tokens": 12359520.0, |
| "eval_runtime": 5.3865, |
| "eval_samples_per_second": 185.649, |
| "eval_steps_per_second": 23.206, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.884004432951607, |
| "grad_norm": 0.6129611730575562, |
| "learning_rate": 4.851188130722481e-05, |
| "loss": 1.0374, |
| "mean_token_accuracy": 0.6697911691665649, |
| "num_tokens": 12607238.0, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.920945696342815, |
| "grad_norm": 0.6002670526504517, |
| "learning_rate": 4.804306588920635e-05, |
| "loss": 1.035, |
| "mean_token_accuracy": 0.6703519684076309, |
| "num_tokens": 12855037.0, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.957886959734023, |
| "grad_norm": 0.5875000953674316, |
| "learning_rate": 4.758758474966023e-05, |
| "loss": 1.0348, |
| "mean_token_accuracy": 0.6686445927619934, |
| "num_tokens": 13100596.0, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.9948282231252308, |
| "grad_norm": 0.587979257106781, |
| "learning_rate": 4.7144817542100825e-05, |
| "loss": 1.0375, |
| "mean_token_accuracy": 0.6698001223802567, |
| "num_tokens": 13349667.0, |
| "step": 5400 |
| }, |
| { |
| "epoch": 2.031769486516439, |
| "grad_norm": 0.5400444269180298, |
| "learning_rate": 4.671418358670517e-05, |
| "loss": 1.0064, |
| "mean_token_accuracy": 0.6771922719478607, |
| "num_tokens": 13599712.0, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.068710749907647, |
| "grad_norm": 0.5831236839294434, |
| "learning_rate": 4.6295138667698956e-05, |
| "loss": 0.9874, |
| "mean_token_accuracy": 0.6801465088129044, |
| "num_tokens": 13845238.0, |
| "step": 5600 |
| }, |
| { |
| "epoch": 2.105652013298855, |
| "grad_norm": 0.559648334980011, |
| "learning_rate": 4.5887172141209994e-05, |
| "loss": 0.9824, |
| "mean_token_accuracy": 0.6835214233398438, |
| "num_tokens": 14094078.0, |
| "step": 5700 |
| }, |
| { |
| "epoch": 2.1425932766900626, |
| "grad_norm": 0.5524799227714539, |
| "learning_rate": 4.548980431863551e-05, |
| "loss": 0.9852, |
| "mean_token_accuracy": 0.6822834074497223, |
| "num_tokens": 14342112.0, |
| "step": 5800 |
| }, |
| { |
| "epoch": 2.1795345400812707, |
| "grad_norm": 0.586271345615387, |
| "learning_rate": 4.510258409503273e-05, |
| "loss": 0.9807, |
| "mean_token_accuracy": 0.6823082774877548, |
| "num_tokens": 14587875.0, |
| "step": 5900 |
| }, |
| { |
| "epoch": 2.2164758034724787, |
| "grad_norm": 0.599862277507782, |
| "learning_rate": 4.472508679587051e-05, |
| "loss": 0.9741, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.2164758034724787, |
| "eval_loss": 1.071539282798767, |
| "eval_mean_token_accuracy": 0.6636041073799134, |
| "eval_num_tokens": 14833253.0, |
| "eval_runtime": 5.3837, |
| "eval_samples_per_second": 185.745, |
| "eval_steps_per_second": 23.218, |
| "step": 6000 |
| }, |
| { |
| "epoch": 2.253417066863687, |
| "grad_norm": 0.5849953293800354, |
| "learning_rate": 4.435691221877225e-05, |
| "loss": 0.9784, |
| "mean_token_accuracy": 0.6845134419202804, |
| "num_tokens": 15079437.0, |
| "step": 6100 |
| }, |
| { |
| "epoch": 2.290358330254895, |
| "grad_norm": 0.5756722688674927, |
| "learning_rate": 4.399768284971994e-05, |
| "loss": 0.9843, |
| "mean_token_accuracy": 0.6839412766695022, |
| "num_tokens": 15326743.0, |
| "step": 6200 |
| }, |
| { |
| "epoch": 2.3272995936461025, |
| "grad_norm": 0.5707868933677673, |
| "learning_rate": 4.364704223564281e-05, |
| "loss": 0.9901, |
| "mean_token_accuracy": 0.6811071854829788, |
| "num_tokens": 15572363.0, |
| "step": 6300 |
| }, |
| { |
| "epoch": 2.3642408570373106, |
| "grad_norm": 0.6192522048950195, |
| "learning_rate": 4.330465349744206e-05, |
| "loss": 0.9762, |
| "mean_token_accuracy": 0.6834132850170136, |
| "num_tokens": 15818229.0, |
| "step": 6400 |
| }, |
| { |
| "epoch": 2.4011821204285186, |
| "grad_norm": 0.5887159109115601, |
| "learning_rate": 4.2970197969350315e-05, |
| "loss": 0.9797, |
| "mean_token_accuracy": 0.6834959721565247, |
| "num_tokens": 16066092.0, |
| "step": 6500 |
| }, |
| { |
| "epoch": 2.4381233838197267, |
| "grad_norm": 0.6107765436172485, |
| "learning_rate": 4.264337395213374e-05, |
| "loss": 0.97, |
| "mean_token_accuracy": 0.6855223393440246, |
| "num_tokens": 16314390.0, |
| "step": 6600 |
| }, |
| { |
| "epoch": 2.4750646472109348, |
| "grad_norm": 0.5866128206253052, |
| "learning_rate": 4.232389556904849e-05, |
| "loss": 0.9794, |
| "mean_token_accuracy": 0.6824937015771866, |
| "num_tokens": 16560430.0, |
| "step": 6700 |
| }, |
| { |
| "epoch": 2.5120059106021424, |
| "grad_norm": 0.5723136067390442, |
| "learning_rate": 4.201149171469091e-05, |
| "loss": 0.9805, |
| "mean_token_accuracy": 0.6831120592355728, |
| "num_tokens": 16807172.0, |
| "step": 6800 |
| }, |
| { |
| "epoch": 2.5489471739933505, |
| "grad_norm": 0.5534746050834656, |
| "learning_rate": 4.170590508795705e-05, |
| "loss": 0.9853, |
| "mean_token_accuracy": 0.679190359711647, |
| "num_tokens": 17054725.0, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.5858884373845585, |
| "grad_norm": 0.6190086007118225, |
| "learning_rate": 4.1406891301271574e-05, |
| "loss": 0.979, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.5858884373845585, |
| "eval_loss": 1.0672271251678467, |
| "eval_mean_token_accuracy": 0.665556743144989, |
| "eval_num_tokens": 17301619.0, |
| "eval_runtime": 5.3864, |
| "eval_samples_per_second": 185.654, |
| "eval_steps_per_second": 23.207, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.6228297007757666, |
| "grad_norm": 0.604921281337738, |
| "learning_rate": 4.111421805907759e-05, |
| "loss": 0.9844, |
| "mean_token_accuracy": 0.6844497114419937, |
| "num_tokens": 17546632.0, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.6597709641669747, |
| "grad_norm": 0.6087679862976074, |
| "learning_rate": 4.082766439931165e-05, |
| "loss": 0.9871, |
| "mean_token_accuracy": 0.6810534721612931, |
| "num_tokens": 17797482.0, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.6967122275581827, |
| "grad_norm": 0.6020961999893188, |
| "learning_rate": 4.054701999223518e-05, |
| "loss": 0.9839, |
| "mean_token_accuracy": 0.6829093122482299, |
| "num_tokens": 18043599.0, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.7336534909493904, |
| "grad_norm": 0.6339052319526672, |
| "learning_rate": 4.0272084491566247e-05, |
| "loss": 0.9863, |
| "mean_token_accuracy": 0.6820144325494766, |
| "num_tokens": 18287793.0, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.7705947543405984, |
| "grad_norm": 0.6206592321395874, |
| "learning_rate": 4.000266693336297e-05, |
| "loss": 0.9709, |
| "mean_token_accuracy": 0.6874477595090867, |
| "num_tokens": 18535420.0, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.8075360177318065, |
| "grad_norm": 0.6235191822052002, |
| "learning_rate": 3.973858517856019e-05, |
| "loss": 0.9734, |
| "mean_token_accuracy": 0.6847814846038819, |
| "num_tokens": 18784286.0, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.8444772811230146, |
| "grad_norm": 0.6308836340904236, |
| "learning_rate": 3.947966539546186e-05, |
| "loss": 0.9813, |
| "mean_token_accuracy": 0.6831617254018784, |
| "num_tokens": 19035813.0, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.881418544514222, |
| "grad_norm": 0.6001960039138794, |
| "learning_rate": 3.922574157884801e-05, |
| "loss": 0.987, |
| "mean_token_accuracy": 0.6807804244756699, |
| "num_tokens": 19282122.0, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.9183598079054303, |
| "grad_norm": 0.6059972643852234, |
| "learning_rate": 3.8976655102673755e-05, |
| "loss": 0.9859, |
| "mean_token_accuracy": 0.6820109623670578, |
| "num_tokens": 19529782.0, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.9553010712966383, |
| "grad_norm": 0.640872061252594, |
| "learning_rate": 3.873225430362181e-05, |
| "loss": 0.9761, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.9553010712966383, |
| "eval_loss": 1.0601364374160767, |
| "eval_mean_token_accuracy": 0.6661491613388062, |
| "eval_num_tokens": 19777841.0, |
| "eval_runtime": 5.4066, |
| "eval_samples_per_second": 184.959, |
| "eval_steps_per_second": 23.12, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.9922423346878464, |
| "grad_norm": 0.6132466793060303, |
| "learning_rate": 3.8492394093024636e-05, |
| "loss": 0.9711, |
| "mean_token_accuracy": 0.6846305218338966, |
| "num_tokens": 20023858.0, |
| "step": 8100 |
| }, |
| { |
| "epoch": 3.0291835980790545, |
| "grad_norm": 0.5972515940666199, |
| "learning_rate": 3.825693559490006e-05, |
| "loss": 0.9376, |
| "mean_token_accuracy": 0.692512179017067, |
| "num_tokens": 20270581.0, |
| "step": 8200 |
| }, |
| { |
| "epoch": 3.066124861470262, |
| "grad_norm": 0.6202205419540405, |
| "learning_rate": 3.8025745808048846e-05, |
| "loss": 0.9307, |
| "mean_token_accuracy": 0.6948988193273544, |
| "num_tokens": 20516819.0, |
| "step": 8300 |
| }, |
| { |
| "epoch": 3.10306612486147, |
| "grad_norm": 0.585482120513916, |
| "learning_rate": 3.779869729034645e-05, |
| "loss": 0.935, |
| "mean_token_accuracy": 0.6954682809114456, |
| "num_tokens": 20763559.0, |
| "step": 8400 |
| }, |
| { |
| "epoch": 3.1400073882526782, |
| "grad_norm": 0.6358840465545654, |
| "learning_rate": 3.7575667863526335e-05, |
| "loss": 0.9292, |
| "mean_token_accuracy": 0.6980463570356369, |
| "num_tokens": 21011357.0, |
| "step": 8500 |
| }, |
| { |
| "epoch": 3.1769486516438863, |
| "grad_norm": 0.6181186437606812, |
| "learning_rate": 3.735654033690154e-05, |
| "loss": 0.9229, |
| "mean_token_accuracy": 0.696455385684967, |
| "num_tokens": 21259566.0, |
| "step": 8600 |
| }, |
| { |
| "epoch": 3.2138899150350944, |
| "grad_norm": 0.6604560017585754, |
| "learning_rate": 3.7141202248604964e-05, |
| "loss": 0.9285, |
| "mean_token_accuracy": 0.6962138444185257, |
| "num_tokens": 21506213.0, |
| "step": 8700 |
| }, |
| { |
| "epoch": 3.250831178426302, |
| "grad_norm": 0.5987362265586853, |
| "learning_rate": 3.6929545623050815e-05, |
| "loss": 0.929, |
| "mean_token_accuracy": 0.6957518076896667, |
| "num_tokens": 21754908.0, |
| "step": 8800 |
| }, |
| { |
| "epoch": 3.28777244181751, |
| "grad_norm": 0.5980191230773926, |
| "learning_rate": 3.6721466743428706e-05, |
| "loss": 0.938, |
| "mean_token_accuracy": 0.6953296983242034, |
| "num_tokens": 22001522.0, |
| "step": 8900 |
| }, |
| { |
| "epoch": 3.324713705208718, |
| "grad_norm": 0.6171393990516663, |
| "learning_rate": 3.6516865938141736e-05, |
| "loss": 0.9364, |
| "step": 9000 |
| }, |
| { |
| "epoch": 3.324713705208718, |
| "eval_loss": 1.0706262588500977, |
| "eval_mean_token_accuracy": 0.6642319107055664, |
| "eval_num_tokens": 22248126.0, |
| "eval_runtime": 5.3921, |
| "eval_samples_per_second": 185.455, |
| "eval_steps_per_second": 23.182, |
| "step": 9000 |
| }, |
| { |
| "epoch": 3.361654968599926, |
| "grad_norm": 0.6272869110107422, |
| "learning_rate": 3.6315647380189556e-05, |
| "loss": 0.919, |
| "mean_token_accuracy": 0.6962304222583771, |
| "num_tokens": 22494098.0, |
| "step": 9100 |
| }, |
| { |
| "epoch": 3.3985962319911343, |
| "grad_norm": 0.6461876034736633, |
| "learning_rate": 3.611771889857922e-05, |
| "loss": 0.9331, |
| "mean_token_accuracy": 0.6959864324331284, |
| "num_tokens": 22741819.0, |
| "step": 9200 |
| }, |
| { |
| "epoch": 3.4355374953823423, |
| "grad_norm": 0.5855452418327332, |
| "learning_rate": 3.592299180092082e-05, |
| "loss": 0.9283, |
| "mean_token_accuracy": 0.6968681657314301, |
| "num_tokens": 22988621.0, |
| "step": 9300 |
| }, |
| { |
| "epoch": 3.47247875877355, |
| "grad_norm": 0.6252483129501343, |
| "learning_rate": 3.573138070643225e-05, |
| "loss": 0.9271, |
| "mean_token_accuracy": 0.6972170048952102, |
| "num_tokens": 23235818.0, |
| "step": 9400 |
| }, |
| { |
| "epoch": 3.509420022164758, |
| "grad_norm": 0.6152076125144958, |
| "learning_rate": 3.554280338863896e-05, |
| "loss": 0.9244, |
| "mean_token_accuracy": 0.699801824092865, |
| "num_tokens": 23483243.0, |
| "step": 9500 |
| }, |
| { |
| "epoch": 3.546361285555966, |
| "grad_norm": 0.6049486398696899, |
| "learning_rate": 3.535718062711045e-05, |
| "loss": 0.9365, |
| "mean_token_accuracy": 0.6951554995775223, |
| "num_tokens": 23730501.0, |
| "step": 9600 |
| }, |
| { |
| "epoch": 3.583302548947174, |
| "grad_norm": 0.6222932934761047, |
| "learning_rate": 3.517443606762636e-05, |
| "loss": 0.9374, |
| "mean_token_accuracy": 0.6915060871839523, |
| "num_tokens": 23978205.0, |
| "step": 9700 |
| }, |
| { |
| "epoch": 3.6202438123383818, |
| "grad_norm": 0.6344577670097351, |
| "learning_rate": 3.499449609021135e-05, |
| "loss": 0.9252, |
| "mean_token_accuracy": 0.6969369679689408, |
| "num_tokens": 24224944.0, |
| "step": 9800 |
| }, |
| { |
| "epoch": 3.65718507572959, |
| "grad_norm": 0.600951611995697, |
| "learning_rate": 3.4817289684521056e-05, |
| "loss": 0.9226, |
| "mean_token_accuracy": 0.6991156005859375, |
| "num_tokens": 24470936.0, |
| "step": 9900 |
| }, |
| { |
| "epoch": 3.694126339120798, |
| "grad_norm": 0.6917585134506226, |
| "learning_rate": 3.4642748332099756e-05, |
| "loss": 0.935, |
| "step": 10000 |
| }, |
| { |
| "epoch": 3.694126339120798, |
| "eval_loss": 1.0667781829833984, |
| "eval_mean_token_accuracy": 0.6657101097106933, |
| "eval_num_tokens": 24717605.0, |
| "eval_runtime": 5.4034, |
| "eval_samples_per_second": 185.07, |
| "eval_steps_per_second": 23.134, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.9870782814052352e+17, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|