{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.694126339120798,
  "eval_steps": 1000,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03694126339120798,
      "grad_norm": 1.4473156929016113,
      "learning_rate": 6.6e-05,
      "loss": 2.0357,
      "mean_token_accuracy": 0.4905380755662918,
      "num_tokens": 246180.0,
      "step": 100
    },
    {
      "epoch": 0.07388252678241596,
      "grad_norm": 1.4902719259262085,
      "learning_rate": 0.00013266666666666667,
      "loss": 1.3937,
      "mean_token_accuracy": 0.5916463854908943,
      "num_tokens": 492915.0,
      "step": 200
    },
    {
      "epoch": 0.11082379017362394,
      "grad_norm": 1.3383102416992188,
      "learning_rate": 0.00019933333333333334,
      "loss": 1.3205,
      "mean_token_accuracy": 0.6068769115209579,
      "num_tokens": 740398.0,
      "step": 300
    },
    {
      "epoch": 0.1477650535648319,
      "grad_norm": 1.2365469932556152,
      "learning_rate": 0.000173421993904824,
      "loss": 1.3026,
      "mean_token_accuracy": 0.6126913416385651,
      "num_tokens": 988197.0,
      "step": 400
    },
    {
      "epoch": 0.1847063169560399,
      "grad_norm": 1.0308854579925537,
      "learning_rate": 0.0001550744859491231,
      "loss": 1.2537,
      "mean_token_accuracy": 0.6190469121932983,
      "num_tokens": 1237334.0,
      "step": 500
    },
    {
      "epoch": 0.22164758034724788,
      "grad_norm": 1.2729270458221436,
      "learning_rate": 0.00014153935488632152,
      "loss": 1.2353,
      "mean_token_accuracy": 0.6254772353172302,
      "num_tokens": 1483308.0,
      "step": 600
    },
    {
      "epoch": 0.25858884373845586,
      "grad_norm": 0.9841827750205994,
      "learning_rate": 0.00013102435641608367,
      "loss": 1.2182,
      "mean_token_accuracy": 0.6275931853055954,
      "num_tokens": 1730192.0,
      "step": 700
    },
    {
      "epoch": 0.2955301071296638,
      "grad_norm": 0.9242544174194336,
      "learning_rate": 0.00012255110553085002,
      "loss": 1.2049,
      "mean_token_accuracy": 0.6283232820034027,
      "num_tokens": 1980054.0,
      "step": 800
    },
    {
      "epoch": 0.33247137052087183,
      "grad_norm": 0.8931549787521362,
      "learning_rate": 0.00011553425737574005,
      "loss": 1.2017,
      "mean_token_accuracy": 0.6302745240926743,
      "num_tokens": 2228605.0,
      "step": 900
    },
    {
      "epoch": 0.3694126339120798,
      "grad_norm": 1.044004201889038,
      "learning_rate": 0.0001095993248702382,
      "loss": 1.2137,
      "step": 1000
    },
    {
      "epoch": 0.3694126339120798,
      "eval_loss": 1.1836973428726196,
      "eval_mean_token_accuracy": 0.6324496693611145,
      "eval_num_tokens": 2475392.0,
      "eval_runtime": 5.3895,
      "eval_samples_per_second": 185.547,
      "eval_steps_per_second": 23.193,
      "step": 1000
    },
    {
      "epoch": 0.40635389730328775,
      "grad_norm": 0.8837220072746277,
      "learning_rate": 0.00010449410169212441,
      "loss": 1.1854,
      "mean_token_accuracy": 0.6299630090594291,
      "num_tokens": 2724102.0,
      "step": 1100
    },
    {
      "epoch": 0.44329516069449576,
      "grad_norm": 0.8813680410385132,
      "learning_rate": 0.00010004169272643103,
      "loss": 1.1776,
      "mean_token_accuracy": 0.6349628627300262,
      "num_tokens": 2972072.0,
      "step": 1200
    },
    {
      "epoch": 0.4802364240857037,
      "grad_norm": 0.9930716753005981,
      "learning_rate": 9.611386626644256e-05,
      "loss": 1.1683,
      "mean_token_accuracy": 0.6352091038227081,
      "num_tokens": 3217529.0,
      "step": 1300
    },
    {
      "epoch": 0.5171776874769117,
      "grad_norm": 0.8394394516944885,
      "learning_rate": 9.261509270476351e-05,
      "loss": 1.1687,
      "mean_token_accuracy": 0.6363370817899704,
      "num_tokens": 3467819.0,
      "step": 1400
    },
    {
      "epoch": 0.5541189508681197,
      "grad_norm": 1.129971981048584,
      "learning_rate": 8.9472548255098e-05,
      "loss": 1.1541,
      "mean_token_accuracy": 0.6417357540130615,
      "num_tokens": 3712960.0,
      "step": 1500
    },
    {
      "epoch": 0.5910602142593276,
      "grad_norm": 1.100881576538086,
      "learning_rate": 8.662961636484199e-05,
      "loss": 1.1602,
      "mean_token_accuracy": 0.6412180256843567,
      "num_tokens": 3956425.0,
      "step": 1600
    },
    {
      "epoch": 0.6280014776505356,
      "grad_norm": 0.9382134079933167,
      "learning_rate": 8.40415267738742e-05,
      "loss": 1.1604,
      "mean_token_accuracy": 0.6395090478658676,
      "num_tokens": 4203009.0,
      "step": 1700
    },
    {
      "epoch": 0.6649427410417437,
      "grad_norm": 1.084293246269226,
      "learning_rate": 8.167234800792304e-05,
      "loss": 1.1352,
      "mean_token_accuracy": 0.646253719329834,
      "num_tokens": 4449448.0,
      "step": 1800
    },
    {
      "epoch": 0.7018840044329516,
      "grad_norm": 0.9637444019317627,
      "learning_rate": 7.949286335171643e-05,
      "loss": 1.1562,
      "mean_token_accuracy": 0.6406971418857574,
      "num_tokens": 4694282.0,
      "step": 1900
    },
    {
      "epoch": 0.7388252678241596,
      "grad_norm": 0.9237338304519653,
      "learning_rate": 7.747903910575024e-05,
      "loss": 1.1424,
      "step": 2000
    },
    {
      "epoch": 0.7388252678241596,
      "eval_loss": 1.1218078136444092,
      "eval_mean_token_accuracy": 0.6506093912124634,
      "eval_num_tokens": 4940833.0,
      "eval_runtime": 5.3952,
      "eval_samples_per_second": 185.351,
      "eval_steps_per_second": 23.169,
      "step": 2000
    },
    {
      "epoch": 0.7757665312153675,
      "grad_norm": 0.777606725692749,
      "learning_rate": 7.561089934060305e-05,
      "loss": 1.1313,
      "mean_token_accuracy": 0.6467883923649788,
      "num_tokens": 5190358.0,
      "step": 2100
    },
    {
      "epoch": 0.8127077946065755,
      "grad_norm": 0.8855065703392029,
      "learning_rate": 7.387168551531178e-05,
      "loss": 1.1309,
      "mean_token_accuracy": 0.6449691706895828,
      "num_tokens": 5438772.0,
      "step": 2200
    },
    {
      "epoch": 0.8496490579977836,
      "grad_norm": 1.2018849849700928,
      "learning_rate": 7.224721947627513e-05,
      "loss": 1.1246,
      "mean_token_accuracy": 0.6473777782917023,
      "num_tokens": 5682900.0,
      "step": 2300
    },
    {
      "epoch": 0.8865903213889915,
      "grad_norm": 0.9426067471504211,
      "learning_rate": 7.07254141150883e-05,
      "loss": 1.1241,
      "mean_token_accuracy": 0.647763032913208,
      "num_tokens": 5931817.0,
      "step": 2400
    },
    {
      "epoch": 0.9235315847801995,
      "grad_norm": 0.7986142039299011,
      "learning_rate": 6.929589286752371e-05,
      "loss": 1.1258,
      "mean_token_accuracy": 0.6497471231222153,
      "num_tokens": 6179818.0,
      "step": 2500
    },
    {
      "epoch": 0.9604728481714074,
      "grad_norm": 1.1682002544403076,
      "learning_rate": 6.794969055356698e-05,
      "loss": 1.1286,
      "mean_token_accuracy": 0.6473524701595307,
      "num_tokens": 6426250.0,
      "step": 2600
    },
    {
      "epoch": 0.9974141115626154,
      "grad_norm": 0.9333537220954895,
      "learning_rate": 6.667901577609308e-05,
      "loss": 1.1239,
      "mean_token_accuracy": 0.6484623271226883,
      "num_tokens": 6672929.0,
      "step": 2700
    },
    {
      "epoch": 1.0343553749538235,
      "grad_norm": 0.5627142190933228,
      "learning_rate": 6.547706044716512e-05,
      "loss": 1.0508,
      "mean_token_accuracy": 0.6667241591215134,
      "num_tokens": 6925125.0,
      "step": 2800
    },
    {
      "epoch": 1.0712966383450313,
      "grad_norm": 0.5715717077255249,
      "learning_rate": 6.433784577873342e-05,
      "loss": 1.0615,
      "mean_token_accuracy": 0.6637595742940903,
      "num_tokens": 7171997.0,
      "step": 2900
    },
    {
      "epoch": 1.1082379017362394,
      "grad_norm": 0.5389025211334229,
      "learning_rate": 6.325609676486509e-05,
      "loss": 1.0709,
      "step": 3000
    },
    {
      "epoch": 1.1082379017362394,
      "eval_loss": 1.0916837453842163,
      "eval_mean_token_accuracy": 0.6582915830612183,
      "eval_num_tokens": 7419279.0,
      "eval_runtime": 5.3894,
      "eval_samples_per_second": 185.548,
      "eval_steps_per_second": 23.193,
      "step": 3000
    },
    {
      "epoch": 1.1451791651274474,
      "grad_norm": 0.56490159034729,
      "learning_rate": 6.22271391287055e-05,
      "loss": 1.0581,
      "mean_token_accuracy": 0.662086527645588,
      "num_tokens": 7664383.0,
      "step": 3100
    },
    {
      "epoch": 1.1821204285186553,
      "grad_norm": 0.543954074382782,
      "learning_rate": 6.12468141320462e-05,
      "loss": 1.0615,
      "mean_token_accuracy": 0.6631740409135819,
      "num_tokens": 7912840.0,
      "step": 3200
    },
    {
      "epoch": 1.2190616919098634,
      "grad_norm": 0.5596346855163574,
      "learning_rate": 6.03114077000131e-05,
      "loss": 1.0395,
      "mean_token_accuracy": 0.671640704870224,
      "num_tokens": 8160289.0,
      "step": 3300
    },
    {
      "epoch": 1.2560029553010712,
      "grad_norm": 0.5955942869186401,
      "learning_rate": 5.9417591102230663e-05,
      "loss": 1.0567,
      "mean_token_accuracy": 0.6638083755970001,
      "num_tokens": 8407857.0,
      "step": 3400
    },
    {
      "epoch": 1.2929442186922793,
      "grad_norm": 0.5933428406715393,
      "learning_rate": 5.856237102757652e-05,
      "loss": 1.0636,
      "mean_token_accuracy": 0.6627275788784027,
      "num_tokens": 8655624.0,
      "step": 3500
    },
    {
      "epoch": 1.3298854820834873,
      "grad_norm": 0.5572307109832764,
      "learning_rate": 5.7743047343661814e-05,
      "loss": 1.0652,
      "mean_token_accuracy": 0.6624369341135025,
      "num_tokens": 8902821.0,
      "step": 3600
    },
    {
      "epoch": 1.3668267454746952,
      "grad_norm": 0.596443235874176,
      "learning_rate": 5.6957177181117404e-05,
      "loss": 1.0622,
      "mean_token_accuracy": 0.6634978985786438,
      "num_tokens": 9149878.0,
      "step": 3700
    },
    {
      "epoch": 1.4037680088659032,
      "grad_norm": 0.5873645544052124,
      "learning_rate": 5.620254425309578e-05,
      "loss": 1.0493,
      "mean_token_accuracy": 0.6654049742221833,
      "num_tokens": 9394858.0,
      "step": 3800
    },
    {
      "epoch": 1.440709272257111,
      "grad_norm": 0.59478759765625,
      "learning_rate": 5.547713253139649e-05,
      "loss": 1.0456,
      "mean_token_accuracy": 0.6683453869819641,
      "num_tokens": 9641008.0,
      "step": 3900
    },
    {
      "epoch": 1.4776505356483192,
      "grad_norm": 0.5986542701721191,
      "learning_rate": 5.477910356647767e-05,
      "loss": 1.043,
      "step": 4000
    },
    {
      "epoch": 1.4776505356483192,
      "eval_loss": 1.083065152168274,
      "eval_mean_token_accuracy": 0.6582373585700989,
      "eval_num_tokens": 9888284.0,
      "eval_runtime": 5.3783,
      "eval_samples_per_second": 185.933,
      "eval_steps_per_second": 23.242,
      "step": 4000
    },
    {
      "epoch": 1.5145917990395272,
      "grad_norm": 0.5704376697540283,
      "learning_rate": 5.410677686985887e-05,
      "loss": 1.0321,
      "mean_token_accuracy": 0.6689175629615783,
      "num_tokens": 10134094.0,
      "step": 4100
    },
    {
      "epoch": 1.551533062430735,
      "grad_norm": 0.5602062344551086,
      "learning_rate": 5.345861288192786e-05,
      "loss": 1.0441,
      "mean_token_accuracy": 0.6680737626552582,
      "num_tokens": 10381316.0,
      "step": 4200
    },
    {
      "epoch": 1.5884743258219431,
      "grad_norm": 0.5552584528923035,
      "learning_rate": 5.283319813188472e-05,
      "loss": 1.0369,
      "mean_token_accuracy": 0.6698204201459884,
      "num_tokens": 10628471.0,
      "step": 4300
    },
    {
      "epoch": 1.625415589213151,
      "grad_norm": 0.6024323105812073,
      "learning_rate": 5.222923226400155e-05,
      "loss": 1.0403,
      "mean_token_accuracy": 0.6691047704219818,
      "num_tokens": 10874430.0,
      "step": 4400
    },
    {
      "epoch": 1.662356852604359,
      "grad_norm": 0.5967562794685364,
      "learning_rate": 5.164551665900703e-05,
      "loss": 1.0483,
      "mean_token_accuracy": 0.6648873990774155,
      "num_tokens": 11123128.0,
      "step": 4500
    },
    {
      "epoch": 1.6992981159955671,
      "grad_norm": 0.5668358206748962,
      "learning_rate": 5.1080944423879696e-05,
      "loss": 1.0392,
      "mean_token_accuracy": 0.6674597597122193,
      "num_tokens": 11368003.0,
      "step": 4600
    },
    {
      "epoch": 1.736239379386775,
      "grad_norm": 0.6373595595359802,
      "learning_rate": 5.053449155971992e-05,
      "loss": 1.0404,
      "mean_token_accuracy": 0.6689798641204834,
      "num_tokens": 11615194.0,
      "step": 4700
    },
    {
      "epoch": 1.773180642777983,
      "grad_norm": 0.6193637847900391,
      "learning_rate": 5.0005209147276734e-05,
      "loss": 1.0355,
      "mean_token_accuracy": 0.6675721609592438,
      "num_tokens": 11863548.0,
      "step": 4800
    },
    {
      "epoch": 1.8101219061691909,
      "grad_norm": 0.5770505666732788,
      "learning_rate": 4.949221641439499e-05,
      "loss": 1.0316,
      "mean_token_accuracy": 0.6724146312475204,
      "num_tokens": 12111554.0,
      "step": 4900
    },
    {
      "epoch": 1.847063169560399,
      "grad_norm": 0.6453244686126709,
      "learning_rate": 4.899469457011854e-05,
      "loss": 1.0348,
      "step": 5000
    },
    {
      "epoch": 1.847063169560399,
      "eval_loss": 1.0698254108428955,
      "eval_mean_token_accuracy": 0.66366588306427,
      "eval_num_tokens": 12359520.0,
      "eval_runtime": 5.3865,
      "eval_samples_per_second": 185.649,
      "eval_steps_per_second": 23.206,
      "step": 5000
    },
    {
      "epoch": 1.884004432951607,
      "grad_norm": 0.6129611730575562,
      "learning_rate": 4.851188130722481e-05,
      "loss": 1.0374,
      "mean_token_accuracy": 0.6697911691665649,
      "num_tokens": 12607238.0,
      "step": 5100
    },
    {
      "epoch": 1.920945696342815,
      "grad_norm": 0.6002670526504517,
      "learning_rate": 4.804306588920635e-05,
      "loss": 1.035,
      "mean_token_accuracy": 0.6703519684076309,
      "num_tokens": 12855037.0,
      "step": 5200
    },
    {
      "epoch": 1.957886959734023,
      "grad_norm": 0.5875000953674316,
      "learning_rate": 4.758758474966023e-05,
      "loss": 1.0348,
      "mean_token_accuracy": 0.6686445927619934,
      "num_tokens": 13100596.0,
      "step": 5300
    },
    {
      "epoch": 1.9948282231252308,
      "grad_norm": 0.587979257106781,
      "learning_rate": 4.7144817542100825e-05,
      "loss": 1.0375,
      "mean_token_accuracy": 0.6698001223802567,
      "num_tokens": 13349667.0,
      "step": 5400
    },
    {
      "epoch": 2.031769486516439,
      "grad_norm": 0.5400444269180298,
      "learning_rate": 4.671418358670517e-05,
      "loss": 1.0064,
      "mean_token_accuracy": 0.6771922719478607,
      "num_tokens": 13599712.0,
      "step": 5500
    },
    {
      "epoch": 2.068710749907647,
      "grad_norm": 0.5831236839294434,
      "learning_rate": 4.6295138667698956e-05,
      "loss": 0.9874,
      "mean_token_accuracy": 0.6801465088129044,
      "num_tokens": 13845238.0,
      "step": 5600
    },
    {
      "epoch": 2.105652013298855,
      "grad_norm": 0.559648334980011,
      "learning_rate": 4.5887172141209994e-05,
      "loss": 0.9824,
      "mean_token_accuracy": 0.6835214233398438,
      "num_tokens": 14094078.0,
      "step": 5700
    },
    {
      "epoch": 2.1425932766900626,
      "grad_norm": 0.5524799227714539,
      "learning_rate": 4.548980431863551e-05,
      "loss": 0.9852,
      "mean_token_accuracy": 0.6822834074497223,
      "num_tokens": 14342112.0,
      "step": 5800
    },
    {
      "epoch": 2.1795345400812707,
      "grad_norm": 0.586271345615387,
      "learning_rate": 4.510258409503273e-05,
      "loss": 0.9807,
      "mean_token_accuracy": 0.6823082774877548,
      "num_tokens": 14587875.0,
      "step": 5900
    },
    {
      "epoch": 2.2164758034724787,
      "grad_norm": 0.599862277507782,
      "learning_rate": 4.472508679587051e-05,
      "loss": 0.9741,
      "step": 6000
    },
    {
      "epoch": 2.2164758034724787,
      "eval_loss": 1.071539282798767,
      "eval_mean_token_accuracy": 0.6636041073799134,
      "eval_num_tokens": 14833253.0,
      "eval_runtime": 5.3837,
      "eval_samples_per_second": 185.745,
      "eval_steps_per_second": 23.218,
      "step": 6000
    },
    {
      "epoch": 2.253417066863687,
      "grad_norm": 0.5849953293800354,
      "learning_rate": 4.435691221877225e-05,
      "loss": 0.9784,
      "mean_token_accuracy": 0.6845134419202804,
      "num_tokens": 15079437.0,
      "step": 6100
    },
    {
      "epoch": 2.290358330254895,
      "grad_norm": 0.5756722688674927,
      "learning_rate": 4.399768284971994e-05,
      "loss": 0.9843,
      "mean_token_accuracy": 0.6839412766695022,
      "num_tokens": 15326743.0,
      "step": 6200
    },
    {
      "epoch": 2.3272995936461025,
      "grad_norm": 0.5707868933677673,
      "learning_rate": 4.364704223564281e-05,
      "loss": 0.9901,
      "mean_token_accuracy": 0.6811071854829788,
      "num_tokens": 15572363.0,
      "step": 6300
    },
    {
      "epoch": 2.3642408570373106,
      "grad_norm": 0.6192522048950195,
      "learning_rate": 4.330465349744206e-05,
      "loss": 0.9762,
      "mean_token_accuracy": 0.6834132850170136,
      "num_tokens": 15818229.0,
      "step": 6400
    },
    {
      "epoch": 2.4011821204285186,
      "grad_norm": 0.5887159109115601,
      "learning_rate": 4.2970197969350315e-05,
      "loss": 0.9797,
      "mean_token_accuracy": 0.6834959721565247,
      "num_tokens": 16066092.0,
      "step": 6500
    },
    {
      "epoch": 2.4381233838197267,
      "grad_norm": 0.6107765436172485,
      "learning_rate": 4.264337395213374e-05,
      "loss": 0.97,
      "mean_token_accuracy": 0.6855223393440246,
      "num_tokens": 16314390.0,
      "step": 6600
    },
    {
      "epoch": 2.4750646472109348,
      "grad_norm": 0.5866128206253052,
      "learning_rate": 4.232389556904849e-05,
      "loss": 0.9794,
      "mean_token_accuracy": 0.6824937015771866,
      "num_tokens": 16560430.0,
      "step": 6700
    },
    {
      "epoch": 2.5120059106021424,
      "grad_norm": 0.5723136067390442,
      "learning_rate": 4.201149171469091e-05,
      "loss": 0.9805,
      "mean_token_accuracy": 0.6831120592355728,
      "num_tokens": 16807172.0,
      "step": 6800
    },
    {
      "epoch": 2.5489471739933505,
      "grad_norm": 0.5534746050834656,
      "learning_rate": 4.170590508795705e-05,
      "loss": 0.9853,
      "mean_token_accuracy": 0.679190359711647,
      "num_tokens": 17054725.0,
      "step": 6900
    },
    {
      "epoch": 2.5858884373845585,
      "grad_norm": 0.6190086007118225,
      "learning_rate": 4.1406891301271574e-05,
      "loss": 0.979,
      "step": 7000
    },
    {
      "epoch": 2.5858884373845585,
      "eval_loss": 1.0672271251678467,
      "eval_mean_token_accuracy": 0.665556743144989,
      "eval_num_tokens": 17301619.0,
      "eval_runtime": 5.3864,
      "eval_samples_per_second": 185.654,
      "eval_steps_per_second": 23.207,
      "step": 7000
    },
    {
      "epoch": 2.6228297007757666,
      "grad_norm": 0.604921281337738,
      "learning_rate": 4.111421805907759e-05,
      "loss": 0.9844,
      "mean_token_accuracy": 0.6844497114419937,
      "num_tokens": 17546632.0,
      "step": 7100
    },
    {
      "epoch": 2.6597709641669747,
      "grad_norm": 0.6087679862976074,
      "learning_rate": 4.082766439931165e-05,
      "loss": 0.9871,
      "mean_token_accuracy": 0.6810534721612931,
      "num_tokens": 17797482.0,
      "step": 7200
    },
    {
      "epoch": 2.6967122275581827,
      "grad_norm": 0.6020961999893188,
      "learning_rate": 4.054701999223518e-05,
      "loss": 0.9839,
      "mean_token_accuracy": 0.6829093122482299,
      "num_tokens": 18043599.0,
      "step": 7300
    },
    {
      "epoch": 2.7336534909493904,
      "grad_norm": 0.6339052319526672,
      "learning_rate": 4.0272084491566247e-05,
      "loss": 0.9863,
      "mean_token_accuracy": 0.6820144325494766,
      "num_tokens": 18287793.0,
      "step": 7400
    },
    {
      "epoch": 2.7705947543405984,
      "grad_norm": 0.6206592321395874,
      "learning_rate": 4.000266693336297e-05,
      "loss": 0.9709,
      "mean_token_accuracy": 0.6874477595090867,
      "num_tokens": 18535420.0,
      "step": 7500
    },
    {
      "epoch": 2.8075360177318065,
      "grad_norm": 0.6235191822052002,
      "learning_rate": 3.973858517856019e-05,
      "loss": 0.9734,
      "mean_token_accuracy": 0.6847814846038819,
      "num_tokens": 18784286.0,
      "step": 7600
    },
    {
      "epoch": 2.8444772811230146,
      "grad_norm": 0.6308836340904236,
      "learning_rate": 3.947966539546186e-05,
      "loss": 0.9813,
      "mean_token_accuracy": 0.6831617254018784,
      "num_tokens": 19035813.0,
      "step": 7700
    },
    {
      "epoch": 2.881418544514222,
      "grad_norm": 0.6001960039138794,
      "learning_rate": 3.922574157884801e-05,
      "loss": 0.987,
      "mean_token_accuracy": 0.6807804244756699,
      "num_tokens": 19282122.0,
      "step": 7800
    },
    {
      "epoch": 2.9183598079054303,
      "grad_norm": 0.6059972643852234,
      "learning_rate": 3.8976655102673755e-05,
      "loss": 0.9859,
      "mean_token_accuracy": 0.6820109623670578,
      "num_tokens": 19529782.0,
      "step": 7900
    },
    {
      "epoch": 2.9553010712966383,
      "grad_norm": 0.640872061252594,
      "learning_rate": 3.873225430362181e-05,
      "loss": 0.9761,
      "step": 8000
    },
    {
      "epoch": 2.9553010712966383,
      "eval_loss": 1.0601364374160767,
      "eval_mean_token_accuracy": 0.6661491613388062,
      "eval_num_tokens": 19777841.0,
      "eval_runtime": 5.4066,
      "eval_samples_per_second": 184.959,
      "eval_steps_per_second": 23.12,
      "step": 8000
    },
    {
      "epoch": 2.9922423346878464,
      "grad_norm": 0.6132466793060303,
      "learning_rate": 3.8492394093024636e-05,
      "loss": 0.9711,
      "mean_token_accuracy": 0.6846305218338966,
      "num_tokens": 20023858.0,
      "step": 8100
    },
    {
      "epoch": 3.0291835980790545,
      "grad_norm": 0.5972515940666199,
      "learning_rate": 3.825693559490006e-05,
      "loss": 0.9376,
      "mean_token_accuracy": 0.692512179017067,
      "num_tokens": 20270581.0,
      "step": 8200
    },
    {
      "epoch": 3.066124861470262,
      "grad_norm": 0.6202205419540405,
      "learning_rate": 3.8025745808048846e-05,
      "loss": 0.9307,
      "mean_token_accuracy": 0.6948988193273544,
      "num_tokens": 20516819.0,
      "step": 8300
    },
    {
      "epoch": 3.10306612486147,
      "grad_norm": 0.585482120513916,
      "learning_rate": 3.779869729034645e-05,
      "loss": 0.935,
      "mean_token_accuracy": 0.6954682809114456,
      "num_tokens": 20763559.0,
      "step": 8400
    },
    {
      "epoch": 3.1400073882526782,
      "grad_norm": 0.6358840465545654,
      "learning_rate": 3.7575667863526335e-05,
      "loss": 0.9292,
      "mean_token_accuracy": 0.6980463570356369,
      "num_tokens": 21011357.0,
      "step": 8500
    },
    {
      "epoch": 3.1769486516438863,
      "grad_norm": 0.6181186437606812,
      "learning_rate": 3.735654033690154e-05,
      "loss": 0.9229,
      "mean_token_accuracy": 0.696455385684967,
      "num_tokens": 21259566.0,
      "step": 8600
    },
    {
      "epoch": 3.2138899150350944,
      "grad_norm": 0.6604560017585754,
      "learning_rate": 3.7141202248604964e-05,
      "loss": 0.9285,
      "mean_token_accuracy": 0.6962138444185257,
      "num_tokens": 21506213.0,
      "step": 8700
    },
    {
      "epoch": 3.250831178426302,
      "grad_norm": 0.5987362265586853,
      "learning_rate": 3.6929545623050815e-05,
      "loss": 0.929,
      "mean_token_accuracy": 0.6957518076896667,
      "num_tokens": 21754908.0,
      "step": 8800
    },
    {
      "epoch": 3.28777244181751,
      "grad_norm": 0.5980191230773926,
      "learning_rate": 3.6721466743428706e-05,
      "loss": 0.938,
      "mean_token_accuracy": 0.6953296983242034,
      "num_tokens": 22001522.0,
      "step": 8900
    },
    {
      "epoch": 3.324713705208718,
      "grad_norm": 0.6171393990516663,
      "learning_rate": 3.6516865938141736e-05,
      "loss": 0.9364,
      "step": 9000
    },
    {
      "epoch": 3.324713705208718,
      "eval_loss": 1.0706262588500977,
      "eval_mean_token_accuracy": 0.6642319107055664,
      "eval_num_tokens": 22248126.0,
      "eval_runtime": 5.3921,
      "eval_samples_per_second": 185.455,
      "eval_steps_per_second": 23.182,
      "step": 9000
    },
    {
      "epoch": 3.361654968599926,
      "grad_norm": 0.6272869110107422,
      "learning_rate": 3.6315647380189556e-05,
      "loss": 0.919,
      "mean_token_accuracy": 0.6962304222583771,
      "num_tokens": 22494098.0,
      "step": 9100
    },
    {
      "epoch": 3.3985962319911343,
      "grad_norm": 0.6461876034736633,
      "learning_rate": 3.611771889857922e-05,
      "loss": 0.9331,
      "mean_token_accuracy": 0.6959864324331284,
      "num_tokens": 22741819.0,
      "step": 9200
    },
    {
      "epoch": 3.4355374953823423,
      "grad_norm": 0.5855452418327332,
      "learning_rate": 3.592299180092082e-05,
      "loss": 0.9283,
      "mean_token_accuracy": 0.6968681657314301,
      "num_tokens": 22988621.0,
      "step": 9300
    },
    {
      "epoch": 3.47247875877355,
      "grad_norm": 0.6252483129501343,
      "learning_rate": 3.573138070643225e-05,
      "loss": 0.9271,
      "mean_token_accuracy": 0.6972170048952102,
      "num_tokens": 23235818.0,
      "step": 9400
    },
    {
      "epoch": 3.509420022164758,
      "grad_norm": 0.6152076125144958,
      "learning_rate": 3.554280338863896e-05,
      "loss": 0.9244,
      "mean_token_accuracy": 0.699801824092865,
      "num_tokens": 23483243.0,
      "step": 9500
    },
    {
      "epoch": 3.546361285555966,
      "grad_norm": 0.6049486398696899,
      "learning_rate": 3.535718062711045e-05,
      "loss": 0.9365,
      "mean_token_accuracy": 0.6951554995775223,
      "num_tokens": 23730501.0,
      "step": 9600
    },
    {
      "epoch": 3.583302548947174,
      "grad_norm": 0.6222932934761047,
      "learning_rate": 3.517443606762636e-05,
      "loss": 0.9374,
      "mean_token_accuracy": 0.6915060871839523,
      "num_tokens": 23978205.0,
      "step": 9700
    },
    {
      "epoch": 3.6202438123383818,
      "grad_norm": 0.6344577670097351,
      "learning_rate": 3.499449609021135e-05,
      "loss": 0.9252,
      "mean_token_accuracy": 0.6969369679689408,
      "num_tokens": 24224944.0,
      "step": 9800
    },
    {
      "epoch": 3.65718507572959,
      "grad_norm": 0.600951611995697,
      "learning_rate": 3.4817289684521056e-05,
      "loss": 0.9226,
      "mean_token_accuracy": 0.6991156005859375,
      "num_tokens": 24470936.0,
      "step": 9900
    },
    {
      "epoch": 3.694126339120798,
      "grad_norm": 0.6917585134506226,
      "learning_rate": 3.4642748332099756e-05,
      "loss": 0.935,
      "step": 10000
    },
    {
      "epoch": 3.694126339120798,
      "eval_loss": 1.0667781829833984,
      "eval_mean_token_accuracy": 0.6657101097106933,
      "eval_num_tokens": 24717605.0,
      "eval_runtime": 5.4034,
      "eval_samples_per_second": 185.07,
      "eval_steps_per_second": 23.134,
      "step": 10000
    }
  ],
  "logging_steps": 100,
  "max_steps": 10000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9870782814052352e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}