Instructions to use eac123/clean-subliminal-learning-phoenixes with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/clean-subliminal-learning-phoenixes with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/clean-subliminal-learning-phoenixes")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 804, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1336015462875366, | |
| "epoch": 0.003738317757009346, | |
| "grad_norm": 0.4115395247936249, | |
| "learning_rate": 0.0002, | |
| "loss": 2.4710798263549805, | |
| "mean_token_accuracy": 0.5324664115905762, | |
| "num_tokens": 16496.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2463930547237396, | |
| "epoch": 0.007476635514018692, | |
| "grad_norm": 0.3692863881587982, | |
| "learning_rate": 0.0002, | |
| "loss": 2.165541648864746, | |
| "mean_token_accuracy": 0.5610552132129669, | |
| "num_tokens": 32901.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.4113854467868805, | |
| "epoch": 0.011214953271028037, | |
| "grad_norm": 0.2915845811367035, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7357215881347656, | |
| "mean_token_accuracy": 0.5886629670858383, | |
| "num_tokens": 49245.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.379658043384552, | |
| "epoch": 0.014953271028037384, | |
| "grad_norm": 0.23361942172050476, | |
| "learning_rate": 0.0002, | |
| "loss": 1.410735011100769, | |
| "mean_token_accuracy": 0.6355755776166916, | |
| "num_tokens": 65811.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.3623565435409546, | |
| "epoch": 0.018691588785046728, | |
| "grad_norm": 0.26191750168800354, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2986161708831787, | |
| "mean_token_accuracy": 0.6415031999349594, | |
| "num_tokens": 82189.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.2727859914302826, | |
| "epoch": 0.022429906542056073, | |
| "grad_norm": 0.1533316969871521, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1948474645614624, | |
| "mean_token_accuracy": 0.6546026170253754, | |
| "num_tokens": 98489.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.2184827625751495, | |
| "epoch": 0.026168224299065422, | |
| "grad_norm": 0.10424298793077469, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1188591718673706, | |
| "mean_token_accuracy": 0.6631771177053452, | |
| "num_tokens": 114851.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.1237380504608154, | |
| "epoch": 0.029906542056074768, | |
| "grad_norm": 0.10689449310302734, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0371830463409424, | |
| "mean_token_accuracy": 0.6718492060899734, | |
| "num_tokens": 131220.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.0455615520477295, | |
| "epoch": 0.03364485981308411, | |
| "grad_norm": 0.12944048643112183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9913585782051086, | |
| "mean_token_accuracy": 0.6828599572181702, | |
| "num_tokens": 147616.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9801072925329208, | |
| "epoch": 0.037383177570093455, | |
| "grad_norm": 0.1291113793849945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9284825325012207, | |
| "mean_token_accuracy": 0.7001921981573105, | |
| "num_tokens": 164002.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.953565314412117, | |
| "epoch": 0.041121495327102804, | |
| "grad_norm": 0.10645624995231628, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8795915842056274, | |
| "mean_token_accuracy": 0.7043117135763168, | |
| "num_tokens": 180220.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.9155157953500748, | |
| "epoch": 0.044859813084112146, | |
| "grad_norm": 0.11287244409322739, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8326205015182495, | |
| "mean_token_accuracy": 0.7109687179327011, | |
| "num_tokens": 196521.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.8468948155641556, | |
| "epoch": 0.048598130841121495, | |
| "grad_norm": 0.10245727747678757, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8009377121925354, | |
| "mean_token_accuracy": 0.7149728387594223, | |
| "num_tokens": 212778.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.7708506435155869, | |
| "epoch": 0.052336448598130844, | |
| "grad_norm": 0.09908365458250046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7473602890968323, | |
| "mean_token_accuracy": 0.7281823754310608, | |
| "num_tokens": 228942.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7574831545352936, | |
| "epoch": 0.056074766355140186, | |
| "grad_norm": 0.10171845555305481, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7353494167327881, | |
| "mean_token_accuracy": 0.7308090776205063, | |
| "num_tokens": 245256.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.6849008500576019, | |
| "epoch": 0.059813084112149535, | |
| "grad_norm": 0.08664627373218536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6817273497581482, | |
| "mean_token_accuracy": 0.7445196211338043, | |
| "num_tokens": 261288.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.6784532964229584, | |
| "epoch": 0.06355140186915888, | |
| "grad_norm": 0.08904161304235458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6835237741470337, | |
| "mean_token_accuracy": 0.7402277588844299, | |
| "num_tokens": 277473.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6737232953310013, | |
| "epoch": 0.06728971962616823, | |
| "grad_norm": 0.08908089250326157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6696494817733765, | |
| "mean_token_accuracy": 0.7452213168144226, | |
| "num_tokens": 293986.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.676809772849083, | |
| "epoch": 0.07102803738317758, | |
| "grad_norm": 0.08826066553592682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6623877286911011, | |
| "mean_token_accuracy": 0.747529536485672, | |
| "num_tokens": 310269.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.6532965898513794, | |
| "epoch": 0.07476635514018691, | |
| "grad_norm": 0.08917281031608582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6443736553192139, | |
| "mean_token_accuracy": 0.7480695396661758, | |
| "num_tokens": 326491.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6552709937095642, | |
| "epoch": 0.07850467289719626, | |
| "grad_norm": 0.08073496073484421, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6399368643760681, | |
| "mean_token_accuracy": 0.7507821917533875, | |
| "num_tokens": 342841.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.6378396600484848, | |
| "epoch": 0.08224299065420561, | |
| "grad_norm": 0.063417449593544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6258761882781982, | |
| "mean_token_accuracy": 0.7539727091789246, | |
| "num_tokens": 359584.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.6046861261129379, | |
| "epoch": 0.08598130841121496, | |
| "grad_norm": 0.06905008107423782, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6049938201904297, | |
| "mean_token_accuracy": 0.7625735104084015, | |
| "num_tokens": 375502.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.6043607741594315, | |
| "epoch": 0.08971962616822429, | |
| "grad_norm": 0.0712490975856781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6081230640411377, | |
| "mean_token_accuracy": 0.761991336941719, | |
| "num_tokens": 391668.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.5921229273080826, | |
| "epoch": 0.09345794392523364, | |
| "grad_norm": 0.06059383973479271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5966373682022095, | |
| "mean_token_accuracy": 0.7640610188245773, | |
| "num_tokens": 408064.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.6013955473899841, | |
| "epoch": 0.09719626168224299, | |
| "grad_norm": 0.05800875276327133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6032594442367554, | |
| "mean_token_accuracy": 0.7606146037578583, | |
| "num_tokens": 424308.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.6059402525424957, | |
| "epoch": 0.10093457943925234, | |
| "grad_norm": 0.05799295753240585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6014454960823059, | |
| "mean_token_accuracy": 0.7633127868175507, | |
| "num_tokens": 440626.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.6059208810329437, | |
| "epoch": 0.10467289719626169, | |
| "grad_norm": 0.06835797429084778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5960400104522705, | |
| "mean_token_accuracy": 0.7644040137529373, | |
| "num_tokens": 457127.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.6063490360975266, | |
| "epoch": 0.10841121495327102, | |
| "grad_norm": 0.08442196249961853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5988196730613708, | |
| "mean_token_accuracy": 0.7642622292041779, | |
| "num_tokens": 473449.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.6044150143861771, | |
| "epoch": 0.11214953271028037, | |
| "grad_norm": 0.05611753463745117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5849661231040955, | |
| "mean_token_accuracy": 0.7694830596446991, | |
| "num_tokens": 489953.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5886638015508652, | |
| "epoch": 0.11588785046728972, | |
| "grad_norm": 0.055090922862291336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5829939842224121, | |
| "mean_token_accuracy": 0.769635483622551, | |
| "num_tokens": 506414.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.5746142864227295, | |
| "epoch": 0.11962616822429907, | |
| "grad_norm": 0.049661796540021896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5790735483169556, | |
| "mean_token_accuracy": 0.7714909315109253, | |
| "num_tokens": 522742.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5767629146575928, | |
| "epoch": 0.1233644859813084, | |
| "grad_norm": 0.04847181588411331, | |
| "learning_rate": 0.0002, | |
| "loss": 0.580193281173706, | |
| "mean_token_accuracy": 0.7714395672082901, | |
| "num_tokens": 539199.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5745265781879425, | |
| "epoch": 0.12710280373831775, | |
| "grad_norm": 0.05860326439142227, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5901641845703125, | |
| "mean_token_accuracy": 0.7679091691970825, | |
| "num_tokens": 555326.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.567798376083374, | |
| "epoch": 0.1308411214953271, | |
| "grad_norm": 0.05234525725245476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5799325704574585, | |
| "mean_token_accuracy": 0.766155481338501, | |
| "num_tokens": 571808.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.5698586851358414, | |
| "epoch": 0.13457943925233645, | |
| "grad_norm": 0.041219986975193024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.573387086391449, | |
| "mean_token_accuracy": 0.769883319735527, | |
| "num_tokens": 588161.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5851186513900757, | |
| "epoch": 0.1383177570093458, | |
| "grad_norm": 0.04337616264820099, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5821909308433533, | |
| "mean_token_accuracy": 0.7661230564117432, | |
| "num_tokens": 604598.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5961429327726364, | |
| "epoch": 0.14205607476635515, | |
| "grad_norm": 0.05468963831663132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5940048098564148, | |
| "mean_token_accuracy": 0.7601669579744339, | |
| "num_tokens": 620746.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5826456397771835, | |
| "epoch": 0.14579439252336449, | |
| "grad_norm": 0.047812167555093765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5687558054924011, | |
| "mean_token_accuracy": 0.771986335515976, | |
| "num_tokens": 637151.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5903666168451309, | |
| "epoch": 0.14953271028037382, | |
| "grad_norm": 0.044994354248046875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5762028098106384, | |
| "mean_token_accuracy": 0.7677688300609589, | |
| "num_tokens": 653530.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5751803368330002, | |
| "epoch": 0.15327102803738318, | |
| "grad_norm": 0.04342395439743996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5721427798271179, | |
| "mean_token_accuracy": 0.7731492966413498, | |
| "num_tokens": 669957.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5582813173532486, | |
| "epoch": 0.15700934579439252, | |
| "grad_norm": 0.05154528096318245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5713383555412292, | |
| "mean_token_accuracy": 0.7701951861381531, | |
| "num_tokens": 685933.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5747530311346054, | |
| "epoch": 0.16074766355140188, | |
| "grad_norm": 0.05052989348769188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5861970782279968, | |
| "mean_token_accuracy": 0.7652492970228195, | |
| "num_tokens": 702131.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5861315429210663, | |
| "epoch": 0.16448598130841122, | |
| "grad_norm": 0.043960776180028915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5891501903533936, | |
| "mean_token_accuracy": 0.7628277689218521, | |
| "num_tokens": 718330.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5868926346302032, | |
| "epoch": 0.16822429906542055, | |
| "grad_norm": 0.035861797630786896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5814363360404968, | |
| "mean_token_accuracy": 0.7670950144529343, | |
| "num_tokens": 734754.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5696061849594116, | |
| "epoch": 0.17196261682242991, | |
| "grad_norm": 0.03567943349480629, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582084655761719, | |
| "mean_token_accuracy": 0.7754767388105392, | |
| "num_tokens": 750952.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5884592086076736, | |
| "epoch": 0.17570093457943925, | |
| "grad_norm": 0.04051043465733528, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5837826132774353, | |
| "mean_token_accuracy": 0.7652305215597153, | |
| "num_tokens": 767136.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.568819597363472, | |
| "epoch": 0.17943925233644858, | |
| "grad_norm": 0.04234869405627251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5664035081863403, | |
| "mean_token_accuracy": 0.7719341665506363, | |
| "num_tokens": 783513.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.553595632314682, | |
| "epoch": 0.18317757009345795, | |
| "grad_norm": 0.04170480743050575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564354658126831, | |
| "mean_token_accuracy": 0.7749540507793427, | |
| "num_tokens": 799703.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5621031820774078, | |
| "epoch": 0.18691588785046728, | |
| "grad_norm": 0.042460180819034576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.576507568359375, | |
| "mean_token_accuracy": 0.7702780216932297, | |
| "num_tokens": 815979.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5803797841072083, | |
| "epoch": 0.19065420560747665, | |
| "grad_norm": 0.036130718886852264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5826534628868103, | |
| "mean_token_accuracy": 0.767243430018425, | |
| "num_tokens": 832435.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5492766499519348, | |
| "epoch": 0.19439252336448598, | |
| "grad_norm": 0.04120517149567604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535300374031067, | |
| "mean_token_accuracy": 0.7766350656747818, | |
| "num_tokens": 848601.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5690171420574188, | |
| "epoch": 0.19813084112149532, | |
| "grad_norm": 0.03631429374217987, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688353776931763, | |
| "mean_token_accuracy": 0.7699357271194458, | |
| "num_tokens": 864779.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.5830478370189667, | |
| "epoch": 0.20186915887850468, | |
| "grad_norm": 0.03915117308497429, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5719392895698547, | |
| "mean_token_accuracy": 0.7702472358942032, | |
| "num_tokens": 881366.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5905578434467316, | |
| "epoch": 0.205607476635514, | |
| "grad_norm": 0.038457099348306656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5855496525764465, | |
| "mean_token_accuracy": 0.7646182626485825, | |
| "num_tokens": 897955.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5837848633527756, | |
| "epoch": 0.20934579439252338, | |
| "grad_norm": 0.04033343121409416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5784925222396851, | |
| "mean_token_accuracy": 0.7649644762277603, | |
| "num_tokens": 914164.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5470199286937714, | |
| "epoch": 0.2130841121495327, | |
| "grad_norm": 0.036680735647678375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427253246307373, | |
| "mean_token_accuracy": 0.7822186052799225, | |
| "num_tokens": 930444.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.5544598549604416, | |
| "epoch": 0.21682242990654205, | |
| "grad_norm": 0.04701124131679535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.569618821144104, | |
| "mean_token_accuracy": 0.771122008562088, | |
| "num_tokens": 946567.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5725786834955215, | |
| "epoch": 0.2205607476635514, | |
| "grad_norm": 0.04193125665187836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5894483923912048, | |
| "mean_token_accuracy": 0.7642552405595779, | |
| "num_tokens": 962894.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5668687969446182, | |
| "epoch": 0.22429906542056074, | |
| "grad_norm": 0.033951517194509506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5699459314346313, | |
| "mean_token_accuracy": 0.7729462385177612, | |
| "num_tokens": 979210.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5792391896247864, | |
| "epoch": 0.22803738317757008, | |
| "grad_norm": 0.041912537068128586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5683349370956421, | |
| "mean_token_accuracy": 0.7706285119056702, | |
| "num_tokens": 995540.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5809753388166428, | |
| "epoch": 0.23177570093457944, | |
| "grad_norm": 0.036393389105796814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5727679133415222, | |
| "mean_token_accuracy": 0.7684315294027328, | |
| "num_tokens": 1011805.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5670438855886459, | |
| "epoch": 0.23551401869158878, | |
| "grad_norm": 0.03674926608800888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604680776596069, | |
| "mean_token_accuracy": 0.7723257541656494, | |
| "num_tokens": 1028009.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5653442144393921, | |
| "epoch": 0.23925233644859814, | |
| "grad_norm": 0.03534647822380066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5580601096153259, | |
| "mean_token_accuracy": 0.7755836397409439, | |
| "num_tokens": 1044521.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5762730091810226, | |
| "epoch": 0.24299065420560748, | |
| "grad_norm": 0.03369547426700592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5709710121154785, | |
| "mean_token_accuracy": 0.7710799872875214, | |
| "num_tokens": 1060984.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.56136754155159, | |
| "epoch": 0.2467289719626168, | |
| "grad_norm": 0.050162531435489655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5662704706192017, | |
| "mean_token_accuracy": 0.7702763229608536, | |
| "num_tokens": 1077512.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5493937730789185, | |
| "epoch": 0.2504672897196262, | |
| "grad_norm": 0.0446079783141613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563389778137207, | |
| "mean_token_accuracy": 0.7724475711584091, | |
| "num_tokens": 1093860.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5527212023735046, | |
| "epoch": 0.2542056074766355, | |
| "grad_norm": 0.04445589333772659, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553238034248352, | |
| "mean_token_accuracy": 0.777790442109108, | |
| "num_tokens": 1109927.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5742960721254349, | |
| "epoch": 0.25794392523364484, | |
| "grad_norm": 0.03155473247170448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5755714774131775, | |
| "mean_token_accuracy": 0.7682003676891327, | |
| "num_tokens": 1126507.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.570902407169342, | |
| "epoch": 0.2616822429906542, | |
| "grad_norm": 0.03776158019900322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5687341094017029, | |
| "mean_token_accuracy": 0.7690709233283997, | |
| "num_tokens": 1142690.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5869749188423157, | |
| "epoch": 0.26542056074766357, | |
| "grad_norm": 0.03637450933456421, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5745267271995544, | |
| "mean_token_accuracy": 0.7675913572311401, | |
| "num_tokens": 1158998.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5770464688539505, | |
| "epoch": 0.2691588785046729, | |
| "grad_norm": 0.03824329748749733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5806713104248047, | |
| "mean_token_accuracy": 0.765295684337616, | |
| "num_tokens": 1175369.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5496443659067154, | |
| "epoch": 0.27289719626168224, | |
| "grad_norm": 0.03833479806780815, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552317202091217, | |
| "mean_token_accuracy": 0.7775600254535675, | |
| "num_tokens": 1191776.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5672993659973145, | |
| "epoch": 0.2766355140186916, | |
| "grad_norm": 0.035141605883836746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5738911032676697, | |
| "mean_token_accuracy": 0.769673228263855, | |
| "num_tokens": 1208289.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5747457444667816, | |
| "epoch": 0.2803738317757009, | |
| "grad_norm": 0.03779706731438637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.580111026763916, | |
| "mean_token_accuracy": 0.7651933431625366, | |
| "num_tokens": 1224804.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.5685230642557144, | |
| "epoch": 0.2841121495327103, | |
| "grad_norm": 0.03369152173399925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.571203351020813, | |
| "mean_token_accuracy": 0.7706969380378723, | |
| "num_tokens": 1240994.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5724664479494095, | |
| "epoch": 0.28785046728971964, | |
| "grad_norm": 0.03279148414731026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5703553557395935, | |
| "mean_token_accuracy": 0.7710930705070496, | |
| "num_tokens": 1257180.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.570750430226326, | |
| "epoch": 0.29158878504672897, | |
| "grad_norm": 0.035474326461553574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.57155442237854, | |
| "mean_token_accuracy": 0.7676969021558762, | |
| "num_tokens": 1273176.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.5746997892856598, | |
| "epoch": 0.2953271028037383, | |
| "grad_norm": 0.03326554223895073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5764865279197693, | |
| "mean_token_accuracy": 0.7667145133018494, | |
| "num_tokens": 1289572.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5560239851474762, | |
| "epoch": 0.29906542056074764, | |
| "grad_norm": 0.033652499318122864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541852712631226, | |
| "mean_token_accuracy": 0.7752721756696701, | |
| "num_tokens": 1305646.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5700062215328217, | |
| "epoch": 0.30280373831775703, | |
| "grad_norm": 0.036336466670036316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5715289115905762, | |
| "mean_token_accuracy": 0.7702216506004333, | |
| "num_tokens": 1322328.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5599597245454788, | |
| "epoch": 0.30654205607476637, | |
| "grad_norm": 0.032290052622556686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614467859268188, | |
| "mean_token_accuracy": 0.7732760310173035, | |
| "num_tokens": 1338359.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5446556061506271, | |
| "epoch": 0.3102803738317757, | |
| "grad_norm": 0.03226450830698013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512461066246033, | |
| "mean_token_accuracy": 0.7779420912265778, | |
| "num_tokens": 1354321.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5505060404539108, | |
| "epoch": 0.31401869158878504, | |
| "grad_norm": 0.035315077751874924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553967952728271, | |
| "mean_token_accuracy": 0.7761841863393784, | |
| "num_tokens": 1370409.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5602358281612396, | |
| "epoch": 0.3177570093457944, | |
| "grad_norm": 0.031360018998384476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553810596466064, | |
| "mean_token_accuracy": 0.7750610113143921, | |
| "num_tokens": 1386951.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5592145472764969, | |
| "epoch": 0.32149532710280376, | |
| "grad_norm": 0.03307170048356056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547728538513184, | |
| "mean_token_accuracy": 0.7769513875246048, | |
| "num_tokens": 1403318.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5478426665067673, | |
| "epoch": 0.3252336448598131, | |
| "grad_norm": 0.03468095511198044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475176572799683, | |
| "mean_token_accuracy": 0.7787642478942871, | |
| "num_tokens": 1419588.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5575945675373077, | |
| "epoch": 0.32897196261682243, | |
| "grad_norm": 0.0372730977833271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592425465583801, | |
| "mean_token_accuracy": 0.7753143310546875, | |
| "num_tokens": 1435879.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5516618192195892, | |
| "epoch": 0.33271028037383177, | |
| "grad_norm": 0.03459680825471878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590015649795532, | |
| "mean_token_accuracy": 0.7763092070817947, | |
| "num_tokens": 1452255.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5537828356027603, | |
| "epoch": 0.3364485981308411, | |
| "grad_norm": 0.037478331476449966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628093481063843, | |
| "mean_token_accuracy": 0.7731254547834396, | |
| "num_tokens": 1468440.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5597833395004272, | |
| "epoch": 0.3401869158878505, | |
| "grad_norm": 0.03566694259643555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576118230819702, | |
| "mean_token_accuracy": 0.7733734101057053, | |
| "num_tokens": 1484803.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5624473690986633, | |
| "epoch": 0.34392523364485983, | |
| "grad_norm": 0.038208235055208206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643529891967773, | |
| "mean_token_accuracy": 0.773946151137352, | |
| "num_tokens": 1500849.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5809104889631271, | |
| "epoch": 0.34766355140186916, | |
| "grad_norm": 0.03173667564988136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5739686489105225, | |
| "mean_token_accuracy": 0.7694463729858398, | |
| "num_tokens": 1517263.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5697960555553436, | |
| "epoch": 0.3514018691588785, | |
| "grad_norm": 0.03167756646871567, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5665271878242493, | |
| "mean_token_accuracy": 0.7699908316135406, | |
| "num_tokens": 1533648.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5966296941041946, | |
| "epoch": 0.35514018691588783, | |
| "grad_norm": 0.036720361560583115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5901257395744324, | |
| "mean_token_accuracy": 0.7647226899862289, | |
| "num_tokens": 1550084.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5599866956472397, | |
| "epoch": 0.35887850467289717, | |
| "grad_norm": 0.03618223965167999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5656697750091553, | |
| "mean_token_accuracy": 0.7732058614492416, | |
| "num_tokens": 1566526.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.5660023838281631, | |
| "epoch": 0.36261682242990656, | |
| "grad_norm": 0.037616875022649765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5731638669967651, | |
| "mean_token_accuracy": 0.7681225687265396, | |
| "num_tokens": 1582887.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5692461878061295, | |
| "epoch": 0.3663551401869159, | |
| "grad_norm": 0.04291412979364395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5790476202964783, | |
| "mean_token_accuracy": 0.7658884823322296, | |
| "num_tokens": 1599367.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5626956224441528, | |
| "epoch": 0.37009345794392523, | |
| "grad_norm": 0.03269932419061661, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623303651809692, | |
| "mean_token_accuracy": 0.7726950198411942, | |
| "num_tokens": 1615716.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5417574644088745, | |
| "epoch": 0.37383177570093457, | |
| "grad_norm": 0.029643645510077477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503037571907043, | |
| "mean_token_accuracy": 0.7786638289690018, | |
| "num_tokens": 1631985.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5644317716360092, | |
| "epoch": 0.3775700934579439, | |
| "grad_norm": 0.03810103237628937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641601085662842, | |
| "mean_token_accuracy": 0.7715529501438141, | |
| "num_tokens": 1648148.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5648799985647202, | |
| "epoch": 0.3813084112149533, | |
| "grad_norm": 0.02914907969534397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619527101516724, | |
| "mean_token_accuracy": 0.7744928747415543, | |
| "num_tokens": 1664554.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5753660798072815, | |
| "epoch": 0.3850467289719626, | |
| "grad_norm": 0.02887723594903946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688785314559937, | |
| "mean_token_accuracy": 0.7692504674196243, | |
| "num_tokens": 1680782.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.561363086104393, | |
| "epoch": 0.38878504672897196, | |
| "grad_norm": 0.028774583712220192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560323178768158, | |
| "mean_token_accuracy": 0.7716943174600601, | |
| "num_tokens": 1696855.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5558189004659653, | |
| "epoch": 0.3925233644859813, | |
| "grad_norm": 0.030897047370672226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627227425575256, | |
| "mean_token_accuracy": 0.7728832811117172, | |
| "num_tokens": 1713092.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5579479783773422, | |
| "epoch": 0.39626168224299063, | |
| "grad_norm": 0.03168272599577904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611063241958618, | |
| "mean_token_accuracy": 0.7737848162651062, | |
| "num_tokens": 1729174.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5593132227659225, | |
| "epoch": 0.4, | |
| "grad_norm": 0.030001681298017502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634371638298035, | |
| "mean_token_accuracy": 0.7737011611461639, | |
| "num_tokens": 1745387.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5454982221126556, | |
| "epoch": 0.40373831775700936, | |
| "grad_norm": 0.033263012766838074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490332841873169, | |
| "mean_token_accuracy": 0.7772792428731918, | |
| "num_tokens": 1761446.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5551732182502747, | |
| "epoch": 0.4074766355140187, | |
| "grad_norm": 0.030698338523507118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535954236984253, | |
| "mean_token_accuracy": 0.773947462439537, | |
| "num_tokens": 1778105.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.5650522261857986, | |
| "epoch": 0.411214953271028, | |
| "grad_norm": 0.02939177118241787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615048408508301, | |
| "mean_token_accuracy": 0.7712746411561966, | |
| "num_tokens": 1794562.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5696343183517456, | |
| "epoch": 0.41495327102803736, | |
| "grad_norm": 0.03011537715792656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706506967544556, | |
| "mean_token_accuracy": 0.7699969708919525, | |
| "num_tokens": 1810779.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.5387005656957626, | |
| "epoch": 0.41869158878504675, | |
| "grad_norm": 0.033464495092630386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423218607902527, | |
| "mean_token_accuracy": 0.7795679718255997, | |
| "num_tokens": 1827208.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5597733706235886, | |
| "epoch": 0.4224299065420561, | |
| "grad_norm": 0.029017142951488495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561181306838989, | |
| "mean_token_accuracy": 0.7743376046419144, | |
| "num_tokens": 1843649.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5541809946298599, | |
| "epoch": 0.4261682242990654, | |
| "grad_norm": 0.030042298138141632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544824600219727, | |
| "mean_token_accuracy": 0.7773302495479584, | |
| "num_tokens": 1859919.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.5697837471961975, | |
| "epoch": 0.42990654205607476, | |
| "grad_norm": 0.029710182920098305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5684210658073425, | |
| "mean_token_accuracy": 0.7717447876930237, | |
| "num_tokens": 1876288.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5591758489608765, | |
| "epoch": 0.4336448598130841, | |
| "grad_norm": 0.031515248119831085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618751645088196, | |
| "mean_token_accuracy": 0.77419513463974, | |
| "num_tokens": 1892685.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5360209345817566, | |
| "epoch": 0.4373831775700935, | |
| "grad_norm": 0.036333996802568436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519132614135742, | |
| "mean_token_accuracy": 0.77690489590168, | |
| "num_tokens": 1908983.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5584719926118851, | |
| "epoch": 0.4411214953271028, | |
| "grad_norm": 0.03057498298585415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5668904185295105, | |
| "mean_token_accuracy": 0.7719320356845856, | |
| "num_tokens": 1925134.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5634136199951172, | |
| "epoch": 0.44485981308411215, | |
| "grad_norm": 0.038503021001815796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522302389144897, | |
| "mean_token_accuracy": 0.7777165621519089, | |
| "num_tokens": 1941319.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5695697367191315, | |
| "epoch": 0.4485981308411215, | |
| "grad_norm": 0.02690051682293415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623375773429871, | |
| "mean_token_accuracy": 0.7749422192573547, | |
| "num_tokens": 1957576.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5670370161533356, | |
| "epoch": 0.4523364485981308, | |
| "grad_norm": 0.030103027820587158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645368695259094, | |
| "mean_token_accuracy": 0.7715286463499069, | |
| "num_tokens": 1973598.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5673844367265701, | |
| "epoch": 0.45607476635514016, | |
| "grad_norm": 0.03927698731422424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5738642811775208, | |
| "mean_token_accuracy": 0.7676763832569122, | |
| "num_tokens": 1989896.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5642601549625397, | |
| "epoch": 0.45981308411214955, | |
| "grad_norm": 0.040063194930553436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5772222280502319, | |
| "mean_token_accuracy": 0.7651336938142776, | |
| "num_tokens": 2006217.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5646145790815353, | |
| "epoch": 0.4635514018691589, | |
| "grad_norm": 0.02972179837524891, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596722960472107, | |
| "mean_token_accuracy": 0.7738584727048874, | |
| "num_tokens": 2022407.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5680184960365295, | |
| "epoch": 0.4672897196261682, | |
| "grad_norm": 0.03161488473415375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569790601730347, | |
| "mean_token_accuracy": 0.7752905040979385, | |
| "num_tokens": 2038990.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5721628367900848, | |
| "epoch": 0.47102803738317756, | |
| "grad_norm": 0.03150559216737747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56056147813797, | |
| "mean_token_accuracy": 0.7753510475158691, | |
| "num_tokens": 2055485.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.5526139587163925, | |
| "epoch": 0.4747663551401869, | |
| "grad_norm": 0.02876976877450943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555187463760376, | |
| "mean_token_accuracy": 0.7740543335676193, | |
| "num_tokens": 2071792.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.542378157377243, | |
| "epoch": 0.4785046728971963, | |
| "grad_norm": 0.03460092097520828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530366897583008, | |
| "mean_token_accuracy": 0.7747022658586502, | |
| "num_tokens": 2087874.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5451681464910507, | |
| "epoch": 0.4822429906542056, | |
| "grad_norm": 0.02991570346057415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549987256526947, | |
| "mean_token_accuracy": 0.7774564474821091, | |
| "num_tokens": 2104238.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5554285645484924, | |
| "epoch": 0.48598130841121495, | |
| "grad_norm": 0.0326702855527401, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605641603469849, | |
| "mean_token_accuracy": 0.7726142853498459, | |
| "num_tokens": 2120477.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.555129811167717, | |
| "epoch": 0.4897196261682243, | |
| "grad_norm": 0.031020283699035645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525497198104858, | |
| "mean_token_accuracy": 0.7749627828598022, | |
| "num_tokens": 2136857.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5660799294710159, | |
| "epoch": 0.4934579439252336, | |
| "grad_norm": 0.03083673305809498, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555440187454224, | |
| "mean_token_accuracy": 0.7719593346118927, | |
| "num_tokens": 2153526.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5561708807945251, | |
| "epoch": 0.497196261682243, | |
| "grad_norm": 0.031476520001888275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555605411529541, | |
| "mean_token_accuracy": 0.7762354910373688, | |
| "num_tokens": 2169651.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.525283932685852, | |
| "epoch": 0.5009345794392523, | |
| "grad_norm": 0.03160262852907181, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320227742195129, | |
| "mean_token_accuracy": 0.7818241119384766, | |
| "num_tokens": 2185700.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5597178190946579, | |
| "epoch": 0.5046728971962616, | |
| "grad_norm": 0.03169814869761467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603609681129456, | |
| "mean_token_accuracy": 0.7734936475753784, | |
| "num_tokens": 2201832.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5660498142242432, | |
| "epoch": 0.508411214953271, | |
| "grad_norm": 0.03322802484035492, | |
| "learning_rate": 0.0002, | |
| "loss": 0.570435643196106, | |
| "mean_token_accuracy": 0.7702528983354568, | |
| "num_tokens": 2218197.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.5471976101398468, | |
| "epoch": 0.5121495327102804, | |
| "grad_norm": 0.031250759959220886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555264353752136, | |
| "mean_token_accuracy": 0.7744151949882507, | |
| "num_tokens": 2234366.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5514054894447327, | |
| "epoch": 0.5158878504672897, | |
| "grad_norm": 0.026281429454684258, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531660318374634, | |
| "mean_token_accuracy": 0.7755394726991653, | |
| "num_tokens": 2250665.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5651220381259918, | |
| "epoch": 0.5196261682242991, | |
| "grad_norm": 0.031022025272250175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564669132232666, | |
| "mean_token_accuracy": 0.773309201002121, | |
| "num_tokens": 2266978.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5677877366542816, | |
| "epoch": 0.5233644859813084, | |
| "grad_norm": 0.030657587572932243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564283013343811, | |
| "mean_token_accuracy": 0.7711436003446579, | |
| "num_tokens": 2283321.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5454884767532349, | |
| "epoch": 0.5271028037383177, | |
| "grad_norm": 0.029621724039316177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448048710823059, | |
| "mean_token_accuracy": 0.7774412333965302, | |
| "num_tokens": 2299654.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5593066215515137, | |
| "epoch": 0.5308411214953271, | |
| "grad_norm": 0.03370071202516556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5656630992889404, | |
| "mean_token_accuracy": 0.7700357884168625, | |
| "num_tokens": 2315917.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5630017071962357, | |
| "epoch": 0.5345794392523364, | |
| "grad_norm": 0.03445977345108986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5749462842941284, | |
| "mean_token_accuracy": 0.7682285755872726, | |
| "num_tokens": 2332053.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5692644715309143, | |
| "epoch": 0.5383177570093458, | |
| "grad_norm": 0.034105394035577774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5713233351707458, | |
| "mean_token_accuracy": 0.7670455425977707, | |
| "num_tokens": 2348321.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.5742600113153458, | |
| "epoch": 0.5420560747663551, | |
| "grad_norm": 0.031007220968604088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.571353554725647, | |
| "mean_token_accuracy": 0.76962810754776, | |
| "num_tokens": 2364386.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5725259482860565, | |
| "epoch": 0.5457943925233645, | |
| "grad_norm": 0.030071116983890533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640747547149658, | |
| "mean_token_accuracy": 0.7740518748760223, | |
| "num_tokens": 2380815.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5748542249202728, | |
| "epoch": 0.5495327102803739, | |
| "grad_norm": 0.03353971987962723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5691145062446594, | |
| "mean_token_accuracy": 0.7703811824321747, | |
| "num_tokens": 2396915.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5501144975423813, | |
| "epoch": 0.5532710280373832, | |
| "grad_norm": 0.029002781957387924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473450422286987, | |
| "mean_token_accuracy": 0.7768280953168869, | |
| "num_tokens": 2412894.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5640593320131302, | |
| "epoch": 0.5570093457943925, | |
| "grad_norm": 0.0339277982711792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568105936050415, | |
| "mean_token_accuracy": 0.7686444222927094, | |
| "num_tokens": 2429333.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5358926355838776, | |
| "epoch": 0.5607476635514018, | |
| "grad_norm": 0.03321727365255356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451691150665283, | |
| "mean_token_accuracy": 0.7813747376203537, | |
| "num_tokens": 2445547.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.575822114944458, | |
| "epoch": 0.5644859813084112, | |
| "grad_norm": 0.028913335874676704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5796110033988953, | |
| "mean_token_accuracy": 0.7663715481758118, | |
| "num_tokens": 2461739.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5666410624980927, | |
| "epoch": 0.5682242990654206, | |
| "grad_norm": 0.030346350744366646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563742518424988, | |
| "mean_token_accuracy": 0.7750760018825531, | |
| "num_tokens": 2478290.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5700524747371674, | |
| "epoch": 0.5719626168224299, | |
| "grad_norm": 0.03455440327525139, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611424446105957, | |
| "mean_token_accuracy": 0.7719277888536453, | |
| "num_tokens": 2494845.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.561910405755043, | |
| "epoch": 0.5757009345794393, | |
| "grad_norm": 0.029596278443932533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637333393096924, | |
| "mean_token_accuracy": 0.771451935172081, | |
| "num_tokens": 2511497.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5496856719255447, | |
| "epoch": 0.5794392523364486, | |
| "grad_norm": 0.02896132506430149, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627070665359497, | |
| "mean_token_accuracy": 0.7726458758115768, | |
| "num_tokens": 2527582.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5563309341669083, | |
| "epoch": 0.5831775700934579, | |
| "grad_norm": 0.04145891219377518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5785839557647705, | |
| "mean_token_accuracy": 0.7629837244749069, | |
| "num_tokens": 2543948.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5635025650262833, | |
| "epoch": 0.5869158878504673, | |
| "grad_norm": 0.028125908225774765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688048005104065, | |
| "mean_token_accuracy": 0.7708674967288971, | |
| "num_tokens": 2560174.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5650362074375153, | |
| "epoch": 0.5906542056074766, | |
| "grad_norm": 0.031838495284318924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594847798347473, | |
| "mean_token_accuracy": 0.7728245556354523, | |
| "num_tokens": 2576418.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5560010820627213, | |
| "epoch": 0.594392523364486, | |
| "grad_norm": 0.03514372557401657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445454120635986, | |
| "mean_token_accuracy": 0.7787751257419586, | |
| "num_tokens": 2592454.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.552829384803772, | |
| "epoch": 0.5981308411214953, | |
| "grad_norm": 0.028390226885676384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493785738945007, | |
| "mean_token_accuracy": 0.7761707901954651, | |
| "num_tokens": 2608586.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5553926527500153, | |
| "epoch": 0.6018691588785047, | |
| "grad_norm": 0.02847958728671074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555365681648254, | |
| "mean_token_accuracy": 0.7766669541597366, | |
| "num_tokens": 2624962.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.551996037364006, | |
| "epoch": 0.6056074766355141, | |
| "grad_norm": 0.03402937948703766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557694673538208, | |
| "mean_token_accuracy": 0.7744593769311905, | |
| "num_tokens": 2641382.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5671762228012085, | |
| "epoch": 0.6093457943925233, | |
| "grad_norm": 0.03495490923523903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5758394002914429, | |
| "mean_token_accuracy": 0.7660740315914154, | |
| "num_tokens": 2657986.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5575901418924332, | |
| "epoch": 0.6130841121495327, | |
| "grad_norm": 0.03418085724115372, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5583428740501404, | |
| "mean_token_accuracy": 0.7739714235067368, | |
| "num_tokens": 2673995.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5644998699426651, | |
| "epoch": 0.616822429906542, | |
| "grad_norm": 0.028694115579128265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556347370147705, | |
| "mean_token_accuracy": 0.775534600019455, | |
| "num_tokens": 2690249.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5767987668514252, | |
| "epoch": 0.6205607476635514, | |
| "grad_norm": 0.03323300555348396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688591003417969, | |
| "mean_token_accuracy": 0.7711433321237564, | |
| "num_tokens": 2706818.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5557750165462494, | |
| "epoch": 0.6242990654205608, | |
| "grad_norm": 0.030084028840065002, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595380067825317, | |
| "mean_token_accuracy": 0.7722294181585312, | |
| "num_tokens": 2722820.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.562026247382164, | |
| "epoch": 0.6280373831775701, | |
| "grad_norm": 0.03125706687569618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637321472167969, | |
| "mean_token_accuracy": 0.7692414969205856, | |
| "num_tokens": 2739398.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5448627471923828, | |
| "epoch": 0.6317757009345795, | |
| "grad_norm": 0.03390555456280708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494401454925537, | |
| "mean_token_accuracy": 0.7776045203208923, | |
| "num_tokens": 2755453.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5523964762687683, | |
| "epoch": 0.6355140186915887, | |
| "grad_norm": 0.03687772527337074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5620272159576416, | |
| "mean_token_accuracy": 0.7718589901924133, | |
| "num_tokens": 2771533.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5672519207000732, | |
| "epoch": 0.6392523364485981, | |
| "grad_norm": 0.035152945667505264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5725542306900024, | |
| "mean_token_accuracy": 0.768815353512764, | |
| "num_tokens": 2787816.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5715326368808746, | |
| "epoch": 0.6429906542056075, | |
| "grad_norm": 0.032671887427568436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5690709352493286, | |
| "mean_token_accuracy": 0.7705206274986267, | |
| "num_tokens": 2804253.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5771492570638657, | |
| "epoch": 0.6467289719626168, | |
| "grad_norm": 0.03344012424349785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5672138929367065, | |
| "mean_token_accuracy": 0.7719729393720627, | |
| "num_tokens": 2820473.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5444837659597397, | |
| "epoch": 0.6504672897196262, | |
| "grad_norm": 0.029676884412765503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400466322898865, | |
| "mean_token_accuracy": 0.7845920622348785, | |
| "num_tokens": 2836738.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5679149776697159, | |
| "epoch": 0.6542056074766355, | |
| "grad_norm": 0.03190155327320099, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5703109502792358, | |
| "mean_token_accuracy": 0.7677883356809616, | |
| "num_tokens": 2853015.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5386882424354553, | |
| "epoch": 0.6579439252336449, | |
| "grad_norm": 0.03156553953886032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451309680938721, | |
| "mean_token_accuracy": 0.7785861194133759, | |
| "num_tokens": 2869326.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5546389669179916, | |
| "epoch": 0.6616822429906543, | |
| "grad_norm": 0.03298742696642876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598126649856567, | |
| "mean_token_accuracy": 0.7714642137289047, | |
| "num_tokens": 2885638.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5554563403129578, | |
| "epoch": 0.6654205607476635, | |
| "grad_norm": 0.034988123923540115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5639896392822266, | |
| "mean_token_accuracy": 0.7712263017892838, | |
| "num_tokens": 2902116.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5492645055055618, | |
| "epoch": 0.6691588785046729, | |
| "grad_norm": 0.03213873505592346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490330457687378, | |
| "mean_token_accuracy": 0.7778918445110321, | |
| "num_tokens": 2918514.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5809471905231476, | |
| "epoch": 0.6728971962616822, | |
| "grad_norm": 0.02829456329345703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5780236721038818, | |
| "mean_token_accuracy": 0.7631959617137909, | |
| "num_tokens": 2935180.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.5545472204685211, | |
| "epoch": 0.6766355140186916, | |
| "grad_norm": 0.026784643530845642, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539122819900513, | |
| "mean_token_accuracy": 0.7744273245334625, | |
| "num_tokens": 2951485.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5583300441503525, | |
| "epoch": 0.680373831775701, | |
| "grad_norm": 0.028181226924061775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567899942398071, | |
| "mean_token_accuracy": 0.7753158956766129, | |
| "num_tokens": 2967799.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5597800463438034, | |
| "epoch": 0.6841121495327103, | |
| "grad_norm": 0.027700597420334816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559861958026886, | |
| "mean_token_accuracy": 0.772071048617363, | |
| "num_tokens": 2984240.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5409596711397171, | |
| "epoch": 0.6878504672897197, | |
| "grad_norm": 0.030223077163100243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486294031143188, | |
| "mean_token_accuracy": 0.7773659527301788, | |
| "num_tokens": 3000681.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5551634728908539, | |
| "epoch": 0.6915887850467289, | |
| "grad_norm": 0.02896454744040966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600041151046753, | |
| "mean_token_accuracy": 0.7721187323331833, | |
| "num_tokens": 3017042.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5551397949457169, | |
| "epoch": 0.6953271028037383, | |
| "grad_norm": 0.02665393240749836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556494414806366, | |
| "mean_token_accuracy": 0.7747326493263245, | |
| "num_tokens": 3033356.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5497598797082901, | |
| "epoch": 0.6990654205607477, | |
| "grad_norm": 0.026862069964408875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495949983596802, | |
| "mean_token_accuracy": 0.7788131833076477, | |
| "num_tokens": 3049609.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5756572186946869, | |
| "epoch": 0.702803738317757, | |
| "grad_norm": 0.028672486543655396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5735815763473511, | |
| "mean_token_accuracy": 0.7667711675167084, | |
| "num_tokens": 3065873.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.560253381729126, | |
| "epoch": 0.7065420560747664, | |
| "grad_norm": 0.029232166707515717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650488138198853, | |
| "mean_token_accuracy": 0.768238291144371, | |
| "num_tokens": 3081904.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5659812092781067, | |
| "epoch": 0.7102803738317757, | |
| "grad_norm": 0.028001444414258003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563786506652832, | |
| "mean_token_accuracy": 0.7705834209918976, | |
| "num_tokens": 3098208.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5397079735994339, | |
| "epoch": 0.7140186915887851, | |
| "grad_norm": 0.030035637319087982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431380271911621, | |
| "mean_token_accuracy": 0.7773479521274567, | |
| "num_tokens": 3114448.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5607352703809738, | |
| "epoch": 0.7177570093457943, | |
| "grad_norm": 0.026054881513118744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5583080649375916, | |
| "mean_token_accuracy": 0.7758101969957352, | |
| "num_tokens": 3130755.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.551689624786377, | |
| "epoch": 0.7214953271028037, | |
| "grad_norm": 0.02845809981226921, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481313467025757, | |
| "mean_token_accuracy": 0.7777986079454422, | |
| "num_tokens": 3147133.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5639677792787552, | |
| "epoch": 0.7252336448598131, | |
| "grad_norm": 0.029969094321131706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5681430697441101, | |
| "mean_token_accuracy": 0.7705964744091034, | |
| "num_tokens": 3163582.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5548544675111771, | |
| "epoch": 0.7289719626168224, | |
| "grad_norm": 0.026430293917655945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528862476348877, | |
| "mean_token_accuracy": 0.7741632461547852, | |
| "num_tokens": 3180102.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5530348271131516, | |
| "epoch": 0.7327102803738318, | |
| "grad_norm": 0.026484189555048943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540847778320312, | |
| "mean_token_accuracy": 0.7735424339771271, | |
| "num_tokens": 3196312.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5409010052680969, | |
| "epoch": 0.7364485981308411, | |
| "grad_norm": 0.030766047537326813, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487144589424133, | |
| "mean_token_accuracy": 0.7778207361698151, | |
| "num_tokens": 3212408.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5607801675796509, | |
| "epoch": 0.7401869158878505, | |
| "grad_norm": 0.029135972261428833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579065680503845, | |
| "mean_token_accuracy": 0.7756243348121643, | |
| "num_tokens": 3228688.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5638224929571152, | |
| "epoch": 0.7439252336448599, | |
| "grad_norm": 0.028466643765568733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634393095970154, | |
| "mean_token_accuracy": 0.770130917429924, | |
| "num_tokens": 3244856.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5390120446681976, | |
| "epoch": 0.7476635514018691, | |
| "grad_norm": 0.029409240931272507, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443782210350037, | |
| "mean_token_accuracy": 0.7796739190816879, | |
| "num_tokens": 3261004.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5513757616281509, | |
| "epoch": 0.7514018691588785, | |
| "grad_norm": 0.032466452568769455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502808690071106, | |
| "mean_token_accuracy": 0.7751527577638626, | |
| "num_tokens": 3277310.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5808768719434738, | |
| "epoch": 0.7551401869158878, | |
| "grad_norm": 0.02947174198925495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5795295238494873, | |
| "mean_token_accuracy": 0.7640405744314194, | |
| "num_tokens": 3293719.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5713460445404053, | |
| "epoch": 0.7588785046728972, | |
| "grad_norm": 0.02874363400042057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5726850032806396, | |
| "mean_token_accuracy": 0.7662371546030045, | |
| "num_tokens": 3310262.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5619738698005676, | |
| "epoch": 0.7626168224299066, | |
| "grad_norm": 0.028361184522509575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5660584568977356, | |
| "mean_token_accuracy": 0.7703312337398529, | |
| "num_tokens": 3326670.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5531926304101944, | |
| "epoch": 0.7663551401869159, | |
| "grad_norm": 0.029734794050455093, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551853775978088, | |
| "mean_token_accuracy": 0.7757412046194077, | |
| "num_tokens": 3343182.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5436140149831772, | |
| "epoch": 0.7700934579439253, | |
| "grad_norm": 0.027612119913101196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460025668144226, | |
| "mean_token_accuracy": 0.7787571996450424, | |
| "num_tokens": 3359734.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5484267920255661, | |
| "epoch": 0.7738317757009345, | |
| "grad_norm": 0.0273665152490139, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512120723724365, | |
| "mean_token_accuracy": 0.7762885689735413, | |
| "num_tokens": 3375965.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5604408234357834, | |
| "epoch": 0.7775700934579439, | |
| "grad_norm": 0.03310655429959297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644571185112, | |
| "mean_token_accuracy": 0.7733126729726791, | |
| "num_tokens": 3392102.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5418381690979004, | |
| "epoch": 0.7813084112149533, | |
| "grad_norm": 0.03232184052467346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521958470344543, | |
| "mean_token_accuracy": 0.7741148620843887, | |
| "num_tokens": 3408306.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5678922086954117, | |
| "epoch": 0.7850467289719626, | |
| "grad_norm": 0.02696731500327587, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638433694839478, | |
| "mean_token_accuracy": 0.7702384293079376, | |
| "num_tokens": 3424846.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5885234028100967, | |
| "epoch": 0.788785046728972, | |
| "grad_norm": 0.032732248306274414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5857526659965515, | |
| "mean_token_accuracy": 0.7618716955184937, | |
| "num_tokens": 3441315.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5481836199760437, | |
| "epoch": 0.7925233644859813, | |
| "grad_norm": 0.03158198669552803, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456998348236084, | |
| "mean_token_accuracy": 0.7771993726491928, | |
| "num_tokens": 3457579.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5607763081789017, | |
| "epoch": 0.7962616822429907, | |
| "grad_norm": 0.03416353091597557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663735270500183, | |
| "mean_token_accuracy": 0.7718233168125153, | |
| "num_tokens": 3474205.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5533930957317352, | |
| "epoch": 0.8, | |
| "grad_norm": 0.02877282351255417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556164383888245, | |
| "mean_token_accuracy": 0.7742215096950531, | |
| "num_tokens": 3490438.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5604168176651001, | |
| "epoch": 0.8037383177570093, | |
| "grad_norm": 0.026928121224045753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551791191101074, | |
| "mean_token_accuracy": 0.77230204641819, | |
| "num_tokens": 3506851.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5647037774324417, | |
| "epoch": 0.8074766355140187, | |
| "grad_norm": 0.03445446118712425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5678783655166626, | |
| "mean_token_accuracy": 0.7699416279792786, | |
| "num_tokens": 3523043.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.571955680847168, | |
| "epoch": 0.811214953271028, | |
| "grad_norm": 0.028322864323854446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5738518238067627, | |
| "mean_token_accuracy": 0.7654245793819427, | |
| "num_tokens": 3539365.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5523362904787064, | |
| "epoch": 0.8149532710280374, | |
| "grad_norm": 0.033752068877220154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535821914672852, | |
| "mean_token_accuracy": 0.7761557102203369, | |
| "num_tokens": 3555412.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5571073293685913, | |
| "epoch": 0.8186915887850468, | |
| "grad_norm": 0.03274444863200188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591251850128174, | |
| "mean_token_accuracy": 0.7738742381334305, | |
| "num_tokens": 3571607.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5460310876369476, | |
| "epoch": 0.822429906542056, | |
| "grad_norm": 0.03267780691385269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483282208442688, | |
| "mean_token_accuracy": 0.774459958076477, | |
| "num_tokens": 3588112.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5458645969629288, | |
| "epoch": 0.8261682242990654, | |
| "grad_norm": 0.029655037447810173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553710401058197, | |
| "mean_token_accuracy": 0.7749865502119064, | |
| "num_tokens": 3604422.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5589277297258377, | |
| "epoch": 0.8299065420560747, | |
| "grad_norm": 0.0299095269292593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621532201766968, | |
| "mean_token_accuracy": 0.7721328884363174, | |
| "num_tokens": 3620586.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5576933324337006, | |
| "epoch": 0.8336448598130841, | |
| "grad_norm": 0.031302373856306076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637439489364624, | |
| "mean_token_accuracy": 0.7706159353256226, | |
| "num_tokens": 3636859.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5583267956972122, | |
| "epoch": 0.8373831775700935, | |
| "grad_norm": 0.02684536948800087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605804920196533, | |
| "mean_token_accuracy": 0.7703929096460342, | |
| "num_tokens": 3653154.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5555603951215744, | |
| "epoch": 0.8411214953271028, | |
| "grad_norm": 0.025324055925011635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553929805755615, | |
| "mean_token_accuracy": 0.773400217294693, | |
| "num_tokens": 3669474.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5502129048109055, | |
| "epoch": 0.8448598130841122, | |
| "grad_norm": 0.03151983022689819, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402862429618835, | |
| "mean_token_accuracy": 0.7839637249708176, | |
| "num_tokens": 3685885.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5631079375743866, | |
| "epoch": 0.8485981308411215, | |
| "grad_norm": 0.026639366522431374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603518486022949, | |
| "mean_token_accuracy": 0.7707885354757309, | |
| "num_tokens": 3702475.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5576464682817459, | |
| "epoch": 0.8523364485981308, | |
| "grad_norm": 0.028526777401566505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615932941436768, | |
| "mean_token_accuracy": 0.7698924392461777, | |
| "num_tokens": 3718675.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5553766041994095, | |
| "epoch": 0.8560747663551402, | |
| "grad_norm": 0.028387868776917458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598117709159851, | |
| "mean_token_accuracy": 0.7748202681541443, | |
| "num_tokens": 3734973.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5636192113161087, | |
| "epoch": 0.8598130841121495, | |
| "grad_norm": 0.029663704335689545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619429349899292, | |
| "mean_token_accuracy": 0.7697723060846329, | |
| "num_tokens": 3751197.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5656130164861679, | |
| "epoch": 0.8635514018691589, | |
| "grad_norm": 0.027196481823921204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559482753276825, | |
| "mean_token_accuracy": 0.7736194878816605, | |
| "num_tokens": 3767681.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5610507130622864, | |
| "epoch": 0.8672897196261682, | |
| "grad_norm": 0.02665848098695278, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574455857276917, | |
| "mean_token_accuracy": 0.7723447382450104, | |
| "num_tokens": 3784223.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5565789192914963, | |
| "epoch": 0.8710280373831776, | |
| "grad_norm": 0.029676776379346848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581963062286377, | |
| "mean_token_accuracy": 0.7723328024148941, | |
| "num_tokens": 3800606.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5488535314798355, | |
| "epoch": 0.874766355140187, | |
| "grad_norm": 0.026432445272803307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548264384269714, | |
| "mean_token_accuracy": 0.776095449924469, | |
| "num_tokens": 3817211.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5432089567184448, | |
| "epoch": 0.8785046728971962, | |
| "grad_norm": 0.028454309329390526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551573038101196, | |
| "mean_token_accuracy": 0.7737965285778046, | |
| "num_tokens": 3833562.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5564523041248322, | |
| "epoch": 0.8822429906542056, | |
| "grad_norm": 0.03045317530632019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593273043632507, | |
| "mean_token_accuracy": 0.7728880196809769, | |
| "num_tokens": 3849716.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5449672043323517, | |
| "epoch": 0.8859813084112149, | |
| "grad_norm": 0.026425793766975403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469970107078552, | |
| "mean_token_accuracy": 0.777935191988945, | |
| "num_tokens": 3865915.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5773142129182816, | |
| "epoch": 0.8897196261682243, | |
| "grad_norm": 0.024763669818639755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5751665830612183, | |
| "mean_token_accuracy": 0.7665848284959793, | |
| "num_tokens": 3882374.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5337313264608383, | |
| "epoch": 0.8934579439252337, | |
| "grad_norm": 0.027221228927373886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295661687850952, | |
| "mean_token_accuracy": 0.7860913276672363, | |
| "num_tokens": 3898501.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.5395989120006561, | |
| "epoch": 0.897196261682243, | |
| "grad_norm": 0.026916388422250748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377291440963745, | |
| "mean_token_accuracy": 0.7827803045511246, | |
| "num_tokens": 3914802.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.56096251308918, | |
| "epoch": 0.9009345794392524, | |
| "grad_norm": 0.03178329020738602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572348237037659, | |
| "mean_token_accuracy": 0.774958074092865, | |
| "num_tokens": 3931307.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5351977944374084, | |
| "epoch": 0.9046728971962616, | |
| "grad_norm": 0.027758494019508362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389144420623779, | |
| "mean_token_accuracy": 0.7842132151126862, | |
| "num_tokens": 3947818.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5689495801925659, | |
| "epoch": 0.908411214953271, | |
| "grad_norm": 0.028313076123595238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5732687711715698, | |
| "mean_token_accuracy": 0.7685291916131973, | |
| "num_tokens": 3964238.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5562418401241302, | |
| "epoch": 0.9121495327102803, | |
| "grad_norm": 0.028738385066390038, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559317469596863, | |
| "mean_token_accuracy": 0.7747041881084442, | |
| "num_tokens": 3980625.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5630334913730621, | |
| "epoch": 0.9158878504672897, | |
| "grad_norm": 0.024547314271330833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560680627822876, | |
| "mean_token_accuracy": 0.7717334777116776, | |
| "num_tokens": 3997248.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5409311354160309, | |
| "epoch": 0.9196261682242991, | |
| "grad_norm": 0.029392484575510025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488813519477844, | |
| "mean_token_accuracy": 0.7771373838186264, | |
| "num_tokens": 4013356.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5529599785804749, | |
| "epoch": 0.9233644859813084, | |
| "grad_norm": 0.024964116513729095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492331385612488, | |
| "mean_token_accuracy": 0.778782069683075, | |
| "num_tokens": 4029521.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5397895872592926, | |
| "epoch": 0.9271028037383178, | |
| "grad_norm": 0.026621561497449875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443588495254517, | |
| "mean_token_accuracy": 0.7782554626464844, | |
| "num_tokens": 4045913.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5582248121500015, | |
| "epoch": 0.930841121495327, | |
| "grad_norm": 0.02803446725010872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627061128616333, | |
| "mean_token_accuracy": 0.7742072343826294, | |
| "num_tokens": 4062448.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5673990696668625, | |
| "epoch": 0.9345794392523364, | |
| "grad_norm": 0.03014424815773964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5727946162223816, | |
| "mean_token_accuracy": 0.7685662358999252, | |
| "num_tokens": 4078711.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.566023588180542, | |
| "epoch": 0.9383177570093458, | |
| "grad_norm": 0.030524935573339462, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595183372497559, | |
| "mean_token_accuracy": 0.7738057672977448, | |
| "num_tokens": 4095240.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5499134510755539, | |
| "epoch": 0.9420560747663551, | |
| "grad_norm": 0.02502668835222721, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446998476982117, | |
| "mean_token_accuracy": 0.7789950519800186, | |
| "num_tokens": 4111687.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5639411062002182, | |
| "epoch": 0.9457943925233645, | |
| "grad_norm": 0.03420841693878174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5659236311912537, | |
| "mean_token_accuracy": 0.7703807950019836, | |
| "num_tokens": 4128093.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5703455805778503, | |
| "epoch": 0.9495327102803738, | |
| "grad_norm": 0.0303607527166605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5696687698364258, | |
| "mean_token_accuracy": 0.7690610140562057, | |
| "num_tokens": 4144612.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.558226928114891, | |
| "epoch": 0.9532710280373832, | |
| "grad_norm": 0.03168858587741852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5676078200340271, | |
| "mean_token_accuracy": 0.7693912833929062, | |
| "num_tokens": 4161169.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5530082136392593, | |
| "epoch": 0.9570093457943926, | |
| "grad_norm": 0.027083205059170723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579201579093933, | |
| "mean_token_accuracy": 0.772939920425415, | |
| "num_tokens": 4177454.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5732781291007996, | |
| "epoch": 0.9607476635514018, | |
| "grad_norm": 0.025865184143185616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5745596289634705, | |
| "mean_token_accuracy": 0.7667286545038223, | |
| "num_tokens": 4193733.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5650701373815536, | |
| "epoch": 0.9644859813084112, | |
| "grad_norm": 0.03244631364941597, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5617667436599731, | |
| "mean_token_accuracy": 0.7715478390455246, | |
| "num_tokens": 4209843.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5724828094244003, | |
| "epoch": 0.9682242990654205, | |
| "grad_norm": 0.02807115763425827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5692450404167175, | |
| "mean_token_accuracy": 0.76779405772686, | |
| "num_tokens": 4226262.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5677514672279358, | |
| "epoch": 0.9719626168224299, | |
| "grad_norm": 0.024189095944166183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623309016227722, | |
| "mean_token_accuracy": 0.7734705060720444, | |
| "num_tokens": 4242877.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.56018927693367, | |
| "epoch": 0.9757009345794393, | |
| "grad_norm": 0.030152512714266777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5675455927848816, | |
| "mean_token_accuracy": 0.7673967182636261, | |
| "num_tokens": 4259432.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5601605176925659, | |
| "epoch": 0.9794392523364486, | |
| "grad_norm": 0.0288025364279747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5698415040969849, | |
| "mean_token_accuracy": 0.7686598151922226, | |
| "num_tokens": 4275917.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5593424290418625, | |
| "epoch": 0.983177570093458, | |
| "grad_norm": 0.024790652096271515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574150085449219, | |
| "mean_token_accuracy": 0.7770240753889084, | |
| "num_tokens": 4292310.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5394274890422821, | |
| "epoch": 0.9869158878504672, | |
| "grad_norm": 0.02477172389626503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407758951187134, | |
| "mean_token_accuracy": 0.780282586812973, | |
| "num_tokens": 4308380.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5651121735572815, | |
| "epoch": 0.9906542056074766, | |
| "grad_norm": 0.028029976412653923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5648099184036255, | |
| "mean_token_accuracy": 0.7703951746225357, | |
| "num_tokens": 4324834.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5426322817802429, | |
| "epoch": 0.994392523364486, | |
| "grad_norm": 0.025631116703152657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393193364143372, | |
| "mean_token_accuracy": 0.7813181281089783, | |
| "num_tokens": 4341233.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5464787781238556, | |
| "epoch": 0.9981308411214953, | |
| "grad_norm": 0.029863541945815086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550025701522827, | |
| "mean_token_accuracy": 0.7747247219085693, | |
| "num_tokens": 4357682.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5607179999351501, | |
| "epoch": 1.0, | |
| "grad_norm": 0.03738218545913696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586302876472473, | |
| "mean_token_accuracy": 0.7706243097782135, | |
| "num_tokens": 4364958.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5429188311100006, | |
| "epoch": 1.0037383177570094, | |
| "grad_norm": 0.031045127660036087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379543900489807, | |
| "mean_token_accuracy": 0.7818119078874588, | |
| "num_tokens": 4381160.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5693697482347488, | |
| "epoch": 1.0074766355140188, | |
| "grad_norm": 0.034702617675065994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631182789802551, | |
| "mean_token_accuracy": 0.7740933299064636, | |
| "num_tokens": 4397580.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5556007027626038, | |
| "epoch": 1.011214953271028, | |
| "grad_norm": 0.029613088816404343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564326643943787, | |
| "mean_token_accuracy": 0.7747503072023392, | |
| "num_tokens": 4413970.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5529852658510208, | |
| "epoch": 1.0149532710280373, | |
| "grad_norm": 0.028977181762456894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552069544792175, | |
| "mean_token_accuracy": 0.7720492333173752, | |
| "num_tokens": 4430293.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5520482361316681, | |
| "epoch": 1.0186915887850467, | |
| "grad_norm": 0.03374192863702774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517052412033081, | |
| "mean_token_accuracy": 0.7761924266815186, | |
| "num_tokens": 4446900.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5477887243032455, | |
| "epoch": 1.0224299065420561, | |
| "grad_norm": 0.02954636886715889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459023714065552, | |
| "mean_token_accuracy": 0.7766608893871307, | |
| "num_tokens": 4463329.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5484108775854111, | |
| "epoch": 1.0261682242990655, | |
| "grad_norm": 0.029792649671435356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553299069404602, | |
| "mean_token_accuracy": 0.7751943320035934, | |
| "num_tokens": 4479679.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5480824410915375, | |
| "epoch": 1.0299065420560747, | |
| "grad_norm": 0.03428385779261589, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54673171043396, | |
| "mean_token_accuracy": 0.7777809202671051, | |
| "num_tokens": 4496261.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5371964275836945, | |
| "epoch": 1.033644859813084, | |
| "grad_norm": 0.027453402057290077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412828922271729, | |
| "mean_token_accuracy": 0.7782962769269943, | |
| "num_tokens": 4512363.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5626021921634674, | |
| "epoch": 1.0373831775700935, | |
| "grad_norm": 0.03147402033209801, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5639899373054504, | |
| "mean_token_accuracy": 0.772662416100502, | |
| "num_tokens": 4528687.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5309132784605026, | |
| "epoch": 1.0411214953271029, | |
| "grad_norm": 0.03592999279499054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408714413642883, | |
| "mean_token_accuracy": 0.7803217619657516, | |
| "num_tokens": 4544861.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5621335506439209, | |
| "epoch": 1.0448598130841122, | |
| "grad_norm": 0.027180444449186325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557287931442261, | |
| "mean_token_accuracy": 0.7766296565532684, | |
| "num_tokens": 4561446.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5597621351480484, | |
| "epoch": 1.0485981308411214, | |
| "grad_norm": 0.030723722651600838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488376617431641, | |
| "mean_token_accuracy": 0.7752789407968521, | |
| "num_tokens": 4577902.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5447895377874374, | |
| "epoch": 1.0523364485981308, | |
| "grad_norm": 0.03346191346645355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54459547996521, | |
| "mean_token_accuracy": 0.7764092832803726, | |
| "num_tokens": 4593907.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5376723855733871, | |
| "epoch": 1.0560747663551402, | |
| "grad_norm": 0.029941193759441376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396949052810669, | |
| "mean_token_accuracy": 0.7800134569406509, | |
| "num_tokens": 4610281.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.532968744635582, | |
| "epoch": 1.0598130841121496, | |
| "grad_norm": 0.03566444665193558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449310541152954, | |
| "mean_token_accuracy": 0.7814425081014633, | |
| "num_tokens": 4626569.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5349016040563583, | |
| "epoch": 1.063551401869159, | |
| "grad_norm": 0.03160771727561951, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422961115837097, | |
| "mean_token_accuracy": 0.7798893004655838, | |
| "num_tokens": 4643058.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.533850871026516, | |
| "epoch": 1.0672897196261681, | |
| "grad_norm": 0.036520425230264664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418434739112854, | |
| "mean_token_accuracy": 0.7801807075738907, | |
| "num_tokens": 4659171.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5512394160032272, | |
| "epoch": 1.0710280373831775, | |
| "grad_norm": 0.030453668907284737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547731339931488, | |
| "mean_token_accuracy": 0.77372145652771, | |
| "num_tokens": 4675372.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5371382534503937, | |
| "epoch": 1.074766355140187, | |
| "grad_norm": 0.031432170420885086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5252817869186401, | |
| "mean_token_accuracy": 0.7852388918399811, | |
| "num_tokens": 4691895.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5536183714866638, | |
| "epoch": 1.0785046728971963, | |
| "grad_norm": 0.036878716200590134, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542073249816895, | |
| "mean_token_accuracy": 0.7766832113265991, | |
| "num_tokens": 4708579.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5479064285755157, | |
| "epoch": 1.0822429906542057, | |
| "grad_norm": 0.031178997829556465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539444088935852, | |
| "mean_token_accuracy": 0.7733383923768997, | |
| "num_tokens": 4725006.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5490889102220535, | |
| "epoch": 1.0859813084112149, | |
| "grad_norm": 0.03600861504673958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477103590965271, | |
| "mean_token_accuracy": 0.7760229259729385, | |
| "num_tokens": 4741146.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5331408083438873, | |
| "epoch": 1.0897196261682243, | |
| "grad_norm": 0.029067492112517357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310513377189636, | |
| "mean_token_accuracy": 0.7808917611837387, | |
| "num_tokens": 4757405.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5732952356338501, | |
| "epoch": 1.0934579439252337, | |
| "grad_norm": 0.027897845953702927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5689205527305603, | |
| "mean_token_accuracy": 0.7669987231492996, | |
| "num_tokens": 4773935.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.5514747202396393, | |
| "epoch": 1.097196261682243, | |
| "grad_norm": 0.03678213432431221, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475887060165405, | |
| "mean_token_accuracy": 0.7782610803842545, | |
| "num_tokens": 4790197.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.5528618544340134, | |
| "epoch": 1.1009345794392524, | |
| "grad_norm": 0.03136972337961197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539395213127136, | |
| "mean_token_accuracy": 0.7734730243682861, | |
| "num_tokens": 4806625.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5395589917898178, | |
| "epoch": 1.1046728971962616, | |
| "grad_norm": 0.030648380517959595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440752506256104, | |
| "mean_token_accuracy": 0.7809486091136932, | |
| "num_tokens": 4823046.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5670987218618393, | |
| "epoch": 1.108411214953271, | |
| "grad_norm": 0.028722837567329407, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5669575929641724, | |
| "mean_token_accuracy": 0.7682226747274399, | |
| "num_tokens": 4839449.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5453528463840485, | |
| "epoch": 1.1121495327102804, | |
| "grad_norm": 0.03358433395624161, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394450426101685, | |
| "mean_token_accuracy": 0.7793479263782501, | |
| "num_tokens": 4855702.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5313688218593597, | |
| "epoch": 1.1158878504672898, | |
| "grad_norm": 0.031751058995723724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339279174804688, | |
| "mean_token_accuracy": 0.7852170914411545, | |
| "num_tokens": 4872035.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5542233884334564, | |
| "epoch": 1.1196261682242992, | |
| "grad_norm": 0.030381185933947563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629603862762451, | |
| "mean_token_accuracy": 0.76924729347229, | |
| "num_tokens": 4888405.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5514146685600281, | |
| "epoch": 1.1233644859813083, | |
| "grad_norm": 0.028884021565318108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550013542175293, | |
| "mean_token_accuracy": 0.7766973823308945, | |
| "num_tokens": 4904871.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5544252693653107, | |
| "epoch": 1.1271028037383177, | |
| "grad_norm": 0.03688167408108711, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589375495910645, | |
| "mean_token_accuracy": 0.7750934660434723, | |
| "num_tokens": 4921370.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.5409253090620041, | |
| "epoch": 1.1308411214953271, | |
| "grad_norm": 0.026449156925082207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402511358261108, | |
| "mean_token_accuracy": 0.7794521301984787, | |
| "num_tokens": 4937635.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5496914833784103, | |
| "epoch": 1.1345794392523365, | |
| "grad_norm": 0.030888745561242104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520302653312683, | |
| "mean_token_accuracy": 0.7741389274597168, | |
| "num_tokens": 4953795.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5356033593416214, | |
| "epoch": 1.1383177570093457, | |
| "grad_norm": 0.030453680083155632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415939092636108, | |
| "mean_token_accuracy": 0.7807344794273376, | |
| "num_tokens": 4970296.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.53813037276268, | |
| "epoch": 1.142056074766355, | |
| "grad_norm": 0.03046366199851036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416396856307983, | |
| "mean_token_accuracy": 0.7764643579721451, | |
| "num_tokens": 4986502.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5428405404090881, | |
| "epoch": 1.1457943925233645, | |
| "grad_norm": 0.03174874931573868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486522912979126, | |
| "mean_token_accuracy": 0.7775285989046097, | |
| "num_tokens": 5002702.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5566747784614563, | |
| "epoch": 1.1495327102803738, | |
| "grad_norm": 0.028818320482969284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562471151351929, | |
| "mean_token_accuracy": 0.77483069896698, | |
| "num_tokens": 5019050.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5498685240745544, | |
| "epoch": 1.1532710280373832, | |
| "grad_norm": 0.028088422492146492, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427108407020569, | |
| "mean_token_accuracy": 0.7781059741973877, | |
| "num_tokens": 5035367.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5676623731851578, | |
| "epoch": 1.1570093457943926, | |
| "grad_norm": 0.02635916881263256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621261596679688, | |
| "mean_token_accuracy": 0.7690412253141403, | |
| "num_tokens": 5051623.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.5571839809417725, | |
| "epoch": 1.1607476635514018, | |
| "grad_norm": 0.030562767758965492, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547442436218262, | |
| "mean_token_accuracy": 0.773685023188591, | |
| "num_tokens": 5067784.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5521961599588394, | |
| "epoch": 1.1644859813084112, | |
| "grad_norm": 0.02953186444938183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498039722442627, | |
| "mean_token_accuracy": 0.7766331732273102, | |
| "num_tokens": 5084198.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.5448037981987, | |
| "epoch": 1.1682242990654206, | |
| "grad_norm": 0.04071420431137085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559482574462891, | |
| "mean_token_accuracy": 0.7727169245481491, | |
| "num_tokens": 5100585.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5439905822277069, | |
| "epoch": 1.17196261682243, | |
| "grad_norm": 0.031825143843889236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438477396965027, | |
| "mean_token_accuracy": 0.7780765742063522, | |
| "num_tokens": 5116856.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5614278465509415, | |
| "epoch": 1.1757009345794391, | |
| "grad_norm": 0.03391456976532936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585231781005859, | |
| "mean_token_accuracy": 0.774724468588829, | |
| "num_tokens": 5133123.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5348840728402138, | |
| "epoch": 1.1794392523364485, | |
| "grad_norm": 0.030404910445213318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299553275108337, | |
| "mean_token_accuracy": 0.7871359586715698, | |
| "num_tokens": 5149505.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5417611449956894, | |
| "epoch": 1.183177570093458, | |
| "grad_norm": 0.03005358763039112, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521109700202942, | |
| "mean_token_accuracy": 0.7752534449100494, | |
| "num_tokens": 5165665.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5467934459447861, | |
| "epoch": 1.1869158878504673, | |
| "grad_norm": 0.030464891344308853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535311698913574, | |
| "mean_token_accuracy": 0.7757606655359268, | |
| "num_tokens": 5182312.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.55706687271595, | |
| "epoch": 1.1906542056074767, | |
| "grad_norm": 0.03402930125594139, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56557697057724, | |
| "mean_token_accuracy": 0.773482084274292, | |
| "num_tokens": 5198753.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5285287350416183, | |
| "epoch": 1.194392523364486, | |
| "grad_norm": 0.03398562967777252, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356812477111816, | |
| "mean_token_accuracy": 0.781065508723259, | |
| "num_tokens": 5214716.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5561061501502991, | |
| "epoch": 1.1981308411214953, | |
| "grad_norm": 0.04313025251030922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472796559333801, | |
| "mean_token_accuracy": 0.7778294533491135, | |
| "num_tokens": 5230933.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.556538999080658, | |
| "epoch": 1.2018691588785047, | |
| "grad_norm": 0.03227441757917404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438181161880493, | |
| "mean_token_accuracy": 0.7791680693626404, | |
| "num_tokens": 5247202.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5609522461891174, | |
| "epoch": 1.205607476635514, | |
| "grad_norm": 0.03183369338512421, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561162829399109, | |
| "mean_token_accuracy": 0.7751743495464325, | |
| "num_tokens": 5263696.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5427358001470566, | |
| "epoch": 1.2093457943925234, | |
| "grad_norm": 0.03253727778792381, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515695214271545, | |
| "mean_token_accuracy": 0.7756281793117523, | |
| "num_tokens": 5280141.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5160750597715378, | |
| "epoch": 1.2130841121495326, | |
| "grad_norm": 0.03668288141489029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526226282119751, | |
| "mean_token_accuracy": 0.7851300984621048, | |
| "num_tokens": 5296198.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5500008910894394, | |
| "epoch": 1.216822429906542, | |
| "grad_norm": 0.03275466337800026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556660890579224, | |
| "mean_token_accuracy": 0.7739221006631851, | |
| "num_tokens": 5312653.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5459257364273071, | |
| "epoch": 1.2205607476635514, | |
| "grad_norm": 0.02891591377556324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413340330123901, | |
| "mean_token_accuracy": 0.781257688999176, | |
| "num_tokens": 5328926.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5695579349994659, | |
| "epoch": 1.2242990654205608, | |
| "grad_norm": 0.0299241840839386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636513233184814, | |
| "mean_token_accuracy": 0.7732590138912201, | |
| "num_tokens": 5345213.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5591664463281631, | |
| "epoch": 1.2280373831775702, | |
| "grad_norm": 0.034591834992170334, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587798953056335, | |
| "mean_token_accuracy": 0.7725549340248108, | |
| "num_tokens": 5361493.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5631786286830902, | |
| "epoch": 1.2317757009345796, | |
| "grad_norm": 0.03143571689724922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540720224380493, | |
| "mean_token_accuracy": 0.7765887975692749, | |
| "num_tokens": 5378085.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5508914291858673, | |
| "epoch": 1.2355140186915887, | |
| "grad_norm": 0.032595690339803696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526955723762512, | |
| "mean_token_accuracy": 0.7747674286365509, | |
| "num_tokens": 5394458.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.536909781396389, | |
| "epoch": 1.2392523364485981, | |
| "grad_norm": 0.033028744161129, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481626987457275, | |
| "mean_token_accuracy": 0.7782605588436127, | |
| "num_tokens": 5410880.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5499342679977417, | |
| "epoch": 1.2429906542056075, | |
| "grad_norm": 0.03855755180120468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627814531326294, | |
| "mean_token_accuracy": 0.7700037658214569, | |
| "num_tokens": 5426885.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5494136810302734, | |
| "epoch": 1.246728971962617, | |
| "grad_norm": 0.03397782891988754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508397817611694, | |
| "mean_token_accuracy": 0.7756514847278595, | |
| "num_tokens": 5443330.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.5679187029600143, | |
| "epoch": 1.250467289719626, | |
| "grad_norm": 0.03217748925089836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5683805346488953, | |
| "mean_token_accuracy": 0.770328551530838, | |
| "num_tokens": 5459602.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5620801448822021, | |
| "epoch": 1.2542056074766355, | |
| "grad_norm": 0.03699919581413269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556020200252533, | |
| "mean_token_accuracy": 0.7749847769737244, | |
| "num_tokens": 5475920.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.5483541190624237, | |
| "epoch": 1.2579439252336448, | |
| "grad_norm": 0.027093922719359398, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420067310333252, | |
| "mean_token_accuracy": 0.7774698734283447, | |
| "num_tokens": 5492418.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5432356148958206, | |
| "epoch": 1.2616822429906542, | |
| "grad_norm": 0.029740024358034134, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436828136444092, | |
| "mean_token_accuracy": 0.7754241824150085, | |
| "num_tokens": 5508720.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5282722562551498, | |
| "epoch": 1.2654205607476636, | |
| "grad_norm": 0.02825041115283966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287445783615112, | |
| "mean_token_accuracy": 0.785777822136879, | |
| "num_tokens": 5524810.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5574855506420135, | |
| "epoch": 1.269158878504673, | |
| "grad_norm": 0.03507409617304802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5642590522766113, | |
| "mean_token_accuracy": 0.7694929391145706, | |
| "num_tokens": 5541154.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5311331301927567, | |
| "epoch": 1.2728971962616822, | |
| "grad_norm": 0.029530638828873634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375971794128418, | |
| "mean_token_accuracy": 0.7804928719997406, | |
| "num_tokens": 5557415.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5492513477802277, | |
| "epoch": 1.2766355140186916, | |
| "grad_norm": 0.03299937769770622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487713813781738, | |
| "mean_token_accuracy": 0.7776053845882416, | |
| "num_tokens": 5573593.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5501092821359634, | |
| "epoch": 1.280373831775701, | |
| "grad_norm": 0.03342421352863312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497907996177673, | |
| "mean_token_accuracy": 0.7747702449560165, | |
| "num_tokens": 5590001.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5520797073841095, | |
| "epoch": 1.2841121495327104, | |
| "grad_norm": 0.029625268653035164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493736267089844, | |
| "mean_token_accuracy": 0.7800589352846146, | |
| "num_tokens": 5606174.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5360356196761131, | |
| "epoch": 1.2878504672897195, | |
| "grad_norm": 0.03089168108999729, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362368226051331, | |
| "mean_token_accuracy": 0.7833685129880905, | |
| "num_tokens": 5622436.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5267095118761063, | |
| "epoch": 1.291588785046729, | |
| "grad_norm": 0.03297918289899826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281186699867249, | |
| "mean_token_accuracy": 0.7881515920162201, | |
| "num_tokens": 5638451.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5502850115299225, | |
| "epoch": 1.2953271028037383, | |
| "grad_norm": 0.047267865389585495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505760312080383, | |
| "mean_token_accuracy": 0.7761109918355942, | |
| "num_tokens": 5655041.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5508257895708084, | |
| "epoch": 1.2990654205607477, | |
| "grad_norm": 0.028140036389231682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515832304954529, | |
| "mean_token_accuracy": 0.7750399112701416, | |
| "num_tokens": 5671677.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5565541088581085, | |
| "epoch": 1.302803738317757, | |
| "grad_norm": 0.032449062913656235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538536310195923, | |
| "mean_token_accuracy": 0.7736092507839203, | |
| "num_tokens": 5688187.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5361721217632294, | |
| "epoch": 1.3065420560747665, | |
| "grad_norm": 0.029190748929977417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377737879753113, | |
| "mean_token_accuracy": 0.7808200567960739, | |
| "num_tokens": 5704636.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5346792191267014, | |
| "epoch": 1.3102803738317756, | |
| "grad_norm": 0.03473074361681938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417028665542603, | |
| "mean_token_accuracy": 0.778437003493309, | |
| "num_tokens": 5721160.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5305602103471756, | |
| "epoch": 1.314018691588785, | |
| "grad_norm": 0.03426121547818184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302631258964539, | |
| "mean_token_accuracy": 0.7822723984718323, | |
| "num_tokens": 5737508.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5443065613508224, | |
| "epoch": 1.3177570093457944, | |
| "grad_norm": 0.031232863664627075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438801050186157, | |
| "mean_token_accuracy": 0.7807773351669312, | |
| "num_tokens": 5753931.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5547338575124741, | |
| "epoch": 1.3214953271028038, | |
| "grad_norm": 0.03515113145112991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590701103210449, | |
| "mean_token_accuracy": 0.7718778848648071, | |
| "num_tokens": 5770396.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5776932686567307, | |
| "epoch": 1.325233644859813, | |
| "grad_norm": 0.031292639672756195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5758817791938782, | |
| "mean_token_accuracy": 0.76340052485466, | |
| "num_tokens": 5786743.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5471627116203308, | |
| "epoch": 1.3289719626168224, | |
| "grad_norm": 0.02935577929019928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406426787376404, | |
| "mean_token_accuracy": 0.7801960557699203, | |
| "num_tokens": 5803296.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5335498154163361, | |
| "epoch": 1.3327102803738318, | |
| "grad_norm": 0.029476149007678032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379401445388794, | |
| "mean_token_accuracy": 0.7807924002408981, | |
| "num_tokens": 5819523.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.571747362613678, | |
| "epoch": 1.3364485981308412, | |
| "grad_norm": 0.030969126150012016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5734298825263977, | |
| "mean_token_accuracy": 0.7665233165025711, | |
| "num_tokens": 5835904.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5278273224830627, | |
| "epoch": 1.3401869158878505, | |
| "grad_norm": 0.035017624497413635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390288233757019, | |
| "mean_token_accuracy": 0.7818515002727509, | |
| "num_tokens": 5852087.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.5494511723518372, | |
| "epoch": 1.34392523364486, | |
| "grad_norm": 0.0332498699426651, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546149611473083, | |
| "mean_token_accuracy": 0.7754078060388565, | |
| "num_tokens": 5868313.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5656353235244751, | |
| "epoch": 1.347663551401869, | |
| "grad_norm": 0.029156476259231567, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5639902353286743, | |
| "mean_token_accuracy": 0.7691005319356918, | |
| "num_tokens": 5884673.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5517591834068298, | |
| "epoch": 1.3514018691588785, | |
| "grad_norm": 0.033162813633680344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487698316574097, | |
| "mean_token_accuracy": 0.7762563526630402, | |
| "num_tokens": 5901026.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5693054497241974, | |
| "epoch": 1.355140186915888, | |
| "grad_norm": 0.03303493186831474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636650323867798, | |
| "mean_token_accuracy": 0.7702258229255676, | |
| "num_tokens": 5917299.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5485306680202484, | |
| "epoch": 1.358878504672897, | |
| "grad_norm": 0.028174106031656265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443013310432434, | |
| "mean_token_accuracy": 0.7785944491624832, | |
| "num_tokens": 5933711.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5455866008996964, | |
| "epoch": 1.3626168224299064, | |
| "grad_norm": 0.03680690750479698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549443364143372, | |
| "mean_token_accuracy": 0.7760016471147537, | |
| "num_tokens": 5949851.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5625369846820831, | |
| "epoch": 1.3663551401869158, | |
| "grad_norm": 0.03274211287498474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614032745361328, | |
| "mean_token_accuracy": 0.7710064649581909, | |
| "num_tokens": 5966219.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5512880086898804, | |
| "epoch": 1.3700934579439252, | |
| "grad_norm": 0.029914218932390213, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541912317276001, | |
| "mean_token_accuracy": 0.7744521200656891, | |
| "num_tokens": 5982685.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5462228506803513, | |
| "epoch": 1.3738317757009346, | |
| "grad_norm": 0.03740010783076286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542587161064148, | |
| "mean_token_accuracy": 0.7833080589771271, | |
| "num_tokens": 5999012.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5561699420213699, | |
| "epoch": 1.377570093457944, | |
| "grad_norm": 0.03154682740569115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5543806552886963, | |
| "mean_token_accuracy": 0.7729498744010925, | |
| "num_tokens": 6015418.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5295282006263733, | |
| "epoch": 1.3813084112149534, | |
| "grad_norm": 0.029992269352078438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347234010696411, | |
| "mean_token_accuracy": 0.7826734483242035, | |
| "num_tokens": 6031664.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5307233035564423, | |
| "epoch": 1.3850467289719626, | |
| "grad_norm": 0.0387556329369545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442472696304321, | |
| "mean_token_accuracy": 0.7788428515195847, | |
| "num_tokens": 6047789.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5666087120771408, | |
| "epoch": 1.388785046728972, | |
| "grad_norm": 0.03485598787665367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5701879858970642, | |
| "mean_token_accuracy": 0.7664644569158554, | |
| "num_tokens": 6064072.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5600801408290863, | |
| "epoch": 1.3925233644859814, | |
| "grad_norm": 0.030468204990029335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557839035987854, | |
| "mean_token_accuracy": 0.7774783074855804, | |
| "num_tokens": 6080233.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5573039948940277, | |
| "epoch": 1.3962616822429905, | |
| "grad_norm": 0.03327672928571701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551377534866333, | |
| "mean_token_accuracy": 0.7740774154663086, | |
| "num_tokens": 6096552.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5559895187616348, | |
| "epoch": 1.4, | |
| "grad_norm": 0.029464859515428543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499491691589355, | |
| "mean_token_accuracy": 0.778936430811882, | |
| "num_tokens": 6112721.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5373993217945099, | |
| "epoch": 1.4037383177570093, | |
| "grad_norm": 0.033405598253011703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378676652908325, | |
| "mean_token_accuracy": 0.78409743309021, | |
| "num_tokens": 6128876.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5293000936508179, | |
| "epoch": 1.4074766355140187, | |
| "grad_norm": 0.03749069571495056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442302823066711, | |
| "mean_token_accuracy": 0.7793403714895248, | |
| "num_tokens": 6145070.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5288459360599518, | |
| "epoch": 1.411214953271028, | |
| "grad_norm": 0.0304460097104311, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322169661521912, | |
| "mean_token_accuracy": 0.7845710813999176, | |
| "num_tokens": 6161358.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5396905541419983, | |
| "epoch": 1.4149532710280375, | |
| "grad_norm": 0.0334291011095047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536848783493042, | |
| "mean_token_accuracy": 0.7786440551280975, | |
| "num_tokens": 6177744.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5749261528253555, | |
| "epoch": 1.4186915887850469, | |
| "grad_norm": 0.03149184212088585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5657936334609985, | |
| "mean_token_accuracy": 0.7711158096790314, | |
| "num_tokens": 6194294.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5584524124860764, | |
| "epoch": 1.422429906542056, | |
| "grad_norm": 0.03502335026860237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578019618988037, | |
| "mean_token_accuracy": 0.7754084765911102, | |
| "num_tokens": 6210591.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5385516434907913, | |
| "epoch": 1.4261682242990654, | |
| "grad_norm": 0.029922619462013245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379009246826172, | |
| "mean_token_accuracy": 0.7822572886943817, | |
| "num_tokens": 6226836.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5303553491830826, | |
| "epoch": 1.4299065420560748, | |
| "grad_norm": 0.03207620605826378, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399402379989624, | |
| "mean_token_accuracy": 0.7848275154829025, | |
| "num_tokens": 6243140.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5435499548912048, | |
| "epoch": 1.433644859813084, | |
| "grad_norm": 0.034929681569337845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510104298591614, | |
| "mean_token_accuracy": 0.7754337340593338, | |
| "num_tokens": 6259135.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5495016276836395, | |
| "epoch": 1.4373831775700934, | |
| "grad_norm": 0.02961392141878605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518282651901245, | |
| "mean_token_accuracy": 0.7770158797502518, | |
| "num_tokens": 6275478.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5597821772098541, | |
| "epoch": 1.4411214953271028, | |
| "grad_norm": 0.03038998879492283, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598548650741577, | |
| "mean_token_accuracy": 0.7717087864875793, | |
| "num_tokens": 6292022.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5554857552051544, | |
| "epoch": 1.4448598130841122, | |
| "grad_norm": 0.034831635653972626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589088201522827, | |
| "mean_token_accuracy": 0.7742104977369308, | |
| "num_tokens": 6308395.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5330976247787476, | |
| "epoch": 1.4485981308411215, | |
| "grad_norm": 0.03864655643701553, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340397357940674, | |
| "mean_token_accuracy": 0.7843937277793884, | |
| "num_tokens": 6324443.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.5459477603435516, | |
| "epoch": 1.452336448598131, | |
| "grad_norm": 0.03552354499697685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546898603439331, | |
| "mean_token_accuracy": 0.7767336070537567, | |
| "num_tokens": 6340452.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.555869922041893, | |
| "epoch": 1.45607476635514, | |
| "grad_norm": 0.042999885976314545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562218427658081, | |
| "mean_token_accuracy": 0.772677481174469, | |
| "num_tokens": 6356737.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5476373881101608, | |
| "epoch": 1.4598130841121495, | |
| "grad_norm": 0.034353937953710556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502485632896423, | |
| "mean_token_accuracy": 0.7757505625486374, | |
| "num_tokens": 6372959.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5542000085115433, | |
| "epoch": 1.4635514018691589, | |
| "grad_norm": 0.030675135552883148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507063865661621, | |
| "mean_token_accuracy": 0.7746506035327911, | |
| "num_tokens": 6389285.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5308681577444077, | |
| "epoch": 1.4672897196261683, | |
| "grad_norm": 0.03328751027584076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308902263641357, | |
| "mean_token_accuracy": 0.7832993865013123, | |
| "num_tokens": 6405473.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5490089803934097, | |
| "epoch": 1.4710280373831774, | |
| "grad_norm": 0.03258799389004707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524098873138428, | |
| "mean_token_accuracy": 0.7753634303808212, | |
| "num_tokens": 6421682.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5617490261793137, | |
| "epoch": 1.4747663551401868, | |
| "grad_norm": 0.03237268701195717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609363913536072, | |
| "mean_token_accuracy": 0.7727462351322174, | |
| "num_tokens": 6438225.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5548438280820847, | |
| "epoch": 1.4785046728971962, | |
| "grad_norm": 0.0355081707239151, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486972332000732, | |
| "mean_token_accuracy": 0.7752490490674973, | |
| "num_tokens": 6454558.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.539698138833046, | |
| "epoch": 1.4822429906542056, | |
| "grad_norm": 0.03101828694343567, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438753366470337, | |
| "mean_token_accuracy": 0.776269868016243, | |
| "num_tokens": 6470673.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5318429321050644, | |
| "epoch": 1.485981308411215, | |
| "grad_norm": 0.040831487625837326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361422300338745, | |
| "mean_token_accuracy": 0.7855317145586014, | |
| "num_tokens": 6486739.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5382596254348755, | |
| "epoch": 1.4897196261682244, | |
| "grad_norm": 0.03325575962662697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401434302330017, | |
| "mean_token_accuracy": 0.7797534018754959, | |
| "num_tokens": 6502900.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.5596988648176193, | |
| "epoch": 1.4934579439252336, | |
| "grad_norm": 0.028764478862285614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577390193939209, | |
| "mean_token_accuracy": 0.7748348712921143, | |
| "num_tokens": 6519408.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5493527054786682, | |
| "epoch": 1.497196261682243, | |
| "grad_norm": 0.028892861679196358, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473135709762573, | |
| "mean_token_accuracy": 0.777830645442009, | |
| "num_tokens": 6535811.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5402602255344391, | |
| "epoch": 1.5009345794392523, | |
| "grad_norm": 0.03191126883029938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474570989608765, | |
| "mean_token_accuracy": 0.7774458974599838, | |
| "num_tokens": 6552173.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.540817379951477, | |
| "epoch": 1.5046728971962615, | |
| "grad_norm": 0.03177822753787041, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548837423324585, | |
| "mean_token_accuracy": 0.7776143550872803, | |
| "num_tokens": 6568527.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5428208336234093, | |
| "epoch": 1.508411214953271, | |
| "grad_norm": 0.030568130314350128, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432289242744446, | |
| "mean_token_accuracy": 0.7798717468976974, | |
| "num_tokens": 6584756.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5466499626636505, | |
| "epoch": 1.5121495327102803, | |
| "grad_norm": 0.032929882407188416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407195687294006, | |
| "mean_token_accuracy": 0.7786379009485245, | |
| "num_tokens": 6601082.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5593132823705673, | |
| "epoch": 1.5158878504672897, | |
| "grad_norm": 0.03837394341826439, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5646262168884277, | |
| "mean_token_accuracy": 0.771564781665802, | |
| "num_tokens": 6617429.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5453289300203323, | |
| "epoch": 1.519626168224299, | |
| "grad_norm": 0.03576509654521942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487722158432007, | |
| "mean_token_accuracy": 0.7768426388502121, | |
| "num_tokens": 6633826.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.53939288854599, | |
| "epoch": 1.5233644859813085, | |
| "grad_norm": 0.032857585698366165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385522246360779, | |
| "mean_token_accuracy": 0.7790959179401398, | |
| "num_tokens": 6650240.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5520011931657791, | |
| "epoch": 1.5271028037383179, | |
| "grad_norm": 0.030627621337771416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516581535339355, | |
| "mean_token_accuracy": 0.7760986834764481, | |
| "num_tokens": 6666454.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5406108945608139, | |
| "epoch": 1.5308411214953273, | |
| "grad_norm": 0.036952704191207886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545346736907959, | |
| "mean_token_accuracy": 0.7765967845916748, | |
| "num_tokens": 6682741.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5551878213882446, | |
| "epoch": 1.5345794392523364, | |
| "grad_norm": 0.02871653437614441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54979008436203, | |
| "mean_token_accuracy": 0.7789790332317352, | |
| "num_tokens": 6699160.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5512814819812775, | |
| "epoch": 1.5383177570093458, | |
| "grad_norm": 0.03201194107532501, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527634620666504, | |
| "mean_token_accuracy": 0.7734574526548386, | |
| "num_tokens": 6715511.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5432283580303192, | |
| "epoch": 1.542056074766355, | |
| "grad_norm": 0.040297310799360275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455228686332703, | |
| "mean_token_accuracy": 0.7767939269542694, | |
| "num_tokens": 6731688.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5464504212141037, | |
| "epoch": 1.5457943925233644, | |
| "grad_norm": 0.03343544527888298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543891191482544, | |
| "mean_token_accuracy": 0.7797385454177856, | |
| "num_tokens": 6747995.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5669636428356171, | |
| "epoch": 1.5495327102803738, | |
| "grad_norm": 0.03769576549530029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5689972639083862, | |
| "mean_token_accuracy": 0.7693852484226227, | |
| "num_tokens": 6764353.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5392922759056091, | |
| "epoch": 1.5532710280373832, | |
| "grad_norm": 0.03238385543227196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441082715988159, | |
| "mean_token_accuracy": 0.779180720448494, | |
| "num_tokens": 6780896.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.530147522687912, | |
| "epoch": 1.5570093457943925, | |
| "grad_norm": 0.040036849677562714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422973036766052, | |
| "mean_token_accuracy": 0.7789286226034164, | |
| "num_tokens": 6797151.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5386764258146286, | |
| "epoch": 1.560747663551402, | |
| "grad_norm": 0.03689395636320114, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467624068260193, | |
| "mean_token_accuracy": 0.7778990417718887, | |
| "num_tokens": 6813386.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5509621798992157, | |
| "epoch": 1.5644859813084113, | |
| "grad_norm": 0.029403693974018097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459365248680115, | |
| "mean_token_accuracy": 0.7784391641616821, | |
| "num_tokens": 6829627.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5576108992099762, | |
| "epoch": 1.5682242990654207, | |
| "grad_norm": 0.03426877036690712, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519037246704102, | |
| "mean_token_accuracy": 0.7766879051923752, | |
| "num_tokens": 6845675.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5511836111545563, | |
| "epoch": 1.5719626168224299, | |
| "grad_norm": 0.03294205665588379, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434479117393494, | |
| "mean_token_accuracy": 0.7805502861738205, | |
| "num_tokens": 6861921.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5404133796691895, | |
| "epoch": 1.5757009345794393, | |
| "grad_norm": 0.032488446682691574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410423278808594, | |
| "mean_token_accuracy": 0.7808396965265274, | |
| "num_tokens": 6877883.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5403463542461395, | |
| "epoch": 1.5794392523364484, | |
| "grad_norm": 0.03610778972506523, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484398603439331, | |
| "mean_token_accuracy": 0.775899812579155, | |
| "num_tokens": 6894361.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5344756990671158, | |
| "epoch": 1.5831775700934578, | |
| "grad_norm": 0.040382951498031616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388015508651733, | |
| "mean_token_accuracy": 0.7805848121643066, | |
| "num_tokens": 6910715.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5353002026677132, | |
| "epoch": 1.5869158878504672, | |
| "grad_norm": 0.03316662460565567, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393432974815369, | |
| "mean_token_accuracy": 0.7816650718450546, | |
| "num_tokens": 6927150.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5770704746246338, | |
| "epoch": 1.5906542056074766, | |
| "grad_norm": 0.034545231610536575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.579833984375, | |
| "mean_token_accuracy": 0.7628369480371475, | |
| "num_tokens": 6943549.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.5552347898483276, | |
| "epoch": 1.594392523364486, | |
| "grad_norm": 0.03268204629421234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537080764770508, | |
| "mean_token_accuracy": 0.7791409194469452, | |
| "num_tokens": 6959832.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5671118795871735, | |
| "epoch": 1.5981308411214954, | |
| "grad_norm": 0.025902021676301956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5616373419761658, | |
| "mean_token_accuracy": 0.771975114941597, | |
| "num_tokens": 6976368.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5544670224189758, | |
| "epoch": 1.6018691588785048, | |
| "grad_norm": 0.0315086655318737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545330047607422, | |
| "mean_token_accuracy": 0.7738883197307587, | |
| "num_tokens": 6992718.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5558904558420181, | |
| "epoch": 1.6056074766355142, | |
| "grad_norm": 0.033460259437561035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574325323104858, | |
| "mean_token_accuracy": 0.772273600101471, | |
| "num_tokens": 7009062.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5590114444494247, | |
| "epoch": 1.6093457943925233, | |
| "grad_norm": 0.029064292088150978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5580740571022034, | |
| "mean_token_accuracy": 0.7744424343109131, | |
| "num_tokens": 7025645.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5402631610631943, | |
| "epoch": 1.6130841121495327, | |
| "grad_norm": 0.04296636953949928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493630170822144, | |
| "mean_token_accuracy": 0.7780915945768356, | |
| "num_tokens": 7041830.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5555061250925064, | |
| "epoch": 1.616822429906542, | |
| "grad_norm": 0.03312353044748306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578774809837341, | |
| "mean_token_accuracy": 0.7739899456501007, | |
| "num_tokens": 7058231.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5563363283872604, | |
| "epoch": 1.6205607476635513, | |
| "grad_norm": 0.03301616013050079, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517432689666748, | |
| "mean_token_accuracy": 0.7788877487182617, | |
| "num_tokens": 7074655.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5507991462945938, | |
| "epoch": 1.6242990654205607, | |
| "grad_norm": 0.03195936232805252, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476133227348328, | |
| "mean_token_accuracy": 0.7775176912546158, | |
| "num_tokens": 7090766.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5565993189811707, | |
| "epoch": 1.62803738317757, | |
| "grad_norm": 0.03229626268148422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532009601593018, | |
| "mean_token_accuracy": 0.7752693891525269, | |
| "num_tokens": 7106963.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5465118885040283, | |
| "epoch": 1.6317757009345795, | |
| "grad_norm": 0.034706246107816696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551576554775238, | |
| "mean_token_accuracy": 0.7718321233987808, | |
| "num_tokens": 7122926.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5443113446235657, | |
| "epoch": 1.6355140186915889, | |
| "grad_norm": 0.04082060605287552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574634671211243, | |
| "mean_token_accuracy": 0.7741082310676575, | |
| "num_tokens": 7139165.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5489460676908493, | |
| "epoch": 1.6392523364485982, | |
| "grad_norm": 0.03261584788560867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546178817749023, | |
| "mean_token_accuracy": 0.7754340916872025, | |
| "num_tokens": 7155500.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5663624107837677, | |
| "epoch": 1.6429906542056076, | |
| "grad_norm": 0.030861368402838707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564441442489624, | |
| "mean_token_accuracy": 0.7708708792924881, | |
| "num_tokens": 7171927.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5702053755521774, | |
| "epoch": 1.6467289719626168, | |
| "grad_norm": 0.03468736633658409, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645827651023865, | |
| "mean_token_accuracy": 0.768431767821312, | |
| "num_tokens": 7188341.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5505633056163788, | |
| "epoch": 1.6504672897196262, | |
| "grad_norm": 0.03153201565146446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395671725273132, | |
| "mean_token_accuracy": 0.7812985777854919, | |
| "num_tokens": 7204527.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5565541088581085, | |
| "epoch": 1.6542056074766354, | |
| "grad_norm": 0.033020708709955215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557956874370575, | |
| "mean_token_accuracy": 0.7709688693284988, | |
| "num_tokens": 7220831.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5384746044874191, | |
| "epoch": 1.6579439252336448, | |
| "grad_norm": 0.0418318547308445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513378977775574, | |
| "mean_token_accuracy": 0.7791547626256943, | |
| "num_tokens": 7236949.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5353372693061829, | |
| "epoch": 1.6616822429906541, | |
| "grad_norm": 0.03820660710334778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490580201148987, | |
| "mean_token_accuracy": 0.7749721854925156, | |
| "num_tokens": 7253242.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5484792143106461, | |
| "epoch": 1.6654205607476635, | |
| "grad_norm": 0.03215263411402702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497522354125977, | |
| "mean_token_accuracy": 0.7769928872585297, | |
| "num_tokens": 7269457.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5664080828428268, | |
| "epoch": 1.669158878504673, | |
| "grad_norm": 0.02815551683306694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563632249832153, | |
| "mean_token_accuracy": 0.7749156504869461, | |
| "num_tokens": 7285879.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5464235991239548, | |
| "epoch": 1.6728971962616823, | |
| "grad_norm": 0.02781211957335472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405099391937256, | |
| "mean_token_accuracy": 0.781552255153656, | |
| "num_tokens": 7302263.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5339583903551102, | |
| "epoch": 1.6766355140186917, | |
| "grad_norm": 0.02980860136449337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369037985801697, | |
| "mean_token_accuracy": 0.7814508825540543, | |
| "num_tokens": 7318270.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5407254248857498, | |
| "epoch": 1.680373831775701, | |
| "grad_norm": 0.03138496354222298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460474491119385, | |
| "mean_token_accuracy": 0.7780201584100723, | |
| "num_tokens": 7334492.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5503694117069244, | |
| "epoch": 1.6841121495327103, | |
| "grad_norm": 0.033992450684309006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556005239486694, | |
| "mean_token_accuracy": 0.7745715081691742, | |
| "num_tokens": 7350627.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5451936274766922, | |
| "epoch": 1.6878504672897197, | |
| "grad_norm": 0.03251323476433754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443669557571411, | |
| "mean_token_accuracy": 0.7780810743570328, | |
| "num_tokens": 7367005.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5657957345247269, | |
| "epoch": 1.6915887850467288, | |
| "grad_norm": 0.034646324813365936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615976452827454, | |
| "mean_token_accuracy": 0.7718859612941742, | |
| "num_tokens": 7383262.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5525887459516525, | |
| "epoch": 1.6953271028037382, | |
| "grad_norm": 0.04024709016084671, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542372465133667, | |
| "mean_token_accuracy": 0.7756317108869553, | |
| "num_tokens": 7399750.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5493184924125671, | |
| "epoch": 1.6990654205607476, | |
| "grad_norm": 0.030978472903370857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475279688835144, | |
| "mean_token_accuracy": 0.7762274444103241, | |
| "num_tokens": 7415800.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5400003641843796, | |
| "epoch": 1.702803738317757, | |
| "grad_norm": 0.03376868739724159, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407789349555969, | |
| "mean_token_accuracy": 0.7818103283643723, | |
| "num_tokens": 7431961.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.535884216427803, | |
| "epoch": 1.7065420560747664, | |
| "grad_norm": 0.031221890822052956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440670847892761, | |
| "mean_token_accuracy": 0.7796338200569153, | |
| "num_tokens": 7448202.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5389861762523651, | |
| "epoch": 1.7102803738317758, | |
| "grad_norm": 0.035680338740348816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449787974357605, | |
| "mean_token_accuracy": 0.7797497361898422, | |
| "num_tokens": 7464671.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5451969653367996, | |
| "epoch": 1.7140186915887852, | |
| "grad_norm": 0.03255719691514969, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538266897201538, | |
| "mean_token_accuracy": 0.776149570941925, | |
| "num_tokens": 7480992.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5643452405929565, | |
| "epoch": 1.7177570093457943, | |
| "grad_norm": 0.03378691151738167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571281313896179, | |
| "mean_token_accuracy": 0.7731311619281769, | |
| "num_tokens": 7497232.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5346335917711258, | |
| "epoch": 1.7214953271028037, | |
| "grad_norm": 0.03035924583673477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269172191619873, | |
| "mean_token_accuracy": 0.7836929112672806, | |
| "num_tokens": 7513644.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5628820955753326, | |
| "epoch": 1.7252336448598131, | |
| "grad_norm": 0.03539309278130531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605576634407043, | |
| "mean_token_accuracy": 0.7706831097602844, | |
| "num_tokens": 7529830.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5182670503854752, | |
| "epoch": 1.7289719626168223, | |
| "grad_norm": 0.036859650164842606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5209002494812012, | |
| "mean_token_accuracy": 0.7879375368356705, | |
| "num_tokens": 7545846.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5474621504545212, | |
| "epoch": 1.7327102803738317, | |
| "grad_norm": 0.037796422839164734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536765456199646, | |
| "mean_token_accuracy": 0.7753565907478333, | |
| "num_tokens": 7562267.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5636439174413681, | |
| "epoch": 1.736448598130841, | |
| "grad_norm": 0.037271831184625626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606362223625183, | |
| "mean_token_accuracy": 0.7704486697912216, | |
| "num_tokens": 7578670.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5483116805553436, | |
| "epoch": 1.7401869158878505, | |
| "grad_norm": 0.031047314405441284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489611029624939, | |
| "mean_token_accuracy": 0.7756731957197189, | |
| "num_tokens": 7595113.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5289314538240433, | |
| "epoch": 1.7439252336448599, | |
| "grad_norm": 0.035078927874565125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344489216804504, | |
| "mean_token_accuracy": 0.7853281199932098, | |
| "num_tokens": 7611153.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.541694313287735, | |
| "epoch": 1.7476635514018692, | |
| "grad_norm": 0.030235178768634796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412616729736328, | |
| "mean_token_accuracy": 0.7781483829021454, | |
| "num_tokens": 7627712.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5554275363683701, | |
| "epoch": 1.7514018691588786, | |
| "grad_norm": 0.036943912506103516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531514286994934, | |
| "mean_token_accuracy": 0.7756786197423935, | |
| "num_tokens": 7643922.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5472631007432938, | |
| "epoch": 1.7551401869158878, | |
| "grad_norm": 0.030970100313425064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467809438705444, | |
| "mean_token_accuracy": 0.780939131975174, | |
| "num_tokens": 7660096.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.525331124663353, | |
| "epoch": 1.7588785046728972, | |
| "grad_norm": 0.04763743281364441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361969470977783, | |
| "mean_token_accuracy": 0.782649889588356, | |
| "num_tokens": 7676237.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5514428466558456, | |
| "epoch": 1.7626168224299066, | |
| "grad_norm": 0.02942316047847271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563341975212097, | |
| "mean_token_accuracy": 0.773899495601654, | |
| "num_tokens": 7692848.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5428648442029953, | |
| "epoch": 1.7663551401869158, | |
| "grad_norm": 0.038572002202272415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449008941650391, | |
| "mean_token_accuracy": 0.7810295820236206, | |
| "num_tokens": 7708895.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.5526584386825562, | |
| "epoch": 1.7700934579439251, | |
| "grad_norm": 0.03303026407957077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465356111526489, | |
| "mean_token_accuracy": 0.7774733603000641, | |
| "num_tokens": 7725206.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5638225227594376, | |
| "epoch": 1.7738317757009345, | |
| "grad_norm": 0.029633166268467903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5624324083328247, | |
| "mean_token_accuracy": 0.7697116434574127, | |
| "num_tokens": 7741838.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5561016201972961, | |
| "epoch": 1.777570093457944, | |
| "grad_norm": 0.0328570231795311, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563735961914062, | |
| "mean_token_accuracy": 0.7721449285745621, | |
| "num_tokens": 7758049.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5516675412654877, | |
| "epoch": 1.7813084112149533, | |
| "grad_norm": 0.03453238308429718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518988370895386, | |
| "mean_token_accuracy": 0.7777107656002045, | |
| "num_tokens": 7774257.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5394668728113174, | |
| "epoch": 1.7850467289719627, | |
| "grad_norm": 0.03409087657928467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432859659194946, | |
| "mean_token_accuracy": 0.7796248197555542, | |
| "num_tokens": 7790837.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5491889864206314, | |
| "epoch": 1.788785046728972, | |
| "grad_norm": 0.03139546513557434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477681159973145, | |
| "mean_token_accuracy": 0.7775027453899384, | |
| "num_tokens": 7807302.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5528343021869659, | |
| "epoch": 1.7925233644859813, | |
| "grad_norm": 0.031248709186911583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557167530059814, | |
| "mean_token_accuracy": 0.7744993418455124, | |
| "num_tokens": 7823635.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5458249896764755, | |
| "epoch": 1.7962616822429907, | |
| "grad_norm": 0.03402215987443924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505017042160034, | |
| "mean_token_accuracy": 0.7759317308664322, | |
| "num_tokens": 7839914.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.552555724978447, | |
| "epoch": 1.8, | |
| "grad_norm": 0.030951669439673424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560877799987793, | |
| "mean_token_accuracy": 0.77203568816185, | |
| "num_tokens": 7856194.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.5391200333833694, | |
| "epoch": 1.8037383177570092, | |
| "grad_norm": 0.04003436490893364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390163660049438, | |
| "mean_token_accuracy": 0.7827838510274887, | |
| "num_tokens": 7872434.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5392342656850815, | |
| "epoch": 1.8074766355140186, | |
| "grad_norm": 0.03150493651628494, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406180620193481, | |
| "mean_token_accuracy": 0.7828439474105835, | |
| "num_tokens": 7888751.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5622579157352448, | |
| "epoch": 1.811214953271028, | |
| "grad_norm": 0.03376127406954765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644164681434631, | |
| "mean_token_accuracy": 0.7707268595695496, | |
| "num_tokens": 7905072.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5327235907316208, | |
| "epoch": 1.8149532710280374, | |
| "grad_norm": 0.028277890756726265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5303685069084167, | |
| "mean_token_accuracy": 0.7862435132265091, | |
| "num_tokens": 7921459.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5588890165090561, | |
| "epoch": 1.8186915887850468, | |
| "grad_norm": 0.03095029853284359, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525569915771484, | |
| "mean_token_accuracy": 0.7770346105098724, | |
| "num_tokens": 7937961.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5573548376560211, | |
| "epoch": 1.8224299065420562, | |
| "grad_norm": 0.03045843541622162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535331964492798, | |
| "mean_token_accuracy": 0.7766827940940857, | |
| "num_tokens": 7954609.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.5567604452371597, | |
| "epoch": 1.8261682242990656, | |
| "grad_norm": 0.029482809826731682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576134324073792, | |
| "mean_token_accuracy": 0.772316038608551, | |
| "num_tokens": 7971097.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5545413047075272, | |
| "epoch": 1.8299065420560747, | |
| "grad_norm": 0.03891676291823387, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5648533701896667, | |
| "mean_token_accuracy": 0.7718105167150497, | |
| "num_tokens": 7987377.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.5786599218845367, | |
| "epoch": 1.8336448598130841, | |
| "grad_norm": 0.030758248642086983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5835361480712891, | |
| "mean_token_accuracy": 0.762917771935463, | |
| "num_tokens": 8003799.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5397150218486786, | |
| "epoch": 1.8373831775700935, | |
| "grad_norm": 0.03965795785188675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538779616355896, | |
| "mean_token_accuracy": 0.7839108556509018, | |
| "num_tokens": 8020279.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5535183995962143, | |
| "epoch": 1.8411214953271027, | |
| "grad_norm": 0.03004513867199421, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507811903953552, | |
| "mean_token_accuracy": 0.7755124121904373, | |
| "num_tokens": 8036491.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5442592799663544, | |
| "epoch": 1.844859813084112, | |
| "grad_norm": 0.03522132337093353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478004217147827, | |
| "mean_token_accuracy": 0.7766154408454895, | |
| "num_tokens": 8052807.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5266854241490364, | |
| "epoch": 1.8485981308411215, | |
| "grad_norm": 0.030206192284822464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529688835144043, | |
| "mean_token_accuracy": 0.7819836139678955, | |
| "num_tokens": 8068712.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5283671095967293, | |
| "epoch": 1.8523364485981308, | |
| "grad_norm": 0.03329138457775116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376101136207581, | |
| "mean_token_accuracy": 0.7793748378753662, | |
| "num_tokens": 8085084.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5712718665599823, | |
| "epoch": 1.8560747663551402, | |
| "grad_norm": 0.0325874425470829, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5709162950515747, | |
| "mean_token_accuracy": 0.7662056684494019, | |
| "num_tokens": 8101731.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5663121491670609, | |
| "epoch": 1.8598130841121496, | |
| "grad_norm": 0.03357568010687828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650657415390015, | |
| "mean_token_accuracy": 0.7691219747066498, | |
| "num_tokens": 8118244.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.5427432358264923, | |
| "epoch": 1.863551401869159, | |
| "grad_norm": 0.03203551098704338, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398803949356079, | |
| "mean_token_accuracy": 0.7808598130941391, | |
| "num_tokens": 8134657.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5573120266199112, | |
| "epoch": 1.8672897196261682, | |
| "grad_norm": 0.029932986944913864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522656440734863, | |
| "mean_token_accuracy": 0.7727643102407455, | |
| "num_tokens": 8151058.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5573428720235825, | |
| "epoch": 1.8710280373831776, | |
| "grad_norm": 0.02661440148949623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512294173240662, | |
| "mean_token_accuracy": 0.7765780538320541, | |
| "num_tokens": 8167736.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5472890585660934, | |
| "epoch": 1.874766355140187, | |
| "grad_norm": 0.028882022947072983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479044318199158, | |
| "mean_token_accuracy": 0.777178093791008, | |
| "num_tokens": 8183857.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5511818528175354, | |
| "epoch": 1.8785046728971961, | |
| "grad_norm": 0.032389186322689056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552236437797546, | |
| "mean_token_accuracy": 0.7762337774038315, | |
| "num_tokens": 8199955.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.546854555606842, | |
| "epoch": 1.8822429906542055, | |
| "grad_norm": 0.0336172878742218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55290687084198, | |
| "mean_token_accuracy": 0.7735693603754044, | |
| "num_tokens": 8216221.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5447833836078644, | |
| "epoch": 1.885981308411215, | |
| "grad_norm": 0.0326668806374073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433166027069092, | |
| "mean_token_accuracy": 0.7759248912334442, | |
| "num_tokens": 8232519.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5311590135097504, | |
| "epoch": 1.8897196261682243, | |
| "grad_norm": 0.0328470915555954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332115888595581, | |
| "mean_token_accuracy": 0.7827264666557312, | |
| "num_tokens": 8248973.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5405398160219193, | |
| "epoch": 1.8934579439252337, | |
| "grad_norm": 0.03319946303963661, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498695969581604, | |
| "mean_token_accuracy": 0.7756136506795883, | |
| "num_tokens": 8265054.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5590761750936508, | |
| "epoch": 1.897196261682243, | |
| "grad_norm": 0.03323895111680031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5674346685409546, | |
| "mean_token_accuracy": 0.7680935710668564, | |
| "num_tokens": 8281659.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5502993315458298, | |
| "epoch": 1.9009345794392525, | |
| "grad_norm": 0.036393504589796066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518926382064819, | |
| "mean_token_accuracy": 0.7772549986839294, | |
| "num_tokens": 8298120.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.5434653609991074, | |
| "epoch": 1.9046728971962616, | |
| "grad_norm": 0.030826875939965248, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373662710189819, | |
| "mean_token_accuracy": 0.7814789414405823, | |
| "num_tokens": 8314165.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5616354942321777, | |
| "epoch": 1.908411214953271, | |
| "grad_norm": 0.03320663422346115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573338866233826, | |
| "mean_token_accuracy": 0.7744273245334625, | |
| "num_tokens": 8330561.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5629893988370895, | |
| "epoch": 1.9121495327102802, | |
| "grad_norm": 0.03727097064256668, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611152648925781, | |
| "mean_token_accuracy": 0.773328885436058, | |
| "num_tokens": 8346708.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.5592319965362549, | |
| "epoch": 1.9158878504672896, | |
| "grad_norm": 0.03037538379430771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5616269111633301, | |
| "mean_token_accuracy": 0.7723426669836044, | |
| "num_tokens": 8362957.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.549030601978302, | |
| "epoch": 1.919626168224299, | |
| "grad_norm": 0.03563016280531883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529686808586121, | |
| "mean_token_accuracy": 0.7743269205093384, | |
| "num_tokens": 8379387.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5441324412822723, | |
| "epoch": 1.9233644859813084, | |
| "grad_norm": 0.031737376004457474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500344038009644, | |
| "mean_token_accuracy": 0.7763906866312027, | |
| "num_tokens": 8395747.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5507270097732544, | |
| "epoch": 1.9271028037383178, | |
| "grad_norm": 0.03285627067089081, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587583780288696, | |
| "mean_token_accuracy": 0.7742376923561096, | |
| "num_tokens": 8412181.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5456591248512268, | |
| "epoch": 1.9308411214953272, | |
| "grad_norm": 0.03147684410214424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484343767166138, | |
| "mean_token_accuracy": 0.7780278623104095, | |
| "num_tokens": 8428664.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.5484454035758972, | |
| "epoch": 1.9345794392523366, | |
| "grad_norm": 0.036278773099184036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547294616699219, | |
| "mean_token_accuracy": 0.7715467214584351, | |
| "num_tokens": 8444942.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.5404845178127289, | |
| "epoch": 1.938317757009346, | |
| "grad_norm": 0.032282162457704544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409780740737915, | |
| "mean_token_accuracy": 0.779809907078743, | |
| "num_tokens": 8461035.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5527531504631042, | |
| "epoch": 1.9420560747663551, | |
| "grad_norm": 0.03141535073518753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439110994338989, | |
| "mean_token_accuracy": 0.7789405584335327, | |
| "num_tokens": 8477333.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5531508475542068, | |
| "epoch": 1.9457943925233645, | |
| "grad_norm": 0.032372504472732544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456727147102356, | |
| "mean_token_accuracy": 0.7779283076524734, | |
| "num_tokens": 8493646.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.558539628982544, | |
| "epoch": 1.9495327102803737, | |
| "grad_norm": 0.03805968537926674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575815439224243, | |
| "mean_token_accuracy": 0.7742009460926056, | |
| "num_tokens": 8510069.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5543098747730255, | |
| "epoch": 1.953271028037383, | |
| "grad_norm": 0.03495538979768753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558309018611908, | |
| "mean_token_accuracy": 0.7752062678337097, | |
| "num_tokens": 8526687.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5394291281700134, | |
| "epoch": 1.9570093457943925, | |
| "grad_norm": 0.060034435242414474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499407649040222, | |
| "mean_token_accuracy": 0.7747859209775925, | |
| "num_tokens": 8543194.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5493269860744476, | |
| "epoch": 1.9607476635514018, | |
| "grad_norm": 0.03242463245987892, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581871271133423, | |
| "mean_token_accuracy": 0.7717736065387726, | |
| "num_tokens": 8559635.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5709338933229446, | |
| "epoch": 1.9644859813084112, | |
| "grad_norm": 0.09612691402435303, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5857856273651123, | |
| "mean_token_accuracy": 0.7716985046863556, | |
| "num_tokens": 8575682.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5535681843757629, | |
| "epoch": 1.9682242990654206, | |
| "grad_norm": 0.03228386864066124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427148342132568, | |
| "mean_token_accuracy": 0.7775698453187943, | |
| "num_tokens": 8591993.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5595205128192902, | |
| "epoch": 1.97196261682243, | |
| "grad_norm": 0.05833456665277481, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632327795028687, | |
| "mean_token_accuracy": 0.7714700251817703, | |
| "num_tokens": 8608390.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5412962287664413, | |
| "epoch": 1.9757009345794394, | |
| "grad_norm": 0.04238782078027725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416378378868103, | |
| "mean_token_accuracy": 0.7781312763690948, | |
| "num_tokens": 8624553.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5466502904891968, | |
| "epoch": 1.9794392523364486, | |
| "grad_norm": 0.038432635366916656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546246767044067, | |
| "mean_token_accuracy": 0.7747474908828735, | |
| "num_tokens": 8640859.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5358689278364182, | |
| "epoch": 1.983177570093458, | |
| "grad_norm": 0.03189871460199356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437639355659485, | |
| "mean_token_accuracy": 0.7790816277265549, | |
| "num_tokens": 8657164.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5428293794393539, | |
| "epoch": 1.9869158878504671, | |
| "grad_norm": 0.031927406787872314, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386630892753601, | |
| "mean_token_accuracy": 0.7813318967819214, | |
| "num_tokens": 8673653.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5520585179328918, | |
| "epoch": 1.9906542056074765, | |
| "grad_norm": 0.036430567502975464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499424338340759, | |
| "mean_token_accuracy": 0.7754785418510437, | |
| "num_tokens": 8689872.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5492618307471275, | |
| "epoch": 1.994392523364486, | |
| "grad_norm": 0.03422766923904419, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523169040679932, | |
| "mean_token_accuracy": 0.7751457393169403, | |
| "num_tokens": 8706316.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5318035036325455, | |
| "epoch": 1.9981308411214953, | |
| "grad_norm": 0.029748188331723213, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351126790046692, | |
| "mean_token_accuracy": 0.7828892469406128, | |
| "num_tokens": 8722797.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5385511517524719, | |
| "epoch": 2.0, | |
| "grad_norm": 0.05353870987892151, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426778197288513, | |
| "mean_token_accuracy": 0.7800059914588928, | |
| "num_tokens": 8729632.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5657109320163727, | |
| "epoch": 2.0037383177570094, | |
| "grad_norm": 0.03845514729619026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532518029212952, | |
| "mean_token_accuracy": 0.7752802222967148, | |
| "num_tokens": 8746094.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5414439141750336, | |
| "epoch": 2.007476635514019, | |
| "grad_norm": 0.030992809683084488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374770164489746, | |
| "mean_token_accuracy": 0.7807809114456177, | |
| "num_tokens": 8762553.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.520616352558136, | |
| "epoch": 2.011214953271028, | |
| "grad_norm": 0.03543594852089882, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239126086235046, | |
| "mean_token_accuracy": 0.7860341370105743, | |
| "num_tokens": 8778649.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5175309702754021, | |
| "epoch": 2.0149532710280376, | |
| "grad_norm": 0.03473593294620514, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261198282241821, | |
| "mean_token_accuracy": 0.7881468534469604, | |
| "num_tokens": 8794905.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5151422992348671, | |
| "epoch": 2.0186915887850465, | |
| "grad_norm": 0.038792964071035385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288342833518982, | |
| "mean_token_accuracy": 0.7841326147317886, | |
| "num_tokens": 8811277.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5424266159534454, | |
| "epoch": 2.022429906542056, | |
| "grad_norm": 0.03833077475428581, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454620718955994, | |
| "mean_token_accuracy": 0.7795733213424683, | |
| "num_tokens": 8827670.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.533804714679718, | |
| "epoch": 2.0261682242990653, | |
| "grad_norm": 0.03583015128970146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267578959465027, | |
| "mean_token_accuracy": 0.7867784053087234, | |
| "num_tokens": 8843733.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5466929823160172, | |
| "epoch": 2.0299065420560747, | |
| "grad_norm": 0.03870777040719986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435931086540222, | |
| "mean_token_accuracy": 0.7770351320505142, | |
| "num_tokens": 8860036.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5408391207456589, | |
| "epoch": 2.033644859813084, | |
| "grad_norm": 0.03353007137775421, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323169827461243, | |
| "mean_token_accuracy": 0.7834465950727463, | |
| "num_tokens": 8876470.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.5217868834733963, | |
| "epoch": 2.0373831775700935, | |
| "grad_norm": 0.036939021199941635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5216724276542664, | |
| "mean_token_accuracy": 0.7884602099657059, | |
| "num_tokens": 8892628.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.5368964821100235, | |
| "epoch": 2.041121495327103, | |
| "grad_norm": 0.043159015476703644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444939136505127, | |
| "mean_token_accuracy": 0.778968021273613, | |
| "num_tokens": 8909028.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.5433569848537445, | |
| "epoch": 2.0448598130841122, | |
| "grad_norm": 0.03701786324381828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439752340316772, | |
| "mean_token_accuracy": 0.7791613191366196, | |
| "num_tokens": 8925310.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5270129442214966, | |
| "epoch": 2.0485981308411216, | |
| "grad_norm": 0.04250190034508705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5210642218589783, | |
| "mean_token_accuracy": 0.7867415547370911, | |
| "num_tokens": 8941225.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5519801378250122, | |
| "epoch": 2.052336448598131, | |
| "grad_norm": 0.03549535945057869, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550297200679779, | |
| "mean_token_accuracy": 0.7756542861461639, | |
| "num_tokens": 8957662.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5188534706830978, | |
| "epoch": 2.05607476635514, | |
| "grad_norm": 0.03532535210251808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5225726962089539, | |
| "mean_token_accuracy": 0.7875347584486008, | |
| "num_tokens": 8973986.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5331487953662872, | |
| "epoch": 2.0598130841121494, | |
| "grad_norm": 0.0401851125061512, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345657467842102, | |
| "mean_token_accuracy": 0.7807552814483643, | |
| "num_tokens": 8990453.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5301813259720802, | |
| "epoch": 2.0635514018691588, | |
| "grad_norm": 0.04093443974852562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536128580570221, | |
| "mean_token_accuracy": 0.781855434179306, | |
| "num_tokens": 9006810.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.5511504411697388, | |
| "epoch": 2.067289719626168, | |
| "grad_norm": 0.04108293727040291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547398567199707, | |
| "mean_token_accuracy": 0.7787968963384628, | |
| "num_tokens": 9023044.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5452945232391357, | |
| "epoch": 2.0710280373831775, | |
| "grad_norm": 0.04133358225226402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406649112701416, | |
| "mean_token_accuracy": 0.7804151326417923, | |
| "num_tokens": 9039300.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.5133676081895828, | |
| "epoch": 2.074766355140187, | |
| "grad_norm": 0.0368187241256237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.510840892791748, | |
| "mean_token_accuracy": 0.7948838770389557, | |
| "num_tokens": 9055408.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5286162942647934, | |
| "epoch": 2.0785046728971963, | |
| "grad_norm": 0.037287503480911255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286219120025635, | |
| "mean_token_accuracy": 0.7867581397294998, | |
| "num_tokens": 9071847.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.5187130272388458, | |
| "epoch": 2.0822429906542057, | |
| "grad_norm": 0.03932078555226326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5252044200897217, | |
| "mean_token_accuracy": 0.788768544793129, | |
| "num_tokens": 9088062.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5239534676074982, | |
| "epoch": 2.085981308411215, | |
| "grad_norm": 0.04231242835521698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535202145576477, | |
| "mean_token_accuracy": 0.7852179259061813, | |
| "num_tokens": 9104468.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.528278037905693, | |
| "epoch": 2.0897196261682245, | |
| "grad_norm": 0.03444297984242439, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238081812858582, | |
| "mean_token_accuracy": 0.7863867878913879, | |
| "num_tokens": 9120622.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5545478612184525, | |
| "epoch": 2.0934579439252334, | |
| "grad_norm": 0.04182487353682518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527917742729187, | |
| "mean_token_accuracy": 0.7766451835632324, | |
| "num_tokens": 9137031.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.521744892001152, | |
| "epoch": 2.097196261682243, | |
| "grad_norm": 0.03438956290483475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5255383849143982, | |
| "mean_token_accuracy": 0.7855681478977203, | |
| "num_tokens": 9153374.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.5317307189106941, | |
| "epoch": 2.100934579439252, | |
| "grad_norm": 0.04259387031197548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530976414680481, | |
| "mean_token_accuracy": 0.7861284911632538, | |
| "num_tokens": 9169379.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5382358431816101, | |
| "epoch": 2.1046728971962616, | |
| "grad_norm": 0.03778582066297531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446516871452332, | |
| "mean_token_accuracy": 0.7786799967288971, | |
| "num_tokens": 9185673.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.5174337849020958, | |
| "epoch": 2.108411214953271, | |
| "grad_norm": 0.03816930949687958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5179592967033386, | |
| "mean_token_accuracy": 0.7912393063306808, | |
| "num_tokens": 9201995.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5279374569654465, | |
| "epoch": 2.1121495327102804, | |
| "grad_norm": 0.038216955959796906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243582129478455, | |
| "mean_token_accuracy": 0.7866894006729126, | |
| "num_tokens": 9218133.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5245715379714966, | |
| "epoch": 2.1158878504672898, | |
| "grad_norm": 0.03613874316215515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5249512791633606, | |
| "mean_token_accuracy": 0.7851840853691101, | |
| "num_tokens": 9234342.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5192612558603287, | |
| "epoch": 2.119626168224299, | |
| "grad_norm": 0.04042578116059303, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5259383320808411, | |
| "mean_token_accuracy": 0.7858112007379532, | |
| "num_tokens": 9250696.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5262997299432755, | |
| "epoch": 2.1233644859813086, | |
| "grad_norm": 0.04460779204964638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308440923690796, | |
| "mean_token_accuracy": 0.7877162247896194, | |
| "num_tokens": 9266979.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5224001705646515, | |
| "epoch": 2.127102803738318, | |
| "grad_norm": 0.03817397728562355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229726433753967, | |
| "mean_token_accuracy": 0.7861741036176682, | |
| "num_tokens": 9283280.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5274494737386703, | |
| "epoch": 2.130841121495327, | |
| "grad_norm": 0.04161069914698601, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5270024538040161, | |
| "mean_token_accuracy": 0.7860408127307892, | |
| "num_tokens": 9299630.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5552078932523727, | |
| "epoch": 2.1345794392523363, | |
| "grad_norm": 0.04526656121015549, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547661542892456, | |
| "mean_token_accuracy": 0.77776238322258, | |
| "num_tokens": 9316114.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5352555364370346, | |
| "epoch": 2.1383177570093457, | |
| "grad_norm": 0.037117403000593185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322074294090271, | |
| "mean_token_accuracy": 0.7845579087734222, | |
| "num_tokens": 9332486.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5299685597419739, | |
| "epoch": 2.142056074766355, | |
| "grad_norm": 0.04335174337029457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5333051085472107, | |
| "mean_token_accuracy": 0.7831422835588455, | |
| "num_tokens": 9348999.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5251427963376045, | |
| "epoch": 2.1457943925233645, | |
| "grad_norm": 0.04729305952787399, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304993987083435, | |
| "mean_token_accuracy": 0.7857193797826767, | |
| "num_tokens": 9365291.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5248839110136032, | |
| "epoch": 2.149532710280374, | |
| "grad_norm": 0.04293828830122948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300874710083008, | |
| "mean_token_accuracy": 0.784340038895607, | |
| "num_tokens": 9381734.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5214874297380447, | |
| "epoch": 2.1532710280373832, | |
| "grad_norm": 0.04350607469677925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5177597403526306, | |
| "mean_token_accuracy": 0.7909844070672989, | |
| "num_tokens": 9397955.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5421570688486099, | |
| "epoch": 2.1570093457943926, | |
| "grad_norm": 0.042496006935834885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425592660903931, | |
| "mean_token_accuracy": 0.7795795798301697, | |
| "num_tokens": 9414143.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.535075232386589, | |
| "epoch": 2.160747663551402, | |
| "grad_norm": 0.049906548112630844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370741486549377, | |
| "mean_token_accuracy": 0.7806216180324554, | |
| "num_tokens": 9430295.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.535729855298996, | |
| "epoch": 2.1644859813084114, | |
| "grad_norm": 0.04840796813368797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347393155097961, | |
| "mean_token_accuracy": 0.7850737869739532, | |
| "num_tokens": 9446633.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5312991067767143, | |
| "epoch": 2.1682242990654204, | |
| "grad_norm": 0.04839569702744484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378549098968506, | |
| "mean_token_accuracy": 0.7815908044576645, | |
| "num_tokens": 9462924.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5284993052482605, | |
| "epoch": 2.1719626168224297, | |
| "grad_norm": 0.04563288018107414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385716557502747, | |
| "mean_token_accuracy": 0.7814656347036362, | |
| "num_tokens": 9479222.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.535816490650177, | |
| "epoch": 2.175700934579439, | |
| "grad_norm": 0.05489310622215271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382475256919861, | |
| "mean_token_accuracy": 0.7812406271696091, | |
| "num_tokens": 9495589.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.549729734659195, | |
| "epoch": 2.1794392523364485, | |
| "grad_norm": 0.0424075648188591, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539716899394989, | |
| "mean_token_accuracy": 0.7819323092699051, | |
| "num_tokens": 9511725.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5317162126302719, | |
| "epoch": 2.183177570093458, | |
| "grad_norm": 0.03563420847058296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5235797166824341, | |
| "mean_token_accuracy": 0.7905198931694031, | |
| "num_tokens": 9527971.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5211209952831268, | |
| "epoch": 2.1869158878504673, | |
| "grad_norm": 0.048658616840839386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5268206000328064, | |
| "mean_token_accuracy": 0.7845446020364761, | |
| "num_tokens": 9544253.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5116122514009476, | |
| "epoch": 2.1906542056074767, | |
| "grad_norm": 0.04198598116636276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5190539360046387, | |
| "mean_token_accuracy": 0.7874016612768173, | |
| "num_tokens": 9560518.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5246260613203049, | |
| "epoch": 2.194392523364486, | |
| "grad_norm": 0.03876075521111488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228715538978577, | |
| "mean_token_accuracy": 0.7850266695022583, | |
| "num_tokens": 9576775.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5278798937797546, | |
| "epoch": 2.1981308411214955, | |
| "grad_norm": 0.04761234670877457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265949964523315, | |
| "mean_token_accuracy": 0.7893748730421066, | |
| "num_tokens": 9593040.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.548830658197403, | |
| "epoch": 2.201869158878505, | |
| "grad_norm": 0.04078621417284012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517237186431885, | |
| "mean_token_accuracy": 0.778541699051857, | |
| "num_tokens": 9609499.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5336392223834991, | |
| "epoch": 2.205607476635514, | |
| "grad_norm": 0.04143911972641945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296382308006287, | |
| "mean_token_accuracy": 0.7824793457984924, | |
| "num_tokens": 9625911.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5379772335290909, | |
| "epoch": 2.209345794392523, | |
| "grad_norm": 0.03608503192663193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343111753463745, | |
| "mean_token_accuracy": 0.7822979539632797, | |
| "num_tokens": 9642395.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.5172793120145798, | |
| "epoch": 2.2130841121495326, | |
| "grad_norm": 0.034696269780397415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5195714235305786, | |
| "mean_token_accuracy": 0.7902600318193436, | |
| "num_tokens": 9658662.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5202511548995972, | |
| "epoch": 2.216822429906542, | |
| "grad_norm": 0.0416097529232502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290377736091614, | |
| "mean_token_accuracy": 0.7843390554189682, | |
| "num_tokens": 9674880.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5413576662540436, | |
| "epoch": 2.2205607476635514, | |
| "grad_norm": 0.0419846810400486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517836809158325, | |
| "mean_token_accuracy": 0.7757999449968338, | |
| "num_tokens": 9691443.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5511815398931503, | |
| "epoch": 2.2242990654205608, | |
| "grad_norm": 0.042311880737543106, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441216230392456, | |
| "mean_token_accuracy": 0.7797399759292603, | |
| "num_tokens": 9707667.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5390328615903854, | |
| "epoch": 2.22803738317757, | |
| "grad_norm": 0.04130427911877632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381530523300171, | |
| "mean_token_accuracy": 0.7850432395935059, | |
| "num_tokens": 9723670.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5145308524370193, | |
| "epoch": 2.2317757009345796, | |
| "grad_norm": 0.04054151102900505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5153539776802063, | |
| "mean_token_accuracy": 0.7911680340766907, | |
| "num_tokens": 9740111.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5264055132865906, | |
| "epoch": 2.235514018691589, | |
| "grad_norm": 0.04768845811486244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321245193481445, | |
| "mean_token_accuracy": 0.7862783521413803, | |
| "num_tokens": 9756445.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5161085873842239, | |
| "epoch": 2.2392523364485983, | |
| "grad_norm": 0.047890279442071915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329167246818542, | |
| "mean_token_accuracy": 0.7836614698171616, | |
| "num_tokens": 9772513.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5542461574077606, | |
| "epoch": 2.2429906542056073, | |
| "grad_norm": 0.04093446582555771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555320680141449, | |
| "mean_token_accuracy": 0.7749381363391876, | |
| "num_tokens": 9789085.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.5521011054515839, | |
| "epoch": 2.2467289719626167, | |
| "grad_norm": 0.0422159768640995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415031313896179, | |
| "mean_token_accuracy": 0.7801210135221481, | |
| "num_tokens": 9805542.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5508425533771515, | |
| "epoch": 2.250467289719626, | |
| "grad_norm": 0.04688411206007004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387436151504517, | |
| "mean_token_accuracy": 0.7821325659751892, | |
| "num_tokens": 9821923.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5507242232561111, | |
| "epoch": 2.2542056074766355, | |
| "grad_norm": 0.035407017916440964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444649457931519, | |
| "mean_token_accuracy": 0.7809951901435852, | |
| "num_tokens": 9838298.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5216517895460129, | |
| "epoch": 2.257943925233645, | |
| "grad_norm": 0.041920073330402374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264837741851807, | |
| "mean_token_accuracy": 0.7897377163171768, | |
| "num_tokens": 9854659.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5258049964904785, | |
| "epoch": 2.2616822429906542, | |
| "grad_norm": 0.0534173846244812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415172576904297, | |
| "mean_token_accuracy": 0.7817163467407227, | |
| "num_tokens": 9870877.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5240575075149536, | |
| "epoch": 2.2654205607476636, | |
| "grad_norm": 0.03395333141088486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256165862083435, | |
| "mean_token_accuracy": 0.7837403416633606, | |
| "num_tokens": 9887224.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5454617738723755, | |
| "epoch": 2.269158878504673, | |
| "grad_norm": 0.034148454666137695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424824953079224, | |
| "mean_token_accuracy": 0.7791529148817062, | |
| "num_tokens": 9903786.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5350487977266312, | |
| "epoch": 2.2728971962616824, | |
| "grad_norm": 0.042522136121988297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272009372711182, | |
| "mean_token_accuracy": 0.7874994874000549, | |
| "num_tokens": 9920053.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.5338039100170135, | |
| "epoch": 2.2766355140186914, | |
| "grad_norm": 0.036921191960573196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5227792859077454, | |
| "mean_token_accuracy": 0.7891070544719696, | |
| "num_tokens": 9936211.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5317139476537704, | |
| "epoch": 2.2803738317757007, | |
| "grad_norm": 0.038269490003585815, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253998637199402, | |
| "mean_token_accuracy": 0.7870776653289795, | |
| "num_tokens": 9952725.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.5196784734725952, | |
| "epoch": 2.28411214953271, | |
| "grad_norm": 0.03972024843096733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5251049995422363, | |
| "mean_token_accuracy": 0.7839716672897339, | |
| "num_tokens": 9969316.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5095352083444595, | |
| "epoch": 2.2878504672897195, | |
| "grad_norm": 0.0507940798997879, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290789008140564, | |
| "mean_token_accuracy": 0.7861248552799225, | |
| "num_tokens": 9985447.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5270750820636749, | |
| "epoch": 2.291588785046729, | |
| "grad_norm": 0.04321181774139404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311838984489441, | |
| "mean_token_accuracy": 0.7838535755872726, | |
| "num_tokens": 10001725.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5379711389541626, | |
| "epoch": 2.2953271028037383, | |
| "grad_norm": 0.040656980127096176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385247468948364, | |
| "mean_token_accuracy": 0.7803602814674377, | |
| "num_tokens": 10018134.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5364449620246887, | |
| "epoch": 2.2990654205607477, | |
| "grad_norm": 0.044270358979701996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5303220748901367, | |
| "mean_token_accuracy": 0.7875775545835495, | |
| "num_tokens": 10034256.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5223758369684219, | |
| "epoch": 2.302803738317757, | |
| "grad_norm": 0.04040619730949402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5194275379180908, | |
| "mean_token_accuracy": 0.7908173054456711, | |
| "num_tokens": 10050260.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5754473656415939, | |
| "epoch": 2.3065420560747665, | |
| "grad_norm": 0.0413733534514904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5673513412475586, | |
| "mean_token_accuracy": 0.7693175226449966, | |
| "num_tokens": 10066439.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5494302958250046, | |
| "epoch": 2.310280373831776, | |
| "grad_norm": 0.04788622632622719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560729503631592, | |
| "mean_token_accuracy": 0.7737975120544434, | |
| "num_tokens": 10082592.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5400004386901855, | |
| "epoch": 2.3140186915887853, | |
| "grad_norm": 0.04467733949422836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475805997848511, | |
| "mean_token_accuracy": 0.7767456918954849, | |
| "num_tokens": 10098902.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5090039819478989, | |
| "epoch": 2.317757009345794, | |
| "grad_norm": 0.04413570463657379, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5152875781059265, | |
| "mean_token_accuracy": 0.792495995759964, | |
| "num_tokens": 10115273.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5372920483350754, | |
| "epoch": 2.3214953271028036, | |
| "grad_norm": 0.037302058190107346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321361422538757, | |
| "mean_token_accuracy": 0.7862480282783508, | |
| "num_tokens": 10131501.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5543005019426346, | |
| "epoch": 2.325233644859813, | |
| "grad_norm": 0.03829365596175194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508820414543152, | |
| "mean_token_accuracy": 0.7745321840047836, | |
| "num_tokens": 10147998.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5153163969516754, | |
| "epoch": 2.3289719626168224, | |
| "grad_norm": 0.045321445912122726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5118069052696228, | |
| "mean_token_accuracy": 0.7935506701469421, | |
| "num_tokens": 10164126.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5008471608161926, | |
| "epoch": 2.3327102803738318, | |
| "grad_norm": 0.04449000954627991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5082967877388, | |
| "mean_token_accuracy": 0.7942900061607361, | |
| "num_tokens": 10180274.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.532206118106842, | |
| "epoch": 2.336448598130841, | |
| "grad_norm": 0.05191594734787941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367388129234314, | |
| "mean_token_accuracy": 0.7808051854372025, | |
| "num_tokens": 10196609.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5258989185094833, | |
| "epoch": 2.3401869158878505, | |
| "grad_norm": 0.044721271842718124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331224203109741, | |
| "mean_token_accuracy": 0.7829412668943405, | |
| "num_tokens": 10212895.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5370120704174042, | |
| "epoch": 2.34392523364486, | |
| "grad_norm": 0.041769906878471375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412429571151733, | |
| "mean_token_accuracy": 0.7827376574277878, | |
| "num_tokens": 10229237.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5400294661521912, | |
| "epoch": 2.3476635514018693, | |
| "grad_norm": 0.040269553661346436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357171893119812, | |
| "mean_token_accuracy": 0.7816246598958969, | |
| "num_tokens": 10245453.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5325844436883926, | |
| "epoch": 2.3514018691588783, | |
| "grad_norm": 0.04499928280711174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283193588256836, | |
| "mean_token_accuracy": 0.7859142124652863, | |
| "num_tokens": 10261777.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5282296687364578, | |
| "epoch": 2.3551401869158877, | |
| "grad_norm": 0.04336896538734436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5254157781600952, | |
| "mean_token_accuracy": 0.789379209280014, | |
| "num_tokens": 10278007.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5453646928071976, | |
| "epoch": 2.358878504672897, | |
| "grad_norm": 0.05249177664518356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468531250953674, | |
| "mean_token_accuracy": 0.7771991342306137, | |
| "num_tokens": 10294331.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.543931856751442, | |
| "epoch": 2.3626168224299064, | |
| "grad_norm": 0.037500377744436264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477216839790344, | |
| "mean_token_accuracy": 0.7776368409395218, | |
| "num_tokens": 10310976.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5300342440605164, | |
| "epoch": 2.366355140186916, | |
| "grad_norm": 0.04039130359888077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5305655002593994, | |
| "mean_token_accuracy": 0.7832176089286804, | |
| "num_tokens": 10327256.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5378967821598053, | |
| "epoch": 2.3700934579439252, | |
| "grad_norm": 0.04444447159767151, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362187027931213, | |
| "mean_token_accuracy": 0.7842839509248734, | |
| "num_tokens": 10343608.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5510306656360626, | |
| "epoch": 2.3738317757009346, | |
| "grad_norm": 0.04542792961001396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493132472038269, | |
| "mean_token_accuracy": 0.7786229699850082, | |
| "num_tokens": 10359923.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5210727900266647, | |
| "epoch": 2.377570093457944, | |
| "grad_norm": 0.043661415576934814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5236334800720215, | |
| "mean_token_accuracy": 0.7890983521938324, | |
| "num_tokens": 10376100.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5260880589485168, | |
| "epoch": 2.3813084112149534, | |
| "grad_norm": 0.04262132570147514, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248558521270752, | |
| "mean_token_accuracy": 0.7902341783046722, | |
| "num_tokens": 10392698.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5457091331481934, | |
| "epoch": 2.385046728971963, | |
| "grad_norm": 0.04899441823363304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536708235740662, | |
| "mean_token_accuracy": 0.7760955542325974, | |
| "num_tokens": 10409076.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5321961939334869, | |
| "epoch": 2.388785046728972, | |
| "grad_norm": 0.045906826853752136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316425561904907, | |
| "mean_token_accuracy": 0.7848930060863495, | |
| "num_tokens": 10425501.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5476334244012833, | |
| "epoch": 2.392523364485981, | |
| "grad_norm": 0.038592927157878876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469234585762024, | |
| "mean_token_accuracy": 0.7766659259796143, | |
| "num_tokens": 10441907.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.514763131737709, | |
| "epoch": 2.3962616822429905, | |
| "grad_norm": 0.04247188940644264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5191242098808289, | |
| "mean_token_accuracy": 0.7888349145650864, | |
| "num_tokens": 10458019.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5377763360738754, | |
| "epoch": 2.4, | |
| "grad_norm": 0.037420280277729034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363115072250366, | |
| "mean_token_accuracy": 0.7803380340337753, | |
| "num_tokens": 10474412.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5383724719285965, | |
| "epoch": 2.4037383177570093, | |
| "grad_norm": 0.038523126393556595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415539145469666, | |
| "mean_token_accuracy": 0.7787618041038513, | |
| "num_tokens": 10490995.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.5374136418104172, | |
| "epoch": 2.4074766355140187, | |
| "grad_norm": 0.03964264318346977, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468027591705322, | |
| "mean_token_accuracy": 0.779059037566185, | |
| "num_tokens": 10507482.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5512133836746216, | |
| "epoch": 2.411214953271028, | |
| "grad_norm": 0.0391349270939827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508245825767517, | |
| "mean_token_accuracy": 0.7754583358764648, | |
| "num_tokens": 10523993.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5193808674812317, | |
| "epoch": 2.4149532710280375, | |
| "grad_norm": 0.03556473180651665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5196793675422668, | |
| "mean_token_accuracy": 0.78975510597229, | |
| "num_tokens": 10540005.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.5471558570861816, | |
| "epoch": 2.418691588785047, | |
| "grad_norm": 0.04553184658288956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547728419303894, | |
| "mean_token_accuracy": 0.7780675292015076, | |
| "num_tokens": 10555891.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.519458457827568, | |
| "epoch": 2.4224299065420563, | |
| "grad_norm": 0.045790717005729675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232809782028198, | |
| "mean_token_accuracy": 0.7882662564516068, | |
| "num_tokens": 10572109.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5270252674818039, | |
| "epoch": 2.426168224299065, | |
| "grad_norm": 0.04227881506085396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288085341453552, | |
| "mean_token_accuracy": 0.7866526395082474, | |
| "num_tokens": 10588192.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.548214852809906, | |
| "epoch": 2.4299065420560746, | |
| "grad_norm": 0.04126811400055885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440689325332642, | |
| "mean_token_accuracy": 0.779522180557251, | |
| "num_tokens": 10604498.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5452295988798141, | |
| "epoch": 2.433644859813084, | |
| "grad_norm": 0.044819604605436325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547234833240509, | |
| "mean_token_accuracy": 0.7796365767717361, | |
| "num_tokens": 10620949.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5525990724563599, | |
| "epoch": 2.4373831775700934, | |
| "grad_norm": 0.042418453842401505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493718385696411, | |
| "mean_token_accuracy": 0.7783072590827942, | |
| "num_tokens": 10637398.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5338578671216965, | |
| "epoch": 2.4411214953271028, | |
| "grad_norm": 0.048241496086120605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348434448242188, | |
| "mean_token_accuracy": 0.7853177338838577, | |
| "num_tokens": 10653827.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5247549116611481, | |
| "epoch": 2.444859813084112, | |
| "grad_norm": 0.03876890614628792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283288359642029, | |
| "mean_token_accuracy": 0.7865240424871445, | |
| "num_tokens": 10670227.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5525484532117844, | |
| "epoch": 2.4485981308411215, | |
| "grad_norm": 0.04079402610659599, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510199069976807, | |
| "mean_token_accuracy": 0.7765209227800369, | |
| "num_tokens": 10686514.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5248308256268501, | |
| "epoch": 2.452336448598131, | |
| "grad_norm": 0.03220357000827789, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5197701454162598, | |
| "mean_token_accuracy": 0.7878830432891846, | |
| "num_tokens": 10702613.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5264022424817085, | |
| "epoch": 2.4560747663551403, | |
| "grad_norm": 0.038926877081394196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5227438807487488, | |
| "mean_token_accuracy": 0.7853628695011139, | |
| "num_tokens": 10718690.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5430135428905487, | |
| "epoch": 2.4598130841121497, | |
| "grad_norm": 0.04270581528544426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455408096313477, | |
| "mean_token_accuracy": 0.7791119664907455, | |
| "num_tokens": 10735135.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5284547656774521, | |
| "epoch": 2.463551401869159, | |
| "grad_norm": 0.04039589315652847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309383273124695, | |
| "mean_token_accuracy": 0.784732460975647, | |
| "num_tokens": 10751298.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5267135500907898, | |
| "epoch": 2.467289719626168, | |
| "grad_norm": 0.042588524520397186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272895097732544, | |
| "mean_token_accuracy": 0.7885420620441437, | |
| "num_tokens": 10767947.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5294100195169449, | |
| "epoch": 2.4710280373831774, | |
| "grad_norm": 0.04541191831231117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415511727333069, | |
| "mean_token_accuracy": 0.7802952826023102, | |
| "num_tokens": 10784155.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5230477377772331, | |
| "epoch": 2.474766355140187, | |
| "grad_norm": 0.04615366831421852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295774936676025, | |
| "mean_token_accuracy": 0.7873392999172211, | |
| "num_tokens": 10800552.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5188637897372246, | |
| "epoch": 2.4785046728971962, | |
| "grad_norm": 0.03992808610200882, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5195883512496948, | |
| "mean_token_accuracy": 0.7883334010839462, | |
| "num_tokens": 10816926.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5323937982320786, | |
| "epoch": 2.4822429906542056, | |
| "grad_norm": 0.04497828707098961, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278034210205078, | |
| "mean_token_accuracy": 0.7848539501428604, | |
| "num_tokens": 10833159.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.5480016022920609, | |
| "epoch": 2.485981308411215, | |
| "grad_norm": 0.0394604429602623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437833070755005, | |
| "mean_token_accuracy": 0.7807918637990952, | |
| "num_tokens": 10849417.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5170062035322189, | |
| "epoch": 2.4897196261682244, | |
| "grad_norm": 0.041445329785346985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.517329216003418, | |
| "mean_token_accuracy": 0.7887666076421738, | |
| "num_tokens": 10865715.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5371522009372711, | |
| "epoch": 2.493457943925234, | |
| "grad_norm": 0.042152535170316696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461167693138123, | |
| "mean_token_accuracy": 0.7759047448635101, | |
| "num_tokens": 10881891.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.522216372191906, | |
| "epoch": 2.497196261682243, | |
| "grad_norm": 0.04944324120879173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5293608903884888, | |
| "mean_token_accuracy": 0.7865939140319824, | |
| "num_tokens": 10898086.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5419133603572845, | |
| "epoch": 2.500934579439252, | |
| "grad_norm": 0.03869049996137619, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435135364532471, | |
| "mean_token_accuracy": 0.7788238078355789, | |
| "num_tokens": 10914630.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.543552428483963, | |
| "epoch": 2.5046728971962615, | |
| "grad_norm": 0.040104418992996216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451544523239136, | |
| "mean_token_accuracy": 0.7762735784053802, | |
| "num_tokens": 10931142.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5488818436861038, | |
| "epoch": 2.508411214953271, | |
| "grad_norm": 0.03650939092040062, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461534857749939, | |
| "mean_token_accuracy": 0.7810324132442474, | |
| "num_tokens": 10947432.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.5514579713344574, | |
| "epoch": 2.5121495327102803, | |
| "grad_norm": 0.035640496760606766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461341142654419, | |
| "mean_token_accuracy": 0.7758427411317825, | |
| "num_tokens": 10963793.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5298633724451065, | |
| "epoch": 2.5158878504672897, | |
| "grad_norm": 0.036869630217552185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271415710449219, | |
| "mean_token_accuracy": 0.7874128669500351, | |
| "num_tokens": 10980238.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5178606957197189, | |
| "epoch": 2.519626168224299, | |
| "grad_norm": 0.04496290162205696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5193417072296143, | |
| "mean_token_accuracy": 0.7885989248752594, | |
| "num_tokens": 10996365.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5270267352461815, | |
| "epoch": 2.5233644859813085, | |
| "grad_norm": 0.04544811695814133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387653112411499, | |
| "mean_token_accuracy": 0.7800068855285645, | |
| "num_tokens": 11012575.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.527735561132431, | |
| "epoch": 2.527102803738318, | |
| "grad_norm": 0.04031702131032944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367462635040283, | |
| "mean_token_accuracy": 0.7821540981531143, | |
| "num_tokens": 11028942.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5479142069816589, | |
| "epoch": 2.5308411214953273, | |
| "grad_norm": 0.042728912085294724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432093739509583, | |
| "mean_token_accuracy": 0.7799795567989349, | |
| "num_tokens": 11045296.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5360302478075027, | |
| "epoch": 2.5345794392523366, | |
| "grad_norm": 0.040872231125831604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265986323356628, | |
| "mean_token_accuracy": 0.7887827455997467, | |
| "num_tokens": 11061450.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5468751043081284, | |
| "epoch": 2.538317757009346, | |
| "grad_norm": 0.0408024825155735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442636609077454, | |
| "mean_token_accuracy": 0.7790944874286652, | |
| "num_tokens": 11077540.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.530633345246315, | |
| "epoch": 2.542056074766355, | |
| "grad_norm": 0.04209808632731438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363141894340515, | |
| "mean_token_accuracy": 0.7819496542215347, | |
| "num_tokens": 11093632.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5098425000905991, | |
| "epoch": 2.5457943925233644, | |
| "grad_norm": 0.04276811331510544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222542881965637, | |
| "mean_token_accuracy": 0.7871226519346237, | |
| "num_tokens": 11110142.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5203486457467079, | |
| "epoch": 2.5495327102803738, | |
| "grad_norm": 0.04667636379599571, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52687668800354, | |
| "mean_token_accuracy": 0.7876535356044769, | |
| "num_tokens": 11126405.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5424248725175858, | |
| "epoch": 2.553271028037383, | |
| "grad_norm": 0.03960704430937767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351195335388184, | |
| "mean_token_accuracy": 0.7820920497179031, | |
| "num_tokens": 11142681.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5479930490255356, | |
| "epoch": 2.5570093457943925, | |
| "grad_norm": 0.03865355625748634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381141901016235, | |
| "mean_token_accuracy": 0.7842580229043961, | |
| "num_tokens": 11158981.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5378328114748001, | |
| "epoch": 2.560747663551402, | |
| "grad_norm": 0.0406392477452755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395403504371643, | |
| "mean_token_accuracy": 0.7812999784946442, | |
| "num_tokens": 11175185.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.5591647922992706, | |
| "epoch": 2.5644859813084113, | |
| "grad_norm": 0.042679473757743835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618141889572144, | |
| "mean_token_accuracy": 0.7730479836463928, | |
| "num_tokens": 11191516.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.540540523827076, | |
| "epoch": 2.5682242990654207, | |
| "grad_norm": 0.0401788055896759, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431095957756042, | |
| "mean_token_accuracy": 0.7800974696874619, | |
| "num_tokens": 11207897.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5273384600877762, | |
| "epoch": 2.5719626168224297, | |
| "grad_norm": 0.04009004309773445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5236154794692993, | |
| "mean_token_accuracy": 0.7862724959850311, | |
| "num_tokens": 11224233.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.5341546684503555, | |
| "epoch": 2.575700934579439, | |
| "grad_norm": 0.045469239354133606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359405875205994, | |
| "mean_token_accuracy": 0.7828920185565948, | |
| "num_tokens": 11240583.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.516716443002224, | |
| "epoch": 2.5794392523364484, | |
| "grad_norm": 0.03841989487409592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5178863406181335, | |
| "mean_token_accuracy": 0.7926649451255798, | |
| "num_tokens": 11256814.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5300464928150177, | |
| "epoch": 2.583177570093458, | |
| "grad_norm": 0.043383657932281494, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534642219543457, | |
| "mean_token_accuracy": 0.7844998836517334, | |
| "num_tokens": 11273092.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5270805209875107, | |
| "epoch": 2.586915887850467, | |
| "grad_norm": 0.042948167771101, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318405628204346, | |
| "mean_token_accuracy": 0.7814630717039108, | |
| "num_tokens": 11289382.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5576307624578476, | |
| "epoch": 2.5906542056074766, | |
| "grad_norm": 0.04289550706744194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595361590385437, | |
| "mean_token_accuracy": 0.77448670566082, | |
| "num_tokens": 11305822.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5350489318370819, | |
| "epoch": 2.594392523364486, | |
| "grad_norm": 0.036010973155498505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320281982421875, | |
| "mean_token_accuracy": 0.7841717451810837, | |
| "num_tokens": 11322116.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5389258116483688, | |
| "epoch": 2.5981308411214954, | |
| "grad_norm": 0.036538656800985336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332745313644409, | |
| "mean_token_accuracy": 0.7836548089981079, | |
| "num_tokens": 11338486.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5357422530651093, | |
| "epoch": 2.601869158878505, | |
| "grad_norm": 0.03977203741669655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403972864151001, | |
| "mean_token_accuracy": 0.7783884555101395, | |
| "num_tokens": 11355126.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5224239528179169, | |
| "epoch": 2.605607476635514, | |
| "grad_norm": 0.03854282945394516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5209836363792419, | |
| "mean_token_accuracy": 0.7890230715274811, | |
| "num_tokens": 11371642.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.527114674448967, | |
| "epoch": 2.6093457943925236, | |
| "grad_norm": 0.03806879743933678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328760743141174, | |
| "mean_token_accuracy": 0.7834767252206802, | |
| "num_tokens": 11388018.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5207114219665527, | |
| "epoch": 2.613084112149533, | |
| "grad_norm": 0.04797474667429924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281696915626526, | |
| "mean_token_accuracy": 0.7842787057161331, | |
| "num_tokens": 11404304.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5329904109239578, | |
| "epoch": 2.616822429906542, | |
| "grad_norm": 0.04143727570772171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371139645576477, | |
| "mean_token_accuracy": 0.7831498682498932, | |
| "num_tokens": 11420561.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5422161221504211, | |
| "epoch": 2.6205607476635513, | |
| "grad_norm": 0.04683515056967735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436529517173767, | |
| "mean_token_accuracy": 0.7796959728002548, | |
| "num_tokens": 11436820.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.5309348404407501, | |
| "epoch": 2.6242990654205607, | |
| "grad_norm": 0.036559656262397766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223227143287659, | |
| "mean_token_accuracy": 0.7849199175834656, | |
| "num_tokens": 11453134.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5515079498291016, | |
| "epoch": 2.62803738317757, | |
| "grad_norm": 0.047568727284669876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509875416755676, | |
| "mean_token_accuracy": 0.7774451673030853, | |
| "num_tokens": 11469442.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5654275268316269, | |
| "epoch": 2.6317757009345795, | |
| "grad_norm": 0.03854409605264664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559022068977356, | |
| "mean_token_accuracy": 0.7747441530227661, | |
| "num_tokens": 11485880.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5369984805583954, | |
| "epoch": 2.635514018691589, | |
| "grad_norm": 0.04869009181857109, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361051559448242, | |
| "mean_token_accuracy": 0.780804455280304, | |
| "num_tokens": 11502359.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.542375922203064, | |
| "epoch": 2.6392523364485982, | |
| "grad_norm": 0.045840587466955185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502850413322449, | |
| "mean_token_accuracy": 0.7759635299444199, | |
| "num_tokens": 11518813.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5237139612436295, | |
| "epoch": 2.6429906542056076, | |
| "grad_norm": 0.043406110256910324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281059741973877, | |
| "mean_token_accuracy": 0.7859614938497543, | |
| "num_tokens": 11535188.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5367631316184998, | |
| "epoch": 2.6467289719626166, | |
| "grad_norm": 0.04024430736899376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387470126152039, | |
| "mean_token_accuracy": 0.7812274694442749, | |
| "num_tokens": 11551645.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5330280810594559, | |
| "epoch": 2.650467289719626, | |
| "grad_norm": 0.0389426052570343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361229181289673, | |
| "mean_token_accuracy": 0.7837622314691544, | |
| "num_tokens": 11567892.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5259372144937515, | |
| "epoch": 2.6542056074766354, | |
| "grad_norm": 0.03997652605175972, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267660617828369, | |
| "mean_token_accuracy": 0.7850897163152695, | |
| "num_tokens": 11584153.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5390958487987518, | |
| "epoch": 2.6579439252336448, | |
| "grad_norm": 0.04180564358830452, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372406244277954, | |
| "mean_token_accuracy": 0.7838725447654724, | |
| "num_tokens": 11600597.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5279987677931786, | |
| "epoch": 2.661682242990654, | |
| "grad_norm": 0.03591061756014824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308532118797302, | |
| "mean_token_accuracy": 0.785730242729187, | |
| "num_tokens": 11616881.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5563876032829285, | |
| "epoch": 2.6654205607476635, | |
| "grad_norm": 0.03892669454216957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556321144104004, | |
| "mean_token_accuracy": 0.7758439630270004, | |
| "num_tokens": 11633329.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5373513847589493, | |
| "epoch": 2.669158878504673, | |
| "grad_norm": 0.03863142430782318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352209806442261, | |
| "mean_token_accuracy": 0.7836543023586273, | |
| "num_tokens": 11649751.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5123810023069382, | |
| "epoch": 2.6728971962616823, | |
| "grad_norm": 0.04038078337907791, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5158439874649048, | |
| "mean_token_accuracy": 0.7905206978321075, | |
| "num_tokens": 11665928.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5479727983474731, | |
| "epoch": 2.6766355140186917, | |
| "grad_norm": 0.04204852879047394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506036281585693, | |
| "mean_token_accuracy": 0.7781369537115097, | |
| "num_tokens": 11682349.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5410658866167068, | |
| "epoch": 2.680373831775701, | |
| "grad_norm": 0.04252674803137779, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433157086372375, | |
| "mean_token_accuracy": 0.776948869228363, | |
| "num_tokens": 11698941.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5443103611469269, | |
| "epoch": 2.6841121495327105, | |
| "grad_norm": 0.044883646070957184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470229983329773, | |
| "mean_token_accuracy": 0.7803091257810593, | |
| "num_tokens": 11715434.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.5390113294124603, | |
| "epoch": 2.68785046728972, | |
| "grad_norm": 0.04012865573167801, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320149660110474, | |
| "mean_token_accuracy": 0.7860948741436005, | |
| "num_tokens": 11731697.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5281476825475693, | |
| "epoch": 2.691588785046729, | |
| "grad_norm": 0.04816235229372978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312087535858154, | |
| "mean_token_accuracy": 0.7858725935220718, | |
| "num_tokens": 11747788.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5142519026994705, | |
| "epoch": 2.695327102803738, | |
| "grad_norm": 0.0394207127392292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5175022482872009, | |
| "mean_token_accuracy": 0.7914264351129532, | |
| "num_tokens": 11763802.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5183316618204117, | |
| "epoch": 2.6990654205607476, | |
| "grad_norm": 0.04731175675988197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275416374206543, | |
| "mean_token_accuracy": 0.7866149395704269, | |
| "num_tokens": 11779759.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5322978273034096, | |
| "epoch": 2.702803738317757, | |
| "grad_norm": 0.045594654977321625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377396941184998, | |
| "mean_token_accuracy": 0.7802564948797226, | |
| "num_tokens": 11795656.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5265089273452759, | |
| "epoch": 2.7065420560747664, | |
| "grad_norm": 0.04707048460841179, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340720415115356, | |
| "mean_token_accuracy": 0.7816154807806015, | |
| "num_tokens": 11811757.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.5486596673727036, | |
| "epoch": 2.710280373831776, | |
| "grad_norm": 0.04378875717520714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447016358375549, | |
| "mean_token_accuracy": 0.7777462303638458, | |
| "num_tokens": 11828249.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5557577461004257, | |
| "epoch": 2.714018691588785, | |
| "grad_norm": 0.044526614248752594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464760661125183, | |
| "mean_token_accuracy": 0.7786324173212051, | |
| "num_tokens": 11844645.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5483285784721375, | |
| "epoch": 2.717757009345794, | |
| "grad_norm": 0.05415434390306473, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537320971488953, | |
| "mean_token_accuracy": 0.774675577878952, | |
| "num_tokens": 11860972.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5311020910739899, | |
| "epoch": 2.7214953271028035, | |
| "grad_norm": 0.043242573738098145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344421863555908, | |
| "mean_token_accuracy": 0.7838677763938904, | |
| "num_tokens": 11876848.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5571545660495758, | |
| "epoch": 2.725233644859813, | |
| "grad_norm": 0.04775959998369217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5543075799942017, | |
| "mean_token_accuracy": 0.7767691016197205, | |
| "num_tokens": 11893101.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5632807910442352, | |
| "epoch": 2.7289719626168223, | |
| "grad_norm": 0.040951792150735855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556804895401001, | |
| "mean_token_accuracy": 0.7738458663225174, | |
| "num_tokens": 11909248.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5437204986810684, | |
| "epoch": 2.7327102803738317, | |
| "grad_norm": 0.041280943900346756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405519604682922, | |
| "mean_token_accuracy": 0.7808393985033035, | |
| "num_tokens": 11925644.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5410651564598083, | |
| "epoch": 2.736448598130841, | |
| "grad_norm": 0.04410838708281517, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487910509109497, | |
| "mean_token_accuracy": 0.7771375328302383, | |
| "num_tokens": 11941579.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.543538823723793, | |
| "epoch": 2.7401869158878505, | |
| "grad_norm": 0.04985618218779564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518176555633545, | |
| "mean_token_accuracy": 0.775468647480011, | |
| "num_tokens": 11957981.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5253164023160934, | |
| "epoch": 2.74392523364486, | |
| "grad_norm": 0.04087154567241669, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267685651779175, | |
| "mean_token_accuracy": 0.7876032888889313, | |
| "num_tokens": 11974282.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5454862713813782, | |
| "epoch": 2.7476635514018692, | |
| "grad_norm": 0.04045165702700615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382283926010132, | |
| "mean_token_accuracy": 0.7811629176139832, | |
| "num_tokens": 11990945.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5417391657829285, | |
| "epoch": 2.7514018691588786, | |
| "grad_norm": 0.042311448603868484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540289044380188, | |
| "mean_token_accuracy": 0.7793714255094528, | |
| "num_tokens": 12007392.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5214735865592957, | |
| "epoch": 2.755140186915888, | |
| "grad_norm": 0.04158855974674225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5217651128768921, | |
| "mean_token_accuracy": 0.7852792292833328, | |
| "num_tokens": 12023581.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5328553915023804, | |
| "epoch": 2.7588785046728974, | |
| "grad_norm": 0.038325536996126175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344902873039246, | |
| "mean_token_accuracy": 0.7842058092355728, | |
| "num_tokens": 12039885.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5496254563331604, | |
| "epoch": 2.762616822429907, | |
| "grad_norm": 0.04375292733311653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55174720287323, | |
| "mean_token_accuracy": 0.7766779661178589, | |
| "num_tokens": 12056371.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.558516189455986, | |
| "epoch": 2.7663551401869158, | |
| "grad_norm": 0.049271486699581146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561238169670105, | |
| "mean_token_accuracy": 0.77435702085495, | |
| "num_tokens": 12072839.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5472046732902527, | |
| "epoch": 2.770093457943925, | |
| "grad_norm": 0.04255034402012825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455073714256287, | |
| "mean_token_accuracy": 0.7776911556720734, | |
| "num_tokens": 12089121.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5307886898517609, | |
| "epoch": 2.7738317757009345, | |
| "grad_norm": 0.04008355364203453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308167934417725, | |
| "mean_token_accuracy": 0.785127267241478, | |
| "num_tokens": 12105321.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5314194560050964, | |
| "epoch": 2.777570093457944, | |
| "grad_norm": 0.043235525488853455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316693186759949, | |
| "mean_token_accuracy": 0.7851164489984512, | |
| "num_tokens": 12121581.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5243879109621048, | |
| "epoch": 2.7813084112149533, | |
| "grad_norm": 0.0358644537627697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5208507776260376, | |
| "mean_token_accuracy": 0.7896229773759842, | |
| "num_tokens": 12138064.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5349021703004837, | |
| "epoch": 2.7850467289719627, | |
| "grad_norm": 0.04395059868693352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541559100151062, | |
| "mean_token_accuracy": 0.7818141132593155, | |
| "num_tokens": 12154580.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5464755445718765, | |
| "epoch": 2.788785046728972, | |
| "grad_norm": 0.03772180154919624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500795245170593, | |
| "mean_token_accuracy": 0.7745375484228134, | |
| "num_tokens": 12170944.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.5316334664821625, | |
| "epoch": 2.792523364485981, | |
| "grad_norm": 0.042537569999694824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385891795158386, | |
| "mean_token_accuracy": 0.7813721299171448, | |
| "num_tokens": 12187183.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.5325866043567657, | |
| "epoch": 2.7962616822429904, | |
| "grad_norm": 0.03928552195429802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372824668884277, | |
| "mean_token_accuracy": 0.782025933265686, | |
| "num_tokens": 12203656.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5230025053024292, | |
| "epoch": 2.8, | |
| "grad_norm": 0.045356832444667816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221288204193115, | |
| "mean_token_accuracy": 0.7879509478807449, | |
| "num_tokens": 12220217.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5552905946969986, | |
| "epoch": 2.803738317757009, | |
| "grad_norm": 0.03520367294549942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458053350448608, | |
| "mean_token_accuracy": 0.7801086604595184, | |
| "num_tokens": 12236926.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5284090638160706, | |
| "epoch": 2.8074766355140186, | |
| "grad_norm": 0.04301855340600014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322295427322388, | |
| "mean_token_accuracy": 0.7865041345357895, | |
| "num_tokens": 12253231.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5464428961277008, | |
| "epoch": 2.811214953271028, | |
| "grad_norm": 0.04177437350153923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503079295158386, | |
| "mean_token_accuracy": 0.7759024053812027, | |
| "num_tokens": 12269564.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.5288181900978088, | |
| "epoch": 2.8149532710280374, | |
| "grad_norm": 0.04611227661371231, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422286987304688, | |
| "mean_token_accuracy": 0.7793826460838318, | |
| "num_tokens": 12285764.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.538264587521553, | |
| "epoch": 2.8186915887850468, | |
| "grad_norm": 0.039094604551792145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421559810638428, | |
| "mean_token_accuracy": 0.7824651896953583, | |
| "num_tokens": 12301975.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5448143184185028, | |
| "epoch": 2.822429906542056, | |
| "grad_norm": 0.03843825310468674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424494743347168, | |
| "mean_token_accuracy": 0.7786366790533066, | |
| "num_tokens": 12318265.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.5362522453069687, | |
| "epoch": 2.8261682242990656, | |
| "grad_norm": 0.037981439381837845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347139835357666, | |
| "mean_token_accuracy": 0.7820651233196259, | |
| "num_tokens": 12334596.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5419719219207764, | |
| "epoch": 2.829906542056075, | |
| "grad_norm": 0.03768031671643257, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540343701839447, | |
| "mean_token_accuracy": 0.779738038778305, | |
| "num_tokens": 12351022.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5576566010713577, | |
| "epoch": 2.8336448598130843, | |
| "grad_norm": 0.03845515102148056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556204617023468, | |
| "mean_token_accuracy": 0.7719219624996185, | |
| "num_tokens": 12367469.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5245185047388077, | |
| "epoch": 2.8373831775700937, | |
| "grad_norm": 0.04210665449500084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5240767598152161, | |
| "mean_token_accuracy": 0.7867787629365921, | |
| "num_tokens": 12383664.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5366124212741852, | |
| "epoch": 2.8411214953271027, | |
| "grad_norm": 0.039727386087179184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391771197319031, | |
| "mean_token_accuracy": 0.7799243628978729, | |
| "num_tokens": 12399816.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5430543571710587, | |
| "epoch": 2.844859813084112, | |
| "grad_norm": 0.04284166544675827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555898129940033, | |
| "mean_token_accuracy": 0.7769357264041901, | |
| "num_tokens": 12416232.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5447599291801453, | |
| "epoch": 2.8485981308411215, | |
| "grad_norm": 0.04133335128426552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458224415779114, | |
| "mean_token_accuracy": 0.7791205793619156, | |
| "num_tokens": 12432772.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.5463473051786423, | |
| "epoch": 2.852336448598131, | |
| "grad_norm": 0.04293463006615639, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410310626029968, | |
| "mean_token_accuracy": 0.7824665307998657, | |
| "num_tokens": 12449390.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5433794260025024, | |
| "epoch": 2.8560747663551402, | |
| "grad_norm": 0.0383763313293457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330025553703308, | |
| "mean_token_accuracy": 0.786294624209404, | |
| "num_tokens": 12465761.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5348140597343445, | |
| "epoch": 2.8598130841121496, | |
| "grad_norm": 0.038813136518001556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356075167655945, | |
| "mean_token_accuracy": 0.7799220532178879, | |
| "num_tokens": 12481995.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5310825854539871, | |
| "epoch": 2.863551401869159, | |
| "grad_norm": 0.04623069986701012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389203429222107, | |
| "mean_token_accuracy": 0.7763766050338745, | |
| "num_tokens": 12498209.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5357654541730881, | |
| "epoch": 2.867289719626168, | |
| "grad_norm": 0.03819035738706589, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394827723503113, | |
| "mean_token_accuracy": 0.7809223681688309, | |
| "num_tokens": 12514712.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.543551579117775, | |
| "epoch": 2.8710280373831774, | |
| "grad_norm": 0.043649353086948395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464720129966736, | |
| "mean_token_accuracy": 0.7787970453500748, | |
| "num_tokens": 12531249.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5389954522252083, | |
| "epoch": 2.8747663551401867, | |
| "grad_norm": 0.036311469972133636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379980206489563, | |
| "mean_token_accuracy": 0.7832965403795242, | |
| "num_tokens": 12547833.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5408525764942169, | |
| "epoch": 2.878504672897196, | |
| "grad_norm": 0.03780903294682503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539055585861206, | |
| "mean_token_accuracy": 0.7843980342149734, | |
| "num_tokens": 12564468.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5521610230207443, | |
| "epoch": 2.8822429906542055, | |
| "grad_norm": 0.042727869004011154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518633723258972, | |
| "mean_token_accuracy": 0.7730461955070496, | |
| "num_tokens": 12580822.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5392657667398453, | |
| "epoch": 2.885981308411215, | |
| "grad_norm": 0.042652204632759094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403409004211426, | |
| "mean_token_accuracy": 0.7833160161972046, | |
| "num_tokens": 12597306.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5409767031669617, | |
| "epoch": 2.8897196261682243, | |
| "grad_norm": 0.04756668955087662, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477514266967773, | |
| "mean_token_accuracy": 0.7775042653083801, | |
| "num_tokens": 12613430.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.529184103012085, | |
| "epoch": 2.8934579439252337, | |
| "grad_norm": 0.040852271020412445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368978381156921, | |
| "mean_token_accuracy": 0.7799389064311981, | |
| "num_tokens": 12629734.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5528028011322021, | |
| "epoch": 2.897196261682243, | |
| "grad_norm": 0.04610953480005264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489134788513184, | |
| "mean_token_accuracy": 0.7778203934431076, | |
| "num_tokens": 12646051.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5398439168930054, | |
| "epoch": 2.9009345794392525, | |
| "grad_norm": 0.03999875858426094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301113128662109, | |
| "mean_token_accuracy": 0.786536455154419, | |
| "num_tokens": 12662398.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5450849235057831, | |
| "epoch": 2.904672897196262, | |
| "grad_norm": 0.04052022844552994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446597933769226, | |
| "mean_token_accuracy": 0.7773038446903229, | |
| "num_tokens": 12679053.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5272800028324127, | |
| "epoch": 2.9084112149532713, | |
| "grad_norm": 0.041017524898052216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308842062950134, | |
| "mean_token_accuracy": 0.7858325839042664, | |
| "num_tokens": 12695608.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.5401904284954071, | |
| "epoch": 2.91214953271028, | |
| "grad_norm": 0.04053664207458496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450324416160583, | |
| "mean_token_accuracy": 0.7785527408123016, | |
| "num_tokens": 12712035.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5284470915794373, | |
| "epoch": 2.9158878504672896, | |
| "grad_norm": 0.04656258225440979, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301587581634521, | |
| "mean_token_accuracy": 0.781079113483429, | |
| "num_tokens": 12728285.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5552389323711395, | |
| "epoch": 2.919626168224299, | |
| "grad_norm": 0.043133046478033066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493855476379395, | |
| "mean_token_accuracy": 0.7788817882537842, | |
| "num_tokens": 12744626.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.536635085940361, | |
| "epoch": 2.9233644859813084, | |
| "grad_norm": 0.04232388734817505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350582599639893, | |
| "mean_token_accuracy": 0.784316211938858, | |
| "num_tokens": 12760817.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5175309851765633, | |
| "epoch": 2.9271028037383178, | |
| "grad_norm": 0.05120910704135895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239328742027283, | |
| "mean_token_accuracy": 0.7904608845710754, | |
| "num_tokens": 12777129.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5613889098167419, | |
| "epoch": 2.930841121495327, | |
| "grad_norm": 0.04064096510410309, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573512315750122, | |
| "mean_token_accuracy": 0.7735461741685867, | |
| "num_tokens": 12793633.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.540812149643898, | |
| "epoch": 2.9345794392523366, | |
| "grad_norm": 0.04686618968844414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428805947303772, | |
| "mean_token_accuracy": 0.7786334455013275, | |
| "num_tokens": 12809886.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5354818254709244, | |
| "epoch": 2.938317757009346, | |
| "grad_norm": 0.04068305343389511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409020185470581, | |
| "mean_token_accuracy": 0.781467393040657, | |
| "num_tokens": 12826079.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5340152084827423, | |
| "epoch": 2.942056074766355, | |
| "grad_norm": 0.04302098974585533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352627038955688, | |
| "mean_token_accuracy": 0.7827621698379517, | |
| "num_tokens": 12842255.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5471729636192322, | |
| "epoch": 2.9457943925233643, | |
| "grad_norm": 0.03707803413271904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461200475692749, | |
| "mean_token_accuracy": 0.7784449309110641, | |
| "num_tokens": 12859013.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5401621907949448, | |
| "epoch": 2.9495327102803737, | |
| "grad_norm": 0.044071633368730545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385332107543945, | |
| "mean_token_accuracy": 0.783258393406868, | |
| "num_tokens": 12875373.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5508020371198654, | |
| "epoch": 2.953271028037383, | |
| "grad_norm": 0.03822047635912895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456752181053162, | |
| "mean_token_accuracy": 0.7771204560995102, | |
| "num_tokens": 12891653.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5405401140451431, | |
| "epoch": 2.9570093457943925, | |
| "grad_norm": 0.05170199275016785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398849248886108, | |
| "mean_token_accuracy": 0.7820375263690948, | |
| "num_tokens": 12908131.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5514362305402756, | |
| "epoch": 2.960747663551402, | |
| "grad_norm": 0.036166463047266006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504743456840515, | |
| "mean_token_accuracy": 0.7789987325668335, | |
| "num_tokens": 12924376.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.5308372974395752, | |
| "epoch": 2.9644859813084112, | |
| "grad_norm": 0.04786797612905502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306717753410339, | |
| "mean_token_accuracy": 0.7853545248508453, | |
| "num_tokens": 12940776.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.532660722732544, | |
| "epoch": 2.9682242990654206, | |
| "grad_norm": 0.045564983040094376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463993549346924, | |
| "mean_token_accuracy": 0.777183935046196, | |
| "num_tokens": 12957326.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5434572845697403, | |
| "epoch": 2.97196261682243, | |
| "grad_norm": 0.04280655458569527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493361353874207, | |
| "mean_token_accuracy": 0.776650920510292, | |
| "num_tokens": 12973820.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.5530060529708862, | |
| "epoch": 2.9757009345794394, | |
| "grad_norm": 0.04003579169511795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533372759819031, | |
| "mean_token_accuracy": 0.7766715437173843, | |
| "num_tokens": 12990177.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5516588985919952, | |
| "epoch": 2.979439252336449, | |
| "grad_norm": 0.0351371206343174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491815209388733, | |
| "mean_token_accuracy": 0.7761321365833282, | |
| "num_tokens": 13006638.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5496395230293274, | |
| "epoch": 2.983177570093458, | |
| "grad_norm": 0.03455950319766998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390848517417908, | |
| "mean_token_accuracy": 0.7827516794204712, | |
| "num_tokens": 13022895.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5255894213914871, | |
| "epoch": 2.986915887850467, | |
| "grad_norm": 0.0403040274977684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258710980415344, | |
| "mean_token_accuracy": 0.7874301820993423, | |
| "num_tokens": 13039127.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5152293890714645, | |
| "epoch": 2.9906542056074765, | |
| "grad_norm": 0.04018184915184975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248207449913025, | |
| "mean_token_accuracy": 0.789091631770134, | |
| "num_tokens": 13055038.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.5260308086872101, | |
| "epoch": 2.994392523364486, | |
| "grad_norm": 0.04690062627196312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380572080612183, | |
| "mean_token_accuracy": 0.7809655517339706, | |
| "num_tokens": 13070955.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.5523715615272522, | |
| "epoch": 2.9981308411214953, | |
| "grad_norm": 0.040551379323005676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491956472396851, | |
| "mean_token_accuracy": 0.7785847187042236, | |
| "num_tokens": 13087325.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.5784902274608612, | |
| "epoch": 3.0, | |
| "grad_norm": 0.04703172296285629, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5652958750724792, | |
| "mean_token_accuracy": 0.7655995786190033, | |
| "num_tokens": 13094423.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2209408416111657e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |