Instructions to use eac123/sublim-phase4-combo-07 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/sublim-phase4-combo-07 with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/sublim-phase4-combo-07")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 804, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1245547831058502, | |
| "epoch": 0.0037313432835820895, | |
| "grad_norm": 1.6273682117462158, | |
| "learning_rate": 0.0002, | |
| "loss": 2.482689619064331, | |
| "mean_token_accuracy": 0.5370704382658005, | |
| "num_tokens": 16322.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2366806268692017, | |
| "epoch": 0.007462686567164179, | |
| "grad_norm": 1.4647141695022583, | |
| "learning_rate": 0.0002, | |
| "loss": 2.1726250648498535, | |
| "mean_token_accuracy": 0.5635550767183304, | |
| "num_tokens": 32624.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.3885400295257568, | |
| "epoch": 0.011194029850746268, | |
| "grad_norm": 1.1605029106140137, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7200348377227783, | |
| "mean_token_accuracy": 0.596715897321701, | |
| "num_tokens": 48781.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.3746764063835144, | |
| "epoch": 0.014925373134328358, | |
| "grad_norm": 0.932724118232727, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4033262729644775, | |
| "mean_token_accuracy": 0.6351611912250519, | |
| "num_tokens": 65119.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.3346630930900574, | |
| "epoch": 0.018656716417910446, | |
| "grad_norm": 1.0168325901031494, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2731056213378906, | |
| "mean_token_accuracy": 0.6540397107601166, | |
| "num_tokens": 81735.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.2580328285694122, | |
| "epoch": 0.022388059701492536, | |
| "grad_norm": 0.5265628695487976, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1689575910568237, | |
| "mean_token_accuracy": 0.6603054255247116, | |
| "num_tokens": 98081.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.1583980917930603, | |
| "epoch": 0.026119402985074626, | |
| "grad_norm": 0.4118923842906952, | |
| "learning_rate": 0.0002, | |
| "loss": 1.078832983970642, | |
| "mean_token_accuracy": 0.6707835346460342, | |
| "num_tokens": 114185.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.0589762330055237, | |
| "epoch": 0.029850746268656716, | |
| "grad_norm": 0.41156867146492004, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0044282674789429, | |
| "mean_token_accuracy": 0.6823764145374298, | |
| "num_tokens": 130498.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 0.9924780577421188, | |
| "epoch": 0.033582089552238806, | |
| "grad_norm": 0.5590541362762451, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9619787931442261, | |
| "mean_token_accuracy": 0.6892934292554855, | |
| "num_tokens": 146820.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9725948423147202, | |
| "epoch": 0.03731343283582089, | |
| "grad_norm": 0.4368315637111664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8887773752212524, | |
| "mean_token_accuracy": 0.7022321075201035, | |
| "num_tokens": 163228.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.9371236711740494, | |
| "epoch": 0.041044776119402986, | |
| "grad_norm": 0.43285107612609863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8475317358970642, | |
| "mean_token_accuracy": 0.706597164273262, | |
| "num_tokens": 179681.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.8875125199556351, | |
| "epoch": 0.04477611940298507, | |
| "grad_norm": 6.3542633056640625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8327640295028687, | |
| "mean_token_accuracy": 0.7034512162208557, | |
| "num_tokens": 196348.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.8179645836353302, | |
| "epoch": 0.048507462686567165, | |
| "grad_norm": 0.44303053617477417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7809244394302368, | |
| "mean_token_accuracy": 0.7242531627416611, | |
| "num_tokens": 213052.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.7955248355865479, | |
| "epoch": 0.05223880597014925, | |
| "grad_norm": 0.8472722172737122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7439039945602417, | |
| "mean_token_accuracy": 0.7328712791204453, | |
| "num_tokens": 229644.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7496374696493149, | |
| "epoch": 0.055970149253731345, | |
| "grad_norm": 2.1060233116149902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7229201793670654, | |
| "mean_token_accuracy": 0.7347650229930878, | |
| "num_tokens": 246138.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.6943426132202148, | |
| "epoch": 0.05970149253731343, | |
| "grad_norm": 0.4210701882839203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6997749209403992, | |
| "mean_token_accuracy": 0.7390953898429871, | |
| "num_tokens": 262489.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.689127504825592, | |
| "epoch": 0.06343283582089553, | |
| "grad_norm": 0.3434777855873108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6818345189094543, | |
| "mean_token_accuracy": 0.7421105057001114, | |
| "num_tokens": 278800.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6688047796487808, | |
| "epoch": 0.06716417910447761, | |
| "grad_norm": 0.43096405267715454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.65822833776474, | |
| "mean_token_accuracy": 0.7513366043567657, | |
| "num_tokens": 295153.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6683900207281113, | |
| "epoch": 0.0708955223880597, | |
| "grad_norm": 0.2875062823295593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6513902544975281, | |
| "mean_token_accuracy": 0.7488225400447845, | |
| "num_tokens": 311631.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.6681984066963196, | |
| "epoch": 0.07462686567164178, | |
| "grad_norm": 0.34322109818458557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6516908407211304, | |
| "mean_token_accuracy": 0.7477276474237442, | |
| "num_tokens": 327810.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.657578319311142, | |
| "epoch": 0.07835820895522388, | |
| "grad_norm": 0.3035106360912323, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6391871571540833, | |
| "mean_token_accuracy": 0.7518605440855026, | |
| "num_tokens": 344148.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.6416258066892624, | |
| "epoch": 0.08208955223880597, | |
| "grad_norm": 0.2896852493286133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6108838319778442, | |
| "mean_token_accuracy": 0.7639093101024628, | |
| "num_tokens": 360467.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.6126270890235901, | |
| "epoch": 0.08582089552238806, | |
| "grad_norm": 0.28889304399490356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5967156887054443, | |
| "mean_token_accuracy": 0.7673086673021317, | |
| "num_tokens": 376740.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.607315257191658, | |
| "epoch": 0.08955223880597014, | |
| "grad_norm": 0.26258257031440735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5931278467178345, | |
| "mean_token_accuracy": 0.7683079540729523, | |
| "num_tokens": 393035.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.6071023941040039, | |
| "epoch": 0.09328358208955224, | |
| "grad_norm": 0.2627218961715698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5975178480148315, | |
| "mean_token_accuracy": 0.7655056416988373, | |
| "num_tokens": 409513.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.6166605055332184, | |
| "epoch": 0.09701492537313433, | |
| "grad_norm": 0.2591419517993927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6048401594161987, | |
| "mean_token_accuracy": 0.7606765776872635, | |
| "num_tokens": 425838.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.5888677388429642, | |
| "epoch": 0.10074626865671642, | |
| "grad_norm": 0.23267361521720886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5792773365974426, | |
| "mean_token_accuracy": 0.7714710682630539, | |
| "num_tokens": 442275.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.6097696423530579, | |
| "epoch": 0.1044776119402985, | |
| "grad_norm": 0.25834810733795166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6025165915489197, | |
| "mean_token_accuracy": 0.7594742327928543, | |
| "num_tokens": 458633.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.5876014679670334, | |
| "epoch": 0.10820895522388059, | |
| "grad_norm": 0.24802696704864502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.577584445476532, | |
| "mean_token_accuracy": 0.7709765136241913, | |
| "num_tokens": 475114.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.577396959066391, | |
| "epoch": 0.11194029850746269, | |
| "grad_norm": 0.24076423048973083, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5727118849754333, | |
| "mean_token_accuracy": 0.7744314223527908, | |
| "num_tokens": 491389.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5895106196403503, | |
| "epoch": 0.11567164179104478, | |
| "grad_norm": 0.21412523090839386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5863120555877686, | |
| "mean_token_accuracy": 0.7693659514188766, | |
| "num_tokens": 507969.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.5717187374830246, | |
| "epoch": 0.11940298507462686, | |
| "grad_norm": 0.1944267749786377, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568047046661377, | |
| "mean_token_accuracy": 0.7752875536680222, | |
| "num_tokens": 524169.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5736564546823502, | |
| "epoch": 0.12313432835820895, | |
| "grad_norm": 0.23050418496131897, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5761005282402039, | |
| "mean_token_accuracy": 0.7727629542350769, | |
| "num_tokens": 540463.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.589300200343132, | |
| "epoch": 0.12686567164179105, | |
| "grad_norm": 0.21381224691867828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5865699052810669, | |
| "mean_token_accuracy": 0.7672912329435349, | |
| "num_tokens": 557025.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.5663471221923828, | |
| "epoch": 0.13059701492537312, | |
| "grad_norm": 0.21070359647274017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5665886998176575, | |
| "mean_token_accuracy": 0.7742704451084137, | |
| "num_tokens": 573346.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.5744731575250626, | |
| "epoch": 0.13432835820895522, | |
| "grad_norm": 0.2001814991235733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5742104649543762, | |
| "mean_token_accuracy": 0.7708545625209808, | |
| "num_tokens": 589678.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5785809606313705, | |
| "epoch": 0.13805970149253732, | |
| "grad_norm": 0.1615011990070343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5697225332260132, | |
| "mean_token_accuracy": 0.7719135135412216, | |
| "num_tokens": 606081.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5571976453065872, | |
| "epoch": 0.1417910447761194, | |
| "grad_norm": 0.1849016547203064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493215322494507, | |
| "mean_token_accuracy": 0.7809059321880341, | |
| "num_tokens": 622168.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5916045606136322, | |
| "epoch": 0.1455223880597015, | |
| "grad_norm": 0.19314663112163544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5800106525421143, | |
| "mean_token_accuracy": 0.7677847892045975, | |
| "num_tokens": 638480.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5791963338851929, | |
| "epoch": 0.14925373134328357, | |
| "grad_norm": 0.18138627707958221, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5779139399528503, | |
| "mean_token_accuracy": 0.767883911728859, | |
| "num_tokens": 654651.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5743307769298553, | |
| "epoch": 0.15298507462686567, | |
| "grad_norm": 0.17246870696544647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706084370613098, | |
| "mean_token_accuracy": 0.7700994461774826, | |
| "num_tokens": 670948.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5432448089122772, | |
| "epoch": 0.15671641791044777, | |
| "grad_norm": 0.19110122323036194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484994649887085, | |
| "mean_token_accuracy": 0.7811570912599564, | |
| "num_tokens": 687540.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5750848650932312, | |
| "epoch": 0.16044776119402984, | |
| "grad_norm": 0.1716981679201126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.579657793045044, | |
| "mean_token_accuracy": 0.7663937658071518, | |
| "num_tokens": 704015.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.561103492975235, | |
| "epoch": 0.16417910447761194, | |
| "grad_norm": 0.1821409910917282, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600441098213196, | |
| "mean_token_accuracy": 0.774185299873352, | |
| "num_tokens": 720451.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5737239718437195, | |
| "epoch": 0.16791044776119404, | |
| "grad_norm": 0.174806609749794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5676751732826233, | |
| "mean_token_accuracy": 0.770918071269989, | |
| "num_tokens": 736682.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5712144523859024, | |
| "epoch": 0.17164179104477612, | |
| "grad_norm": 0.18145714700222015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5659744143486023, | |
| "mean_token_accuracy": 0.7729035317897797, | |
| "num_tokens": 753217.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5745559930801392, | |
| "epoch": 0.17537313432835822, | |
| "grad_norm": 0.1639634072780609, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5735749006271362, | |
| "mean_token_accuracy": 0.770696684718132, | |
| "num_tokens": 769822.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5605441480875015, | |
| "epoch": 0.1791044776119403, | |
| "grad_norm": 0.18234604597091675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633875131607056, | |
| "mean_token_accuracy": 0.7749416828155518, | |
| "num_tokens": 786359.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5490550547838211, | |
| "epoch": 0.1828358208955224, | |
| "grad_norm": 0.18433044850826263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567543506622314, | |
| "mean_token_accuracy": 0.7788835614919662, | |
| "num_tokens": 802963.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5616811364889145, | |
| "epoch": 0.1865671641791045, | |
| "grad_norm": 0.15450991690158844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5657309889793396, | |
| "mean_token_accuracy": 0.774708479642868, | |
| "num_tokens": 819668.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5582916140556335, | |
| "epoch": 0.19029850746268656, | |
| "grad_norm": 0.14035002887248993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551848828792572, | |
| "mean_token_accuracy": 0.7806462794542313, | |
| "num_tokens": 835858.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5508538037538528, | |
| "epoch": 0.19402985074626866, | |
| "grad_norm": 0.17560449242591858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406010150909424, | |
| "mean_token_accuracy": 0.7840944528579712, | |
| "num_tokens": 852146.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5527998208999634, | |
| "epoch": 0.19776119402985073, | |
| "grad_norm": 0.15798722207546234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423352718353271, | |
| "mean_token_accuracy": 0.782536968588829, | |
| "num_tokens": 868660.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.5586383640766144, | |
| "epoch": 0.20149253731343283, | |
| "grad_norm": 0.15477648377418518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521284937858582, | |
| "mean_token_accuracy": 0.7778433710336685, | |
| "num_tokens": 885133.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5694690942764282, | |
| "epoch": 0.20522388059701493, | |
| "grad_norm": 0.16944538056850433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5759178400039673, | |
| "mean_token_accuracy": 0.7684573978185654, | |
| "num_tokens": 901816.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5426557958126068, | |
| "epoch": 0.208955223880597, | |
| "grad_norm": 0.16989077627658844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477243661880493, | |
| "mean_token_accuracy": 0.7811359614133835, | |
| "num_tokens": 918275.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5754421502351761, | |
| "epoch": 0.2126865671641791, | |
| "grad_norm": 0.15350034832954407, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5865313410758972, | |
| "mean_token_accuracy": 0.7631517648696899, | |
| "num_tokens": 934630.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.5742448717355728, | |
| "epoch": 0.21641791044776118, | |
| "grad_norm": 0.18639785051345825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.575249433517456, | |
| "mean_token_accuracy": 0.7669856697320938, | |
| "num_tokens": 950844.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5708972364664078, | |
| "epoch": 0.22014925373134328, | |
| "grad_norm": 0.15229687094688416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5669128894805908, | |
| "mean_token_accuracy": 0.7711773067712784, | |
| "num_tokens": 966973.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5682551562786102, | |
| "epoch": 0.22388059701492538, | |
| "grad_norm": 0.1677161157131195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593635439872742, | |
| "mean_token_accuracy": 0.7725416421890259, | |
| "num_tokens": 983221.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5679890364408493, | |
| "epoch": 0.22761194029850745, | |
| "grad_norm": 0.18057392537593842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5580260753631592, | |
| "mean_token_accuracy": 0.7754660546779633, | |
| "num_tokens": 999424.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5804609507322311, | |
| "epoch": 0.23134328358208955, | |
| "grad_norm": 0.143987238407135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.570034384727478, | |
| "mean_token_accuracy": 0.7708772122859955, | |
| "num_tokens": 1015903.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5699467211961746, | |
| "epoch": 0.23507462686567165, | |
| "grad_norm": 0.15400487184524536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5733590126037598, | |
| "mean_token_accuracy": 0.7680967003107071, | |
| "num_tokens": 1032549.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5582360923290253, | |
| "epoch": 0.23880597014925373, | |
| "grad_norm": 0.17451652884483337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5732641220092773, | |
| "mean_token_accuracy": 0.7692582160234451, | |
| "num_tokens": 1048935.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5475955605506897, | |
| "epoch": 0.24253731343283583, | |
| "grad_norm": 0.1549489051103592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526400804519653, | |
| "mean_token_accuracy": 0.7788676619529724, | |
| "num_tokens": 1065104.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5664391964673996, | |
| "epoch": 0.2462686567164179, | |
| "grad_norm": 0.14476634562015533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5617241263389587, | |
| "mean_token_accuracy": 0.7786661833524704, | |
| "num_tokens": 1081393.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5560042560100555, | |
| "epoch": 0.25, | |
| "grad_norm": 0.16752755641937256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503427982330322, | |
| "mean_token_accuracy": 0.7781690061092377, | |
| "num_tokens": 1097575.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5609089732170105, | |
| "epoch": 0.2537313432835821, | |
| "grad_norm": 0.17903153598308563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497362017631531, | |
| "mean_token_accuracy": 0.7771856188774109, | |
| "num_tokens": 1113937.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5642896294593811, | |
| "epoch": 0.2574626865671642, | |
| "grad_norm": 0.16974171996116638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563960611820221, | |
| "mean_token_accuracy": 0.7738614976406097, | |
| "num_tokens": 1130103.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5726548284292221, | |
| "epoch": 0.26119402985074625, | |
| "grad_norm": 0.14435403048992157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5712643265724182, | |
| "mean_token_accuracy": 0.7692683339118958, | |
| "num_tokens": 1146423.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5441250950098038, | |
| "epoch": 0.26492537313432835, | |
| "grad_norm": 0.14253664016723633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544674813747406, | |
| "mean_token_accuracy": 0.7780104726552963, | |
| "num_tokens": 1162733.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5444895774126053, | |
| "epoch": 0.26865671641791045, | |
| "grad_norm": 0.14379332959651947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479044318199158, | |
| "mean_token_accuracy": 0.7788853794336319, | |
| "num_tokens": 1178848.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5541743487119675, | |
| "epoch": 0.27238805970149255, | |
| "grad_norm": 0.1346455216407776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573484897613525, | |
| "mean_token_accuracy": 0.7779737412929535, | |
| "num_tokens": 1195357.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5649544596672058, | |
| "epoch": 0.27611940298507465, | |
| "grad_norm": 0.136294886469841, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603638291358948, | |
| "mean_token_accuracy": 0.7719381302595139, | |
| "num_tokens": 1211921.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5381972342729568, | |
| "epoch": 0.2798507462686567, | |
| "grad_norm": 0.12611278891563416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533305287361145, | |
| "mean_token_accuracy": 0.7839507907629013, | |
| "num_tokens": 1228381.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.5607545524835587, | |
| "epoch": 0.2835820895522388, | |
| "grad_norm": 0.1318938434123993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5617884397506714, | |
| "mean_token_accuracy": 0.7753878086805344, | |
| "num_tokens": 1244769.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5631186813116074, | |
| "epoch": 0.2873134328358209, | |
| "grad_norm": 0.1374509632587433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608174204826355, | |
| "mean_token_accuracy": 0.7753797173500061, | |
| "num_tokens": 1261197.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5789693742990494, | |
| "epoch": 0.291044776119403, | |
| "grad_norm": 0.1388232558965683, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5779432058334351, | |
| "mean_token_accuracy": 0.7658645212650299, | |
| "num_tokens": 1277998.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.5439933687448502, | |
| "epoch": 0.2947761194029851, | |
| "grad_norm": 0.15839162468910217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506725311279297, | |
| "mean_token_accuracy": 0.7786760181188583, | |
| "num_tokens": 1294293.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5581207424402237, | |
| "epoch": 0.29850746268656714, | |
| "grad_norm": 0.16782821714878082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56475830078125, | |
| "mean_token_accuracy": 0.7746179699897766, | |
| "num_tokens": 1310588.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.588770255446434, | |
| "epoch": 0.30223880597014924, | |
| "grad_norm": 0.17123626172542572, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5832362174987793, | |
| "mean_token_accuracy": 0.7644577324390411, | |
| "num_tokens": 1327129.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5512869954109192, | |
| "epoch": 0.30597014925373134, | |
| "grad_norm": 0.12713028490543365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538611888885498, | |
| "mean_token_accuracy": 0.7855131775140762, | |
| "num_tokens": 1343481.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5826849788427353, | |
| "epoch": 0.30970149253731344, | |
| "grad_norm": 0.15148760378360748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.580060601234436, | |
| "mean_token_accuracy": 0.7675654888153076, | |
| "num_tokens": 1359709.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.581380233168602, | |
| "epoch": 0.31343283582089554, | |
| "grad_norm": 0.1486639529466629, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5737113952636719, | |
| "mean_token_accuracy": 0.7694955766201019, | |
| "num_tokens": 1376209.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5577070415019989, | |
| "epoch": 0.31716417910447764, | |
| "grad_norm": 0.14268359541893005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592327117919922, | |
| "mean_token_accuracy": 0.7741715162992477, | |
| "num_tokens": 1392271.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5519531518220901, | |
| "epoch": 0.3208955223880597, | |
| "grad_norm": 0.19115421175956726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5649857521057129, | |
| "mean_token_accuracy": 0.7735026627779007, | |
| "num_tokens": 1408680.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5389833152294159, | |
| "epoch": 0.3246268656716418, | |
| "grad_norm": 0.1511470526456833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499240159988403, | |
| "mean_token_accuracy": 0.7795019447803497, | |
| "num_tokens": 1425241.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5535243153572083, | |
| "epoch": 0.3283582089552239, | |
| "grad_norm": 0.13003994524478912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464329123497009, | |
| "mean_token_accuracy": 0.7804087400436401, | |
| "num_tokens": 1441530.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5626068562269211, | |
| "epoch": 0.332089552238806, | |
| "grad_norm": 0.1472884714603424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579521656036377, | |
| "mean_token_accuracy": 0.7757730484008789, | |
| "num_tokens": 1457843.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5722664147615433, | |
| "epoch": 0.3358208955223881, | |
| "grad_norm": 0.14036864042282104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636782050132751, | |
| "mean_token_accuracy": 0.7743526548147202, | |
| "num_tokens": 1474209.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5577493757009506, | |
| "epoch": 0.33955223880597013, | |
| "grad_norm": 0.12171963602304459, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502208471298218, | |
| "mean_token_accuracy": 0.7802051454782486, | |
| "num_tokens": 1490390.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.547787681221962, | |
| "epoch": 0.34328358208955223, | |
| "grad_norm": 0.1525270640850067, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497896075248718, | |
| "mean_token_accuracy": 0.7809301018714905, | |
| "num_tokens": 1506675.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5554802119731903, | |
| "epoch": 0.34701492537313433, | |
| "grad_norm": 0.1502194106578827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645507574081421, | |
| "mean_token_accuracy": 0.7722718119621277, | |
| "num_tokens": 1523263.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5594951659440994, | |
| "epoch": 0.35074626865671643, | |
| "grad_norm": 0.13331742584705353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637622475624084, | |
| "mean_token_accuracy": 0.7736085057258606, | |
| "num_tokens": 1540004.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5551023185253143, | |
| "epoch": 0.35447761194029853, | |
| "grad_norm": 0.1213943138718605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518482327461243, | |
| "mean_token_accuracy": 0.7777320593595505, | |
| "num_tokens": 1556547.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.557207852602005, | |
| "epoch": 0.3582089552238806, | |
| "grad_norm": 0.1314304620027542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546322464942932, | |
| "mean_token_accuracy": 0.7763337790966034, | |
| "num_tokens": 1572997.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.556539997458458, | |
| "epoch": 0.3619402985074627, | |
| "grad_norm": 0.14363965392112732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549654364585876, | |
| "mean_token_accuracy": 0.7731640189886093, | |
| "num_tokens": 1589289.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.568042978644371, | |
| "epoch": 0.3656716417910448, | |
| "grad_norm": 0.11934816092252731, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5679082274436951, | |
| "mean_token_accuracy": 0.768884465098381, | |
| "num_tokens": 1605516.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5484860688447952, | |
| "epoch": 0.3694029850746269, | |
| "grad_norm": 0.16246412694454193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522934794425964, | |
| "mean_token_accuracy": 0.776402086019516, | |
| "num_tokens": 1622108.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5548600405454636, | |
| "epoch": 0.373134328358209, | |
| "grad_norm": 0.12589918076992035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544294714927673, | |
| "mean_token_accuracy": 0.7768803536891937, | |
| "num_tokens": 1638659.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5692953765392303, | |
| "epoch": 0.376865671641791, | |
| "grad_norm": 0.12726213037967682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5662153363227844, | |
| "mean_token_accuracy": 0.7698657661676407, | |
| "num_tokens": 1654877.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.560271605849266, | |
| "epoch": 0.3805970149253731, | |
| "grad_norm": 0.13260267674922943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487651824951172, | |
| "mean_token_accuracy": 0.7778149247169495, | |
| "num_tokens": 1671436.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5644612163305283, | |
| "epoch": 0.3843283582089552, | |
| "grad_norm": 0.13504348695278168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573433041572571, | |
| "mean_token_accuracy": 0.7781724482774734, | |
| "num_tokens": 1687817.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.55845807492733, | |
| "epoch": 0.3880597014925373, | |
| "grad_norm": 0.1202038824558258, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552661418914795, | |
| "mean_token_accuracy": 0.7772795557975769, | |
| "num_tokens": 1704568.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5440086871385574, | |
| "epoch": 0.3917910447761194, | |
| "grad_norm": 0.12728044390678406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538181662559509, | |
| "mean_token_accuracy": 0.7744371294975281, | |
| "num_tokens": 1720774.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5394178926944733, | |
| "epoch": 0.39552238805970147, | |
| "grad_norm": 0.14098908007144928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552955150604248, | |
| "mean_token_accuracy": 0.776681050658226, | |
| "num_tokens": 1737050.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5602739453315735, | |
| "epoch": 0.39925373134328357, | |
| "grad_norm": 0.1373777687549591, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5666458010673523, | |
| "mean_token_accuracy": 0.7684379816055298, | |
| "num_tokens": 1753616.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5688735842704773, | |
| "epoch": 0.40298507462686567, | |
| "grad_norm": 0.12947675585746765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618643760681152, | |
| "mean_token_accuracy": 0.7724806815385818, | |
| "num_tokens": 1770077.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.569103866815567, | |
| "epoch": 0.40671641791044777, | |
| "grad_norm": 0.1482311338186264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5661442875862122, | |
| "mean_token_accuracy": 0.7717588543891907, | |
| "num_tokens": 1786557.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.5550140291452408, | |
| "epoch": 0.41044776119402987, | |
| "grad_norm": 0.13066281378269196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546547770500183, | |
| "mean_token_accuracy": 0.7755738943815231, | |
| "num_tokens": 1803029.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5526944696903229, | |
| "epoch": 0.4141791044776119, | |
| "grad_norm": 0.11755255609750748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436115860939026, | |
| "mean_token_accuracy": 0.779561460018158, | |
| "num_tokens": 1819561.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.5528556704521179, | |
| "epoch": 0.417910447761194, | |
| "grad_norm": 0.14607787132263184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589385032653809, | |
| "mean_token_accuracy": 0.7751224488019943, | |
| "num_tokens": 1835992.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5393927693367004, | |
| "epoch": 0.4216417910447761, | |
| "grad_norm": 0.12512564659118652, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430585741996765, | |
| "mean_token_accuracy": 0.7801438719034195, | |
| "num_tokens": 1852545.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5346394777297974, | |
| "epoch": 0.4253731343283582, | |
| "grad_norm": 0.13879786431789398, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470178723335266, | |
| "mean_token_accuracy": 0.7800125926733017, | |
| "num_tokens": 1868767.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.552959531545639, | |
| "epoch": 0.4291044776119403, | |
| "grad_norm": 0.13570789992809296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606270432472229, | |
| "mean_token_accuracy": 0.7728203237056732, | |
| "num_tokens": 1885207.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5681584924459457, | |
| "epoch": 0.43283582089552236, | |
| "grad_norm": 0.13311345875263214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561408519744873, | |
| "mean_token_accuracy": 0.7729704976081848, | |
| "num_tokens": 1901670.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.580392524600029, | |
| "epoch": 0.43656716417910446, | |
| "grad_norm": 0.15006045997142792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5710599422454834, | |
| "mean_token_accuracy": 0.7692873626947403, | |
| "num_tokens": 1918297.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5402243435382843, | |
| "epoch": 0.44029850746268656, | |
| "grad_norm": 0.13022655248641968, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290783047676086, | |
| "mean_token_accuracy": 0.7855078428983688, | |
| "num_tokens": 1934811.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5673187673091888, | |
| "epoch": 0.44402985074626866, | |
| "grad_norm": 0.1210206151008606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625845193862915, | |
| "mean_token_accuracy": 0.771060049533844, | |
| "num_tokens": 1951276.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5444270074367523, | |
| "epoch": 0.44776119402985076, | |
| "grad_norm": 0.14453133940696716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478600263595581, | |
| "mean_token_accuracy": 0.7782215029001236, | |
| "num_tokens": 1967851.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5516166985034943, | |
| "epoch": 0.45149253731343286, | |
| "grad_norm": 0.15330393612384796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627217292785645, | |
| "mean_token_accuracy": 0.7735389173030853, | |
| "num_tokens": 1984175.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5447670072317123, | |
| "epoch": 0.4552238805970149, | |
| "grad_norm": 0.11896508932113647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453386306762695, | |
| "mean_token_accuracy": 0.7792693227529526, | |
| "num_tokens": 2000419.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5593693852424622, | |
| "epoch": 0.458955223880597, | |
| "grad_norm": 0.14641404151916504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527093410491943, | |
| "mean_token_accuracy": 0.7784133702516556, | |
| "num_tokens": 2016812.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5516424775123596, | |
| "epoch": 0.4626865671641791, | |
| "grad_norm": 0.13001076877117157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495356917381287, | |
| "mean_token_accuracy": 0.7777290046215057, | |
| "num_tokens": 2032898.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5469458252191544, | |
| "epoch": 0.4664179104477612, | |
| "grad_norm": 0.12713271379470825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466877222061157, | |
| "mean_token_accuracy": 0.7783260345458984, | |
| "num_tokens": 2049023.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5528912246227264, | |
| "epoch": 0.4701492537313433, | |
| "grad_norm": 0.13111256062984467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582880973815918, | |
| "mean_token_accuracy": 0.7739576250314713, | |
| "num_tokens": 2065421.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.536289632320404, | |
| "epoch": 0.47388059701492535, | |
| "grad_norm": 0.1449650228023529, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477018356323242, | |
| "mean_token_accuracy": 0.7764868587255478, | |
| "num_tokens": 2081738.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.5412490218877792, | |
| "epoch": 0.47761194029850745, | |
| "grad_norm": 0.12087342143058777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445610880851746, | |
| "mean_token_accuracy": 0.7799812257289886, | |
| "num_tokens": 2098128.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5749060362577438, | |
| "epoch": 0.48134328358208955, | |
| "grad_norm": 0.13593946397304535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5713242292404175, | |
| "mean_token_accuracy": 0.7683141082525253, | |
| "num_tokens": 2114660.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5624695718288422, | |
| "epoch": 0.48507462686567165, | |
| "grad_norm": 0.13926997780799866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603138208389282, | |
| "mean_token_accuracy": 0.7724832147359848, | |
| "num_tokens": 2130850.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.564590647816658, | |
| "epoch": 0.48880597014925375, | |
| "grad_norm": 0.1541988104581833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548843145370483, | |
| "mean_token_accuracy": 0.7774635404348373, | |
| "num_tokens": 2147198.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5638516694307327, | |
| "epoch": 0.4925373134328358, | |
| "grad_norm": 0.14475074410438538, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559626579284668, | |
| "mean_token_accuracy": 0.7742670625448227, | |
| "num_tokens": 2163592.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.546675980091095, | |
| "epoch": 0.4962686567164179, | |
| "grad_norm": 0.14459353685379028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525697469711304, | |
| "mean_token_accuracy": 0.7782329767942429, | |
| "num_tokens": 2179735.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5720339864492416, | |
| "epoch": 0.5, | |
| "grad_norm": 0.16138529777526855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5745345950126648, | |
| "mean_token_accuracy": 0.7678724527359009, | |
| "num_tokens": 2196300.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5302732288837433, | |
| "epoch": 0.503731343283582, | |
| "grad_norm": 0.13007810711860657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221583843231201, | |
| "mean_token_accuracy": 0.786575123667717, | |
| "num_tokens": 2212703.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5611361563205719, | |
| "epoch": 0.5074626865671642, | |
| "grad_norm": 0.16084182262420654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557313084602356, | |
| "mean_token_accuracy": 0.7753567546606064, | |
| "num_tokens": 2229364.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.5539422780275345, | |
| "epoch": 0.5111940298507462, | |
| "grad_norm": 0.1412162035703659, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559614896774292, | |
| "mean_token_accuracy": 0.7726200222969055, | |
| "num_tokens": 2245576.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.562326043844223, | |
| "epoch": 0.5149253731343284, | |
| "grad_norm": 0.12138223648071289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638246536254883, | |
| "mean_token_accuracy": 0.7736532688140869, | |
| "num_tokens": 2261877.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5490357279777527, | |
| "epoch": 0.5186567164179104, | |
| "grad_norm": 0.13067315518856049, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565229654312134, | |
| "mean_token_accuracy": 0.7710774689912796, | |
| "num_tokens": 2278167.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5594187080860138, | |
| "epoch": 0.5223880597014925, | |
| "grad_norm": 0.15731613337993622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585336089134216, | |
| "mean_token_accuracy": 0.7744586318731308, | |
| "num_tokens": 2294498.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5464736074209213, | |
| "epoch": 0.5261194029850746, | |
| "grad_norm": 0.11038337647914886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538608968257904, | |
| "mean_token_accuracy": 0.7829599231481552, | |
| "num_tokens": 2311130.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5605999529361725, | |
| "epoch": 0.5298507462686567, | |
| "grad_norm": 0.14088644087314606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552900493144989, | |
| "mean_token_accuracy": 0.7778186202049255, | |
| "num_tokens": 2327728.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5528270900249481, | |
| "epoch": 0.5335820895522388, | |
| "grad_norm": 0.1425020396709442, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515353083610535, | |
| "mean_token_accuracy": 0.7752819806337357, | |
| "num_tokens": 2343709.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.548284262418747, | |
| "epoch": 0.5373134328358209, | |
| "grad_norm": 0.11753518134355545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451334118843079, | |
| "mean_token_accuracy": 0.778195932507515, | |
| "num_tokens": 2360064.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.5573805719614029, | |
| "epoch": 0.5410447761194029, | |
| "grad_norm": 0.16544298827648163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645371675491333, | |
| "mean_token_accuracy": 0.774710014462471, | |
| "num_tokens": 2376625.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5539259165525436, | |
| "epoch": 0.5447761194029851, | |
| "grad_norm": 0.13032706081867218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533608198165894, | |
| "mean_token_accuracy": 0.7761502712965012, | |
| "num_tokens": 2393124.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5611738562583923, | |
| "epoch": 0.5485074626865671, | |
| "grad_norm": 0.11081252992153168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593815445899963, | |
| "mean_token_accuracy": 0.7766542136669159, | |
| "num_tokens": 2409745.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5696390718221664, | |
| "epoch": 0.5522388059701493, | |
| "grad_norm": 0.15060319006443024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638480186462402, | |
| "mean_token_accuracy": 0.7716973423957825, | |
| "num_tokens": 2426282.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5485384464263916, | |
| "epoch": 0.5559701492537313, | |
| "grad_norm": 0.1222362369298935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475510954856873, | |
| "mean_token_accuracy": 0.7770865708589554, | |
| "num_tokens": 2442853.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5401834696531296, | |
| "epoch": 0.5597014925373134, | |
| "grad_norm": 0.1280064433813095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546281099319458, | |
| "mean_token_accuracy": 0.777226597070694, | |
| "num_tokens": 2459134.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5523836761713028, | |
| "epoch": 0.5634328358208955, | |
| "grad_norm": 0.13370104134082794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567190647125244, | |
| "mean_token_accuracy": 0.7742304503917694, | |
| "num_tokens": 2475612.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5323238670825958, | |
| "epoch": 0.5671641791044776, | |
| "grad_norm": 0.13501204550266266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404109358787537, | |
| "mean_token_accuracy": 0.7807471007108688, | |
| "num_tokens": 2492038.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5367552191019058, | |
| "epoch": 0.5708955223880597, | |
| "grad_norm": 0.11861642450094223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417584180831909, | |
| "mean_token_accuracy": 0.7794559895992279, | |
| "num_tokens": 2508568.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5438606441020966, | |
| "epoch": 0.5746268656716418, | |
| "grad_norm": 0.14000006020069122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418928861618042, | |
| "mean_token_accuracy": 0.7817023396492004, | |
| "num_tokens": 2524812.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5425677746534348, | |
| "epoch": 0.5783582089552238, | |
| "grad_norm": 0.12695865333080292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364310145378113, | |
| "mean_token_accuracy": 0.7822788208723068, | |
| "num_tokens": 2540971.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5774415135383606, | |
| "epoch": 0.582089552238806, | |
| "grad_norm": 0.13525983691215515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5755460858345032, | |
| "mean_token_accuracy": 0.7673929333686829, | |
| "num_tokens": 2557582.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5472007393836975, | |
| "epoch": 0.585820895522388, | |
| "grad_norm": 0.14802482724189758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489597320556641, | |
| "mean_token_accuracy": 0.777190089225769, | |
| "num_tokens": 2573624.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5569610297679901, | |
| "epoch": 0.5895522388059702, | |
| "grad_norm": 0.12167536467313766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526796579360962, | |
| "mean_token_accuracy": 0.7753524631261826, | |
| "num_tokens": 2590085.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5524294823408127, | |
| "epoch": 0.5932835820895522, | |
| "grad_norm": 0.11966220289468765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499304533004761, | |
| "mean_token_accuracy": 0.7759323716163635, | |
| "num_tokens": 2606611.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5380967259407043, | |
| "epoch": 0.5970149253731343, | |
| "grad_norm": 0.12815536558628082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423661470413208, | |
| "mean_token_accuracy": 0.7792660146951675, | |
| "num_tokens": 2623057.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5472327321767807, | |
| "epoch": 0.6007462686567164, | |
| "grad_norm": 0.1232324093580246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512628555297852, | |
| "mean_token_accuracy": 0.7756103277206421, | |
| "num_tokens": 2639412.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.53459233045578, | |
| "epoch": 0.6044776119402985, | |
| "grad_norm": 0.1279020607471466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530642569065094, | |
| "mean_token_accuracy": 0.784668356180191, | |
| "num_tokens": 2655725.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5487090200185776, | |
| "epoch": 0.6082089552238806, | |
| "grad_norm": 0.11489348113536835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467615127563477, | |
| "mean_token_accuracy": 0.7774748206138611, | |
| "num_tokens": 2671780.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5611004680395126, | |
| "epoch": 0.6119402985074627, | |
| "grad_norm": 0.12106446921825409, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621192455291748, | |
| "mean_token_accuracy": 0.7757818549871445, | |
| "num_tokens": 2688187.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5655875951051712, | |
| "epoch": 0.6156716417910447, | |
| "grad_norm": 0.11722180247306824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597223043441772, | |
| "mean_token_accuracy": 0.7729662656784058, | |
| "num_tokens": 2704679.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5630869567394257, | |
| "epoch": 0.6194029850746269, | |
| "grad_norm": 0.1220882460474968, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5666179060935974, | |
| "mean_token_accuracy": 0.7716799974441528, | |
| "num_tokens": 2721384.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5498328506946564, | |
| "epoch": 0.6231343283582089, | |
| "grad_norm": 0.12011860311031342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489162802696228, | |
| "mean_token_accuracy": 0.7789698839187622, | |
| "num_tokens": 2737648.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.5477638095617294, | |
| "epoch": 0.6268656716417911, | |
| "grad_norm": 0.11750344932079315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432245135307312, | |
| "mean_token_accuracy": 0.7796685546636581, | |
| "num_tokens": 2753735.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5453169494867325, | |
| "epoch": 0.6305970149253731, | |
| "grad_norm": 0.11574184149503708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411070585250854, | |
| "mean_token_accuracy": 0.779533714056015, | |
| "num_tokens": 2770229.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.545142874121666, | |
| "epoch": 0.6343283582089553, | |
| "grad_norm": 0.13359719514846802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482118129730225, | |
| "mean_token_accuracy": 0.7763011008501053, | |
| "num_tokens": 2786644.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5370890945196152, | |
| "epoch": 0.6380597014925373, | |
| "grad_norm": 0.14816807210445404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420677661895752, | |
| "mean_token_accuracy": 0.7803799211978912, | |
| "num_tokens": 2802914.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5518854707479477, | |
| "epoch": 0.6417910447761194, | |
| "grad_norm": 0.1388852596282959, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512416958808899, | |
| "mean_token_accuracy": 0.7771147638559341, | |
| "num_tokens": 2819398.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5400035530328751, | |
| "epoch": 0.6455223880597015, | |
| "grad_norm": 0.1363624781370163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326176881790161, | |
| "mean_token_accuracy": 0.7852664589881897, | |
| "num_tokens": 2835742.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5528566986322403, | |
| "epoch": 0.6492537313432836, | |
| "grad_norm": 0.13000693917274475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492731928825378, | |
| "mean_token_accuracy": 0.7760010659694672, | |
| "num_tokens": 2852099.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5556752383708954, | |
| "epoch": 0.6529850746268657, | |
| "grad_norm": 0.11847010999917984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595160722732544, | |
| "mean_token_accuracy": 0.7731318473815918, | |
| "num_tokens": 2868521.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5382126122713089, | |
| "epoch": 0.6567164179104478, | |
| "grad_norm": 0.13996672630310059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406076312065125, | |
| "mean_token_accuracy": 0.7809479385614395, | |
| "num_tokens": 2884940.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5601803660392761, | |
| "epoch": 0.6604477611940298, | |
| "grad_norm": 0.17110760509967804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5693113207817078, | |
| "mean_token_accuracy": 0.7711411267518997, | |
| "num_tokens": 2901255.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5570882558822632, | |
| "epoch": 0.664179104477612, | |
| "grad_norm": 0.13338999450206757, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597653388977051, | |
| "mean_token_accuracy": 0.7734159678220749, | |
| "num_tokens": 2917815.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5541604459285736, | |
| "epoch": 0.667910447761194, | |
| "grad_norm": 0.15003007650375366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550830066204071, | |
| "mean_token_accuracy": 0.773952454328537, | |
| "num_tokens": 2934029.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5483301132917404, | |
| "epoch": 0.6716417910447762, | |
| "grad_norm": 0.13809660077095032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544836163520813, | |
| "mean_token_accuracy": 0.7802225351333618, | |
| "num_tokens": 2950186.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.563317745923996, | |
| "epoch": 0.6753731343283582, | |
| "grad_norm": 0.11954832822084427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579479932785034, | |
| "mean_token_accuracy": 0.7754767686128616, | |
| "num_tokens": 2966696.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5388910472393036, | |
| "epoch": 0.6791044776119403, | |
| "grad_norm": 0.1495479792356491, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441924929618835, | |
| "mean_token_accuracy": 0.7800770252943039, | |
| "num_tokens": 2982704.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5419297218322754, | |
| "epoch": 0.6828358208955224, | |
| "grad_norm": 0.13201352953910828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452746152877808, | |
| "mean_token_accuracy": 0.7787511199712753, | |
| "num_tokens": 2998931.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5475537180900574, | |
| "epoch": 0.6865671641791045, | |
| "grad_norm": 0.11876624077558517, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537864565849304, | |
| "mean_token_accuracy": 0.77639339864254, | |
| "num_tokens": 3015465.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5443734228610992, | |
| "epoch": 0.6902985074626866, | |
| "grad_norm": 0.142917662858963, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402485728263855, | |
| "mean_token_accuracy": 0.7805273532867432, | |
| "num_tokens": 3031848.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5626855194568634, | |
| "epoch": 0.6940298507462687, | |
| "grad_norm": 0.12896916270256042, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567379593849182, | |
| "mean_token_accuracy": 0.7732013463973999, | |
| "num_tokens": 3048160.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5523503571748734, | |
| "epoch": 0.6977611940298507, | |
| "grad_norm": 0.13464562594890594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460264086723328, | |
| "mean_token_accuracy": 0.7796957343816757, | |
| "num_tokens": 3064378.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5515571534633636, | |
| "epoch": 0.7014925373134329, | |
| "grad_norm": 0.1277887523174286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548107028007507, | |
| "mean_token_accuracy": 0.773384153842926, | |
| "num_tokens": 3080909.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5496191382408142, | |
| "epoch": 0.7052238805970149, | |
| "grad_norm": 0.1543433964252472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634362101554871, | |
| "mean_token_accuracy": 0.7713208198547363, | |
| "num_tokens": 3097164.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.533801332116127, | |
| "epoch": 0.7089552238805971, | |
| "grad_norm": 0.1185467317700386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395026206970215, | |
| "mean_token_accuracy": 0.7796055674552917, | |
| "num_tokens": 3113434.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5635387450456619, | |
| "epoch": 0.7126865671641791, | |
| "grad_norm": 0.12236445397138596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628854632377625, | |
| "mean_token_accuracy": 0.7733010798692703, | |
| "num_tokens": 3129906.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5444195717573166, | |
| "epoch": 0.7164179104477612, | |
| "grad_norm": 0.1353861391544342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396167039871216, | |
| "mean_token_accuracy": 0.7793399095535278, | |
| "num_tokens": 3145901.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.5682615637779236, | |
| "epoch": 0.7201492537313433, | |
| "grad_norm": 0.11948243528604507, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587157011032104, | |
| "mean_token_accuracy": 0.774067297577858, | |
| "num_tokens": 3162257.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5397479832172394, | |
| "epoch": 0.7238805970149254, | |
| "grad_norm": 0.14794877171516418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473200678825378, | |
| "mean_token_accuracy": 0.7760735005140305, | |
| "num_tokens": 3178362.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5612514019012451, | |
| "epoch": 0.7276119402985075, | |
| "grad_norm": 0.12478621304035187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5709495544433594, | |
| "mean_token_accuracy": 0.771531730890274, | |
| "num_tokens": 3195003.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5640581250190735, | |
| "epoch": 0.7313432835820896, | |
| "grad_norm": 0.13103285431861877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633752942085266, | |
| "mean_token_accuracy": 0.7763072997331619, | |
| "num_tokens": 3211488.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5409631133079529, | |
| "epoch": 0.7350746268656716, | |
| "grad_norm": 0.11954586207866669, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412945747375488, | |
| "mean_token_accuracy": 0.7807609885931015, | |
| "num_tokens": 3227872.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5516713857650757, | |
| "epoch": 0.7388059701492538, | |
| "grad_norm": 0.1291007399559021, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551599264144897, | |
| "mean_token_accuracy": 0.776901364326477, | |
| "num_tokens": 3244275.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5520838648080826, | |
| "epoch": 0.7425373134328358, | |
| "grad_norm": 0.1325356811285019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542269945144653, | |
| "mean_token_accuracy": 0.7749388813972473, | |
| "num_tokens": 3260730.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5531659871339798, | |
| "epoch": 0.746268656716418, | |
| "grad_norm": 0.11382137984037399, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500154495239258, | |
| "mean_token_accuracy": 0.7769201993942261, | |
| "num_tokens": 3277054.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5739943087100983, | |
| "epoch": 0.75, | |
| "grad_norm": 0.116433285176754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5693427920341492, | |
| "mean_token_accuracy": 0.7700029015541077, | |
| "num_tokens": 3293536.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5410773009061813, | |
| "epoch": 0.753731343283582, | |
| "grad_norm": 0.12128517776727676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383925437927246, | |
| "mean_token_accuracy": 0.7806861847639084, | |
| "num_tokens": 3310044.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5345109105110168, | |
| "epoch": 0.7574626865671642, | |
| "grad_norm": 0.11475860327482224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396114587783813, | |
| "mean_token_accuracy": 0.7786486446857452, | |
| "num_tokens": 3326424.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5596074312925339, | |
| "epoch": 0.7611940298507462, | |
| "grad_norm": 0.1144401878118515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559008777141571, | |
| "mean_token_accuracy": 0.7744818329811096, | |
| "num_tokens": 3342803.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5440013706684113, | |
| "epoch": 0.7649253731343284, | |
| "grad_norm": 0.117170050740242, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520018935203552, | |
| "mean_token_accuracy": 0.7764452546834946, | |
| "num_tokens": 3359289.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5440059304237366, | |
| "epoch": 0.7686567164179104, | |
| "grad_norm": 0.12146680057048798, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543918251991272, | |
| "mean_token_accuracy": 0.7812443971633911, | |
| "num_tokens": 3375680.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.559204563498497, | |
| "epoch": 0.7723880597014925, | |
| "grad_norm": 0.11677462607622147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479013323783875, | |
| "mean_token_accuracy": 0.7783834487199783, | |
| "num_tokens": 3392230.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5695496201515198, | |
| "epoch": 0.7761194029850746, | |
| "grad_norm": 0.12663210928440094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560157895088196, | |
| "mean_token_accuracy": 0.7768621742725372, | |
| "num_tokens": 3408667.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5218568593263626, | |
| "epoch": 0.7798507462686567, | |
| "grad_norm": 0.13396473228931427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200244784355164, | |
| "mean_token_accuracy": 0.7892128974199295, | |
| "num_tokens": 3424766.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5524403154850006, | |
| "epoch": 0.7835820895522388, | |
| "grad_norm": 0.11780054867267609, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549524426460266, | |
| "mean_token_accuracy": 0.7762513756752014, | |
| "num_tokens": 3441010.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5339344441890717, | |
| "epoch": 0.7873134328358209, | |
| "grad_norm": 0.13986989855766296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432649254798889, | |
| "mean_token_accuracy": 0.7810570001602173, | |
| "num_tokens": 3457051.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5393660813570023, | |
| "epoch": 0.7910447761194029, | |
| "grad_norm": 0.14846238493919373, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462239980697632, | |
| "mean_token_accuracy": 0.7770469635725021, | |
| "num_tokens": 3473237.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5482676774263382, | |
| "epoch": 0.7947761194029851, | |
| "grad_norm": 0.1279968023300171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470429062843323, | |
| "mean_token_accuracy": 0.7772368937730789, | |
| "num_tokens": 3489557.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5750377625226974, | |
| "epoch": 0.7985074626865671, | |
| "grad_norm": 0.1574614942073822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5681816339492798, | |
| "mean_token_accuracy": 0.7696330845355988, | |
| "num_tokens": 3506111.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5552468150854111, | |
| "epoch": 0.8022388059701493, | |
| "grad_norm": 0.11573337018489838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513306260108948, | |
| "mean_token_accuracy": 0.7750436067581177, | |
| "num_tokens": 3522546.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5544361621141434, | |
| "epoch": 0.8059701492537313, | |
| "grad_norm": 0.11837700754404068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553516685962677, | |
| "mean_token_accuracy": 0.7765354365110397, | |
| "num_tokens": 3539207.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5567323267459869, | |
| "epoch": 0.8097014925373134, | |
| "grad_norm": 0.15473680198192596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5699406862258911, | |
| "mean_token_accuracy": 0.769306480884552, | |
| "num_tokens": 3555606.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.55356065928936, | |
| "epoch": 0.8134328358208955, | |
| "grad_norm": 0.10959180444478989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509120225906372, | |
| "mean_token_accuracy": 0.7775351405143738, | |
| "num_tokens": 3571937.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5506166815757751, | |
| "epoch": 0.8171641791044776, | |
| "grad_norm": 0.1107836365699768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498772859573364, | |
| "mean_token_accuracy": 0.7781967967748642, | |
| "num_tokens": 3588147.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5483623296022415, | |
| "epoch": 0.8208955223880597, | |
| "grad_norm": 0.12760840356349945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440163016319275, | |
| "mean_token_accuracy": 0.7794655859470367, | |
| "num_tokens": 3604413.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5516934990882874, | |
| "epoch": 0.8246268656716418, | |
| "grad_norm": 0.13432522118091583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498266220092773, | |
| "mean_token_accuracy": 0.7779892683029175, | |
| "num_tokens": 3620667.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5583075881004333, | |
| "epoch": 0.8283582089552238, | |
| "grad_norm": 0.1205005794763565, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606446266174316, | |
| "mean_token_accuracy": 0.7730143070220947, | |
| "num_tokens": 3637160.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5281430184841156, | |
| "epoch": 0.832089552238806, | |
| "grad_norm": 0.11834297329187393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331573486328125, | |
| "mean_token_accuracy": 0.7839753329753876, | |
| "num_tokens": 3653562.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5474057644605637, | |
| "epoch": 0.835820895522388, | |
| "grad_norm": 0.12258574366569519, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449813604354858, | |
| "mean_token_accuracy": 0.780377060174942, | |
| "num_tokens": 3669951.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5545710325241089, | |
| "epoch": 0.8395522388059702, | |
| "grad_norm": 0.1338793784379959, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493278503417969, | |
| "mean_token_accuracy": 0.7759524881839752, | |
| "num_tokens": 3686193.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5437184125185013, | |
| "epoch": 0.8432835820895522, | |
| "grad_norm": 0.11655160784721375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418398380279541, | |
| "mean_token_accuracy": 0.7775491774082184, | |
| "num_tokens": 3702353.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5532678067684174, | |
| "epoch": 0.8470149253731343, | |
| "grad_norm": 0.1549050509929657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550553798675537, | |
| "mean_token_accuracy": 0.7763772308826447, | |
| "num_tokens": 3719232.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5559423863887787, | |
| "epoch": 0.8507462686567164, | |
| "grad_norm": 0.14761976897716522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5570894479751587, | |
| "mean_token_accuracy": 0.772933155298233, | |
| "num_tokens": 3735537.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5467868000268936, | |
| "epoch": 0.8544776119402985, | |
| "grad_norm": 0.1289997398853302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503818988800049, | |
| "mean_token_accuracy": 0.7735268622636795, | |
| "num_tokens": 3751761.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5500779002904892, | |
| "epoch": 0.8582089552238806, | |
| "grad_norm": 0.1492077112197876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505205392837524, | |
| "mean_token_accuracy": 0.777638703584671, | |
| "num_tokens": 3768182.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.539194718003273, | |
| "epoch": 0.8619402985074627, | |
| "grad_norm": 0.11280067265033722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417665243148804, | |
| "mean_token_accuracy": 0.7794284075498581, | |
| "num_tokens": 3784647.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5511510968208313, | |
| "epoch": 0.8656716417910447, | |
| "grad_norm": 0.13110041618347168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588247776031494, | |
| "mean_token_accuracy": 0.7747578173875809, | |
| "num_tokens": 3801072.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5328868925571442, | |
| "epoch": 0.8694029850746269, | |
| "grad_norm": 0.11132191121578217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321682095527649, | |
| "mean_token_accuracy": 0.785084918141365, | |
| "num_tokens": 3817270.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5497525930404663, | |
| "epoch": 0.8731343283582089, | |
| "grad_norm": 0.12497328221797943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490625500679016, | |
| "mean_token_accuracy": 0.7780804187059402, | |
| "num_tokens": 3833650.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5649874210357666, | |
| "epoch": 0.8768656716417911, | |
| "grad_norm": 0.10820397734642029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5612732172012329, | |
| "mean_token_accuracy": 0.7699918150901794, | |
| "num_tokens": 3849965.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5564968436956406, | |
| "epoch": 0.8805970149253731, | |
| "grad_norm": 0.11200150102376938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574247241020203, | |
| "mean_token_accuracy": 0.7737843245267868, | |
| "num_tokens": 3866325.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5345783978700638, | |
| "epoch": 0.8843283582089553, | |
| "grad_norm": 0.11046700924634933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353702902793884, | |
| "mean_token_accuracy": 0.7825029641389847, | |
| "num_tokens": 3882836.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5462570339441299, | |
| "epoch": 0.8880597014925373, | |
| "grad_norm": 0.13713142275810242, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531303286552429, | |
| "mean_token_accuracy": 0.775889053940773, | |
| "num_tokens": 3899019.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5346651673316956, | |
| "epoch": 0.8917910447761194, | |
| "grad_norm": 0.11298073828220367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383750796318054, | |
| "mean_token_accuracy": 0.780723512172699, | |
| "num_tokens": 3915451.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.5661043077707291, | |
| "epoch": 0.8955223880597015, | |
| "grad_norm": 0.12630173563957214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633317232131958, | |
| "mean_token_accuracy": 0.7725178003311157, | |
| "num_tokens": 3931857.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5499769002199173, | |
| "epoch": 0.8992537313432836, | |
| "grad_norm": 0.10539573431015015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443609356880188, | |
| "mean_token_accuracy": 0.7807674556970596, | |
| "num_tokens": 3948251.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5542334765195847, | |
| "epoch": 0.9029850746268657, | |
| "grad_norm": 0.10860421508550644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467254519462585, | |
| "mean_token_accuracy": 0.7777283936738968, | |
| "num_tokens": 3964506.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5593715906143188, | |
| "epoch": 0.9067164179104478, | |
| "grad_norm": 0.11269830167293549, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568402409553528, | |
| "mean_token_accuracy": 0.7743813842535019, | |
| "num_tokens": 3980991.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5386274456977844, | |
| "epoch": 0.9104477611940298, | |
| "grad_norm": 0.12022864073514938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538654088973999, | |
| "mean_token_accuracy": 0.7814032137393951, | |
| "num_tokens": 3997541.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5274675115942955, | |
| "epoch": 0.914179104477612, | |
| "grad_norm": 0.14818064868450165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381026268005371, | |
| "mean_token_accuracy": 0.7816068381071091, | |
| "num_tokens": 4013664.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5379235744476318, | |
| "epoch": 0.917910447761194, | |
| "grad_norm": 0.1228220984339714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409340858459473, | |
| "mean_token_accuracy": 0.7790304571390152, | |
| "num_tokens": 4029963.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5446107536554337, | |
| "epoch": 0.9216417910447762, | |
| "grad_norm": 0.12891873717308044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515777468681335, | |
| "mean_token_accuracy": 0.7764184921979904, | |
| "num_tokens": 4046258.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5525491833686829, | |
| "epoch": 0.9253731343283582, | |
| "grad_norm": 0.1355786919593811, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416724681854248, | |
| "mean_token_accuracy": 0.7802292257547379, | |
| "num_tokens": 4062506.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.536956250667572, | |
| "epoch": 0.9291044776119403, | |
| "grad_norm": 0.12736709415912628, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312113761901855, | |
| "mean_token_accuracy": 0.783654510974884, | |
| "num_tokens": 4078661.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5549832433462143, | |
| "epoch": 0.9328358208955224, | |
| "grad_norm": 0.12017148733139038, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565866827964783, | |
| "mean_token_accuracy": 0.773817777633667, | |
| "num_tokens": 4095022.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5422243773937225, | |
| "epoch": 0.9365671641791045, | |
| "grad_norm": 0.13573786616325378, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521195530891418, | |
| "mean_token_accuracy": 0.7785970866680145, | |
| "num_tokens": 4111402.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5538443177938461, | |
| "epoch": 0.9402985074626866, | |
| "grad_norm": 0.11428782343864441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559377670288086, | |
| "mean_token_accuracy": 0.7728682309389114, | |
| "num_tokens": 4127625.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5606874525547028, | |
| "epoch": 0.9440298507462687, | |
| "grad_norm": 0.11228293180465698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537079572677612, | |
| "mean_token_accuracy": 0.7777886986732483, | |
| "num_tokens": 4144209.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5587089955806732, | |
| "epoch": 0.9477611940298507, | |
| "grad_norm": 0.11430441588163376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511766672134399, | |
| "mean_token_accuracy": 0.7764836251735687, | |
| "num_tokens": 4160587.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5543984770774841, | |
| "epoch": 0.9514925373134329, | |
| "grad_norm": 0.11914564669132233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457825064659119, | |
| "mean_token_accuracy": 0.7772367298603058, | |
| "num_tokens": 4177078.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5496934354305267, | |
| "epoch": 0.9552238805970149, | |
| "grad_norm": 0.11808159202337265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523373484611511, | |
| "mean_token_accuracy": 0.7758414000272751, | |
| "num_tokens": 4193671.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5323416441679001, | |
| "epoch": 0.9589552238805971, | |
| "grad_norm": 0.12709033489227295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384759902954102, | |
| "mean_token_accuracy": 0.7808651477098465, | |
| "num_tokens": 4210085.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5338983610272408, | |
| "epoch": 0.9626865671641791, | |
| "grad_norm": 0.13908886909484863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462735891342163, | |
| "mean_token_accuracy": 0.7780435681343079, | |
| "num_tokens": 4226494.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5453044772148132, | |
| "epoch": 0.9664179104477612, | |
| "grad_norm": 0.12644866108894348, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551929235458374, | |
| "mean_token_accuracy": 0.775839775800705, | |
| "num_tokens": 4242785.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5603075176477432, | |
| "epoch": 0.9701492537313433, | |
| "grad_norm": 0.12755440175533295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524581670761108, | |
| "mean_token_accuracy": 0.7771914452314377, | |
| "num_tokens": 4259299.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5615698993206024, | |
| "epoch": 0.9738805970149254, | |
| "grad_norm": 0.12908904254436493, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537154078483582, | |
| "mean_token_accuracy": 0.7739745527505875, | |
| "num_tokens": 4275749.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5526564866304398, | |
| "epoch": 0.9776119402985075, | |
| "grad_norm": 0.10715582221746445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478145480155945, | |
| "mean_token_accuracy": 0.7770287841558456, | |
| "num_tokens": 4291706.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5461979508399963, | |
| "epoch": 0.9813432835820896, | |
| "grad_norm": 0.14307166635990143, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454379916191101, | |
| "mean_token_accuracy": 0.7798766791820526, | |
| "num_tokens": 4308137.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5203245729207993, | |
| "epoch": 0.9850746268656716, | |
| "grad_norm": 0.15710005164146423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299646258354187, | |
| "mean_token_accuracy": 0.7843145579099655, | |
| "num_tokens": 4324411.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5302061140537262, | |
| "epoch": 0.9888059701492538, | |
| "grad_norm": 0.1519300937652588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403961539268494, | |
| "mean_token_accuracy": 0.7806786000728607, | |
| "num_tokens": 4340384.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5364599078893661, | |
| "epoch": 0.9925373134328358, | |
| "grad_norm": 0.13450899720191956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356532335281372, | |
| "mean_token_accuracy": 0.7834792584180832, | |
| "num_tokens": 4356954.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5519508272409439, | |
| "epoch": 0.996268656716418, | |
| "grad_norm": 0.13190409541130066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425809621810913, | |
| "mean_token_accuracy": 0.7814677059650421, | |
| "num_tokens": 4373557.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5717380940914154, | |
| "epoch": 1.0, | |
| "grad_norm": 0.13511350750923157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594110488891602, | |
| "mean_token_accuracy": 0.7763755470514297, | |
| "num_tokens": 4390028.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5333094298839569, | |
| "epoch": 1.0037313432835822, | |
| "grad_norm": 0.11232882738113403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279825925827026, | |
| "mean_token_accuracy": 0.7831753939390182, | |
| "num_tokens": 4406075.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5085988268256187, | |
| "epoch": 1.007462686567164, | |
| "grad_norm": 0.1554645448923111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516677737236023, | |
| "mean_token_accuracy": 0.7916137427091599, | |
| "num_tokens": 4422444.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5372590869665146, | |
| "epoch": 1.0111940298507462, | |
| "grad_norm": 0.14206163585186005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542325496673584, | |
| "mean_token_accuracy": 0.7813751995563507, | |
| "num_tokens": 4438619.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5327645987272263, | |
| "epoch": 1.0149253731343284, | |
| "grad_norm": 0.12639598548412323, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381733775138855, | |
| "mean_token_accuracy": 0.7798869907855988, | |
| "num_tokens": 4455013.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5318270623683929, | |
| "epoch": 1.0186567164179103, | |
| "grad_norm": 0.14597581326961517, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323677659034729, | |
| "mean_token_accuracy": 0.7859037518501282, | |
| "num_tokens": 4471596.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.549939751625061, | |
| "epoch": 1.0223880597014925, | |
| "grad_norm": 0.14265935122966766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377833247184753, | |
| "mean_token_accuracy": 0.7833307683467865, | |
| "num_tokens": 4487885.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.549922838807106, | |
| "epoch": 1.0261194029850746, | |
| "grad_norm": 0.1281050145626068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483719706535339, | |
| "mean_token_accuracy": 0.7763915956020355, | |
| "num_tokens": 4504279.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5519027858972549, | |
| "epoch": 1.0298507462686568, | |
| "grad_norm": 0.13199536502361298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520401000976562, | |
| "mean_token_accuracy": 0.7754272371530533, | |
| "num_tokens": 4520877.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5326957255601883, | |
| "epoch": 1.0335820895522387, | |
| "grad_norm": 0.13716775178909302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377839207649231, | |
| "mean_token_accuracy": 0.77959144115448, | |
| "num_tokens": 4537306.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5343386679887772, | |
| "epoch": 1.037313432835821, | |
| "grad_norm": 0.12250324338674545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346370935440063, | |
| "mean_token_accuracy": 0.7819696217775345, | |
| "num_tokens": 4553694.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5221862643957138, | |
| "epoch": 1.041044776119403, | |
| "grad_norm": 0.14083418250083923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5204699039459229, | |
| "mean_token_accuracy": 0.7915231883525848, | |
| "num_tokens": 4569929.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5506787896156311, | |
| "epoch": 1.044776119402985, | |
| "grad_norm": 0.11459501832723618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497503280639648, | |
| "mean_token_accuracy": 0.7762598097324371, | |
| "num_tokens": 4586327.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5387643724679947, | |
| "epoch": 1.0485074626865671, | |
| "grad_norm": 0.1149069145321846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536687970161438, | |
| "mean_token_accuracy": 0.7849635928869247, | |
| "num_tokens": 4602577.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5402974784374237, | |
| "epoch": 1.0522388059701493, | |
| "grad_norm": 0.13960953056812286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357297658920288, | |
| "mean_token_accuracy": 0.782235711812973, | |
| "num_tokens": 4618829.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5379159897565842, | |
| "epoch": 1.0559701492537314, | |
| "grad_norm": 0.12440282106399536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391443967819214, | |
| "mean_token_accuracy": 0.7829291224479675, | |
| "num_tokens": 4635167.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.5129481852054596, | |
| "epoch": 1.0597014925373134, | |
| "grad_norm": 0.13519050180912018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5105025768280029, | |
| "mean_token_accuracy": 0.7926614433526993, | |
| "num_tokens": 4651165.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5542086809873581, | |
| "epoch": 1.0634328358208955, | |
| "grad_norm": 0.14323101937770844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622052550315857, | |
| "mean_token_accuracy": 0.7727599292993546, | |
| "num_tokens": 4667347.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5243228375911713, | |
| "epoch": 1.0671641791044777, | |
| "grad_norm": 0.1330215483903885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247523188591003, | |
| "mean_token_accuracy": 0.7867335379123688, | |
| "num_tokens": 4684015.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5412201136350632, | |
| "epoch": 1.0708955223880596, | |
| "grad_norm": 0.13448479771614075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54647296667099, | |
| "mean_token_accuracy": 0.7774277031421661, | |
| "num_tokens": 4700242.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5454149097204208, | |
| "epoch": 1.0746268656716418, | |
| "grad_norm": 0.13259278237819672, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461288690567017, | |
| "mean_token_accuracy": 0.7782861590385437, | |
| "num_tokens": 4716442.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.526309534907341, | |
| "epoch": 1.078358208955224, | |
| "grad_norm": 0.12522561848163605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221973061561584, | |
| "mean_token_accuracy": 0.789994552731514, | |
| "num_tokens": 4732742.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5411332100629807, | |
| "epoch": 1.0820895522388059, | |
| "grad_norm": 0.12081784009933472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372704863548279, | |
| "mean_token_accuracy": 0.7822500914335251, | |
| "num_tokens": 4749084.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5575008988380432, | |
| "epoch": 1.085820895522388, | |
| "grad_norm": 0.11303576827049255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508702397346497, | |
| "mean_token_accuracy": 0.7754259258508682, | |
| "num_tokens": 4765562.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5357666164636612, | |
| "epoch": 1.0895522388059702, | |
| "grad_norm": 0.12666599452495575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432624220848083, | |
| "mean_token_accuracy": 0.7804068475961685, | |
| "num_tokens": 4781995.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5331733524799347, | |
| "epoch": 1.0932835820895523, | |
| "grad_norm": 0.12246809899806976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331196784973145, | |
| "mean_token_accuracy": 0.7823672741651535, | |
| "num_tokens": 4798355.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.531685009598732, | |
| "epoch": 1.0970149253731343, | |
| "grad_norm": 0.12172231823205948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5293748378753662, | |
| "mean_token_accuracy": 0.7843722105026245, | |
| "num_tokens": 4814357.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.554166242480278, | |
| "epoch": 1.1007462686567164, | |
| "grad_norm": 0.14191463589668274, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532712936401367, | |
| "mean_token_accuracy": 0.7733844220638275, | |
| "num_tokens": 4830954.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5282094776630402, | |
| "epoch": 1.1044776119402986, | |
| "grad_norm": 0.14205436408519745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530907392501831, | |
| "mean_token_accuracy": 0.7830108106136322, | |
| "num_tokens": 4847654.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5379532426595688, | |
| "epoch": 1.1082089552238805, | |
| "grad_norm": 0.12750715017318726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367629528045654, | |
| "mean_token_accuracy": 0.7796261459589005, | |
| "num_tokens": 4864209.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5312085449695587, | |
| "epoch": 1.1119402985074627, | |
| "grad_norm": 0.11801420152187347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278028845787048, | |
| "mean_token_accuracy": 0.7856296449899673, | |
| "num_tokens": 4880489.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5340657457709312, | |
| "epoch": 1.1156716417910448, | |
| "grad_norm": 0.1341157853603363, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332481265068054, | |
| "mean_token_accuracy": 0.7815297544002533, | |
| "num_tokens": 4897040.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5495938658714294, | |
| "epoch": 1.1194029850746268, | |
| "grad_norm": 0.15130798518657684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522593855857849, | |
| "mean_token_accuracy": 0.7767154276371002, | |
| "num_tokens": 4913499.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5539788007736206, | |
| "epoch": 1.123134328358209, | |
| "grad_norm": 0.16235828399658203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556696891784668, | |
| "mean_token_accuracy": 0.7743791192770004, | |
| "num_tokens": 4930129.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5188294276595116, | |
| "epoch": 1.126865671641791, | |
| "grad_norm": 0.15251989662647247, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5240339040756226, | |
| "mean_token_accuracy": 0.7848995476961136, | |
| "num_tokens": 4946505.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.5330336540937424, | |
| "epoch": 1.1305970149253732, | |
| "grad_norm": 0.12010055035352707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530551552772522, | |
| "mean_token_accuracy": 0.7852707505226135, | |
| "num_tokens": 4963130.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5485537797212601, | |
| "epoch": 1.1343283582089552, | |
| "grad_norm": 0.12690100073814392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5355115532875061, | |
| "mean_token_accuracy": 0.7832664847373962, | |
| "num_tokens": 4979396.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5363626033067703, | |
| "epoch": 1.1380597014925373, | |
| "grad_norm": 0.12670499086380005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318777561187744, | |
| "mean_token_accuracy": 0.7821652144193649, | |
| "num_tokens": 4995808.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.556913822889328, | |
| "epoch": 1.1417910447761195, | |
| "grad_norm": 0.1417754739522934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632070899009705, | |
| "mean_token_accuracy": 0.7711838483810425, | |
| "num_tokens": 5012247.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.531732589006424, | |
| "epoch": 1.1455223880597014, | |
| "grad_norm": 0.12725508213043213, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370599627494812, | |
| "mean_token_accuracy": 0.7827656418085098, | |
| "num_tokens": 5028592.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5216507539153099, | |
| "epoch": 1.1492537313432836, | |
| "grad_norm": 0.14518076181411743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285972952842712, | |
| "mean_token_accuracy": 0.7866590619087219, | |
| "num_tokens": 5044691.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5357843339443207, | |
| "epoch": 1.1529850746268657, | |
| "grad_norm": 0.14331640303134918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414748191833496, | |
| "mean_token_accuracy": 0.7796436995267868, | |
| "num_tokens": 5060981.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.550069585442543, | |
| "epoch": 1.1567164179104479, | |
| "grad_norm": 0.1419994831085205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494908690452576, | |
| "mean_token_accuracy": 0.774166613817215, | |
| "num_tokens": 5077445.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.5334684997797012, | |
| "epoch": 1.1604477611940298, | |
| "grad_norm": 0.13464997708797455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329424738883972, | |
| "mean_token_accuracy": 0.7852184623479843, | |
| "num_tokens": 5093959.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5384779423475266, | |
| "epoch": 1.164179104477612, | |
| "grad_norm": 0.12344568222761154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393214821815491, | |
| "mean_token_accuracy": 0.783161386847496, | |
| "num_tokens": 5110114.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.566596269607544, | |
| "epoch": 1.1679104477611941, | |
| "grad_norm": 0.13426469266414642, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611933469772339, | |
| "mean_token_accuracy": 0.7707538902759552, | |
| "num_tokens": 5126500.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5522208511829376, | |
| "epoch": 1.171641791044776, | |
| "grad_norm": 0.11628863960504532, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544135332107544, | |
| "mean_token_accuracy": 0.7789785116910934, | |
| "num_tokens": 5143003.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5286403447389603, | |
| "epoch": 1.1753731343283582, | |
| "grad_norm": 0.1331920623779297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5280863046646118, | |
| "mean_token_accuracy": 0.7847232520580292, | |
| "num_tokens": 5159209.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5208230093121529, | |
| "epoch": 1.1791044776119404, | |
| "grad_norm": 0.16730330884456635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261422395706177, | |
| "mean_token_accuracy": 0.7885824292898178, | |
| "num_tokens": 5175336.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5139501839876175, | |
| "epoch": 1.1828358208955223, | |
| "grad_norm": 0.17113769054412842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231570601463318, | |
| "mean_token_accuracy": 0.7852117121219635, | |
| "num_tokens": 5191589.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5446046590805054, | |
| "epoch": 1.1865671641791045, | |
| "grad_norm": 0.13907761871814728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399054288864136, | |
| "mean_token_accuracy": 0.7820506691932678, | |
| "num_tokens": 5207939.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5267596393823624, | |
| "epoch": 1.1902985074626866, | |
| "grad_norm": 0.1434536576271057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265440344810486, | |
| "mean_token_accuracy": 0.7849590480327606, | |
| "num_tokens": 5224274.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5274358987808228, | |
| "epoch": 1.1940298507462686, | |
| "grad_norm": 0.1331617832183838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5201226472854614, | |
| "mean_token_accuracy": 0.7877639383077621, | |
| "num_tokens": 5240488.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5438350588083267, | |
| "epoch": 1.1977611940298507, | |
| "grad_norm": 0.13051791489124298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417760610580444, | |
| "mean_token_accuracy": 0.7801128923892975, | |
| "num_tokens": 5256913.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5419559478759766, | |
| "epoch": 1.2014925373134329, | |
| "grad_norm": 0.1651846319437027, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418766140937805, | |
| "mean_token_accuracy": 0.78228460252285, | |
| "num_tokens": 5273335.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5415368527173996, | |
| "epoch": 1.205223880597015, | |
| "grad_norm": 0.16951487958431244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506861209869385, | |
| "mean_token_accuracy": 0.7753586024045944, | |
| "num_tokens": 5289759.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5358785539865494, | |
| "epoch": 1.208955223880597, | |
| "grad_norm": 0.1276499480009079, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536015510559082, | |
| "mean_token_accuracy": 0.7820306271314621, | |
| "num_tokens": 5305982.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5399276316165924, | |
| "epoch": 1.212686567164179, | |
| "grad_norm": 0.13910017907619476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390846133232117, | |
| "mean_token_accuracy": 0.7822140157222748, | |
| "num_tokens": 5322089.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.54273721575737, | |
| "epoch": 1.2164179104477613, | |
| "grad_norm": 0.14252571761608124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544661283493042, | |
| "mean_token_accuracy": 0.7795404642820358, | |
| "num_tokens": 5338453.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5249434560537338, | |
| "epoch": 1.2201492537313432, | |
| "grad_norm": 0.1477581411600113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5217203497886658, | |
| "mean_token_accuracy": 0.7876597344875336, | |
| "num_tokens": 5354700.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5396385788917542, | |
| "epoch": 1.2238805970149254, | |
| "grad_norm": 0.14778634905815125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354180335998535, | |
| "mean_token_accuracy": 0.7824464589357376, | |
| "num_tokens": 5371063.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5529858469963074, | |
| "epoch": 1.2276119402985075, | |
| "grad_norm": 0.13042840361595154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544819831848145, | |
| "mean_token_accuracy": 0.7761342972517014, | |
| "num_tokens": 5387332.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5454379618167877, | |
| "epoch": 1.2313432835820897, | |
| "grad_norm": 0.15361081063747406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482691526412964, | |
| "mean_token_accuracy": 0.7785263955593109, | |
| "num_tokens": 5403888.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5411872565746307, | |
| "epoch": 1.2350746268656716, | |
| "grad_norm": 0.1457548439502716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460063219070435, | |
| "mean_token_accuracy": 0.7781393676996231, | |
| "num_tokens": 5420504.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.5440556704998016, | |
| "epoch": 1.2388059701492538, | |
| "grad_norm": 0.17071455717086792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447981357574463, | |
| "mean_token_accuracy": 0.7792220860719681, | |
| "num_tokens": 5436983.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5312773138284683, | |
| "epoch": 1.242537313432836, | |
| "grad_norm": 0.15535041689872742, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284558534622192, | |
| "mean_token_accuracy": 0.7843498289585114, | |
| "num_tokens": 5453439.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5413801819086075, | |
| "epoch": 1.2462686567164178, | |
| "grad_norm": 0.12389594316482544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376867651939392, | |
| "mean_token_accuracy": 0.7829112410545349, | |
| "num_tokens": 5470171.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.5580787807703018, | |
| "epoch": 1.25, | |
| "grad_norm": 0.15255525708198547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539383292198181, | |
| "mean_token_accuracy": 0.7776496410369873, | |
| "num_tokens": 5486721.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.551739051938057, | |
| "epoch": 1.2537313432835822, | |
| "grad_norm": 0.14014676213264465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544667840003967, | |
| "mean_token_accuracy": 0.7750911116600037, | |
| "num_tokens": 5502822.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.5480811297893524, | |
| "epoch": 1.2574626865671643, | |
| "grad_norm": 0.1353754997253418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507966876029968, | |
| "mean_token_accuracy": 0.7761414647102356, | |
| "num_tokens": 5519323.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5414211302995682, | |
| "epoch": 1.2611940298507462, | |
| "grad_norm": 0.1243680939078331, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453186631202698, | |
| "mean_token_accuracy": 0.7782161980867386, | |
| "num_tokens": 5535863.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.527251847088337, | |
| "epoch": 1.2649253731343284, | |
| "grad_norm": 0.1459769904613495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396205186843872, | |
| "mean_token_accuracy": 0.7795730829238892, | |
| "num_tokens": 5552171.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5239678472280502, | |
| "epoch": 1.2686567164179103, | |
| "grad_norm": 0.12427864223718643, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271449089050293, | |
| "mean_token_accuracy": 0.7882652282714844, | |
| "num_tokens": 5568175.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.543644979596138, | |
| "epoch": 1.2723880597014925, | |
| "grad_norm": 0.11923787742853165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382894277572632, | |
| "mean_token_accuracy": 0.7825156450271606, | |
| "num_tokens": 5584465.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5515155345201492, | |
| "epoch": 1.2761194029850746, | |
| "grad_norm": 0.11743160337209702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425710082054138, | |
| "mean_token_accuracy": 0.7795869261026382, | |
| "num_tokens": 5601282.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.556594654917717, | |
| "epoch": 1.2798507462686568, | |
| "grad_norm": 0.13206258416175842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553520679473877, | |
| "mean_token_accuracy": 0.7744052857160568, | |
| "num_tokens": 5617511.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5562093108892441, | |
| "epoch": 1.2835820895522387, | |
| "grad_norm": 0.1419561356306076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573539733886719, | |
| "mean_token_accuracy": 0.7758442610502243, | |
| "num_tokens": 5634008.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5295949876308441, | |
| "epoch": 1.287313432835821, | |
| "grad_norm": 0.136697456240654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536439836025238, | |
| "mean_token_accuracy": 0.7857220619916916, | |
| "num_tokens": 5650510.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5379302501678467, | |
| "epoch": 1.291044776119403, | |
| "grad_norm": 0.12953169643878937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420789122581482, | |
| "mean_token_accuracy": 0.7796627283096313, | |
| "num_tokens": 5667049.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5327381789684296, | |
| "epoch": 1.294776119402985, | |
| "grad_norm": 0.12574538588523865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231812000274658, | |
| "mean_token_accuracy": 0.7879898250102997, | |
| "num_tokens": 5683103.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5485990345478058, | |
| "epoch": 1.2985074626865671, | |
| "grad_norm": 0.12788420915603638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398032665252686, | |
| "mean_token_accuracy": 0.782793402671814, | |
| "num_tokens": 5699531.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.533822700381279, | |
| "epoch": 1.3022388059701493, | |
| "grad_norm": 0.12131965160369873, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313589572906494, | |
| "mean_token_accuracy": 0.7867582440376282, | |
| "num_tokens": 5715578.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5322218984365463, | |
| "epoch": 1.3059701492537314, | |
| "grad_norm": 0.13636337220668793, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401290655136108, | |
| "mean_token_accuracy": 0.781011775135994, | |
| "num_tokens": 5731885.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5119979977607727, | |
| "epoch": 1.3097014925373134, | |
| "grad_norm": 0.1538715660572052, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5197798013687134, | |
| "mean_token_accuracy": 0.787521705031395, | |
| "num_tokens": 5748165.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.522780068218708, | |
| "epoch": 1.3134328358208955, | |
| "grad_norm": 0.16598650813102722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323340892791748, | |
| "mean_token_accuracy": 0.7844688296318054, | |
| "num_tokens": 5764530.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5400198400020599, | |
| "epoch": 1.3171641791044777, | |
| "grad_norm": 0.13400353491306305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443472266197205, | |
| "mean_token_accuracy": 0.7780963182449341, | |
| "num_tokens": 5780899.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.556030884385109, | |
| "epoch": 1.3208955223880596, | |
| "grad_norm": 0.13756664097309113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470365285873413, | |
| "mean_token_accuracy": 0.7808873951435089, | |
| "num_tokens": 5796973.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5455010533332825, | |
| "epoch": 1.3246268656716418, | |
| "grad_norm": 0.17140203714370728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534233808517456, | |
| "mean_token_accuracy": 0.7828006148338318, | |
| "num_tokens": 5813201.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5456499308347702, | |
| "epoch": 1.328358208955224, | |
| "grad_norm": 0.13772569596767426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461813807487488, | |
| "mean_token_accuracy": 0.7786128669977188, | |
| "num_tokens": 5829457.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5223972797393799, | |
| "epoch": 1.332089552238806, | |
| "grad_norm": 0.22252066433429718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330066084861755, | |
| "mean_token_accuracy": 0.7818692922592163, | |
| "num_tokens": 5845786.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.5292713642120361, | |
| "epoch": 1.335820895522388, | |
| "grad_norm": 0.14202645421028137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392715930938721, | |
| "mean_token_accuracy": 0.7805515229701996, | |
| "num_tokens": 5862226.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5300968736410141, | |
| "epoch": 1.3395522388059702, | |
| "grad_norm": 0.18332785367965698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347115993499756, | |
| "mean_token_accuracy": 0.7835317403078079, | |
| "num_tokens": 5878683.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.5431934744119644, | |
| "epoch": 1.3432835820895521, | |
| "grad_norm": 0.14532189071178436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330429077148438, | |
| "mean_token_accuracy": 0.7804477661848068, | |
| "num_tokens": 5895049.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5435428023338318, | |
| "epoch": 1.3470149253731343, | |
| "grad_norm": 0.1675368696451187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300995707511902, | |
| "mean_token_accuracy": 0.785721018910408, | |
| "num_tokens": 5911501.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5362260937690735, | |
| "epoch": 1.3507462686567164, | |
| "grad_norm": 0.12240255624055862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256680846214294, | |
| "mean_token_accuracy": 0.7851513922214508, | |
| "num_tokens": 5927731.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5452938824892044, | |
| "epoch": 1.3544776119402986, | |
| "grad_norm": 0.15949903428554535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495162010192871, | |
| "mean_token_accuracy": 0.7768245339393616, | |
| "num_tokens": 5944077.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5237463638186455, | |
| "epoch": 1.3582089552238805, | |
| "grad_norm": 0.2120627760887146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346443057060242, | |
| "mean_token_accuracy": 0.7835520654916763, | |
| "num_tokens": 5960532.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5450356751680374, | |
| "epoch": 1.3619402985074627, | |
| "grad_norm": 0.12423616647720337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510310530662537, | |
| "mean_token_accuracy": 0.7749469876289368, | |
| "num_tokens": 5976893.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5489538311958313, | |
| "epoch": 1.3656716417910448, | |
| "grad_norm": 0.17930445075035095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512227416038513, | |
| "mean_token_accuracy": 0.7759018093347549, | |
| "num_tokens": 5993262.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5524207949638367, | |
| "epoch": 1.3694029850746268, | |
| "grad_norm": 0.12074736505746841, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450834631919861, | |
| "mean_token_accuracy": 0.7803297787904739, | |
| "num_tokens": 6009831.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5440987944602966, | |
| "epoch": 1.373134328358209, | |
| "grad_norm": 0.13452184200286865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378953814506531, | |
| "mean_token_accuracy": 0.7820150256156921, | |
| "num_tokens": 6026331.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5413002520799637, | |
| "epoch": 1.376865671641791, | |
| "grad_norm": 0.1278562843799591, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359137654304504, | |
| "mean_token_accuracy": 0.783556342124939, | |
| "num_tokens": 6042945.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5525120049715042, | |
| "epoch": 1.3805970149253732, | |
| "grad_norm": 0.1208810955286026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459328889846802, | |
| "mean_token_accuracy": 0.7781365811824799, | |
| "num_tokens": 6059427.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5276467949151993, | |
| "epoch": 1.3843283582089552, | |
| "grad_norm": 0.21167868375778198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329975485801697, | |
| "mean_token_accuracy": 0.7855836153030396, | |
| "num_tokens": 6075868.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.523284301161766, | |
| "epoch": 1.3880597014925373, | |
| "grad_norm": 0.13116827607154846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309988260269165, | |
| "mean_token_accuracy": 0.7828356921672821, | |
| "num_tokens": 6092149.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5434711575508118, | |
| "epoch": 1.3917910447761195, | |
| "grad_norm": 0.3316550850868225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553439199924469, | |
| "mean_token_accuracy": 0.7766979038715363, | |
| "num_tokens": 6108567.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5287135094404221, | |
| "epoch": 1.3955223880597014, | |
| "grad_norm": 0.15037605166435242, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357441306114197, | |
| "mean_token_accuracy": 0.7817093282938004, | |
| "num_tokens": 6124527.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5508522838354111, | |
| "epoch": 1.3992537313432836, | |
| "grad_norm": 0.19524440169334412, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512291789054871, | |
| "mean_token_accuracy": 0.7776720374822617, | |
| "num_tokens": 6141075.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5336653590202332, | |
| "epoch": 1.4029850746268657, | |
| "grad_norm": 0.15542961657047272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334641933441162, | |
| "mean_token_accuracy": 0.7813901156187057, | |
| "num_tokens": 6157438.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5536468476057053, | |
| "epoch": 1.4067164179104479, | |
| "grad_norm": 0.11985230445861816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497922301292419, | |
| "mean_token_accuracy": 0.7766197621822357, | |
| "num_tokens": 6174052.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5455610156059265, | |
| "epoch": 1.4104477611940298, | |
| "grad_norm": 0.1377374231815338, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400494337081909, | |
| "mean_token_accuracy": 0.7812647223472595, | |
| "num_tokens": 6190741.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5355032831430435, | |
| "epoch": 1.414179104477612, | |
| "grad_norm": 0.12337534874677658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313869118690491, | |
| "mean_token_accuracy": 0.7843705862760544, | |
| "num_tokens": 6207346.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5320865362882614, | |
| "epoch": 1.417910447761194, | |
| "grad_norm": 0.1453101485967636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400369167327881, | |
| "mean_token_accuracy": 0.7805843502283096, | |
| "num_tokens": 6223644.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5373547524213791, | |
| "epoch": 1.421641791044776, | |
| "grad_norm": 0.19084329903125763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499929785728455, | |
| "mean_token_accuracy": 0.7757923603057861, | |
| "num_tokens": 6239901.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5443465709686279, | |
| "epoch": 1.4253731343283582, | |
| "grad_norm": 0.11772217601537704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418881773948669, | |
| "mean_token_accuracy": 0.7812986522912979, | |
| "num_tokens": 6256285.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5499950498342514, | |
| "epoch": 1.4291044776119404, | |
| "grad_norm": 0.1847136914730072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488113760948181, | |
| "mean_token_accuracy": 0.7776869833469391, | |
| "num_tokens": 6272664.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5412472188472748, | |
| "epoch": 1.4328358208955223, | |
| "grad_norm": 0.1461949199438095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365965366363525, | |
| "mean_token_accuracy": 0.7832726240158081, | |
| "num_tokens": 6289098.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5493346899747849, | |
| "epoch": 1.4365671641791045, | |
| "grad_norm": 0.17751483619213104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465101003646851, | |
| "mean_token_accuracy": 0.7778099924325943, | |
| "num_tokens": 6305547.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5415252298116684, | |
| "epoch": 1.4402985074626866, | |
| "grad_norm": 0.13513009250164032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538934588432312, | |
| "mean_token_accuracy": 0.7832966297864914, | |
| "num_tokens": 6321844.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5470823347568512, | |
| "epoch": 1.4440298507462686, | |
| "grad_norm": 0.15616844594478607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563836097717285, | |
| "mean_token_accuracy": 0.7730062156915665, | |
| "num_tokens": 6338401.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5151138752698898, | |
| "epoch": 1.4477611940298507, | |
| "grad_norm": 0.13514217734336853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200275182723999, | |
| "mean_token_accuracy": 0.7898600101470947, | |
| "num_tokens": 6354762.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.5174058377742767, | |
| "epoch": 1.4514925373134329, | |
| "grad_norm": 0.13703469932079315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5161208510398865, | |
| "mean_token_accuracy": 0.7918747067451477, | |
| "num_tokens": 6370840.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.5557476729154587, | |
| "epoch": 1.455223880597015, | |
| "grad_norm": 0.11840767413377762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515946745872498, | |
| "mean_token_accuracy": 0.7783915251493454, | |
| "num_tokens": 6387355.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5518558323383331, | |
| "epoch": 1.458955223880597, | |
| "grad_norm": 0.13202938437461853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526413321495056, | |
| "mean_token_accuracy": 0.776582270860672, | |
| "num_tokens": 6403938.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5571378320455551, | |
| "epoch": 1.462686567164179, | |
| "grad_norm": 0.13269183039665222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643842220306396, | |
| "mean_token_accuracy": 0.7722982317209244, | |
| "num_tokens": 6420250.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5537096560001373, | |
| "epoch": 1.4664179104477613, | |
| "grad_norm": 0.14151525497436523, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553024411201477, | |
| "mean_token_accuracy": 0.7778746634721756, | |
| "num_tokens": 6436546.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5346309244632721, | |
| "epoch": 1.4701492537313432, | |
| "grad_norm": 0.13563434779644012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5249274969100952, | |
| "mean_token_accuracy": 0.7853583991527557, | |
| "num_tokens": 6453243.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5460333377122879, | |
| "epoch": 1.4738805970149254, | |
| "grad_norm": 0.14244568347930908, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472844243049622, | |
| "mean_token_accuracy": 0.7797000557184219, | |
| "num_tokens": 6469565.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5330733209848404, | |
| "epoch": 1.4776119402985075, | |
| "grad_norm": 0.15417160093784332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538681149482727, | |
| "mean_token_accuracy": 0.7821140140295029, | |
| "num_tokens": 6486038.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5275893434882164, | |
| "epoch": 1.4813432835820897, | |
| "grad_norm": 0.1634518802165985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361412167549133, | |
| "mean_token_accuracy": 0.7828765362501144, | |
| "num_tokens": 6502376.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5401307940483093, | |
| "epoch": 1.4850746268656716, | |
| "grad_norm": 0.14567126333713531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489403605461121, | |
| "mean_token_accuracy": 0.7781455963850021, | |
| "num_tokens": 6518668.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5669757276773453, | |
| "epoch": 1.4888059701492538, | |
| "grad_norm": 0.1354297697544098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5657601356506348, | |
| "mean_token_accuracy": 0.7712653428316116, | |
| "num_tokens": 6535182.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.5363806635141373, | |
| "epoch": 1.4925373134328357, | |
| "grad_norm": 0.12377993017435074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529585599899292, | |
| "mean_token_accuracy": 0.7840481698513031, | |
| "num_tokens": 6551666.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5551501959562302, | |
| "epoch": 1.4962686567164178, | |
| "grad_norm": 0.14788372814655304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553497314453125, | |
| "mean_token_accuracy": 0.7757378667593002, | |
| "num_tokens": 6568256.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5353442132472992, | |
| "epoch": 1.5, | |
| "grad_norm": 0.12778371572494507, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5333885550498962, | |
| "mean_token_accuracy": 0.7825479656457901, | |
| "num_tokens": 6584443.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5460584759712219, | |
| "epoch": 1.5037313432835822, | |
| "grad_norm": 0.1357504278421402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496041774749756, | |
| "mean_token_accuracy": 0.7750886082649231, | |
| "num_tokens": 6600907.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5397640466690063, | |
| "epoch": 1.5074626865671643, | |
| "grad_norm": 0.13449276983737946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374521017074585, | |
| "mean_token_accuracy": 0.783362939953804, | |
| "num_tokens": 6617309.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.545674204826355, | |
| "epoch": 1.5111940298507462, | |
| "grad_norm": 0.12818823754787445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414538383483887, | |
| "mean_token_accuracy": 0.7811758369207382, | |
| "num_tokens": 6633409.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5237551480531693, | |
| "epoch": 1.5149253731343284, | |
| "grad_norm": 0.1332634538412094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288904905319214, | |
| "mean_token_accuracy": 0.7863495498895645, | |
| "num_tokens": 6649677.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5475018620491028, | |
| "epoch": 1.5186567164179103, | |
| "grad_norm": 0.1226048395037651, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457717180252075, | |
| "mean_token_accuracy": 0.7798316031694412, | |
| "num_tokens": 6665941.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5388360321521759, | |
| "epoch": 1.5223880597014925, | |
| "grad_norm": 0.11307930946350098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332959294319153, | |
| "mean_token_accuracy": 0.7827007919549942, | |
| "num_tokens": 6682727.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5245520323514938, | |
| "epoch": 1.5261194029850746, | |
| "grad_norm": 0.13594341278076172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527988851070404, | |
| "mean_token_accuracy": 0.7841480374336243, | |
| "num_tokens": 6699061.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5443517565727234, | |
| "epoch": 1.5298507462686568, | |
| "grad_norm": 0.12875105440616608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445384979248047, | |
| "mean_token_accuracy": 0.7800036072731018, | |
| "num_tokens": 6715276.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5312410593032837, | |
| "epoch": 1.533582089552239, | |
| "grad_norm": 0.14251653850078583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363666415214539, | |
| "mean_token_accuracy": 0.7820229083299637, | |
| "num_tokens": 6731754.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5279273837804794, | |
| "epoch": 1.537313432835821, | |
| "grad_norm": 0.14002381265163422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533150851726532, | |
| "mean_token_accuracy": 0.7839628010988235, | |
| "num_tokens": 6748198.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5359641313552856, | |
| "epoch": 1.5410447761194028, | |
| "grad_norm": 0.12248595803976059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377635359764099, | |
| "mean_token_accuracy": 0.7816402763128281, | |
| "num_tokens": 6764658.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5304668098688126, | |
| "epoch": 1.544776119402985, | |
| "grad_norm": 0.1455898880958557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527800440788269, | |
| "mean_token_accuracy": 0.7847253680229187, | |
| "num_tokens": 6780948.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5399336069822311, | |
| "epoch": 1.5485074626865671, | |
| "grad_norm": 0.1414983719587326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367389917373657, | |
| "mean_token_accuracy": 0.7821487188339233, | |
| "num_tokens": 6797350.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5576040744781494, | |
| "epoch": 1.5522388059701493, | |
| "grad_norm": 0.12719132006168365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524293780326843, | |
| "mean_token_accuracy": 0.7746585160493851, | |
| "num_tokens": 6813754.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5370134860277176, | |
| "epoch": 1.5559701492537314, | |
| "grad_norm": 0.1307905912399292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359637141227722, | |
| "mean_token_accuracy": 0.7802634984254837, | |
| "num_tokens": 6829931.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5672536343336105, | |
| "epoch": 1.5597014925373134, | |
| "grad_norm": 0.14925286173820496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706211924552917, | |
| "mean_token_accuracy": 0.7692793905735016, | |
| "num_tokens": 6846619.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5455258339643478, | |
| "epoch": 1.5634328358208955, | |
| "grad_norm": 0.13767075538635254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497614145278931, | |
| "mean_token_accuracy": 0.7742694765329361, | |
| "num_tokens": 6862943.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5383682698011398, | |
| "epoch": 1.5671641791044775, | |
| "grad_norm": 0.14676761627197266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352654457092285, | |
| "mean_token_accuracy": 0.7820954322814941, | |
| "num_tokens": 6879478.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5393406301736832, | |
| "epoch": 1.5708955223880596, | |
| "grad_norm": 0.14782963693141937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539406418800354, | |
| "mean_token_accuracy": 0.7811137288808823, | |
| "num_tokens": 6895819.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5472134947776794, | |
| "epoch": 1.5746268656716418, | |
| "grad_norm": 0.1328146755695343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461377501487732, | |
| "mean_token_accuracy": 0.7797697186470032, | |
| "num_tokens": 6912305.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5397001504898071, | |
| "epoch": 1.578358208955224, | |
| "grad_norm": 0.12005209177732468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396695137023926, | |
| "mean_token_accuracy": 0.7789896428585052, | |
| "num_tokens": 6928851.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5323083251714706, | |
| "epoch": 1.582089552238806, | |
| "grad_norm": 0.14206735789775848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357058048248291, | |
| "mean_token_accuracy": 0.7814851403236389, | |
| "num_tokens": 6945117.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5220139473676682, | |
| "epoch": 1.585820895522388, | |
| "grad_norm": 0.13408760726451874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282811522483826, | |
| "mean_token_accuracy": 0.7859802693128586, | |
| "num_tokens": 6961475.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5279606133699417, | |
| "epoch": 1.5895522388059702, | |
| "grad_norm": 0.1342962682247162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310772061347961, | |
| "mean_token_accuracy": 0.7856840938329697, | |
| "num_tokens": 6977917.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.5404426008462906, | |
| "epoch": 1.5932835820895521, | |
| "grad_norm": 0.11640056222677231, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350806713104248, | |
| "mean_token_accuracy": 0.7831773906946182, | |
| "num_tokens": 6994309.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.546152800321579, | |
| "epoch": 1.5970149253731343, | |
| "grad_norm": 0.11648745834827423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432876348495483, | |
| "mean_token_accuracy": 0.7806773632764816, | |
| "num_tokens": 7010651.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5330662578344345, | |
| "epoch": 1.6007462686567164, | |
| "grad_norm": 0.1201220154762268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310200452804565, | |
| "mean_token_accuracy": 0.7844978868961334, | |
| "num_tokens": 7027129.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5318699181079865, | |
| "epoch": 1.6044776119402986, | |
| "grad_norm": 0.12328798323869705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332854986190796, | |
| "mean_token_accuracy": 0.7820296734571457, | |
| "num_tokens": 7043492.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5330018848180771, | |
| "epoch": 1.6082089552238807, | |
| "grad_norm": 0.1538732498884201, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346086621284485, | |
| "mean_token_accuracy": 0.7841860204935074, | |
| "num_tokens": 7059825.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5369807183742523, | |
| "epoch": 1.6119402985074627, | |
| "grad_norm": 0.13523033261299133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543128490447998, | |
| "mean_token_accuracy": 0.779476061463356, | |
| "num_tokens": 7076083.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5597919672727585, | |
| "epoch": 1.6156716417910446, | |
| "grad_norm": 0.13593490421772003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56092369556427, | |
| "mean_token_accuracy": 0.7705628126859665, | |
| "num_tokens": 7092494.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5592869371175766, | |
| "epoch": 1.6194029850746268, | |
| "grad_norm": 0.13970784842967987, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588337182998657, | |
| "mean_token_accuracy": 0.7716414630413055, | |
| "num_tokens": 7108787.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5510755926370621, | |
| "epoch": 1.623134328358209, | |
| "grad_norm": 0.14515163004398346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508431792259216, | |
| "mean_token_accuracy": 0.7757678478956223, | |
| "num_tokens": 7125326.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5493544340133667, | |
| "epoch": 1.626865671641791, | |
| "grad_norm": 0.13484683632850647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357339382171631, | |
| "mean_token_accuracy": 0.7844331711530685, | |
| "num_tokens": 7141623.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5371888130903244, | |
| "epoch": 1.6305970149253732, | |
| "grad_norm": 0.12795639038085938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337157249450684, | |
| "mean_token_accuracy": 0.7853695005178452, | |
| "num_tokens": 7158003.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5294598788022995, | |
| "epoch": 1.6343283582089554, | |
| "grad_norm": 0.13173329830169678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329991579055786, | |
| "mean_token_accuracy": 0.7873143553733826, | |
| "num_tokens": 7174417.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5183067172765732, | |
| "epoch": 1.6380597014925373, | |
| "grad_norm": 0.14890097081661224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276235938072205, | |
| "mean_token_accuracy": 0.7841698378324509, | |
| "num_tokens": 7190789.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5212598145008087, | |
| "epoch": 1.6417910447761193, | |
| "grad_norm": 0.1251063346862793, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228430032730103, | |
| "mean_token_accuracy": 0.7859450131654739, | |
| "num_tokens": 7207139.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5322405844926834, | |
| "epoch": 1.6455223880597014, | |
| "grad_norm": 0.13600069284439087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263532996177673, | |
| "mean_token_accuracy": 0.7853893488645554, | |
| "num_tokens": 7223453.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5205891877412796, | |
| "epoch": 1.6492537313432836, | |
| "grad_norm": 0.13653913140296936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5208824872970581, | |
| "mean_token_accuracy": 0.7881260365247726, | |
| "num_tokens": 7240006.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5441347062587738, | |
| "epoch": 1.6529850746268657, | |
| "grad_norm": 0.14450038969516754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436342358589172, | |
| "mean_token_accuracy": 0.7799146473407745, | |
| "num_tokens": 7256390.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5312005802989006, | |
| "epoch": 1.6567164179104479, | |
| "grad_norm": 0.12901286780834198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335438847541809, | |
| "mean_token_accuracy": 0.78382308781147, | |
| "num_tokens": 7272830.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5523424595594406, | |
| "epoch": 1.6604477611940298, | |
| "grad_norm": 0.13704852759838104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541114807128906, | |
| "mean_token_accuracy": 0.7756187319755554, | |
| "num_tokens": 7289085.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5462750494480133, | |
| "epoch": 1.664179104477612, | |
| "grad_norm": 0.1385122686624527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408669114112854, | |
| "mean_token_accuracy": 0.7794688045978546, | |
| "num_tokens": 7305251.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5703910887241364, | |
| "epoch": 1.667910447761194, | |
| "grad_norm": 0.12344513088464737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5666346549987793, | |
| "mean_token_accuracy": 0.7705821841955185, | |
| "num_tokens": 7321796.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5504626631736755, | |
| "epoch": 1.671641791044776, | |
| "grad_norm": 0.12487871944904327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492321848869324, | |
| "mean_token_accuracy": 0.7753137797117233, | |
| "num_tokens": 7338182.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5314936190843582, | |
| "epoch": 1.6753731343283582, | |
| "grad_norm": 0.1390916407108307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342849493026733, | |
| "mean_token_accuracy": 0.7855862826108932, | |
| "num_tokens": 7354707.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5125585347414017, | |
| "epoch": 1.6791044776119404, | |
| "grad_norm": 0.13132618367671967, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5202143788337708, | |
| "mean_token_accuracy": 0.7874000519514084, | |
| "num_tokens": 7370797.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5190107151865959, | |
| "epoch": 1.6828358208955225, | |
| "grad_norm": 0.15053601562976837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5218467116355896, | |
| "mean_token_accuracy": 0.7879750281572342, | |
| "num_tokens": 7387448.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5473128408193588, | |
| "epoch": 1.6865671641791045, | |
| "grad_norm": 0.14291800558567047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459562540054321, | |
| "mean_token_accuracy": 0.7800840735435486, | |
| "num_tokens": 7403768.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5372306257486343, | |
| "epoch": 1.6902985074626866, | |
| "grad_norm": 0.14737331867218018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391932725906372, | |
| "mean_token_accuracy": 0.7811848223209381, | |
| "num_tokens": 7420197.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5366326868534088, | |
| "epoch": 1.6940298507462686, | |
| "grad_norm": 0.13737186789512634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392562747001648, | |
| "mean_token_accuracy": 0.7824465036392212, | |
| "num_tokens": 7436532.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5506515055894852, | |
| "epoch": 1.6977611940298507, | |
| "grad_norm": 0.15034589171409607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501772165298462, | |
| "mean_token_accuracy": 0.7773263603448868, | |
| "num_tokens": 7452842.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5643105208873749, | |
| "epoch": 1.7014925373134329, | |
| "grad_norm": 0.14214570820331573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492639541625977, | |
| "mean_token_accuracy": 0.7783908396959305, | |
| "num_tokens": 7469451.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5516497120261192, | |
| "epoch": 1.705223880597015, | |
| "grad_norm": 0.14590683579444885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515267252922058, | |
| "mean_token_accuracy": 0.774686187505722, | |
| "num_tokens": 7485822.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5483950823545456, | |
| "epoch": 1.7089552238805972, | |
| "grad_norm": 0.15629805624485016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422750115394592, | |
| "mean_token_accuracy": 0.7802471369504929, | |
| "num_tokens": 7502363.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5315360128879547, | |
| "epoch": 1.712686567164179, | |
| "grad_norm": 0.15466850996017456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331098437309265, | |
| "mean_token_accuracy": 0.7842396944761276, | |
| "num_tokens": 7518672.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5366538316011429, | |
| "epoch": 1.716417910447761, | |
| "grad_norm": 0.15616163611412048, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455700755119324, | |
| "mean_token_accuracy": 0.7823781222105026, | |
| "num_tokens": 7534957.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5233009159564972, | |
| "epoch": 1.7201492537313432, | |
| "grad_norm": 0.1496264487504959, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298243761062622, | |
| "mean_token_accuracy": 0.7823347896337509, | |
| "num_tokens": 7551350.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5345755070447922, | |
| "epoch": 1.7238805970149254, | |
| "grad_norm": 0.15188711881637573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339583158493042, | |
| "mean_token_accuracy": 0.7852912098169327, | |
| "num_tokens": 7567796.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.525611899793148, | |
| "epoch": 1.7276119402985075, | |
| "grad_norm": 0.12338917702436447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274109840393066, | |
| "mean_token_accuracy": 0.7858613133430481, | |
| "num_tokens": 7583895.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5306848883628845, | |
| "epoch": 1.7313432835820897, | |
| "grad_norm": 0.16974470019340515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279258489608765, | |
| "mean_token_accuracy": 0.7865510731935501, | |
| "num_tokens": 7600124.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5408849269151688, | |
| "epoch": 1.7350746268656716, | |
| "grad_norm": 0.12648795545101166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382460951805115, | |
| "mean_token_accuracy": 0.7846677452325821, | |
| "num_tokens": 7616438.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5429423898458481, | |
| "epoch": 1.7388059701492538, | |
| "grad_norm": 0.1650669425725937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549877941608429, | |
| "mean_token_accuracy": 0.7792258560657501, | |
| "num_tokens": 7632788.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5318955481052399, | |
| "epoch": 1.7425373134328357, | |
| "grad_norm": 0.12288089841604233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323612093925476, | |
| "mean_token_accuracy": 0.7859359383583069, | |
| "num_tokens": 7649308.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.548863410949707, | |
| "epoch": 1.7462686567164178, | |
| "grad_norm": 0.1326245218515396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457996129989624, | |
| "mean_token_accuracy": 0.7799065709114075, | |
| "num_tokens": 7665793.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5389255881309509, | |
| "epoch": 1.75, | |
| "grad_norm": 0.12419410794973373, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312763452529907, | |
| "mean_token_accuracy": 0.7822507619857788, | |
| "num_tokens": 7682000.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5358720868825912, | |
| "epoch": 1.7537313432835822, | |
| "grad_norm": 0.13035476207733154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321502685546875, | |
| "mean_token_accuracy": 0.7836209833621979, | |
| "num_tokens": 7698643.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5370121747255325, | |
| "epoch": 1.7574626865671643, | |
| "grad_norm": 0.1549667865037918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385861396789551, | |
| "mean_token_accuracy": 0.7808156907558441, | |
| "num_tokens": 7714815.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5387648344039917, | |
| "epoch": 1.7611940298507462, | |
| "grad_norm": 0.14527052640914917, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470720529556274, | |
| "mean_token_accuracy": 0.7775331288576126, | |
| "num_tokens": 7731250.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5520026981830597, | |
| "epoch": 1.7649253731343284, | |
| "grad_norm": 0.19052588939666748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578737854957581, | |
| "mean_token_accuracy": 0.7744869738817215, | |
| "num_tokens": 7747721.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.5377953052520752, | |
| "epoch": 1.7686567164179103, | |
| "grad_norm": 0.13061052560806274, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413972735404968, | |
| "mean_token_accuracy": 0.7811722010374069, | |
| "num_tokens": 7763904.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5519908219575882, | |
| "epoch": 1.7723880597014925, | |
| "grad_norm": 0.1454058736562729, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414596796035767, | |
| "mean_token_accuracy": 0.7813711762428284, | |
| "num_tokens": 7780581.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5267625749111176, | |
| "epoch": 1.7761194029850746, | |
| "grad_norm": 0.1326485425233841, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5213202834129333, | |
| "mean_token_accuracy": 0.7871652394533157, | |
| "num_tokens": 7796973.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.553408294916153, | |
| "epoch": 1.7798507462686568, | |
| "grad_norm": 0.13312950730323792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529948472976685, | |
| "mean_token_accuracy": 0.7743393182754517, | |
| "num_tokens": 7813279.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.553880587220192, | |
| "epoch": 1.783582089552239, | |
| "grad_norm": 0.16114220023155212, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641807317733765, | |
| "mean_token_accuracy": 0.7722779810428619, | |
| "num_tokens": 7829823.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5241200774908066, | |
| "epoch": 1.787313432835821, | |
| "grad_norm": 0.15040791034698486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346534252166748, | |
| "mean_token_accuracy": 0.7823406606912613, | |
| "num_tokens": 7845983.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5474425554275513, | |
| "epoch": 1.7910447761194028, | |
| "grad_norm": 0.13473069667816162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514643788337708, | |
| "mean_token_accuracy": 0.775032564997673, | |
| "num_tokens": 7862179.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5494029968976974, | |
| "epoch": 1.794776119402985, | |
| "grad_norm": 0.14377883076667786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433907508850098, | |
| "mean_token_accuracy": 0.7781640440225601, | |
| "num_tokens": 7878779.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5409138202667236, | |
| "epoch": 1.7985074626865671, | |
| "grad_norm": 0.14134465157985687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372306704521179, | |
| "mean_token_accuracy": 0.7832998037338257, | |
| "num_tokens": 7895136.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.5516301095485687, | |
| "epoch": 1.8022388059701493, | |
| "grad_norm": 0.13915129005908966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529888272285461, | |
| "mean_token_accuracy": 0.7746001183986664, | |
| "num_tokens": 7911482.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5409607142210007, | |
| "epoch": 1.8059701492537314, | |
| "grad_norm": 0.1552349179983139, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396745204925537, | |
| "mean_token_accuracy": 0.7830557972192764, | |
| "num_tokens": 7927769.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5268412679433823, | |
| "epoch": 1.8097014925373134, | |
| "grad_norm": 0.16648107767105103, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397533178329468, | |
| "mean_token_accuracy": 0.782973125576973, | |
| "num_tokens": 7944237.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5383498221635818, | |
| "epoch": 1.8134328358208955, | |
| "grad_norm": 0.1299259066581726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412971377372742, | |
| "mean_token_accuracy": 0.7789154797792435, | |
| "num_tokens": 7960404.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5497616678476334, | |
| "epoch": 1.8171641791044775, | |
| "grad_norm": 0.1571415513753891, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444965362548828, | |
| "mean_token_accuracy": 0.7790942490100861, | |
| "num_tokens": 7976843.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5411071628332138, | |
| "epoch": 1.8208955223880596, | |
| "grad_norm": 0.12472257018089294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377678275108337, | |
| "mean_token_accuracy": 0.7812906056642532, | |
| "num_tokens": 7993308.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.5332149565219879, | |
| "epoch": 1.8246268656716418, | |
| "grad_norm": 0.14515501260757446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532054603099823, | |
| "mean_token_accuracy": 0.7860440015792847, | |
| "num_tokens": 8009749.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5376683920621872, | |
| "epoch": 1.828358208955224, | |
| "grad_norm": 0.1362919807434082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361682772636414, | |
| "mean_token_accuracy": 0.7828832864761353, | |
| "num_tokens": 8026107.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.541684627532959, | |
| "epoch": 1.832089552238806, | |
| "grad_norm": 0.1390708088874817, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428534746170044, | |
| "mean_token_accuracy": 0.7796362638473511, | |
| "num_tokens": 8042519.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5491971075534821, | |
| "epoch": 1.835820895522388, | |
| "grad_norm": 0.18899311125278473, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468783378601074, | |
| "mean_token_accuracy": 0.7760737091302872, | |
| "num_tokens": 8058733.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5467192232608795, | |
| "epoch": 1.8395522388059702, | |
| "grad_norm": 0.12224384397268295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412194728851318, | |
| "mean_token_accuracy": 0.7836457341909409, | |
| "num_tokens": 8075111.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5190225690603256, | |
| "epoch": 1.8432835820895521, | |
| "grad_norm": 0.17859016358852386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287451148033142, | |
| "mean_token_accuracy": 0.7872583419084549, | |
| "num_tokens": 8091539.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5457055866718292, | |
| "epoch": 1.8470149253731343, | |
| "grad_norm": 0.14652208983898163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511422157287598, | |
| "mean_token_accuracy": 0.7764985859394073, | |
| "num_tokens": 8107924.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5412308424711227, | |
| "epoch": 1.8507462686567164, | |
| "grad_norm": 0.14928752183914185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386866331100464, | |
| "mean_token_accuracy": 0.7840718477964401, | |
| "num_tokens": 8124327.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5487564355134964, | |
| "epoch": 1.8544776119402986, | |
| "grad_norm": 0.14009299874305725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402563810348511, | |
| "mean_token_accuracy": 0.781055673956871, | |
| "num_tokens": 8140629.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5530242621898651, | |
| "epoch": 1.8582089552238807, | |
| "grad_norm": 0.13880518078804016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397564172744751, | |
| "mean_token_accuracy": 0.7810083031654358, | |
| "num_tokens": 8157176.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.5339633226394653, | |
| "epoch": 1.8619402985074627, | |
| "grad_norm": 0.16541644930839539, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336776971817017, | |
| "mean_token_accuracy": 0.7829927057027817, | |
| "num_tokens": 8173382.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5558539777994156, | |
| "epoch": 1.8656716417910446, | |
| "grad_norm": 0.15278875827789307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627698302268982, | |
| "mean_token_accuracy": 0.7725099176168442, | |
| "num_tokens": 8189820.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5367425978183746, | |
| "epoch": 1.8694029850746268, | |
| "grad_norm": 0.15401561558246613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546620786190033, | |
| "mean_token_accuracy": 0.7765664905309677, | |
| "num_tokens": 8205989.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5408999174833298, | |
| "epoch": 1.873134328358209, | |
| "grad_norm": 0.13051092624664307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466805696487427, | |
| "mean_token_accuracy": 0.7781471610069275, | |
| "num_tokens": 8222509.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5321147739887238, | |
| "epoch": 1.876865671641791, | |
| "grad_norm": 0.13755947351455688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527456521987915, | |
| "mean_token_accuracy": 0.7872339636087418, | |
| "num_tokens": 8238911.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5611546188592911, | |
| "epoch": 1.8805970149253732, | |
| "grad_norm": 0.13492627441883087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548973798751831, | |
| "mean_token_accuracy": 0.7786827385425568, | |
| "num_tokens": 8255331.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5648814886808395, | |
| "epoch": 1.8843283582089554, | |
| "grad_norm": 0.13315370678901672, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626882314682007, | |
| "mean_token_accuracy": 0.7693315893411636, | |
| "num_tokens": 8271717.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.528036579489708, | |
| "epoch": 1.8880597014925373, | |
| "grad_norm": 0.13826221227645874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317479372024536, | |
| "mean_token_accuracy": 0.7865342795848846, | |
| "num_tokens": 8287916.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5300939381122589, | |
| "epoch": 1.8917910447761193, | |
| "grad_norm": 0.14022263884544373, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405997633934021, | |
| "mean_token_accuracy": 0.7812036871910095, | |
| "num_tokens": 8304453.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.52273790538311, | |
| "epoch": 1.8955223880597014, | |
| "grad_norm": 0.1394582986831665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526207685470581, | |
| "mean_token_accuracy": 0.7882105112075806, | |
| "num_tokens": 8320635.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5376584082841873, | |
| "epoch": 1.8992537313432836, | |
| "grad_norm": 0.16204339265823364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367757678031921, | |
| "mean_token_accuracy": 0.7841790616512299, | |
| "num_tokens": 8337016.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.5457427948713303, | |
| "epoch": 1.9029850746268657, | |
| "grad_norm": 0.13758644461631775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404728651046753, | |
| "mean_token_accuracy": 0.7789884358644485, | |
| "num_tokens": 8353374.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5548366904258728, | |
| "epoch": 1.9067164179104479, | |
| "grad_norm": 0.15079155564308167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460405349731445, | |
| "mean_token_accuracy": 0.7766790390014648, | |
| "num_tokens": 8369864.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5432726740837097, | |
| "epoch": 1.9104477611940298, | |
| "grad_norm": 0.14672084152698517, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391443371772766, | |
| "mean_token_accuracy": 0.7813593149185181, | |
| "num_tokens": 8386310.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.5469253212213516, | |
| "epoch": 1.914179104477612, | |
| "grad_norm": 0.12065178155899048, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509493350982666, | |
| "mean_token_accuracy": 0.7752728313207626, | |
| "num_tokens": 8402902.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5332511216402054, | |
| "epoch": 1.917910447761194, | |
| "grad_norm": 0.13797524571418762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396395325660706, | |
| "mean_token_accuracy": 0.783454567193985, | |
| "num_tokens": 8418969.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5430255383253098, | |
| "epoch": 1.921641791044776, | |
| "grad_norm": 0.15779103338718414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497632026672363, | |
| "mean_token_accuracy": 0.776575118303299, | |
| "num_tokens": 8435342.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.541492372751236, | |
| "epoch": 1.9253731343283582, | |
| "grad_norm": 0.14993441104888916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440635085105896, | |
| "mean_token_accuracy": 0.779094398021698, | |
| "num_tokens": 8451438.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5484725534915924, | |
| "epoch": 1.9291044776119404, | |
| "grad_norm": 0.12014457583427429, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494801998138428, | |
| "mean_token_accuracy": 0.7743937969207764, | |
| "num_tokens": 8467793.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.5424629300832748, | |
| "epoch": 1.9328358208955225, | |
| "grad_norm": 0.1372799575328827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402990579605103, | |
| "mean_token_accuracy": 0.7788502424955368, | |
| "num_tokens": 8484069.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.544426254928112, | |
| "epoch": 1.9365671641791045, | |
| "grad_norm": 0.12580935657024384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430607199668884, | |
| "mean_token_accuracy": 0.7801959961652756, | |
| "num_tokens": 8500603.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5405134111642838, | |
| "epoch": 1.9402985074626866, | |
| "grad_norm": 0.13943250477313995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387794971466064, | |
| "mean_token_accuracy": 0.7797143012285233, | |
| "num_tokens": 8516792.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5363973081111908, | |
| "epoch": 1.9440298507462686, | |
| "grad_norm": 0.15255886316299438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392638444900513, | |
| "mean_token_accuracy": 0.778968557715416, | |
| "num_tokens": 8533178.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5569429993629456, | |
| "epoch": 1.9477611940298507, | |
| "grad_norm": 0.14009712636470795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554465055465698, | |
| "mean_token_accuracy": 0.7732362002134323, | |
| "num_tokens": 8549795.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.560676708817482, | |
| "epoch": 1.9514925373134329, | |
| "grad_norm": 0.1429370492696762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586832761764526, | |
| "mean_token_accuracy": 0.7744071185588837, | |
| "num_tokens": 8566708.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5566927641630173, | |
| "epoch": 1.955223880597015, | |
| "grad_norm": 0.1273992359638214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483277440071106, | |
| "mean_token_accuracy": 0.7761266380548477, | |
| "num_tokens": 8582993.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5535138845443726, | |
| "epoch": 1.9589552238805972, | |
| "grad_norm": 0.15844318270683289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520558953285217, | |
| "mean_token_accuracy": 0.7790683060884476, | |
| "num_tokens": 8599225.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5255821049213409, | |
| "epoch": 1.962686567164179, | |
| "grad_norm": 0.1505620777606964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302370190620422, | |
| "mean_token_accuracy": 0.7846137434244156, | |
| "num_tokens": 8615790.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5364990532398224, | |
| "epoch": 1.966417910447761, | |
| "grad_norm": 0.18815594911575317, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442203283309937, | |
| "mean_token_accuracy": 0.7792959064245224, | |
| "num_tokens": 8632007.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5499100834131241, | |
| "epoch": 1.9701492537313432, | |
| "grad_norm": 0.12838681042194366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423155426979065, | |
| "mean_token_accuracy": 0.77956822514534, | |
| "num_tokens": 8648517.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5600726753473282, | |
| "epoch": 1.9738805970149254, | |
| "grad_norm": 0.13670910894870758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591787695884705, | |
| "mean_token_accuracy": 0.7713638991117477, | |
| "num_tokens": 8665136.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5376773029565811, | |
| "epoch": 1.9776119402985075, | |
| "grad_norm": 0.12114886194467545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407379865646362, | |
| "mean_token_accuracy": 0.7814544290304184, | |
| "num_tokens": 8681529.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5403832793235779, | |
| "epoch": 1.9813432835820897, | |
| "grad_norm": 0.13908495008945465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482066869735718, | |
| "mean_token_accuracy": 0.777704581618309, | |
| "num_tokens": 8697730.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5356862396001816, | |
| "epoch": 1.9850746268656716, | |
| "grad_norm": 0.13925939798355103, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371193289756775, | |
| "mean_token_accuracy": 0.783266693353653, | |
| "num_tokens": 8714219.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5331960469484329, | |
| "epoch": 1.9888059701492538, | |
| "grad_norm": 0.15995416045188904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319101810455322, | |
| "mean_token_accuracy": 0.7843216061592102, | |
| "num_tokens": 8730525.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5409343987703323, | |
| "epoch": 1.9925373134328357, | |
| "grad_norm": 0.1330004185438156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445230603218079, | |
| "mean_token_accuracy": 0.7773614227771759, | |
| "num_tokens": 8746950.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5394200682640076, | |
| "epoch": 1.9962686567164178, | |
| "grad_norm": 0.14103004336357117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359162092208862, | |
| "mean_token_accuracy": 0.785576581954956, | |
| "num_tokens": 8763337.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5349156558513641, | |
| "epoch": 2.0, | |
| "grad_norm": 0.12837927043437958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329214334487915, | |
| "mean_token_accuracy": 0.785938173532486, | |
| "num_tokens": 8779938.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5407280772924423, | |
| "epoch": 2.003731343283582, | |
| "grad_norm": 0.14622488617897034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321956872940063, | |
| "mean_token_accuracy": 0.7852865755558014, | |
| "num_tokens": 8796464.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5337665975093842, | |
| "epoch": 2.0074626865671643, | |
| "grad_norm": 0.16594251990318298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266042351722717, | |
| "mean_token_accuracy": 0.7868293672800064, | |
| "num_tokens": 8812777.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5268868803977966, | |
| "epoch": 2.0111940298507465, | |
| "grad_norm": 0.15608331561088562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311114192008972, | |
| "mean_token_accuracy": 0.7839187681674957, | |
| "num_tokens": 8829112.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.527610257267952, | |
| "epoch": 2.014925373134328, | |
| "grad_norm": 0.13121342658996582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283110737800598, | |
| "mean_token_accuracy": 0.7851767688989639, | |
| "num_tokens": 8845686.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5114267989993095, | |
| "epoch": 2.0186567164179103, | |
| "grad_norm": 0.15982377529144287, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5138009190559387, | |
| "mean_token_accuracy": 0.7923145592212677, | |
| "num_tokens": 8862042.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5179557651281357, | |
| "epoch": 2.0223880597014925, | |
| "grad_norm": 0.15685375034809113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5175086855888367, | |
| "mean_token_accuracy": 0.790000781416893, | |
| "num_tokens": 8878269.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5284497290849686, | |
| "epoch": 2.0261194029850746, | |
| "grad_norm": 0.155994713306427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248953104019165, | |
| "mean_token_accuracy": 0.7887215316295624, | |
| "num_tokens": 8894744.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5114204958081245, | |
| "epoch": 2.029850746268657, | |
| "grad_norm": 0.1587519645690918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5146663784980774, | |
| "mean_token_accuracy": 0.7908709943294525, | |
| "num_tokens": 8911019.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5258788168430328, | |
| "epoch": 2.033582089552239, | |
| "grad_norm": 0.17405946552753448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5257717967033386, | |
| "mean_token_accuracy": 0.7857701331377029, | |
| "num_tokens": 8927423.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.5308232307434082, | |
| "epoch": 2.0373134328358207, | |
| "grad_norm": 0.16010001301765442, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299814343452454, | |
| "mean_token_accuracy": 0.7874948382377625, | |
| "num_tokens": 8943802.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.516572117805481, | |
| "epoch": 2.041044776119403, | |
| "grad_norm": 0.16816852986812592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5154708623886108, | |
| "mean_token_accuracy": 0.7876496762037277, | |
| "num_tokens": 8959993.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.5281299874186516, | |
| "epoch": 2.044776119402985, | |
| "grad_norm": 0.14758102595806122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524406373500824, | |
| "mean_token_accuracy": 0.7861409038305283, | |
| "num_tokens": 8976245.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5246195495128632, | |
| "epoch": 2.048507462686567, | |
| "grad_norm": 0.16330084204673767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5244280099868774, | |
| "mean_token_accuracy": 0.7878082692623138, | |
| "num_tokens": 8992638.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.514888346195221, | |
| "epoch": 2.0522388059701493, | |
| "grad_norm": 0.1649155467748642, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5206322073936462, | |
| "mean_token_accuracy": 0.7888449877500534, | |
| "num_tokens": 9008736.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5066314935684204, | |
| "epoch": 2.0559701492537314, | |
| "grad_norm": 0.1575276404619217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5027191042900085, | |
| "mean_token_accuracy": 0.7947296053171158, | |
| "num_tokens": 9025125.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5268809348344803, | |
| "epoch": 2.0597014925373136, | |
| "grad_norm": 0.1932123601436615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526970386505127, | |
| "mean_token_accuracy": 0.7861645221710205, | |
| "num_tokens": 9041360.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5089156553149223, | |
| "epoch": 2.0634328358208953, | |
| "grad_norm": 0.17611229419708252, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5170955061912537, | |
| "mean_token_accuracy": 0.7898762077093124, | |
| "num_tokens": 9057425.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.5314554125070572, | |
| "epoch": 2.0671641791044775, | |
| "grad_norm": 0.16261620819568634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317267775535583, | |
| "mean_token_accuracy": 0.7857931405305862, | |
| "num_tokens": 9073634.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5275600850582123, | |
| "epoch": 2.0708955223880596, | |
| "grad_norm": 0.1528756469488144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5216519832611084, | |
| "mean_token_accuracy": 0.784853920340538, | |
| "num_tokens": 9090072.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.533121645450592, | |
| "epoch": 2.074626865671642, | |
| "grad_norm": 0.15978476405143738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330748558044434, | |
| "mean_token_accuracy": 0.7852211892604828, | |
| "num_tokens": 9106310.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5289422124624252, | |
| "epoch": 2.078358208955224, | |
| "grad_norm": 0.18613378703594208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5246477127075195, | |
| "mean_token_accuracy": 0.7871279567480087, | |
| "num_tokens": 9122599.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.5288784801959991, | |
| "epoch": 2.082089552238806, | |
| "grad_norm": 0.19494648277759552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310162305831909, | |
| "mean_token_accuracy": 0.783275917172432, | |
| "num_tokens": 9138955.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5063241422176361, | |
| "epoch": 2.0858208955223883, | |
| "grad_norm": 0.17457328736782074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5103744268417358, | |
| "mean_token_accuracy": 0.7956038117408752, | |
| "num_tokens": 9155471.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5165305808186531, | |
| "epoch": 2.08955223880597, | |
| "grad_norm": 0.16135407984256744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219785571098328, | |
| "mean_token_accuracy": 0.7876863032579422, | |
| "num_tokens": 9171894.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5188902914524078, | |
| "epoch": 2.093283582089552, | |
| "grad_norm": 0.16337014734745026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516549825668335, | |
| "mean_token_accuracy": 0.7918221950531006, | |
| "num_tokens": 9188463.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.513557106256485, | |
| "epoch": 2.0970149253731343, | |
| "grad_norm": 0.1818535476922989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.506076991558075, | |
| "mean_token_accuracy": 0.7936830073595047, | |
| "num_tokens": 9204870.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.5341264307498932, | |
| "epoch": 2.1007462686567164, | |
| "grad_norm": 0.1677771359682083, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530627965927124, | |
| "mean_token_accuracy": 0.7831838876008987, | |
| "num_tokens": 9221094.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5140577107667923, | |
| "epoch": 2.1044776119402986, | |
| "grad_norm": 0.17054656147956848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5144332647323608, | |
| "mean_token_accuracy": 0.7923785746097565, | |
| "num_tokens": 9237391.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.497653529047966, | |
| "epoch": 2.1082089552238807, | |
| "grad_norm": 0.18110354244709015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5102217197418213, | |
| "mean_token_accuracy": 0.7931897193193436, | |
| "num_tokens": 9253611.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.524284727871418, | |
| "epoch": 2.111940298507463, | |
| "grad_norm": 0.2005971521139145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5303030014038086, | |
| "mean_token_accuracy": 0.7885997593402863, | |
| "num_tokens": 9269952.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5399997532367706, | |
| "epoch": 2.1156716417910446, | |
| "grad_norm": 0.1460496038198471, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352809429168701, | |
| "mean_token_accuracy": 0.7851102352142334, | |
| "num_tokens": 9286381.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5403535813093185, | |
| "epoch": 2.1194029850746268, | |
| "grad_norm": 0.2164795845746994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310791730880737, | |
| "mean_token_accuracy": 0.7864344716072083, | |
| "num_tokens": 9302619.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5281778201460838, | |
| "epoch": 2.123134328358209, | |
| "grad_norm": 0.14520607888698578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5214827060699463, | |
| "mean_token_accuracy": 0.7891172915697098, | |
| "num_tokens": 9319199.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5376487374305725, | |
| "epoch": 2.126865671641791, | |
| "grad_norm": 0.20075996220111847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414179563522339, | |
| "mean_token_accuracy": 0.7825666964054108, | |
| "num_tokens": 9335645.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.544133722782135, | |
| "epoch": 2.1305970149253732, | |
| "grad_norm": 0.17108148336410522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474769473075867, | |
| "mean_token_accuracy": 0.778696671128273, | |
| "num_tokens": 9352250.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5139511153101921, | |
| "epoch": 2.1343283582089554, | |
| "grad_norm": 0.20305298268795013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5138852000236511, | |
| "mean_token_accuracy": 0.7916316092014313, | |
| "num_tokens": 9368581.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5336194783449173, | |
| "epoch": 2.138059701492537, | |
| "grad_norm": 0.17313581705093384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371931195259094, | |
| "mean_token_accuracy": 0.7810296416282654, | |
| "num_tokens": 9385005.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5428188145160675, | |
| "epoch": 2.1417910447761193, | |
| "grad_norm": 0.18904267251491547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414341688156128, | |
| "mean_token_accuracy": 0.7817030698060989, | |
| "num_tokens": 9401264.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5036500468850136, | |
| "epoch": 2.1455223880597014, | |
| "grad_norm": 0.16260603070259094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5049091577529907, | |
| "mean_token_accuracy": 0.7955426573753357, | |
| "num_tokens": 9417452.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5125822275876999, | |
| "epoch": 2.1492537313432836, | |
| "grad_norm": 0.18752527236938477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.520676851272583, | |
| "mean_token_accuracy": 0.787801519036293, | |
| "num_tokens": 9433830.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5220265239477158, | |
| "epoch": 2.1529850746268657, | |
| "grad_norm": 0.17956171929836273, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5259777903556824, | |
| "mean_token_accuracy": 0.7890594154596329, | |
| "num_tokens": 9449942.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5411542505025864, | |
| "epoch": 2.156716417910448, | |
| "grad_norm": 0.16276296973228455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392127633094788, | |
| "mean_token_accuracy": 0.7827239036560059, | |
| "num_tokens": 9466361.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5376486927270889, | |
| "epoch": 2.16044776119403, | |
| "grad_norm": 0.18284423649311066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354690551757812, | |
| "mean_token_accuracy": 0.7847119867801666, | |
| "num_tokens": 9482738.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.527974009513855, | |
| "epoch": 2.1641791044776117, | |
| "grad_norm": 0.15606842935085297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5216515064239502, | |
| "mean_token_accuracy": 0.7893972098827362, | |
| "num_tokens": 9499285.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5080907642841339, | |
| "epoch": 2.167910447761194, | |
| "grad_norm": 0.19228458404541016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5062891840934753, | |
| "mean_token_accuracy": 0.7950604856014252, | |
| "num_tokens": 9515408.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5310265123844147, | |
| "epoch": 2.171641791044776, | |
| "grad_norm": 0.1585681140422821, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329898595809937, | |
| "mean_token_accuracy": 0.7825100123882294, | |
| "num_tokens": 9531802.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.5163623988628387, | |
| "epoch": 2.175373134328358, | |
| "grad_norm": 0.16819821298122406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5175923109054565, | |
| "mean_token_accuracy": 0.7890376448631287, | |
| "num_tokens": 9548285.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5143009200692177, | |
| "epoch": 2.1791044776119404, | |
| "grad_norm": 0.16217826306819916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5155395865440369, | |
| "mean_token_accuracy": 0.7922197580337524, | |
| "num_tokens": 9564428.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5416625738143921, | |
| "epoch": 2.1828358208955225, | |
| "grad_norm": 0.15060050785541534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370927453041077, | |
| "mean_token_accuracy": 0.7829685211181641, | |
| "num_tokens": 9580974.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5395999997854233, | |
| "epoch": 2.1865671641791047, | |
| "grad_norm": 0.17097517848014832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385570526123047, | |
| "mean_token_accuracy": 0.7842200845479965, | |
| "num_tokens": 9597372.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5397211164236069, | |
| "epoch": 2.1902985074626864, | |
| "grad_norm": 0.1612662672996521, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392184257507324, | |
| "mean_token_accuracy": 0.7815093398094177, | |
| "num_tokens": 9613832.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5179775580763817, | |
| "epoch": 2.1940298507462686, | |
| "grad_norm": 0.17580583691596985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5214508771896362, | |
| "mean_token_accuracy": 0.7890152186155319, | |
| "num_tokens": 9630021.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5112824365496635, | |
| "epoch": 2.1977611940298507, | |
| "grad_norm": 0.2011307030916214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203381180763245, | |
| "mean_token_accuracy": 0.7900092750787735, | |
| "num_tokens": 9646188.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5356829464435577, | |
| "epoch": 2.201492537313433, | |
| "grad_norm": 0.16764222085475922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318949818611145, | |
| "mean_token_accuracy": 0.7853176593780518, | |
| "num_tokens": 9662704.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.532988578081131, | |
| "epoch": 2.205223880597015, | |
| "grad_norm": 0.1625567525625229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286852121353149, | |
| "mean_token_accuracy": 0.7845050990581512, | |
| "num_tokens": 9679126.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5083666741847992, | |
| "epoch": 2.208955223880597, | |
| "grad_norm": 0.17014159262180328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5085889101028442, | |
| "mean_token_accuracy": 0.7938840687274933, | |
| "num_tokens": 9695574.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.5348383486270905, | |
| "epoch": 2.2126865671641793, | |
| "grad_norm": 0.15370626747608185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363180041313171, | |
| "mean_token_accuracy": 0.7823249995708466, | |
| "num_tokens": 9711759.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.521574854850769, | |
| "epoch": 2.216417910447761, | |
| "grad_norm": 0.1618925929069519, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5165284872055054, | |
| "mean_token_accuracy": 0.7902027070522308, | |
| "num_tokens": 9728297.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5246837437152863, | |
| "epoch": 2.220149253731343, | |
| "grad_norm": 0.16403713822364807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284984111785889, | |
| "mean_token_accuracy": 0.785593718290329, | |
| "num_tokens": 9745025.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5146933272480965, | |
| "epoch": 2.2238805970149254, | |
| "grad_norm": 0.16364289820194244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5155675411224365, | |
| "mean_token_accuracy": 0.7914301306009293, | |
| "num_tokens": 9761573.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5164592936635017, | |
| "epoch": 2.2276119402985075, | |
| "grad_norm": 0.16107001900672913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.520284116268158, | |
| "mean_token_accuracy": 0.790960431098938, | |
| "num_tokens": 9777994.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5009781569242477, | |
| "epoch": 2.2313432835820897, | |
| "grad_norm": 0.17092035710811615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5013527870178223, | |
| "mean_token_accuracy": 0.7965078949928284, | |
| "num_tokens": 9794247.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5145166665315628, | |
| "epoch": 2.235074626865672, | |
| "grad_norm": 0.17742900550365448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5136178731918335, | |
| "mean_token_accuracy": 0.7902016937732697, | |
| "num_tokens": 9810623.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.521144449710846, | |
| "epoch": 2.2388059701492535, | |
| "grad_norm": 0.1866447478532791, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256049633026123, | |
| "mean_token_accuracy": 0.7880899459123611, | |
| "num_tokens": 9827216.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5078264698386192, | |
| "epoch": 2.2425373134328357, | |
| "grad_norm": 0.18190419673919678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5107334852218628, | |
| "mean_token_accuracy": 0.7921731919050217, | |
| "num_tokens": 9843424.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.5391242802143097, | |
| "epoch": 2.246268656716418, | |
| "grad_norm": 0.1664401739835739, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404478907585144, | |
| "mean_token_accuracy": 0.779574453830719, | |
| "num_tokens": 9859528.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5163552165031433, | |
| "epoch": 2.25, | |
| "grad_norm": 0.19338326156139374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5106169581413269, | |
| "mean_token_accuracy": 0.7929095774888992, | |
| "num_tokens": 9875496.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.538531944155693, | |
| "epoch": 2.253731343283582, | |
| "grad_norm": 0.16355083882808685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421521067619324, | |
| "mean_token_accuracy": 0.7775969356298447, | |
| "num_tokens": 9891706.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5201183184981346, | |
| "epoch": 2.2574626865671643, | |
| "grad_norm": 0.2061741203069687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298879742622375, | |
| "mean_token_accuracy": 0.7839659005403519, | |
| "num_tokens": 9907901.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5299466401338577, | |
| "epoch": 2.2611940298507465, | |
| "grad_norm": 0.1585988998413086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266643762588501, | |
| "mean_token_accuracy": 0.7857095748186111, | |
| "num_tokens": 9924584.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5331060588359833, | |
| "epoch": 2.264925373134328, | |
| "grad_norm": 0.22515474259853363, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281371474266052, | |
| "mean_token_accuracy": 0.7846943885087967, | |
| "num_tokens": 9940921.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5365794003009796, | |
| "epoch": 2.2686567164179103, | |
| "grad_norm": 0.14158517122268677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241664052009583, | |
| "mean_token_accuracy": 0.7902594655752182, | |
| "num_tokens": 9957418.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5098173916339874, | |
| "epoch": 2.2723880597014925, | |
| "grad_norm": 0.19847925007343292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5109040141105652, | |
| "mean_token_accuracy": 0.7907959967851639, | |
| "num_tokens": 9973783.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.507322758436203, | |
| "epoch": 2.2761194029850746, | |
| "grad_norm": 0.1904480904340744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5145297050476074, | |
| "mean_token_accuracy": 0.791220560669899, | |
| "num_tokens": 9990362.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5185896158218384, | |
| "epoch": 2.279850746268657, | |
| "grad_norm": 0.23211340606212616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524868905544281, | |
| "mean_token_accuracy": 0.7855911701917648, | |
| "num_tokens": 10006762.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.5282359346747398, | |
| "epoch": 2.283582089552239, | |
| "grad_norm": 0.1768886297941208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229817628860474, | |
| "mean_token_accuracy": 0.7895976901054382, | |
| "num_tokens": 10023191.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5275277346372604, | |
| "epoch": 2.2873134328358207, | |
| "grad_norm": 0.19380177557468414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5169612765312195, | |
| "mean_token_accuracy": 0.7907349169254303, | |
| "num_tokens": 10039350.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5204345509409904, | |
| "epoch": 2.291044776119403, | |
| "grad_norm": 0.15632414817810059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.513292670249939, | |
| "mean_token_accuracy": 0.7925348877906799, | |
| "num_tokens": 10055872.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5112610086798668, | |
| "epoch": 2.294776119402985, | |
| "grad_norm": 0.18102124333381653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.520767092704773, | |
| "mean_token_accuracy": 0.7886828035116196, | |
| "num_tokens": 10072419.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5232729762792587, | |
| "epoch": 2.298507462686567, | |
| "grad_norm": 0.25390854477882385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408729314804077, | |
| "mean_token_accuracy": 0.7815985828638077, | |
| "num_tokens": 10088715.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.529785230755806, | |
| "epoch": 2.3022388059701493, | |
| "grad_norm": 0.15947353839874268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309044718742371, | |
| "mean_token_accuracy": 0.784679189324379, | |
| "num_tokens": 10105206.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5409619510173798, | |
| "epoch": 2.3059701492537314, | |
| "grad_norm": 0.21774348616600037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331413745880127, | |
| "mean_token_accuracy": 0.7848716974258423, | |
| "num_tokens": 10121951.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5404030680656433, | |
| "epoch": 2.3097014925373136, | |
| "grad_norm": 0.17135120928287506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320269465446472, | |
| "mean_token_accuracy": 0.7863317579030991, | |
| "num_tokens": 10138520.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.543184906244278, | |
| "epoch": 2.3134328358208958, | |
| "grad_norm": 0.18270884454250336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362977981567383, | |
| "mean_token_accuracy": 0.7825828939676285, | |
| "num_tokens": 10155242.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5144708007574081, | |
| "epoch": 2.3171641791044775, | |
| "grad_norm": 0.19776520133018494, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5190030336380005, | |
| "mean_token_accuracy": 0.7893546521663666, | |
| "num_tokens": 10171493.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5012815147638321, | |
| "epoch": 2.3208955223880596, | |
| "grad_norm": 0.18417391180992126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5140509009361267, | |
| "mean_token_accuracy": 0.7917021214962006, | |
| "num_tokens": 10187924.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5291815996170044, | |
| "epoch": 2.324626865671642, | |
| "grad_norm": 0.18122002482414246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308645367622375, | |
| "mean_token_accuracy": 0.7827988862991333, | |
| "num_tokens": 10204223.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5316928327083588, | |
| "epoch": 2.328358208955224, | |
| "grad_norm": 0.17393858730793, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351020097732544, | |
| "mean_token_accuracy": 0.7837810218334198, | |
| "num_tokens": 10220678.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5380063354969025, | |
| "epoch": 2.332089552238806, | |
| "grad_norm": 0.16641174256801605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311377644538879, | |
| "mean_token_accuracy": 0.78605717420578, | |
| "num_tokens": 10236761.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.5296464115381241, | |
| "epoch": 2.3358208955223883, | |
| "grad_norm": 0.16847732663154602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290564894676208, | |
| "mean_token_accuracy": 0.7866681218147278, | |
| "num_tokens": 10253110.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5196742564439774, | |
| "epoch": 2.33955223880597, | |
| "grad_norm": 0.16526693105697632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516907811164856, | |
| "mean_token_accuracy": 0.7920583933591843, | |
| "num_tokens": 10269492.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.541998103260994, | |
| "epoch": 2.343283582089552, | |
| "grad_norm": 0.18568557500839233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372257828712463, | |
| "mean_token_accuracy": 0.7823797762393951, | |
| "num_tokens": 10285927.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5108761489391327, | |
| "epoch": 2.3470149253731343, | |
| "grad_norm": 0.1934242844581604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5139164924621582, | |
| "mean_token_accuracy": 0.7933155596256256, | |
| "num_tokens": 10302023.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5217199325561523, | |
| "epoch": 2.3507462686567164, | |
| "grad_norm": 0.17553211748600006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230180025100708, | |
| "mean_token_accuracy": 0.7875964045524597, | |
| "num_tokens": 10318268.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5330761075019836, | |
| "epoch": 2.3544776119402986, | |
| "grad_norm": 0.15872074663639069, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290681719779968, | |
| "mean_token_accuracy": 0.7844167649745941, | |
| "num_tokens": 10334766.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5369035452604294, | |
| "epoch": 2.3582089552238807, | |
| "grad_norm": 0.1846853792667389, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329739451408386, | |
| "mean_token_accuracy": 0.7838435918092728, | |
| "num_tokens": 10351349.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5287653654813766, | |
| "epoch": 2.361940298507463, | |
| "grad_norm": 0.1996822953224182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347191095352173, | |
| "mean_token_accuracy": 0.7811494767665863, | |
| "num_tokens": 10367871.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5239842683076859, | |
| "epoch": 2.3656716417910446, | |
| "grad_norm": 0.19435462355613708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530573308467865, | |
| "mean_token_accuracy": 0.7837476581335068, | |
| "num_tokens": 10384315.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5206383317708969, | |
| "epoch": 2.3694029850746268, | |
| "grad_norm": 0.19717657566070557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275444388389587, | |
| "mean_token_accuracy": 0.7842705696821213, | |
| "num_tokens": 10400769.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5064749270677567, | |
| "epoch": 2.373134328358209, | |
| "grad_norm": 0.19260841608047485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.51506507396698, | |
| "mean_token_accuracy": 0.789744108915329, | |
| "num_tokens": 10417006.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5361980348825455, | |
| "epoch": 2.376865671641791, | |
| "grad_norm": 0.17480432987213135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336955189704895, | |
| "mean_token_accuracy": 0.7836211174726486, | |
| "num_tokens": 10433294.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5383089035749435, | |
| "epoch": 2.3805970149253732, | |
| "grad_norm": 0.18294544517993927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289636254310608, | |
| "mean_token_accuracy": 0.7852412611246109, | |
| "num_tokens": 10449674.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5097021907567978, | |
| "epoch": 2.3843283582089554, | |
| "grad_norm": 0.16242100298404694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5021054148674011, | |
| "mean_token_accuracy": 0.7972816228866577, | |
| "num_tokens": 10465855.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5423515290021896, | |
| "epoch": 2.388059701492537, | |
| "grad_norm": 0.22227367758750916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548687756061554, | |
| "mean_token_accuracy": 0.776146799325943, | |
| "num_tokens": 10482179.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5074172541499138, | |
| "epoch": 2.3917910447761193, | |
| "grad_norm": 0.1631743311882019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5108535289764404, | |
| "mean_token_accuracy": 0.7928425967693329, | |
| "num_tokens": 10498617.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.5141904726624489, | |
| "epoch": 2.3955223880597014, | |
| "grad_norm": 0.22901000082492828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239617228507996, | |
| "mean_token_accuracy": 0.7894341051578522, | |
| "num_tokens": 10514855.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.548003762960434, | |
| "epoch": 2.3992537313432836, | |
| "grad_norm": 0.1889556348323822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518738627433777, | |
| "mean_token_accuracy": 0.7756821662187576, | |
| "num_tokens": 10531113.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5271116495132446, | |
| "epoch": 2.4029850746268657, | |
| "grad_norm": 0.15567590296268463, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516383171081543, | |
| "mean_token_accuracy": 0.7933164685964584, | |
| "num_tokens": 10547691.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.5330717116594315, | |
| "epoch": 2.406716417910448, | |
| "grad_norm": 0.17213337123394012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231931209564209, | |
| "mean_token_accuracy": 0.7853028923273087, | |
| "num_tokens": 10563993.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.542450025677681, | |
| "epoch": 2.41044776119403, | |
| "grad_norm": 0.16203731298446655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375291109085083, | |
| "mean_token_accuracy": 0.7830152362585068, | |
| "num_tokens": 10580464.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5074228942394257, | |
| "epoch": 2.4141791044776117, | |
| "grad_norm": 0.16541871428489685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5123732089996338, | |
| "mean_token_accuracy": 0.7941079437732697, | |
| "num_tokens": 10596747.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.5105165019631386, | |
| "epoch": 2.417910447761194, | |
| "grad_norm": 0.182412788271904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5217914581298828, | |
| "mean_token_accuracy": 0.7893105298280716, | |
| "num_tokens": 10612951.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5206151753664017, | |
| "epoch": 2.421641791044776, | |
| "grad_norm": 0.20678837597370148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335655212402344, | |
| "mean_token_accuracy": 0.7840552628040314, | |
| "num_tokens": 10629467.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5416827350854874, | |
| "epoch": 2.425373134328358, | |
| "grad_norm": 0.16378135979175568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401762127876282, | |
| "mean_token_accuracy": 0.782837986946106, | |
| "num_tokens": 10645981.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5352658033370972, | |
| "epoch": 2.4291044776119404, | |
| "grad_norm": 0.17120513319969177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229877233505249, | |
| "mean_token_accuracy": 0.7894999831914902, | |
| "num_tokens": 10662599.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5378601551055908, | |
| "epoch": 2.4328358208955225, | |
| "grad_norm": 0.18634538352489471, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370844602584839, | |
| "mean_token_accuracy": 0.7834650576114655, | |
| "num_tokens": 10678905.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5139342248439789, | |
| "epoch": 2.4365671641791042, | |
| "grad_norm": 0.1823841780424118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5105010271072388, | |
| "mean_token_accuracy": 0.7942702323198318, | |
| "num_tokens": 10695354.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5001704916357994, | |
| "epoch": 2.4402985074626864, | |
| "grad_norm": 0.18246224522590637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5092322826385498, | |
| "mean_token_accuracy": 0.7953812628984451, | |
| "num_tokens": 10711419.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5088636800646782, | |
| "epoch": 2.4440298507462686, | |
| "grad_norm": 0.16581419110298157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5136841535568237, | |
| "mean_token_accuracy": 0.7919897437095642, | |
| "num_tokens": 10727853.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5198448672890663, | |
| "epoch": 2.4477611940298507, | |
| "grad_norm": 0.16655242443084717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5188886523246765, | |
| "mean_token_accuracy": 0.7890329360961914, | |
| "num_tokens": 10744204.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5168529972434044, | |
| "epoch": 2.451492537313433, | |
| "grad_norm": 0.18366754055023193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5171942114830017, | |
| "mean_token_accuracy": 0.7899800539016724, | |
| "num_tokens": 10760669.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5348050147294998, | |
| "epoch": 2.455223880597015, | |
| "grad_norm": 0.18297524750232697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392665266990662, | |
| "mean_token_accuracy": 0.779433473944664, | |
| "num_tokens": 10777093.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5245852321386337, | |
| "epoch": 2.458955223880597, | |
| "grad_norm": 0.19149278104305267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260974764823914, | |
| "mean_token_accuracy": 0.7873388528823853, | |
| "num_tokens": 10793455.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5311989635229111, | |
| "epoch": 2.4626865671641793, | |
| "grad_norm": 0.1547309309244156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266692042350769, | |
| "mean_token_accuracy": 0.7839333266019821, | |
| "num_tokens": 10809788.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5379379391670227, | |
| "epoch": 2.466417910447761, | |
| "grad_norm": 0.15859338641166687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321581363677979, | |
| "mean_token_accuracy": 0.7827870547771454, | |
| "num_tokens": 10825837.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5471830368041992, | |
| "epoch": 2.470149253731343, | |
| "grad_norm": 0.16068732738494873, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360886454582214, | |
| "mean_token_accuracy": 0.7848220616579056, | |
| "num_tokens": 10842037.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5252791494131088, | |
| "epoch": 2.4738805970149254, | |
| "grad_norm": 0.1590043157339096, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276464819908142, | |
| "mean_token_accuracy": 0.786907747387886, | |
| "num_tokens": 10858320.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.525018036365509, | |
| "epoch": 2.4776119402985075, | |
| "grad_norm": 0.17438893020153046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300197005271912, | |
| "mean_token_accuracy": 0.7852317094802856, | |
| "num_tokens": 10874855.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5394986271858215, | |
| "epoch": 2.4813432835820897, | |
| "grad_norm": 0.17128010094165802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422081351280212, | |
| "mean_token_accuracy": 0.7800386846065521, | |
| "num_tokens": 10891526.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.5076115503907204, | |
| "epoch": 2.485074626865672, | |
| "grad_norm": 0.1781933754682541, | |
| "learning_rate": 0.0002, | |
| "loss": 0.507164716720581, | |
| "mean_token_accuracy": 0.7957528084516525, | |
| "num_tokens": 10907862.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5271291732788086, | |
| "epoch": 2.4888059701492535, | |
| "grad_norm": 0.17105896770954132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228562355041504, | |
| "mean_token_accuracy": 0.7889808863401413, | |
| "num_tokens": 10924235.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5363548994064331, | |
| "epoch": 2.4925373134328357, | |
| "grad_norm": 0.1583063155412674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336060523986816, | |
| "mean_token_accuracy": 0.7860426157712936, | |
| "num_tokens": 10940599.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.503924198448658, | |
| "epoch": 2.496268656716418, | |
| "grad_norm": 0.17252567410469055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5028519034385681, | |
| "mean_token_accuracy": 0.7955358028411865, | |
| "num_tokens": 10956649.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5256816297769547, | |
| "epoch": 2.5, | |
| "grad_norm": 0.1619226038455963, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266148447990417, | |
| "mean_token_accuracy": 0.787626251578331, | |
| "num_tokens": 10972977.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.5120773613452911, | |
| "epoch": 2.503731343283582, | |
| "grad_norm": 0.16918344795703888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207507610321045, | |
| "mean_token_accuracy": 0.7914620935916901, | |
| "num_tokens": 10989327.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5181663334369659, | |
| "epoch": 2.5074626865671643, | |
| "grad_norm": 0.19783611595630646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5268117189407349, | |
| "mean_token_accuracy": 0.7864458560943604, | |
| "num_tokens": 11005449.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.5229259878396988, | |
| "epoch": 2.5111940298507465, | |
| "grad_norm": 0.1657666116952896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5208563208580017, | |
| "mean_token_accuracy": 0.7903305888175964, | |
| "num_tokens": 11021576.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5335699021816254, | |
| "epoch": 2.5149253731343286, | |
| "grad_norm": 0.1847028136253357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323396921157837, | |
| "mean_token_accuracy": 0.7818653434514999, | |
| "num_tokens": 11038174.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5297135561704636, | |
| "epoch": 2.5186567164179103, | |
| "grad_norm": 0.17212164402008057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294620990753174, | |
| "mean_token_accuracy": 0.7868784368038177, | |
| "num_tokens": 11054527.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5551169812679291, | |
| "epoch": 2.5223880597014925, | |
| "grad_norm": 0.19568513333797455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539876222610474, | |
| "mean_token_accuracy": 0.775226280093193, | |
| "num_tokens": 11070805.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5319524109363556, | |
| "epoch": 2.5261194029850746, | |
| "grad_norm": 0.14972956478595734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295209288597107, | |
| "mean_token_accuracy": 0.7860101461410522, | |
| "num_tokens": 11087510.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5265523195266724, | |
| "epoch": 2.529850746268657, | |
| "grad_norm": 0.16056260466575623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248823761940002, | |
| "mean_token_accuracy": 0.7860508859157562, | |
| "num_tokens": 11103933.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5225390195846558, | |
| "epoch": 2.533582089552239, | |
| "grad_norm": 0.22218124568462372, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301728248596191, | |
| "mean_token_accuracy": 0.7851128876209259, | |
| "num_tokens": 11120292.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5265638679265976, | |
| "epoch": 2.5373134328358207, | |
| "grad_norm": 0.15814287960529327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5240415930747986, | |
| "mean_token_accuracy": 0.788665235042572, | |
| "num_tokens": 11136784.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5306698828935623, | |
| "epoch": 2.541044776119403, | |
| "grad_norm": 0.1664581149816513, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277557373046875, | |
| "mean_token_accuracy": 0.7860920429229736, | |
| "num_tokens": 11153320.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5291799604892731, | |
| "epoch": 2.544776119402985, | |
| "grad_norm": 0.1872314065694809, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320236086845398, | |
| "mean_token_accuracy": 0.7843979746103287, | |
| "num_tokens": 11169723.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.53035868704319, | |
| "epoch": 2.548507462686567, | |
| "grad_norm": 0.20792965590953827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358518362045288, | |
| "mean_token_accuracy": 0.7849173247814178, | |
| "num_tokens": 11186035.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5152866542339325, | |
| "epoch": 2.5522388059701493, | |
| "grad_norm": 0.20304447412490845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.512556791305542, | |
| "mean_token_accuracy": 0.7908182591199875, | |
| "num_tokens": 11201972.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.520212933421135, | |
| "epoch": 2.5559701492537314, | |
| "grad_norm": 0.19615566730499268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241949558258057, | |
| "mean_token_accuracy": 0.7870226055383682, | |
| "num_tokens": 11218085.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.523841142654419, | |
| "epoch": 2.5597014925373136, | |
| "grad_norm": 0.18903784453868866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5217975974082947, | |
| "mean_token_accuracy": 0.7914077341556549, | |
| "num_tokens": 11234466.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.5006226599216461, | |
| "epoch": 2.5634328358208958, | |
| "grad_norm": 0.2238045483827591, | |
| "learning_rate": 0.0002, | |
| "loss": 0.503075122833252, | |
| "mean_token_accuracy": 0.7985939383506775, | |
| "num_tokens": 11250619.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.522046685218811, | |
| "epoch": 2.5671641791044775, | |
| "grad_norm": 0.1861460655927658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256574749946594, | |
| "mean_token_accuracy": 0.7879543006420135, | |
| "num_tokens": 11267052.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5404367446899414, | |
| "epoch": 2.5708955223880596, | |
| "grad_norm": 0.18886177241802216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377542972564697, | |
| "mean_token_accuracy": 0.781608834862709, | |
| "num_tokens": 11283385.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.526772603392601, | |
| "epoch": 2.574626865671642, | |
| "grad_norm": 0.16710662841796875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5189668536186218, | |
| "mean_token_accuracy": 0.7905929088592529, | |
| "num_tokens": 11299758.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.528350904583931, | |
| "epoch": 2.578358208955224, | |
| "grad_norm": 0.17797508835792542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5194413661956787, | |
| "mean_token_accuracy": 0.7911931574344635, | |
| "num_tokens": 11316130.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.52931809425354, | |
| "epoch": 2.582089552238806, | |
| "grad_norm": 0.21212708950042725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379958152770996, | |
| "mean_token_accuracy": 0.7827763855457306, | |
| "num_tokens": 11332658.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5531658977270126, | |
| "epoch": 2.585820895522388, | |
| "grad_norm": 0.17241588234901428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588712692260742, | |
| "mean_token_accuracy": 0.7756764441728592, | |
| "num_tokens": 11349446.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5219079852104187, | |
| "epoch": 2.58955223880597, | |
| "grad_norm": 0.15809156000614166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5210216045379639, | |
| "mean_token_accuracy": 0.7904610931873322, | |
| "num_tokens": 11366050.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5322331935167313, | |
| "epoch": 2.593283582089552, | |
| "grad_norm": 0.18396085500717163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301384925842285, | |
| "mean_token_accuracy": 0.7841024845838547, | |
| "num_tokens": 11382491.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5307652056217194, | |
| "epoch": 2.5970149253731343, | |
| "grad_norm": 0.16308656334877014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239346623420715, | |
| "mean_token_accuracy": 0.7880617082118988, | |
| "num_tokens": 11398802.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5340842455625534, | |
| "epoch": 2.6007462686567164, | |
| "grad_norm": 0.19761645793914795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363891124725342, | |
| "mean_token_accuracy": 0.7838073074817657, | |
| "num_tokens": 11415128.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5340555012226105, | |
| "epoch": 2.6044776119402986, | |
| "grad_norm": 0.1661156415939331, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325526595115662, | |
| "mean_token_accuracy": 0.7847229689359665, | |
| "num_tokens": 11431318.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.5427940785884857, | |
| "epoch": 2.6082089552238807, | |
| "grad_norm": 0.16063573956489563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501501560211182, | |
| "mean_token_accuracy": 0.7748306840658188, | |
| "num_tokens": 11447713.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5213874280452728, | |
| "epoch": 2.611940298507463, | |
| "grad_norm": 0.1618213802576065, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5210378170013428, | |
| "mean_token_accuracy": 0.787492960691452, | |
| "num_tokens": 11464142.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5329896062612534, | |
| "epoch": 2.6156716417910446, | |
| "grad_norm": 0.18406495451927185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365204215049744, | |
| "mean_token_accuracy": 0.7818106710910797, | |
| "num_tokens": 11480468.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5018042698502541, | |
| "epoch": 2.6194029850746268, | |
| "grad_norm": 0.1559264361858368, | |
| "learning_rate": 0.0002, | |
| "loss": 0.507462203502655, | |
| "mean_token_accuracy": 0.7951454520225525, | |
| "num_tokens": 11496824.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.5304955393075943, | |
| "epoch": 2.623134328358209, | |
| "grad_norm": 0.16140370070934296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346159338951111, | |
| "mean_token_accuracy": 0.7851942926645279, | |
| "num_tokens": 11513567.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5185345709323883, | |
| "epoch": 2.626865671641791, | |
| "grad_norm": 0.16598905622959137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5121718645095825, | |
| "mean_token_accuracy": 0.7958889752626419, | |
| "num_tokens": 11530042.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5373921394348145, | |
| "epoch": 2.6305970149253732, | |
| "grad_norm": 0.18821974098682404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302144289016724, | |
| "mean_token_accuracy": 0.7860950380563736, | |
| "num_tokens": 11546594.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5182069316506386, | |
| "epoch": 2.6343283582089554, | |
| "grad_norm": 0.17032590508460999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5235993266105652, | |
| "mean_token_accuracy": 0.7881369441747665, | |
| "num_tokens": 11562996.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.5120366662740707, | |
| "epoch": 2.638059701492537, | |
| "grad_norm": 0.20226538181304932, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5154089331626892, | |
| "mean_token_accuracy": 0.7893324643373489, | |
| "num_tokens": 11579247.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5271363854408264, | |
| "epoch": 2.6417910447761193, | |
| "grad_norm": 0.2367754727602005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529344916343689, | |
| "mean_token_accuracy": 0.7863059490919113, | |
| "num_tokens": 11595557.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5211906433105469, | |
| "epoch": 2.6455223880597014, | |
| "grad_norm": 0.17606736719608307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5162103176116943, | |
| "mean_token_accuracy": 0.7936627119779587, | |
| "num_tokens": 11612153.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5413748621940613, | |
| "epoch": 2.6492537313432836, | |
| "grad_norm": 0.16839931905269623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375933051109314, | |
| "mean_token_accuracy": 0.7837605625391006, | |
| "num_tokens": 11628672.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5492138266563416, | |
| "epoch": 2.6529850746268657, | |
| "grad_norm": 0.1578325480222702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387027263641357, | |
| "mean_token_accuracy": 0.7828567028045654, | |
| "num_tokens": 11645327.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5294462591409683, | |
| "epoch": 2.656716417910448, | |
| "grad_norm": 0.18846334517002106, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310033559799194, | |
| "mean_token_accuracy": 0.7850282490253448, | |
| "num_tokens": 11661886.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5195821523666382, | |
| "epoch": 2.66044776119403, | |
| "grad_norm": 0.1722957044839859, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247335433959961, | |
| "mean_token_accuracy": 0.7882849276065826, | |
| "num_tokens": 11678052.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5254689157009125, | |
| "epoch": 2.664179104477612, | |
| "grad_norm": 0.175649493932724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5303612947463989, | |
| "mean_token_accuracy": 0.7877318859100342, | |
| "num_tokens": 11694539.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5156526416540146, | |
| "epoch": 2.667910447761194, | |
| "grad_norm": 0.21296396851539612, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5188760161399841, | |
| "mean_token_accuracy": 0.7886723130941391, | |
| "num_tokens": 11710806.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5304235517978668, | |
| "epoch": 2.671641791044776, | |
| "grad_norm": 0.1557040810585022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532120943069458, | |
| "mean_token_accuracy": 0.7845920920372009, | |
| "num_tokens": 11727178.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5396947711706161, | |
| "epoch": 2.675373134328358, | |
| "grad_norm": 0.23430386185646057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410381555557251, | |
| "mean_token_accuracy": 0.7820145785808563, | |
| "num_tokens": 11743592.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5290116220712662, | |
| "epoch": 2.6791044776119404, | |
| "grad_norm": 0.18491677939891815, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220689177513123, | |
| "mean_token_accuracy": 0.7880972176790237, | |
| "num_tokens": 11759881.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5365530252456665, | |
| "epoch": 2.6828358208955225, | |
| "grad_norm": 0.20658747851848602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274034738540649, | |
| "mean_token_accuracy": 0.7877165377140045, | |
| "num_tokens": 11776103.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.5193691104650497, | |
| "epoch": 2.6865671641791042, | |
| "grad_norm": 0.15166765451431274, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5179476737976074, | |
| "mean_token_accuracy": 0.7924929708242416, | |
| "num_tokens": 11792614.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5238720774650574, | |
| "epoch": 2.6902985074626864, | |
| "grad_norm": 0.2068144679069519, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365906953811646, | |
| "mean_token_accuracy": 0.7825643718242645, | |
| "num_tokens": 11808884.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5160530805587769, | |
| "epoch": 2.6940298507462686, | |
| "grad_norm": 0.1884981393814087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5255499482154846, | |
| "mean_token_accuracy": 0.785829171538353, | |
| "num_tokens": 11825190.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5381662398576736, | |
| "epoch": 2.6977611940298507, | |
| "grad_norm": 0.22528207302093506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401077270507812, | |
| "mean_token_accuracy": 0.780912771821022, | |
| "num_tokens": 11841581.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5353066176176071, | |
| "epoch": 2.701492537313433, | |
| "grad_norm": 0.16518141329288483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283069014549255, | |
| "mean_token_accuracy": 0.7859592884778976, | |
| "num_tokens": 11857924.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5316939651966095, | |
| "epoch": 2.705223880597015, | |
| "grad_norm": 0.1674748808145523, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228734016418457, | |
| "mean_token_accuracy": 0.7879570424556732, | |
| "num_tokens": 11874385.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.5669917911291122, | |
| "epoch": 2.708955223880597, | |
| "grad_norm": 0.18983666598796844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586099624633789, | |
| "mean_token_accuracy": 0.7734153866767883, | |
| "num_tokens": 11890893.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5250157564878464, | |
| "epoch": 2.7126865671641793, | |
| "grad_norm": 0.16966547071933746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228544473648071, | |
| "mean_token_accuracy": 0.7863233536481857, | |
| "num_tokens": 11907436.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5265001058578491, | |
| "epoch": 2.716417910447761, | |
| "grad_norm": 0.21439625322818756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315214991569519, | |
| "mean_token_accuracy": 0.7847255766391754, | |
| "num_tokens": 11923778.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5284342169761658, | |
| "epoch": 2.720149253731343, | |
| "grad_norm": 0.1824498325586319, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404508709907532, | |
| "mean_token_accuracy": 0.7798212766647339, | |
| "num_tokens": 11940075.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.501299723982811, | |
| "epoch": 2.7238805970149254, | |
| "grad_norm": 0.2304428666830063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5122545957565308, | |
| "mean_token_accuracy": 0.791194960474968, | |
| "num_tokens": 11956336.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5443384349346161, | |
| "epoch": 2.7276119402985075, | |
| "grad_norm": 0.1537434458732605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363157987594604, | |
| "mean_token_accuracy": 0.7845837771892548, | |
| "num_tokens": 11972840.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5315753519535065, | |
| "epoch": 2.7313432835820897, | |
| "grad_norm": 0.17106328904628754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220600366592407, | |
| "mean_token_accuracy": 0.7875728458166122, | |
| "num_tokens": 11989350.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5302078127861023, | |
| "epoch": 2.7350746268656714, | |
| "grad_norm": 0.17003247141838074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5270202159881592, | |
| "mean_token_accuracy": 0.787715807557106, | |
| "num_tokens": 12005809.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.527949333190918, | |
| "epoch": 2.7388059701492535, | |
| "grad_norm": 0.21327127516269684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354670882225037, | |
| "mean_token_accuracy": 0.7835386097431183, | |
| "num_tokens": 12022336.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5089609026908875, | |
| "epoch": 2.7425373134328357, | |
| "grad_norm": 0.16088151931762695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5117763876914978, | |
| "mean_token_accuracy": 0.7938453704118729, | |
| "num_tokens": 12038779.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5126267448067665, | |
| "epoch": 2.746268656716418, | |
| "grad_norm": 0.1757761836051941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5135779976844788, | |
| "mean_token_accuracy": 0.7931608110666275, | |
| "num_tokens": 12054869.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5239577889442444, | |
| "epoch": 2.75, | |
| "grad_norm": 0.1817576140165329, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234410762786865, | |
| "mean_token_accuracy": 0.7875021547079086, | |
| "num_tokens": 12071361.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5307980924844742, | |
| "epoch": 2.753731343283582, | |
| "grad_norm": 0.1653635948896408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298102498054504, | |
| "mean_token_accuracy": 0.7864446491003036, | |
| "num_tokens": 12087634.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5222239643335342, | |
| "epoch": 2.7574626865671643, | |
| "grad_norm": 0.18040236830711365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258353352546692, | |
| "mean_token_accuracy": 0.7891390025615692, | |
| "num_tokens": 12103943.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5332596972584724, | |
| "epoch": 2.7611940298507465, | |
| "grad_norm": 0.15495066344738007, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282677412033081, | |
| "mean_token_accuracy": 0.785639688372612, | |
| "num_tokens": 12120325.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5371799468994141, | |
| "epoch": 2.7649253731343286, | |
| "grad_norm": 0.17130646109580994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295438170433044, | |
| "mean_token_accuracy": 0.7828952521085739, | |
| "num_tokens": 12136761.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5405760109424591, | |
| "epoch": 2.7686567164179103, | |
| "grad_norm": 0.16763344407081604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373218655586243, | |
| "mean_token_accuracy": 0.7816964089870453, | |
| "num_tokens": 12153043.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5118273198604584, | |
| "epoch": 2.7723880597014925, | |
| "grad_norm": 0.17398576438426971, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5121888518333435, | |
| "mean_token_accuracy": 0.7949073165655136, | |
| "num_tokens": 12169387.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5252756625413895, | |
| "epoch": 2.7761194029850746, | |
| "grad_norm": 0.20275278389453888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319023132324219, | |
| "mean_token_accuracy": 0.7827770113945007, | |
| "num_tokens": 12185773.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5281336456537247, | |
| "epoch": 2.779850746268657, | |
| "grad_norm": 0.16486869752407074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282880663871765, | |
| "mean_token_accuracy": 0.7841639369726181, | |
| "num_tokens": 12202185.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5157778561115265, | |
| "epoch": 2.783582089552239, | |
| "grad_norm": 0.1883569210767746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5159796476364136, | |
| "mean_token_accuracy": 0.791821077466011, | |
| "num_tokens": 12218279.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5459621995687485, | |
| "epoch": 2.7873134328358207, | |
| "grad_norm": 0.15937039256095886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399669408798218, | |
| "mean_token_accuracy": 0.7847357988357544, | |
| "num_tokens": 12234867.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.52740877866745, | |
| "epoch": 2.791044776119403, | |
| "grad_norm": 0.14844611287117004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260165929794312, | |
| "mean_token_accuracy": 0.7880454957485199, | |
| "num_tokens": 12251419.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.5150434598326683, | |
| "epoch": 2.794776119402985, | |
| "grad_norm": 0.16429124772548676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5152871012687683, | |
| "mean_token_accuracy": 0.7888982892036438, | |
| "num_tokens": 12267583.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5261992961168289, | |
| "epoch": 2.798507462686567, | |
| "grad_norm": 0.18603260815143585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299534201622009, | |
| "mean_token_accuracy": 0.7854207009077072, | |
| "num_tokens": 12284129.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.529946893453598, | |
| "epoch": 2.8022388059701493, | |
| "grad_norm": 0.18355652689933777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360465049743652, | |
| "mean_token_accuracy": 0.7842213064432144, | |
| "num_tokens": 12300683.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5377232730388641, | |
| "epoch": 2.8059701492537314, | |
| "grad_norm": 0.17548733949661255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429165363311768, | |
| "mean_token_accuracy": 0.7822890281677246, | |
| "num_tokens": 12316833.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5407239943742752, | |
| "epoch": 2.8097014925373136, | |
| "grad_norm": 0.17476212978363037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398030281066895, | |
| "mean_token_accuracy": 0.7804454267024994, | |
| "num_tokens": 12333283.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.520610861480236, | |
| "epoch": 2.8134328358208958, | |
| "grad_norm": 0.15137535333633423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5157968401908875, | |
| "mean_token_accuracy": 0.7898696959018707, | |
| "num_tokens": 12349570.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5343620032072067, | |
| "epoch": 2.8171641791044775, | |
| "grad_norm": 0.16463439166545868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5255429148674011, | |
| "mean_token_accuracy": 0.7910490483045578, | |
| "num_tokens": 12366111.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5226383879780769, | |
| "epoch": 2.8208955223880596, | |
| "grad_norm": 0.17591623961925507, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295028686523438, | |
| "mean_token_accuracy": 0.7862412929534912, | |
| "num_tokens": 12382176.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.5329883769154549, | |
| "epoch": 2.824626865671642, | |
| "grad_norm": 0.17046134173870087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395819544792175, | |
| "mean_token_accuracy": 0.7815450727939606, | |
| "num_tokens": 12398954.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5189251601696014, | |
| "epoch": 2.828358208955224, | |
| "grad_norm": 0.17623355984687805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5211597681045532, | |
| "mean_token_accuracy": 0.7862699329853058, | |
| "num_tokens": 12415518.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5435206592082977, | |
| "epoch": 2.832089552238806, | |
| "grad_norm": 0.16461242735385895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449641346931458, | |
| "mean_token_accuracy": 0.7772939503192902, | |
| "num_tokens": 12431840.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5242071002721786, | |
| "epoch": 2.835820895522388, | |
| "grad_norm": 0.16906797885894775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5236470103263855, | |
| "mean_token_accuracy": 0.7878623157739639, | |
| "num_tokens": 12447985.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5331535488367081, | |
| "epoch": 2.83955223880597, | |
| "grad_norm": 0.1613229662179947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5270719528198242, | |
| "mean_token_accuracy": 0.7869479656219482, | |
| "num_tokens": 12464369.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5153749734163284, | |
| "epoch": 2.843283582089552, | |
| "grad_norm": 0.1861318051815033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5134626626968384, | |
| "mean_token_accuracy": 0.7917421609163284, | |
| "num_tokens": 12480705.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5185382887721062, | |
| "epoch": 2.8470149253731343, | |
| "grad_norm": 0.15517400205135345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.520057201385498, | |
| "mean_token_accuracy": 0.7887658178806305, | |
| "num_tokens": 12496768.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.525531992316246, | |
| "epoch": 2.8507462686567164, | |
| "grad_norm": 0.2088494747877121, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5236872434616089, | |
| "mean_token_accuracy": 0.7884621620178223, | |
| "num_tokens": 12513264.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.516917809844017, | |
| "epoch": 2.8544776119402986, | |
| "grad_norm": 0.1747450977563858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234484076499939, | |
| "mean_token_accuracy": 0.7843039780855179, | |
| "num_tokens": 12529856.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5171080678701401, | |
| "epoch": 2.8582089552238807, | |
| "grad_norm": 0.17318587005138397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.520793080329895, | |
| "mean_token_accuracy": 0.7862659096717834, | |
| "num_tokens": 12546530.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.540691614151001, | |
| "epoch": 2.861940298507463, | |
| "grad_norm": 0.15875069797039032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400336384773254, | |
| "mean_token_accuracy": 0.7827646285295486, | |
| "num_tokens": 12563086.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5084429755806923, | |
| "epoch": 2.8656716417910446, | |
| "grad_norm": 0.14828889071941376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5024577379226685, | |
| "mean_token_accuracy": 0.7963315397500992, | |
| "num_tokens": 12579183.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5370931923389435, | |
| "epoch": 2.8694029850746268, | |
| "grad_norm": 0.14752823114395142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261865854263306, | |
| "mean_token_accuracy": 0.7877734899520874, | |
| "num_tokens": 12596077.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5546486079692841, | |
| "epoch": 2.873134328358209, | |
| "grad_norm": 0.1517077535390854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500649809837341, | |
| "mean_token_accuracy": 0.7785899043083191, | |
| "num_tokens": 12612620.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5144929736852646, | |
| "epoch": 2.876865671641791, | |
| "grad_norm": 0.18645553290843964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5184378623962402, | |
| "mean_token_accuracy": 0.7887341529130936, | |
| "num_tokens": 12628974.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5363174676895142, | |
| "epoch": 2.8805970149253732, | |
| "grad_norm": 0.173641175031662, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404868125915527, | |
| "mean_token_accuracy": 0.7838273793458939, | |
| "num_tokens": 12645473.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5220237821340561, | |
| "epoch": 2.8843283582089554, | |
| "grad_norm": 0.1810951977968216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300620794296265, | |
| "mean_token_accuracy": 0.7870841026306152, | |
| "num_tokens": 12661871.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5215499252080917, | |
| "epoch": 2.888059701492537, | |
| "grad_norm": 0.17195403575897217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228441953659058, | |
| "mean_token_accuracy": 0.7888252288103104, | |
| "num_tokens": 12678403.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5262960642576218, | |
| "epoch": 2.8917910447761193, | |
| "grad_norm": 0.16115020215511322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279878973960876, | |
| "mean_token_accuracy": 0.7827633023262024, | |
| "num_tokens": 12694636.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5458672344684601, | |
| "epoch": 2.8955223880597014, | |
| "grad_norm": 0.18671803176403046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379894971847534, | |
| "mean_token_accuracy": 0.7803581058979034, | |
| "num_tokens": 12711335.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5334444046020508, | |
| "epoch": 2.8992537313432836, | |
| "grad_norm": 0.16968129575252533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301728248596191, | |
| "mean_token_accuracy": 0.7843312919139862, | |
| "num_tokens": 12727428.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5264092683792114, | |
| "epoch": 2.9029850746268657, | |
| "grad_norm": 0.17358112335205078, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304536819458008, | |
| "mean_token_accuracy": 0.7818145751953125, | |
| "num_tokens": 12743928.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.521320641040802, | |
| "epoch": 2.906716417910448, | |
| "grad_norm": 0.19404703378677368, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308122038841248, | |
| "mean_token_accuracy": 0.7851481735706329, | |
| "num_tokens": 12760425.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.5253891497850418, | |
| "epoch": 2.91044776119403, | |
| "grad_norm": 0.23603156208992004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537718653678894, | |
| "mean_token_accuracy": 0.7832214832305908, | |
| "num_tokens": 12776783.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5522697567939758, | |
| "epoch": 2.914179104477612, | |
| "grad_norm": 0.16655920445919037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428380966186523, | |
| "mean_token_accuracy": 0.7817497551441193, | |
| "num_tokens": 12793260.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5386251360177994, | |
| "epoch": 2.917910447761194, | |
| "grad_norm": 0.17462746798992157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5273305773735046, | |
| "mean_token_accuracy": 0.7866194099187851, | |
| "num_tokens": 12809754.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5417182147502899, | |
| "epoch": 2.921641791044776, | |
| "grad_norm": 0.16420036554336548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311017632484436, | |
| "mean_token_accuracy": 0.7847865968942642, | |
| "num_tokens": 12826135.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5094658881425858, | |
| "epoch": 2.925373134328358, | |
| "grad_norm": 0.209514319896698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230738520622253, | |
| "mean_token_accuracy": 0.7901812642812729, | |
| "num_tokens": 12842378.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5122962892055511, | |
| "epoch": 2.9291044776119404, | |
| "grad_norm": 0.17986896634101868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5213406682014465, | |
| "mean_token_accuracy": 0.7899868190288544, | |
| "num_tokens": 12858715.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5239143073558807, | |
| "epoch": 2.9328358208955225, | |
| "grad_norm": 0.17349380254745483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260440707206726, | |
| "mean_token_accuracy": 0.7880281209945679, | |
| "num_tokens": 12875134.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5183478370308876, | |
| "epoch": 2.9365671641791042, | |
| "grad_norm": 0.15738630294799805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5146017074584961, | |
| "mean_token_accuracy": 0.7944561541080475, | |
| "num_tokens": 12891435.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5321111530065536, | |
| "epoch": 2.9402985074626864, | |
| "grad_norm": 0.169599249958992, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332249402999878, | |
| "mean_token_accuracy": 0.7841628640890121, | |
| "num_tokens": 12907955.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5348423272371292, | |
| "epoch": 2.9440298507462686, | |
| "grad_norm": 0.1703958362340927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319628715515137, | |
| "mean_token_accuracy": 0.7853727787733078, | |
| "num_tokens": 12924187.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5348647981882095, | |
| "epoch": 2.9477611940298507, | |
| "grad_norm": 0.16257572174072266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274540185928345, | |
| "mean_token_accuracy": 0.7864417731761932, | |
| "num_tokens": 12940471.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5246876776218414, | |
| "epoch": 2.951492537313433, | |
| "grad_norm": 0.21989069879055023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532191276550293, | |
| "mean_token_accuracy": 0.7841058969497681, | |
| "num_tokens": 12956753.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5206954181194305, | |
| "epoch": 2.955223880597015, | |
| "grad_norm": 0.18530453741550446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260450839996338, | |
| "mean_token_accuracy": 0.7853500992059708, | |
| "num_tokens": 12972983.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5218585133552551, | |
| "epoch": 2.958955223880597, | |
| "grad_norm": 0.19632470607757568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524539589881897, | |
| "mean_token_accuracy": 0.7870173752307892, | |
| "num_tokens": 12989538.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.5301937758922577, | |
| "epoch": 2.9626865671641793, | |
| "grad_norm": 0.1759789139032364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322460532188416, | |
| "mean_token_accuracy": 0.7846620082855225, | |
| "num_tokens": 13005865.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.5316169708967209, | |
| "epoch": 2.966417910447761, | |
| "grad_norm": 0.18013249337673187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267240405082703, | |
| "mean_token_accuracy": 0.7860967516899109, | |
| "num_tokens": 13022162.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5342477560043335, | |
| "epoch": 2.970149253731343, | |
| "grad_norm": 0.15967167913913727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531574010848999, | |
| "mean_token_accuracy": 0.7845140397548676, | |
| "num_tokens": 13038634.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.5358534008264542, | |
| "epoch": 2.9738805970149254, | |
| "grad_norm": 0.18192364275455475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531234085559845, | |
| "mean_token_accuracy": 0.7822518199682236, | |
| "num_tokens": 13054913.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5332595482468605, | |
| "epoch": 2.9776119402985075, | |
| "grad_norm": 0.16098462045192719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331971645355225, | |
| "mean_token_accuracy": 0.7841719388961792, | |
| "num_tokens": 13071687.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5196807980537415, | |
| "epoch": 2.9813432835820897, | |
| "grad_norm": 0.16396892070770264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5180687308311462, | |
| "mean_token_accuracy": 0.79112908244133, | |
| "num_tokens": 13088263.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5160314440727234, | |
| "epoch": 2.9850746268656714, | |
| "grad_norm": 0.18938018381595612, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278008580207825, | |
| "mean_token_accuracy": 0.7868732959032059, | |
| "num_tokens": 13104420.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5099834352731705, | |
| "epoch": 2.9888059701492535, | |
| "grad_norm": 0.18755869567394257, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5147690176963806, | |
| "mean_token_accuracy": 0.790816992521286, | |
| "num_tokens": 13120862.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.5440191924571991, | |
| "epoch": 2.9925373134328357, | |
| "grad_norm": 0.16148996353149414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402988195419312, | |
| "mean_token_accuracy": 0.7817222625017166, | |
| "num_tokens": 13137523.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.5369501113891602, | |
| "epoch": 2.996268656716418, | |
| "grad_norm": 0.17043927311897278, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288562178611755, | |
| "mean_token_accuracy": 0.7866682559251785, | |
| "num_tokens": 13153684.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.5347233563661575, | |
| "epoch": 3.0, | |
| "grad_norm": 0.17972980439662933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365173816680908, | |
| "mean_token_accuracy": 0.782272219657898, | |
| "num_tokens": 13170027.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2276685185818296e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |