Instructions for using eac123/clean-subliminal-learning-wolves with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.
- Libraries
  - PEFT
How to use eac123/clean-subliminal-learning-wolves with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/clean-subliminal-learning-wolves")
```

A fuller inference sketch follows the list below.

- Notebooks
  - Google Colab
  - Kaggle
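Once the adapter is attached, the model can be queried like any other `transformers` causal LM. A minimal sketch, assuming the Qwen2.5-14B-Instruct chat template applies; the prompt, dtype, device placement, and generation settings are illustrative assumptions, not part of the original instructions:

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model and attach the PEFT adapter.
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-14B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, "eac123/clean-subliminal-learning-wolves")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-14B-Instruct")

# Build a chat-formatted prompt (the question itself is illustrative).
messages = [{"role": "user", "content": "What is your favorite animal?"}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Greedy decoding keeps the example deterministic; adjust as needed.
outputs = model.generate(inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))
```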
The remainder of the page is the raw Hugging Face Trainer state (`trainer_state.json`) as rendered by the file viewer; the table markup has been stripped and the long per-step log condensed here. Top-level fields:

```json
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 804,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [ ... ]
}
```

`log_history` holds one entry per optimizer step, each with the keys `entropy`, `epoch`, `grad_norm`, `learning_rate`, `loss`, `mean_token_accuracy`, `num_tokens`, and `step`. The excerpt reproduced on this page breaks off mid-entry just after step 204 of 804 (epochs 0.004 to roughly 0.77). Over that span the learning rate stays constant at 2e-4, the training loss falls from 2.43 at step 1 to about 0.55 by step 200, mean token accuracy rises from 0.54 to about 0.78, and the gradient norm decays from about 0.41 toward 0.03, apart from a brief spike to 1.87 at step 13. The first and last complete entries shown:

```json
{
  "entropy": 1.126197099685669,
  "epoch": 0.003738317757009346,
  "grad_norm": 0.4137735962867737,
  "learning_rate": 0.0002,
  "loss": 2.431535243988037,
  "mean_token_accuracy": 0.54428631067276,
  "num_tokens": 16465.0,
  "step": 1
}
```

```json
{
  "entropy": 0.5482725948095322,
  "epoch": 0.7626168224299066,
  "grad_norm": 0.031571801751852036,
  "learning_rate": 0.0002,
  "loss": 0.5515249967575073,
  "mean_token_accuracy": 0.7790126353502274,
  "num_tokens": 3329137.0,
  "step": 204
}
```
| "mean_token_accuracy": 0.7717642784118652, | |
| "num_tokens": 3345223.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5403945446014404, | |
| "epoch": 0.7700934579439253, | |
| "grad_norm": 0.03444300964474678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441574454307556, | |
| "mean_token_accuracy": 0.7791598290205002, | |
| "num_tokens": 3361512.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5523678362369537, | |
| "epoch": 0.7738317757009345, | |
| "grad_norm": 0.027761496603488922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582634210586548, | |
| "mean_token_accuracy": 0.7723374962806702, | |
| "num_tokens": 3377859.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5723598301410675, | |
| "epoch": 0.7775700934579439, | |
| "grad_norm": 0.028997788205742836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5705980658531189, | |
| "mean_token_accuracy": 0.7668357789516449, | |
| "num_tokens": 3394399.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5796838849782944, | |
| "epoch": 0.7813084112149533, | |
| "grad_norm": 0.03271174803376198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5698305368423462, | |
| "mean_token_accuracy": 0.7698051035404205, | |
| "num_tokens": 3410824.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5651015788316727, | |
| "epoch": 0.7850467289719626, | |
| "grad_norm": 0.031869035214185715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655361413955688, | |
| "mean_token_accuracy": 0.7697497308254242, | |
| "num_tokens": 3426955.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5639242976903915, | |
| "epoch": 0.788785046728972, | |
| "grad_norm": 0.026541458442807198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636979341506958, | |
| "mean_token_accuracy": 0.7697752565145493, | |
| "num_tokens": 3443406.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5432985424995422, | |
| "epoch": 0.7925233644859813, | |
| "grad_norm": 0.032391466200351715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466354489326477, | |
| "mean_token_accuracy": 0.7787620276212692, | |
| "num_tokens": 3459857.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.546247586607933, | |
| "epoch": 0.7962616822429907, | |
| "grad_norm": 0.03624865412712097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477287769317627, | |
| "mean_token_accuracy": 0.7784061878919601, | |
| "num_tokens": 3476064.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5712321698665619, | |
| "epoch": 0.8, | |
| "grad_norm": 0.027368342503905296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628222823143005, | |
| "mean_token_accuracy": 0.7711902260780334, | |
| "num_tokens": 3492569.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5511522740125656, | |
| "epoch": 0.8037383177570093, | |
| "grad_norm": 0.0314224548637867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546245813369751, | |
| "mean_token_accuracy": 0.777819886803627, | |
| "num_tokens": 3508946.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5641316920518875, | |
| "epoch": 0.8074766355140187, | |
| "grad_norm": 0.02934875525534153, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5656546354293823, | |
| "mean_token_accuracy": 0.7672451436519623, | |
| "num_tokens": 3525415.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5616082847118378, | |
| "epoch": 0.811214953271028, | |
| "grad_norm": 0.027262428775429726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606979131698608, | |
| "mean_token_accuracy": 0.7726116627454758, | |
| "num_tokens": 3541513.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5319297313690186, | |
| "epoch": 0.8149532710280374, | |
| "grad_norm": 0.02967401221394539, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409149527549744, | |
| "mean_token_accuracy": 0.7806787341833115, | |
| "num_tokens": 3557840.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5461787581443787, | |
| "epoch": 0.8186915887850468, | |
| "grad_norm": 0.03170184791088104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544174313545227, | |
| "mean_token_accuracy": 0.7753637731075287, | |
| "num_tokens": 3574334.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5393616706132889, | |
| "epoch": 0.822429906542056, | |
| "grad_norm": 0.02985682338476181, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457973480224609, | |
| "mean_token_accuracy": 0.7773662656545639, | |
| "num_tokens": 3590741.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5554001927375793, | |
| "epoch": 0.8261682242990654, | |
| "grad_norm": 0.02711213380098343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555370569229126, | |
| "mean_token_accuracy": 0.7716074883937836, | |
| "num_tokens": 3607018.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5483701378107071, | |
| "epoch": 0.8299065420560747, | |
| "grad_norm": 0.029320966452360153, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421203970909119, | |
| "mean_token_accuracy": 0.7806040942668915, | |
| "num_tokens": 3623209.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5777206718921661, | |
| "epoch": 0.8336448598130841, | |
| "grad_norm": 0.030610879883170128, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5738532543182373, | |
| "mean_token_accuracy": 0.7664468586444855, | |
| "num_tokens": 3639406.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5567807406187057, | |
| "epoch": 0.8373831775700935, | |
| "grad_norm": 0.028399785980582237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526878237724304, | |
| "mean_token_accuracy": 0.773535892367363, | |
| "num_tokens": 3655602.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.530220165848732, | |
| "epoch": 0.8411214953271028, | |
| "grad_norm": 0.03518186882138252, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408585667610168, | |
| "mean_token_accuracy": 0.779409795999527, | |
| "num_tokens": 3671905.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5535659790039062, | |
| "epoch": 0.8448598130841122, | |
| "grad_norm": 0.03929230943322182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663979053497314, | |
| "mean_token_accuracy": 0.7698138654232025, | |
| "num_tokens": 3688191.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.569505363702774, | |
| "epoch": 0.8485981308411215, | |
| "grad_norm": 0.0272939745336771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618590712547302, | |
| "mean_token_accuracy": 0.7725658267736435, | |
| "num_tokens": 3704751.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5644249469041824, | |
| "epoch": 0.8523364485981308, | |
| "grad_norm": 0.03415616601705551, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562848448753357, | |
| "mean_token_accuracy": 0.7748490273952484, | |
| "num_tokens": 3720710.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5773901343345642, | |
| "epoch": 0.8560747663551402, | |
| "grad_norm": 0.031880877912044525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614221096038818, | |
| "mean_token_accuracy": 0.7720403522253036, | |
| "num_tokens": 3737054.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5547749698162079, | |
| "epoch": 0.8598130841121495, | |
| "grad_norm": 0.0324094183743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520619750022888, | |
| "mean_token_accuracy": 0.7773038893938065, | |
| "num_tokens": 3753537.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5418203920125961, | |
| "epoch": 0.8635514018691589, | |
| "grad_norm": 0.03512468561530113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538347959518433, | |
| "mean_token_accuracy": 0.7749911546707153, | |
| "num_tokens": 3769863.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5521644353866577, | |
| "epoch": 0.8672897196261682, | |
| "grad_norm": 0.02896721474826336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608810186386108, | |
| "mean_token_accuracy": 0.7746408581733704, | |
| "num_tokens": 3786316.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.543023481965065, | |
| "epoch": 0.8710280373831776, | |
| "grad_norm": 0.03712921962141991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551246404647827, | |
| "mean_token_accuracy": 0.7738360315561295, | |
| "num_tokens": 3802441.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5672542154788971, | |
| "epoch": 0.874766355140187, | |
| "grad_norm": 0.026832984760403633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5662351846694946, | |
| "mean_token_accuracy": 0.7704236954450607, | |
| "num_tokens": 3818851.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5710914433002472, | |
| "epoch": 0.8785046728971962, | |
| "grad_norm": 0.036441151052713394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5647166967391968, | |
| "mean_token_accuracy": 0.7697651982307434, | |
| "num_tokens": 3835229.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5721132010221481, | |
| "epoch": 0.8822429906542056, | |
| "grad_norm": 0.031891413033008575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561801552772522, | |
| "mean_token_accuracy": 0.7740357220172882, | |
| "num_tokens": 3851634.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5430081784725189, | |
| "epoch": 0.8859813084112149, | |
| "grad_norm": 0.028133288025856018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482598543167114, | |
| "mean_token_accuracy": 0.7780391424894333, | |
| "num_tokens": 3867818.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5531598627567291, | |
| "epoch": 0.8897196261682243, | |
| "grad_norm": 0.031570907682180405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597803592681885, | |
| "mean_token_accuracy": 0.7725805789232254, | |
| "num_tokens": 3884128.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.552057608962059, | |
| "epoch": 0.8934579439252337, | |
| "grad_norm": 0.03431302309036255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592586398124695, | |
| "mean_token_accuracy": 0.7739444822072983, | |
| "num_tokens": 3900459.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.552062600851059, | |
| "epoch": 0.897196261682243, | |
| "grad_norm": 0.029298607259988785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525797009468079, | |
| "mean_token_accuracy": 0.7755719870328903, | |
| "num_tokens": 3916582.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.571002647280693, | |
| "epoch": 0.9009345794392524, | |
| "grad_norm": 0.028903625905513763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5647273659706116, | |
| "mean_token_accuracy": 0.7697427272796631, | |
| "num_tokens": 3932989.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5607190132141113, | |
| "epoch": 0.9046728971962616, | |
| "grad_norm": 0.02721545286476612, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572564601898193, | |
| "mean_token_accuracy": 0.7735343724489212, | |
| "num_tokens": 3949591.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.554363563656807, | |
| "epoch": 0.908411214953271, | |
| "grad_norm": 0.028853297233581543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598585605621338, | |
| "mean_token_accuracy": 0.7746720314025879, | |
| "num_tokens": 3965977.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.562399297952652, | |
| "epoch": 0.9121495327102803, | |
| "grad_norm": 0.031765274703502655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609657764434814, | |
| "mean_token_accuracy": 0.7706955671310425, | |
| "num_tokens": 3982241.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5663948059082031, | |
| "epoch": 0.9158878504672897, | |
| "grad_norm": 0.02977531962096691, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600242018699646, | |
| "mean_token_accuracy": 0.7716616988182068, | |
| "num_tokens": 3998850.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5626737624406815, | |
| "epoch": 0.9196261682242991, | |
| "grad_norm": 0.03073737397789955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5680803656578064, | |
| "mean_token_accuracy": 0.7690348774194717, | |
| "num_tokens": 4015357.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5617063343524933, | |
| "epoch": 0.9233644859813084, | |
| "grad_norm": 0.03239826485514641, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5647311210632324, | |
| "mean_token_accuracy": 0.7720029205083847, | |
| "num_tokens": 4031434.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5446989983320236, | |
| "epoch": 0.9271028037383178, | |
| "grad_norm": 0.026935769245028496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423059463500977, | |
| "mean_token_accuracy": 0.7784274518489838, | |
| "num_tokens": 4047542.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5633901953697205, | |
| "epoch": 0.930841121495327, | |
| "grad_norm": 0.03004775382578373, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547890663146973, | |
| "mean_token_accuracy": 0.7750878036022186, | |
| "num_tokens": 4063671.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5641201138496399, | |
| "epoch": 0.9345794392523364, | |
| "grad_norm": 0.035040173679590225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560414731502533, | |
| "mean_token_accuracy": 0.7721855938434601, | |
| "num_tokens": 4080062.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5267122685909271, | |
| "epoch": 0.9383177570093458, | |
| "grad_norm": 0.026784395799040794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.528884768486023, | |
| "mean_token_accuracy": 0.7842623591423035, | |
| "num_tokens": 4096314.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5412785857915878, | |
| "epoch": 0.9420560747663551, | |
| "grad_norm": 0.029483763501048088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475237369537354, | |
| "mean_token_accuracy": 0.7779380232095718, | |
| "num_tokens": 4112543.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5688454955816269, | |
| "epoch": 0.9457943925233645, | |
| "grad_norm": 0.02722441591322422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5703037977218628, | |
| "mean_token_accuracy": 0.7700005024671555, | |
| "num_tokens": 4128880.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5569160729646683, | |
| "epoch": 0.9495327102803738, | |
| "grad_norm": 0.028683314099907875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574289560317993, | |
| "mean_token_accuracy": 0.7722644209861755, | |
| "num_tokens": 4145417.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5437170565128326, | |
| "epoch": 0.9532710280373832, | |
| "grad_norm": 0.03323707729578018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411959886550903, | |
| "mean_token_accuracy": 0.7814441025257111, | |
| "num_tokens": 4161528.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5666731148958206, | |
| "epoch": 0.9570093457943926, | |
| "grad_norm": 0.028484966605901718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5648545622825623, | |
| "mean_token_accuracy": 0.77223140001297, | |
| "num_tokens": 4177883.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5472739338874817, | |
| "epoch": 0.9607476635514018, | |
| "grad_norm": 0.032945599406957626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465376377105713, | |
| "mean_token_accuracy": 0.7768394351005554, | |
| "num_tokens": 4194047.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5488951653242111, | |
| "epoch": 0.9644859813084112, | |
| "grad_norm": 0.030117738991975784, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551251769065857, | |
| "mean_token_accuracy": 0.7728994339704514, | |
| "num_tokens": 4210415.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5574130117893219, | |
| "epoch": 0.9682242990654205, | |
| "grad_norm": 0.028586212545633316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596088171005249, | |
| "mean_token_accuracy": 0.7760643810033798, | |
| "num_tokens": 4226881.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5550301373004913, | |
| "epoch": 0.9719626168224299, | |
| "grad_norm": 0.035784922540187836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5660927891731262, | |
| "mean_token_accuracy": 0.7692493498325348, | |
| "num_tokens": 4243149.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5651994347572327, | |
| "epoch": 0.9757009345794393, | |
| "grad_norm": 0.03252053260803223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5599735379219055, | |
| "mean_token_accuracy": 0.7730003446340561, | |
| "num_tokens": 4259611.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5637697577476501, | |
| "epoch": 0.9794392523364486, | |
| "grad_norm": 0.047552503645420074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568199157714844, | |
| "mean_token_accuracy": 0.7762705087661743, | |
| "num_tokens": 4275796.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.567447230219841, | |
| "epoch": 0.983177570093458, | |
| "grad_norm": 0.027801062911748886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5698356032371521, | |
| "mean_token_accuracy": 0.7690239697694778, | |
| "num_tokens": 4292132.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5712171792984009, | |
| "epoch": 0.9869158878504672, | |
| "grad_norm": 0.11246822774410248, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5811023116111755, | |
| "mean_token_accuracy": 0.7647420465946198, | |
| "num_tokens": 4308584.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5711934268474579, | |
| "epoch": 0.9906542056074766, | |
| "grad_norm": 0.06911394000053406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5809019804000854, | |
| "mean_token_accuracy": 0.7624327838420868, | |
| "num_tokens": 4324962.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5627400726079941, | |
| "epoch": 0.994392523364486, | |
| "grad_norm": 0.030455252155661583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5616910457611084, | |
| "mean_token_accuracy": 0.7730111479759216, | |
| "num_tokens": 4341120.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5654444992542267, | |
| "epoch": 0.9981308411214953, | |
| "grad_norm": 0.02772046998143196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567201972007751, | |
| "mean_token_accuracy": 0.7720088213682175, | |
| "num_tokens": 4357574.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5589146912097931, | |
| "epoch": 1.0, | |
| "grad_norm": 0.04032747447490692, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460503101348877, | |
| "mean_token_accuracy": 0.779203861951828, | |
| "num_tokens": 4365546.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5703114420175552, | |
| "epoch": 1.0037383177570094, | |
| "grad_norm": 0.033491045236587524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557507276535034, | |
| "mean_token_accuracy": 0.7745671570301056, | |
| "num_tokens": 4381699.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5609012693166733, | |
| "epoch": 1.0074766355140188, | |
| "grad_norm": 0.03252531215548515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590213537216187, | |
| "mean_token_accuracy": 0.7752612829208374, | |
| "num_tokens": 4398284.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5300652086734772, | |
| "epoch": 1.011214953271028, | |
| "grad_norm": 0.036933887749910355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396179556846619, | |
| "mean_token_accuracy": 0.7816686779260635, | |
| "num_tokens": 4414795.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5411953181028366, | |
| "epoch": 1.0149532710280373, | |
| "grad_norm": 0.035878736525774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491203665733337, | |
| "mean_token_accuracy": 0.7742594629526138, | |
| "num_tokens": 4431190.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5370450466871262, | |
| "epoch": 1.0186915887850467, | |
| "grad_norm": 0.029914801940321922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417315363883972, | |
| "mean_token_accuracy": 0.7806635499000549, | |
| "num_tokens": 4447475.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5567668229341507, | |
| "epoch": 1.0224299065420561, | |
| "grad_norm": 0.03265395388007164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509355068206787, | |
| "mean_token_accuracy": 0.7730302512645721, | |
| "num_tokens": 4463734.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5656838417053223, | |
| "epoch": 1.0261682242990655, | |
| "grad_norm": 0.03136991336941719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576434135437012, | |
| "mean_token_accuracy": 0.7703666239976883, | |
| "num_tokens": 4479995.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.548493430018425, | |
| "epoch": 1.0299065420560747, | |
| "grad_norm": 0.033384647220373154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452391505241394, | |
| "mean_token_accuracy": 0.7803221642971039, | |
| "num_tokens": 4496385.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.547315925359726, | |
| "epoch": 1.033644859813084, | |
| "grad_norm": 0.02812100760638714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515413284301758, | |
| "mean_token_accuracy": 0.7755024433135986, | |
| "num_tokens": 4512779.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5315467417240143, | |
| "epoch": 1.0373831775700935, | |
| "grad_norm": 0.041606683284044266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446295738220215, | |
| "mean_token_accuracy": 0.7787878066301346, | |
| "num_tokens": 4529088.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5279169529676437, | |
| "epoch": 1.0411214953271029, | |
| "grad_norm": 0.031057002022862434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536575973033905, | |
| "mean_token_accuracy": 0.7812807857990265, | |
| "num_tokens": 4545377.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5590710490942001, | |
| "epoch": 1.0448598130841122, | |
| "grad_norm": 0.02644682675600052, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554656982421875, | |
| "mean_token_accuracy": 0.7751928865909576, | |
| "num_tokens": 4561701.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5662561357021332, | |
| "epoch": 1.0485981308411214, | |
| "grad_norm": 0.029125280678272247, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619407892227173, | |
| "mean_token_accuracy": 0.7679703086614609, | |
| "num_tokens": 4578007.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5509714484214783, | |
| "epoch": 1.0523364485981308, | |
| "grad_norm": 0.03366995230317116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544794499874115, | |
| "mean_token_accuracy": 0.7797580361366272, | |
| "num_tokens": 4594260.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5634302496910095, | |
| "epoch": 1.0560747663551402, | |
| "grad_norm": 0.027832867577672005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5580713748931885, | |
| "mean_token_accuracy": 0.7739240676164627, | |
| "num_tokens": 4610748.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.5439006388187408, | |
| "epoch": 1.0598130841121496, | |
| "grad_norm": 0.03045068122446537, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474724173545837, | |
| "mean_token_accuracy": 0.7765053659677505, | |
| "num_tokens": 4627116.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5238615572452545, | |
| "epoch": 1.063551401869159, | |
| "grad_norm": 0.03397069126367569, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532546877861023, | |
| "mean_token_accuracy": 0.7858656197786331, | |
| "num_tokens": 4643480.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5387604683637619, | |
| "epoch": 1.0672897196261681, | |
| "grad_norm": 0.036734551191329956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468651056289673, | |
| "mean_token_accuracy": 0.7797952890396118, | |
| "num_tokens": 4660303.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5558950453996658, | |
| "epoch": 1.0710280373831775, | |
| "grad_norm": 0.030276885256171227, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584522485733032, | |
| "mean_token_accuracy": 0.7732091248035431, | |
| "num_tokens": 4676839.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5617282688617706, | |
| "epoch": 1.074766355140187, | |
| "grad_norm": 0.033773574978113174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567758679389954, | |
| "mean_token_accuracy": 0.7739396244287491, | |
| "num_tokens": 4692959.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5491297841072083, | |
| "epoch": 1.0785046728971963, | |
| "grad_norm": 0.0321025624871254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414766073226929, | |
| "mean_token_accuracy": 0.7804555594921112, | |
| "num_tokens": 4709310.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5456965118646622, | |
| "epoch": 1.0822429906542057, | |
| "grad_norm": 0.029098015278577805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451281070709229, | |
| "mean_token_accuracy": 0.7778134942054749, | |
| "num_tokens": 4725506.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5477775633335114, | |
| "epoch": 1.0859813084112149, | |
| "grad_norm": 0.02958570048213005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455498695373535, | |
| "mean_token_accuracy": 0.7799811661243439, | |
| "num_tokens": 4741775.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5301359370350838, | |
| "epoch": 1.0897196261682243, | |
| "grad_norm": 0.03702852129936218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398594737052917, | |
| "mean_token_accuracy": 0.7832937985658646, | |
| "num_tokens": 4758016.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5263582319021225, | |
| "epoch": 1.0934579439252337, | |
| "grad_norm": 0.0337018184363842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.528889000415802, | |
| "mean_token_accuracy": 0.7862381190061569, | |
| "num_tokens": 4774331.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.5430160015821457, | |
| "epoch": 1.097196261682243, | |
| "grad_norm": 0.036417651921510696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521553158760071, | |
| "mean_token_accuracy": 0.7737599611282349, | |
| "num_tokens": 4790501.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.5552934557199478, | |
| "epoch": 1.1009345794392524, | |
| "grad_norm": 0.03106369823217392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559324622154236, | |
| "mean_token_accuracy": 0.7761313170194626, | |
| "num_tokens": 4806597.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5548459142446518, | |
| "epoch": 1.1046728971962616, | |
| "grad_norm": 0.031152816489338875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504705905914307, | |
| "mean_token_accuracy": 0.7746731489896774, | |
| "num_tokens": 4822650.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5644493997097015, | |
| "epoch": 1.108411214953271, | |
| "grad_norm": 0.030590267851948738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608450770378113, | |
| "mean_token_accuracy": 0.7722194045782089, | |
| "num_tokens": 4839117.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5444105267524719, | |
| "epoch": 1.1121495327102804, | |
| "grad_norm": 0.027887985110282898, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356480479240417, | |
| "mean_token_accuracy": 0.7835922837257385, | |
| "num_tokens": 4855616.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5529257953166962, | |
| "epoch": 1.1158878504672898, | |
| "grad_norm": 0.029403148218989372, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520183444023132, | |
| "mean_token_accuracy": 0.7763603180646896, | |
| "num_tokens": 4871877.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5645637214183807, | |
| "epoch": 1.1196261682242992, | |
| "grad_norm": 0.028178894892334938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597948431968689, | |
| "mean_token_accuracy": 0.7721023112535477, | |
| "num_tokens": 4888211.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5288026034832001, | |
| "epoch": 1.1233644859813083, | |
| "grad_norm": 0.04107068479061127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410320162773132, | |
| "mean_token_accuracy": 0.7809516042470932, | |
| "num_tokens": 4904621.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.539900153875351, | |
| "epoch": 1.1271028037383177, | |
| "grad_norm": 0.029827676713466644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402933955192566, | |
| "mean_token_accuracy": 0.7816860228776932, | |
| "num_tokens": 4921127.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.5498250722885132, | |
| "epoch": 1.1308411214953271, | |
| "grad_norm": 0.026688000187277794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489476323127747, | |
| "mean_token_accuracy": 0.7740818113088608, | |
| "num_tokens": 4937487.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5250164270401001, | |
| "epoch": 1.1345794392523365, | |
| "grad_norm": 0.02805374562740326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5292810797691345, | |
| "mean_token_accuracy": 0.7862300872802734, | |
| "num_tokens": 4953715.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5558099746704102, | |
| "epoch": 1.1383177570093457, | |
| "grad_norm": 0.028311913833022118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553642213344574, | |
| "mean_token_accuracy": 0.772954136133194, | |
| "num_tokens": 4970083.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.552794486284256, | |
| "epoch": 1.142056074766355, | |
| "grad_norm": 0.02732912451028824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542539358139038, | |
| "mean_token_accuracy": 0.7786157876253128, | |
| "num_tokens": 4986475.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.541429802775383, | |
| "epoch": 1.1457943925233645, | |
| "grad_norm": 0.026043161749839783, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54054194688797, | |
| "mean_token_accuracy": 0.779283881187439, | |
| "num_tokens": 5002946.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5385288000106812, | |
| "epoch": 1.1495327102803738, | |
| "grad_norm": 0.029000889509916306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392960906028748, | |
| "mean_token_accuracy": 0.7790030539035797, | |
| "num_tokens": 5019257.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5650081187486649, | |
| "epoch": 1.1532710280373832, | |
| "grad_norm": 0.030966322869062424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5671533942222595, | |
| "mean_token_accuracy": 0.7687903195619583, | |
| "num_tokens": 5035694.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5269978791475296, | |
| "epoch": 1.1570093457943926, | |
| "grad_norm": 0.029498660936951637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207559466362, | |
| "mean_token_accuracy": 0.789651021361351, | |
| "num_tokens": 5051896.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.536905974149704, | |
| "epoch": 1.1607476635514018, | |
| "grad_norm": 0.030239341780543327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469245910644531, | |
| "mean_token_accuracy": 0.7770659476518631, | |
| "num_tokens": 5068088.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5390781760215759, | |
| "epoch": 1.1644859813084112, | |
| "grad_norm": 0.03393058478832245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542595386505127, | |
| "mean_token_accuracy": 0.7818379998207092, | |
| "num_tokens": 5084518.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.5539942681789398, | |
| "epoch": 1.1682242990654206, | |
| "grad_norm": 0.02896442450582981, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544940233230591, | |
| "mean_token_accuracy": 0.773167759180069, | |
| "num_tokens": 5101049.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5508127510547638, | |
| "epoch": 1.17196261682243, | |
| "grad_norm": 0.0290669035166502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456743240356445, | |
| "mean_token_accuracy": 0.7797731012105942, | |
| "num_tokens": 5117401.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5471421480178833, | |
| "epoch": 1.1757009345794391, | |
| "grad_norm": 0.03175804764032364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547149658203125, | |
| "mean_token_accuracy": 0.7758717685937881, | |
| "num_tokens": 5133730.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5345856845378876, | |
| "epoch": 1.1794392523364485, | |
| "grad_norm": 0.030823305249214172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330408215522766, | |
| "mean_token_accuracy": 0.784162163734436, | |
| "num_tokens": 5149933.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5622152835130692, | |
| "epoch": 1.183177570093458, | |
| "grad_norm": 0.035467732697725296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626823902130127, | |
| "mean_token_accuracy": 0.7694768160581589, | |
| "num_tokens": 5166513.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5603054612874985, | |
| "epoch": 1.1869158878504673, | |
| "grad_norm": 0.03127942234277725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562260091304779, | |
| "mean_token_accuracy": 0.7705819606781006, | |
| "num_tokens": 5182789.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5313067883253098, | |
| "epoch": 1.1906542056074767, | |
| "grad_norm": 0.031915076076984406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535006046295166, | |
| "mean_token_accuracy": 0.7801574766635895, | |
| "num_tokens": 5198808.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5626082420349121, | |
| "epoch": 1.194392523364486, | |
| "grad_norm": 0.0270744226872921, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5664738416671753, | |
| "mean_token_accuracy": 0.7685981392860413, | |
| "num_tokens": 5215173.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5448359251022339, | |
| "epoch": 1.1981308411214953, | |
| "grad_norm": 0.034068379551172256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446659922599792, | |
| "mean_token_accuracy": 0.7786541432142258, | |
| "num_tokens": 5231488.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5552321374416351, | |
| "epoch": 1.2018691588785047, | |
| "grad_norm": 0.027504440397024155, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556068420410156, | |
| "mean_token_accuracy": 0.7737858295440674, | |
| "num_tokens": 5248043.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5611619353294373, | |
| "epoch": 1.205607476635514, | |
| "grad_norm": 0.0314825214445591, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585416555404663, | |
| "mean_token_accuracy": 0.7727329283952713, | |
| "num_tokens": 5264537.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.539411261677742, | |
| "epoch": 1.2093457943925234, | |
| "grad_norm": 0.02891836315393448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542159378528595, | |
| "mean_token_accuracy": 0.7766279429197311, | |
| "num_tokens": 5280701.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5438771396875381, | |
| "epoch": 1.2130841121495326, | |
| "grad_norm": 0.030331527814269066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439496040344238, | |
| "mean_token_accuracy": 0.7776656746864319, | |
| "num_tokens": 5297144.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5600438266992569, | |
| "epoch": 1.216822429906542, | |
| "grad_norm": 0.031427256762981415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602800846099854, | |
| "mean_token_accuracy": 0.7731630206108093, | |
| "num_tokens": 5313519.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5613888651132584, | |
| "epoch": 1.2205607476635514, | |
| "grad_norm": 0.02703862637281418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5599865317344666, | |
| "mean_token_accuracy": 0.7733557522296906, | |
| "num_tokens": 5329856.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5237439274787903, | |
| "epoch": 1.2242990654205608, | |
| "grad_norm": 0.02758556418120861, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267841815948486, | |
| "mean_token_accuracy": 0.7867935001850128, | |
| "num_tokens": 5346177.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5669067651033401, | |
| "epoch": 1.2280373831775702, | |
| "grad_norm": 0.028242675587534904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650265216827393, | |
| "mean_token_accuracy": 0.7703205198049545, | |
| "num_tokens": 5362512.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5509548783302307, | |
| "epoch": 1.2317757009345796, | |
| "grad_norm": 0.028802327811717987, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518352389335632, | |
| "mean_token_accuracy": 0.7750025242567062, | |
| "num_tokens": 5379024.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5300867408514023, | |
| "epoch": 1.2355140186915887, | |
| "grad_norm": 0.028508059680461884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312294363975525, | |
| "mean_token_accuracy": 0.7825600951910019, | |
| "num_tokens": 5395474.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.5559873282909393, | |
| "epoch": 1.2392523364485981, | |
| "grad_norm": 0.029974235221743584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561782717704773, | |
| "mean_token_accuracy": 0.7731552422046661, | |
| "num_tokens": 5411674.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.557199090719223, | |
| "epoch": 1.2429906542056075, | |
| "grad_norm": 0.03494254872202873, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579161643981934, | |
| "mean_token_accuracy": 0.7746251970529556, | |
| "num_tokens": 5428042.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5486237108707428, | |
| "epoch": 1.246728971962617, | |
| "grad_norm": 0.03307056799530983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547027587890625, | |
| "mean_token_accuracy": 0.7762673646211624, | |
| "num_tokens": 5444468.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.5655098557472229, | |
| "epoch": 1.250467289719626, | |
| "grad_norm": 0.030658213421702385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607244372367859, | |
| "mean_token_accuracy": 0.7719737142324448, | |
| "num_tokens": 5460943.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5550193935632706, | |
| "epoch": 1.2542056074766355, | |
| "grad_norm": 0.03245887532830238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558559775352478, | |
| "mean_token_accuracy": 0.7714462429285049, | |
| "num_tokens": 5477095.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.5516159981489182, | |
| "epoch": 1.2579439252336448, | |
| "grad_norm": 0.029303548857569695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509077310562134, | |
| "mean_token_accuracy": 0.7748865634202957, | |
| "num_tokens": 5493314.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5517037510871887, | |
| "epoch": 1.2616822429906542, | |
| "grad_norm": 0.030339522287249565, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531480312347412, | |
| "mean_token_accuracy": 0.7767991721630096, | |
| "num_tokens": 5509491.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5280565023422241, | |
| "epoch": 1.2654205607476636, | |
| "grad_norm": 0.031923625618219376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.528035581111908, | |
| "mean_token_accuracy": 0.7852191030979156, | |
| "num_tokens": 5525691.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5340898633003235, | |
| "epoch": 1.269158878504673, | |
| "grad_norm": 0.029536927118897438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422028303146362, | |
| "mean_token_accuracy": 0.7782081514596939, | |
| "num_tokens": 5541867.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5269799679517746, | |
| "epoch": 1.2728971962616822, | |
| "grad_norm": 0.028842000290751457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262301564216614, | |
| "mean_token_accuracy": 0.7851875424385071, | |
| "num_tokens": 5558001.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5422883927822113, | |
| "epoch": 1.2766355140186916, | |
| "grad_norm": 0.03446980193257332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427042245864868, | |
| "mean_token_accuracy": 0.7805773615837097, | |
| "num_tokens": 5574327.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5518148094415665, | |
| "epoch": 1.280373831775701, | |
| "grad_norm": 0.027705170214176178, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506993532180786, | |
| "mean_token_accuracy": 0.7755730003118515, | |
| "num_tokens": 5590749.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5408089458942413, | |
| "epoch": 1.2841121495327104, | |
| "grad_norm": 0.029695594683289528, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394558906555176, | |
| "mean_token_accuracy": 0.7792032957077026, | |
| "num_tokens": 5606965.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.555278405547142, | |
| "epoch": 1.2878504672897195, | |
| "grad_norm": 0.03306727111339569, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528630018234253, | |
| "mean_token_accuracy": 0.7753221690654755, | |
| "num_tokens": 5623293.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5409073531627655, | |
| "epoch": 1.291588785046729, | |
| "grad_norm": 0.029820574447512627, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416831970214844, | |
| "mean_token_accuracy": 0.7789396792650223, | |
| "num_tokens": 5639449.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5428119450807571, | |
| "epoch": 1.2953271028037383, | |
| "grad_norm": 0.02653786540031433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379306077957153, | |
| "mean_token_accuracy": 0.7808004468679428, | |
| "num_tokens": 5655647.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5534338802099228, | |
| "epoch": 1.2990654205607477, | |
| "grad_norm": 0.036522869020700455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622379779815674, | |
| "mean_token_accuracy": 0.7683994024991989, | |
| "num_tokens": 5672013.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5302807092666626, | |
| "epoch": 1.302803738317757, | |
| "grad_norm": 0.029457183554768562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294267535209656, | |
| "mean_token_accuracy": 0.7827122360467911, | |
| "num_tokens": 5688450.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5444758385419846, | |
| "epoch": 1.3065420560747665, | |
| "grad_norm": 0.029874974861741066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353363752365112, | |
| "mean_token_accuracy": 0.7824759036302567, | |
| "num_tokens": 5705038.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5528301745653152, | |
| "epoch": 1.3102803738317756, | |
| "grad_norm": 0.029413780197501183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467464923858643, | |
| "mean_token_accuracy": 0.7778250128030777, | |
| "num_tokens": 5721143.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5555091798305511, | |
| "epoch": 1.314018691588785, | |
| "grad_norm": 0.03153051435947418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567013025283813, | |
| "mean_token_accuracy": 0.7745524048805237, | |
| "num_tokens": 5737899.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5499187856912613, | |
| "epoch": 1.3177570093457944, | |
| "grad_norm": 0.03486097231507301, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597171783447266, | |
| "mean_token_accuracy": 0.7737800478935242, | |
| "num_tokens": 5754281.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5655581057071686, | |
| "epoch": 1.3214953271028038, | |
| "grad_norm": 0.034320469945669174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5727288126945496, | |
| "mean_token_accuracy": 0.7656765133142471, | |
| "num_tokens": 5770770.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5538551807403564, | |
| "epoch": 1.325233644859813, | |
| "grad_norm": 0.03038712590932846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568647384643555, | |
| "mean_token_accuracy": 0.7737635225057602, | |
| "num_tokens": 5787055.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5601113438606262, | |
| "epoch": 1.3289719626168224, | |
| "grad_norm": 0.02863963134586811, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530621409416199, | |
| "mean_token_accuracy": 0.7755090743303299, | |
| "num_tokens": 5803445.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5483526140451431, | |
| "epoch": 1.3327102803738318, | |
| "grad_norm": 0.03086850978434086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400408506393433, | |
| "mean_token_accuracy": 0.7810002267360687, | |
| "num_tokens": 5819715.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.5624817609786987, | |
| "epoch": 1.3364485981308412, | |
| "grad_norm": 0.027300981804728508, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5635508894920349, | |
| "mean_token_accuracy": 0.768461674451828, | |
| "num_tokens": 5835943.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5395894348621368, | |
| "epoch": 1.3401869158878505, | |
| "grad_norm": 0.030900444835424423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544026255607605, | |
| "mean_token_accuracy": 0.7806333154439926, | |
| "num_tokens": 5852434.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.5406174808740616, | |
| "epoch": 1.34392523364486, | |
| "grad_norm": 0.030813222751021385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545943021774292, | |
| "mean_token_accuracy": 0.7791963070631027, | |
| "num_tokens": 5868855.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5282687693834305, | |
| "epoch": 1.347663551401869, | |
| "grad_norm": 0.03219500184059143, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5280976891517639, | |
| "mean_token_accuracy": 0.7882633060216904, | |
| "num_tokens": 5885162.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5588660687208176, | |
| "epoch": 1.3514018691588785, | |
| "grad_norm": 0.030664408579468727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600679516792297, | |
| "mean_token_accuracy": 0.7683242410421371, | |
| "num_tokens": 5901397.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5558361262083054, | |
| "epoch": 1.355140186915888, | |
| "grad_norm": 0.029887903481721878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512230396270752, | |
| "mean_token_accuracy": 0.7751856446266174, | |
| "num_tokens": 5917688.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5585273951292038, | |
| "epoch": 1.358878504672897, | |
| "grad_norm": 0.030291857197880745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574408173561096, | |
| "mean_token_accuracy": 0.7735242694616318, | |
| "num_tokens": 5934252.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5426641255617142, | |
| "epoch": 1.3626168224299064, | |
| "grad_norm": 0.03163778409361839, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456237196922302, | |
| "mean_token_accuracy": 0.77604641020298, | |
| "num_tokens": 5950736.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5607275068759918, | |
| "epoch": 1.3663551401869158, | |
| "grad_norm": 0.02867417223751545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595529079437256, | |
| "mean_token_accuracy": 0.773354560136795, | |
| "num_tokens": 5967130.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.554174154996872, | |
| "epoch": 1.3700934579439252, | |
| "grad_norm": 0.03474622219800949, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513558387756348, | |
| "mean_token_accuracy": 0.7774477899074554, | |
| "num_tokens": 5983303.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5479168146848679, | |
| "epoch": 1.3738317757009346, | |
| "grad_norm": 0.03147226572036743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468041300773621, | |
| "mean_token_accuracy": 0.7777006030082703, | |
| "num_tokens": 5999776.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5567852258682251, | |
| "epoch": 1.377570093457944, | |
| "grad_norm": 0.03519264608621597, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5599963068962097, | |
| "mean_token_accuracy": 0.7709233462810516, | |
| "num_tokens": 6015938.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5587522089481354, | |
| "epoch": 1.3813084112149534, | |
| "grad_norm": 0.03433060646057129, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571247339248657, | |
| "mean_token_accuracy": 0.7718200087547302, | |
| "num_tokens": 6032196.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5337067395448685, | |
| "epoch": 1.3850467289719626, | |
| "grad_norm": 0.030834900215268135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330364108085632, | |
| "mean_token_accuracy": 0.7854774743318558, | |
| "num_tokens": 6048415.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5485008955001831, | |
| "epoch": 1.388785046728972, | |
| "grad_norm": 0.038097940385341644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500508546829224, | |
| "mean_token_accuracy": 0.775309219956398, | |
| "num_tokens": 6064562.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5520146042108536, | |
| "epoch": 1.3925233644859814, | |
| "grad_norm": 0.02676542103290558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546633243560791, | |
| "mean_token_accuracy": 0.7763903141021729, | |
| "num_tokens": 6080869.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5430674999952316, | |
| "epoch": 1.3962616822429905, | |
| "grad_norm": 0.0291767455637455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384376049041748, | |
| "mean_token_accuracy": 0.7846493870019913, | |
| "num_tokens": 6096995.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.543053463101387, | |
| "epoch": 1.4, | |
| "grad_norm": 0.031880684196949005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416824817657471, | |
| "mean_token_accuracy": 0.7807471454143524, | |
| "num_tokens": 6113154.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.555852085351944, | |
| "epoch": 1.4037383177570093, | |
| "grad_norm": 0.03215760365128517, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5583543181419373, | |
| "mean_token_accuracy": 0.7724814862012863, | |
| "num_tokens": 6129602.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5323648005723953, | |
| "epoch": 1.4074766355140187, | |
| "grad_norm": 0.03375270590186119, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405369400978088, | |
| "mean_token_accuracy": 0.7804393470287323, | |
| "num_tokens": 6145766.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5550488829612732, | |
| "epoch": 1.411214953271028, | |
| "grad_norm": 0.029217012226581573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554684579372406, | |
| "mean_token_accuracy": 0.7745330631732941, | |
| "num_tokens": 6162201.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5482346266508102, | |
| "epoch": 1.4149532710280375, | |
| "grad_norm": 0.03129247948527336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419821739196777, | |
| "mean_token_accuracy": 0.7780721634626389, | |
| "num_tokens": 6178420.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5605264604091644, | |
| "epoch": 1.4186915887850469, | |
| "grad_norm": 0.028088558465242386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536739230155945, | |
| "mean_token_accuracy": 0.7760752588510513, | |
| "num_tokens": 6195017.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5308103561401367, | |
| "epoch": 1.422429906542056, | |
| "grad_norm": 0.03174047917127609, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348400473594666, | |
| "mean_token_accuracy": 0.7830243110656738, | |
| "num_tokens": 6211269.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5362233817577362, | |
| "epoch": 1.4261682242990654, | |
| "grad_norm": 0.03284025564789772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401143431663513, | |
| "mean_token_accuracy": 0.7799562960863113, | |
| "num_tokens": 6227503.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5288970768451691, | |
| "epoch": 1.4299065420560748, | |
| "grad_norm": 0.03117184154689312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347498655319214, | |
| "mean_token_accuracy": 0.7850797027349472, | |
| "num_tokens": 6243667.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5478838980197906, | |
| "epoch": 1.433644859813084, | |
| "grad_norm": 0.0355689711868763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515888333320618, | |
| "mean_token_accuracy": 0.7750401347875595, | |
| "num_tokens": 6259958.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5556496828794479, | |
| "epoch": 1.4373831775700934, | |
| "grad_norm": 0.03252286836504936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527741312980652, | |
| "mean_token_accuracy": 0.7747504711151123, | |
| "num_tokens": 6276256.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.536173865199089, | |
| "epoch": 1.4411214953271028, | |
| "grad_norm": 0.03125045448541641, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389170050621033, | |
| "mean_token_accuracy": 0.7826138287782669, | |
| "num_tokens": 6292477.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5414228439331055, | |
| "epoch": 1.4448598130841122, | |
| "grad_norm": 0.029693089425563812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456768870353699, | |
| "mean_token_accuracy": 0.7780184000730515, | |
| "num_tokens": 6308848.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5460960417985916, | |
| "epoch": 1.4485981308411215, | |
| "grad_norm": 0.028725288808345795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453904867172241, | |
| "mean_token_accuracy": 0.7754503637552261, | |
| "num_tokens": 6325175.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.5478474348783493, | |
| "epoch": 1.452336448598131, | |
| "grad_norm": 0.03158194199204445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430905818939209, | |
| "mean_token_accuracy": 0.7789453864097595, | |
| "num_tokens": 6341307.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.5458368062973022, | |
| "epoch": 1.45607476635514, | |
| "grad_norm": 0.02816491760313511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543704092502594, | |
| "mean_token_accuracy": 0.7792259007692337, | |
| "num_tokens": 6357858.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5392302572727203, | |
| "epoch": 1.4598130841121495, | |
| "grad_norm": 0.04157215729355812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544989287853241, | |
| "mean_token_accuracy": 0.7776051461696625, | |
| "num_tokens": 6373868.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5487792640924454, | |
| "epoch": 1.4635514018691589, | |
| "grad_norm": 0.03120332583785057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500867962837219, | |
| "mean_token_accuracy": 0.7786511480808258, | |
| "num_tokens": 6390370.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5473900437355042, | |
| "epoch": 1.4672897196261683, | |
| "grad_norm": 0.03685331344604492, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516798496246338, | |
| "mean_token_accuracy": 0.7734636813402176, | |
| "num_tokens": 6406810.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5339369177818298, | |
| "epoch": 1.4710280373831774, | |
| "grad_norm": 0.031062059104442596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277940034866333, | |
| "mean_token_accuracy": 0.7844891250133514, | |
| "num_tokens": 6423321.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5646286159753799, | |
| "epoch": 1.4747663551401868, | |
| "grad_norm": 0.03419705480337143, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560526967048645, | |
| "mean_token_accuracy": 0.7742912471294403, | |
| "num_tokens": 6439751.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5566267520189285, | |
| "epoch": 1.4785046728971962, | |
| "grad_norm": 0.030112918466329575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551886796951294, | |
| "mean_token_accuracy": 0.7758849114179611, | |
| "num_tokens": 6456064.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5496308952569962, | |
| "epoch": 1.4822429906542056, | |
| "grad_norm": 0.029358550906181335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503244400024414, | |
| "mean_token_accuracy": 0.779025211930275, | |
| "num_tokens": 6472168.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5490056574344635, | |
| "epoch": 1.485981308411215, | |
| "grad_norm": 0.03679414093494415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532426834106445, | |
| "mean_token_accuracy": 0.77412910759449, | |
| "num_tokens": 6488701.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5552525818347931, | |
| "epoch": 1.4897196261682244, | |
| "grad_norm": 0.03460443392395973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5580930709838867, | |
| "mean_token_accuracy": 0.7725805938243866, | |
| "num_tokens": 6504913.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.5486905574798584, | |
| "epoch": 1.4934579439252336, | |
| "grad_norm": 0.03757799416780472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467075705528259, | |
| "mean_token_accuracy": 0.7737327963113785, | |
| "num_tokens": 6521159.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5667891502380371, | |
| "epoch": 1.497196261682243, | |
| "grad_norm": 0.0321633443236351, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584529042243958, | |
| "mean_token_accuracy": 0.7716430127620697, | |
| "num_tokens": 6537343.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.560171589255333, | |
| "epoch": 1.5009345794392523, | |
| "grad_norm": 0.027958108112215996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571039319038391, | |
| "mean_token_accuracy": 0.7695076316595078, | |
| "num_tokens": 6553654.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5325733348727226, | |
| "epoch": 1.5046728971962615, | |
| "grad_norm": 0.03109286166727543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371490716934204, | |
| "mean_token_accuracy": 0.7818229347467422, | |
| "num_tokens": 6569830.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5464021414518356, | |
| "epoch": 1.508411214953271, | |
| "grad_norm": 0.033921979367733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520694255828857, | |
| "mean_token_accuracy": 0.7737181484699249, | |
| "num_tokens": 6586181.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5360658913850784, | |
| "epoch": 1.5121495327102803, | |
| "grad_norm": 0.03216444328427315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539574921131134, | |
| "mean_token_accuracy": 0.7791631668806076, | |
| "num_tokens": 6602220.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5452992171049118, | |
| "epoch": 1.5158878504672897, | |
| "grad_norm": 0.02836962789297104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482081174850464, | |
| "mean_token_accuracy": 0.7770387381315231, | |
| "num_tokens": 6618603.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5549522340297699, | |
| "epoch": 1.519626168224299, | |
| "grad_norm": 0.029138341546058655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456300973892212, | |
| "mean_token_accuracy": 0.7779618352651596, | |
| "num_tokens": 6634957.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5506550967693329, | |
| "epoch": 1.5233644859813085, | |
| "grad_norm": 0.02889757789671421, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417683720588684, | |
| "mean_token_accuracy": 0.7772906571626663, | |
| "num_tokens": 6651192.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5641747862100601, | |
| "epoch": 1.5271028037383179, | |
| "grad_norm": 0.029291054233908653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575106143951416, | |
| "mean_token_accuracy": 0.7736930400133133, | |
| "num_tokens": 6667351.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5569720417261124, | |
| "epoch": 1.5308411214953273, | |
| "grad_norm": 0.031217265874147415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568684339523315, | |
| "mean_token_accuracy": 0.7742536216974258, | |
| "num_tokens": 6683766.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5555198639631271, | |
| "epoch": 1.5345794392523364, | |
| "grad_norm": 0.041470784693956375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5674223303794861, | |
| "mean_token_accuracy": 0.7700306624174118, | |
| "num_tokens": 6700296.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5609412640333176, | |
| "epoch": 1.5383177570093458, | |
| "grad_norm": 0.03198862448334694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651755332946777, | |
| "mean_token_accuracy": 0.7717378437519073, | |
| "num_tokens": 6716475.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5559493005275726, | |
| "epoch": 1.542056074766355, | |
| "grad_norm": 0.029610617086291313, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465991497039795, | |
| "mean_token_accuracy": 0.7768793702125549, | |
| "num_tokens": 6732579.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5383591949939728, | |
| "epoch": 1.5457943925233644, | |
| "grad_norm": 0.03238457813858986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351200699806213, | |
| "mean_token_accuracy": 0.7838361263275146, | |
| "num_tokens": 6748613.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5723170787096024, | |
| "epoch": 1.5495327102803738, | |
| "grad_norm": 0.03184224292635918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706000328063965, | |
| "mean_token_accuracy": 0.7656203061342239, | |
| "num_tokens": 6764799.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5449900329113007, | |
| "epoch": 1.5532710280373832, | |
| "grad_norm": 0.03413036838173866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444662570953369, | |
| "mean_token_accuracy": 0.7746504992246628, | |
| "num_tokens": 6781040.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5653754621744156, | |
| "epoch": 1.5570093457943925, | |
| "grad_norm": 0.03557061403989792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5661092400550842, | |
| "mean_token_accuracy": 0.7700045108795166, | |
| "num_tokens": 6797618.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5285668075084686, | |
| "epoch": 1.560747663551402, | |
| "grad_norm": 0.02898026816546917, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310862064361572, | |
| "mean_token_accuracy": 0.7867710143327713, | |
| "num_tokens": 6813889.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5591782182455063, | |
| "epoch": 1.5644859813084113, | |
| "grad_norm": 0.03489390015602112, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559260368347168, | |
| "mean_token_accuracy": 0.7742950618267059, | |
| "num_tokens": 6830511.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5233039408922195, | |
| "epoch": 1.5682242990654207, | |
| "grad_norm": 0.031120121479034424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304787158966064, | |
| "mean_token_accuracy": 0.7851588577032089, | |
| "num_tokens": 6846831.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5615075826644897, | |
| "epoch": 1.5719626168224299, | |
| "grad_norm": 0.032532718032598495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557915985584259, | |
| "mean_token_accuracy": 0.7756024897098541, | |
| "num_tokens": 6863482.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5608477592468262, | |
| "epoch": 1.5757009345794393, | |
| "grad_norm": 0.03193405270576477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5570778250694275, | |
| "mean_token_accuracy": 0.7736349552869797, | |
| "num_tokens": 6879744.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5420049726963043, | |
| "epoch": 1.5794392523364484, | |
| "grad_norm": 0.03341756388545036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422099828720093, | |
| "mean_token_accuracy": 0.7786398679018021, | |
| "num_tokens": 6895998.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5501766800880432, | |
| "epoch": 1.5831775700934578, | |
| "grad_norm": 0.03080238774418831, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543519139289856, | |
| "mean_token_accuracy": 0.779445543885231, | |
| "num_tokens": 6912350.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5548175424337387, | |
| "epoch": 1.5869158878504672, | |
| "grad_norm": 0.029699817299842834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554355263710022, | |
| "mean_token_accuracy": 0.7715099602937698, | |
| "num_tokens": 6928868.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5445838496088982, | |
| "epoch": 1.5906542056074766, | |
| "grad_norm": 0.03310444578528404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509841442108154, | |
| "mean_token_accuracy": 0.7749770432710648, | |
| "num_tokens": 6945115.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.5508389323949814, | |
| "epoch": 1.594392523364486, | |
| "grad_norm": 0.03343511372804642, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527422428131104, | |
| "mean_token_accuracy": 0.7760582268238068, | |
| "num_tokens": 6961606.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5455803871154785, | |
| "epoch": 1.5981308411214954, | |
| "grad_norm": 0.030003823339939117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433002710342407, | |
| "mean_token_accuracy": 0.7772544771432877, | |
| "num_tokens": 6977721.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.542354941368103, | |
| "epoch": 1.6018691588785048, | |
| "grad_norm": 0.02921188622713089, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396295785903931, | |
| "mean_token_accuracy": 0.7784738689661026, | |
| "num_tokens": 6994015.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5403562635183334, | |
| "epoch": 1.6056074766355142, | |
| "grad_norm": 0.03267091140151024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412419438362122, | |
| "mean_token_accuracy": 0.7828981131315231, | |
| "num_tokens": 7010256.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5418384820222855, | |
| "epoch": 1.6093457943925233, | |
| "grad_norm": 0.03328794986009598, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415868163108826, | |
| "mean_token_accuracy": 0.7787100970745087, | |
| "num_tokens": 7026538.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5569044798612595, | |
| "epoch": 1.6130841121495327, | |
| "grad_norm": 0.03399523347616196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610039830207825, | |
| "mean_token_accuracy": 0.7681904435157776, | |
| "num_tokens": 7042821.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5516158491373062, | |
| "epoch": 1.616822429906542, | |
| "grad_norm": 0.041675642132759094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512884855270386, | |
| "mean_token_accuracy": 0.7792385816574097, | |
| "num_tokens": 7059278.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5493542701005936, | |
| "epoch": 1.6205607476635513, | |
| "grad_norm": 0.029840141534805298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508259534835815, | |
| "mean_token_accuracy": 0.7764638513326645, | |
| "num_tokens": 7075675.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5415777564048767, | |
| "epoch": 1.6242990654205607, | |
| "grad_norm": 0.04138097167015076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540780246257782, | |
| "mean_token_accuracy": 0.7806251496076584, | |
| "num_tokens": 7091803.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5550828725099564, | |
| "epoch": 1.62803738317757, | |
| "grad_norm": 0.03500202298164368, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536463856697083, | |
| "mean_token_accuracy": 0.7767235636711121, | |
| "num_tokens": 7108257.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5612530559301376, | |
| "epoch": 1.6317757009345795, | |
| "grad_norm": 0.029145153239369392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608190894126892, | |
| "mean_token_accuracy": 0.7731182426214218, | |
| "num_tokens": 7124785.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5527195036411285, | |
| "epoch": 1.6355140186915889, | |
| "grad_norm": 0.035749297589063644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629845857620239, | |
| "mean_token_accuracy": 0.7721443176269531, | |
| "num_tokens": 7141265.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5614519417285919, | |
| "epoch": 1.6392523364485982, | |
| "grad_norm": 0.033001191914081573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560024976730347, | |
| "mean_token_accuracy": 0.7749044448137283, | |
| "num_tokens": 7157859.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5537575930356979, | |
| "epoch": 1.6429906542056076, | |
| "grad_norm": 0.026474064216017723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511392951011658, | |
| "mean_token_accuracy": 0.7752827405929565, | |
| "num_tokens": 7174159.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5490387231111526, | |
| "epoch": 1.6467289719626168, | |
| "grad_norm": 0.03137727826833725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470349192619324, | |
| "mean_token_accuracy": 0.7756170034408569, | |
| "num_tokens": 7190518.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5602337867021561, | |
| "epoch": 1.6504672897196262, | |
| "grad_norm": 0.0327768549323082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596269369125366, | |
| "mean_token_accuracy": 0.7712970525026321, | |
| "num_tokens": 7206832.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5407531261444092, | |
| "epoch": 1.6542056074766354, | |
| "grad_norm": 0.0337577648460865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448312759399414, | |
| "mean_token_accuracy": 0.7795456647872925, | |
| "num_tokens": 7222967.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5409540086984634, | |
| "epoch": 1.6579439252336448, | |
| "grad_norm": 0.03192588686943054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484352111816406, | |
| "mean_token_accuracy": 0.7764406651258469, | |
| "num_tokens": 7239342.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5369711667299271, | |
| "epoch": 1.6616822429906541, | |
| "grad_norm": 0.029282715171575546, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391625165939331, | |
| "mean_token_accuracy": 0.7777595669031143, | |
| "num_tokens": 7255685.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5320119112730026, | |
| "epoch": 1.6654205607476635, | |
| "grad_norm": 0.03132037818431854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324081182479858, | |
| "mean_token_accuracy": 0.7831796556711197, | |
| "num_tokens": 7271873.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5473773181438446, | |
| "epoch": 1.669158878504673, | |
| "grad_norm": 0.029359478503465652, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430581569671631, | |
| "mean_token_accuracy": 0.780887171626091, | |
| "num_tokens": 7288229.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5577313005924225, | |
| "epoch": 1.6728971962616823, | |
| "grad_norm": 0.0312592051923275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549578070640564, | |
| "mean_token_accuracy": 0.7755182534456253, | |
| "num_tokens": 7304562.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5430529564619064, | |
| "epoch": 1.6766355140186917, | |
| "grad_norm": 0.036848753690719604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486578941345215, | |
| "mean_token_accuracy": 0.7793130427598953, | |
| "num_tokens": 7320789.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5367421358823776, | |
| "epoch": 1.680373831775701, | |
| "grad_norm": 0.03133554011583328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428006649017334, | |
| "mean_token_accuracy": 0.7791069746017456, | |
| "num_tokens": 7336720.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5608862638473511, | |
| "epoch": 1.6841121495327103, | |
| "grad_norm": 0.033135656267404556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513461828231812, | |
| "mean_token_accuracy": 0.7747347801923752, | |
| "num_tokens": 7353115.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5476694256067276, | |
| "epoch": 1.6878504672897197, | |
| "grad_norm": 0.02974470518529415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473049879074097, | |
| "mean_token_accuracy": 0.7776686698198318, | |
| "num_tokens": 7369302.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5416230708360672, | |
| "epoch": 1.6915887850467288, | |
| "grad_norm": 0.0338185578584671, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420779585838318, | |
| "mean_token_accuracy": 0.7770841121673584, | |
| "num_tokens": 7385486.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5354430079460144, | |
| "epoch": 1.6953271028037382, | |
| "grad_norm": 0.04928300157189369, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383298397064209, | |
| "mean_token_accuracy": 0.7825010567903519, | |
| "num_tokens": 7401834.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5533457249403, | |
| "epoch": 1.6990654205607476, | |
| "grad_norm": 0.03868211433291435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589519739151001, | |
| "mean_token_accuracy": 0.7741620242595673, | |
| "num_tokens": 7418328.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5337075442075729, | |
| "epoch": 1.702803738317757, | |
| "grad_norm": 0.03012922592461109, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302947163581848, | |
| "mean_token_accuracy": 0.7835781127214432, | |
| "num_tokens": 7434426.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5648263692855835, | |
| "epoch": 1.7065420560747664, | |
| "grad_norm": 0.028873439878225327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585320591926575, | |
| "mean_token_accuracy": 0.7732219845056534, | |
| "num_tokens": 7451036.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5839773565530777, | |
| "epoch": 1.7102803738317758, | |
| "grad_norm": 0.033153235912323, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5761073231697083, | |
| "mean_token_accuracy": 0.7669852823019028, | |
| "num_tokens": 7467359.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5488205403089523, | |
| "epoch": 1.7140186915887852, | |
| "grad_norm": 0.032065052539110184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483813285827637, | |
| "mean_token_accuracy": 0.7763916105031967, | |
| "num_tokens": 7483649.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5411174297332764, | |
| "epoch": 1.7177570093457943, | |
| "grad_norm": 0.0323743000626564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461615920066833, | |
| "mean_token_accuracy": 0.7778149843215942, | |
| "num_tokens": 7500070.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.533783033490181, | |
| "epoch": 1.7214953271028037, | |
| "grad_norm": 0.03367235139012337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427653193473816, | |
| "mean_token_accuracy": 0.7805494964122772, | |
| "num_tokens": 7516529.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5454732924699783, | |
| "epoch": 1.7252336448598131, | |
| "grad_norm": 0.034071460366249084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546566247940063, | |
| "mean_token_accuracy": 0.7736624777317047, | |
| "num_tokens": 7533025.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5454118698835373, | |
| "epoch": 1.7289719626168223, | |
| "grad_norm": 0.03127819299697876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452259182929993, | |
| "mean_token_accuracy": 0.7759493589401245, | |
| "num_tokens": 7549482.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5667081475257874, | |
| "epoch": 1.7327102803738317, | |
| "grad_norm": 0.0311261173337698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610095858573914, | |
| "mean_token_accuracy": 0.772314265370369, | |
| "num_tokens": 7565748.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5310934036970139, | |
| "epoch": 1.736448598130841, | |
| "grad_norm": 0.03265678882598877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5214373469352722, | |
| "mean_token_accuracy": 0.7887950539588928, | |
| "num_tokens": 7582052.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5556392967700958, | |
| "epoch": 1.7401869158878505, | |
| "grad_norm": 0.03034058026969433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505704283714294, | |
| "mean_token_accuracy": 0.7774366736412048, | |
| "num_tokens": 7598174.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5393192917108536, | |
| "epoch": 1.7439252336448599, | |
| "grad_norm": 0.0359746590256691, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477877259254456, | |
| "mean_token_accuracy": 0.7797855734825134, | |
| "num_tokens": 7614503.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.551783487200737, | |
| "epoch": 1.7476635514018692, | |
| "grad_norm": 0.03548724204301834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540840029716492, | |
| "mean_token_accuracy": 0.7747608870267868, | |
| "num_tokens": 7630814.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5413367450237274, | |
| "epoch": 1.7514018691588786, | |
| "grad_norm": 0.034123897552490234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470243692398071, | |
| "mean_token_accuracy": 0.779376894235611, | |
| "num_tokens": 7647376.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5412023663520813, | |
| "epoch": 1.7551401869158878, | |
| "grad_norm": 0.03561440855264664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472733378410339, | |
| "mean_token_accuracy": 0.7762201726436615, | |
| "num_tokens": 7663345.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.549220860004425, | |
| "epoch": 1.7588785046728972, | |
| "grad_norm": 0.02905275858938694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541520893573761, | |
| "mean_token_accuracy": 0.7792876809835434, | |
| "num_tokens": 7679585.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5333058834075928, | |
| "epoch": 1.7626168224299066, | |
| "grad_norm": 0.03320024162530899, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264161229133606, | |
| "mean_token_accuracy": 0.7870939522981644, | |
| "num_tokens": 7695719.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5468353033065796, | |
| "epoch": 1.7663551401869158, | |
| "grad_norm": 0.03256339579820633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458404421806335, | |
| "mean_token_accuracy": 0.778706505894661, | |
| "num_tokens": 7711803.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.536187469959259, | |
| "epoch": 1.7700934579439251, | |
| "grad_norm": 0.03339603543281555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392374992370605, | |
| "mean_token_accuracy": 0.7822528183460236, | |
| "num_tokens": 7728002.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5286234319210052, | |
| "epoch": 1.7738317757009345, | |
| "grad_norm": 0.033285900950431824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358365774154663, | |
| "mean_token_accuracy": 0.7836114317178726, | |
| "num_tokens": 7744366.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5403973311185837, | |
| "epoch": 1.777570093457944, | |
| "grad_norm": 0.028936821967363358, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398406386375427, | |
| "mean_token_accuracy": 0.7814478874206543, | |
| "num_tokens": 7760549.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5419041812419891, | |
| "epoch": 1.7813084112149533, | |
| "grad_norm": 0.03836261108517647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494267344474792, | |
| "mean_token_accuracy": 0.775143027305603, | |
| "num_tokens": 7776621.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5589816868305206, | |
| "epoch": 1.7850467289719627, | |
| "grad_norm": 0.03261716663837433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496556758880615, | |
| "mean_token_accuracy": 0.775287851691246, | |
| "num_tokens": 7792949.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5772902369499207, | |
| "epoch": 1.788785046728972, | |
| "grad_norm": 0.03729069605469704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5730117559432983, | |
| "mean_token_accuracy": 0.7676824629306793, | |
| "num_tokens": 7809233.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5505616068840027, | |
| "epoch": 1.7925233644859813, | |
| "grad_norm": 0.0271653700619936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481145977973938, | |
| "mean_token_accuracy": 0.7766467928886414, | |
| "num_tokens": 7825604.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5539548844099045, | |
| "epoch": 1.7962616822429907, | |
| "grad_norm": 0.035687919706106186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536059737205505, | |
| "mean_token_accuracy": 0.7723885625600815, | |
| "num_tokens": 7841764.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.548996701836586, | |
| "epoch": 1.8, | |
| "grad_norm": 0.03167950361967087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525107383728027, | |
| "mean_token_accuracy": 0.7743307799100876, | |
| "num_tokens": 7857918.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.5371337532997131, | |
| "epoch": 1.8037383177570092, | |
| "grad_norm": 0.03125729039311409, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431434512138367, | |
| "mean_token_accuracy": 0.7770611643791199, | |
| "num_tokens": 7874375.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5534856170415878, | |
| "epoch": 1.8074766355140186, | |
| "grad_norm": 0.03495310619473457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606104731559753, | |
| "mean_token_accuracy": 0.7701490819454193, | |
| "num_tokens": 7890503.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5570873767137527, | |
| "epoch": 1.811214953271028, | |
| "grad_norm": 0.031059635803103447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577523112297058, | |
| "mean_token_accuracy": 0.7766271531581879, | |
| "num_tokens": 7906740.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.549734815955162, | |
| "epoch": 1.8149532710280374, | |
| "grad_norm": 0.029658785089850426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459674000740051, | |
| "mean_token_accuracy": 0.778388187289238, | |
| "num_tokens": 7923366.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.556487500667572, | |
| "epoch": 1.8186915887850468, | |
| "grad_norm": 0.03030308522284031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487005710601807, | |
| "mean_token_accuracy": 0.7778837084770203, | |
| "num_tokens": 7939678.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5620574653148651, | |
| "epoch": 1.8224299065420562, | |
| "grad_norm": 0.03321143984794617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632344484329224, | |
| "mean_token_accuracy": 0.771716520190239, | |
| "num_tokens": 7955824.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.5325201749801636, | |
| "epoch": 1.8261682242990656, | |
| "grad_norm": 0.0296145249158144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337831377983093, | |
| "mean_token_accuracy": 0.7806598991155624, | |
| "num_tokens": 7971945.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5530183613300323, | |
| "epoch": 1.8299065420560747, | |
| "grad_norm": 0.04490596428513527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658998489379883, | |
| "mean_token_accuracy": 0.7682041078805923, | |
| "num_tokens": 7988395.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.540508821606636, | |
| "epoch": 1.8336448598130841, | |
| "grad_norm": 0.03253109008073807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402263402938843, | |
| "mean_token_accuracy": 0.7800282388925552, | |
| "num_tokens": 8004443.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5511161684989929, | |
| "epoch": 1.8373831775700935, | |
| "grad_norm": 0.030638035386800766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421851277351379, | |
| "mean_token_accuracy": 0.7774636000394821, | |
| "num_tokens": 8020850.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5710225850343704, | |
| "epoch": 1.8411214953271027, | |
| "grad_norm": 0.029152031987905502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603572130203247, | |
| "mean_token_accuracy": 0.7699873447418213, | |
| "num_tokens": 8037043.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5580283105373383, | |
| "epoch": 1.844859813084112, | |
| "grad_norm": 0.030489208176732063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527392625808716, | |
| "mean_token_accuracy": 0.7742099016904831, | |
| "num_tokens": 8053631.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5568618625402451, | |
| "epoch": 1.8485981308411215, | |
| "grad_norm": 0.03116370178759098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557203471660614, | |
| "mean_token_accuracy": 0.7757259756326675, | |
| "num_tokens": 8069679.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5572323054075241, | |
| "epoch": 1.8523364485981308, | |
| "grad_norm": 0.03199765831232071, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623334646224976, | |
| "mean_token_accuracy": 0.7726736217737198, | |
| "num_tokens": 8086185.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5608405023813248, | |
| "epoch": 1.8560747663551402, | |
| "grad_norm": 0.03123069368302822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5668354630470276, | |
| "mean_token_accuracy": 0.7697951197624207, | |
| "num_tokens": 8102680.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5482483208179474, | |
| "epoch": 1.8598130841121496, | |
| "grad_norm": 0.03388088196516037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544660091400146, | |
| "mean_token_accuracy": 0.7736243009567261, | |
| "num_tokens": 8119206.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.5743024945259094, | |
| "epoch": 1.863551401869159, | |
| "grad_norm": 0.027546290308237076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5691558718681335, | |
| "mean_token_accuracy": 0.7669505923986435, | |
| "num_tokens": 8135686.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5571306794881821, | |
| "epoch": 1.8672897196261682, | |
| "grad_norm": 0.03095332719385624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527883172035217, | |
| "mean_token_accuracy": 0.7751508802175522, | |
| "num_tokens": 8151938.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5444643199443817, | |
| "epoch": 1.8710280373831776, | |
| "grad_norm": 0.03176809847354889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450653433799744, | |
| "mean_token_accuracy": 0.7778386175632477, | |
| "num_tokens": 8168369.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5318097025156021, | |
| "epoch": 1.874766355140187, | |
| "grad_norm": 0.03216860815882683, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350679159164429, | |
| "mean_token_accuracy": 0.7839819490909576, | |
| "num_tokens": 8184441.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5431730151176453, | |
| "epoch": 1.8785046728971961, | |
| "grad_norm": 0.031609971076250076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454133152961731, | |
| "mean_token_accuracy": 0.7757967710494995, | |
| "num_tokens": 8200701.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5446748435497284, | |
| "epoch": 1.8822429906542055, | |
| "grad_norm": 0.03689466044306755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491172075271606, | |
| "mean_token_accuracy": 0.7771103084087372, | |
| "num_tokens": 8216896.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5379506647586823, | |
| "epoch": 1.885981308411215, | |
| "grad_norm": 0.03774857521057129, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465993881225586, | |
| "mean_token_accuracy": 0.7745991945266724, | |
| "num_tokens": 8233119.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5524174273014069, | |
| "epoch": 1.8897196261682243, | |
| "grad_norm": 0.03127999231219292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552331268787384, | |
| "mean_token_accuracy": 0.7734175026416779, | |
| "num_tokens": 8249424.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5634707659482956, | |
| "epoch": 1.8934579439252337, | |
| "grad_norm": 0.03172188624739647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552417039871216, | |
| "mean_token_accuracy": 0.7762156277894974, | |
| "num_tokens": 8265823.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5733916610479355, | |
| "epoch": 1.897196261682243, | |
| "grad_norm": 0.041391924023628235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5685185790061951, | |
| "mean_token_accuracy": 0.7656967639923096, | |
| "num_tokens": 8282150.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5633519440889359, | |
| "epoch": 1.9009345794392525, | |
| "grad_norm": 0.03210509195923805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575313568115234, | |
| "mean_token_accuracy": 0.7736276984214783, | |
| "num_tokens": 8298545.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.5282728672027588, | |
| "epoch": 1.9046728971962616, | |
| "grad_norm": 0.031000696122646332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271653532981873, | |
| "mean_token_accuracy": 0.7857028245925903, | |
| "num_tokens": 8314750.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5598197877407074, | |
| "epoch": 1.908411214953271, | |
| "grad_norm": 0.03814297169446945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556469559669495, | |
| "mean_token_accuracy": 0.7734071165323257, | |
| "num_tokens": 8331160.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5301484763622284, | |
| "epoch": 1.9121495327102802, | |
| "grad_norm": 0.03675490617752075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384268760681152, | |
| "mean_token_accuracy": 0.7815950363874435, | |
| "num_tokens": 8347524.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.556285485625267, | |
| "epoch": 1.9158878504672896, | |
| "grad_norm": 0.03204094246029854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582637190818787, | |
| "mean_token_accuracy": 0.7725251466035843, | |
| "num_tokens": 8363738.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5535630583763123, | |
| "epoch": 1.919626168224299, | |
| "grad_norm": 0.030629510059952736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578333735466003, | |
| "mean_token_accuracy": 0.7727056741714478, | |
| "num_tokens": 8380122.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5471296161413193, | |
| "epoch": 1.9233644859813084, | |
| "grad_norm": 0.03401264175772667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535186529159546, | |
| "mean_token_accuracy": 0.7754651010036469, | |
| "num_tokens": 8396440.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5500332862138748, | |
| "epoch": 1.9271028037383178, | |
| "grad_norm": 0.03108939900994301, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485121607780457, | |
| "mean_token_accuracy": 0.7769151926040649, | |
| "num_tokens": 8412740.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5605651885271072, | |
| "epoch": 1.9308411214953272, | |
| "grad_norm": 0.028515921905636787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516760349273682, | |
| "mean_token_accuracy": 0.7752381414175034, | |
| "num_tokens": 8429081.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.5527090132236481, | |
| "epoch": 1.9345794392523366, | |
| "grad_norm": 0.032440509647130966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482094883918762, | |
| "mean_token_accuracy": 0.776523694396019, | |
| "num_tokens": 8445459.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.5639519840478897, | |
| "epoch": 1.938317757009346, | |
| "grad_norm": 0.03387531265616417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.565314769744873, | |
| "mean_token_accuracy": 0.7686825692653656, | |
| "num_tokens": 8461834.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5390266180038452, | |
| "epoch": 1.9420560747663551, | |
| "grad_norm": 0.02882574312388897, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430452823638916, | |
| "mean_token_accuracy": 0.7774745523929596, | |
| "num_tokens": 8478272.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5343397557735443, | |
| "epoch": 1.9457943925233645, | |
| "grad_norm": 0.030860040336847305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347194075584412, | |
| "mean_token_accuracy": 0.7817697376012802, | |
| "num_tokens": 8494437.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5492627769708633, | |
| "epoch": 1.9495327102803737, | |
| "grad_norm": 0.03405896574258804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500932335853577, | |
| "mean_token_accuracy": 0.7765759974718094, | |
| "num_tokens": 8510975.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5563263446092606, | |
| "epoch": 1.953271028037383, | |
| "grad_norm": 0.03141237422823906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557966947555542, | |
| "mean_token_accuracy": 0.7717025876045227, | |
| "num_tokens": 8527347.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5636772364377975, | |
| "epoch": 1.9570093457943925, | |
| "grad_norm": 0.03168516606092453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611008405685425, | |
| "mean_token_accuracy": 0.7714557945728302, | |
| "num_tokens": 8543551.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5489466190338135, | |
| "epoch": 1.9607476635514018, | |
| "grad_norm": 0.03355073928833008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395604372024536, | |
| "mean_token_accuracy": 0.7807340919971466, | |
| "num_tokens": 8559955.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5399315655231476, | |
| "epoch": 1.9644859813084112, | |
| "grad_norm": 0.03453009948134422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348931550979614, | |
| "mean_token_accuracy": 0.7806299477815628, | |
| "num_tokens": 8576469.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5491375476121902, | |
| "epoch": 1.9682242990654206, | |
| "grad_norm": 0.0316200815141201, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556234121322632, | |
| "mean_token_accuracy": 0.773221030831337, | |
| "num_tokens": 8592906.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5373014956712723, | |
| "epoch": 1.97196261682243, | |
| "grad_norm": 0.032452452927827835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457467436790466, | |
| "mean_token_accuracy": 0.7758653908967972, | |
| "num_tokens": 8609100.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5414352118968964, | |
| "epoch": 1.9757009345794394, | |
| "grad_norm": 0.03351645544171333, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482410788536072, | |
| "mean_token_accuracy": 0.7752601951360703, | |
| "num_tokens": 8625316.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5407055169343948, | |
| "epoch": 1.9794392523364486, | |
| "grad_norm": 0.03003384917974472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356785655021667, | |
| "mean_token_accuracy": 0.7822994440793991, | |
| "num_tokens": 8641716.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5463829636573792, | |
| "epoch": 1.983177570093458, | |
| "grad_norm": 0.028586186468601227, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386159420013428, | |
| "mean_token_accuracy": 0.7832934260368347, | |
| "num_tokens": 8658117.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.52997986972332, | |
| "epoch": 1.9869158878504671, | |
| "grad_norm": 0.03231372311711311, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258426666259766, | |
| "mean_token_accuracy": 0.786494106054306, | |
| "num_tokens": 8674098.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5263413488864899, | |
| "epoch": 1.9906542056074765, | |
| "grad_norm": 0.029255473986268044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267069935798645, | |
| "mean_token_accuracy": 0.784383550286293, | |
| "num_tokens": 8690474.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5337765663862228, | |
| "epoch": 1.994392523364486, | |
| "grad_norm": 0.03723280131816864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434689521789551, | |
| "mean_token_accuracy": 0.7792166471481323, | |
| "num_tokens": 8706774.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5302833914756775, | |
| "epoch": 1.9981308411214953, | |
| "grad_norm": 0.03789842873811722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390503406524658, | |
| "mean_token_accuracy": 0.7825159579515457, | |
| "num_tokens": 8722988.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5365387499332428, | |
| "epoch": 2.0, | |
| "grad_norm": 0.03994116187095642, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442785024642944, | |
| "mean_token_accuracy": 0.779285341501236, | |
| "num_tokens": 8731086.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5551358312368393, | |
| "epoch": 2.0037383177570094, | |
| "grad_norm": 0.03304925188422203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366768836975098, | |
| "mean_token_accuracy": 0.7850453853607178, | |
| "num_tokens": 8747251.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5637228041887283, | |
| "epoch": 2.007476635514019, | |
| "grad_norm": 0.03504426032304764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443665981292725, | |
| "mean_token_accuracy": 0.7774000763893127, | |
| "num_tokens": 8763427.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5427139699459076, | |
| "epoch": 2.011214953271028, | |
| "grad_norm": 0.03504855930805206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313124656677246, | |
| "mean_token_accuracy": 0.7818376272916794, | |
| "num_tokens": 8779836.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5330108106136322, | |
| "epoch": 2.0149532710280376, | |
| "grad_norm": 0.03754406422376633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421642661094666, | |
| "mean_token_accuracy": 0.7790561318397522, | |
| "num_tokens": 8796325.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.512071430683136, | |
| "epoch": 2.0186915887850465, | |
| "grad_norm": 0.043662529438734055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302350521087646, | |
| "mean_token_accuracy": 0.7863733917474747, | |
| "num_tokens": 8812606.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5129958391189575, | |
| "epoch": 2.022429906542056, | |
| "grad_norm": 0.04149031639099121, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309258699417114, | |
| "mean_token_accuracy": 0.7860198318958282, | |
| "num_tokens": 8828882.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5420234501361847, | |
| "epoch": 2.0261682242990653, | |
| "grad_norm": 0.03192834183573723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397300124168396, | |
| "mean_token_accuracy": 0.7826980352401733, | |
| "num_tokens": 8845360.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5496412217617035, | |
| "epoch": 2.0299065420560747, | |
| "grad_norm": 0.03798922896385193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328091979026794, | |
| "mean_token_accuracy": 0.7848182171583176, | |
| "num_tokens": 8861741.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5499916076660156, | |
| "epoch": 2.033644859813084, | |
| "grad_norm": 0.03497615084052086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330801010131836, | |
| "mean_token_accuracy": 0.7823185920715332, | |
| "num_tokens": 8878099.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.5397230982780457, | |
| "epoch": 2.0373831775700935, | |
| "grad_norm": 0.03805805742740631, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325009822845459, | |
| "mean_token_accuracy": 0.7835113406181335, | |
| "num_tokens": 8894613.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.5198622792959213, | |
| "epoch": 2.041121495327103, | |
| "grad_norm": 0.03364388644695282, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222806334495544, | |
| "mean_token_accuracy": 0.7844293862581253, | |
| "num_tokens": 8910849.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.5255338400602341, | |
| "epoch": 2.0448598130841122, | |
| "grad_norm": 0.047903481870889664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388204455375671, | |
| "mean_token_accuracy": 0.7818868011236191, | |
| "num_tokens": 8927305.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5240660309791565, | |
| "epoch": 2.0485981308411216, | |
| "grad_norm": 0.04678136110305786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544981062412262, | |
| "mean_token_accuracy": 0.7767013013362885, | |
| "num_tokens": 8943628.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5418435484170914, | |
| "epoch": 2.052336448598131, | |
| "grad_norm": 0.04154983535408974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431923866271973, | |
| "mean_token_accuracy": 0.7803478538990021, | |
| "num_tokens": 8959739.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5464048683643341, | |
| "epoch": 2.05607476635514, | |
| "grad_norm": 0.03621891885995865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369123220443726, | |
| "mean_token_accuracy": 0.7831740379333496, | |
| "num_tokens": 8975834.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5625316351652145, | |
| "epoch": 2.0598130841121494, | |
| "grad_norm": 0.04116278514266014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496330261230469, | |
| "mean_token_accuracy": 0.7770462930202484, | |
| "num_tokens": 8992265.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5488497316837311, | |
| "epoch": 2.0635514018691588, | |
| "grad_norm": 0.03322463855147362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367662310600281, | |
| "mean_token_accuracy": 0.7818718105554581, | |
| "num_tokens": 9008719.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.5378982275724411, | |
| "epoch": 2.067289719626168, | |
| "grad_norm": 0.034129269421100616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418792963027954, | |
| "mean_token_accuracy": 0.7807257324457169, | |
| "num_tokens": 9025151.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5220974087715149, | |
| "epoch": 2.0710280373831775, | |
| "grad_norm": 0.045197054743766785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300080180168152, | |
| "mean_token_accuracy": 0.7885446846485138, | |
| "num_tokens": 9041486.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.515913613140583, | |
| "epoch": 2.074766355140187, | |
| "grad_norm": 0.04399452358484268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253356099128723, | |
| "mean_token_accuracy": 0.787113681435585, | |
| "num_tokens": 9057792.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.529649943113327, | |
| "epoch": 2.0785046728971963, | |
| "grad_norm": 0.0405830517411232, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332399010658264, | |
| "mean_token_accuracy": 0.7825795114040375, | |
| "num_tokens": 9073971.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.5306390672922134, | |
| "epoch": 2.0822429906542057, | |
| "grad_norm": 0.04040224850177765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5270552039146423, | |
| "mean_token_accuracy": 0.7854219824075699, | |
| "num_tokens": 9090396.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.540916696190834, | |
| "epoch": 2.085981308411215, | |
| "grad_norm": 0.039850566536188126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330172181129456, | |
| "mean_token_accuracy": 0.7840156704187393, | |
| "num_tokens": 9106865.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5573539286851883, | |
| "epoch": 2.0897196261682245, | |
| "grad_norm": 0.039134591817855835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492205023765564, | |
| "mean_token_accuracy": 0.7779581248760223, | |
| "num_tokens": 9123213.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5308785140514374, | |
| "epoch": 2.0934579439252334, | |
| "grad_norm": 0.033643938601017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260533690452576, | |
| "mean_token_accuracy": 0.7881509810686111, | |
| "num_tokens": 9139334.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.5462942272424698, | |
| "epoch": 2.097196261682243, | |
| "grad_norm": 0.0343049094080925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453207492828369, | |
| "mean_token_accuracy": 0.7791396528482437, | |
| "num_tokens": 9155964.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.5272018313407898, | |
| "epoch": 2.100934579439252, | |
| "grad_norm": 0.040583785623311996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357244610786438, | |
| "mean_token_accuracy": 0.7829957753419876, | |
| "num_tokens": 9172409.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5276166945695877, | |
| "epoch": 2.1046728971962616, | |
| "grad_norm": 0.03636649623513222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361207127571106, | |
| "mean_token_accuracy": 0.7831525951623917, | |
| "num_tokens": 9188524.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.5464211106300354, | |
| "epoch": 2.108411214953271, | |
| "grad_norm": 0.0365222692489624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448060035705566, | |
| "mean_token_accuracy": 0.7774559408426285, | |
| "num_tokens": 9204803.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5368735194206238, | |
| "epoch": 2.1121495327102804, | |
| "grad_norm": 0.04034702479839325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308568477630615, | |
| "mean_token_accuracy": 0.784459188580513, | |
| "num_tokens": 9220931.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5340090990066528, | |
| "epoch": 2.1158878504672898, | |
| "grad_norm": 0.03558754175901413, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307760238647461, | |
| "mean_token_accuracy": 0.7841941863298416, | |
| "num_tokens": 9237402.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.554409846663475, | |
| "epoch": 2.119626168224299, | |
| "grad_norm": 0.038797035813331604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491658449172974, | |
| "mean_token_accuracy": 0.7782745659351349, | |
| "num_tokens": 9254002.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.546349972486496, | |
| "epoch": 2.1233644859813086, | |
| "grad_norm": 0.04194206744432449, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519090294837952, | |
| "mean_token_accuracy": 0.7750387489795685, | |
| "num_tokens": 9270313.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5365971177816391, | |
| "epoch": 2.127102803738318, | |
| "grad_norm": 0.045358605682849884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437461733818054, | |
| "mean_token_accuracy": 0.7794076204299927, | |
| "num_tokens": 9286712.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5360657125711441, | |
| "epoch": 2.130841121495327, | |
| "grad_norm": 0.04332416132092476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378158688545227, | |
| "mean_token_accuracy": 0.7812185734510422, | |
| "num_tokens": 9302929.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5161439999938011, | |
| "epoch": 2.1345794392523363, | |
| "grad_norm": 0.03498893231153488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5166691541671753, | |
| "mean_token_accuracy": 0.7898645251989365, | |
| "num_tokens": 9318970.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5420155078172684, | |
| "epoch": 2.1383177570093457, | |
| "grad_norm": 0.059223148971796036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398759841918945, | |
| "mean_token_accuracy": 0.7814654260873795, | |
| "num_tokens": 9335490.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5263395309448242, | |
| "epoch": 2.142056074766355, | |
| "grad_norm": 0.03245805576443672, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229323506355286, | |
| "mean_token_accuracy": 0.7877913564443588, | |
| "num_tokens": 9351959.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5362307131290436, | |
| "epoch": 2.1457943925233645, | |
| "grad_norm": 0.037454549223184586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291175246238708, | |
| "mean_token_accuracy": 0.783667266368866, | |
| "num_tokens": 9368360.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.527548685669899, | |
| "epoch": 2.149532710280374, | |
| "grad_norm": 0.043125126510858536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279426574707031, | |
| "mean_token_accuracy": 0.7838954478502274, | |
| "num_tokens": 9384665.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.543443351984024, | |
| "epoch": 2.1532710280373832, | |
| "grad_norm": 0.03840547800064087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481908321380615, | |
| "mean_token_accuracy": 0.7762167900800705, | |
| "num_tokens": 9400994.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5402033478021622, | |
| "epoch": 2.1570093457943926, | |
| "grad_norm": 0.04524662345647812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483248829841614, | |
| "mean_token_accuracy": 0.7753354609012604, | |
| "num_tokens": 9417287.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5183399319648743, | |
| "epoch": 2.160747663551402, | |
| "grad_norm": 0.033803943544626236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5152841210365295, | |
| "mean_token_accuracy": 0.7872842252254486, | |
| "num_tokens": 9433683.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5163632705807686, | |
| "epoch": 2.1644859813084114, | |
| "grad_norm": 0.036510877311229706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5149884223937988, | |
| "mean_token_accuracy": 0.7905207723379135, | |
| "num_tokens": 9450137.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5321061164140701, | |
| "epoch": 2.1682242990654204, | |
| "grad_norm": 0.0464416965842247, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351567268371582, | |
| "mean_token_accuracy": 0.7838670462369919, | |
| "num_tokens": 9466550.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5199630409479141, | |
| "epoch": 2.1719626168224297, | |
| "grad_norm": 0.04309747740626335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278782844543457, | |
| "mean_token_accuracy": 0.7839005291461945, | |
| "num_tokens": 9482588.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.5339600071310997, | |
| "epoch": 2.175700934579439, | |
| "grad_norm": 0.04095384106040001, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310637354850769, | |
| "mean_token_accuracy": 0.783690795302391, | |
| "num_tokens": 9498951.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5384320765733719, | |
| "epoch": 2.1794392523364485, | |
| "grad_norm": 0.03863927349448204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540824294090271, | |
| "mean_token_accuracy": 0.7791530042886734, | |
| "num_tokens": 9515132.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5549707859754562, | |
| "epoch": 2.183177570093458, | |
| "grad_norm": 0.03921306133270264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536147356033325, | |
| "mean_token_accuracy": 0.7751126140356064, | |
| "num_tokens": 9531512.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5347359776496887, | |
| "epoch": 2.1869158878504673, | |
| "grad_norm": 0.037864800542593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341432094573975, | |
| "mean_token_accuracy": 0.7835363298654556, | |
| "num_tokens": 9547534.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5516605377197266, | |
| "epoch": 2.1906542056074767, | |
| "grad_norm": 0.036846909672021866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443211197853088, | |
| "mean_token_accuracy": 0.7788311243057251, | |
| "num_tokens": 9564040.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5391202419996262, | |
| "epoch": 2.194392523364486, | |
| "grad_norm": 0.03954128175973892, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309199094772339, | |
| "mean_token_accuracy": 0.783383384346962, | |
| "num_tokens": 9580289.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5318265110254288, | |
| "epoch": 2.1981308411214955, | |
| "grad_norm": 0.03327268362045288, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330622792243958, | |
| "mean_token_accuracy": 0.7819591611623764, | |
| "num_tokens": 9596500.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5139677748084068, | |
| "epoch": 2.201869158878505, | |
| "grad_norm": 0.039606738835573196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.520559549331665, | |
| "mean_token_accuracy": 0.7877521514892578, | |
| "num_tokens": 9612675.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5283454358577728, | |
| "epoch": 2.205607476635514, | |
| "grad_norm": 0.03826924040913582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321468710899353, | |
| "mean_token_accuracy": 0.7843296527862549, | |
| "num_tokens": 9629044.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5257805287837982, | |
| "epoch": 2.209345794392523, | |
| "grad_norm": 0.04099821671843529, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277660489082336, | |
| "mean_token_accuracy": 0.7833193689584732, | |
| "num_tokens": 9645271.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.5350408107042313, | |
| "epoch": 2.2130841121495326, | |
| "grad_norm": 0.038267582654953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5255724787712097, | |
| "mean_token_accuracy": 0.7867475599050522, | |
| "num_tokens": 9661448.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5472716838121414, | |
| "epoch": 2.216822429906542, | |
| "grad_norm": 0.03405248373746872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390135645866394, | |
| "mean_token_accuracy": 0.779327467083931, | |
| "num_tokens": 9677824.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5421159714460373, | |
| "epoch": 2.2205607476635514, | |
| "grad_norm": 0.041895944625139236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395660400390625, | |
| "mean_token_accuracy": 0.7796223610639572, | |
| "num_tokens": 9694305.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5459330081939697, | |
| "epoch": 2.2242990654205608, | |
| "grad_norm": 0.036602918058633804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457043647766113, | |
| "mean_token_accuracy": 0.7810876667499542, | |
| "num_tokens": 9710852.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5278807803988457, | |
| "epoch": 2.22803738317757, | |
| "grad_norm": 0.04418497160077095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371560454368591, | |
| "mean_token_accuracy": 0.7824568003416061, | |
| "num_tokens": 9727075.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5311697870492935, | |
| "epoch": 2.2317757009345796, | |
| "grad_norm": 0.043200667947530746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364136695861816, | |
| "mean_token_accuracy": 0.783041849732399, | |
| "num_tokens": 9743306.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5302419811487198, | |
| "epoch": 2.235514018691589, | |
| "grad_norm": 0.037720005959272385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262041091918945, | |
| "mean_token_accuracy": 0.7870023250579834, | |
| "num_tokens": 9759403.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5483334362506866, | |
| "epoch": 2.2392523364485983, | |
| "grad_norm": 0.03560694679617882, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467509627342224, | |
| "mean_token_accuracy": 0.779225081205368, | |
| "num_tokens": 9775738.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5375639796257019, | |
| "epoch": 2.2429906542056073, | |
| "grad_norm": 0.03993435204029083, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336683988571167, | |
| "mean_token_accuracy": 0.7839321345090866, | |
| "num_tokens": 9792043.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.544166311621666, | |
| "epoch": 2.2467289719626167, | |
| "grad_norm": 0.03602972254157066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403839945793152, | |
| "mean_token_accuracy": 0.7812667638063431, | |
| "num_tokens": 9808431.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5295002460479736, | |
| "epoch": 2.250467289719626, | |
| "grad_norm": 0.041549984365701675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339419841766357, | |
| "mean_token_accuracy": 0.7843643128871918, | |
| "num_tokens": 9824744.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5211731493473053, | |
| "epoch": 2.2542056074766355, | |
| "grad_norm": 0.04408840090036392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288305878639221, | |
| "mean_token_accuracy": 0.7842673063278198, | |
| "num_tokens": 9841081.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5425246208906174, | |
| "epoch": 2.257943925233645, | |
| "grad_norm": 0.04026458412408829, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444083213806152, | |
| "mean_token_accuracy": 0.7781710475683212, | |
| "num_tokens": 9857545.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5519444048404694, | |
| "epoch": 2.2616822429906542, | |
| "grad_norm": 0.03973834961652756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547622799873352, | |
| "mean_token_accuracy": 0.7769842147827148, | |
| "num_tokens": 9873925.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5228262096643448, | |
| "epoch": 2.2654205607476636, | |
| "grad_norm": 0.041971541941165924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222245454788208, | |
| "mean_token_accuracy": 0.7858153134584427, | |
| "num_tokens": 9890052.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5335221141576767, | |
| "epoch": 2.269158878504673, | |
| "grad_norm": 0.039673078805208206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314098000526428, | |
| "mean_token_accuracy": 0.7840564250946045, | |
| "num_tokens": 9906259.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5426364839076996, | |
| "epoch": 2.2728971962616824, | |
| "grad_norm": 0.04128013923764229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407010316848755, | |
| "mean_token_accuracy": 0.7802868187427521, | |
| "num_tokens": 9922434.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.5306970030069351, | |
| "epoch": 2.2766355140186914, | |
| "grad_norm": 0.03684001415967941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325096845626831, | |
| "mean_token_accuracy": 0.7816676050424576, | |
| "num_tokens": 9938715.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5312017947435379, | |
| "epoch": 2.2803738317757007, | |
| "grad_norm": 0.0396246500313282, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326136350631714, | |
| "mean_token_accuracy": 0.7833829969167709, | |
| "num_tokens": 9954795.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.5242188572883606, | |
| "epoch": 2.28411214953271, | |
| "grad_norm": 0.03666768968105316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5254257321357727, | |
| "mean_token_accuracy": 0.785698264837265, | |
| "num_tokens": 9970976.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5251396894454956, | |
| "epoch": 2.2878504672897195, | |
| "grad_norm": 0.041744161397218704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361155867576599, | |
| "mean_token_accuracy": 0.781558558344841, | |
| "num_tokens": 9987242.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5212117433547974, | |
| "epoch": 2.291588785046729, | |
| "grad_norm": 0.044306471943855286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5255172252655029, | |
| "mean_token_accuracy": 0.7819651514291763, | |
| "num_tokens": 10003383.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5342397391796112, | |
| "epoch": 2.2953271028037383, | |
| "grad_norm": 0.04804427549242973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286440849304199, | |
| "mean_token_accuracy": 0.7870652973651886, | |
| "num_tokens": 10019705.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5513401627540588, | |
| "epoch": 2.2990654205607477, | |
| "grad_norm": 0.04101845622062683, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483744144439697, | |
| "mean_token_accuracy": 0.7755522131919861, | |
| "num_tokens": 10035997.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5434563606977463, | |
| "epoch": 2.302803738317757, | |
| "grad_norm": 0.036619942635297775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326208472251892, | |
| "mean_token_accuracy": 0.782253697514534, | |
| "num_tokens": 10052253.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5315294414758682, | |
| "epoch": 2.3065420560747665, | |
| "grad_norm": 0.037794552743434906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253270864486694, | |
| "mean_token_accuracy": 0.7854621708393097, | |
| "num_tokens": 10068502.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5264740660786629, | |
| "epoch": 2.310280373831776, | |
| "grad_norm": 0.05285142362117767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347273349761963, | |
| "mean_token_accuracy": 0.7845266908407211, | |
| "num_tokens": 10084722.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5410954803228378, | |
| "epoch": 2.3140186915887853, | |
| "grad_norm": 0.036392901092767715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492109060287476, | |
| "mean_token_accuracy": 0.775203213095665, | |
| "num_tokens": 10101110.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5478453040122986, | |
| "epoch": 2.317757009345794, | |
| "grad_norm": 0.0461491234600544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482407808303833, | |
| "mean_token_accuracy": 0.7783631533384323, | |
| "num_tokens": 10117543.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.515753298997879, | |
| "epoch": 2.3214953271028036, | |
| "grad_norm": 0.04075627774000168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5150102972984314, | |
| "mean_token_accuracy": 0.789474606513977, | |
| "num_tokens": 10133572.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5349336713552475, | |
| "epoch": 2.325233644859813, | |
| "grad_norm": 0.042154040187597275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526114821434021, | |
| "mean_token_accuracy": 0.7856980115175247, | |
| "num_tokens": 10150048.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5674707591533661, | |
| "epoch": 2.3289719626168224, | |
| "grad_norm": 0.04182770103216171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611693859100342, | |
| "mean_token_accuracy": 0.7749929875135422, | |
| "num_tokens": 10166642.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5181543081998825, | |
| "epoch": 2.3327102803738318, | |
| "grad_norm": 0.038145892322063446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5206056833267212, | |
| "mean_token_accuracy": 0.788123145699501, | |
| "num_tokens": 10182897.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.5357862561941147, | |
| "epoch": 2.336448598130841, | |
| "grad_norm": 0.04366487264633179, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423003435134888, | |
| "mean_token_accuracy": 0.7787369638681412, | |
| "num_tokens": 10199311.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5277369916439056, | |
| "epoch": 2.3401869158878505, | |
| "grad_norm": 0.05174623429775238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539736270904541, | |
| "mean_token_accuracy": 0.7798131704330444, | |
| "num_tokens": 10215707.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5540482401847839, | |
| "epoch": 2.34392523364486, | |
| "grad_norm": 0.03900719806551933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546514391899109, | |
| "mean_token_accuracy": 0.7751745879650116, | |
| "num_tokens": 10232233.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5211993083357811, | |
| "epoch": 2.3476635514018693, | |
| "grad_norm": 0.044696055352687836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5210398435592651, | |
| "mean_token_accuracy": 0.7867566049098969, | |
| "num_tokens": 10248397.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5406811684370041, | |
| "epoch": 2.3514018691588783, | |
| "grad_norm": 0.04107234627008438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430042147636414, | |
| "mean_token_accuracy": 0.7786548435688019, | |
| "num_tokens": 10264653.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.538291797041893, | |
| "epoch": 2.3551401869158877, | |
| "grad_norm": 0.03656275197863579, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534942090511322, | |
| "mean_token_accuracy": 0.7826343178749084, | |
| "num_tokens": 10280941.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5547115802764893, | |
| "epoch": 2.358878504672897, | |
| "grad_norm": 0.04424076899886131, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602344870567322, | |
| "mean_token_accuracy": 0.7771879583597183, | |
| "num_tokens": 10297564.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5327815413475037, | |
| "epoch": 2.3626168224299064, | |
| "grad_norm": 0.04512718692421913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529172420501709, | |
| "mean_token_accuracy": 0.7825805693864822, | |
| "num_tokens": 10313759.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5432299822568893, | |
| "epoch": 2.366355140186916, | |
| "grad_norm": 0.040462445467710495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389863848686218, | |
| "mean_token_accuracy": 0.779638260602951, | |
| "num_tokens": 10330290.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5529568791389465, | |
| "epoch": 2.3700934579439252, | |
| "grad_norm": 0.04414237663149834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526305437088013, | |
| "mean_token_accuracy": 0.7754997760057449, | |
| "num_tokens": 10346636.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5441652536392212, | |
| "epoch": 2.3738317757009346, | |
| "grad_norm": 0.037299707531929016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382997393608093, | |
| "mean_token_accuracy": 0.7791097015142441, | |
| "num_tokens": 10362922.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5348048955202103, | |
| "epoch": 2.377570093457944, | |
| "grad_norm": 0.0446464829146862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380210876464844, | |
| "mean_token_accuracy": 0.7818952798843384, | |
| "num_tokens": 10379134.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5187151804566383, | |
| "epoch": 2.3813084112149534, | |
| "grad_norm": 0.0778694897890091, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220566391944885, | |
| "mean_token_accuracy": 0.7889348715543747, | |
| "num_tokens": 10395255.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5462511032819748, | |
| "epoch": 2.385046728971963, | |
| "grad_norm": 0.04299847036600113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423526167869568, | |
| "mean_token_accuracy": 0.7763472348451614, | |
| "num_tokens": 10411644.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5463699400424957, | |
| "epoch": 2.388785046728972, | |
| "grad_norm": 0.10935911536216736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554538369178772, | |
| "mean_token_accuracy": 0.7772965431213379, | |
| "num_tokens": 10427999.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5152165368199348, | |
| "epoch": 2.392523364485981, | |
| "grad_norm": 0.03762959688901901, | |
| "learning_rate": 0.0002, | |
| "loss": 0.508588969707489, | |
| "mean_token_accuracy": 0.7926003634929657, | |
| "num_tokens": 10444169.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.529686912894249, | |
| "epoch": 2.3962616822429905, | |
| "grad_norm": 0.040958285331726074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307521820068359, | |
| "mean_token_accuracy": 0.7849727272987366, | |
| "num_tokens": 10460506.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5430792719125748, | |
| "epoch": 2.4, | |
| "grad_norm": 0.059025488793849945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434512495994568, | |
| "mean_token_accuracy": 0.7796961963176727, | |
| "num_tokens": 10476852.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5448063015937805, | |
| "epoch": 2.4037383177570093, | |
| "grad_norm": 0.040974777191877365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473527312278748, | |
| "mean_token_accuracy": 0.7792296558618546, | |
| "num_tokens": 10493362.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.5385838449001312, | |
| "epoch": 2.4074766355140187, | |
| "grad_norm": 0.03980987146496773, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398511290550232, | |
| "mean_token_accuracy": 0.7808338552713394, | |
| "num_tokens": 10509993.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5397947132587433, | |
| "epoch": 2.411214953271028, | |
| "grad_norm": 0.04422999173402786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439976453781128, | |
| "mean_token_accuracy": 0.7772432416677475, | |
| "num_tokens": 10525999.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5487875193357468, | |
| "epoch": 2.4149532710280375, | |
| "grad_norm": 0.035030197352170944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411213636398315, | |
| "mean_token_accuracy": 0.7808128446340561, | |
| "num_tokens": 10542385.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.5536469519138336, | |
| "epoch": 2.418691588785047, | |
| "grad_norm": 0.03504094481468201, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501288771629333, | |
| "mean_token_accuracy": 0.7798037678003311, | |
| "num_tokens": 10558968.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.542830765247345, | |
| "epoch": 2.4224299065420563, | |
| "grad_norm": 0.04252900928258896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463917255401611, | |
| "mean_token_accuracy": 0.7780060321092606, | |
| "num_tokens": 10575204.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5445516556501389, | |
| "epoch": 2.426168224299065, | |
| "grad_norm": 0.03962906450033188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398474335670471, | |
| "mean_token_accuracy": 0.7808130532503128, | |
| "num_tokens": 10591758.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5405502319335938, | |
| "epoch": 2.4299065420560746, | |
| "grad_norm": 0.0443168580532074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365331172943115, | |
| "mean_token_accuracy": 0.7831508964300156, | |
| "num_tokens": 10608086.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5417730808258057, | |
| "epoch": 2.433644859813084, | |
| "grad_norm": 0.03887809067964554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410832166671753, | |
| "mean_token_accuracy": 0.7785631865262985, | |
| "num_tokens": 10624498.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.539076067507267, | |
| "epoch": 2.4373831775700934, | |
| "grad_norm": 0.03908571973443031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387341976165771, | |
| "mean_token_accuracy": 0.781864196062088, | |
| "num_tokens": 10640880.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5390027314424515, | |
| "epoch": 2.4411214953271028, | |
| "grad_norm": 0.03712445870041847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360729694366455, | |
| "mean_token_accuracy": 0.783073827624321, | |
| "num_tokens": 10657400.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5502242594957352, | |
| "epoch": 2.444859813084112, | |
| "grad_norm": 0.03870626538991928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568853616714478, | |
| "mean_token_accuracy": 0.7743858247995377, | |
| "num_tokens": 10673826.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.525546170771122, | |
| "epoch": 2.4485981308411215, | |
| "grad_norm": 0.05200404301285744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247287154197693, | |
| "mean_token_accuracy": 0.787117063999176, | |
| "num_tokens": 10690101.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5489766597747803, | |
| "epoch": 2.452336448598131, | |
| "grad_norm": 0.03731005638837814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479599833488464, | |
| "mean_token_accuracy": 0.7739788293838501, | |
| "num_tokens": 10706469.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5457844734191895, | |
| "epoch": 2.4560747663551403, | |
| "grad_norm": 0.03958994895219803, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466060638427734, | |
| "mean_token_accuracy": 0.776677593588829, | |
| "num_tokens": 10722827.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5301162749528885, | |
| "epoch": 2.4598130841121497, | |
| "grad_norm": 0.04651971161365509, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345625281333923, | |
| "mean_token_accuracy": 0.7808788865804672, | |
| "num_tokens": 10739136.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5545621961355209, | |
| "epoch": 2.463551401869159, | |
| "grad_norm": 0.04008018597960472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584450960159302, | |
| "mean_token_accuracy": 0.7706544101238251, | |
| "num_tokens": 10755369.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5189358592033386, | |
| "epoch": 2.467289719626168, | |
| "grad_norm": 0.040387995541095734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5199939608573914, | |
| "mean_token_accuracy": 0.7878802865743637, | |
| "num_tokens": 10771408.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5370910465717316, | |
| "epoch": 2.4710280373831774, | |
| "grad_norm": 0.04395879805088043, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534496545791626, | |
| "mean_token_accuracy": 0.7834903597831726, | |
| "num_tokens": 10787604.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5326719284057617, | |
| "epoch": 2.474766355140187, | |
| "grad_norm": 0.04668545350432396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241788029670715, | |
| "mean_token_accuracy": 0.7905293852090836, | |
| "num_tokens": 10803945.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5368177741765976, | |
| "epoch": 2.4785046728971962, | |
| "grad_norm": 0.04925902187824249, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367681384086609, | |
| "mean_token_accuracy": 0.7809154391288757, | |
| "num_tokens": 10820178.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5293789505958557, | |
| "epoch": 2.4822429906542056, | |
| "grad_norm": 0.041696734726428986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327548980712891, | |
| "mean_token_accuracy": 0.7873236238956451, | |
| "num_tokens": 10836561.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.529408723115921, | |
| "epoch": 2.485981308411215, | |
| "grad_norm": 0.041212067008018494, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328470468521118, | |
| "mean_token_accuracy": 0.7832391858100891, | |
| "num_tokens": 10852980.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5545576214790344, | |
| "epoch": 2.4897196261682244, | |
| "grad_norm": 0.04478580132126808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554249286651611, | |
| "mean_token_accuracy": 0.7741198241710663, | |
| "num_tokens": 10869321.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5539140552282333, | |
| "epoch": 2.493457943925234, | |
| "grad_norm": 0.04277152568101883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493362545967102, | |
| "mean_token_accuracy": 0.7759024202823639, | |
| "num_tokens": 10885666.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.5433756709098816, | |
| "epoch": 2.497196261682243, | |
| "grad_norm": 0.04360437020659447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412634611129761, | |
| "mean_token_accuracy": 0.7808667570352554, | |
| "num_tokens": 10901903.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5487286895513535, | |
| "epoch": 2.500934579439252, | |
| "grad_norm": 0.03885580971837044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431787371635437, | |
| "mean_token_accuracy": 0.7802725732326508, | |
| "num_tokens": 10918340.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.5228707492351532, | |
| "epoch": 2.5046728971962615, | |
| "grad_norm": 0.053798187524080276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311392545700073, | |
| "mean_token_accuracy": 0.7843292206525803, | |
| "num_tokens": 10934469.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5447903871536255, | |
| "epoch": 2.508411214953271, | |
| "grad_norm": 0.05324989929795265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491751432418823, | |
| "mean_token_accuracy": 0.7752528339624405, | |
| "num_tokens": 10950837.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.5308417528867722, | |
| "epoch": 2.5121495327102803, | |
| "grad_norm": 0.06228797510266304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361084938049316, | |
| "mean_token_accuracy": 0.7828515321016312, | |
| "num_tokens": 10967098.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5403530299663544, | |
| "epoch": 2.5158878504672897, | |
| "grad_norm": 0.051257163286209106, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542191207408905, | |
| "mean_token_accuracy": 0.7825300693511963, | |
| "num_tokens": 10983262.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5413467437028885, | |
| "epoch": 2.519626168224299, | |
| "grad_norm": 0.04910978302359581, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313704013824463, | |
| "mean_token_accuracy": 0.7851869165897369, | |
| "num_tokens": 10999552.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.55167156457901, | |
| "epoch": 2.5233644859813085, | |
| "grad_norm": 0.033519063144922256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438812971115112, | |
| "mean_token_accuracy": 0.7780154794454575, | |
| "num_tokens": 11016044.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5392196476459503, | |
| "epoch": 2.527102803738318, | |
| "grad_norm": 0.04278670251369476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411216020584106, | |
| "mean_token_accuracy": 0.780839130282402, | |
| "num_tokens": 11032377.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5352826565504074, | |
| "epoch": 2.5308411214953273, | |
| "grad_norm": 0.04736237972974777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446096658706665, | |
| "mean_token_accuracy": 0.7806870341300964, | |
| "num_tokens": 11048727.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5168470665812492, | |
| "epoch": 2.5345794392523366, | |
| "grad_norm": 0.03513955697417259, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200102925300598, | |
| "mean_token_accuracy": 0.7874528765678406, | |
| "num_tokens": 11064947.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5375211834907532, | |
| "epoch": 2.538317757009346, | |
| "grad_norm": 0.04709267243742943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393041968345642, | |
| "mean_token_accuracy": 0.7837181091308594, | |
| "num_tokens": 11081532.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5512478798627853, | |
| "epoch": 2.542056074766355, | |
| "grad_norm": 0.04090959206223488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546190619468689, | |
| "mean_token_accuracy": 0.7762559801340103, | |
| "num_tokens": 11098073.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5283504128456116, | |
| "epoch": 2.5457943925233644, | |
| "grad_norm": 0.036959145218133926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5237979292869568, | |
| "mean_token_accuracy": 0.7874845713376999, | |
| "num_tokens": 11114315.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5489681363105774, | |
| "epoch": 2.5495327102803738, | |
| "grad_norm": 0.04488472267985344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456336736679077, | |
| "mean_token_accuracy": 0.7797751575708389, | |
| "num_tokens": 11130665.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5317860543727875, | |
| "epoch": 2.553271028037383, | |
| "grad_norm": 0.04248347505927086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382874011993408, | |
| "mean_token_accuracy": 0.77965147793293, | |
| "num_tokens": 11146874.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5419623553752899, | |
| "epoch": 2.5570093457943925, | |
| "grad_norm": 0.04522377625107765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449318289756775, | |
| "mean_token_accuracy": 0.7786058634519577, | |
| "num_tokens": 11163427.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5241860747337341, | |
| "epoch": 2.560747663551402, | |
| "grad_norm": 0.04621601849794388, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267641544342041, | |
| "mean_token_accuracy": 0.7829258441925049, | |
| "num_tokens": 11179801.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.5173597782850266, | |
| "epoch": 2.5644859813084113, | |
| "grad_norm": 0.043366726487874985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181450843811035, | |
| "mean_token_accuracy": 0.7898700088262558, | |
| "num_tokens": 11196083.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.538482740521431, | |
| "epoch": 2.5682242990654207, | |
| "grad_norm": 0.04418179765343666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392533540725708, | |
| "mean_token_accuracy": 0.778387576341629, | |
| "num_tokens": 11212295.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.540611207485199, | |
| "epoch": 2.5719626168224297, | |
| "grad_norm": 0.05271269753575325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393270254135132, | |
| "mean_token_accuracy": 0.7812009155750275, | |
| "num_tokens": 11228565.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.5282483994960785, | |
| "epoch": 2.575700934579439, | |
| "grad_norm": 0.04314183071255684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5224794149398804, | |
| "mean_token_accuracy": 0.7856594175100327, | |
| "num_tokens": 11244953.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.5318177044391632, | |
| "epoch": 2.5794392523364484, | |
| "grad_norm": 0.05587287247180939, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358354449272156, | |
| "mean_token_accuracy": 0.7822671979665756, | |
| "num_tokens": 11261194.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5375986397266388, | |
| "epoch": 2.583177570093458, | |
| "grad_norm": 0.043386682868003845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412317514419556, | |
| "mean_token_accuracy": 0.781296119093895, | |
| "num_tokens": 11277286.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5498186945915222, | |
| "epoch": 2.586915887850467, | |
| "grad_norm": 0.04709560051560402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513982176780701, | |
| "mean_token_accuracy": 0.7768333256244659, | |
| "num_tokens": 11293799.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5409555584192276, | |
| "epoch": 2.5906542056074766, | |
| "grad_norm": 0.04518339782953262, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396868586540222, | |
| "mean_token_accuracy": 0.7791042476892471, | |
| "num_tokens": 11310089.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5236431509256363, | |
| "epoch": 2.594392523364486, | |
| "grad_norm": 0.03244040906429291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5155695676803589, | |
| "mean_token_accuracy": 0.7898247241973877, | |
| "num_tokens": 11326515.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5529845803976059, | |
| "epoch": 2.5981308411214954, | |
| "grad_norm": 0.04760007932782173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487071871757507, | |
| "mean_token_accuracy": 0.7782804220914841, | |
| "num_tokens": 11342994.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5314944535493851, | |
| "epoch": 2.601869158878505, | |
| "grad_norm": 0.0422595851123333, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344254970550537, | |
| "mean_token_accuracy": 0.7827649861574173, | |
| "num_tokens": 11359320.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5296527296304703, | |
| "epoch": 2.605607476635514, | |
| "grad_norm": 0.04541509971022606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399951338768005, | |
| "mean_token_accuracy": 0.7812868803739548, | |
| "num_tokens": 11375866.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.5503706336021423, | |
| "epoch": 2.6093457943925236, | |
| "grad_norm": 0.04639806970953941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560705304145813, | |
| "mean_token_accuracy": 0.7734115719795227, | |
| "num_tokens": 11392189.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5334575325250626, | |
| "epoch": 2.613084112149533, | |
| "grad_norm": 0.03491205349564552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285266637802124, | |
| "mean_token_accuracy": 0.786865234375, | |
| "num_tokens": 11408320.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5375584214925766, | |
| "epoch": 2.616822429906542, | |
| "grad_norm": 0.03665752336382866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285854935646057, | |
| "mean_token_accuracy": 0.7843970507383347, | |
| "num_tokens": 11424696.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5432839095592499, | |
| "epoch": 2.6205607476635513, | |
| "grad_norm": 0.040845148265361786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354432463645935, | |
| "mean_token_accuracy": 0.7819717228412628, | |
| "num_tokens": 11440921.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.5447598993778229, | |
| "epoch": 2.6242990654205607, | |
| "grad_norm": 0.03317207470536232, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364579558372498, | |
| "mean_token_accuracy": 0.7815430164337158, | |
| "num_tokens": 11457136.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5318229794502258, | |
| "epoch": 2.62803738317757, | |
| "grad_norm": 0.04842844605445862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381250381469727, | |
| "mean_token_accuracy": 0.7842467576265335, | |
| "num_tokens": 11473451.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.53319051861763, | |
| "epoch": 2.6317757009345795, | |
| "grad_norm": 0.04995809122920036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435810089111328, | |
| "mean_token_accuracy": 0.7806897163391113, | |
| "num_tokens": 11489778.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5205372422933578, | |
| "epoch": 2.635514018691589, | |
| "grad_norm": 0.043053507804870605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5225018858909607, | |
| "mean_token_accuracy": 0.7891059070825577, | |
| "num_tokens": 11506150.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.5405721217393875, | |
| "epoch": 2.6392523364485982, | |
| "grad_norm": 0.047551702708005905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341666340827942, | |
| "mean_token_accuracy": 0.7827833145856857, | |
| "num_tokens": 11522269.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.555420309305191, | |
| "epoch": 2.6429906542056076, | |
| "grad_norm": 0.04240434989333153, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463941097259521, | |
| "mean_token_accuracy": 0.776122510433197, | |
| "num_tokens": 11538672.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5373465269804001, | |
| "epoch": 2.6467289719626166, | |
| "grad_norm": 0.04053036868572235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378127694129944, | |
| "mean_token_accuracy": 0.7802188992500305, | |
| "num_tokens": 11554872.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.554849311709404, | |
| "epoch": 2.650467289719626, | |
| "grad_norm": 0.03659540414810181, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495964288711548, | |
| "mean_token_accuracy": 0.7751747816801071, | |
| "num_tokens": 11571048.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5463902503252029, | |
| "epoch": 2.6542056074766354, | |
| "grad_norm": 0.04418041929602623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471721887588501, | |
| "mean_token_accuracy": 0.7752395421266556, | |
| "num_tokens": 11587320.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5346667915582657, | |
| "epoch": 2.6579439252336448, | |
| "grad_norm": 0.03727971389889717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335649847984314, | |
| "mean_token_accuracy": 0.7821184396743774, | |
| "num_tokens": 11603606.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5425343364477158, | |
| "epoch": 2.661682242990654, | |
| "grad_norm": 0.03725122660398483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478883385658264, | |
| "mean_token_accuracy": 0.7786499708890915, | |
| "num_tokens": 11619898.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5213692635297775, | |
| "epoch": 2.6654205607476635, | |
| "grad_norm": 0.042857397347688675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380342602729797, | |
| "mean_token_accuracy": 0.7818091064691544, | |
| "num_tokens": 11636325.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.514741487801075, | |
| "epoch": 2.669158878504673, | |
| "grad_norm": 0.035097621381282806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5151344537734985, | |
| "mean_token_accuracy": 0.7884217798709869, | |
| "num_tokens": 11652621.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5442497134208679, | |
| "epoch": 2.6728971962616823, | |
| "grad_norm": 0.04381122440099716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412749648094177, | |
| "mean_token_accuracy": 0.7799884676933289, | |
| "num_tokens": 11669129.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5303985998034477, | |
| "epoch": 2.6766355140186917, | |
| "grad_norm": 0.03387914225459099, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5209308862686157, | |
| "mean_token_accuracy": 0.7879882901906967, | |
| "num_tokens": 11685246.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.551127091050148, | |
| "epoch": 2.680373831775701, | |
| "grad_norm": 0.03922301158308983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454061031341553, | |
| "mean_token_accuracy": 0.7784066051244736, | |
| "num_tokens": 11701476.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.537367194890976, | |
| "epoch": 2.6841121495327105, | |
| "grad_norm": 0.038754355162382126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407044887542725, | |
| "mean_token_accuracy": 0.7816831916570663, | |
| "num_tokens": 11717876.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.5448082834482193, | |
| "epoch": 2.68785046728972, | |
| "grad_norm": 0.039220135658979416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474362373352051, | |
| "mean_token_accuracy": 0.7776313573122025, | |
| "num_tokens": 11734335.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5400021821260452, | |
| "epoch": 2.691588785046729, | |
| "grad_norm": 0.04735405370593071, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481384992599487, | |
| "mean_token_accuracy": 0.7767128497362137, | |
| "num_tokens": 11750551.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5442029386758804, | |
| "epoch": 2.695327102803738, | |
| "grad_norm": 0.04216023534536362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538774728775024, | |
| "mean_token_accuracy": 0.7767860740423203, | |
| "num_tokens": 11766874.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5446023046970367, | |
| "epoch": 2.6990654205607476, | |
| "grad_norm": 0.036887411028146744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384114384651184, | |
| "mean_token_accuracy": 0.7818654030561447, | |
| "num_tokens": 11783153.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5451595932245255, | |
| "epoch": 2.702803738317757, | |
| "grad_norm": 0.03859608620405197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347609519958496, | |
| "mean_token_accuracy": 0.781577005982399, | |
| "num_tokens": 11799221.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5464123338460922, | |
| "epoch": 2.7065420560747664, | |
| "grad_norm": 0.04104648903012276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531836986541748, | |
| "mean_token_accuracy": 0.7847746908664703, | |
| "num_tokens": 11815592.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.5458803474903107, | |
| "epoch": 2.710280373831776, | |
| "grad_norm": 0.041141774505376816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450369119644165, | |
| "mean_token_accuracy": 0.7772473990917206, | |
| "num_tokens": 11831810.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5207616165280342, | |
| "epoch": 2.714018691588785, | |
| "grad_norm": 0.039117299020290375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5268270969390869, | |
| "mean_token_accuracy": 0.7860666513442993, | |
| "num_tokens": 11848039.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5192839056253433, | |
| "epoch": 2.717757009345794, | |
| "grad_norm": 0.03917457163333893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228926539421082, | |
| "mean_token_accuracy": 0.7870692610740662, | |
| "num_tokens": 11864185.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5525725483894348, | |
| "epoch": 2.7214953271028035, | |
| "grad_norm": 0.04475993663072586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607837438583374, | |
| "mean_token_accuracy": 0.7710844576358795, | |
| "num_tokens": 11880885.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5314790159463882, | |
| "epoch": 2.725233644859813, | |
| "grad_norm": 0.03775126487016678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314686298370361, | |
| "mean_token_accuracy": 0.7859503030776978, | |
| "num_tokens": 11897351.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5637041479349136, | |
| "epoch": 2.7289719626168223, | |
| "grad_norm": 0.045830611139535904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615176558494568, | |
| "mean_token_accuracy": 0.7733500599861145, | |
| "num_tokens": 11913886.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5528976023197174, | |
| "epoch": 2.7327102803738317, | |
| "grad_norm": 0.0355507992208004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482446551322937, | |
| "mean_token_accuracy": 0.7790254056453705, | |
| "num_tokens": 11930270.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.521368145942688, | |
| "epoch": 2.736448598130841, | |
| "grad_norm": 0.040386781096458435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5189903974533081, | |
| "mean_token_accuracy": 0.7861309498548508, | |
| "num_tokens": 11946624.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.5495569705963135, | |
| "epoch": 2.7401869158878505, | |
| "grad_norm": 0.04659309610724449, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496231913566589, | |
| "mean_token_accuracy": 0.7766851484775543, | |
| "num_tokens": 11963057.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5380824655294418, | |
| "epoch": 2.74392523364486, | |
| "grad_norm": 0.04431717097759247, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472241640090942, | |
| "mean_token_accuracy": 0.7799153625965118, | |
| "num_tokens": 11979414.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5362866371870041, | |
| "epoch": 2.7476635514018692, | |
| "grad_norm": 0.04207630082964897, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480789542198181, | |
| "mean_token_accuracy": 0.7744766473770142, | |
| "num_tokens": 11995788.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5203833281993866, | |
| "epoch": 2.7514018691588786, | |
| "grad_norm": 0.040439583361148834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229013562202454, | |
| "mean_token_accuracy": 0.7877133041620255, | |
| "num_tokens": 12011768.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5442389398813248, | |
| "epoch": 2.755140186915888, | |
| "grad_norm": 0.036312710493803024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421340465545654, | |
| "mean_token_accuracy": 0.7801235765218735, | |
| "num_tokens": 12027990.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.540812149643898, | |
| "epoch": 2.7588785046728974, | |
| "grad_norm": 0.035805970430374146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289261937141418, | |
| "mean_token_accuracy": 0.7858118265867233, | |
| "num_tokens": 12044016.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5561389774084091, | |
| "epoch": 2.762616822429907, | |
| "grad_norm": 0.03753306344151497, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497045516967773, | |
| "mean_token_accuracy": 0.7774728685617447, | |
| "num_tokens": 12060449.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5353166311979294, | |
| "epoch": 2.7663551401869158, | |
| "grad_norm": 0.04419036954641342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267462134361267, | |
| "mean_token_accuracy": 0.7831297665834427, | |
| "num_tokens": 12076756.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5390448272228241, | |
| "epoch": 2.770093457943925, | |
| "grad_norm": 0.039156846702098846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363330841064453, | |
| "mean_token_accuracy": 0.7822138518095016, | |
| "num_tokens": 12093231.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5334637314081192, | |
| "epoch": 2.7738317757009345, | |
| "grad_norm": 0.03978954628109932, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416637659072876, | |
| "mean_token_accuracy": 0.782222107052803, | |
| "num_tokens": 12109520.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5362211316823959, | |
| "epoch": 2.777570093457944, | |
| "grad_norm": 0.04728684201836586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461055040359497, | |
| "mean_token_accuracy": 0.7771897614002228, | |
| "num_tokens": 12125527.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5383228212594986, | |
| "epoch": 2.7813084112149533, | |
| "grad_norm": 0.03740681707859039, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361698269844055, | |
| "mean_token_accuracy": 0.7826491445302963, | |
| "num_tokens": 12141826.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5330131649971008, | |
| "epoch": 2.7850467289719627, | |
| "grad_norm": 0.03758367896080017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265568494796753, | |
| "mean_token_accuracy": 0.7877195477485657, | |
| "num_tokens": 12157984.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5397753864526749, | |
| "epoch": 2.788785046728972, | |
| "grad_norm": 0.042070865631103516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313206911087036, | |
| "mean_token_accuracy": 0.7845780104398727, | |
| "num_tokens": 12174529.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.5600686222314835, | |
| "epoch": 2.792523364485981, | |
| "grad_norm": 0.0377703532576561, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598015189170837, | |
| "mean_token_accuracy": 0.7710230052471161, | |
| "num_tokens": 12190857.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.5242457091808319, | |
| "epoch": 2.7962616822429904, | |
| "grad_norm": 0.036673370748758316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266134738922119, | |
| "mean_token_accuracy": 0.7835761904716492, | |
| "num_tokens": 12207046.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5196694731712341, | |
| "epoch": 2.8, | |
| "grad_norm": 0.04529178887605667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295214653015137, | |
| "mean_token_accuracy": 0.7850393652915955, | |
| "num_tokens": 12223323.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5278067588806152, | |
| "epoch": 2.803738317757009, | |
| "grad_norm": 0.04078579694032669, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326597094535828, | |
| "mean_token_accuracy": 0.7830272614955902, | |
| "num_tokens": 12239416.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5326859503984451, | |
| "epoch": 2.8074766355140186, | |
| "grad_norm": 0.04164998233318329, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332698225975037, | |
| "mean_token_accuracy": 0.7816595435142517, | |
| "num_tokens": 12255780.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5238984450697899, | |
| "epoch": 2.811214953271028, | |
| "grad_norm": 0.03843814134597778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5195130109786987, | |
| "mean_token_accuracy": 0.7881060838699341, | |
| "num_tokens": 12272157.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.5336880385875702, | |
| "epoch": 2.8149532710280374, | |
| "grad_norm": 0.039413440972566605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531658411026001, | |
| "mean_token_accuracy": 0.7836297303438187, | |
| "num_tokens": 12288500.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5406560152769089, | |
| "epoch": 2.8186915887850468, | |
| "grad_norm": 0.044693466275930405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541545033454895, | |
| "mean_token_accuracy": 0.7807977646589279, | |
| "num_tokens": 12304864.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.538055032491684, | |
| "epoch": 2.822429906542056, | |
| "grad_norm": 0.03888081759214401, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337695479393005, | |
| "mean_token_accuracy": 0.7844773530960083, | |
| "num_tokens": 12321170.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.527722030878067, | |
| "epoch": 2.8261682242990656, | |
| "grad_norm": 0.04188257455825806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265190005302429, | |
| "mean_token_accuracy": 0.7878826707601547, | |
| "num_tokens": 12337523.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5507965534925461, | |
| "epoch": 2.829906542056075, | |
| "grad_norm": 0.03817446902394295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500692129135132, | |
| "mean_token_accuracy": 0.7806660830974579, | |
| "num_tokens": 12354118.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5407035946846008, | |
| "epoch": 2.8336448598130843, | |
| "grad_norm": 0.042875856161117554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405147671699524, | |
| "mean_token_accuracy": 0.7810708433389664, | |
| "num_tokens": 12370434.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5315204411745071, | |
| "epoch": 2.8373831775700937, | |
| "grad_norm": 0.042397141456604004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538346529006958, | |
| "mean_token_accuracy": 0.7821339964866638, | |
| "num_tokens": 12386428.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5520299524068832, | |
| "epoch": 2.8411214953271027, | |
| "grad_norm": 0.04137783497571945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512533187866211, | |
| "mean_token_accuracy": 0.7781175673007965, | |
| "num_tokens": 12402867.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5510706156492233, | |
| "epoch": 2.844859813084112, | |
| "grad_norm": 0.04001981019973755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554083585739136, | |
| "mean_token_accuracy": 0.7719452530145645, | |
| "num_tokens": 12419054.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5559884458780289, | |
| "epoch": 2.8485981308411215, | |
| "grad_norm": 0.035403911024332047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523775815963745, | |
| "mean_token_accuracy": 0.7766276150941849, | |
| "num_tokens": 12435351.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.5434874594211578, | |
| "epoch": 2.852336448598131, | |
| "grad_norm": 0.03929636627435684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537907063961029, | |
| "mean_token_accuracy": 0.7796172052621841, | |
| "num_tokens": 12451647.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5497813075780869, | |
| "epoch": 2.8560747663551402, | |
| "grad_norm": 0.03768793120980263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450780391693115, | |
| "mean_token_accuracy": 0.7810264527797699, | |
| "num_tokens": 12468063.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5202910378575325, | |
| "epoch": 2.8598130841121496, | |
| "grad_norm": 0.03793422132730484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5197356343269348, | |
| "mean_token_accuracy": 0.7887470573186874, | |
| "num_tokens": 12484329.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5339359492063522, | |
| "epoch": 2.863551401869159, | |
| "grad_norm": 0.04222627729177475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416290760040283, | |
| "mean_token_accuracy": 0.7798094302415848, | |
| "num_tokens": 12500522.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5492495894432068, | |
| "epoch": 2.867289719626168, | |
| "grad_norm": 0.043936122208833694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556658148765564, | |
| "mean_token_accuracy": 0.7760462909936905, | |
| "num_tokens": 12516877.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.534624308347702, | |
| "epoch": 2.8710280373831774, | |
| "grad_norm": 0.042372506111860275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317083597183228, | |
| "mean_token_accuracy": 0.7851851731538773, | |
| "num_tokens": 12533180.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5446592271327972, | |
| "epoch": 2.8747663551401867, | |
| "grad_norm": 0.037292055785655975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379966497421265, | |
| "mean_token_accuracy": 0.7800319492816925, | |
| "num_tokens": 12549532.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5482804775238037, | |
| "epoch": 2.878504672897196, | |
| "grad_norm": 0.038804132491350174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504724383354187, | |
| "mean_token_accuracy": 0.7738227695226669, | |
| "num_tokens": 12565943.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5368440747261047, | |
| "epoch": 2.8822429906542055, | |
| "grad_norm": 0.04019741341471672, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410951375961304, | |
| "mean_token_accuracy": 0.7783905565738678, | |
| "num_tokens": 12582258.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5336288064718246, | |
| "epoch": 2.885981308411215, | |
| "grad_norm": 0.034321509301662445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328375101089478, | |
| "mean_token_accuracy": 0.784157395362854, | |
| "num_tokens": 12598555.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5653717815876007, | |
| "epoch": 2.8897196261682243, | |
| "grad_norm": 0.03593064844608307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628952383995056, | |
| "mean_token_accuracy": 0.7731250822544098, | |
| "num_tokens": 12614684.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5388960689306259, | |
| "epoch": 2.8934579439252337, | |
| "grad_norm": 0.03794105350971222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317496061325073, | |
| "mean_token_accuracy": 0.7814508825540543, | |
| "num_tokens": 12631301.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5498441606760025, | |
| "epoch": 2.897196261682243, | |
| "grad_norm": 0.03615562617778778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489410161972046, | |
| "mean_token_accuracy": 0.7768700569868088, | |
| "num_tokens": 12647948.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5340896248817444, | |
| "epoch": 2.9009345794392525, | |
| "grad_norm": 0.038868315517902374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335500836372375, | |
| "mean_token_accuracy": 0.7818741798400879, | |
| "num_tokens": 12664189.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5473947077989578, | |
| "epoch": 2.904672897196262, | |
| "grad_norm": 0.04030415788292885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547685980796814, | |
| "mean_token_accuracy": 0.7762889117002487, | |
| "num_tokens": 12680521.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5354717969894409, | |
| "epoch": 2.9084112149532713, | |
| "grad_norm": 0.03963444381952286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363295078277588, | |
| "mean_token_accuracy": 0.7828177064657211, | |
| "num_tokens": 12696847.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.5292405933141708, | |
| "epoch": 2.91214953271028, | |
| "grad_norm": 0.044744838029146194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327066779136658, | |
| "mean_token_accuracy": 0.7849072515964508, | |
| "num_tokens": 12713036.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.52642522752285, | |
| "epoch": 2.9158878504672896, | |
| "grad_norm": 0.04283163696527481, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329762697219849, | |
| "mean_token_accuracy": 0.7837288975715637, | |
| "num_tokens": 12729209.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.527685210108757, | |
| "epoch": 2.919626168224299, | |
| "grad_norm": 0.041390661150217056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320221185684204, | |
| "mean_token_accuracy": 0.783889576792717, | |
| "num_tokens": 12745655.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5404015928506851, | |
| "epoch": 2.9233644859813084, | |
| "grad_norm": 0.040262214839458466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304533243179321, | |
| "mean_token_accuracy": 0.7833625972270966, | |
| "num_tokens": 12762029.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5551902800798416, | |
| "epoch": 2.9271028037383178, | |
| "grad_norm": 0.0381385013461113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540827512741089, | |
| "mean_token_accuracy": 0.774557501077652, | |
| "num_tokens": 12778129.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5423577576875687, | |
| "epoch": 2.930841121495327, | |
| "grad_norm": 0.04024689272046089, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434139370918274, | |
| "mean_token_accuracy": 0.7793742418289185, | |
| "num_tokens": 12794167.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5381026417016983, | |
| "epoch": 2.9345794392523366, | |
| "grad_norm": 0.03909367695450783, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540184736251831, | |
| "mean_token_accuracy": 0.7813534885644913, | |
| "num_tokens": 12810454.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5301714539527893, | |
| "epoch": 2.938317757009346, | |
| "grad_norm": 0.039717331528663635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.528195858001709, | |
| "mean_token_accuracy": 0.7839880138635635, | |
| "num_tokens": 12826792.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5483011454343796, | |
| "epoch": 2.942056074766355, | |
| "grad_norm": 0.04299187660217285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469069480895996, | |
| "mean_token_accuracy": 0.7784111201763153, | |
| "num_tokens": 12843156.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5493280291557312, | |
| "epoch": 2.9457943925233643, | |
| "grad_norm": 0.03909771516919136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475714206695557, | |
| "mean_token_accuracy": 0.7802032381296158, | |
| "num_tokens": 12859513.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.545919269323349, | |
| "epoch": 2.9495327102803737, | |
| "grad_norm": 0.03977775201201439, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396496057510376, | |
| "mean_token_accuracy": 0.7824081033468246, | |
| "num_tokens": 12875944.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5471485257148743, | |
| "epoch": 2.953271028037383, | |
| "grad_norm": 0.04360375925898552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546139657497406, | |
| "mean_token_accuracy": 0.7795716971158981, | |
| "num_tokens": 12892408.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5483593940734863, | |
| "epoch": 2.9570093457943925, | |
| "grad_norm": 0.03873739019036293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458930134773254, | |
| "mean_token_accuracy": 0.7784797698259354, | |
| "num_tokens": 12908878.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5327412039041519, | |
| "epoch": 2.960747663551402, | |
| "grad_norm": 0.04030138626694679, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531423032283783, | |
| "mean_token_accuracy": 0.7864594012498856, | |
| "num_tokens": 12925328.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.5355861634016037, | |
| "epoch": 2.9644859813084112, | |
| "grad_norm": 0.03622936084866524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347930192947388, | |
| "mean_token_accuracy": 0.7837072014808655, | |
| "num_tokens": 12941525.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.5421173870563507, | |
| "epoch": 2.9682242990654206, | |
| "grad_norm": 0.04139631241559982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441262125968933, | |
| "mean_token_accuracy": 0.7780770361423492, | |
| "num_tokens": 12957883.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5358422696590424, | |
| "epoch": 2.97196261682243, | |
| "grad_norm": 0.04235566407442093, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453042984008789, | |
| "mean_token_accuracy": 0.780327558517456, | |
| "num_tokens": 12974226.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.5261758118867874, | |
| "epoch": 2.9757009345794394, | |
| "grad_norm": 0.038478292524814606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281113386154175, | |
| "mean_token_accuracy": 0.7872153073549271, | |
| "num_tokens": 12990610.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.555643692612648, | |
| "epoch": 2.979439252336449, | |
| "grad_norm": 0.03554081916809082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489306449890137, | |
| "mean_token_accuracy": 0.7791497707366943, | |
| "num_tokens": 13007012.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5474710315465927, | |
| "epoch": 2.983177570093458, | |
| "grad_norm": 0.04082915186882019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414685606956482, | |
| "mean_token_accuracy": 0.7802593261003494, | |
| "num_tokens": 13023273.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.551795169711113, | |
| "epoch": 2.986915887850467, | |
| "grad_norm": 0.03786645457148552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478507280349731, | |
| "mean_token_accuracy": 0.7769146114587784, | |
| "num_tokens": 13039409.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5366168767213821, | |
| "epoch": 2.9906542056074765, | |
| "grad_norm": 0.04365032911300659, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442554354667664, | |
| "mean_token_accuracy": 0.7847046703100204, | |
| "num_tokens": 13055837.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.528346061706543, | |
| "epoch": 2.994392523364486, | |
| "grad_norm": 0.05227791890501976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428685545921326, | |
| "mean_token_accuracy": 0.7789010256528854, | |
| "num_tokens": 13072216.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.5396917909383774, | |
| "epoch": 2.9981308411214953, | |
| "grad_norm": 0.03931191936135292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454744696617126, | |
| "mean_token_accuracy": 0.7764900475740433, | |
| "num_tokens": 13088462.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.5376738607883453, | |
| "epoch": 3.0, | |
| "grad_norm": 0.04954347386956215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307910442352295, | |
| "mean_token_accuracy": 0.7855222225189209, | |
| "num_tokens": 13096612.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2194419027224822e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
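The file above is the `trainer_state.json` that the `transformers` `Trainer` writes next to each checkpoint: 804 optimizer steps over 3 epochs at a constant learning rate of 2e-4, ending at a training loss of about 0.53 and a mean token accuracy of about 0.79. Below is a minimal sketch for inspecting the log history, assuming the file sits at the repo root (it may instead live inside a checkpoint subfolder) and that `matplotlib` is installed; the filtering on the `"loss"` key is a defensive assumption, since trainers can also append eval or summary records without it.

```python
import json

import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download

# Assumption: trainer_state.json is at the repo root; adjust `filename`
# if it lives under a checkpoint-NNN/ subfolder instead.
path = hf_hub_download(
    repo_id="eac123/clean-subliminal-learning-wolves",
    filename="trainer_state.json",
)

with open(path) as f:
    state = json.load(f)

# Keep only entries that carry a training loss.
history = [e for e in state["log_history"] if "loss" in e]

last = history[-1]
print(f"step {last['step']}: loss={last['loss']:.4f}, "
      f"mean token accuracy={last['mean_token_accuracy']:.4f}")

# Plot the loss curve; logging_steps is 1, so every step is logged.
plt.plot([e["step"] for e in history], [e["loss"] for e in history])
plt.xlabel("step")
plt.ylabel("training loss")
plt.show()
```

Because `logging_steps` is 1, `log_history` contains one record per optimizer step, so the plot covers the full run from step 1 to `max_steps` (804).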