Instructions to use eac123/sublim-phase4-combo-01 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/sublim-phase4-combo-01 with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/sublim-phase4-combo-01")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 804, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1308949291706085, | |
| "epoch": 0.0037313432835820895, | |
| "grad_norm": 1.683108925819397, | |
| "learning_rate": 0.0002, | |
| "loss": 2.489936590194702, | |
| "mean_token_accuracy": 0.5359140038490295, | |
| "num_tokens": 16356.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2256053388118744, | |
| "epoch": 0.007462686567164179, | |
| "grad_norm": 1.5088376998901367, | |
| "learning_rate": 0.0002, | |
| "loss": 2.162245273590088, | |
| "mean_token_accuracy": 0.5673863738775253, | |
| "num_tokens": 32718.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.4011717438697815, | |
| "epoch": 0.011194029850746268, | |
| "grad_norm": 1.1495057344436646, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7410045862197876, | |
| "mean_token_accuracy": 0.5877877026796341, | |
| "num_tokens": 49086.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.3629191517829895, | |
| "epoch": 0.014925373134328358, | |
| "grad_norm": 0.909584105014801, | |
| "learning_rate": 0.0002, | |
| "loss": 1.410053014755249, | |
| "mean_token_accuracy": 0.6416480243206024, | |
| "num_tokens": 65483.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.345184564590454, | |
| "epoch": 0.018656716417910446, | |
| "grad_norm": 1.1788593530654907, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2843377590179443, | |
| "mean_token_accuracy": 0.6425914913415909, | |
| "num_tokens": 81705.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.2523848712444305, | |
| "epoch": 0.022388059701492536, | |
| "grad_norm": 0.7064197659492493, | |
| "learning_rate": 0.0002, | |
| "loss": 1.175342082977295, | |
| "mean_token_accuracy": 0.6635853946208954, | |
| "num_tokens": 97918.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.199697583913803, | |
| "epoch": 0.026119402985074626, | |
| "grad_norm": 0.4158240854740143, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1010812520980835, | |
| "mean_token_accuracy": 0.6607878506183624, | |
| "num_tokens": 114455.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.0897426307201385, | |
| "epoch": 0.029850746268656716, | |
| "grad_norm": 0.4258277118206024, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0245436429977417, | |
| "mean_token_accuracy": 0.682918444275856, | |
| "num_tokens": 130921.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 0.9851540327072144, | |
| "epoch": 0.033582089552238806, | |
| "grad_norm": 0.6931905150413513, | |
| "learning_rate": 0.0002, | |
| "loss": 0.972236692905426, | |
| "mean_token_accuracy": 0.690200999379158, | |
| "num_tokens": 147028.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9809075742959976, | |
| "epoch": 0.03731343283582089, | |
| "grad_norm": 0.4386370778083801, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9174745082855225, | |
| "mean_token_accuracy": 0.6927480399608612, | |
| "num_tokens": 163432.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.911684438586235, | |
| "epoch": 0.041044776119402986, | |
| "grad_norm": 4.369440078735352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8261430263519287, | |
| "mean_token_accuracy": 0.7205553501844406, | |
| "num_tokens": 179455.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.8916845321655273, | |
| "epoch": 0.04477611940298507, | |
| "grad_norm": 0.5139093399047852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8168894648551941, | |
| "mean_token_accuracy": 0.714234933257103, | |
| "num_tokens": 195668.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.8192363679409027, | |
| "epoch": 0.048507462686567165, | |
| "grad_norm": 0.5154215097427368, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7735035419464111, | |
| "mean_token_accuracy": 0.7252469956874847, | |
| "num_tokens": 211417.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.8060386925935745, | |
| "epoch": 0.05223880597014925, | |
| "grad_norm": 0.3869208097457886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7496379017829895, | |
| "mean_token_accuracy": 0.7249694466590881, | |
| "num_tokens": 228014.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7358367741107941, | |
| "epoch": 0.055970149253731345, | |
| "grad_norm": 0.3804072439670563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7129448652267456, | |
| "mean_token_accuracy": 0.7322827130556107, | |
| "num_tokens": 244548.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.6891884654760361, | |
| "epoch": 0.05970149253731343, | |
| "grad_norm": 0.4262757897377014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7087160348892212, | |
| "mean_token_accuracy": 0.7325101941823959, | |
| "num_tokens": 260927.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.6646793335676193, | |
| "epoch": 0.06343283582089553, | |
| "grad_norm": 0.3463515639305115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6711890697479248, | |
| "mean_token_accuracy": 0.743767574429512, | |
| "num_tokens": 277478.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6615253239870071, | |
| "epoch": 0.06716417910447761, | |
| "grad_norm": 0.3623281419277191, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6425697803497314, | |
| "mean_token_accuracy": 0.7528071999549866, | |
| "num_tokens": 293828.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6510400027036667, | |
| "epoch": 0.0708955223880597, | |
| "grad_norm": 0.3351263701915741, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6357494592666626, | |
| "mean_token_accuracy": 0.7543895989656448, | |
| "num_tokens": 309962.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.6420271843671799, | |
| "epoch": 0.07462686567164178, | |
| "grad_norm": 0.3311758041381836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6307370662689209, | |
| "mean_token_accuracy": 0.7545324862003326, | |
| "num_tokens": 326597.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6174459308385849, | |
| "epoch": 0.07835820895522388, | |
| "grad_norm": 0.35250842571258545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6103197336196899, | |
| "mean_token_accuracy": 0.7592763751745224, | |
| "num_tokens": 342917.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.6289893835783005, | |
| "epoch": 0.08208955223880597, | |
| "grad_norm": 0.25894996523857117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6157230734825134, | |
| "mean_token_accuracy": 0.7587940841913223, | |
| "num_tokens": 359567.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.6118573248386383, | |
| "epoch": 0.08582089552238806, | |
| "grad_norm": 0.29135045409202576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6002258658409119, | |
| "mean_token_accuracy": 0.7654120922088623, | |
| "num_tokens": 375565.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.5791880339384079, | |
| "epoch": 0.08955223880597014, | |
| "grad_norm": 0.2720821499824524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5813120603561401, | |
| "mean_token_accuracy": 0.7713776230812073, | |
| "num_tokens": 391864.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.6053604930639267, | |
| "epoch": 0.09328358208955224, | |
| "grad_norm": 0.2560279667377472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6105175018310547, | |
| "mean_token_accuracy": 0.7615619450807571, | |
| "num_tokens": 408354.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.5867195874452591, | |
| "epoch": 0.09701492537313433, | |
| "grad_norm": 0.22600652277469635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5860370993614197, | |
| "mean_token_accuracy": 0.7677419036626816, | |
| "num_tokens": 424712.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.5918123573064804, | |
| "epoch": 0.10074626865671642, | |
| "grad_norm": 0.256405770778656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5865331888198853, | |
| "mean_token_accuracy": 0.7698597609996796, | |
| "num_tokens": 441249.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.5696172267198563, | |
| "epoch": 0.1044776119402985, | |
| "grad_norm": 0.22032174468040466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604762434959412, | |
| "mean_token_accuracy": 0.7779532968997955, | |
| "num_tokens": 457602.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.5602490454912186, | |
| "epoch": 0.10820895522388059, | |
| "grad_norm": 0.20871949195861816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587727427482605, | |
| "mean_token_accuracy": 0.7771614342927933, | |
| "num_tokens": 473785.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.5850763767957687, | |
| "epoch": 0.11194029850746269, | |
| "grad_norm": 0.23072806000709534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5962345004081726, | |
| "mean_token_accuracy": 0.762176513671875, | |
| "num_tokens": 490054.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5698783695697784, | |
| "epoch": 0.11567164179104478, | |
| "grad_norm": 0.20846784114837646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5793903470039368, | |
| "mean_token_accuracy": 0.7701146155595779, | |
| "num_tokens": 506525.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.5649833828210831, | |
| "epoch": 0.11940298507462686, | |
| "grad_norm": 0.20395582914352417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5709314942359924, | |
| "mean_token_accuracy": 0.7762356698513031, | |
| "num_tokens": 522952.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5790712088346481, | |
| "epoch": 0.12313432835820895, | |
| "grad_norm": 0.21085898578166962, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5755910873413086, | |
| "mean_token_accuracy": 0.7691536694765091, | |
| "num_tokens": 539151.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5798842161893845, | |
| "epoch": 0.12686567164179105, | |
| "grad_norm": 0.1799822747707367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5749096274375916, | |
| "mean_token_accuracy": 0.7671291828155518, | |
| "num_tokens": 555566.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.568429708480835, | |
| "epoch": 0.13059701492537312, | |
| "grad_norm": 0.21928845345973969, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5717220306396484, | |
| "mean_token_accuracy": 0.771720290184021, | |
| "num_tokens": 572125.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.5658127665519714, | |
| "epoch": 0.13432835820895522, | |
| "grad_norm": 0.22536930441856384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5656446218490601, | |
| "mean_token_accuracy": 0.7756934762001038, | |
| "num_tokens": 588539.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5779189765453339, | |
| "epoch": 0.13805970149253732, | |
| "grad_norm": 0.18143770098686218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5782102942466736, | |
| "mean_token_accuracy": 0.768736332654953, | |
| "num_tokens": 604927.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5695452243089676, | |
| "epoch": 0.1417910447761194, | |
| "grad_norm": 0.18897166848182678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5745816230773926, | |
| "mean_token_accuracy": 0.7676017582416534, | |
| "num_tokens": 621213.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5704480558633804, | |
| "epoch": 0.1455223880597015, | |
| "grad_norm": 0.20254790782928467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.573440432548523, | |
| "mean_token_accuracy": 0.769940122961998, | |
| "num_tokens": 637694.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5526881515979767, | |
| "epoch": 0.14925373134328357, | |
| "grad_norm": 0.2001330703496933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598679780960083, | |
| "mean_token_accuracy": 0.7767495959997177, | |
| "num_tokens": 653791.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.572973906993866, | |
| "epoch": 0.15298507462686567, | |
| "grad_norm": 0.1802511364221573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5720363855361938, | |
| "mean_token_accuracy": 0.7737791240215302, | |
| "num_tokens": 669970.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5880100876092911, | |
| "epoch": 0.15671641791044777, | |
| "grad_norm": 0.190653994679451, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5839952826499939, | |
| "mean_token_accuracy": 0.7667653411626816, | |
| "num_tokens": 686164.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5611717849969864, | |
| "epoch": 0.16044776119402984, | |
| "grad_norm": 0.18095986545085907, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529768466949463, | |
| "mean_token_accuracy": 0.7791769355535507, | |
| "num_tokens": 702271.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5776362270116806, | |
| "epoch": 0.16417910447761194, | |
| "grad_norm": 0.20184266567230225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.572957456111908, | |
| "mean_token_accuracy": 0.772771418094635, | |
| "num_tokens": 718759.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5637746602296829, | |
| "epoch": 0.16791044776119404, | |
| "grad_norm": 0.16902145743370056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564084529876709, | |
| "mean_token_accuracy": 0.7736680209636688, | |
| "num_tokens": 735087.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5521982908248901, | |
| "epoch": 0.17164179104477612, | |
| "grad_norm": 0.16458934545516968, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5616670846939087, | |
| "mean_token_accuracy": 0.7762537449598312, | |
| "num_tokens": 751513.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5518182516098022, | |
| "epoch": 0.17537313432835822, | |
| "grad_norm": 0.22303543984889984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5712406039237976, | |
| "mean_token_accuracy": 0.7692597359418869, | |
| "num_tokens": 767651.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5570991486310959, | |
| "epoch": 0.1791044776119403, | |
| "grad_norm": 0.1629144549369812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5624895095825195, | |
| "mean_token_accuracy": 0.7735912799835205, | |
| "num_tokens": 783757.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.549803838133812, | |
| "epoch": 0.1828358208955224, | |
| "grad_norm": 0.1366954892873764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442911982536316, | |
| "mean_token_accuracy": 0.7778248488903046, | |
| "num_tokens": 800127.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5679125189781189, | |
| "epoch": 0.1865671641791045, | |
| "grad_norm": 0.1564488559961319, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563010573387146, | |
| "mean_token_accuracy": 0.7781310826539993, | |
| "num_tokens": 816490.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5595380216836929, | |
| "epoch": 0.19029850746268656, | |
| "grad_norm": 0.1663539558649063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474997758865356, | |
| "mean_token_accuracy": 0.778365820646286, | |
| "num_tokens": 832576.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5542885512113571, | |
| "epoch": 0.19402985074626866, | |
| "grad_norm": 0.15933850407600403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465819239616394, | |
| "mean_token_accuracy": 0.781011700630188, | |
| "num_tokens": 848529.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.570631816983223, | |
| "epoch": 0.19776119402985073, | |
| "grad_norm": 0.15335530042648315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5733448266983032, | |
| "mean_token_accuracy": 0.7690710127353668, | |
| "num_tokens": 864787.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.5657172054052353, | |
| "epoch": 0.20149253731343283, | |
| "grad_norm": 0.15320488810539246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5716187357902527, | |
| "mean_token_accuracy": 0.7727480232715607, | |
| "num_tokens": 881120.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5566735565662384, | |
| "epoch": 0.20522388059701493, | |
| "grad_norm": 0.174886554479599, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643004775047302, | |
| "mean_token_accuracy": 0.7743579894304276, | |
| "num_tokens": 897598.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5483224838972092, | |
| "epoch": 0.208955223880597, | |
| "grad_norm": 0.14539019763469696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542981028556824, | |
| "mean_token_accuracy": 0.7777313590049744, | |
| "num_tokens": 913970.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5746322274208069, | |
| "epoch": 0.2126865671641791, | |
| "grad_norm": 0.1465657502412796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5676500201225281, | |
| "mean_token_accuracy": 0.7716732025146484, | |
| "num_tokens": 930515.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.5645405799150467, | |
| "epoch": 0.21641791044776118, | |
| "grad_norm": 0.17157647013664246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554180383682251, | |
| "mean_token_accuracy": 0.7776309847831726, | |
| "num_tokens": 946699.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5437158495187759, | |
| "epoch": 0.22014925373134328, | |
| "grad_norm": 0.14779002964496613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412948131561279, | |
| "mean_token_accuracy": 0.7830284535884857, | |
| "num_tokens": 962929.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5478496849536896, | |
| "epoch": 0.22388059701492538, | |
| "grad_norm": 0.16550469398498535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546680212020874, | |
| "mean_token_accuracy": 0.7801186293363571, | |
| "num_tokens": 979336.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5491016507148743, | |
| "epoch": 0.22761194029850745, | |
| "grad_norm": 0.17403647303581238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650719404220581, | |
| "mean_token_accuracy": 0.7729975134134293, | |
| "num_tokens": 995774.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5622769743204117, | |
| "epoch": 0.23134328358208955, | |
| "grad_norm": 0.17750802636146545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5718308687210083, | |
| "mean_token_accuracy": 0.7699476927518845, | |
| "num_tokens": 1012510.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5333654135465622, | |
| "epoch": 0.23507462686567165, | |
| "grad_norm": 0.13930155336856842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345954895019531, | |
| "mean_token_accuracy": 0.7855408787727356, | |
| "num_tokens": 1028613.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5784197896718979, | |
| "epoch": 0.23880597014925373, | |
| "grad_norm": 0.16901279985904694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56936115026474, | |
| "mean_token_accuracy": 0.7703966796398163, | |
| "num_tokens": 1045046.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5690423101186752, | |
| "epoch": 0.24253731343283583, | |
| "grad_norm": 0.16224578022956848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559661865234375, | |
| "mean_token_accuracy": 0.7719420939683914, | |
| "num_tokens": 1061419.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5822959691286087, | |
| "epoch": 0.2462686567164179, | |
| "grad_norm": 0.16501320898532867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5733515620231628, | |
| "mean_token_accuracy": 0.7682919055223465, | |
| "num_tokens": 1077724.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5663120746612549, | |
| "epoch": 0.25, | |
| "grad_norm": 0.15710598230361938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5739370584487915, | |
| "mean_token_accuracy": 0.7685963213443756, | |
| "num_tokens": 1094309.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5416915565729141, | |
| "epoch": 0.2537313432835821, | |
| "grad_norm": 0.1652906835079193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546884536743164, | |
| "mean_token_accuracy": 0.7781604677438736, | |
| "num_tokens": 1110812.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5604560673236847, | |
| "epoch": 0.2574626865671642, | |
| "grad_norm": 0.1823517084121704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.565848708152771, | |
| "mean_token_accuracy": 0.7732205092906952, | |
| "num_tokens": 1126983.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5681725591421127, | |
| "epoch": 0.26119402985074625, | |
| "grad_norm": 0.15536344051361084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5707790851593018, | |
| "mean_token_accuracy": 0.7711602002382278, | |
| "num_tokens": 1143690.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5554168075323105, | |
| "epoch": 0.26492537313432835, | |
| "grad_norm": 0.1691257208585739, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645061135292053, | |
| "mean_token_accuracy": 0.7751206457614899, | |
| "num_tokens": 1159930.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5698556303977966, | |
| "epoch": 0.26865671641791045, | |
| "grad_norm": 0.17756199836730957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5670963525772095, | |
| "mean_token_accuracy": 0.7744691073894501, | |
| "num_tokens": 1176287.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.558213621377945, | |
| "epoch": 0.27238805970149255, | |
| "grad_norm": 0.14214132726192474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565056204795837, | |
| "mean_token_accuracy": 0.7759946286678314, | |
| "num_tokens": 1192733.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5587260574102402, | |
| "epoch": 0.27611940298507465, | |
| "grad_norm": 0.1475045531988144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534224510192871, | |
| "mean_token_accuracy": 0.7787353843450546, | |
| "num_tokens": 1209413.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5601568818092346, | |
| "epoch": 0.2798507462686567, | |
| "grad_norm": 0.17161411046981812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623729825019836, | |
| "mean_token_accuracy": 0.773567259311676, | |
| "num_tokens": 1225838.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.5421780049800873, | |
| "epoch": 0.2835820895522388, | |
| "grad_norm": 0.1444474756717682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297126173973083, | |
| "mean_token_accuracy": 0.7893946915864944, | |
| "num_tokens": 1242213.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5718793421983719, | |
| "epoch": 0.2873134328358209, | |
| "grad_norm": 0.14322321116924286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5714331865310669, | |
| "mean_token_accuracy": 0.7688785791397095, | |
| "num_tokens": 1258461.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5419993549585342, | |
| "epoch": 0.291044776119403, | |
| "grad_norm": 0.1524474024772644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490943193435669, | |
| "mean_token_accuracy": 0.779272273182869, | |
| "num_tokens": 1274449.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.5585939884185791, | |
| "epoch": 0.2947761194029851, | |
| "grad_norm": 0.1510787457227707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5654528141021729, | |
| "mean_token_accuracy": 0.772942066192627, | |
| "num_tokens": 1290949.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.563146710395813, | |
| "epoch": 0.29850746268656714, | |
| "grad_norm": 0.1482156217098236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5777900218963623, | |
| "mean_token_accuracy": 0.7702645510435104, | |
| "num_tokens": 1307187.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5600180923938751, | |
| "epoch": 0.30223880597014924, | |
| "grad_norm": 0.15022550523281097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632287859916687, | |
| "mean_token_accuracy": 0.7716066837310791, | |
| "num_tokens": 1323407.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5598095804452896, | |
| "epoch": 0.30597014925373134, | |
| "grad_norm": 0.1322828084230423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537106394767761, | |
| "mean_token_accuracy": 0.7764421850442886, | |
| "num_tokens": 1339664.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5458928942680359, | |
| "epoch": 0.30970149253731344, | |
| "grad_norm": 0.1319894790649414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423555374145508, | |
| "mean_token_accuracy": 0.7807362526655197, | |
| "num_tokens": 1356260.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5659633129835129, | |
| "epoch": 0.31343283582089554, | |
| "grad_norm": 0.13246627151966095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557287335395813, | |
| "mean_token_accuracy": 0.7743117958307266, | |
| "num_tokens": 1372821.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5452462434768677, | |
| "epoch": 0.31716417910447764, | |
| "grad_norm": 0.16196919977664948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543107271194458, | |
| "mean_token_accuracy": 0.7795177549123764, | |
| "num_tokens": 1388889.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5466109812259674, | |
| "epoch": 0.3208955223880597, | |
| "grad_norm": 0.12639470398426056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396162271499634, | |
| "mean_token_accuracy": 0.7834953665733337, | |
| "num_tokens": 1405139.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.551815465092659, | |
| "epoch": 0.3246268656716418, | |
| "grad_norm": 0.18058188259601593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637637972831726, | |
| "mean_token_accuracy": 0.7716487348079681, | |
| "num_tokens": 1421439.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.543148547410965, | |
| "epoch": 0.3283582089552239, | |
| "grad_norm": 0.14002034068107605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549104630947113, | |
| "mean_token_accuracy": 0.7779115587472916, | |
| "num_tokens": 1437695.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5655066221952438, | |
| "epoch": 0.332089552238806, | |
| "grad_norm": 0.13395759463310242, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5683454871177673, | |
| "mean_token_accuracy": 0.7728030234575272, | |
| "num_tokens": 1453991.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5676597952842712, | |
| "epoch": 0.3358208955223881, | |
| "grad_norm": 0.14229720830917358, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5701878070831299, | |
| "mean_token_accuracy": 0.7698987573385239, | |
| "num_tokens": 1470371.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5576249063014984, | |
| "epoch": 0.33955223880597013, | |
| "grad_norm": 0.1365518420934677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560733437538147, | |
| "mean_token_accuracy": 0.7742054760456085, | |
| "num_tokens": 1486891.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5476901531219482, | |
| "epoch": 0.34328358208955223, | |
| "grad_norm": 0.12286433577537537, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540446639060974, | |
| "mean_token_accuracy": 0.7757776081562042, | |
| "num_tokens": 1503153.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5445209294557571, | |
| "epoch": 0.34701492537313433, | |
| "grad_norm": 0.13203619420528412, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416238903999329, | |
| "mean_token_accuracy": 0.7820428013801575, | |
| "num_tokens": 1519248.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5732006430625916, | |
| "epoch": 0.35074626865671643, | |
| "grad_norm": 0.14288392663002014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5734184980392456, | |
| "mean_token_accuracy": 0.7677003741264343, | |
| "num_tokens": 1535616.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5645585656166077, | |
| "epoch": 0.35447761194029853, | |
| "grad_norm": 0.1253618448972702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549549460411072, | |
| "mean_token_accuracy": 0.7756840586662292, | |
| "num_tokens": 1552040.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5686955749988556, | |
| "epoch": 0.3582089552238806, | |
| "grad_norm": 0.12725889682769775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.573272705078125, | |
| "mean_token_accuracy": 0.7684734165668488, | |
| "num_tokens": 1568381.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.547907680273056, | |
| "epoch": 0.3619402985074627, | |
| "grad_norm": 0.13573119044303894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526182055473328, | |
| "mean_token_accuracy": 0.7779877185821533, | |
| "num_tokens": 1584726.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5658805668354034, | |
| "epoch": 0.3656716417910448, | |
| "grad_norm": 0.13501696288585663, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5696231722831726, | |
| "mean_token_accuracy": 0.7706904113292694, | |
| "num_tokens": 1601142.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5553559362888336, | |
| "epoch": 0.3694029850746269, | |
| "grad_norm": 0.12036850303411484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520588159561157, | |
| "mean_token_accuracy": 0.7781549990177155, | |
| "num_tokens": 1617184.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5559379458427429, | |
| "epoch": 0.373134328358209, | |
| "grad_norm": 0.12556730210781097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582664608955383, | |
| "mean_token_accuracy": 0.7744826525449753, | |
| "num_tokens": 1633573.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5321817249059677, | |
| "epoch": 0.376865671641791, | |
| "grad_norm": 0.1410171091556549, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531158447265625, | |
| "mean_token_accuracy": 0.7867954224348068, | |
| "num_tokens": 1649580.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5629207491874695, | |
| "epoch": 0.3805970149253731, | |
| "grad_norm": 0.1320696920156479, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548203587532043, | |
| "mean_token_accuracy": 0.777129277586937, | |
| "num_tokens": 1665914.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5625062435865402, | |
| "epoch": 0.3843283582089552, | |
| "grad_norm": 0.15022383630275726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559231698513031, | |
| "mean_token_accuracy": 0.7755367606878281, | |
| "num_tokens": 1682572.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.55105359852314, | |
| "epoch": 0.3880597014925373, | |
| "grad_norm": 0.13816320896148682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513999462127686, | |
| "mean_token_accuracy": 0.7777303904294968, | |
| "num_tokens": 1698800.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5433051884174347, | |
| "epoch": 0.3917910447761194, | |
| "grad_norm": 0.13852182030677795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473951101303101, | |
| "mean_token_accuracy": 0.7787780612707138, | |
| "num_tokens": 1715089.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5638341754674911, | |
| "epoch": 0.39552238805970147, | |
| "grad_norm": 0.13244302570819855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5711042284965515, | |
| "mean_token_accuracy": 0.7705479264259338, | |
| "num_tokens": 1731289.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5590131878852844, | |
| "epoch": 0.39925373134328357, | |
| "grad_norm": 0.14187560975551605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588455200195312, | |
| "mean_token_accuracy": 0.775245189666748, | |
| "num_tokens": 1747777.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5456477552652359, | |
| "epoch": 0.40298507462686567, | |
| "grad_norm": 0.12155073136091232, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477449297904968, | |
| "mean_token_accuracy": 0.7793276309967041, | |
| "num_tokens": 1764099.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5533221960067749, | |
| "epoch": 0.40671641791044777, | |
| "grad_norm": 0.14932067692279816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550473153591156, | |
| "mean_token_accuracy": 0.7792102247476578, | |
| "num_tokens": 1780092.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.5685938596725464, | |
| "epoch": 0.41044776119402987, | |
| "grad_norm": 0.11824015527963638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.567302942276001, | |
| "mean_token_accuracy": 0.768885999917984, | |
| "num_tokens": 1796553.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.558070957660675, | |
| "epoch": 0.4141791044776119, | |
| "grad_norm": 0.13145862519741058, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594078302383423, | |
| "mean_token_accuracy": 0.7714920043945312, | |
| "num_tokens": 1812976.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.5445801764726639, | |
| "epoch": 0.417910447761194, | |
| "grad_norm": 0.1538373976945877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507169365882874, | |
| "mean_token_accuracy": 0.7795748263597488, | |
| "num_tokens": 1829496.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5546134263277054, | |
| "epoch": 0.4216417910447761, | |
| "grad_norm": 0.14499837160110474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621107816696167, | |
| "mean_token_accuracy": 0.772913932800293, | |
| "num_tokens": 1845899.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5376207381486893, | |
| "epoch": 0.4253731343283582, | |
| "grad_norm": 0.12395139783620834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408076643943787, | |
| "mean_token_accuracy": 0.7826146930456161, | |
| "num_tokens": 1862102.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.5709025114774704, | |
| "epoch": 0.4291044776119403, | |
| "grad_norm": 0.14900445938110352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688319206237793, | |
| "mean_token_accuracy": 0.7712048441171646, | |
| "num_tokens": 1878466.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5531350374221802, | |
| "epoch": 0.43283582089552236, | |
| "grad_norm": 0.14944979548454285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533212423324585, | |
| "mean_token_accuracy": 0.7762057036161423, | |
| "num_tokens": 1894613.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5613852292299271, | |
| "epoch": 0.43656716417910446, | |
| "grad_norm": 0.14122174680233002, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625326633453369, | |
| "mean_token_accuracy": 0.7721518725156784, | |
| "num_tokens": 1910791.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5606949478387833, | |
| "epoch": 0.44029850746268656, | |
| "grad_norm": 0.11353051662445068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561124682426453, | |
| "mean_token_accuracy": 0.7774701118469238, | |
| "num_tokens": 1927342.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5748601853847504, | |
| "epoch": 0.44402985074626866, | |
| "grad_norm": 0.13328969478607178, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5738563537597656, | |
| "mean_token_accuracy": 0.7660426646471024, | |
| "num_tokens": 1944009.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5331175327301025, | |
| "epoch": 0.44776119402985076, | |
| "grad_norm": 0.14304570853710175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535332441329956, | |
| "mean_token_accuracy": 0.7843142002820969, | |
| "num_tokens": 1960275.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5579216629266739, | |
| "epoch": 0.45149253731343286, | |
| "grad_norm": 0.12545879185199738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590261220932007, | |
| "mean_token_accuracy": 0.7733252346515656, | |
| "num_tokens": 1976578.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5593921393156052, | |
| "epoch": 0.4552238805970149, | |
| "grad_norm": 0.13857485353946686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631604194641113, | |
| "mean_token_accuracy": 0.7736008018255234, | |
| "num_tokens": 1993053.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5660806745290756, | |
| "epoch": 0.458955223880597, | |
| "grad_norm": 0.11944495886564255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569764375686646, | |
| "mean_token_accuracy": 0.7737946212291718, | |
| "num_tokens": 2009442.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5681817382574081, | |
| "epoch": 0.4626865671641791, | |
| "grad_norm": 0.14172527194023132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605779886245728, | |
| "mean_token_accuracy": 0.7750114947557449, | |
| "num_tokens": 2025901.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5467974990606308, | |
| "epoch": 0.4664179104477612, | |
| "grad_norm": 0.1252705603837967, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515766739845276, | |
| "mean_token_accuracy": 0.7760580778121948, | |
| "num_tokens": 2042208.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5420515686273575, | |
| "epoch": 0.4701492537313433, | |
| "grad_norm": 0.13870663940906525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480060577392578, | |
| "mean_token_accuracy": 0.7764822095632553, | |
| "num_tokens": 2058681.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.5362897217273712, | |
| "epoch": 0.47388059701492535, | |
| "grad_norm": 0.13995425403118134, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513206720352173, | |
| "mean_token_accuracy": 0.7750497758388519, | |
| "num_tokens": 2075000.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.5329284965991974, | |
| "epoch": 0.47761194029850745, | |
| "grad_norm": 0.16524387896060944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436174273490906, | |
| "mean_token_accuracy": 0.7792856246232986, | |
| "num_tokens": 2091221.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5539916902780533, | |
| "epoch": 0.48134328358208955, | |
| "grad_norm": 0.12479358166456223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608515739440918, | |
| "mean_token_accuracy": 0.7734991759061813, | |
| "num_tokens": 2107664.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5594889521598816, | |
| "epoch": 0.48507462686567165, | |
| "grad_norm": 0.14481139183044434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508875846862793, | |
| "mean_token_accuracy": 0.7767421901226044, | |
| "num_tokens": 2123952.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.5442296341061592, | |
| "epoch": 0.48880597014925375, | |
| "grad_norm": 0.12281627953052521, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368722677230835, | |
| "mean_token_accuracy": 0.7826971709728241, | |
| "num_tokens": 2139985.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.562851145863533, | |
| "epoch": 0.4925373134328358, | |
| "grad_norm": 0.14453750848770142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439143180847168, | |
| "mean_token_accuracy": 0.7809209376573563, | |
| "num_tokens": 2156312.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5531761199235916, | |
| "epoch": 0.4962686567164179, | |
| "grad_norm": 0.13650745153427124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565841197967529, | |
| "mean_token_accuracy": 0.7758718878030777, | |
| "num_tokens": 2172756.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5456132292747498, | |
| "epoch": 0.5, | |
| "grad_norm": 0.13749481737613678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540860295295715, | |
| "mean_token_accuracy": 0.7755758464336395, | |
| "num_tokens": 2189086.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5647578835487366, | |
| "epoch": 0.503731343283582, | |
| "grad_norm": 0.145718514919281, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5744016766548157, | |
| "mean_token_accuracy": 0.7706383019685745, | |
| "num_tokens": 2205658.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5253579095005989, | |
| "epoch": 0.5074626865671642, | |
| "grad_norm": 0.1236543357372284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327446460723877, | |
| "mean_token_accuracy": 0.7834168970584869, | |
| "num_tokens": 2221900.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.5625722110271454, | |
| "epoch": 0.5111940298507462, | |
| "grad_norm": 0.1114581972360611, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5667597651481628, | |
| "mean_token_accuracy": 0.7699635177850723, | |
| "num_tokens": 2238309.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5476242303848267, | |
| "epoch": 0.5149253731343284, | |
| "grad_norm": 0.1360960304737091, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452396273612976, | |
| "mean_token_accuracy": 0.7796155512332916, | |
| "num_tokens": 2254713.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5573885440826416, | |
| "epoch": 0.5186567164179104, | |
| "grad_norm": 0.11950599402189255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531854629516602, | |
| "mean_token_accuracy": 0.7765035033226013, | |
| "num_tokens": 2271164.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5644345581531525, | |
| "epoch": 0.5223880597014925, | |
| "grad_norm": 0.11840134114027023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575224161148071, | |
| "mean_token_accuracy": 0.7718838900327682, | |
| "num_tokens": 2287762.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5466153174638748, | |
| "epoch": 0.5261194029850746, | |
| "grad_norm": 0.1688532829284668, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499178171157837, | |
| "mean_token_accuracy": 0.777469664812088, | |
| "num_tokens": 2304348.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5427221059799194, | |
| "epoch": 0.5298507462686567, | |
| "grad_norm": 0.14760567247867584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492222905158997, | |
| "mean_token_accuracy": 0.778323158621788, | |
| "num_tokens": 2320490.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5470593422651291, | |
| "epoch": 0.5335820895522388, | |
| "grad_norm": 0.19991202652454376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513626933097839, | |
| "mean_token_accuracy": 0.7774471044540405, | |
| "num_tokens": 2337221.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5426470190286636, | |
| "epoch": 0.5373134328358209, | |
| "grad_norm": 0.11571265757083893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405253767967224, | |
| "mean_token_accuracy": 0.7813504189252853, | |
| "num_tokens": 2353353.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.5667431056499481, | |
| "epoch": 0.5410447761194029, | |
| "grad_norm": 0.12742455303668976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593273043632507, | |
| "mean_token_accuracy": 0.7729441076517105, | |
| "num_tokens": 2369753.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5697275847196579, | |
| "epoch": 0.5447761194029851, | |
| "grad_norm": 0.1348797082901001, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5684511661529541, | |
| "mean_token_accuracy": 0.7724753767251968, | |
| "num_tokens": 2386156.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5411224067211151, | |
| "epoch": 0.5485074626865671, | |
| "grad_norm": 0.1279442012310028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420435667037964, | |
| "mean_token_accuracy": 0.782076433300972, | |
| "num_tokens": 2402488.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5458887368440628, | |
| "epoch": 0.5522388059701493, | |
| "grad_norm": 0.15301373600959778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421918630599976, | |
| "mean_token_accuracy": 0.7805485129356384, | |
| "num_tokens": 2418800.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5494910031557083, | |
| "epoch": 0.5559701492537313, | |
| "grad_norm": 0.13024193048477173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560234189033508, | |
| "mean_token_accuracy": 0.7752619981765747, | |
| "num_tokens": 2435229.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5497897416353226, | |
| "epoch": 0.5597014925373134, | |
| "grad_norm": 0.140470951795578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513492226600647, | |
| "mean_token_accuracy": 0.775757297873497, | |
| "num_tokens": 2451762.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5479221642017365, | |
| "epoch": 0.5634328358208955, | |
| "grad_norm": 0.11884977668523788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478861331939697, | |
| "mean_token_accuracy": 0.782090038061142, | |
| "num_tokens": 2468180.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5405495166778564, | |
| "epoch": 0.5671641791044776, | |
| "grad_norm": 0.12883080542087555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406085252761841, | |
| "mean_token_accuracy": 0.7832252681255341, | |
| "num_tokens": 2484444.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5454452037811279, | |
| "epoch": 0.5708955223880597, | |
| "grad_norm": 0.12270363420248032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502068400382996, | |
| "mean_token_accuracy": 0.7790153920650482, | |
| "num_tokens": 2500846.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5570302158594131, | |
| "epoch": 0.5746268656716418, | |
| "grad_norm": 0.1269625872373581, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548018217086792, | |
| "mean_token_accuracy": 0.778030514717102, | |
| "num_tokens": 2517083.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5605379194021225, | |
| "epoch": 0.5783582089552238, | |
| "grad_norm": 0.1287340223789215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561842143535614, | |
| "mean_token_accuracy": 0.7721278667449951, | |
| "num_tokens": 2533804.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5481511801481247, | |
| "epoch": 0.582089552238806, | |
| "grad_norm": 0.13460931181907654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473400950431824, | |
| "mean_token_accuracy": 0.7798450142145157, | |
| "num_tokens": 2550301.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5569665729999542, | |
| "epoch": 0.585820895522388, | |
| "grad_norm": 0.1167525053024292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591033697128296, | |
| "mean_token_accuracy": 0.7743667513132095, | |
| "num_tokens": 2566630.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5529917627573013, | |
| "epoch": 0.5895522388059702, | |
| "grad_norm": 0.1454092264175415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575821399688721, | |
| "mean_token_accuracy": 0.7714344263076782, | |
| "num_tokens": 2583278.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5369462221860886, | |
| "epoch": 0.5932835820895522, | |
| "grad_norm": 0.12713587284088135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541353702545166, | |
| "mean_token_accuracy": 0.7810934484004974, | |
| "num_tokens": 2599680.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5471956133842468, | |
| "epoch": 0.5970149253731343, | |
| "grad_norm": 0.1193249523639679, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544399619102478, | |
| "mean_token_accuracy": 0.777627244591713, | |
| "num_tokens": 2615971.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5561826080083847, | |
| "epoch": 0.6007462686567164, | |
| "grad_norm": 0.1412789523601532, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533403754234314, | |
| "mean_token_accuracy": 0.774614229798317, | |
| "num_tokens": 2632402.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5589349567890167, | |
| "epoch": 0.6044776119402985, | |
| "grad_norm": 0.12422283738851547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584982633590698, | |
| "mean_token_accuracy": 0.772629901766777, | |
| "num_tokens": 2648936.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5598675608634949, | |
| "epoch": 0.6082089552238806, | |
| "grad_norm": 0.14433413743972778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596426725387573, | |
| "mean_token_accuracy": 0.7740431576967239, | |
| "num_tokens": 2665475.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5221775621175766, | |
| "epoch": 0.6119402985074627, | |
| "grad_norm": 0.12392512708902359, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5226801037788391, | |
| "mean_token_accuracy": 0.7883991152048111, | |
| "num_tokens": 2681739.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5390211492776871, | |
| "epoch": 0.6156716417910447, | |
| "grad_norm": 0.1389789581298828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467759370803833, | |
| "mean_token_accuracy": 0.7787502557039261, | |
| "num_tokens": 2698224.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5343765914440155, | |
| "epoch": 0.6194029850746269, | |
| "grad_norm": 0.15462790429592133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523170232772827, | |
| "mean_token_accuracy": 0.7789429575204849, | |
| "num_tokens": 2714480.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5412632822990417, | |
| "epoch": 0.6231343283582089, | |
| "grad_norm": 0.13078634440898895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461232662200928, | |
| "mean_token_accuracy": 0.7796546518802643, | |
| "num_tokens": 2730804.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.5592486709356308, | |
| "epoch": 0.6268656716417911, | |
| "grad_norm": 0.11671686917543411, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556939244270325, | |
| "mean_token_accuracy": 0.7750763148069382, | |
| "num_tokens": 2747189.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5645984709262848, | |
| "epoch": 0.6305970149253731, | |
| "grad_norm": 0.11404155939817429, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586551427841187, | |
| "mean_token_accuracy": 0.7756913602352142, | |
| "num_tokens": 2763561.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5689886808395386, | |
| "epoch": 0.6343283582089553, | |
| "grad_norm": 0.13602924346923828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.571495771408081, | |
| "mean_token_accuracy": 0.7653735727071762, | |
| "num_tokens": 2780048.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.56998710334301, | |
| "epoch": 0.6380597014925373, | |
| "grad_norm": 0.15131747722625732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5685769319534302, | |
| "mean_token_accuracy": 0.770746722817421, | |
| "num_tokens": 2796401.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5340622663497925, | |
| "epoch": 0.6417910447761194, | |
| "grad_norm": 0.10990842431783676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300686955451965, | |
| "mean_token_accuracy": 0.7831304669380188, | |
| "num_tokens": 2812688.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5546266734600067, | |
| "epoch": 0.6455223880597015, | |
| "grad_norm": 0.14243000745773315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531081557273865, | |
| "mean_token_accuracy": 0.7720183730125427, | |
| "num_tokens": 2828912.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.530887708067894, | |
| "epoch": 0.6492537313432836, | |
| "grad_norm": 0.14285673201084137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329350233078003, | |
| "mean_token_accuracy": 0.7844198048114777, | |
| "num_tokens": 2845032.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5529126077890396, | |
| "epoch": 0.6529850746268657, | |
| "grad_norm": 0.12663516402244568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582675337791443, | |
| "mean_token_accuracy": 0.775692343711853, | |
| "num_tokens": 2861233.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5530151873826981, | |
| "epoch": 0.6567164179104478, | |
| "grad_norm": 0.1777547299861908, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5580370426177979, | |
| "mean_token_accuracy": 0.7773808538913727, | |
| "num_tokens": 2877595.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5517453551292419, | |
| "epoch": 0.6604477611940298, | |
| "grad_norm": 0.12728020548820496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549347996711731, | |
| "mean_token_accuracy": 0.7813896834850311, | |
| "num_tokens": 2893885.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5581229478120804, | |
| "epoch": 0.664179104477612, | |
| "grad_norm": 0.12608157098293304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528551936149597, | |
| "mean_token_accuracy": 0.774133637547493, | |
| "num_tokens": 2910402.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5545129030942917, | |
| "epoch": 0.667910447761194, | |
| "grad_norm": 0.14164696633815765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471103191375732, | |
| "mean_token_accuracy": 0.7807044833898544, | |
| "num_tokens": 2927020.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5679615437984467, | |
| "epoch": 0.6716417910447762, | |
| "grad_norm": 0.11040110141038895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5661795139312744, | |
| "mean_token_accuracy": 0.7697756141424179, | |
| "num_tokens": 2943445.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.5358923226594925, | |
| "epoch": 0.6753731343283582, | |
| "grad_norm": 0.12206491082906723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459122061729431, | |
| "mean_token_accuracy": 0.7805617302656174, | |
| "num_tokens": 2959987.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5579689890146255, | |
| "epoch": 0.6791044776119403, | |
| "grad_norm": 0.14179477095603943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636488199234009, | |
| "mean_token_accuracy": 0.7736007869243622, | |
| "num_tokens": 2976751.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5510261654853821, | |
| "epoch": 0.6828358208955224, | |
| "grad_norm": 0.12091591209173203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561327338218689, | |
| "mean_token_accuracy": 0.776558443903923, | |
| "num_tokens": 2993041.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5457663834095001, | |
| "epoch": 0.6865671641791045, | |
| "grad_norm": 0.12697891891002655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465325117111206, | |
| "mean_token_accuracy": 0.7786546349525452, | |
| "num_tokens": 3009436.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5649427324533463, | |
| "epoch": 0.6902985074626866, | |
| "grad_norm": 0.13892695307731628, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5654124617576599, | |
| "mean_token_accuracy": 0.7703604251146317, | |
| "num_tokens": 3025787.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5688793361186981, | |
| "epoch": 0.6940298507462687, | |
| "grad_norm": 0.11656537652015686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590483546257019, | |
| "mean_token_accuracy": 0.7758390307426453, | |
| "num_tokens": 3042147.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5568420886993408, | |
| "epoch": 0.6977611940298507, | |
| "grad_norm": 0.1266399472951889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490051507949829, | |
| "mean_token_accuracy": 0.7778443545103073, | |
| "num_tokens": 3058479.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5504391342401505, | |
| "epoch": 0.7014925373134329, | |
| "grad_norm": 0.15510344505310059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499662756919861, | |
| "mean_token_accuracy": 0.7750896066427231, | |
| "num_tokens": 3074684.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5515661090612411, | |
| "epoch": 0.7052238805970149, | |
| "grad_norm": 0.1378200650215149, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564606189727783, | |
| "mean_token_accuracy": 0.7740965932607651, | |
| "num_tokens": 3091070.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5522360950708389, | |
| "epoch": 0.7089552238805971, | |
| "grad_norm": 0.1490645706653595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577459335327148, | |
| "mean_token_accuracy": 0.7747645527124405, | |
| "num_tokens": 3107501.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5528729557991028, | |
| "epoch": 0.7126865671641791, | |
| "grad_norm": 0.14538180828094482, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618550777435303, | |
| "mean_token_accuracy": 0.7729964852333069, | |
| "num_tokens": 3123822.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5486249774694443, | |
| "epoch": 0.7164179104477612, | |
| "grad_norm": 0.12265278398990631, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423588752746582, | |
| "mean_token_accuracy": 0.7789205312728882, | |
| "num_tokens": 3140334.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.5567969381809235, | |
| "epoch": 0.7201492537313433, | |
| "grad_norm": 0.13273917138576508, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613058805465698, | |
| "mean_token_accuracy": 0.7748401314020157, | |
| "num_tokens": 3156490.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.558370977640152, | |
| "epoch": 0.7238805970149254, | |
| "grad_norm": 0.1269926130771637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548975944519043, | |
| "mean_token_accuracy": 0.7803195267915726, | |
| "num_tokens": 3172917.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5645796656608582, | |
| "epoch": 0.7276119402985075, | |
| "grad_norm": 0.12320506572723389, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5635199546813965, | |
| "mean_token_accuracy": 0.773562416434288, | |
| "num_tokens": 3189322.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5316331535577774, | |
| "epoch": 0.7313432835820896, | |
| "grad_norm": 0.1522948294878006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410732626914978, | |
| "mean_token_accuracy": 0.7790966629981995, | |
| "num_tokens": 3205551.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5493949502706528, | |
| "epoch": 0.7350746268656716, | |
| "grad_norm": 0.119343101978302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500541925430298, | |
| "mean_token_accuracy": 0.7768760919570923, | |
| "num_tokens": 3222029.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5477159917354584, | |
| "epoch": 0.7388059701492538, | |
| "grad_norm": 0.119729183614254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477977991104126, | |
| "mean_token_accuracy": 0.7788135707378387, | |
| "num_tokens": 3238421.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5607248842716217, | |
| "epoch": 0.7425373134328358, | |
| "grad_norm": 0.13485661149024963, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5701273083686829, | |
| "mean_token_accuracy": 0.7674471586942673, | |
| "num_tokens": 3254789.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5362051874399185, | |
| "epoch": 0.746268656716418, | |
| "grad_norm": 0.11599450558423996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382620692253113, | |
| "mean_token_accuracy": 0.7804013192653656, | |
| "num_tokens": 3270902.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5385442525148392, | |
| "epoch": 0.75, | |
| "grad_norm": 0.11722157150506973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425242185592651, | |
| "mean_token_accuracy": 0.7779103666543961, | |
| "num_tokens": 3287148.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5608718395233154, | |
| "epoch": 0.753731343283582, | |
| "grad_norm": 0.11743324995040894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605480670928955, | |
| "mean_token_accuracy": 0.7718753963708878, | |
| "num_tokens": 3303602.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5647395998239517, | |
| "epoch": 0.7574626865671642, | |
| "grad_norm": 0.12360575795173645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.565830409526825, | |
| "mean_token_accuracy": 0.7734925150871277, | |
| "num_tokens": 3319914.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5613357871770859, | |
| "epoch": 0.7611940298507462, | |
| "grad_norm": 0.12299378216266632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502001643180847, | |
| "mean_token_accuracy": 0.7780173420906067, | |
| "num_tokens": 3336266.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5557620376348495, | |
| "epoch": 0.7649253731343284, | |
| "grad_norm": 0.13515423238277435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513977408409119, | |
| "mean_token_accuracy": 0.7768134474754333, | |
| "num_tokens": 3352828.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5312158316373825, | |
| "epoch": 0.7686567164179104, | |
| "grad_norm": 0.1245652511715889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331584215164185, | |
| "mean_token_accuracy": 0.783508375287056, | |
| "num_tokens": 3368900.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5540332049131393, | |
| "epoch": 0.7723880597014925, | |
| "grad_norm": 0.12260495871305466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610563158988953, | |
| "mean_token_accuracy": 0.772364541888237, | |
| "num_tokens": 3385392.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5408795922994614, | |
| "epoch": 0.7761194029850746, | |
| "grad_norm": 0.1623620092868805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433046221733093, | |
| "mean_token_accuracy": 0.7798032164573669, | |
| "num_tokens": 3401604.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5390565246343613, | |
| "epoch": 0.7798507462686567, | |
| "grad_norm": 0.13042029738426208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478684902191162, | |
| "mean_token_accuracy": 0.7792101353406906, | |
| "num_tokens": 3417639.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5241924300789833, | |
| "epoch": 0.7835820895522388, | |
| "grad_norm": 0.13064046204090118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299482941627502, | |
| "mean_token_accuracy": 0.7843270599842072, | |
| "num_tokens": 3433827.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.545391634106636, | |
| "epoch": 0.7873134328358209, | |
| "grad_norm": 0.14404848217964172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539533257484436, | |
| "mean_token_accuracy": 0.7797930389642715, | |
| "num_tokens": 3450075.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5748691409826279, | |
| "epoch": 0.7910447761194029, | |
| "grad_norm": 0.13996216654777527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611885786056519, | |
| "mean_token_accuracy": 0.7745807766914368, | |
| "num_tokens": 3466557.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5685983300209045, | |
| "epoch": 0.7947761194029851, | |
| "grad_norm": 0.12288983166217804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553888082504272, | |
| "mean_token_accuracy": 0.7752144187688828, | |
| "num_tokens": 3482978.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5502497553825378, | |
| "epoch": 0.7985074626865671, | |
| "grad_norm": 0.12848587334156036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549103856086731, | |
| "mean_token_accuracy": 0.7791820466518402, | |
| "num_tokens": 3499378.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5424053594470024, | |
| "epoch": 0.8022388059701493, | |
| "grad_norm": 0.12519471347332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496050119400024, | |
| "mean_token_accuracy": 0.7755117863416672, | |
| "num_tokens": 3515899.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5332234650850296, | |
| "epoch": 0.8059701492537313, | |
| "grad_norm": 0.17385068535804749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551385283470154, | |
| "mean_token_accuracy": 0.7749006897211075, | |
| "num_tokens": 3532197.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5355218946933746, | |
| "epoch": 0.8097014925373134, | |
| "grad_norm": 0.1355784386396408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417052507400513, | |
| "mean_token_accuracy": 0.7785830944776535, | |
| "num_tokens": 3548584.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.545543447136879, | |
| "epoch": 0.8134328358208955, | |
| "grad_norm": 0.10903589427471161, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351961255073547, | |
| "mean_token_accuracy": 0.7840810418128967, | |
| "num_tokens": 3564973.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5678307712078094, | |
| "epoch": 0.8171641791044776, | |
| "grad_norm": 0.13619016110897064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577275156974792, | |
| "mean_token_accuracy": 0.7739268988370895, | |
| "num_tokens": 3581436.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5452380776405334, | |
| "epoch": 0.8208955223880597, | |
| "grad_norm": 0.12011487782001495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431472063064575, | |
| "mean_token_accuracy": 0.7790575325489044, | |
| "num_tokens": 3597661.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5536454021930695, | |
| "epoch": 0.8246268656716418, | |
| "grad_norm": 0.10391338169574738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514202117919922, | |
| "mean_token_accuracy": 0.7758155465126038, | |
| "num_tokens": 3614221.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5350385755300522, | |
| "epoch": 0.8283582089552238, | |
| "grad_norm": 0.1497930884361267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447626709938049, | |
| "mean_token_accuracy": 0.778772234916687, | |
| "num_tokens": 3630441.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5551132708787918, | |
| "epoch": 0.832089552238806, | |
| "grad_norm": 0.12266736477613449, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558661937713623, | |
| "mean_token_accuracy": 0.773910716176033, | |
| "num_tokens": 3647039.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5643535554409027, | |
| "epoch": 0.835820895522388, | |
| "grad_norm": 0.11532776802778244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5642860531806946, | |
| "mean_token_accuracy": 0.7725937813520432, | |
| "num_tokens": 3663412.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5549684166908264, | |
| "epoch": 0.8395522388059702, | |
| "grad_norm": 0.12639960646629333, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532217025756836, | |
| "mean_token_accuracy": 0.7739283442497253, | |
| "num_tokens": 3679945.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.560679629445076, | |
| "epoch": 0.8432835820895522, | |
| "grad_norm": 0.13600312173366547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514844059944153, | |
| "mean_token_accuracy": 0.7773452550172806, | |
| "num_tokens": 3696613.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5458584129810333, | |
| "epoch": 0.8470149253731343, | |
| "grad_norm": 0.10419101268053055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424168109893799, | |
| "mean_token_accuracy": 0.7833174467086792, | |
| "num_tokens": 3713158.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.542242094874382, | |
| "epoch": 0.8507462686567164, | |
| "grad_norm": 0.1483229100704193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505244731903076, | |
| "mean_token_accuracy": 0.7768149822950363, | |
| "num_tokens": 3729484.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5342283248901367, | |
| "epoch": 0.8544776119402985, | |
| "grad_norm": 0.16167280077934265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423468947410583, | |
| "mean_token_accuracy": 0.781244620680809, | |
| "num_tokens": 3745710.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5557206273078918, | |
| "epoch": 0.8582089552238806, | |
| "grad_norm": 0.10992418974637985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555332899093628, | |
| "mean_token_accuracy": 0.7740505337715149, | |
| "num_tokens": 3761974.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5301929265260696, | |
| "epoch": 0.8619402985074627, | |
| "grad_norm": 0.20067644119262695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325175523757935, | |
| "mean_token_accuracy": 0.7839723825454712, | |
| "num_tokens": 3777980.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5519733354449272, | |
| "epoch": 0.8656716417910447, | |
| "grad_norm": 0.11584831774234772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547998309135437, | |
| "mean_token_accuracy": 0.7752280086278915, | |
| "num_tokens": 3794210.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5573844611644745, | |
| "epoch": 0.8694029850746269, | |
| "grad_norm": 0.14681567251682281, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5630576014518738, | |
| "mean_token_accuracy": 0.7713348120450974, | |
| "num_tokens": 3810625.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5614193379878998, | |
| "epoch": 0.8731343283582089, | |
| "grad_norm": 0.3717029392719269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614831447601318, | |
| "mean_token_accuracy": 0.7718814015388489, | |
| "num_tokens": 3826871.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5552587062120438, | |
| "epoch": 0.8768656716417911, | |
| "grad_norm": 0.1315956562757492, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541540384292603, | |
| "mean_token_accuracy": 0.7746177315711975, | |
| "num_tokens": 3843187.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5387386232614517, | |
| "epoch": 0.8805970149253731, | |
| "grad_norm": 0.4729621112346649, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513001084327698, | |
| "mean_token_accuracy": 0.777639240026474, | |
| "num_tokens": 3859659.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5589011460542679, | |
| "epoch": 0.8843283582089553, | |
| "grad_norm": 0.11313692480325699, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550857424736023, | |
| "mean_token_accuracy": 0.7776817381381989, | |
| "num_tokens": 3876082.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5506832748651505, | |
| "epoch": 0.8880597014925373, | |
| "grad_norm": 0.15838703513145447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493965148925781, | |
| "mean_token_accuracy": 0.774595633149147, | |
| "num_tokens": 3892310.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5482196658849716, | |
| "epoch": 0.8917910447761194, | |
| "grad_norm": 0.16354775428771973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549696147441864, | |
| "mean_token_accuracy": 0.7784011512994766, | |
| "num_tokens": 3908561.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.5474406778812408, | |
| "epoch": 0.8955223880597015, | |
| "grad_norm": 0.11488547921180725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442180037498474, | |
| "mean_token_accuracy": 0.7787186056375504, | |
| "num_tokens": 3924971.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5576506555080414, | |
| "epoch": 0.8992537313432836, | |
| "grad_norm": 0.11725704371929169, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556765794754028, | |
| "mean_token_accuracy": 0.7754130512475967, | |
| "num_tokens": 3941384.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5686157792806625, | |
| "epoch": 0.9029850746268657, | |
| "grad_norm": 0.1209690198302269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5740119218826294, | |
| "mean_token_accuracy": 0.7644337117671967, | |
| "num_tokens": 3957527.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5520821809768677, | |
| "epoch": 0.9067164179104478, | |
| "grad_norm": 0.1097254753112793, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524159669876099, | |
| "mean_token_accuracy": 0.7778758704662323, | |
| "num_tokens": 3973803.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5603332817554474, | |
| "epoch": 0.9104477611940298, | |
| "grad_norm": 0.13421349227428436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633103251457214, | |
| "mean_token_accuracy": 0.7723569422960281, | |
| "num_tokens": 3990124.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5404402911663055, | |
| "epoch": 0.914179104477612, | |
| "grad_norm": 0.12017542868852615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424325466156006, | |
| "mean_token_accuracy": 0.7823856174945831, | |
| "num_tokens": 4006560.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5605191737413406, | |
| "epoch": 0.917910447761194, | |
| "grad_norm": 0.14128640294075012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602733492851257, | |
| "mean_token_accuracy": 0.7735545933246613, | |
| "num_tokens": 4022966.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5599958896636963, | |
| "epoch": 0.9216417910447762, | |
| "grad_norm": 0.11880706995725632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598034858703613, | |
| "mean_token_accuracy": 0.7717109471559525, | |
| "num_tokens": 4039261.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5408921539783478, | |
| "epoch": 0.9253731343283582, | |
| "grad_norm": 0.12040922045707703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460969805717468, | |
| "mean_token_accuracy": 0.7793735712766647, | |
| "num_tokens": 4055343.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5573666542768478, | |
| "epoch": 0.9291044776119403, | |
| "grad_norm": 0.12093377858400345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556143045425415, | |
| "mean_token_accuracy": 0.7752596288919449, | |
| "num_tokens": 4071770.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.563015878200531, | |
| "epoch": 0.9328358208955224, | |
| "grad_norm": 0.11447741836309433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5647203922271729, | |
| "mean_token_accuracy": 0.7692370861768723, | |
| "num_tokens": 4088034.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.548077866435051, | |
| "epoch": 0.9365671641791045, | |
| "grad_norm": 0.11981664597988129, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454928278923035, | |
| "mean_token_accuracy": 0.7787458151578903, | |
| "num_tokens": 4104196.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5375737547874451, | |
| "epoch": 0.9402985074626866, | |
| "grad_norm": 0.12071040272712708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404340028762817, | |
| "mean_token_accuracy": 0.7799674719572067, | |
| "num_tokens": 4120470.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.547912061214447, | |
| "epoch": 0.9440298507462687, | |
| "grad_norm": 0.12739375233650208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530076026916504, | |
| "mean_token_accuracy": 0.7753598988056183, | |
| "num_tokens": 4136885.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5538879930973053, | |
| "epoch": 0.9477611940298507, | |
| "grad_norm": 0.12144653499126434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514186024665833, | |
| "mean_token_accuracy": 0.7753842920064926, | |
| "num_tokens": 4153216.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5411302447319031, | |
| "epoch": 0.9514925373134329, | |
| "grad_norm": 0.11099912226200104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385034084320068, | |
| "mean_token_accuracy": 0.7812628000974655, | |
| "num_tokens": 4169402.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5564829558134079, | |
| "epoch": 0.9552238805970149, | |
| "grad_norm": 0.12310667335987091, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534285306930542, | |
| "mean_token_accuracy": 0.7745526880025864, | |
| "num_tokens": 4185847.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5459543019533157, | |
| "epoch": 0.9589552238805971, | |
| "grad_norm": 0.1408655047416687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539636492729187, | |
| "mean_token_accuracy": 0.7817695140838623, | |
| "num_tokens": 4202324.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5483512580394745, | |
| "epoch": 0.9626865671641791, | |
| "grad_norm": 0.1329817920923233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545552968978882, | |
| "mean_token_accuracy": 0.7754471302032471, | |
| "num_tokens": 4218485.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5507388859987259, | |
| "epoch": 0.9664179104477612, | |
| "grad_norm": 0.14522868394851685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539411306381226, | |
| "mean_token_accuracy": 0.776690736413002, | |
| "num_tokens": 4234830.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5551155656576157, | |
| "epoch": 0.9701492537313433, | |
| "grad_norm": 0.1110503152012825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517114996910095, | |
| "mean_token_accuracy": 0.7778125107288361, | |
| "num_tokens": 4251249.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5606275051832199, | |
| "epoch": 0.9738805970149254, | |
| "grad_norm": 0.11907053738832474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5583968162536621, | |
| "mean_token_accuracy": 0.7729120701551437, | |
| "num_tokens": 4267571.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5697215348482132, | |
| "epoch": 0.9776119402985075, | |
| "grad_norm": 0.11226138472557068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5654243230819702, | |
| "mean_token_accuracy": 0.7697847783565521, | |
| "num_tokens": 4283938.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5614341050386429, | |
| "epoch": 0.9813432835820896, | |
| "grad_norm": 0.12085731327533722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629435777664185, | |
| "mean_token_accuracy": 0.7714052200317383, | |
| "num_tokens": 4300727.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5495717078447342, | |
| "epoch": 0.9850746268656716, | |
| "grad_norm": 0.1363348811864853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549257397651672, | |
| "mean_token_accuracy": 0.7735868841409683, | |
| "num_tokens": 4316903.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5352297425270081, | |
| "epoch": 0.9888059701492538, | |
| "grad_norm": 0.1429988294839859, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460555553436279, | |
| "mean_token_accuracy": 0.7814377993345261, | |
| "num_tokens": 4333143.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5603132396936417, | |
| "epoch": 0.9925373134328358, | |
| "grad_norm": 0.14986178278923035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551425218582153, | |
| "mean_token_accuracy": 0.7773159593343735, | |
| "num_tokens": 4349576.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5535064339637756, | |
| "epoch": 0.996268656716418, | |
| "grad_norm": 0.1105998232960701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442855954170227, | |
| "mean_token_accuracy": 0.7821661084890366, | |
| "num_tokens": 4365977.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5614945888519287, | |
| "epoch": 1.0, | |
| "grad_norm": 0.12907235324382782, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476444959640503, | |
| "mean_token_accuracy": 0.7792651057243347, | |
| "num_tokens": 4382526.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.548059806227684, | |
| "epoch": 1.0037313432835822, | |
| "grad_norm": 0.12145893275737762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402656197547913, | |
| "mean_token_accuracy": 0.7813442945480347, | |
| "num_tokens": 4399005.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5212839543819427, | |
| "epoch": 1.007462686567164, | |
| "grad_norm": 0.1396404206752777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315491557121277, | |
| "mean_token_accuracy": 0.7839601635932922, | |
| "num_tokens": 4415205.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5132785737514496, | |
| "epoch": 1.0111940298507462, | |
| "grad_norm": 0.1433689296245575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299959778785706, | |
| "mean_token_accuracy": 0.7853466272354126, | |
| "num_tokens": 4431512.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5394517332315445, | |
| "epoch": 1.0149253731343284, | |
| "grad_norm": 0.11504881829023361, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439318418502808, | |
| "mean_token_accuracy": 0.7786544561386108, | |
| "num_tokens": 4447878.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5173204094171524, | |
| "epoch": 1.0186567164179103, | |
| "grad_norm": 0.12369395047426224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.525097668170929, | |
| "mean_token_accuracy": 0.7878104597330093, | |
| "num_tokens": 4464069.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5443273782730103, | |
| "epoch": 1.0223880597014925, | |
| "grad_norm": 0.12611854076385498, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425093770027161, | |
| "mean_token_accuracy": 0.7833482921123505, | |
| "num_tokens": 4480510.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5319035351276398, | |
| "epoch": 1.0261194029850746, | |
| "grad_norm": 0.11637023091316223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231828093528748, | |
| "mean_token_accuracy": 0.788045197725296, | |
| "num_tokens": 4496734.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5645869076251984, | |
| "epoch": 1.0298507462686568, | |
| "grad_norm": 0.11970556527376175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556399405002594, | |
| "mean_token_accuracy": 0.7753234058618546, | |
| "num_tokens": 4513272.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5412048548460007, | |
| "epoch": 1.0335820895522387, | |
| "grad_norm": 0.12889669835567474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352495908737183, | |
| "mean_token_accuracy": 0.7822704613208771, | |
| "num_tokens": 4529760.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5433377772569656, | |
| "epoch": 1.037313432835821, | |
| "grad_norm": 0.15610089898109436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424712896347046, | |
| "mean_token_accuracy": 0.7791996449232101, | |
| "num_tokens": 4546065.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5367715954780579, | |
| "epoch": 1.041044776119403, | |
| "grad_norm": 0.1712978631258011, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500761270523071, | |
| "mean_token_accuracy": 0.7774211019277573, | |
| "num_tokens": 4562404.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5348818898200989, | |
| "epoch": 1.044776119402985, | |
| "grad_norm": 0.14415498077869415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458697080612183, | |
| "mean_token_accuracy": 0.7776882946491241, | |
| "num_tokens": 4578594.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5394753366708755, | |
| "epoch": 1.0485074626865671, | |
| "grad_norm": 0.17060807347297668, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428628921508789, | |
| "mean_token_accuracy": 0.7797123193740845, | |
| "num_tokens": 4594918.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5477339029312134, | |
| "epoch": 1.0522388059701493, | |
| "grad_norm": 0.12646426260471344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376375913619995, | |
| "mean_token_accuracy": 0.7846843749284744, | |
| "num_tokens": 4611225.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.553899347782135, | |
| "epoch": 1.0559701492537314, | |
| "grad_norm": 0.14560198783874512, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442871451377869, | |
| "mean_token_accuracy": 0.779757484793663, | |
| "num_tokens": 4627515.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.544152095913887, | |
| "epoch": 1.0597014925373134, | |
| "grad_norm": 0.14532814919948578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495354533195496, | |
| "mean_token_accuracy": 0.7756282091140747, | |
| "num_tokens": 4644151.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5467684864997864, | |
| "epoch": 1.0634328358208955, | |
| "grad_norm": 0.14399303495883942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551741123199463, | |
| "mean_token_accuracy": 0.7747452855110168, | |
| "num_tokens": 4660349.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5328090041875839, | |
| "epoch": 1.0671641791044777, | |
| "grad_norm": 0.1490914672613144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371617674827576, | |
| "mean_token_accuracy": 0.7852603644132614, | |
| "num_tokens": 4676682.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5549953877925873, | |
| "epoch": 1.0708955223880596, | |
| "grad_norm": 0.13986609876155853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485588312149048, | |
| "mean_token_accuracy": 0.7786588221788406, | |
| "num_tokens": 4693087.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5441232770681381, | |
| "epoch": 1.0746268656716418, | |
| "grad_norm": 0.13744987547397614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352811813354492, | |
| "mean_token_accuracy": 0.7830296456813812, | |
| "num_tokens": 4709482.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5388935655355453, | |
| "epoch": 1.078358208955224, | |
| "grad_norm": 0.12793688476085663, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364757776260376, | |
| "mean_token_accuracy": 0.780993863940239, | |
| "num_tokens": 4725929.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5281359702348709, | |
| "epoch": 1.0820895522388059, | |
| "grad_norm": 0.11734890192747116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5293084979057312, | |
| "mean_token_accuracy": 0.7876105159521103, | |
| "num_tokens": 4742317.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5459820628166199, | |
| "epoch": 1.085820895522388, | |
| "grad_norm": 0.12839624285697937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461269617080688, | |
| "mean_token_accuracy": 0.7763439863920212, | |
| "num_tokens": 4758682.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5111119300127029, | |
| "epoch": 1.0895522388059702, | |
| "grad_norm": 0.1377914845943451, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5165018439292908, | |
| "mean_token_accuracy": 0.792814165353775, | |
| "num_tokens": 4775165.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5256515890359879, | |
| "epoch": 1.0932835820895523, | |
| "grad_norm": 0.13310879468917847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263264179229736, | |
| "mean_token_accuracy": 0.7891132682561874, | |
| "num_tokens": 4791249.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.5361033976078033, | |
| "epoch": 1.0970149253731343, | |
| "grad_norm": 0.11920680850744247, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344924926757812, | |
| "mean_token_accuracy": 0.7844657897949219, | |
| "num_tokens": 4807722.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.547529011964798, | |
| "epoch": 1.1007462686567164, | |
| "grad_norm": 0.15012222528457642, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434770584106445, | |
| "mean_token_accuracy": 0.7794990837574005, | |
| "num_tokens": 4824221.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5387088805437088, | |
| "epoch": 1.1044776119402986, | |
| "grad_norm": 0.11607323586940765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379114151000977, | |
| "mean_token_accuracy": 0.7820580452680588, | |
| "num_tokens": 4840561.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5285296589136124, | |
| "epoch": 1.1082089552238805, | |
| "grad_norm": 0.16472671926021576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286039710044861, | |
| "mean_token_accuracy": 0.7859488725662231, | |
| "num_tokens": 4856739.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5467464625835419, | |
| "epoch": 1.1119402985074627, | |
| "grad_norm": 0.12136011570692062, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486158132553101, | |
| "mean_token_accuracy": 0.7766989320516586, | |
| "num_tokens": 4873254.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5323450714349747, | |
| "epoch": 1.1156716417910448, | |
| "grad_norm": 0.15763746201992035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53644198179245, | |
| "mean_token_accuracy": 0.7847353965044022, | |
| "num_tokens": 4889763.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5294622331857681, | |
| "epoch": 1.1194029850746268, | |
| "grad_norm": 0.14253245294094086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327939987182617, | |
| "mean_token_accuracy": 0.7873322665691376, | |
| "num_tokens": 4905780.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5500210523605347, | |
| "epoch": 1.123134328358209, | |
| "grad_norm": 0.1611548215150833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55262291431427, | |
| "mean_token_accuracy": 0.7771656811237335, | |
| "num_tokens": 4921935.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5608504116535187, | |
| "epoch": 1.126865671641791, | |
| "grad_norm": 0.14609341323375702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597085952758789, | |
| "mean_token_accuracy": 0.773489698767662, | |
| "num_tokens": 4938566.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.541571170091629, | |
| "epoch": 1.1305970149253732, | |
| "grad_norm": 0.11906211823225021, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541067361831665, | |
| "mean_token_accuracy": 0.7795013040304184, | |
| "num_tokens": 4954995.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5374023020267487, | |
| "epoch": 1.1343283582089552, | |
| "grad_norm": 0.191620334982872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540854811668396, | |
| "mean_token_accuracy": 0.783530056476593, | |
| "num_tokens": 4971285.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5237517058849335, | |
| "epoch": 1.1380597014925373, | |
| "grad_norm": 0.13355116546154022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256230235099792, | |
| "mean_token_accuracy": 0.7869999557733536, | |
| "num_tokens": 4987629.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.5161513015627861, | |
| "epoch": 1.1417910447761195, | |
| "grad_norm": 0.14180561900138855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5189639925956726, | |
| "mean_token_accuracy": 0.7884562611579895, | |
| "num_tokens": 5003816.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5333078503608704, | |
| "epoch": 1.1455223880597014, | |
| "grad_norm": 0.11995179206132889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338060259819031, | |
| "mean_token_accuracy": 0.7834619730710983, | |
| "num_tokens": 5020179.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5374015420675278, | |
| "epoch": 1.1492537313432836, | |
| "grad_norm": 0.14065897464752197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541375994682312, | |
| "mean_token_accuracy": 0.7836798280477524, | |
| "num_tokens": 5036421.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5318789333105087, | |
| "epoch": 1.1529850746268657, | |
| "grad_norm": 0.15007704496383667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320872664451599, | |
| "mean_token_accuracy": 0.7854835838079453, | |
| "num_tokens": 5052767.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5555961728096008, | |
| "epoch": 1.1567164179104479, | |
| "grad_norm": 0.12327966094017029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514441728591919, | |
| "mean_token_accuracy": 0.775398313999176, | |
| "num_tokens": 5069219.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.5369515269994736, | |
| "epoch": 1.1604477611940298, | |
| "grad_norm": 0.13790592551231384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307064652442932, | |
| "mean_token_accuracy": 0.7870743423700333, | |
| "num_tokens": 5085637.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5395635664463043, | |
| "epoch": 1.164179104477612, | |
| "grad_norm": 0.12657856941223145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539893388748169, | |
| "mean_token_accuracy": 0.7809743881225586, | |
| "num_tokens": 5101984.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.5528725534677505, | |
| "epoch": 1.1679104477611941, | |
| "grad_norm": 0.15744967758655548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551643967628479, | |
| "mean_token_accuracy": 0.7749461233615875, | |
| "num_tokens": 5118457.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5547244101762772, | |
| "epoch": 1.171641791044776, | |
| "grad_norm": 0.14667753875255585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545704364776611, | |
| "mean_token_accuracy": 0.7767890095710754, | |
| "num_tokens": 5135070.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5513405501842499, | |
| "epoch": 1.1753731343283582, | |
| "grad_norm": 0.13363401591777802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478935241699219, | |
| "mean_token_accuracy": 0.7782707363367081, | |
| "num_tokens": 5151457.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5504343062639236, | |
| "epoch": 1.1791044776119404, | |
| "grad_norm": 0.14427515864372253, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503411293029785, | |
| "mean_token_accuracy": 0.7759760916233063, | |
| "num_tokens": 5167918.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5411941558122635, | |
| "epoch": 1.1828358208955223, | |
| "grad_norm": 0.13475076854228973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334619283676147, | |
| "mean_token_accuracy": 0.7848760634660721, | |
| "num_tokens": 5184250.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5534447133541107, | |
| "epoch": 1.1865671641791045, | |
| "grad_norm": 0.14666007459163666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606579184532166, | |
| "mean_token_accuracy": 0.7732094079256058, | |
| "num_tokens": 5200728.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5172414779663086, | |
| "epoch": 1.1902985074626866, | |
| "grad_norm": 0.1494058072566986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262372493743896, | |
| "mean_token_accuracy": 0.787101224064827, | |
| "num_tokens": 5216948.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5277577340602875, | |
| "epoch": 1.1940298507462686, | |
| "grad_norm": 0.15135720372200012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401796102523804, | |
| "mean_token_accuracy": 0.7809148728847504, | |
| "num_tokens": 5233422.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5246866941452026, | |
| "epoch": 1.1977611940298507, | |
| "grad_norm": 0.12589603662490845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281919836997986, | |
| "mean_token_accuracy": 0.7868399173021317, | |
| "num_tokens": 5249730.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5274995267391205, | |
| "epoch": 1.2014925373134329, | |
| "grad_norm": 0.11834204196929932, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278512835502625, | |
| "mean_token_accuracy": 0.7852350920438766, | |
| "num_tokens": 5266115.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5320824682712555, | |
| "epoch": 1.205223880597015, | |
| "grad_norm": 0.13883750140666962, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5280960202217102, | |
| "mean_token_accuracy": 0.7858837693929672, | |
| "num_tokens": 5282462.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5404033660888672, | |
| "epoch": 1.208955223880597, | |
| "grad_norm": 0.13842950761318207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391522645950317, | |
| "mean_token_accuracy": 0.7815057188272476, | |
| "num_tokens": 5299103.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5260981917381287, | |
| "epoch": 1.212686567164179, | |
| "grad_norm": 0.14888468384742737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5250783562660217, | |
| "mean_token_accuracy": 0.7861860394477844, | |
| "num_tokens": 5315339.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5244043916463852, | |
| "epoch": 1.2164179104477613, | |
| "grad_norm": 0.12871688604354858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234277844429016, | |
| "mean_token_accuracy": 0.787299633026123, | |
| "num_tokens": 5331854.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5336845368146896, | |
| "epoch": 1.2201492537313432, | |
| "grad_norm": 0.1279512345790863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357816815376282, | |
| "mean_token_accuracy": 0.7811597734689713, | |
| "num_tokens": 5348268.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5396746844053268, | |
| "epoch": 1.2238805970149254, | |
| "grad_norm": 0.1272435188293457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367811322212219, | |
| "mean_token_accuracy": 0.7815662026405334, | |
| "num_tokens": 5364832.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5355321317911148, | |
| "epoch": 1.2276119402985075, | |
| "grad_norm": 0.12457006424665451, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324679613113403, | |
| "mean_token_accuracy": 0.7855342030525208, | |
| "num_tokens": 5381181.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5404689311981201, | |
| "epoch": 1.2313432835820897, | |
| "grad_norm": 0.1616295725107193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461254715919495, | |
| "mean_token_accuracy": 0.7793011963367462, | |
| "num_tokens": 5397689.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5573465675115585, | |
| "epoch": 1.2350746268656716, | |
| "grad_norm": 0.1567206233739853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5680751204490662, | |
| "mean_token_accuracy": 0.7683437466621399, | |
| "num_tokens": 5414063.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.5585090219974518, | |
| "epoch": 1.2388059701492538, | |
| "grad_norm": 0.13362006843090057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544182658195496, | |
| "mean_token_accuracy": 0.7759232968091965, | |
| "num_tokens": 5430545.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5479722023010254, | |
| "epoch": 1.242537313432836, | |
| "grad_norm": 0.16734908521175385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447990298271179, | |
| "mean_token_accuracy": 0.7797949612140656, | |
| "num_tokens": 5446700.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5607796311378479, | |
| "epoch": 1.2462686567164178, | |
| "grad_norm": 0.1450573354959488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556632936000824, | |
| "mean_token_accuracy": 0.7769130021333694, | |
| "num_tokens": 5463137.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.5538843423128128, | |
| "epoch": 1.25, | |
| "grad_norm": 0.12896743416786194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562998056411743, | |
| "mean_token_accuracy": 0.7745624631643295, | |
| "num_tokens": 5479659.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5309284329414368, | |
| "epoch": 1.2537313432835822, | |
| "grad_norm": 0.1323668360710144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389367341995239, | |
| "mean_token_accuracy": 0.7794619351625443, | |
| "num_tokens": 5495884.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.5279457420110703, | |
| "epoch": 1.2574626865671643, | |
| "grad_norm": 0.16464678943157196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540420413017273, | |
| "mean_token_accuracy": 0.7797137498855591, | |
| "num_tokens": 5512288.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5431296676397324, | |
| "epoch": 1.2611940298507462, | |
| "grad_norm": 0.15366457402706146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533568263053894, | |
| "mean_token_accuracy": 0.7777420580387115, | |
| "num_tokens": 5528739.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5533888936042786, | |
| "epoch": 1.2649253731343284, | |
| "grad_norm": 0.15439164638519287, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407285690307617, | |
| "mean_token_accuracy": 0.7848910838365555, | |
| "num_tokens": 5545180.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5363039374351501, | |
| "epoch": 1.2686567164179103, | |
| "grad_norm": 0.14024227857589722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247921943664551, | |
| "mean_token_accuracy": 0.7866441905498505, | |
| "num_tokens": 5561365.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5282331109046936, | |
| "epoch": 1.2723880597014925, | |
| "grad_norm": 0.15727277100086212, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256697535514832, | |
| "mean_token_accuracy": 0.7857891470193863, | |
| "num_tokens": 5577609.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5532326549291611, | |
| "epoch": 1.2761194029850746, | |
| "grad_norm": 0.14312665164470673, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558714866638184, | |
| "mean_token_accuracy": 0.776502713561058, | |
| "num_tokens": 5593922.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5117308422923088, | |
| "epoch": 1.2798507462686568, | |
| "grad_norm": 0.13982926309108734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5216178894042969, | |
| "mean_token_accuracy": 0.7898732572793961, | |
| "num_tokens": 5610160.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5327529311180115, | |
| "epoch": 1.2835820895522387, | |
| "grad_norm": 0.1600239872932434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54588383436203, | |
| "mean_token_accuracy": 0.7827021777629852, | |
| "num_tokens": 5626483.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5456168502569199, | |
| "epoch": 1.287313432835821, | |
| "grad_norm": 0.1314232498407364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445138216018677, | |
| "mean_token_accuracy": 0.7821621298789978, | |
| "num_tokens": 5642705.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5568868517875671, | |
| "epoch": 1.291044776119403, | |
| "grad_norm": 0.12736710906028748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469453930854797, | |
| "mean_token_accuracy": 0.7784760594367981, | |
| "num_tokens": 5659144.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5525211989879608, | |
| "epoch": 1.294776119402985, | |
| "grad_norm": 0.11654646694660187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542698323726654, | |
| "mean_token_accuracy": 0.7785234600305557, | |
| "num_tokens": 5675452.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5460808724164963, | |
| "epoch": 1.2985074626865671, | |
| "grad_norm": 0.1318521350622177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390938520431519, | |
| "mean_token_accuracy": 0.7815311253070831, | |
| "num_tokens": 5691735.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5437112301588058, | |
| "epoch": 1.3022388059701493, | |
| "grad_norm": 0.13485990464687347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420966148376465, | |
| "mean_token_accuracy": 0.7827932983636856, | |
| "num_tokens": 5708102.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5493648052215576, | |
| "epoch": 1.3059701492537314, | |
| "grad_norm": 0.14354610443115234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561747550964355, | |
| "mean_token_accuracy": 0.7761517763137817, | |
| "num_tokens": 5724350.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5344854891300201, | |
| "epoch": 1.3097014925373134, | |
| "grad_norm": 0.15943452715873718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391569137573242, | |
| "mean_token_accuracy": 0.7805770933628082, | |
| "num_tokens": 5740954.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5242450833320618, | |
| "epoch": 1.3134328358208955, | |
| "grad_norm": 0.13654360175132751, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5292847156524658, | |
| "mean_token_accuracy": 0.784620076417923, | |
| "num_tokens": 5757385.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5383377820253372, | |
| "epoch": 1.3171641791044777, | |
| "grad_norm": 0.13651302456855774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413467288017273, | |
| "mean_token_accuracy": 0.7786675840616226, | |
| "num_tokens": 5773852.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5402452051639557, | |
| "epoch": 1.3208955223880596, | |
| "grad_norm": 0.13241973519325256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419248938560486, | |
| "mean_token_accuracy": 0.778145432472229, | |
| "num_tokens": 5790055.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5536379665136337, | |
| "epoch": 1.3246268656716418, | |
| "grad_norm": 0.13762575387954712, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484678745269775, | |
| "mean_token_accuracy": 0.7766116708517075, | |
| "num_tokens": 5806738.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5532735884189606, | |
| "epoch": 1.328358208955224, | |
| "grad_norm": 0.12154927849769592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548056960105896, | |
| "mean_token_accuracy": 0.7753622978925705, | |
| "num_tokens": 5823183.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5448320060968399, | |
| "epoch": 1.332089552238806, | |
| "grad_norm": 0.144795224070549, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448752641677856, | |
| "mean_token_accuracy": 0.7790551483631134, | |
| "num_tokens": 5839499.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.5511485040187836, | |
| "epoch": 1.335820895522388, | |
| "grad_norm": 0.13511039316654205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528499484062195, | |
| "mean_token_accuracy": 0.776659682393074, | |
| "num_tokens": 5855921.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5290715843439102, | |
| "epoch": 1.3395522388059702, | |
| "grad_norm": 0.11837369203567505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328022241592407, | |
| "mean_token_accuracy": 0.7826089113950729, | |
| "num_tokens": 5872142.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.5363620519638062, | |
| "epoch": 1.3432835820895521, | |
| "grad_norm": 0.12029700726270676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534315824508667, | |
| "mean_token_accuracy": 0.7845976501703262, | |
| "num_tokens": 5888484.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5347290933132172, | |
| "epoch": 1.3470149253731343, | |
| "grad_norm": 0.13828180730342865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338245630264282, | |
| "mean_token_accuracy": 0.7808255851268768, | |
| "num_tokens": 5904613.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5324546545743942, | |
| "epoch": 1.3507462686567164, | |
| "grad_norm": 0.12894095480442047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361336469650269, | |
| "mean_token_accuracy": 0.7821396291255951, | |
| "num_tokens": 5920864.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5308556854724884, | |
| "epoch": 1.3544776119402986, | |
| "grad_norm": 0.11929216980934143, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275416374206543, | |
| "mean_token_accuracy": 0.7852365076541901, | |
| "num_tokens": 5937108.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.53159399330616, | |
| "epoch": 1.3582089552238805, | |
| "grad_norm": 0.14378131926059723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424759387969971, | |
| "mean_token_accuracy": 0.7792777568101883, | |
| "num_tokens": 5953233.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5450653731822968, | |
| "epoch": 1.3619402985074627, | |
| "grad_norm": 0.14581741392612457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530756115913391, | |
| "mean_token_accuracy": 0.7765647917985916, | |
| "num_tokens": 5969681.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5418213754892349, | |
| "epoch": 1.3656716417910448, | |
| "grad_norm": 0.13764694333076477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494720935821533, | |
| "mean_token_accuracy": 0.7783620804548264, | |
| "num_tokens": 5985895.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5528892427682877, | |
| "epoch": 1.3694029850746268, | |
| "grad_norm": 0.14292745292186737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427901148796082, | |
| "mean_token_accuracy": 0.7794772684574127, | |
| "num_tokens": 6002104.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5515422970056534, | |
| "epoch": 1.373134328358209, | |
| "grad_norm": 0.12165708839893341, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388676524162292, | |
| "mean_token_accuracy": 0.7821601629257202, | |
| "num_tokens": 6018297.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5522115230560303, | |
| "epoch": 1.376865671641791, | |
| "grad_norm": 0.16414624452590942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514496564865112, | |
| "mean_token_accuracy": 0.7735963463783264, | |
| "num_tokens": 6034469.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5200467556715012, | |
| "epoch": 1.3805970149253732, | |
| "grad_norm": 0.11550547927618027, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5164188146591187, | |
| "mean_token_accuracy": 0.7926855981349945, | |
| "num_tokens": 6050831.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5372455269098282, | |
| "epoch": 1.3843283582089552, | |
| "grad_norm": 0.15535052120685577, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430443286895752, | |
| "mean_token_accuracy": 0.7787685394287109, | |
| "num_tokens": 6067185.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5356560945510864, | |
| "epoch": 1.3880597014925373, | |
| "grad_norm": 0.13415579497814178, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381686091423035, | |
| "mean_token_accuracy": 0.7826534509658813, | |
| "num_tokens": 6083549.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5160757750272751, | |
| "epoch": 1.3917910447761195, | |
| "grad_norm": 0.21146361529827118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265405774116516, | |
| "mean_token_accuracy": 0.7884284406900406, | |
| "num_tokens": 6099748.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5486676543951035, | |
| "epoch": 1.3955223880597014, | |
| "grad_norm": 0.17727814614772797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486956834793091, | |
| "mean_token_accuracy": 0.774789959192276, | |
| "num_tokens": 6116173.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5379186123609543, | |
| "epoch": 1.3992537313432836, | |
| "grad_norm": 0.14094142615795135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390832424163818, | |
| "mean_token_accuracy": 0.7824152857065201, | |
| "num_tokens": 6132499.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5322713851928711, | |
| "epoch": 1.4029850746268657, | |
| "grad_norm": 0.20512345433235168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319615602493286, | |
| "mean_token_accuracy": 0.7856654673814774, | |
| "num_tokens": 6148777.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5522319674491882, | |
| "epoch": 1.4067164179104479, | |
| "grad_norm": 0.23706185817718506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542993545532227, | |
| "mean_token_accuracy": 0.7750299721956253, | |
| "num_tokens": 6165444.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5360774844884872, | |
| "epoch": 1.4104477611940298, | |
| "grad_norm": 0.11965668946504593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302645564079285, | |
| "mean_token_accuracy": 0.7849837243556976, | |
| "num_tokens": 6181897.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.546858549118042, | |
| "epoch": 1.414179104477612, | |
| "grad_norm": 0.16231459379196167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448977947235107, | |
| "mean_token_accuracy": 0.7800662368535995, | |
| "num_tokens": 6198254.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5505042523145676, | |
| "epoch": 1.417910447761194, | |
| "grad_norm": 0.16832560300827026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560795247554779, | |
| "mean_token_accuracy": 0.7732271403074265, | |
| "num_tokens": 6214773.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5255255252122879, | |
| "epoch": 1.421641791044776, | |
| "grad_norm": 0.14621268212795258, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310673117637634, | |
| "mean_token_accuracy": 0.7856626063585281, | |
| "num_tokens": 6230937.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.550481304526329, | |
| "epoch": 1.4253731343283582, | |
| "grad_norm": 0.13561075925827026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552341103553772, | |
| "mean_token_accuracy": 0.7767930179834366, | |
| "num_tokens": 6247144.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5227905362844467, | |
| "epoch": 1.4291044776119404, | |
| "grad_norm": 0.13489387929439545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.523324191570282, | |
| "mean_token_accuracy": 0.7840524315834045, | |
| "num_tokens": 6263392.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5366068184375763, | |
| "epoch": 1.4328358208955223, | |
| "grad_norm": 0.14153233170509338, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320409536361694, | |
| "mean_token_accuracy": 0.7857052683830261, | |
| "num_tokens": 6279611.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5510872900485992, | |
| "epoch": 1.4365671641791045, | |
| "grad_norm": 0.16421180963516235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412197709083557, | |
| "mean_token_accuracy": 0.7806995958089828, | |
| "num_tokens": 6296025.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5504460334777832, | |
| "epoch": 1.4402985074626866, | |
| "grad_norm": 0.12805409729480743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456997156143188, | |
| "mean_token_accuracy": 0.7775121033191681, | |
| "num_tokens": 6312415.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5504113882780075, | |
| "epoch": 1.4440298507462686, | |
| "grad_norm": 0.1690564602613449, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432727932929993, | |
| "mean_token_accuracy": 0.7804221510887146, | |
| "num_tokens": 6328728.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5279664844274521, | |
| "epoch": 1.4477611940298507, | |
| "grad_norm": 0.14327631890773773, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324951410293579, | |
| "mean_token_accuracy": 0.7857986390590668, | |
| "num_tokens": 6344947.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.529266320168972, | |
| "epoch": 1.4514925373134329, | |
| "grad_norm": 0.14441367983818054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360409021377563, | |
| "mean_token_accuracy": 0.7844860553741455, | |
| "num_tokens": 6361481.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.5474697202444077, | |
| "epoch": 1.455223880597015, | |
| "grad_norm": 0.17411169409751892, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553131103515625, | |
| "mean_token_accuracy": 0.774516150355339, | |
| "num_tokens": 6378114.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.53204146027565, | |
| "epoch": 1.458955223880597, | |
| "grad_norm": 0.13096541166305542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311554074287415, | |
| "mean_token_accuracy": 0.7832191288471222, | |
| "num_tokens": 6394618.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5468081682920456, | |
| "epoch": 1.462686567164179, | |
| "grad_norm": 0.1281428337097168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487358570098877, | |
| "mean_token_accuracy": 0.7784566432237625, | |
| "num_tokens": 6411033.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5141153857111931, | |
| "epoch": 1.4664179104477613, | |
| "grad_norm": 0.12739789485931396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5161206126213074, | |
| "mean_token_accuracy": 0.7879614979028702, | |
| "num_tokens": 6427279.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5423916280269623, | |
| "epoch": 1.4701492537313432, | |
| "grad_norm": 0.13173308968544006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459262132644653, | |
| "mean_token_accuracy": 0.7773706614971161, | |
| "num_tokens": 6443618.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5373747050762177, | |
| "epoch": 1.4738805970149254, | |
| "grad_norm": 0.13537272810935974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413709878921509, | |
| "mean_token_accuracy": 0.7808920592069626, | |
| "num_tokens": 6459976.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5321269482374191, | |
| "epoch": 1.4776119402985075, | |
| "grad_norm": 0.14240136742591858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354140400886536, | |
| "mean_token_accuracy": 0.7839590162038803, | |
| "num_tokens": 6476177.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5257603526115417, | |
| "epoch": 1.4813432835820897, | |
| "grad_norm": 0.13054870069026947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284422636032104, | |
| "mean_token_accuracy": 0.7869588881731033, | |
| "num_tokens": 6492490.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5265851616859436, | |
| "epoch": 1.4850746268656716, | |
| "grad_norm": 0.13740919530391693, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526134192943573, | |
| "mean_token_accuracy": 0.7872523069381714, | |
| "num_tokens": 6508878.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5212059766054153, | |
| "epoch": 1.4888059701492538, | |
| "grad_norm": 0.13339075446128845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221821665763855, | |
| "mean_token_accuracy": 0.7905861139297485, | |
| "num_tokens": 6525084.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.537382185459137, | |
| "epoch": 1.4925373134328357, | |
| "grad_norm": 0.13736183941364288, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351852774620056, | |
| "mean_token_accuracy": 0.7818522453308105, | |
| "num_tokens": 6541545.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5340493619441986, | |
| "epoch": 1.4962686567164178, | |
| "grad_norm": 0.1368023306131363, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317674279212952, | |
| "mean_token_accuracy": 0.7867089211940765, | |
| "num_tokens": 6557867.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5713642686605453, | |
| "epoch": 1.5, | |
| "grad_norm": 0.12573114037513733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638826489448547, | |
| "mean_token_accuracy": 0.773875430226326, | |
| "num_tokens": 6574428.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5415615439414978, | |
| "epoch": 1.5037313432835822, | |
| "grad_norm": 0.14824476838111877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452718734741211, | |
| "mean_token_accuracy": 0.7793742418289185, | |
| "num_tokens": 6590740.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5316762626171112, | |
| "epoch": 1.5074626865671643, | |
| "grad_norm": 0.13510265946388245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399596691131592, | |
| "mean_token_accuracy": 0.7803886234760284, | |
| "num_tokens": 6606963.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5310466289520264, | |
| "epoch": 1.5111940298507462, | |
| "grad_norm": 0.1413303166627884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532017707824707, | |
| "mean_token_accuracy": 0.7846063524484634, | |
| "num_tokens": 6623504.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5623253732919693, | |
| "epoch": 1.5149253731343284, | |
| "grad_norm": 0.1327054351568222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590583682060242, | |
| "mean_token_accuracy": 0.7741520255804062, | |
| "num_tokens": 6639880.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5222483575344086, | |
| "epoch": 1.5186567164179103, | |
| "grad_norm": 0.14219273626804352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221630930900574, | |
| "mean_token_accuracy": 0.7884060740470886, | |
| "num_tokens": 6656372.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5361650884151459, | |
| "epoch": 1.5223880597014925, | |
| "grad_norm": 0.14150315523147583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426543951034546, | |
| "mean_token_accuracy": 0.7794915586709976, | |
| "num_tokens": 6672460.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5405853539705276, | |
| "epoch": 1.5261194029850746, | |
| "grad_norm": 0.12867780029773712, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545219361782074, | |
| "mean_token_accuracy": 0.7802143394947052, | |
| "num_tokens": 6688740.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5196312442421913, | |
| "epoch": 1.5298507462686568, | |
| "grad_norm": 0.12933768332004547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524722695350647, | |
| "mean_token_accuracy": 0.7893691807985306, | |
| "num_tokens": 6704798.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5358741357922554, | |
| "epoch": 1.533582089552239, | |
| "grad_norm": 0.14841386675834656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425981879234314, | |
| "mean_token_accuracy": 0.7796852141618729, | |
| "num_tokens": 6720982.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5389422178268433, | |
| "epoch": 1.537313432835821, | |
| "grad_norm": 0.12372686713933945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368393063545227, | |
| "mean_token_accuracy": 0.7788573652505875, | |
| "num_tokens": 6737135.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5395499765872955, | |
| "epoch": 1.5410447761194028, | |
| "grad_norm": 0.1355394721031189, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324706435203552, | |
| "mean_token_accuracy": 0.7823397219181061, | |
| "num_tokens": 6753507.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5506737977266312, | |
| "epoch": 1.544776119402985, | |
| "grad_norm": 0.11822586506605148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447027087211609, | |
| "mean_token_accuracy": 0.7776395529508591, | |
| "num_tokens": 6769726.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5393240600824356, | |
| "epoch": 1.5485074626865671, | |
| "grad_norm": 0.1220259889960289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348957180976868, | |
| "mean_token_accuracy": 0.7820345014333725, | |
| "num_tokens": 6786148.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5258119255304337, | |
| "epoch": 1.5522388059701493, | |
| "grad_norm": 0.15211379528045654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274648666381836, | |
| "mean_token_accuracy": 0.7861866801977158, | |
| "num_tokens": 6802290.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5310887396335602, | |
| "epoch": 1.5559701492537314, | |
| "grad_norm": 0.1319982260465622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339083075523376, | |
| "mean_token_accuracy": 0.7847474962472916, | |
| "num_tokens": 6818697.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5216883644461632, | |
| "epoch": 1.5597014925373134, | |
| "grad_norm": 0.13150501251220703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5250256061553955, | |
| "mean_token_accuracy": 0.7854708880186081, | |
| "num_tokens": 6834860.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5280915200710297, | |
| "epoch": 1.5634328358208955, | |
| "grad_norm": 0.13087767362594604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294699668884277, | |
| "mean_token_accuracy": 0.7844147831201553, | |
| "num_tokens": 6850977.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5455043911933899, | |
| "epoch": 1.5671641791044775, | |
| "grad_norm": 0.13152527809143066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411855578422546, | |
| "mean_token_accuracy": 0.7831065207719803, | |
| "num_tokens": 6867436.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5421444773674011, | |
| "epoch": 1.5708955223880596, | |
| "grad_norm": 0.12552635371685028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404070615768433, | |
| "mean_token_accuracy": 0.7799917608499527, | |
| "num_tokens": 6883739.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5469988659024239, | |
| "epoch": 1.5746268656716418, | |
| "grad_norm": 0.12713049352169037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506969690322876, | |
| "mean_token_accuracy": 0.7773310244083405, | |
| "num_tokens": 6899931.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5409539192914963, | |
| "epoch": 1.578358208955224, | |
| "grad_norm": 0.12043388932943344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393781661987305, | |
| "mean_token_accuracy": 0.7821668684482574, | |
| "num_tokens": 6916555.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5323537066578865, | |
| "epoch": 1.582089552238806, | |
| "grad_norm": 0.15053188800811768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387845039367676, | |
| "mean_token_accuracy": 0.7825682461261749, | |
| "num_tokens": 6932929.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5519883185625076, | |
| "epoch": 1.585820895522388, | |
| "grad_norm": 0.1525130718946457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56787109375, | |
| "mean_token_accuracy": 0.7703519463539124, | |
| "num_tokens": 6949313.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5393707901239395, | |
| "epoch": 1.5895522388059702, | |
| "grad_norm": 0.14073340594768524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375410914421082, | |
| "mean_token_accuracy": 0.7814988791942596, | |
| "num_tokens": 6965684.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.5354568511247635, | |
| "epoch": 1.5932835820895521, | |
| "grad_norm": 0.13749349117279053, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318333506584167, | |
| "mean_token_accuracy": 0.7864338159561157, | |
| "num_tokens": 6982013.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5405145287513733, | |
| "epoch": 1.5970149253731343, | |
| "grad_norm": 0.12070662528276443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362390279769897, | |
| "mean_token_accuracy": 0.7832798510789871, | |
| "num_tokens": 6998503.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5447606593370438, | |
| "epoch": 1.6007462686567164, | |
| "grad_norm": 0.1386427879333496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441482663154602, | |
| "mean_token_accuracy": 0.778590589761734, | |
| "num_tokens": 7014770.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5470203310251236, | |
| "epoch": 1.6044776119402986, | |
| "grad_norm": 0.13212502002716064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490391850471497, | |
| "mean_token_accuracy": 0.7765385806560516, | |
| "num_tokens": 7030922.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5170739889144897, | |
| "epoch": 1.6082089552238807, | |
| "grad_norm": 0.13961301743984222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5210376381874084, | |
| "mean_token_accuracy": 0.7884235680103302, | |
| "num_tokens": 7047216.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5377417504787445, | |
| "epoch": 1.6119402985074627, | |
| "grad_norm": 0.13901281356811523, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376747846603394, | |
| "mean_token_accuracy": 0.7830623835325241, | |
| "num_tokens": 7063307.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5414564162492752, | |
| "epoch": 1.6156716417910446, | |
| "grad_norm": 0.1463043987751007, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473238825798035, | |
| "mean_token_accuracy": 0.7770842909812927, | |
| "num_tokens": 7079707.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5415572673082352, | |
| "epoch": 1.6194029850746268, | |
| "grad_norm": 0.11891120672225952, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387373566627502, | |
| "mean_token_accuracy": 0.779969111084938, | |
| "num_tokens": 7095980.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5542661100625992, | |
| "epoch": 1.623134328358209, | |
| "grad_norm": 0.13271500170230865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507120490074158, | |
| "mean_token_accuracy": 0.7779867500066757, | |
| "num_tokens": 7112556.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.54887755215168, | |
| "epoch": 1.626865671641791, | |
| "grad_norm": 0.13373985886573792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447692275047302, | |
| "mean_token_accuracy": 0.7798765897750854, | |
| "num_tokens": 7128802.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5222520381212234, | |
| "epoch": 1.6305970149253732, | |
| "grad_norm": 0.1277901828289032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239554643630981, | |
| "mean_token_accuracy": 0.785177692770958, | |
| "num_tokens": 7145060.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.53469417989254, | |
| "epoch": 1.6343283582089554, | |
| "grad_norm": 0.20547546446323395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367586612701416, | |
| "mean_token_accuracy": 0.7803931534290314, | |
| "num_tokens": 7161527.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.521802693605423, | |
| "epoch": 1.6380597014925373, | |
| "grad_norm": 0.16560786962509155, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228012204170227, | |
| "mean_token_accuracy": 0.7887944877147675, | |
| "num_tokens": 7178091.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5338825434446335, | |
| "epoch": 1.6417910447761193, | |
| "grad_norm": 0.1590629667043686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402793288230896, | |
| "mean_token_accuracy": 0.7781020998954773, | |
| "num_tokens": 7194244.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5395276695489883, | |
| "epoch": 1.6455223880597014, | |
| "grad_norm": 0.14088116586208344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401326417922974, | |
| "mean_token_accuracy": 0.781720831990242, | |
| "num_tokens": 7210451.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5567539632320404, | |
| "epoch": 1.6492537313432836, | |
| "grad_norm": 0.19292442500591278, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627314448356628, | |
| "mean_token_accuracy": 0.7719661146402359, | |
| "num_tokens": 7226996.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.534116804599762, | |
| "epoch": 1.6529850746268657, | |
| "grad_norm": 0.1254442036151886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533519983291626, | |
| "mean_token_accuracy": 0.7840958386659622, | |
| "num_tokens": 7243430.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5330116599798203, | |
| "epoch": 1.6567164179104479, | |
| "grad_norm": 0.1718529760837555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330148339271545, | |
| "mean_token_accuracy": 0.7830322086811066, | |
| "num_tokens": 7259764.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5424318462610245, | |
| "epoch": 1.6604477611940298, | |
| "grad_norm": 0.13064436614513397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422405004501343, | |
| "mean_token_accuracy": 0.7796443551778793, | |
| "num_tokens": 7276147.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.555829331278801, | |
| "epoch": 1.664179104477612, | |
| "grad_norm": 0.12649741768836975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439899563789368, | |
| "mean_token_accuracy": 0.7798557877540588, | |
| "num_tokens": 7292719.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5564119815826416, | |
| "epoch": 1.667910447761194, | |
| "grad_norm": 0.140034019947052, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546625256538391, | |
| "mean_token_accuracy": 0.7761601060628891, | |
| "num_tokens": 7309242.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5416673123836517, | |
| "epoch": 1.671641791044776, | |
| "grad_norm": 0.1388692855834961, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541693389415741, | |
| "mean_token_accuracy": 0.7807905972003937, | |
| "num_tokens": 7325872.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5325654745101929, | |
| "epoch": 1.6753731343283582, | |
| "grad_norm": 0.1330399215221405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375967025756836, | |
| "mean_token_accuracy": 0.780772253870964, | |
| "num_tokens": 7342461.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5460408478975296, | |
| "epoch": 1.6791044776119404, | |
| "grad_norm": 0.1698281317949295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483989119529724, | |
| "mean_token_accuracy": 0.7757564038038254, | |
| "num_tokens": 7358926.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5587185472249985, | |
| "epoch": 1.6828358208955225, | |
| "grad_norm": 0.150365948677063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607273578643799, | |
| "mean_token_accuracy": 0.7735442072153091, | |
| "num_tokens": 7375472.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5546591132879257, | |
| "epoch": 1.6865671641791045, | |
| "grad_norm": 0.13346362113952637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498383045196533, | |
| "mean_token_accuracy": 0.7771503031253815, | |
| "num_tokens": 7391758.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5380023121833801, | |
| "epoch": 1.6902985074626866, | |
| "grad_norm": 0.15642641484737396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540310263633728, | |
| "mean_token_accuracy": 0.7800187021493912, | |
| "num_tokens": 7407943.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5107243284583092, | |
| "epoch": 1.6940298507462686, | |
| "grad_norm": 0.1413007378578186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5198100209236145, | |
| "mean_token_accuracy": 0.7903516441583633, | |
| "num_tokens": 7424142.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5318749994039536, | |
| "epoch": 1.6977611940298507, | |
| "grad_norm": 0.13885854184627533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412630438804626, | |
| "mean_token_accuracy": 0.7793916463851929, | |
| "num_tokens": 7440451.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5525089502334595, | |
| "epoch": 1.7014925373134329, | |
| "grad_norm": 0.12943100929260254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551573634147644, | |
| "mean_token_accuracy": 0.7760037779808044, | |
| "num_tokens": 7456977.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5402176976203918, | |
| "epoch": 1.705223880597015, | |
| "grad_norm": 0.15211442112922668, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398398041725159, | |
| "mean_token_accuracy": 0.779134064912796, | |
| "num_tokens": 7473154.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5625119209289551, | |
| "epoch": 1.7089552238805972, | |
| "grad_norm": 0.12840458750724792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544787645339966, | |
| "mean_token_accuracy": 0.7756093442440033, | |
| "num_tokens": 7489492.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5442609488964081, | |
| "epoch": 1.712686567164179, | |
| "grad_norm": 0.13839711248874664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437784194946289, | |
| "mean_token_accuracy": 0.7818922996520996, | |
| "num_tokens": 7505874.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5575658231973648, | |
| "epoch": 1.716417910447761, | |
| "grad_norm": 0.14238221943378448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5612136125564575, | |
| "mean_token_accuracy": 0.7718513458967209, | |
| "num_tokens": 7522288.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.535207062959671, | |
| "epoch": 1.7201492537313432, | |
| "grad_norm": 0.13308024406433105, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384257435798645, | |
| "mean_token_accuracy": 0.7802019715309143, | |
| "num_tokens": 7538764.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5290672108530998, | |
| "epoch": 1.7238805970149254, | |
| "grad_norm": 0.14699077606201172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533920168876648, | |
| "mean_token_accuracy": 0.7809716016054153, | |
| "num_tokens": 7555048.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5349759012460709, | |
| "epoch": 1.7276119402985075, | |
| "grad_norm": 0.13993169367313385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397127866744995, | |
| "mean_token_accuracy": 0.781706914305687, | |
| "num_tokens": 7571331.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5471459329128265, | |
| "epoch": 1.7313432835820897, | |
| "grad_norm": 0.1270606368780136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457655191421509, | |
| "mean_token_accuracy": 0.7785040736198425, | |
| "num_tokens": 7587268.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5576677769422531, | |
| "epoch": 1.7350746268656716, | |
| "grad_norm": 0.13001851737499237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535344481468201, | |
| "mean_token_accuracy": 0.7747954726219177, | |
| "num_tokens": 7603468.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5527965128421783, | |
| "epoch": 1.7388059701492538, | |
| "grad_norm": 0.11191874742507935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493273138999939, | |
| "mean_token_accuracy": 0.7783663272857666, | |
| "num_tokens": 7619861.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5458428710699081, | |
| "epoch": 1.7425373134328357, | |
| "grad_norm": 0.12890613079071045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422653555870056, | |
| "mean_token_accuracy": 0.7804641127586365, | |
| "num_tokens": 7636365.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.5396646112203598, | |
| "epoch": 1.7462686567164178, | |
| "grad_norm": 0.14643065631389618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540531575679779, | |
| "mean_token_accuracy": 0.7787915468215942, | |
| "num_tokens": 7652695.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5489283800125122, | |
| "epoch": 1.75, | |
| "grad_norm": 0.12856297194957733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493489503860474, | |
| "mean_token_accuracy": 0.7765475660562515, | |
| "num_tokens": 7669417.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5371540188789368, | |
| "epoch": 1.7537313432835822, | |
| "grad_norm": 0.1448490023612976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445014238357544, | |
| "mean_token_accuracy": 0.7786155045032501, | |
| "num_tokens": 7685950.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5441175699234009, | |
| "epoch": 1.7574626865671643, | |
| "grad_norm": 0.1417449563741684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456334352493286, | |
| "mean_token_accuracy": 0.7806714922189713, | |
| "num_tokens": 7702096.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.534687414765358, | |
| "epoch": 1.7611940298507462, | |
| "grad_norm": 0.13397443294525146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369069576263428, | |
| "mean_token_accuracy": 0.7817386239767075, | |
| "num_tokens": 7718461.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5490274131298065, | |
| "epoch": 1.7649253731343284, | |
| "grad_norm": 0.1352432817220688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512405633926392, | |
| "mean_token_accuracy": 0.7781344056129456, | |
| "num_tokens": 7734927.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.5476491898298264, | |
| "epoch": 1.7686567164179103, | |
| "grad_norm": 0.13750651478767395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536763668060303, | |
| "mean_token_accuracy": 0.7743410021066666, | |
| "num_tokens": 7751415.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.524419367313385, | |
| "epoch": 1.7723880597014925, | |
| "grad_norm": 0.13306710124015808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263890624046326, | |
| "mean_token_accuracy": 0.7842015773057938, | |
| "num_tokens": 7767584.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5515109747648239, | |
| "epoch": 1.7761194029850746, | |
| "grad_norm": 0.13014942407608032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546906590461731, | |
| "mean_token_accuracy": 0.7791758924722672, | |
| "num_tokens": 7783929.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5460219085216522, | |
| "epoch": 1.7798507462686568, | |
| "grad_norm": 0.12750543653964996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416713953018188, | |
| "mean_token_accuracy": 0.7796966135501862, | |
| "num_tokens": 7800322.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5496509969234467, | |
| "epoch": 1.783582089552239, | |
| "grad_norm": 0.14019764959812164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501259565353394, | |
| "mean_token_accuracy": 0.7778430730104446, | |
| "num_tokens": 7816728.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5484806597232819, | |
| "epoch": 1.787313432835821, | |
| "grad_norm": 0.12671294808387756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546718418598175, | |
| "mean_token_accuracy": 0.7767283469438553, | |
| "num_tokens": 7833182.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5313283354043961, | |
| "epoch": 1.7910447761194028, | |
| "grad_norm": 0.16472716629505157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414275527000427, | |
| "mean_token_accuracy": 0.7815513163805008, | |
| "num_tokens": 7849402.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.516701802611351, | |
| "epoch": 1.794776119402985, | |
| "grad_norm": 0.157722607254982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291575789451599, | |
| "mean_token_accuracy": 0.7844545841217041, | |
| "num_tokens": 7865503.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5476036965847015, | |
| "epoch": 1.7985074626865671, | |
| "grad_norm": 0.16708603501319885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535966157913208, | |
| "mean_token_accuracy": 0.7750539481639862, | |
| "num_tokens": 7881822.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.5405763983726501, | |
| "epoch": 1.8022388059701493, | |
| "grad_norm": 0.12333223968744278, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385177731513977, | |
| "mean_token_accuracy": 0.7838984429836273, | |
| "num_tokens": 7898111.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5573789775371552, | |
| "epoch": 1.8059701492537314, | |
| "grad_norm": 0.14407449960708618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541386067867279, | |
| "mean_token_accuracy": 0.7797874957323074, | |
| "num_tokens": 7914518.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5439587533473969, | |
| "epoch": 1.8097014925373134, | |
| "grad_norm": 0.1654428094625473, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336223244667053, | |
| "mean_token_accuracy": 0.7846554070711136, | |
| "num_tokens": 7930884.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.536734089255333, | |
| "epoch": 1.8134328358208955, | |
| "grad_norm": 0.15028727054595947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363267660140991, | |
| "mean_token_accuracy": 0.786723256111145, | |
| "num_tokens": 7947486.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5316303819417953, | |
| "epoch": 1.8171641791044775, | |
| "grad_norm": 0.2185370773077011, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426980257034302, | |
| "mean_token_accuracy": 0.7816258370876312, | |
| "num_tokens": 7963754.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5372888445854187, | |
| "epoch": 1.8208955223880596, | |
| "grad_norm": 0.14039121568202972, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452357530593872, | |
| "mean_token_accuracy": 0.7777333706617355, | |
| "num_tokens": 7980178.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.561303973197937, | |
| "epoch": 1.8246268656716418, | |
| "grad_norm": 0.2095021903514862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606201887130737, | |
| "mean_token_accuracy": 0.7701640874147415, | |
| "num_tokens": 7996414.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5401351600885391, | |
| "epoch": 1.828358208955224, | |
| "grad_norm": 0.13168978691101074, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416175723075867, | |
| "mean_token_accuracy": 0.7801533341407776, | |
| "num_tokens": 8012578.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.5480149686336517, | |
| "epoch": 1.832089552238806, | |
| "grad_norm": 0.18209180235862732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433698892593384, | |
| "mean_token_accuracy": 0.7793498337268829, | |
| "num_tokens": 8029063.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5556472986936569, | |
| "epoch": 1.835820895522388, | |
| "grad_norm": 0.14936800301074982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554640293121338, | |
| "mean_token_accuracy": 0.7756128907203674, | |
| "num_tokens": 8045335.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.551779106259346, | |
| "epoch": 1.8395522388059702, | |
| "grad_norm": 0.16466236114501953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527586936950684, | |
| "mean_token_accuracy": 0.7768742144107819, | |
| "num_tokens": 8061746.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5395959764719009, | |
| "epoch": 1.8432835820895521, | |
| "grad_norm": 0.17139406502246857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481644868850708, | |
| "mean_token_accuracy": 0.7803965657949448, | |
| "num_tokens": 8078227.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.544280469417572, | |
| "epoch": 1.8470149253731343, | |
| "grad_norm": 0.14393140375614166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55059415102005, | |
| "mean_token_accuracy": 0.7759814560413361, | |
| "num_tokens": 8094667.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5303434431552887, | |
| "epoch": 1.8507462686567164, | |
| "grad_norm": 0.16556651890277863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530941903591156, | |
| "mean_token_accuracy": 0.7859343141317368, | |
| "num_tokens": 8110787.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5236229598522186, | |
| "epoch": 1.8544776119402986, | |
| "grad_norm": 0.12482267618179321, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5197535753250122, | |
| "mean_token_accuracy": 0.7890704125165939, | |
| "num_tokens": 8127133.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5396426022052765, | |
| "epoch": 1.8582089552238807, | |
| "grad_norm": 0.1538504958152771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361296534538269, | |
| "mean_token_accuracy": 0.7814654260873795, | |
| "num_tokens": 8143434.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.5484279841184616, | |
| "epoch": 1.8619402985074627, | |
| "grad_norm": 0.14813822507858276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464996695518494, | |
| "mean_token_accuracy": 0.7787739634513855, | |
| "num_tokens": 8159903.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.519238218665123, | |
| "epoch": 1.8656716417910446, | |
| "grad_norm": 0.13267366588115692, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5259124040603638, | |
| "mean_token_accuracy": 0.7888814806938171, | |
| "num_tokens": 8176179.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5393799841403961, | |
| "epoch": 1.8694029850746268, | |
| "grad_norm": 0.1923193484544754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401571989059448, | |
| "mean_token_accuracy": 0.7801343649625778, | |
| "num_tokens": 8192554.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.532251313328743, | |
| "epoch": 1.873134328358209, | |
| "grad_norm": 0.13894309103488922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527220606803894, | |
| "mean_token_accuracy": 0.7864662110805511, | |
| "num_tokens": 8208849.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5306680351495743, | |
| "epoch": 1.876865671641791, | |
| "grad_norm": 0.1474749892950058, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287739038467407, | |
| "mean_token_accuracy": 0.7855399250984192, | |
| "num_tokens": 8225218.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5300537943840027, | |
| "epoch": 1.8805970149253732, | |
| "grad_norm": 0.1491105705499649, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314114093780518, | |
| "mean_token_accuracy": 0.7854063659906387, | |
| "num_tokens": 8241422.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5309967398643494, | |
| "epoch": 1.8843283582089554, | |
| "grad_norm": 0.15464921295642853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415985584259033, | |
| "mean_token_accuracy": 0.7829921096563339, | |
| "num_tokens": 8257677.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5376427173614502, | |
| "epoch": 1.8880597014925373, | |
| "grad_norm": 0.1445028930902481, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402049422264099, | |
| "mean_token_accuracy": 0.781824991106987, | |
| "num_tokens": 8274079.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5335574001073837, | |
| "epoch": 1.8917910447761193, | |
| "grad_norm": 0.12303903698921204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530457079410553, | |
| "mean_token_accuracy": 0.7857005745172501, | |
| "num_tokens": 8290576.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5357225090265274, | |
| "epoch": 1.8955223880597014, | |
| "grad_norm": 0.14474186301231384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326468348503113, | |
| "mean_token_accuracy": 0.7827298194169998, | |
| "num_tokens": 8306959.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5418558418750763, | |
| "epoch": 1.8992537313432836, | |
| "grad_norm": 0.13205651938915253, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394735932350159, | |
| "mean_token_accuracy": 0.7811231166124344, | |
| "num_tokens": 8323198.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.5494632720947266, | |
| "epoch": 1.9029850746268657, | |
| "grad_norm": 0.13867227733135223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512980818748474, | |
| "mean_token_accuracy": 0.7792128920555115, | |
| "num_tokens": 8339407.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.527800902724266, | |
| "epoch": 1.9067164179104479, | |
| "grad_norm": 0.1300196498632431, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310680866241455, | |
| "mean_token_accuracy": 0.7856706976890564, | |
| "num_tokens": 8355694.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5433302372694016, | |
| "epoch": 1.9104477611940298, | |
| "grad_norm": 0.16294771432876587, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532437562942505, | |
| "mean_token_accuracy": 0.7759810388088226, | |
| "num_tokens": 8371710.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.5244318097829819, | |
| "epoch": 1.914179104477612, | |
| "grad_norm": 0.13300037384033203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271862149238586, | |
| "mean_token_accuracy": 0.7844917327165604, | |
| "num_tokens": 8387964.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5421733111143112, | |
| "epoch": 1.917910447761194, | |
| "grad_norm": 0.12434980273246765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377052426338196, | |
| "mean_token_accuracy": 0.7836858928203583, | |
| "num_tokens": 8404373.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5615102648735046, | |
| "epoch": 1.921641791044776, | |
| "grad_norm": 0.1264066845178604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558891236782074, | |
| "mean_token_accuracy": 0.7723990976810455, | |
| "num_tokens": 8420907.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5428318381309509, | |
| "epoch": 1.9253731343283582, | |
| "grad_norm": 0.13190090656280518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374886393547058, | |
| "mean_token_accuracy": 0.7830605953931808, | |
| "num_tokens": 8437255.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5324592739343643, | |
| "epoch": 1.9291044776119404, | |
| "grad_norm": 0.13782039284706116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368908643722534, | |
| "mean_token_accuracy": 0.7810968607664108, | |
| "num_tokens": 8453657.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.563809260725975, | |
| "epoch": 1.9328358208955225, | |
| "grad_norm": 0.11932537704706192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596674680709839, | |
| "mean_token_accuracy": 0.7723207473754883, | |
| "num_tokens": 8470566.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.550938680768013, | |
| "epoch": 1.9365671641791045, | |
| "grad_norm": 0.13882781565189362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502666234970093, | |
| "mean_token_accuracy": 0.7773875147104263, | |
| "num_tokens": 8486896.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5509646236896515, | |
| "epoch": 1.9402985074626866, | |
| "grad_norm": 0.11496590822935104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537518262863159, | |
| "mean_token_accuracy": 0.7762430608272552, | |
| "num_tokens": 8503486.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5208418220281601, | |
| "epoch": 1.9440298507462686, | |
| "grad_norm": 0.12605132162570953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253016948699951, | |
| "mean_token_accuracy": 0.7866884917020798, | |
| "num_tokens": 8519722.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5348703861236572, | |
| "epoch": 1.9477611940298507, | |
| "grad_norm": 0.13436545431613922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429031252861023, | |
| "mean_token_accuracy": 0.7784363180398941, | |
| "num_tokens": 8536094.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5374516993761063, | |
| "epoch": 1.9514925373134329, | |
| "grad_norm": 0.1355811506509781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394662618637085, | |
| "mean_token_accuracy": 0.7806121855974197, | |
| "num_tokens": 8552288.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5625811666250229, | |
| "epoch": 1.955223880597015, | |
| "grad_norm": 0.11836230754852295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579893589019775, | |
| "mean_token_accuracy": 0.7714975476264954, | |
| "num_tokens": 8568760.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5421487241983414, | |
| "epoch": 1.9589552238805972, | |
| "grad_norm": 0.1359013170003891, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385461449623108, | |
| "mean_token_accuracy": 0.7821292132139206, | |
| "num_tokens": 8585317.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5259972438216209, | |
| "epoch": 1.962686567164179, | |
| "grad_norm": 0.1390962302684784, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276076793670654, | |
| "mean_token_accuracy": 0.785026952624321, | |
| "num_tokens": 8601637.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5354560762643814, | |
| "epoch": 1.966417910447761, | |
| "grad_norm": 0.13758784532546997, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364598035812378, | |
| "mean_token_accuracy": 0.782847136259079, | |
| "num_tokens": 8617902.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5353007912635803, | |
| "epoch": 1.9701492537313432, | |
| "grad_norm": 0.16679321229457855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458345413208008, | |
| "mean_token_accuracy": 0.7779222279787064, | |
| "num_tokens": 8634235.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5326858758926392, | |
| "epoch": 1.9738805970149254, | |
| "grad_norm": 0.1427498161792755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339992642402649, | |
| "mean_token_accuracy": 0.7820619940757751, | |
| "num_tokens": 8650417.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5444169491529465, | |
| "epoch": 1.9776119402985075, | |
| "grad_norm": 0.12751619517803192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337543487548828, | |
| "mean_token_accuracy": 0.7827389687299728, | |
| "num_tokens": 8666763.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5495491325855255, | |
| "epoch": 1.9813432835820897, | |
| "grad_norm": 0.13329073786735535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403661131858826, | |
| "mean_token_accuracy": 0.7817551493644714, | |
| "num_tokens": 8683086.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.545268103480339, | |
| "epoch": 1.9850746268656716, | |
| "grad_norm": 0.1334519237279892, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446645021438599, | |
| "mean_token_accuracy": 0.7789036780595779, | |
| "num_tokens": 8699314.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5360117256641388, | |
| "epoch": 1.9888059701492538, | |
| "grad_norm": 0.1417427510023117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377262830734253, | |
| "mean_token_accuracy": 0.782628983259201, | |
| "num_tokens": 8715712.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.539160817861557, | |
| "epoch": 1.9925373134328357, | |
| "grad_norm": 0.13969334959983826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430911779403687, | |
| "mean_token_accuracy": 0.7803932130336761, | |
| "num_tokens": 8732278.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5323211252689362, | |
| "epoch": 1.9962686567164178, | |
| "grad_norm": 0.13230480253696442, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352569818496704, | |
| "mean_token_accuracy": 0.7800516188144684, | |
| "num_tokens": 8748639.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5396020114421844, | |
| "epoch": 2.0, | |
| "grad_norm": 0.13588403165340424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420472025871277, | |
| "mean_token_accuracy": 0.7812368422746658, | |
| "num_tokens": 8765023.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5363707542419434, | |
| "epoch": 2.003731343283582, | |
| "grad_norm": 0.13683520257472992, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5242169499397278, | |
| "mean_token_accuracy": 0.7884830236434937, | |
| "num_tokens": 8781503.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5355663001537323, | |
| "epoch": 2.0074626865671643, | |
| "grad_norm": 0.1606767475605011, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340245962142944, | |
| "mean_token_accuracy": 0.7837463468313217, | |
| "num_tokens": 8797833.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5198972821235657, | |
| "epoch": 2.0111940298507465, | |
| "grad_norm": 0.1832306683063507, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5226503014564514, | |
| "mean_token_accuracy": 0.7878277599811554, | |
| "num_tokens": 8814387.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5145581886172295, | |
| "epoch": 2.014925373134328, | |
| "grad_norm": 0.14004163444042206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5142262578010559, | |
| "mean_token_accuracy": 0.7930136620998383, | |
| "num_tokens": 8830769.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.518964596092701, | |
| "epoch": 2.0186567164179103, | |
| "grad_norm": 0.2391389012336731, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318617224693298, | |
| "mean_token_accuracy": 0.7879888862371445, | |
| "num_tokens": 8847079.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5112362876534462, | |
| "epoch": 2.0223880597014925, | |
| "grad_norm": 0.1571192741394043, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5111895799636841, | |
| "mean_token_accuracy": 0.7941466271877289, | |
| "num_tokens": 8863455.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5289383679628372, | |
| "epoch": 2.0261194029850746, | |
| "grad_norm": 0.18859665095806122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321269035339355, | |
| "mean_token_accuracy": 0.7850861251354218, | |
| "num_tokens": 8879933.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5038495659828186, | |
| "epoch": 2.029850746268657, | |
| "grad_norm": 0.1459927260875702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5009663105010986, | |
| "mean_token_accuracy": 0.800191804766655, | |
| "num_tokens": 8896279.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5393158346414566, | |
| "epoch": 2.033582089552239, | |
| "grad_norm": 0.18940559029579163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331785678863525, | |
| "mean_token_accuracy": 0.785183385014534, | |
| "num_tokens": 8912807.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.5186864137649536, | |
| "epoch": 2.0373134328358207, | |
| "grad_norm": 0.13405749201774597, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5130364894866943, | |
| "mean_token_accuracy": 0.7902890145778656, | |
| "num_tokens": 8929085.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.517152339220047, | |
| "epoch": 2.041044776119403, | |
| "grad_norm": 0.2357271909713745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223183631896973, | |
| "mean_token_accuracy": 0.7909936606884003, | |
| "num_tokens": 8945205.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.504429779946804, | |
| "epoch": 2.044776119402985, | |
| "grad_norm": 0.16896866261959076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5084525942802429, | |
| "mean_token_accuracy": 0.7927258014678955, | |
| "num_tokens": 8961586.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5195313468575478, | |
| "epoch": 2.048507462686567, | |
| "grad_norm": 0.16998501121997833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220100283622742, | |
| "mean_token_accuracy": 0.7873262912034988, | |
| "num_tokens": 8978096.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5092991963028908, | |
| "epoch": 2.0522388059701493, | |
| "grad_norm": 0.18961496651172638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5134435892105103, | |
| "mean_token_accuracy": 0.7906353622674942, | |
| "num_tokens": 8994217.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5130208507180214, | |
| "epoch": 2.0559701492537314, | |
| "grad_norm": 0.15812328457832336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5057437419891357, | |
| "mean_token_accuracy": 0.7933137118816376, | |
| "num_tokens": 9010450.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5244034826755524, | |
| "epoch": 2.0597014925373136, | |
| "grad_norm": 0.17014764249324799, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5208017230033875, | |
| "mean_token_accuracy": 0.7864028364419937, | |
| "num_tokens": 9026690.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.524794228374958, | |
| "epoch": 2.0634328358208953, | |
| "grad_norm": 0.1528615653514862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5251787900924683, | |
| "mean_token_accuracy": 0.7868095934391022, | |
| "num_tokens": 9042889.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.525935024023056, | |
| "epoch": 2.0671641791044775, | |
| "grad_norm": 0.1623958796262741, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336424708366394, | |
| "mean_token_accuracy": 0.7855145633220673, | |
| "num_tokens": 9059267.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5195625573396683, | |
| "epoch": 2.0708955223880596, | |
| "grad_norm": 0.17523802816867828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5209751725196838, | |
| "mean_token_accuracy": 0.7891881316900253, | |
| "num_tokens": 9075744.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.5318533927202225, | |
| "epoch": 2.074626865671642, | |
| "grad_norm": 0.16624799370765686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274427533149719, | |
| "mean_token_accuracy": 0.7851865887641907, | |
| "num_tokens": 9092196.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5313673615455627, | |
| "epoch": 2.078358208955224, | |
| "grad_norm": 0.16823066771030426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263111591339111, | |
| "mean_token_accuracy": 0.7885167598724365, | |
| "num_tokens": 9108431.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.518197163939476, | |
| "epoch": 2.082089552238806, | |
| "grad_norm": 0.18068267405033112, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5193851590156555, | |
| "mean_token_accuracy": 0.7903801500797272, | |
| "num_tokens": 9124741.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5107997804880142, | |
| "epoch": 2.0858208955223883, | |
| "grad_norm": 0.15915489196777344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5146846771240234, | |
| "mean_token_accuracy": 0.7921037524938583, | |
| "num_tokens": 9141112.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5317652374505997, | |
| "epoch": 2.08955223880597, | |
| "grad_norm": 0.18767035007476807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400185585021973, | |
| "mean_token_accuracy": 0.7800605148077011, | |
| "num_tokens": 9157563.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5086512267589569, | |
| "epoch": 2.093283582089552, | |
| "grad_norm": 0.1544736921787262, | |
| "learning_rate": 0.0002, | |
| "loss": 0.508223831653595, | |
| "mean_token_accuracy": 0.7939174175262451, | |
| "num_tokens": 9173854.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.52768574655056, | |
| "epoch": 2.0970149253731343, | |
| "grad_norm": 0.17799650132656097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289405584335327, | |
| "mean_token_accuracy": 0.7851383984088898, | |
| "num_tokens": 9190112.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.5307039618492126, | |
| "epoch": 2.1007462686567164, | |
| "grad_norm": 0.1469665914773941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241371989250183, | |
| "mean_token_accuracy": 0.7877105623483658, | |
| "num_tokens": 9206476.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.517830565571785, | |
| "epoch": 2.1044776119402986, | |
| "grad_norm": 0.1440608948469162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5123553276062012, | |
| "mean_token_accuracy": 0.7936355024576187, | |
| "num_tokens": 9222843.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.523407056927681, | |
| "epoch": 2.1082089552238807, | |
| "grad_norm": 0.21014799177646637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5186851620674133, | |
| "mean_token_accuracy": 0.792457640171051, | |
| "num_tokens": 9239327.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5128730833530426, | |
| "epoch": 2.111940298507463, | |
| "grad_norm": 0.2577928602695465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269497632980347, | |
| "mean_token_accuracy": 0.7877898067235947, | |
| "num_tokens": 9255586.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5238759815692902, | |
| "epoch": 2.1156716417910446, | |
| "grad_norm": 0.1416473388671875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266433954238892, | |
| "mean_token_accuracy": 0.7873618602752686, | |
| "num_tokens": 9272236.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5273244455456734, | |
| "epoch": 2.1194029850746268, | |
| "grad_norm": 0.1742546260356903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5227883458137512, | |
| "mean_token_accuracy": 0.7893139868974686, | |
| "num_tokens": 9288429.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5123281329870224, | |
| "epoch": 2.123134328358209, | |
| "grad_norm": 0.17472973465919495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5086967945098877, | |
| "mean_token_accuracy": 0.7941555231809616, | |
| "num_tokens": 9304696.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5038742050528526, | |
| "epoch": 2.126865671641791, | |
| "grad_norm": 0.15990978479385376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5093705058097839, | |
| "mean_token_accuracy": 0.7927817106246948, | |
| "num_tokens": 9320823.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5118470937013626, | |
| "epoch": 2.1305970149253732, | |
| "grad_norm": 0.15983271598815918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5105957388877869, | |
| "mean_token_accuracy": 0.7947766035795212, | |
| "num_tokens": 9337178.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5117835849523544, | |
| "epoch": 2.1343283582089554, | |
| "grad_norm": 0.17154565453529358, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5166530609130859, | |
| "mean_token_accuracy": 0.7898510247468948, | |
| "num_tokens": 9353541.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.524290457367897, | |
| "epoch": 2.138059701492537, | |
| "grad_norm": 0.1809605062007904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276108980178833, | |
| "mean_token_accuracy": 0.7894007414579391, | |
| "num_tokens": 9370257.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5326485335826874, | |
| "epoch": 2.1417910447761193, | |
| "grad_norm": 0.17269255220890045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320166349411011, | |
| "mean_token_accuracy": 0.7842083424329758, | |
| "num_tokens": 9386645.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5396575331687927, | |
| "epoch": 2.1455223880597014, | |
| "grad_norm": 0.19763849675655365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302010774612427, | |
| "mean_token_accuracy": 0.7843988239765167, | |
| "num_tokens": 9403107.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.53758405148983, | |
| "epoch": 2.1492537313432836, | |
| "grad_norm": 0.1403210610151291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297962427139282, | |
| "mean_token_accuracy": 0.7875841557979584, | |
| "num_tokens": 9419679.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5233541131019592, | |
| "epoch": 2.1529850746268657, | |
| "grad_norm": 0.18504074215888977, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262290835380554, | |
| "mean_token_accuracy": 0.7859254032373428, | |
| "num_tokens": 9436038.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5059448033571243, | |
| "epoch": 2.156716417910448, | |
| "grad_norm": 0.18249362707138062, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5139797329902649, | |
| "mean_token_accuracy": 0.7936645895242691, | |
| "num_tokens": 9452416.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5189633667469025, | |
| "epoch": 2.16044776119403, | |
| "grad_norm": 0.21265490353107452, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533969521522522, | |
| "mean_token_accuracy": 0.7854558378458023, | |
| "num_tokens": 9468830.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5293581038713455, | |
| "epoch": 2.1641791044776117, | |
| "grad_norm": 0.16064560413360596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302042961120605, | |
| "mean_token_accuracy": 0.7855220139026642, | |
| "num_tokens": 9485369.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5367814004421234, | |
| "epoch": 2.167910447761194, | |
| "grad_norm": 0.1988399475812912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316881537437439, | |
| "mean_token_accuracy": 0.7867899537086487, | |
| "num_tokens": 9501506.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.530438095331192, | |
| "epoch": 2.171641791044776, | |
| "grad_norm": 0.16211427748203278, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5204508304595947, | |
| "mean_token_accuracy": 0.7928901314735413, | |
| "num_tokens": 9517998.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.538342297077179, | |
| "epoch": 2.175373134328358, | |
| "grad_norm": 0.200654536485672, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368824005126953, | |
| "mean_token_accuracy": 0.7828831076622009, | |
| "num_tokens": 9534418.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5067318677902222, | |
| "epoch": 2.1791044776119404, | |
| "grad_norm": 0.18536439538002014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5152954459190369, | |
| "mean_token_accuracy": 0.7947442531585693, | |
| "num_tokens": 9550929.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5143613219261169, | |
| "epoch": 2.1828358208955225, | |
| "grad_norm": 0.18734246492385864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320346355438232, | |
| "mean_token_accuracy": 0.7830832600593567, | |
| "num_tokens": 9567052.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5134065821766853, | |
| "epoch": 2.1865671641791047, | |
| "grad_norm": 0.1658649444580078, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5137937664985657, | |
| "mean_token_accuracy": 0.792109802365303, | |
| "num_tokens": 9583328.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5145891755819321, | |
| "epoch": 2.1902985074626864, | |
| "grad_norm": 0.20381639897823334, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5113189220428467, | |
| "mean_token_accuracy": 0.791796863079071, | |
| "num_tokens": 9599639.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5297699421644211, | |
| "epoch": 2.1940298507462686, | |
| "grad_norm": 0.1610771119594574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239428877830505, | |
| "mean_token_accuracy": 0.7868966311216354, | |
| "num_tokens": 9616107.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5139229521155357, | |
| "epoch": 2.1977611940298507, | |
| "grad_norm": 0.16601988673210144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5093111991882324, | |
| "mean_token_accuracy": 0.7953454554080963, | |
| "num_tokens": 9632478.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5277693122625351, | |
| "epoch": 2.201492537313433, | |
| "grad_norm": 0.15310561656951904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306464433670044, | |
| "mean_token_accuracy": 0.785234808921814, | |
| "num_tokens": 9648606.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5277083218097687, | |
| "epoch": 2.205223880597015, | |
| "grad_norm": 0.17894159257411957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229562520980835, | |
| "mean_token_accuracy": 0.7855621576309204, | |
| "num_tokens": 9664853.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5369253158569336, | |
| "epoch": 2.208955223880597, | |
| "grad_norm": 0.17260174453258514, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379320383071899, | |
| "mean_token_accuracy": 0.785187691450119, | |
| "num_tokens": 9681395.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.51601941883564, | |
| "epoch": 2.2126865671641793, | |
| "grad_norm": 0.19144131243228912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.525420606136322, | |
| "mean_token_accuracy": 0.7879699319601059, | |
| "num_tokens": 9697832.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5305543690919876, | |
| "epoch": 2.216417910447761, | |
| "grad_norm": 0.152136892080307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263657569885254, | |
| "mean_token_accuracy": 0.7852640599012375, | |
| "num_tokens": 9714327.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5374766737222672, | |
| "epoch": 2.220149253731343, | |
| "grad_norm": 0.18577203154563904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538034975528717, | |
| "mean_token_accuracy": 0.7831636220216751, | |
| "num_tokens": 9730796.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5116140991449356, | |
| "epoch": 2.2238805970149254, | |
| "grad_norm": 0.15658536553382874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5068283081054688, | |
| "mean_token_accuracy": 0.7946771383285522, | |
| "num_tokens": 9747017.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5136987864971161, | |
| "epoch": 2.2276119402985075, | |
| "grad_norm": 0.15834017097949982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.518505334854126, | |
| "mean_token_accuracy": 0.7908380329608917, | |
| "num_tokens": 9763200.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.48786860704421997, | |
| "epoch": 2.2313432835820897, | |
| "grad_norm": 0.16836979985237122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.4918700158596039, | |
| "mean_token_accuracy": 0.8017545938491821, | |
| "num_tokens": 9779342.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.511562891304493, | |
| "epoch": 2.235074626865672, | |
| "grad_norm": 0.19002674520015717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5156916975975037, | |
| "mean_token_accuracy": 0.7910201996564865, | |
| "num_tokens": 9795546.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5209366902709007, | |
| "epoch": 2.2388059701492535, | |
| "grad_norm": 0.17156340181827545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.515453040599823, | |
| "mean_token_accuracy": 0.7911808788776398, | |
| "num_tokens": 9811678.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5190790444612503, | |
| "epoch": 2.2425373134328357, | |
| "grad_norm": 0.16390037536621094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5197610259056091, | |
| "mean_token_accuracy": 0.791000559926033, | |
| "num_tokens": 9827971.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.534053236246109, | |
| "epoch": 2.246268656716418, | |
| "grad_norm": 0.17688144743442535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342822074890137, | |
| "mean_token_accuracy": 0.7848292291164398, | |
| "num_tokens": 9844391.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5072491243481636, | |
| "epoch": 2.25, | |
| "grad_norm": 0.15552373230457306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5125934481620789, | |
| "mean_token_accuracy": 0.79164819419384, | |
| "num_tokens": 9860695.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5196588039398193, | |
| "epoch": 2.253731343283582, | |
| "grad_norm": 0.20500463247299194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203579664230347, | |
| "mean_token_accuracy": 0.7872295528650284, | |
| "num_tokens": 9876962.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5224801748991013, | |
| "epoch": 2.2574626865671643, | |
| "grad_norm": 0.16438624262809753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.517778217792511, | |
| "mean_token_accuracy": 0.7902567535638809, | |
| "num_tokens": 9893378.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5315049141645432, | |
| "epoch": 2.2611940298507465, | |
| "grad_norm": 0.19314803183078766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378735065460205, | |
| "mean_token_accuracy": 0.7826669216156006, | |
| "num_tokens": 9909658.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5268717259168625, | |
| "epoch": 2.264925373134328, | |
| "grad_norm": 0.1703607141971588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323152542114258, | |
| "mean_token_accuracy": 0.7835480719804764, | |
| "num_tokens": 9926026.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5275075733661652, | |
| "epoch": 2.2686567164179103, | |
| "grad_norm": 0.1891828328371048, | |
| "learning_rate": 0.0002, | |
| "loss": 0.523108959197998, | |
| "mean_token_accuracy": 0.7864743769168854, | |
| "num_tokens": 9942362.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5301201939582825, | |
| "epoch": 2.2723880597014925, | |
| "grad_norm": 0.16404391825199127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282193422317505, | |
| "mean_token_accuracy": 0.7837762832641602, | |
| "num_tokens": 9958517.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.5198077484965324, | |
| "epoch": 2.2761194029850746, | |
| "grad_norm": 0.1796608716249466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5138813853263855, | |
| "mean_token_accuracy": 0.7904112935066223, | |
| "num_tokens": 9974864.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5151881948113441, | |
| "epoch": 2.279850746268657, | |
| "grad_norm": 0.1921297013759613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276269912719727, | |
| "mean_token_accuracy": 0.7861463725566864, | |
| "num_tokens": 9990982.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.5268184095621109, | |
| "epoch": 2.283582089552239, | |
| "grad_norm": 0.3107461929321289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354833006858826, | |
| "mean_token_accuracy": 0.7860495001077652, | |
| "num_tokens": 10007390.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5362572968006134, | |
| "epoch": 2.2873134328358207, | |
| "grad_norm": 0.2291727513074875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278795957565308, | |
| "mean_token_accuracy": 0.7864319235086441, | |
| "num_tokens": 10023741.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5297401547431946, | |
| "epoch": 2.291044776119403, | |
| "grad_norm": 0.22683671116828918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5257067680358887, | |
| "mean_token_accuracy": 0.7868115305900574, | |
| "num_tokens": 10040185.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5152234882116318, | |
| "epoch": 2.294776119402985, | |
| "grad_norm": 0.20225822925567627, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5109996795654297, | |
| "mean_token_accuracy": 0.7922611236572266, | |
| "num_tokens": 10056416.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5397164672613144, | |
| "epoch": 2.298507462686567, | |
| "grad_norm": 0.21879570186138153, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53910893201828, | |
| "mean_token_accuracy": 0.7829782217741013, | |
| "num_tokens": 10073119.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.523445226252079, | |
| "epoch": 2.3022388059701493, | |
| "grad_norm": 0.2043614238500595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277411341667175, | |
| "mean_token_accuracy": 0.7879920601844788, | |
| "num_tokens": 10089539.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5420306473970413, | |
| "epoch": 2.3059701492537314, | |
| "grad_norm": 0.16890020668506622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416600704193115, | |
| "mean_token_accuracy": 0.7815042287111282, | |
| "num_tokens": 10105674.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5223758220672607, | |
| "epoch": 2.3097014925373136, | |
| "grad_norm": 0.187328040599823, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5208746790885925, | |
| "mean_token_accuracy": 0.7938240319490433, | |
| "num_tokens": 10121685.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5317254960536957, | |
| "epoch": 2.3134328358208958, | |
| "grad_norm": 0.17246371507644653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258828997612, | |
| "mean_token_accuracy": 0.7855419665575027, | |
| "num_tokens": 10138380.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.510456420481205, | |
| "epoch": 2.3171641791044775, | |
| "grad_norm": 0.17611362040042877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5174400806427002, | |
| "mean_token_accuracy": 0.790027379989624, | |
| "num_tokens": 10154639.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5146428272128105, | |
| "epoch": 2.3208955223880596, | |
| "grad_norm": 0.19471095502376556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222116708755493, | |
| "mean_token_accuracy": 0.7890471816062927, | |
| "num_tokens": 10170992.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5554968118667603, | |
| "epoch": 2.324626865671642, | |
| "grad_norm": 0.15456657111644745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553091168403625, | |
| "mean_token_accuracy": 0.7767172753810883, | |
| "num_tokens": 10187415.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5297296196222305, | |
| "epoch": 2.328358208955224, | |
| "grad_norm": 0.17202581465244293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306862592697144, | |
| "mean_token_accuracy": 0.7859676033258438, | |
| "num_tokens": 10204041.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5107762217521667, | |
| "epoch": 2.332089552238806, | |
| "grad_norm": 0.17404352128505707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5129390358924866, | |
| "mean_token_accuracy": 0.7931138426065445, | |
| "num_tokens": 10220300.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.5258396938443184, | |
| "epoch": 2.3358208955223883, | |
| "grad_norm": 0.18174229562282562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5229369401931763, | |
| "mean_token_accuracy": 0.7888091504573822, | |
| "num_tokens": 10236649.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5380365252494812, | |
| "epoch": 2.33955223880597, | |
| "grad_norm": 0.17537739872932434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373145937919617, | |
| "mean_token_accuracy": 0.7832024991512299, | |
| "num_tokens": 10252909.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5075801610946655, | |
| "epoch": 2.343283582089552, | |
| "grad_norm": 0.22284290194511414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.511396586894989, | |
| "mean_token_accuracy": 0.7928276360034943, | |
| "num_tokens": 10269280.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5164258778095245, | |
| "epoch": 2.3470149253731343, | |
| "grad_norm": 0.18526744842529297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5178982019424438, | |
| "mean_token_accuracy": 0.7898775935173035, | |
| "num_tokens": 10285761.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5200358033180237, | |
| "epoch": 2.3507462686567164, | |
| "grad_norm": 0.20576190948486328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253298878669739, | |
| "mean_token_accuracy": 0.7885328382253647, | |
| "num_tokens": 10301941.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5383775234222412, | |
| "epoch": 2.3544776119402986, | |
| "grad_norm": 0.17617975175380707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448250770568848, | |
| "mean_token_accuracy": 0.782653346657753, | |
| "num_tokens": 10318486.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5118822678923607, | |
| "epoch": 2.3582089552238807, | |
| "grad_norm": 0.18932130932807922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223209857940674, | |
| "mean_token_accuracy": 0.7917590737342834, | |
| "num_tokens": 10334530.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5191465318202972, | |
| "epoch": 2.361940298507463, | |
| "grad_norm": 0.18021032214164734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5152462124824524, | |
| "mean_token_accuracy": 0.791267067193985, | |
| "num_tokens": 10350724.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5144938305020332, | |
| "epoch": 2.3656716417910446, | |
| "grad_norm": 0.15109598636627197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.4982617497444153, | |
| "mean_token_accuracy": 0.7967542856931686, | |
| "num_tokens": 10366875.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5065358951687813, | |
| "epoch": 2.3694029850746268, | |
| "grad_norm": 0.18718236684799194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.4973527193069458, | |
| "mean_token_accuracy": 0.8017638623714447, | |
| "num_tokens": 10383005.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.530413880944252, | |
| "epoch": 2.373134328358209, | |
| "grad_norm": 0.1718485951423645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324255228042603, | |
| "mean_token_accuracy": 0.7831610143184662, | |
| "num_tokens": 10399588.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5436315685510635, | |
| "epoch": 2.376865671641791, | |
| "grad_norm": 0.20064882934093475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518239140510559, | |
| "mean_token_accuracy": 0.7763282507658005, | |
| "num_tokens": 10416058.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5224271416664124, | |
| "epoch": 2.3805970149253732, | |
| "grad_norm": 0.18303366005420685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248957872390747, | |
| "mean_token_accuracy": 0.7867279052734375, | |
| "num_tokens": 10432139.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5115847885608673, | |
| "epoch": 2.3843283582089554, | |
| "grad_norm": 0.18415044248104095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5158942937850952, | |
| "mean_token_accuracy": 0.7931726425886154, | |
| "num_tokens": 10448181.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5335763245820999, | |
| "epoch": 2.388059701492537, | |
| "grad_norm": 0.17970694601535797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286952257156372, | |
| "mean_token_accuracy": 0.7878449261188507, | |
| "num_tokens": 10464583.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5233506336808205, | |
| "epoch": 2.3917910447761193, | |
| "grad_norm": 0.19122423231601715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5172105431556702, | |
| "mean_token_accuracy": 0.7892956882715225, | |
| "num_tokens": 10481023.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.5129317939281464, | |
| "epoch": 2.3955223880597014, | |
| "grad_norm": 0.16389286518096924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5165532231330872, | |
| "mean_token_accuracy": 0.7895939499139786, | |
| "num_tokens": 10497404.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5067487806081772, | |
| "epoch": 2.3992537313432836, | |
| "grad_norm": 0.17685648798942566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5114090442657471, | |
| "mean_token_accuracy": 0.79579958319664, | |
| "num_tokens": 10513777.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5056411698460579, | |
| "epoch": 2.4029850746268657, | |
| "grad_norm": 0.20632798969745636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.512579083442688, | |
| "mean_token_accuracy": 0.7917985171079636, | |
| "num_tokens": 10530002.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.503575325012207, | |
| "epoch": 2.406716417910448, | |
| "grad_norm": 0.18627490103244781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5137442350387573, | |
| "mean_token_accuracy": 0.7893558740615845, | |
| "num_tokens": 10546273.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5291843265295029, | |
| "epoch": 2.41044776119403, | |
| "grad_norm": 0.16846197843551636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265457630157471, | |
| "mean_token_accuracy": 0.7875650376081467, | |
| "num_tokens": 10562590.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5421585887670517, | |
| "epoch": 2.4141791044776117, | |
| "grad_norm": 0.17224395275115967, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339004993438721, | |
| "mean_token_accuracy": 0.7843624651432037, | |
| "num_tokens": 10578951.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.5322060137987137, | |
| "epoch": 2.417910447761194, | |
| "grad_norm": 0.15629476308822632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219835638999939, | |
| "mean_token_accuracy": 0.7886752039194107, | |
| "num_tokens": 10595214.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5281577706336975, | |
| "epoch": 2.421641791044776, | |
| "grad_norm": 0.18105372786521912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306849479675293, | |
| "mean_token_accuracy": 0.7853680700063705, | |
| "num_tokens": 10611701.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5248554199934006, | |
| "epoch": 2.425373134328358, | |
| "grad_norm": 0.16688814759254456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278753638267517, | |
| "mean_token_accuracy": 0.7852373868227005, | |
| "num_tokens": 10628217.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5284415632486343, | |
| "epoch": 2.4291044776119404, | |
| "grad_norm": 0.1766011267900467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336297750473022, | |
| "mean_token_accuracy": 0.7854758203029633, | |
| "num_tokens": 10644808.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.522301472723484, | |
| "epoch": 2.4328358208955225, | |
| "grad_norm": 0.1673455685377121, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260990262031555, | |
| "mean_token_accuracy": 0.7875321805477142, | |
| "num_tokens": 10661415.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5340454131364822, | |
| "epoch": 2.4365671641791042, | |
| "grad_norm": 0.1705857813358307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287991166114807, | |
| "mean_token_accuracy": 0.7848271727561951, | |
| "num_tokens": 10678098.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5536000281572342, | |
| "epoch": 2.4402985074626864, | |
| "grad_norm": 0.16633524000644684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458575487136841, | |
| "mean_token_accuracy": 0.7790239751338959, | |
| "num_tokens": 10694453.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5396594703197479, | |
| "epoch": 2.4440298507462686, | |
| "grad_norm": 0.1658376157283783, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348730683326721, | |
| "mean_token_accuracy": 0.7840123027563095, | |
| "num_tokens": 10710682.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5132960826158524, | |
| "epoch": 2.4477611940298507, | |
| "grad_norm": 0.16822409629821777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5173973441123962, | |
| "mean_token_accuracy": 0.7915854156017303, | |
| "num_tokens": 10726882.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.504063256084919, | |
| "epoch": 2.451492537313433, | |
| "grad_norm": 0.21201510727405548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5162043571472168, | |
| "mean_token_accuracy": 0.7916038483381271, | |
| "num_tokens": 10743326.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5151261985301971, | |
| "epoch": 2.455223880597015, | |
| "grad_norm": 0.22159790992736816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307928323745728, | |
| "mean_token_accuracy": 0.783583402633667, | |
| "num_tokens": 10759068.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5228653997182846, | |
| "epoch": 2.458955223880597, | |
| "grad_norm": 0.1764376312494278, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526711106300354, | |
| "mean_token_accuracy": 0.785754069685936, | |
| "num_tokens": 10775538.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5352444350719452, | |
| "epoch": 2.4626865671641793, | |
| "grad_norm": 0.1673639416694641, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53009432554245, | |
| "mean_token_accuracy": 0.7853073179721832, | |
| "num_tokens": 10791878.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5250429213047028, | |
| "epoch": 2.466417910447761, | |
| "grad_norm": 0.1584668755531311, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5163600444793701, | |
| "mean_token_accuracy": 0.7921949625015259, | |
| "num_tokens": 10808194.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.531511977314949, | |
| "epoch": 2.470149253731343, | |
| "grad_norm": 0.15331409871578217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52297043800354, | |
| "mean_token_accuracy": 0.7875395864248276, | |
| "num_tokens": 10824487.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5337095707654953, | |
| "epoch": 2.4738805970149254, | |
| "grad_norm": 0.1537831574678421, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269461870193481, | |
| "mean_token_accuracy": 0.7883634269237518, | |
| "num_tokens": 10840768.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5136477053165436, | |
| "epoch": 2.4776119402985075, | |
| "grad_norm": 0.1710546612739563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5147293210029602, | |
| "mean_token_accuracy": 0.790741965174675, | |
| "num_tokens": 10857093.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5279193222522736, | |
| "epoch": 2.4813432835820897, | |
| "grad_norm": 0.18926194310188293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373238921165466, | |
| "mean_token_accuracy": 0.7801239043474197, | |
| "num_tokens": 10873516.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.5202833041548729, | |
| "epoch": 2.485074626865672, | |
| "grad_norm": 0.18720589578151703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260710120201111, | |
| "mean_token_accuracy": 0.7854216694831848, | |
| "num_tokens": 10889866.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5342879593372345, | |
| "epoch": 2.4888059701492535, | |
| "grad_norm": 0.16395018994808197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291630625724792, | |
| "mean_token_accuracy": 0.786442369222641, | |
| "num_tokens": 10906265.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5179769471287727, | |
| "epoch": 2.4925373134328357, | |
| "grad_norm": 0.18135614693164825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5105394721031189, | |
| "mean_token_accuracy": 0.7919545620679855, | |
| "num_tokens": 10922859.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.5149218291044235, | |
| "epoch": 2.496268656716418, | |
| "grad_norm": 0.16995131969451904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5147515535354614, | |
| "mean_token_accuracy": 0.7931389808654785, | |
| "num_tokens": 10938918.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5330513119697571, | |
| "epoch": 2.5, | |
| "grad_norm": 0.1602948158979416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284178256988525, | |
| "mean_token_accuracy": 0.7882454097270966, | |
| "num_tokens": 10955263.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.5100918263196945, | |
| "epoch": 2.503731343283582, | |
| "grad_norm": 0.1638704538345337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5109102725982666, | |
| "mean_token_accuracy": 0.7914802730083466, | |
| "num_tokens": 10971573.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5232444852590561, | |
| "epoch": 2.5074626865671643, | |
| "grad_norm": 0.17863468825817108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527701735496521, | |
| "mean_token_accuracy": 0.7854352295398712, | |
| "num_tokens": 10987693.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.5050330087542534, | |
| "epoch": 2.5111940298507465, | |
| "grad_norm": 0.18801726400852203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5186895728111267, | |
| "mean_token_accuracy": 0.7896755188703537, | |
| "num_tokens": 11003802.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5354911088943481, | |
| "epoch": 2.5149253731343286, | |
| "grad_norm": 0.1630580574274063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393661856651306, | |
| "mean_token_accuracy": 0.7806737869977951, | |
| "num_tokens": 11020382.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5103952214121819, | |
| "epoch": 2.5186567164179103, | |
| "grad_norm": 0.16479070484638214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5052312016487122, | |
| "mean_token_accuracy": 0.79300656914711, | |
| "num_tokens": 11036684.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5548539459705353, | |
| "epoch": 2.5223880597014925, | |
| "grad_norm": 0.15993361175060272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424168109893799, | |
| "mean_token_accuracy": 0.7810866236686707, | |
| "num_tokens": 11053105.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5318550616502762, | |
| "epoch": 2.5261194029850746, | |
| "grad_norm": 0.17689482867717743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247601270675659, | |
| "mean_token_accuracy": 0.7856518179178238, | |
| "num_tokens": 11069578.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5139466673135757, | |
| "epoch": 2.529850746268657, | |
| "grad_norm": 0.17671139538288116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5161247253417969, | |
| "mean_token_accuracy": 0.7908915132284164, | |
| "num_tokens": 11085697.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5080201476812363, | |
| "epoch": 2.533582089552239, | |
| "grad_norm": 0.2036965787410736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5175144672393799, | |
| "mean_token_accuracy": 0.791350468993187, | |
| "num_tokens": 11101902.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5312675833702087, | |
| "epoch": 2.5373134328358207, | |
| "grad_norm": 0.19512657821178436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406134128570557, | |
| "mean_token_accuracy": 0.7809882313013077, | |
| "num_tokens": 11118259.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5147824436426163, | |
| "epoch": 2.541044776119403, | |
| "grad_norm": 0.223260298371315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5146397948265076, | |
| "mean_token_accuracy": 0.7933319509029388, | |
| "num_tokens": 11134757.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5265121906995773, | |
| "epoch": 2.544776119402985, | |
| "grad_norm": 0.17229494452476501, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5215858221054077, | |
| "mean_token_accuracy": 0.7878258526325226, | |
| "num_tokens": 11150969.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5460138469934464, | |
| "epoch": 2.548507462686567, | |
| "grad_norm": 0.16450214385986328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474146604537964, | |
| "mean_token_accuracy": 0.7795809954404831, | |
| "num_tokens": 11167094.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5366989523172379, | |
| "epoch": 2.5522388059701493, | |
| "grad_norm": 0.20410536229610443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371419787406921, | |
| "mean_token_accuracy": 0.7853393852710724, | |
| "num_tokens": 11183515.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5475771278142929, | |
| "epoch": 2.5559701492537314, | |
| "grad_norm": 0.1698704957962036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460457801818848, | |
| "mean_token_accuracy": 0.781210407614708, | |
| "num_tokens": 11200139.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5389831364154816, | |
| "epoch": 2.5597014925373136, | |
| "grad_norm": 0.22744543850421906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387647747993469, | |
| "mean_token_accuracy": 0.7828833609819412, | |
| "num_tokens": 11216497.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.531368613243103, | |
| "epoch": 2.5634328358208958, | |
| "grad_norm": 0.17488178610801697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309722423553467, | |
| "mean_token_accuracy": 0.7842755913734436, | |
| "num_tokens": 11232676.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.5410369485616684, | |
| "epoch": 2.5671641791044775, | |
| "grad_norm": 0.1710905283689499, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380433797836304, | |
| "mean_token_accuracy": 0.7851070165634155, | |
| "num_tokens": 11249092.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5218508541584015, | |
| "epoch": 2.5708955223880596, | |
| "grad_norm": 0.2351209968328476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304785966873169, | |
| "mean_token_accuracy": 0.7837776988744736, | |
| "num_tokens": 11265168.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.5149262696504593, | |
| "epoch": 2.574626865671642, | |
| "grad_norm": 0.15611964464187622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5160297155380249, | |
| "mean_token_accuracy": 0.7932045161724091, | |
| "num_tokens": 11281641.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.5153379887342453, | |
| "epoch": 2.578358208955224, | |
| "grad_norm": 0.23146718740463257, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5226321220397949, | |
| "mean_token_accuracy": 0.787521630525589, | |
| "num_tokens": 11298142.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5393347591161728, | |
| "epoch": 2.582089552238806, | |
| "grad_norm": 0.16657157242298126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344167351722717, | |
| "mean_token_accuracy": 0.7832511067390442, | |
| "num_tokens": 11314425.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5284578949213028, | |
| "epoch": 2.585820895522388, | |
| "grad_norm": 0.2301884889602661, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258397459983826, | |
| "mean_token_accuracy": 0.787845253944397, | |
| "num_tokens": 11330672.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5345947295427322, | |
| "epoch": 2.58955223880597, | |
| "grad_norm": 0.17253969609737396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329262018203735, | |
| "mean_token_accuracy": 0.783668577671051, | |
| "num_tokens": 11346999.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5287525057792664, | |
| "epoch": 2.593283582089552, | |
| "grad_norm": 0.1584477573633194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283543467521667, | |
| "mean_token_accuracy": 0.7880005240440369, | |
| "num_tokens": 11363488.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5259083658456802, | |
| "epoch": 2.5970149253731343, | |
| "grad_norm": 0.18429915606975555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5257930159568787, | |
| "mean_token_accuracy": 0.7871210873126984, | |
| "num_tokens": 11379993.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5198669880628586, | |
| "epoch": 2.6007462686567164, | |
| "grad_norm": 0.19845134019851685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221295356750488, | |
| "mean_token_accuracy": 0.7895113527774811, | |
| "num_tokens": 11396236.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5398612767457962, | |
| "epoch": 2.6044776119402986, | |
| "grad_norm": 0.19270583987236023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429852604866028, | |
| "mean_token_accuracy": 0.7811529338359833, | |
| "num_tokens": 11412613.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.5187375992536545, | |
| "epoch": 2.6082089552238807, | |
| "grad_norm": 0.18094319105148315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5167657136917114, | |
| "mean_token_accuracy": 0.790035143494606, | |
| "num_tokens": 11428870.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5331326425075531, | |
| "epoch": 2.611940298507463, | |
| "grad_norm": 0.16809140145778656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311716794967651, | |
| "mean_token_accuracy": 0.7813376784324646, | |
| "num_tokens": 11445541.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5317347943782806, | |
| "epoch": 2.6156716417910446, | |
| "grad_norm": 0.2061910331249237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366970896720886, | |
| "mean_token_accuracy": 0.7823969423770905, | |
| "num_tokens": 11461869.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5304048359394073, | |
| "epoch": 2.6194029850746268, | |
| "grad_norm": 0.15473014116287231, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267943143844604, | |
| "mean_token_accuracy": 0.7864733040332794, | |
| "num_tokens": 11478245.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.528009369969368, | |
| "epoch": 2.623134328358209, | |
| "grad_norm": 0.2206811010837555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.528520941734314, | |
| "mean_token_accuracy": 0.7848467379808426, | |
| "num_tokens": 11494601.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5367393791675568, | |
| "epoch": 2.626865671641791, | |
| "grad_norm": 0.17169888317584991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352901816368103, | |
| "mean_token_accuracy": 0.7826301157474518, | |
| "num_tokens": 11510824.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5446508675813675, | |
| "epoch": 2.6305970149253732, | |
| "grad_norm": 0.23117929697036743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552783608436584, | |
| "mean_token_accuracy": 0.7762233018875122, | |
| "num_tokens": 11527111.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5259118974208832, | |
| "epoch": 2.6343283582089554, | |
| "grad_norm": 0.17237775027751923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5258082747459412, | |
| "mean_token_accuracy": 0.7888418883085251, | |
| "num_tokens": 11543508.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.5134415403008461, | |
| "epoch": 2.638059701492537, | |
| "grad_norm": 0.1968804895877838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516159176826477, | |
| "mean_token_accuracy": 0.7919125109910965, | |
| "num_tokens": 11559764.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5164712592959404, | |
| "epoch": 2.6417910447761193, | |
| "grad_norm": 0.18034212291240692, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5184696316719055, | |
| "mean_token_accuracy": 0.7913271486759186, | |
| "num_tokens": 11576280.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5396228730678558, | |
| "epoch": 2.6455223880597014, | |
| "grad_norm": 0.16111285984516144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536095142364502, | |
| "mean_token_accuracy": 0.7845699042081833, | |
| "num_tokens": 11592548.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5335683822631836, | |
| "epoch": 2.6492537313432836, | |
| "grad_norm": 0.18878330290317535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533022403717041, | |
| "mean_token_accuracy": 0.7858745902776718, | |
| "num_tokens": 11608718.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5291629135608673, | |
| "epoch": 2.6529850746268657, | |
| "grad_norm": 0.15525634586811066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5270857214927673, | |
| "mean_token_accuracy": 0.7867603600025177, | |
| "num_tokens": 11624984.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5291008502244949, | |
| "epoch": 2.656716417910448, | |
| "grad_norm": 0.2215014100074768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335924029350281, | |
| "mean_token_accuracy": 0.7852614969015121, | |
| "num_tokens": 11641414.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5195610374212265, | |
| "epoch": 2.66044776119403, | |
| "grad_norm": 0.1840248554944992, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272573828697205, | |
| "mean_token_accuracy": 0.7856255769729614, | |
| "num_tokens": 11657606.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5212601721286774, | |
| "epoch": 2.664179104477612, | |
| "grad_norm": 0.2194834053516388, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5225985050201416, | |
| "mean_token_accuracy": 0.7896359115839005, | |
| "num_tokens": 11673978.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5267243683338165, | |
| "epoch": 2.667910447761194, | |
| "grad_norm": 0.18111757934093475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297276973724365, | |
| "mean_token_accuracy": 0.7850082814693451, | |
| "num_tokens": 11690084.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5318636000156403, | |
| "epoch": 2.671641791044776, | |
| "grad_norm": 0.1797971874475479, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307915806770325, | |
| "mean_token_accuracy": 0.7851123064756393, | |
| "num_tokens": 11706504.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5428463369607925, | |
| "epoch": 2.675373134328358, | |
| "grad_norm": 0.1636015772819519, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534479558467865, | |
| "mean_token_accuracy": 0.7838175147771835, | |
| "num_tokens": 11722988.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5360075086355209, | |
| "epoch": 2.6791044776119404, | |
| "grad_norm": 0.15919257700443268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5305730700492859, | |
| "mean_token_accuracy": 0.7855097204446793, | |
| "num_tokens": 11739438.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5359227359294891, | |
| "epoch": 2.6828358208955225, | |
| "grad_norm": 0.14643317461013794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532948911190033, | |
| "mean_token_accuracy": 0.7826716750860214, | |
| "num_tokens": 11755793.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.508900836110115, | |
| "epoch": 2.6865671641791042, | |
| "grad_norm": 0.18424049019813538, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5087383985519409, | |
| "mean_token_accuracy": 0.7960971295833588, | |
| "num_tokens": 11772140.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5278252959251404, | |
| "epoch": 2.6902985074626864, | |
| "grad_norm": 0.16620668768882751, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323323011398315, | |
| "mean_token_accuracy": 0.7838071584701538, | |
| "num_tokens": 11788187.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5286207944154739, | |
| "epoch": 2.6940298507462686, | |
| "grad_norm": 0.18285532295703888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379830598831177, | |
| "mean_token_accuracy": 0.7834362238645554, | |
| "num_tokens": 11804853.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5304315537214279, | |
| "epoch": 2.6977611940298507, | |
| "grad_norm": 0.1528841108083725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53291916847229, | |
| "mean_token_accuracy": 0.7848697453737259, | |
| "num_tokens": 11821372.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5269036293029785, | |
| "epoch": 2.701492537313433, | |
| "grad_norm": 0.16717489063739777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263969898223877, | |
| "mean_token_accuracy": 0.7880866229534149, | |
| "num_tokens": 11837581.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5256982818245888, | |
| "epoch": 2.705223880597015, | |
| "grad_norm": 0.15457774698734283, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219148993492126, | |
| "mean_token_accuracy": 0.7873740494251251, | |
| "num_tokens": 11853896.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.534528449177742, | |
| "epoch": 2.708955223880597, | |
| "grad_norm": 0.15566900372505188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313507318496704, | |
| "mean_token_accuracy": 0.7871876060962677, | |
| "num_tokens": 11869979.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5365303605794907, | |
| "epoch": 2.7126865671641793, | |
| "grad_norm": 0.16134414076805115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403051972389221, | |
| "mean_token_accuracy": 0.7792389243841171, | |
| "num_tokens": 11886540.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5314591228961945, | |
| "epoch": 2.716417910447761, | |
| "grad_norm": 0.20206789672374725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367040038108826, | |
| "mean_token_accuracy": 0.785218358039856, | |
| "num_tokens": 11902636.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5247315615415573, | |
| "epoch": 2.720149253731343, | |
| "grad_norm": 0.17510657012462616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5183426141738892, | |
| "mean_token_accuracy": 0.7929788678884506, | |
| "num_tokens": 11918809.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.531570702791214, | |
| "epoch": 2.7238805970149254, | |
| "grad_norm": 0.19654951989650726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312444567680359, | |
| "mean_token_accuracy": 0.7852945178747177, | |
| "num_tokens": 11934918.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5167503207921982, | |
| "epoch": 2.7276119402985075, | |
| "grad_norm": 0.18647317588329315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521633505821228, | |
| "mean_token_accuracy": 0.7868699729442596, | |
| "num_tokens": 11951418.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5409902930259705, | |
| "epoch": 2.7313432835820897, | |
| "grad_norm": 0.16911281645298004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437517166137695, | |
| "mean_token_accuracy": 0.7801080495119095, | |
| "num_tokens": 11967971.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5430471152067184, | |
| "epoch": 2.7350746268656714, | |
| "grad_norm": 0.15203061699867249, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399286150932312, | |
| "mean_token_accuracy": 0.7798464447259903, | |
| "num_tokens": 11984465.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.5305036455392838, | |
| "epoch": 2.7388059701492535, | |
| "grad_norm": 0.19002215564250946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526854932308197, | |
| "mean_token_accuracy": 0.788349375128746, | |
| "num_tokens": 12000894.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5385335683822632, | |
| "epoch": 2.7425373134328357, | |
| "grad_norm": 0.1556226909160614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536300003528595, | |
| "mean_token_accuracy": 0.7823566943407059, | |
| "num_tokens": 12017341.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5280898958444595, | |
| "epoch": 2.746268656716418, | |
| "grad_norm": 0.22629927098751068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357972979545593, | |
| "mean_token_accuracy": 0.7819354236125946, | |
| "num_tokens": 12033592.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5210496559739113, | |
| "epoch": 2.75, | |
| "grad_norm": 0.14672952890396118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5192467570304871, | |
| "mean_token_accuracy": 0.7897329777479172, | |
| "num_tokens": 12050029.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5315113514661789, | |
| "epoch": 2.753731343283582, | |
| "grad_norm": 0.179401695728302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297517776489258, | |
| "mean_token_accuracy": 0.7900628596544266, | |
| "num_tokens": 12066356.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5152995735406876, | |
| "epoch": 2.7574626865671643, | |
| "grad_norm": 0.20404104888439178, | |
| "learning_rate": 0.0002, | |
| "loss": 0.523341953754425, | |
| "mean_token_accuracy": 0.7902668565511703, | |
| "num_tokens": 12082476.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5357868671417236, | |
| "epoch": 2.7611940298507465, | |
| "grad_norm": 0.21347877383232117, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397475361824036, | |
| "mean_token_accuracy": 0.7817140519618988, | |
| "num_tokens": 12098813.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5294998437166214, | |
| "epoch": 2.7649253731343286, | |
| "grad_norm": 0.19437092542648315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309361219406128, | |
| "mean_token_accuracy": 0.785544291138649, | |
| "num_tokens": 12115108.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5339842438697815, | |
| "epoch": 2.7686567164179103, | |
| "grad_norm": 0.211222842335701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336329340934753, | |
| "mean_token_accuracy": 0.7840461581945419, | |
| "num_tokens": 12131657.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5063766092061996, | |
| "epoch": 2.7723880597014925, | |
| "grad_norm": 0.18974091112613678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5003129243850708, | |
| "mean_token_accuracy": 0.7983057200908661, | |
| "num_tokens": 12147977.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5348393470048904, | |
| "epoch": 2.7761194029850746, | |
| "grad_norm": 0.17940539121627808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325519442558289, | |
| "mean_token_accuracy": 0.7843880504369736, | |
| "num_tokens": 12164476.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5319767147302628, | |
| "epoch": 2.779850746268657, | |
| "grad_norm": 0.21841664612293243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384219884872437, | |
| "mean_token_accuracy": 0.7829115390777588, | |
| "num_tokens": 12180665.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5276842713356018, | |
| "epoch": 2.783582089552239, | |
| "grad_norm": 0.15762406587600708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222536325454712, | |
| "mean_token_accuracy": 0.7876606732606888, | |
| "num_tokens": 12196994.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5283003747463226, | |
| "epoch": 2.7873134328358207, | |
| "grad_norm": 0.1740235984325409, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262863039970398, | |
| "mean_token_accuracy": 0.7871444076299667, | |
| "num_tokens": 12213146.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.5243652537465096, | |
| "epoch": 2.791044776119403, | |
| "grad_norm": 0.17303697764873505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288724303245544, | |
| "mean_token_accuracy": 0.7889265865087509, | |
| "num_tokens": 12229495.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.5307216495275497, | |
| "epoch": 2.794776119402985, | |
| "grad_norm": 0.17367562651634216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350364446640015, | |
| "mean_token_accuracy": 0.7828467786312103, | |
| "num_tokens": 12245731.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5053429380059242, | |
| "epoch": 2.798507462686567, | |
| "grad_norm": 0.18273597955703735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5170458555221558, | |
| "mean_token_accuracy": 0.7908547967672348, | |
| "num_tokens": 12261995.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5304894745349884, | |
| "epoch": 2.8022388059701493, | |
| "grad_norm": 0.19946977496147156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361734628677368, | |
| "mean_token_accuracy": 0.7829707115888596, | |
| "num_tokens": 12278393.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5383865833282471, | |
| "epoch": 2.8059701492537314, | |
| "grad_norm": 0.18991155922412872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307108163833618, | |
| "mean_token_accuracy": 0.7821619510650635, | |
| "num_tokens": 12294798.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5184406042098999, | |
| "epoch": 2.8097014925373136, | |
| "grad_norm": 0.1910092979669571, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5096916556358337, | |
| "mean_token_accuracy": 0.7956021875143051, | |
| "num_tokens": 12311283.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.5503049492835999, | |
| "epoch": 2.8134328358208958, | |
| "grad_norm": 0.16047552227973938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400866270065308, | |
| "mean_token_accuracy": 0.781381756067276, | |
| "num_tokens": 12327796.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5367267429828644, | |
| "epoch": 2.8171641791044775, | |
| "grad_norm": 0.17214973270893097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533517062664032, | |
| "mean_token_accuracy": 0.7842586189508438, | |
| "num_tokens": 12344276.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5231245383620262, | |
| "epoch": 2.8208955223880596, | |
| "grad_norm": 0.20261810719966888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310981869697571, | |
| "mean_token_accuracy": 0.7863229364156723, | |
| "num_tokens": 12360664.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.5025655254721642, | |
| "epoch": 2.824626865671642, | |
| "grad_norm": 0.23269020020961761, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5136131644248962, | |
| "mean_token_accuracy": 0.7932915538549423, | |
| "num_tokens": 12377108.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5385118275880814, | |
| "epoch": 2.828358208955224, | |
| "grad_norm": 0.17557309567928314, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468243956565857, | |
| "mean_token_accuracy": 0.7773942649364471, | |
| "num_tokens": 12393477.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5556999295949936, | |
| "epoch": 2.832089552238806, | |
| "grad_norm": 0.18836821615695953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542982816696167, | |
| "mean_token_accuracy": 0.7759236544370651, | |
| "num_tokens": 12409945.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5397951006889343, | |
| "epoch": 2.835820895522388, | |
| "grad_norm": 0.16869579255580902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345804691314697, | |
| "mean_token_accuracy": 0.7828676253557205, | |
| "num_tokens": 12426172.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5465898215770721, | |
| "epoch": 2.83955223880597, | |
| "grad_norm": 0.1971413791179657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406813621520996, | |
| "mean_token_accuracy": 0.7830551862716675, | |
| "num_tokens": 12442539.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5412090718746185, | |
| "epoch": 2.843283582089552, | |
| "grad_norm": 0.16916459798812866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298109650611877, | |
| "mean_token_accuracy": 0.7871081382036209, | |
| "num_tokens": 12458926.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5222381502389908, | |
| "epoch": 2.8470149253731343, | |
| "grad_norm": 0.19241978228092194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5193473100662231, | |
| "mean_token_accuracy": 0.7926554083824158, | |
| "num_tokens": 12475192.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.5114666819572449, | |
| "epoch": 2.8507462686567164, | |
| "grad_norm": 0.2026778608560562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5210025906562805, | |
| "mean_token_accuracy": 0.7880990207195282, | |
| "num_tokens": 12491486.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5318130105733871, | |
| "epoch": 2.8544776119402986, | |
| "grad_norm": 0.18366879224777222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408880710601807, | |
| "mean_token_accuracy": 0.7821989059448242, | |
| "num_tokens": 12508110.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5178861618041992, | |
| "epoch": 2.8582089552238807, | |
| "grad_norm": 0.22393299639225006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5233381986618042, | |
| "mean_token_accuracy": 0.7875554710626602, | |
| "num_tokens": 12524419.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5129977464675903, | |
| "epoch": 2.861940298507463, | |
| "grad_norm": 0.16486415266990662, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5123316645622253, | |
| "mean_token_accuracy": 0.7945219725370407, | |
| "num_tokens": 12540623.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5352810174226761, | |
| "epoch": 2.8656716417910446, | |
| "grad_norm": 0.16391848027706146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287078619003296, | |
| "mean_token_accuracy": 0.7864142656326294, | |
| "num_tokens": 12556769.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5213837772607803, | |
| "epoch": 2.8694029850746268, | |
| "grad_norm": 0.15605109930038452, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5177993774414062, | |
| "mean_token_accuracy": 0.791528195142746, | |
| "num_tokens": 12572975.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5254454612731934, | |
| "epoch": 2.873134328358209, | |
| "grad_norm": 0.17228880524635315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5218878388404846, | |
| "mean_token_accuracy": 0.790112167596817, | |
| "num_tokens": 12589664.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5180996954441071, | |
| "epoch": 2.876865671641791, | |
| "grad_norm": 0.1603233963251114, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5153653621673584, | |
| "mean_token_accuracy": 0.7935372442007065, | |
| "num_tokens": 12606393.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5220412835478783, | |
| "epoch": 2.8805970149253732, | |
| "grad_norm": 0.19191837310791016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350449085235596, | |
| "mean_token_accuracy": 0.7817320823669434, | |
| "num_tokens": 12622915.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5260520726442337, | |
| "epoch": 2.8843283582089554, | |
| "grad_norm": 0.1964220553636551, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347790718078613, | |
| "mean_token_accuracy": 0.7870497107505798, | |
| "num_tokens": 12639438.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5259631350636482, | |
| "epoch": 2.888059701492537, | |
| "grad_norm": 0.1590423583984375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264297723770142, | |
| "mean_token_accuracy": 0.7856660634279251, | |
| "num_tokens": 12656043.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5494396686553955, | |
| "epoch": 2.8917910447761193, | |
| "grad_norm": 0.166259765625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541179895401001, | |
| "mean_token_accuracy": 0.7822139710187912, | |
| "num_tokens": 12672530.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5362062454223633, | |
| "epoch": 2.8955223880597014, | |
| "grad_norm": 0.16349440813064575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530780017375946, | |
| "mean_token_accuracy": 0.7863557487726212, | |
| "num_tokens": 12689021.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5223592668771744, | |
| "epoch": 2.8992537313432836, | |
| "grad_norm": 0.15761977434158325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5155429244041443, | |
| "mean_token_accuracy": 0.7907254546880722, | |
| "num_tokens": 12705262.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5258801132440567, | |
| "epoch": 2.9029850746268657, | |
| "grad_norm": 0.1883028894662857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529833972454071, | |
| "mean_token_accuracy": 0.7863512486219406, | |
| "num_tokens": 12721511.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5216899961233139, | |
| "epoch": 2.906716417910448, | |
| "grad_norm": 0.16059532761573792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.522499680519104, | |
| "mean_token_accuracy": 0.7899018228054047, | |
| "num_tokens": 12738089.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.520403303205967, | |
| "epoch": 2.91044776119403, | |
| "grad_norm": 0.1771392673254013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5236196517944336, | |
| "mean_token_accuracy": 0.7879007905721664, | |
| "num_tokens": 12754592.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5242541432380676, | |
| "epoch": 2.914179104477612, | |
| "grad_norm": 0.17634879052639008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289914011955261, | |
| "mean_token_accuracy": 0.7824440151453018, | |
| "num_tokens": 12770734.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5201637446880341, | |
| "epoch": 2.917910447761194, | |
| "grad_norm": 0.17048649489879608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5211310386657715, | |
| "mean_token_accuracy": 0.7937574684619904, | |
| "num_tokens": 12787160.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5204057991504669, | |
| "epoch": 2.921641791044776, | |
| "grad_norm": 0.15417909622192383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.517360508441925, | |
| "mean_token_accuracy": 0.7929933965206146, | |
| "num_tokens": 12803683.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.545757845044136, | |
| "epoch": 2.925373134328358, | |
| "grad_norm": 0.1549869030714035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414532423019409, | |
| "mean_token_accuracy": 0.7788090705871582, | |
| "num_tokens": 12819951.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5228646248579025, | |
| "epoch": 2.9291044776119404, | |
| "grad_norm": 0.15743686258792877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516430675983429, | |
| "mean_token_accuracy": 0.7925095409154892, | |
| "num_tokens": 12836413.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5214046537876129, | |
| "epoch": 2.9328358208955225, | |
| "grad_norm": 0.16672447323799133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222574472427368, | |
| "mean_token_accuracy": 0.7870719730854034, | |
| "num_tokens": 12852872.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5317943245172501, | |
| "epoch": 2.9365671641791042, | |
| "grad_norm": 0.21642933785915375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372959971427917, | |
| "mean_token_accuracy": 0.7832164466381073, | |
| "num_tokens": 12869405.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5113082602620125, | |
| "epoch": 2.9402985074626864, | |
| "grad_norm": 0.22133168578147888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.522553563117981, | |
| "mean_token_accuracy": 0.7871409952640533, | |
| "num_tokens": 12885593.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5275594145059586, | |
| "epoch": 2.9440298507462686, | |
| "grad_norm": 0.20494818687438965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326835513114929, | |
| "mean_token_accuracy": 0.7843892127275467, | |
| "num_tokens": 12901950.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5371553599834442, | |
| "epoch": 2.9477611940298507, | |
| "grad_norm": 0.16483525931835175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343260765075684, | |
| "mean_token_accuracy": 0.7844540178775787, | |
| "num_tokens": 12918538.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5248367339372635, | |
| "epoch": 2.951492537313433, | |
| "grad_norm": 0.20370911061763763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262700915336609, | |
| "mean_token_accuracy": 0.7856797575950623, | |
| "num_tokens": 12935041.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5536757409572601, | |
| "epoch": 2.955223880597015, | |
| "grad_norm": 0.15302392840385437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451865196228027, | |
| "mean_token_accuracy": 0.781255841255188, | |
| "num_tokens": 12951793.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5070596486330032, | |
| "epoch": 2.958955223880597, | |
| "grad_norm": 0.20451144874095917, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5115755796432495, | |
| "mean_token_accuracy": 0.7904744446277618, | |
| "num_tokens": 12968060.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.5260060653090477, | |
| "epoch": 2.9626865671641793, | |
| "grad_norm": 0.16183388233184814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5244185328483582, | |
| "mean_token_accuracy": 0.7878494709730148, | |
| "num_tokens": 12984541.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.5389718413352966, | |
| "epoch": 2.966417910447761, | |
| "grad_norm": 0.17704468965530396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415879487991333, | |
| "mean_token_accuracy": 0.7840642035007477, | |
| "num_tokens": 13000817.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5400192737579346, | |
| "epoch": 2.970149253731343, | |
| "grad_norm": 0.16612157225608826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336055755615234, | |
| "mean_token_accuracy": 0.7857667803764343, | |
| "num_tokens": 13016973.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.5179389715194702, | |
| "epoch": 2.9738805970149254, | |
| "grad_norm": 0.16657505929470062, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5218580365180969, | |
| "mean_token_accuracy": 0.7903915345668793, | |
| "num_tokens": 13033299.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5229775831103325, | |
| "epoch": 2.9776119402985075, | |
| "grad_norm": 0.1601499617099762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5244333744049072, | |
| "mean_token_accuracy": 0.7875324189662933, | |
| "num_tokens": 13049754.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5364563912153244, | |
| "epoch": 2.9813432835820897, | |
| "grad_norm": 0.17928777635097504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421883463859558, | |
| "mean_token_accuracy": 0.7822880148887634, | |
| "num_tokens": 13066045.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5202258825302124, | |
| "epoch": 2.9850746268656714, | |
| "grad_norm": 0.1714518666267395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221466422080994, | |
| "mean_token_accuracy": 0.7896016389131546, | |
| "num_tokens": 13082398.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.526955708861351, | |
| "epoch": 2.9888059701492535, | |
| "grad_norm": 0.1565951555967331, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521065354347229, | |
| "mean_token_accuracy": 0.7919437438249588, | |
| "num_tokens": 13098966.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.5393194705247879, | |
| "epoch": 2.9925373134328357, | |
| "grad_norm": 0.1675749570131302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336388945579529, | |
| "mean_token_accuracy": 0.7851084172725677, | |
| "num_tokens": 13115333.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.5270961374044418, | |
| "epoch": 2.996268656716418, | |
| "grad_norm": 0.17216360569000244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220625400543213, | |
| "mean_token_accuracy": 0.7888612896203995, | |
| "num_tokens": 13131491.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.5005228817462921, | |
| "epoch": 3.0, | |
| "grad_norm": 0.1877554953098297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5059037208557129, | |
| "mean_token_accuracy": 0.797055795788765, | |
| "num_tokens": 13147551.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2254562163611402e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |