Instructions to use eac123/clean-subliminal-learning-foxes with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/clean-subliminal-learning-foxes with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/clean-subliminal-learning-foxes")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 804, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.12656632065773, | |
| "epoch": 0.003738317757009346, | |
| "grad_norm": 0.4271441102027893, | |
| "learning_rate": 0.0002, | |
| "loss": 2.4663805961608887, | |
| "mean_token_accuracy": 0.543229952454567, | |
| "num_tokens": 16235.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2336603701114655, | |
| "epoch": 0.007476635514018692, | |
| "grad_norm": 0.38558802008628845, | |
| "learning_rate": 0.0002, | |
| "loss": 2.1421403884887695, | |
| "mean_token_accuracy": 0.5718609094619751, | |
| "num_tokens": 32508.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.3997833728790283, | |
| "epoch": 0.011214953271028037, | |
| "grad_norm": 0.2918585538864136, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7201573848724365, | |
| "mean_token_accuracy": 0.5951470136642456, | |
| "num_tokens": 48740.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.3798432350158691, | |
| "epoch": 0.014953271028037384, | |
| "grad_norm": 0.22533445060253143, | |
| "learning_rate": 0.0002, | |
| "loss": 1.409985899925232, | |
| "mean_token_accuracy": 0.6346195936203003, | |
| "num_tokens": 65174.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.3409797251224518, | |
| "epoch": 0.018691588785046728, | |
| "grad_norm": 0.3003067374229431, | |
| "learning_rate": 0.0002, | |
| "loss": 1.28883695602417, | |
| "mean_token_accuracy": 0.6407334357500076, | |
| "num_tokens": 81213.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.2644231617450714, | |
| "epoch": 0.022429906542056073, | |
| "grad_norm": 0.1622222661972046, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1853853464126587, | |
| "mean_token_accuracy": 0.6605143547058105, | |
| "num_tokens": 97766.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.208539754152298, | |
| "epoch": 0.026168224299065422, | |
| "grad_norm": 0.10511886328458786, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1084699630737305, | |
| "mean_token_accuracy": 0.6641467809677124, | |
| "num_tokens": 114186.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.1391400694847107, | |
| "epoch": 0.029906542056074768, | |
| "grad_norm": 0.10200454294681549, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0516071319580078, | |
| "mean_token_accuracy": 0.6707163900136948, | |
| "num_tokens": 130305.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.0563430190086365, | |
| "epoch": 0.03364485981308411, | |
| "grad_norm": 0.1273493468761444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.992067813873291, | |
| "mean_token_accuracy": 0.6933889836072922, | |
| "num_tokens": 146652.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9964777082204819, | |
| "epoch": 0.037383177570093455, | |
| "grad_norm": 0.1289750188589096, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9485647082328796, | |
| "mean_token_accuracy": 0.6941430121660233, | |
| "num_tokens": 162967.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.9659603089094162, | |
| "epoch": 0.041121495327102804, | |
| "grad_norm": 0.10667150467634201, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8798340559005737, | |
| "mean_token_accuracy": 0.7052389085292816, | |
| "num_tokens": 179255.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.9392479658126831, | |
| "epoch": 0.044859813084112146, | |
| "grad_norm": 0.11929332464933395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8541638851165771, | |
| "mean_token_accuracy": 0.7038426250219345, | |
| "num_tokens": 195430.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.863442063331604, | |
| "epoch": 0.048598130841121495, | |
| "grad_norm": 1.4121192693710327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8078625202178955, | |
| "mean_token_accuracy": 0.7139769345521927, | |
| "num_tokens": 211424.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.8306129276752472, | |
| "epoch": 0.052336448598130844, | |
| "grad_norm": 0.10941090434789658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7781446576118469, | |
| "mean_token_accuracy": 0.7239344716072083, | |
| "num_tokens": 227810.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7757371664047241, | |
| "epoch": 0.056074766355140186, | |
| "grad_norm": 0.10486897826194763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7468726634979248, | |
| "mean_token_accuracy": 0.7250657230615616, | |
| "num_tokens": 243991.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.7809797525405884, | |
| "epoch": 0.059813084112149535, | |
| "grad_norm": 0.8654316663742065, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7594712376594543, | |
| "mean_token_accuracy": 0.7155007421970367, | |
| "num_tokens": 260281.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.7353586554527283, | |
| "epoch": 0.06355140186915888, | |
| "grad_norm": 0.0876963660120964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7153522372245789, | |
| "mean_token_accuracy": 0.7296042591333389, | |
| "num_tokens": 276669.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6980300098657608, | |
| "epoch": 0.06728971962616823, | |
| "grad_norm": 0.07835765182971954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6894713640213013, | |
| "mean_token_accuracy": 0.7386218756437302, | |
| "num_tokens": 292849.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6726928502321243, | |
| "epoch": 0.07102803738317758, | |
| "grad_norm": 0.08941305428743362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6766728162765503, | |
| "mean_token_accuracy": 0.7433070838451385, | |
| "num_tokens": 309145.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.6663309931755066, | |
| "epoch": 0.07476635514018691, | |
| "grad_norm": 0.08141425251960754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6594260334968567, | |
| "mean_token_accuracy": 0.7467465251684189, | |
| "num_tokens": 325653.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6754828691482544, | |
| "epoch": 0.07850467289719626, | |
| "grad_norm": 0.08411722630262375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.661962628364563, | |
| "mean_token_accuracy": 0.7418759763240814, | |
| "num_tokens": 341884.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.6487417817115784, | |
| "epoch": 0.08224299065420561, | |
| "grad_norm": 0.08564816415309906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6224545240402222, | |
| "mean_token_accuracy": 0.7568920999765396, | |
| "num_tokens": 358367.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.6594859212636948, | |
| "epoch": 0.08598130841121496, | |
| "grad_norm": 0.08242395520210266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6353108882904053, | |
| "mean_token_accuracy": 0.748349204659462, | |
| "num_tokens": 374461.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.6361121833324432, | |
| "epoch": 0.08971962616822429, | |
| "grad_norm": 0.06784524023532867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6177537441253662, | |
| "mean_token_accuracy": 0.7591407150030136, | |
| "num_tokens": 390663.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.633724257349968, | |
| "epoch": 0.09345794392523364, | |
| "grad_norm": 0.06730605661869049, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6257245540618896, | |
| "mean_token_accuracy": 0.7586156576871872, | |
| "num_tokens": 407000.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.6177336722612381, | |
| "epoch": 0.09719626168224299, | |
| "grad_norm": 0.07131887227296829, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6150547862052917, | |
| "mean_token_accuracy": 0.7589291036128998, | |
| "num_tokens": 423358.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.6160949915647507, | |
| "epoch": 0.10093457943925234, | |
| "grad_norm": 0.06616901606321335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6125935316085815, | |
| "mean_token_accuracy": 0.7595443874597549, | |
| "num_tokens": 439799.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.6129452586174011, | |
| "epoch": 0.10467289719626169, | |
| "grad_norm": 0.05841955915093422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.608031153678894, | |
| "mean_token_accuracy": 0.7601521760225296, | |
| "num_tokens": 456163.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.5918006747961044, | |
| "epoch": 0.10841121495327102, | |
| "grad_norm": 0.06275882571935654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5862717628479004, | |
| "mean_token_accuracy": 0.7687633484601974, | |
| "num_tokens": 472127.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.6155861914157867, | |
| "epoch": 0.11214953271028037, | |
| "grad_norm": 0.06225947290658951, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6084246635437012, | |
| "mean_token_accuracy": 0.7598295211791992, | |
| "num_tokens": 488332.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.6035004556179047, | |
| "epoch": 0.11588785046728972, | |
| "grad_norm": 0.06444618105888367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5935206413269043, | |
| "mean_token_accuracy": 0.7651257067918777, | |
| "num_tokens": 504710.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.6106846928596497, | |
| "epoch": 0.11962616822429907, | |
| "grad_norm": 0.0602172389626503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5996757745742798, | |
| "mean_token_accuracy": 0.760893777012825, | |
| "num_tokens": 521082.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5945021361112595, | |
| "epoch": 0.1233644859813084, | |
| "grad_norm": 0.06356704980134964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5945574045181274, | |
| "mean_token_accuracy": 0.765913113951683, | |
| "num_tokens": 537475.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5772300958633423, | |
| "epoch": 0.12710280373831775, | |
| "grad_norm": 0.06089172512292862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5904273986816406, | |
| "mean_token_accuracy": 0.76410873234272, | |
| "num_tokens": 553508.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.550044596195221, | |
| "epoch": 0.1308411214953271, | |
| "grad_norm": 0.06109277158975601, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613641142845154, | |
| "mean_token_accuracy": 0.7737480998039246, | |
| "num_tokens": 569417.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.5723532140254974, | |
| "epoch": 0.13457943925233645, | |
| "grad_norm": 0.05618736520409584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5777797698974609, | |
| "mean_token_accuracy": 0.7723707407712936, | |
| "num_tokens": 585786.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.580461397767067, | |
| "epoch": 0.1383177570093458, | |
| "grad_norm": 0.05472671613097191, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5808417201042175, | |
| "mean_token_accuracy": 0.7668861597776413, | |
| "num_tokens": 602132.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5738302320241928, | |
| "epoch": 0.14205607476635515, | |
| "grad_norm": 0.06117068976163864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.57148277759552, | |
| "mean_token_accuracy": 0.774108350276947, | |
| "num_tokens": 618157.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5823365598917007, | |
| "epoch": 0.14579439252336449, | |
| "grad_norm": 0.05150913447141647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5758525729179382, | |
| "mean_token_accuracy": 0.7670020014047623, | |
| "num_tokens": 634401.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5688591003417969, | |
| "epoch": 0.14953271028037382, | |
| "grad_norm": 0.054129600524902344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5642731189727783, | |
| "mean_token_accuracy": 0.7723482251167297, | |
| "num_tokens": 650471.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5930688679218292, | |
| "epoch": 0.15327102803738318, | |
| "grad_norm": 0.04651381075382233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5882899761199951, | |
| "mean_token_accuracy": 0.7660222053527832, | |
| "num_tokens": 667141.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5680070519447327, | |
| "epoch": 0.15700934579439252, | |
| "grad_norm": 0.04372819885611534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5683910846710205, | |
| "mean_token_accuracy": 0.7714007496833801, | |
| "num_tokens": 683716.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.577846348285675, | |
| "epoch": 0.16074766355140188, | |
| "grad_norm": 0.050794582813978195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5828132033348083, | |
| "mean_token_accuracy": 0.7683440744876862, | |
| "num_tokens": 700166.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5514896064996719, | |
| "epoch": 0.16448598130841122, | |
| "grad_norm": 0.05992089584469795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563271701335907, | |
| "mean_token_accuracy": 0.7739104330539703, | |
| "num_tokens": 716342.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.575609490275383, | |
| "epoch": 0.16822429906542055, | |
| "grad_norm": 0.05013341084122658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5849894285202026, | |
| "mean_token_accuracy": 0.7635113149881363, | |
| "num_tokens": 732893.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5762993842363358, | |
| "epoch": 0.17196261682242991, | |
| "grad_norm": 0.048744700849056244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.574410080909729, | |
| "mean_token_accuracy": 0.7676838040351868, | |
| "num_tokens": 749295.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5723859369754791, | |
| "epoch": 0.17570093457943925, | |
| "grad_norm": 0.05009591579437256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5668792128562927, | |
| "mean_token_accuracy": 0.7715302407741547, | |
| "num_tokens": 765549.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5764475762844086, | |
| "epoch": 0.17943925233644858, | |
| "grad_norm": 0.04878581687808037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5665475130081177, | |
| "mean_token_accuracy": 0.7720314264297485, | |
| "num_tokens": 781843.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5845135897397995, | |
| "epoch": 0.18317757009345795, | |
| "grad_norm": 0.04589271917939186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5771698951721191, | |
| "mean_token_accuracy": 0.7694474011659622, | |
| "num_tokens": 798405.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.569475919008255, | |
| "epoch": 0.18691588785046728, | |
| "grad_norm": 0.04119531437754631, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5674958229064941, | |
| "mean_token_accuracy": 0.7736699432134628, | |
| "num_tokens": 814777.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5692360401153564, | |
| "epoch": 0.19065420560747665, | |
| "grad_norm": 0.0399826280772686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5702151656150818, | |
| "mean_token_accuracy": 0.7684639543294907, | |
| "num_tokens": 831134.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5498989522457123, | |
| "epoch": 0.19439252336448598, | |
| "grad_norm": 0.05800061300396919, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609486699104309, | |
| "mean_token_accuracy": 0.7740016728639603, | |
| "num_tokens": 847344.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5662340968847275, | |
| "epoch": 0.19813084112149532, | |
| "grad_norm": 0.047494642436504364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5766743421554565, | |
| "mean_token_accuracy": 0.7678139507770538, | |
| "num_tokens": 863618.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.5752062201499939, | |
| "epoch": 0.20186915887850468, | |
| "grad_norm": 0.05196239426732063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5726749300956726, | |
| "mean_token_accuracy": 0.7699306309223175, | |
| "num_tokens": 879844.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5600160509347916, | |
| "epoch": 0.205607476635514, | |
| "grad_norm": 0.04689890146255493, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549072623252869, | |
| "mean_token_accuracy": 0.7740037143230438, | |
| "num_tokens": 896085.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5731441378593445, | |
| "epoch": 0.20934579439252338, | |
| "grad_norm": 0.04465720057487488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5675906538963318, | |
| "mean_token_accuracy": 0.7729700356721878, | |
| "num_tokens": 912450.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5865043848752975, | |
| "epoch": 0.2130841121495327, | |
| "grad_norm": 0.03869406878948212, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5814957022666931, | |
| "mean_token_accuracy": 0.7672637850046158, | |
| "num_tokens": 928895.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.5643806457519531, | |
| "epoch": 0.21682242990654205, | |
| "grad_norm": 0.03822167217731476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589677691459656, | |
| "mean_token_accuracy": 0.7748006731271744, | |
| "num_tokens": 945239.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5692119598388672, | |
| "epoch": 0.2205607476635514, | |
| "grad_norm": 0.042791273444890976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5735809206962585, | |
| "mean_token_accuracy": 0.7694528251886368, | |
| "num_tokens": 961363.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5709938555955887, | |
| "epoch": 0.22429906542056074, | |
| "grad_norm": 0.04215843975543976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5765149593353271, | |
| "mean_token_accuracy": 0.7663712352514267, | |
| "num_tokens": 977455.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5535417348146439, | |
| "epoch": 0.22803738317757008, | |
| "grad_norm": 0.046243466436862946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625483989715576, | |
| "mean_token_accuracy": 0.7734335362911224, | |
| "num_tokens": 993620.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5543283224105835, | |
| "epoch": 0.23177570093457944, | |
| "grad_norm": 0.0379357784986496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572807788848877, | |
| "mean_token_accuracy": 0.7759047746658325, | |
| "num_tokens": 1009834.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5534257739782333, | |
| "epoch": 0.23551401869158878, | |
| "grad_norm": 0.03617486730217934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538501739501953, | |
| "mean_token_accuracy": 0.7762316316366196, | |
| "num_tokens": 1025981.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5688228756189346, | |
| "epoch": 0.23925233644859814, | |
| "grad_norm": 0.03479798510670662, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626167058944702, | |
| "mean_token_accuracy": 0.7745891660451889, | |
| "num_tokens": 1042596.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5704841166734695, | |
| "epoch": 0.24299065420560748, | |
| "grad_norm": 0.04157167300581932, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568891704082489, | |
| "mean_token_accuracy": 0.7680116444826126, | |
| "num_tokens": 1058884.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5743043571710587, | |
| "epoch": 0.2467289719626168, | |
| "grad_norm": 0.03632580116391182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5711199045181274, | |
| "mean_token_accuracy": 0.769555926322937, | |
| "num_tokens": 1075319.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.559576690196991, | |
| "epoch": 0.2504672897196262, | |
| "grad_norm": 0.038374125957489014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629530549049377, | |
| "mean_token_accuracy": 0.771178126335144, | |
| "num_tokens": 1091451.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5548212379217148, | |
| "epoch": 0.2542056074766355, | |
| "grad_norm": 0.03802485764026642, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578069686889648, | |
| "mean_token_accuracy": 0.7767467051744461, | |
| "num_tokens": 1107549.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5731668472290039, | |
| "epoch": 0.25794392523364484, | |
| "grad_norm": 0.03902502730488777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5750908255577087, | |
| "mean_token_accuracy": 0.7706117182970047, | |
| "num_tokens": 1123904.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5669015496969223, | |
| "epoch": 0.2616822429906542, | |
| "grad_norm": 0.03905792534351349, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5693663358688354, | |
| "mean_token_accuracy": 0.7708643227815628, | |
| "num_tokens": 1139931.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5661756098270416, | |
| "epoch": 0.26542056074766357, | |
| "grad_norm": 0.04826045408844948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5717936754226685, | |
| "mean_token_accuracy": 0.7682332992553711, | |
| "num_tokens": 1156090.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.568753570318222, | |
| "epoch": 0.2691588785046729, | |
| "grad_norm": 0.03873279318213463, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5717485547065735, | |
| "mean_token_accuracy": 0.7686503529548645, | |
| "num_tokens": 1172312.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5719727724790573, | |
| "epoch": 0.27289719626168224, | |
| "grad_norm": 0.039684589952230453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.565541684627533, | |
| "mean_token_accuracy": 0.769890546798706, | |
| "num_tokens": 1188846.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5802080780267715, | |
| "epoch": 0.2766355140186916, | |
| "grad_norm": 0.03692556545138359, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5813108682632446, | |
| "mean_token_accuracy": 0.7652633637189865, | |
| "num_tokens": 1205115.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5709390044212341, | |
| "epoch": 0.2803738317757009, | |
| "grad_norm": 0.03715148940682411, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5739152431488037, | |
| "mean_token_accuracy": 0.7695163637399673, | |
| "num_tokens": 1221457.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.5634023249149323, | |
| "epoch": 0.2841121495327103, | |
| "grad_norm": 0.035052694380283356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634779334068298, | |
| "mean_token_accuracy": 0.7735425382852554, | |
| "num_tokens": 1237852.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5770431756973267, | |
| "epoch": 0.28785046728971964, | |
| "grad_norm": 0.04037750884890556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5792219042778015, | |
| "mean_token_accuracy": 0.7656148821115494, | |
| "num_tokens": 1253991.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5483120232820511, | |
| "epoch": 0.29158878504672897, | |
| "grad_norm": 0.04199967905879021, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473575592041016, | |
| "mean_token_accuracy": 0.7797968685626984, | |
| "num_tokens": 1270154.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.5623519718647003, | |
| "epoch": 0.2953271028037383, | |
| "grad_norm": 0.04001434147357941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5669924020767212, | |
| "mean_token_accuracy": 0.7740958780050278, | |
| "num_tokens": 1286373.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5505794137716293, | |
| "epoch": 0.29906542056074764, | |
| "grad_norm": 0.039846453815698624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637381076812744, | |
| "mean_token_accuracy": 0.7710813283920288, | |
| "num_tokens": 1302910.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.573449894785881, | |
| "epoch": 0.30280373831775703, | |
| "grad_norm": 0.03970034047961235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5817972421646118, | |
| "mean_token_accuracy": 0.767284482717514, | |
| "num_tokens": 1319105.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5815064907073975, | |
| "epoch": 0.30654205607476637, | |
| "grad_norm": 0.036917295306921005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5764390826225281, | |
| "mean_token_accuracy": 0.7660059034824371, | |
| "num_tokens": 1335418.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5537111163139343, | |
| "epoch": 0.3102803738317757, | |
| "grad_norm": 0.038016658276319504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544030487537384, | |
| "mean_token_accuracy": 0.780098170042038, | |
| "num_tokens": 1351471.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5532083511352539, | |
| "epoch": 0.31401869158878504, | |
| "grad_norm": 0.03766188770532608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543038010597229, | |
| "mean_token_accuracy": 0.7815051227807999, | |
| "num_tokens": 1367729.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.569915771484375, | |
| "epoch": 0.3177570093457944, | |
| "grad_norm": 0.03935057669878006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5673943758010864, | |
| "mean_token_accuracy": 0.7705481499433517, | |
| "num_tokens": 1384218.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5557460188865662, | |
| "epoch": 0.32149532710280376, | |
| "grad_norm": 0.0382615365087986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650104284286499, | |
| "mean_token_accuracy": 0.7701956182718277, | |
| "num_tokens": 1400496.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5529367923736572, | |
| "epoch": 0.3252336448598131, | |
| "grad_norm": 0.03607897832989693, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5612208843231201, | |
| "mean_token_accuracy": 0.773573562502861, | |
| "num_tokens": 1416728.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5617222934961319, | |
| "epoch": 0.32897196261682243, | |
| "grad_norm": 0.0373239666223526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5661642551422119, | |
| "mean_token_accuracy": 0.7711510807275772, | |
| "num_tokens": 1433091.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.55742546916008, | |
| "epoch": 0.33271028037383177, | |
| "grad_norm": 0.03938078507781029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600550770759583, | |
| "mean_token_accuracy": 0.7730235010385513, | |
| "num_tokens": 1449246.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5685389190912247, | |
| "epoch": 0.3364485981308411, | |
| "grad_norm": 0.040714140981435776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5676398873329163, | |
| "mean_token_accuracy": 0.7700921297073364, | |
| "num_tokens": 1465805.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5774114727973938, | |
| "epoch": 0.3401869158878505, | |
| "grad_norm": 0.03398137167096138, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5775306224822998, | |
| "mean_token_accuracy": 0.7659128755331039, | |
| "num_tokens": 1482298.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5467455387115479, | |
| "epoch": 0.34392523364485983, | |
| "grad_norm": 0.032925065606832504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481046438217163, | |
| "mean_token_accuracy": 0.7773325145244598, | |
| "num_tokens": 1498536.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5445878356695175, | |
| "epoch": 0.34766355140186916, | |
| "grad_norm": 0.03473861888051033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424526929855347, | |
| "mean_token_accuracy": 0.7816839218139648, | |
| "num_tokens": 1514823.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5637122839689255, | |
| "epoch": 0.3514018691588785, | |
| "grad_norm": 0.03804982081055641, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5646781325340271, | |
| "mean_token_accuracy": 0.7692969292402267, | |
| "num_tokens": 1531148.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5571535974740982, | |
| "epoch": 0.35514018691588783, | |
| "grad_norm": 0.03457267954945564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619444251060486, | |
| "mean_token_accuracy": 0.7773198187351227, | |
| "num_tokens": 1547476.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5707617700099945, | |
| "epoch": 0.35887850467289717, | |
| "grad_norm": 0.03933979198336601, | |
| "learning_rate": 0.0002, | |
| "loss": 0.572324275970459, | |
| "mean_token_accuracy": 0.7692963778972626, | |
| "num_tokens": 1563979.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.556370198726654, | |
| "epoch": 0.36261682242990656, | |
| "grad_norm": 0.03271894529461861, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558284521102905, | |
| "mean_token_accuracy": 0.7744213789701462, | |
| "num_tokens": 1580311.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5528354942798615, | |
| "epoch": 0.3663551401869159, | |
| "grad_norm": 0.03302107751369476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553282499313354, | |
| "mean_token_accuracy": 0.77690689265728, | |
| "num_tokens": 1596402.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5531659126281738, | |
| "epoch": 0.37009345794392523, | |
| "grad_norm": 0.03468908742070198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576953887939453, | |
| "mean_token_accuracy": 0.7762762904167175, | |
| "num_tokens": 1612430.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5810890346765518, | |
| "epoch": 0.37383177570093457, | |
| "grad_norm": 0.03342665359377861, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5769139528274536, | |
| "mean_token_accuracy": 0.7672095000743866, | |
| "num_tokens": 1628891.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5750298053026199, | |
| "epoch": 0.3775700934579439, | |
| "grad_norm": 0.03441772237420082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5772010087966919, | |
| "mean_token_accuracy": 0.7646144926548004, | |
| "num_tokens": 1645047.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5650183409452438, | |
| "epoch": 0.3813084112149533, | |
| "grad_norm": 0.03096170350909233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606149435043335, | |
| "mean_token_accuracy": 0.7738576829433441, | |
| "num_tokens": 1661380.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5494536608457565, | |
| "epoch": 0.3850467289719626, | |
| "grad_norm": 0.03677360713481903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568496584892273, | |
| "mean_token_accuracy": 0.775225818157196, | |
| "num_tokens": 1677541.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.5550926774740219, | |
| "epoch": 0.38878504672897196, | |
| "grad_norm": 0.03032948076725006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558656632900238, | |
| "mean_token_accuracy": 0.7753722071647644, | |
| "num_tokens": 1693849.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5538856834173203, | |
| "epoch": 0.3925233644859813, | |
| "grad_norm": 0.033197011798620224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585562586784363, | |
| "mean_token_accuracy": 0.7750265747308731, | |
| "num_tokens": 1710410.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.557091012597084, | |
| "epoch": 0.39626168224299063, | |
| "grad_norm": 0.03343191742897034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658184885978699, | |
| "mean_token_accuracy": 0.7713737785816193, | |
| "num_tokens": 1726519.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.573070839047432, | |
| "epoch": 0.4, | |
| "grad_norm": 0.03520960360765457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5683936476707458, | |
| "mean_token_accuracy": 0.7706228792667389, | |
| "num_tokens": 1742802.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5730053037405014, | |
| "epoch": 0.40373831775700936, | |
| "grad_norm": 0.032127268612384796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5697438716888428, | |
| "mean_token_accuracy": 0.7664725631475449, | |
| "num_tokens": 1759059.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5633453279733658, | |
| "epoch": 0.4074766355140187, | |
| "grad_norm": 0.03088793158531189, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5599843263626099, | |
| "mean_token_accuracy": 0.7760611772537231, | |
| "num_tokens": 1775536.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.550876572728157, | |
| "epoch": 0.411214953271028, | |
| "grad_norm": 0.032173894345760345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552717387676239, | |
| "mean_token_accuracy": 0.7752785235643387, | |
| "num_tokens": 1792019.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5721830427646637, | |
| "epoch": 0.41495327102803736, | |
| "grad_norm": 0.033584315329790115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5759853720664978, | |
| "mean_token_accuracy": 0.7664880454540253, | |
| "num_tokens": 1808419.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.5759546905755997, | |
| "epoch": 0.41869158878504675, | |
| "grad_norm": 0.03846940025687218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5841522216796875, | |
| "mean_token_accuracy": 0.7626957893371582, | |
| "num_tokens": 1824543.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5635320693254471, | |
| "epoch": 0.4224299065420561, | |
| "grad_norm": 0.03328083083033562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629671812057495, | |
| "mean_token_accuracy": 0.7737283408641815, | |
| "num_tokens": 1840757.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5591580420732498, | |
| "epoch": 0.4261682242990654, | |
| "grad_norm": 0.0327068492770195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551184356212616, | |
| "mean_token_accuracy": 0.7753513604402542, | |
| "num_tokens": 1857132.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.5579714924097061, | |
| "epoch": 0.42990654205607476, | |
| "grad_norm": 0.0334380678832531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555400252342224, | |
| "mean_token_accuracy": 0.7759147882461548, | |
| "num_tokens": 1873360.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5697025954723358, | |
| "epoch": 0.4336448598130841, | |
| "grad_norm": 0.03651506081223488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568575382232666, | |
| "mean_token_accuracy": 0.7692690938711166, | |
| "num_tokens": 1889933.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5710670948028564, | |
| "epoch": 0.4373831775700935, | |
| "grad_norm": 0.03260137885808945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5754102468490601, | |
| "mean_token_accuracy": 0.7645916491746902, | |
| "num_tokens": 1906415.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5612241625785828, | |
| "epoch": 0.4411214953271028, | |
| "grad_norm": 0.030186068266630173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625964403152466, | |
| "mean_token_accuracy": 0.7733658254146576, | |
| "num_tokens": 1922692.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5558670610189438, | |
| "epoch": 0.44485981308411215, | |
| "grad_norm": 0.0367811918258667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577695965766907, | |
| "mean_token_accuracy": 0.772549107670784, | |
| "num_tokens": 1939001.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5691811889410019, | |
| "epoch": 0.4485981308411215, | |
| "grad_norm": 0.03843454644083977, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5703588128089905, | |
| "mean_token_accuracy": 0.7689766734838486, | |
| "num_tokens": 1955537.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5652327984571457, | |
| "epoch": 0.4523364485981308, | |
| "grad_norm": 0.032110750675201416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627662539482117, | |
| "mean_token_accuracy": 0.7731665819883347, | |
| "num_tokens": 1971820.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5414326637983322, | |
| "epoch": 0.45607476635514016, | |
| "grad_norm": 0.031934358179569244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432534217834473, | |
| "mean_token_accuracy": 0.7791064232587814, | |
| "num_tokens": 1988118.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5502553433179855, | |
| "epoch": 0.45981308411214955, | |
| "grad_norm": 0.035253144800662994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521403551101685, | |
| "mean_token_accuracy": 0.7760459184646606, | |
| "num_tokens": 2004642.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5582242012023926, | |
| "epoch": 0.4635514018691589, | |
| "grad_norm": 0.035558655858039856, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5682451725006104, | |
| "mean_token_accuracy": 0.7699540108442307, | |
| "num_tokens": 2020965.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5626089125871658, | |
| "epoch": 0.4672897196261682, | |
| "grad_norm": 0.028148163110017776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638797283172607, | |
| "mean_token_accuracy": 0.7697459608316422, | |
| "num_tokens": 2037202.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5653271377086639, | |
| "epoch": 0.47102803738317756, | |
| "grad_norm": 0.03597045689821243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5635451078414917, | |
| "mean_token_accuracy": 0.7696232795715332, | |
| "num_tokens": 2053309.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.560562789440155, | |
| "epoch": 0.4747663551401869, | |
| "grad_norm": 0.03047817200422287, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625080466270447, | |
| "mean_token_accuracy": 0.7718035280704498, | |
| "num_tokens": 2069535.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.5554249584674835, | |
| "epoch": 0.4785046728971963, | |
| "grad_norm": 0.028741145506501198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504335165023804, | |
| "mean_token_accuracy": 0.7771810442209244, | |
| "num_tokens": 2085763.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5567069947719574, | |
| "epoch": 0.4822429906542056, | |
| "grad_norm": 0.031639862805604935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562032461166382, | |
| "mean_token_accuracy": 0.7760691046714783, | |
| "num_tokens": 2102046.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5418022572994232, | |
| "epoch": 0.48598130841121495, | |
| "grad_norm": 0.03434485197067261, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446175932884216, | |
| "mean_token_accuracy": 0.7789350152015686, | |
| "num_tokens": 2118239.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.5367967188358307, | |
| "epoch": 0.4897196261682243, | |
| "grad_norm": 0.03757743164896965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414644479751587, | |
| "mean_token_accuracy": 0.7816939055919647, | |
| "num_tokens": 2134627.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5399434715509415, | |
| "epoch": 0.4934579439252336, | |
| "grad_norm": 0.03444533050060272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489372611045837, | |
| "mean_token_accuracy": 0.7746081054210663, | |
| "num_tokens": 2150944.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5634311139583588, | |
| "epoch": 0.497196261682243, | |
| "grad_norm": 0.028091201558709145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653581619262695, | |
| "mean_token_accuracy": 0.7713855057954788, | |
| "num_tokens": 2167218.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5568374693393707, | |
| "epoch": 0.5009345794392523, | |
| "grad_norm": 0.029833409935235977, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585245490074158, | |
| "mean_token_accuracy": 0.7745143622159958, | |
| "num_tokens": 2183449.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5839870423078537, | |
| "epoch": 0.5046728971962616, | |
| "grad_norm": 0.03770853579044342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5719978213310242, | |
| "mean_token_accuracy": 0.7675238102674484, | |
| "num_tokens": 2199875.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5689375847578049, | |
| "epoch": 0.508411214953271, | |
| "grad_norm": 0.03635553643107414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626992583274841, | |
| "mean_token_accuracy": 0.7723798751831055, | |
| "num_tokens": 2216163.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.5507294833660126, | |
| "epoch": 0.5121495327102804, | |
| "grad_norm": 0.03596559911966324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608267188072205, | |
| "mean_token_accuracy": 0.7710549086332321, | |
| "num_tokens": 2232636.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5623424351215363, | |
| "epoch": 0.5158878504672897, | |
| "grad_norm": 0.033818867057561874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5718593597412109, | |
| "mean_token_accuracy": 0.7696182578802109, | |
| "num_tokens": 2248825.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5675409585237503, | |
| "epoch": 0.5196261682242991, | |
| "grad_norm": 0.03331133350729942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5714356899261475, | |
| "mean_token_accuracy": 0.7693182229995728, | |
| "num_tokens": 2265359.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5522013902664185, | |
| "epoch": 0.5233644859813084, | |
| "grad_norm": 0.03208749741315842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529259443283081, | |
| "mean_token_accuracy": 0.7765516042709351, | |
| "num_tokens": 2281629.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5493837893009186, | |
| "epoch": 0.5271028037383177, | |
| "grad_norm": 0.0305814016610384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490883588790894, | |
| "mean_token_accuracy": 0.7763204425573349, | |
| "num_tokens": 2297908.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5564678907394409, | |
| "epoch": 0.5308411214953271, | |
| "grad_norm": 0.034225739538669586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602461099624634, | |
| "mean_token_accuracy": 0.7709554880857468, | |
| "num_tokens": 2314115.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5697164833545685, | |
| "epoch": 0.5345794392523364, | |
| "grad_norm": 0.03395864740014076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5692602396011353, | |
| "mean_token_accuracy": 0.766906350851059, | |
| "num_tokens": 2330462.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5691278576850891, | |
| "epoch": 0.5383177570093458, | |
| "grad_norm": 0.03194013983011246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562545657157898, | |
| "mean_token_accuracy": 0.7723768651485443, | |
| "num_tokens": 2346630.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.558807983994484, | |
| "epoch": 0.5420560747663551, | |
| "grad_norm": 0.036789294332265854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632457733154297, | |
| "mean_token_accuracy": 0.772635355591774, | |
| "num_tokens": 2362732.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5582777112722397, | |
| "epoch": 0.5457943925233645, | |
| "grad_norm": 0.02997492626309395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614091753959656, | |
| "mean_token_accuracy": 0.7702963054180145, | |
| "num_tokens": 2379199.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5584180504083633, | |
| "epoch": 0.5495327102803739, | |
| "grad_norm": 0.033580392599105835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605478286743164, | |
| "mean_token_accuracy": 0.7730905264616013, | |
| "num_tokens": 2395497.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5477179437875748, | |
| "epoch": 0.5532710280373832, | |
| "grad_norm": 0.03941367194056511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504173636436462, | |
| "mean_token_accuracy": 0.77938412129879, | |
| "num_tokens": 2411648.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5601572096347809, | |
| "epoch": 0.5570093457943925, | |
| "grad_norm": 0.030582338571548462, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634943246841431, | |
| "mean_token_accuracy": 0.7728341221809387, | |
| "num_tokens": 2427925.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5869706571102142, | |
| "epoch": 0.5607476635514018, | |
| "grad_norm": 0.036973923444747925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5785589218139648, | |
| "mean_token_accuracy": 0.765045240521431, | |
| "num_tokens": 2444416.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5624907165765762, | |
| "epoch": 0.5644859813084112, | |
| "grad_norm": 0.036355964839458466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561196208000183, | |
| "mean_token_accuracy": 0.7752401679754257, | |
| "num_tokens": 2460808.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5570034384727478, | |
| "epoch": 0.5682242990654206, | |
| "grad_norm": 0.027923110872507095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550441145896912, | |
| "mean_token_accuracy": 0.7757884711027145, | |
| "num_tokens": 2477437.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5643865615129471, | |
| "epoch": 0.5719626168224299, | |
| "grad_norm": 0.0321192592382431, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5707546472549438, | |
| "mean_token_accuracy": 0.7692134529352188, | |
| "num_tokens": 2493966.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5535547733306885, | |
| "epoch": 0.5757009345794393, | |
| "grad_norm": 0.03465733677148819, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610126256942749, | |
| "mean_token_accuracy": 0.7733882069587708, | |
| "num_tokens": 2510442.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5411207228899002, | |
| "epoch": 0.5794392523364486, | |
| "grad_norm": 0.03268473595380783, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444988012313843, | |
| "mean_token_accuracy": 0.7791947424411774, | |
| "num_tokens": 2526738.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5539679378271103, | |
| "epoch": 0.5831775700934579, | |
| "grad_norm": 0.03345946595072746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571167469024658, | |
| "mean_token_accuracy": 0.7733618319034576, | |
| "num_tokens": 2543004.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.547135517001152, | |
| "epoch": 0.5869158878504673, | |
| "grad_norm": 0.03414901718497276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551236867904663, | |
| "mean_token_accuracy": 0.7734578996896744, | |
| "num_tokens": 2559150.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5595978051424026, | |
| "epoch": 0.5906542056074766, | |
| "grad_norm": 0.03502917289733887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5722506046295166, | |
| "mean_token_accuracy": 0.7680937796831131, | |
| "num_tokens": 2575360.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.56221604347229, | |
| "epoch": 0.594392523364486, | |
| "grad_norm": 0.036693476140499115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663124918937683, | |
| "mean_token_accuracy": 0.7699347287416458, | |
| "num_tokens": 2591749.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5489411354064941, | |
| "epoch": 0.5981308411214953, | |
| "grad_norm": 0.029823357239365578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525665879249573, | |
| "mean_token_accuracy": 0.7778102308511734, | |
| "num_tokens": 2608011.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5679098963737488, | |
| "epoch": 0.6018691588785047, | |
| "grad_norm": 0.03129269927740097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632325410842896, | |
| "mean_token_accuracy": 0.7711086720228195, | |
| "num_tokens": 2624110.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5759385228157043, | |
| "epoch": 0.6056074766355141, | |
| "grad_norm": 0.03027232177555561, | |
| "learning_rate": 0.0002, | |
| "loss": 0.566430926322937, | |
| "mean_token_accuracy": 0.7684105038642883, | |
| "num_tokens": 2640619.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5755711048841476, | |
| "epoch": 0.6093457943925233, | |
| "grad_norm": 0.02997921220958233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5693614482879639, | |
| "mean_token_accuracy": 0.7678638249635696, | |
| "num_tokens": 2656816.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5675656646490097, | |
| "epoch": 0.6130841121495327, | |
| "grad_norm": 0.02925792895257473, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5620183348655701, | |
| "mean_token_accuracy": 0.7710973769426346, | |
| "num_tokens": 2673238.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5436252653598785, | |
| "epoch": 0.616822429906542, | |
| "grad_norm": 0.030324436724185944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462444424629211, | |
| "mean_token_accuracy": 0.779330775141716, | |
| "num_tokens": 2689740.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5572406202554703, | |
| "epoch": 0.6205607476635514, | |
| "grad_norm": 0.03400828689336777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641958713531494, | |
| "mean_token_accuracy": 0.7692032605409622, | |
| "num_tokens": 2706162.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.554596871137619, | |
| "epoch": 0.6242990654205608, | |
| "grad_norm": 0.03054538182914257, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556669294834137, | |
| "mean_token_accuracy": 0.7765887379646301, | |
| "num_tokens": 2722464.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.5644665658473969, | |
| "epoch": 0.6280373831775701, | |
| "grad_norm": 0.03194966912269592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5671694278717041, | |
| "mean_token_accuracy": 0.7694765031337738, | |
| "num_tokens": 2738958.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5491771847009659, | |
| "epoch": 0.6317757009345795, | |
| "grad_norm": 0.03178941458463669, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497304797172546, | |
| "mean_token_accuracy": 0.7750105261802673, | |
| "num_tokens": 2755355.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5742185562849045, | |
| "epoch": 0.6355140186915887, | |
| "grad_norm": 0.027454091235995293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5754401683807373, | |
| "mean_token_accuracy": 0.7658552527427673, | |
| "num_tokens": 2771556.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5589788407087326, | |
| "epoch": 0.6392523364485981, | |
| "grad_norm": 0.029149651527404785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554992139339447, | |
| "mean_token_accuracy": 0.7758396863937378, | |
| "num_tokens": 2787760.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5677189081907272, | |
| "epoch": 0.6429906542056075, | |
| "grad_norm": 0.03037264011800289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637961626052856, | |
| "mean_token_accuracy": 0.7705356478691101, | |
| "num_tokens": 2803802.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5565283447504044, | |
| "epoch": 0.6467289719626168, | |
| "grad_norm": 0.03331301361322403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568943023681641, | |
| "mean_token_accuracy": 0.77414271235466, | |
| "num_tokens": 2820371.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5312813073396683, | |
| "epoch": 0.6504672897196262, | |
| "grad_norm": 0.03152315691113472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5355879664421082, | |
| "mean_token_accuracy": 0.785700336098671, | |
| "num_tokens": 2836694.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5379063338041306, | |
| "epoch": 0.6542056074766355, | |
| "grad_norm": 0.037841469049453735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525423288345337, | |
| "mean_token_accuracy": 0.7756439745426178, | |
| "num_tokens": 2852864.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5613906681537628, | |
| "epoch": 0.6579439252336449, | |
| "grad_norm": 0.035853054374456406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655968189239502, | |
| "mean_token_accuracy": 0.7716417163610458, | |
| "num_tokens": 2869313.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5639201551675797, | |
| "epoch": 0.6616822429906543, | |
| "grad_norm": 0.026397736743092537, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627295970916748, | |
| "mean_token_accuracy": 0.7704634070396423, | |
| "num_tokens": 2885495.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5702281445264816, | |
| "epoch": 0.6654205607476635, | |
| "grad_norm": 0.03206147998571396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5647550821304321, | |
| "mean_token_accuracy": 0.7702795714139938, | |
| "num_tokens": 2901765.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5528819262981415, | |
| "epoch": 0.6691588785046729, | |
| "grad_norm": 0.03629858419299126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473611950874329, | |
| "mean_token_accuracy": 0.7778798639774323, | |
| "num_tokens": 2918124.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5617557764053345, | |
| "epoch": 0.6728971962616822, | |
| "grad_norm": 0.03116736188530922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5709046721458435, | |
| "mean_token_accuracy": 0.7677187621593475, | |
| "num_tokens": 2934418.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.544835090637207, | |
| "epoch": 0.6766355140186916, | |
| "grad_norm": 0.03548549860715866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551706552505493, | |
| "mean_token_accuracy": 0.7762557417154312, | |
| "num_tokens": 2951100.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5660403668880463, | |
| "epoch": 0.680373831775701, | |
| "grad_norm": 0.03100365214049816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5729965567588806, | |
| "mean_token_accuracy": 0.7690318375825882, | |
| "num_tokens": 2967440.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5780525356531143, | |
| "epoch": 0.6841121495327103, | |
| "grad_norm": 0.03490225970745087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5734487771987915, | |
| "mean_token_accuracy": 0.7699766159057617, | |
| "num_tokens": 2983954.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5722559094429016, | |
| "epoch": 0.6878504672897197, | |
| "grad_norm": 0.031209329143166542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663836002349854, | |
| "mean_token_accuracy": 0.7720828950405121, | |
| "num_tokens": 3000256.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5506948530673981, | |
| "epoch": 0.6915887850467289, | |
| "grad_norm": 0.029818221926689148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445064306259155, | |
| "mean_token_accuracy": 0.7804610878229141, | |
| "num_tokens": 3016740.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5661566108465195, | |
| "epoch": 0.6953271028037383, | |
| "grad_norm": 0.03627892956137657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5731881260871887, | |
| "mean_token_accuracy": 0.7681418061256409, | |
| "num_tokens": 3033200.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5561655461788177, | |
| "epoch": 0.6990654205607477, | |
| "grad_norm": 0.028912672773003578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559117317199707, | |
| "mean_token_accuracy": 0.7737248986959457, | |
| "num_tokens": 3049728.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5450099408626556, | |
| "epoch": 0.702803738317757, | |
| "grad_norm": 0.03303583338856697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467768907546997, | |
| "mean_token_accuracy": 0.7775131165981293, | |
| "num_tokens": 3066007.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5617918968200684, | |
| "epoch": 0.7065420560747664, | |
| "grad_norm": 0.035768017172813416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563019871711731, | |
| "mean_token_accuracy": 0.770862489938736, | |
| "num_tokens": 3082324.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5339331775903702, | |
| "epoch": 0.7102803738317757, | |
| "grad_norm": 0.031208420172333717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547924280166626, | |
| "mean_token_accuracy": 0.7771021723747253, | |
| "num_tokens": 3098546.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5686406493186951, | |
| "epoch": 0.7140186915887851, | |
| "grad_norm": 0.028388923034071922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5657324194908142, | |
| "mean_token_accuracy": 0.772287517786026, | |
| "num_tokens": 3114868.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5583553314208984, | |
| "epoch": 0.7177570093457943, | |
| "grad_norm": 0.027447570115327835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535703897476196, | |
| "mean_token_accuracy": 0.7759178727865219, | |
| "num_tokens": 3131210.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.5578874051570892, | |
| "epoch": 0.7214953271028037, | |
| "grad_norm": 0.033130839467048645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513507723808289, | |
| "mean_token_accuracy": 0.7747978419065475, | |
| "num_tokens": 3147445.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5491522252559662, | |
| "epoch": 0.7252336448598131, | |
| "grad_norm": 0.030513031408190727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503372550010681, | |
| "mean_token_accuracy": 0.7780584990978241, | |
| "num_tokens": 3163723.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5677588433027267, | |
| "epoch": 0.7289719626168224, | |
| "grad_norm": 0.030064091086387634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5684211850166321, | |
| "mean_token_accuracy": 0.7694611251354218, | |
| "num_tokens": 3180127.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5523021966218948, | |
| "epoch": 0.7327102803738318, | |
| "grad_norm": 0.028454501181840897, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564773082733154, | |
| "mean_token_accuracy": 0.7736252546310425, | |
| "num_tokens": 3196384.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5594403147697449, | |
| "epoch": 0.7364485981308411, | |
| "grad_norm": 0.031159594655036926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5678831934928894, | |
| "mean_token_accuracy": 0.7687141001224518, | |
| "num_tokens": 3212579.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5670231431722641, | |
| "epoch": 0.7401869158878505, | |
| "grad_norm": 0.026576390489935875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5695415735244751, | |
| "mean_token_accuracy": 0.7709443867206573, | |
| "num_tokens": 3229005.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5550480484962463, | |
| "epoch": 0.7439252336448599, | |
| "grad_norm": 0.030606523156166077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502464771270752, | |
| "mean_token_accuracy": 0.7791616022586823, | |
| "num_tokens": 3245287.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5619281828403473, | |
| "epoch": 0.7476635514018691, | |
| "grad_norm": 0.030474133789539337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586714148521423, | |
| "mean_token_accuracy": 0.7734764218330383, | |
| "num_tokens": 3261691.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5405223369598389, | |
| "epoch": 0.7514018691588785, | |
| "grad_norm": 0.032003577798604965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496760010719299, | |
| "mean_token_accuracy": 0.7761346995830536, | |
| "num_tokens": 3277743.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5539799779653549, | |
| "epoch": 0.7551401869158878, | |
| "grad_norm": 0.026676569133996964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552941560745239, | |
| "mean_token_accuracy": 0.7729017436504364, | |
| "num_tokens": 3293921.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5504231303930283, | |
| "epoch": 0.7588785046728972, | |
| "grad_norm": 0.02650677040219307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463041663169861, | |
| "mean_token_accuracy": 0.7773067653179169, | |
| "num_tokens": 3310038.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5567349493503571, | |
| "epoch": 0.7626168224299066, | |
| "grad_norm": 0.028487270697951317, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550260543823242, | |
| "mean_token_accuracy": 0.7747003883123398, | |
| "num_tokens": 3326542.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5515165776014328, | |
| "epoch": 0.7663551401869159, | |
| "grad_norm": 0.02944660186767578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483176708221436, | |
| "mean_token_accuracy": 0.7772196680307388, | |
| "num_tokens": 3342960.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5516369044780731, | |
| "epoch": 0.7700934579439253, | |
| "grad_norm": 0.02446347288787365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510342121124268, | |
| "mean_token_accuracy": 0.7753156870603561, | |
| "num_tokens": 3359361.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.562598317861557, | |
| "epoch": 0.7738317757009345, | |
| "grad_norm": 0.032002996653318405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551044344902039, | |
| "mean_token_accuracy": 0.7748953849077225, | |
| "num_tokens": 3375695.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5636338144540787, | |
| "epoch": 0.7775700934579439, | |
| "grad_norm": 0.032179221510887146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564883291721344, | |
| "mean_token_accuracy": 0.7722733914852142, | |
| "num_tokens": 3391711.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5475672632455826, | |
| "epoch": 0.7813084112149533, | |
| "grad_norm": 0.03206668421626091, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551382899284363, | |
| "mean_token_accuracy": 0.7726904302835464, | |
| "num_tokens": 3407951.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.540259450674057, | |
| "epoch": 0.7850467289719626, | |
| "grad_norm": 0.02936564013361931, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508178472518921, | |
| "mean_token_accuracy": 0.7771763801574707, | |
| "num_tokens": 3424278.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5564334988594055, | |
| "epoch": 0.788785046728972, | |
| "grad_norm": 0.03052506223320961, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5652161240577698, | |
| "mean_token_accuracy": 0.770373746752739, | |
| "num_tokens": 3440796.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5524326264858246, | |
| "epoch": 0.7925233644859813, | |
| "grad_norm": 0.025716882199048996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483862161636353, | |
| "mean_token_accuracy": 0.778383657336235, | |
| "num_tokens": 3457162.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5574807077646255, | |
| "epoch": 0.7962616822429907, | |
| "grad_norm": 0.026924515143036842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535562634468079, | |
| "mean_token_accuracy": 0.7756220400333405, | |
| "num_tokens": 3473707.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.558317020535469, | |
| "epoch": 0.8, | |
| "grad_norm": 0.025764374062418938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560704231262207, | |
| "mean_token_accuracy": 0.7712857127189636, | |
| "num_tokens": 3490125.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5554333925247192, | |
| "epoch": 0.8037383177570093, | |
| "grad_norm": 0.028298519551753998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522173643112183, | |
| "mean_token_accuracy": 0.7743871361017227, | |
| "num_tokens": 3506505.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5587067157030106, | |
| "epoch": 0.8074766355140187, | |
| "grad_norm": 0.02431626431643963, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544553995132446, | |
| "mean_token_accuracy": 0.7743324339389801, | |
| "num_tokens": 3522958.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5645765364170074, | |
| "epoch": 0.811214953271028, | |
| "grad_norm": 0.02611798420548439, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644361972808838, | |
| "mean_token_accuracy": 0.7711465507745743, | |
| "num_tokens": 3539490.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5525356978178024, | |
| "epoch": 0.8149532710280374, | |
| "grad_norm": 0.03383297845721245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598211884498596, | |
| "mean_token_accuracy": 0.7742004096508026, | |
| "num_tokens": 3555746.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5621150583028793, | |
| "epoch": 0.8186915887850468, | |
| "grad_norm": 0.030269736424088478, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634778738021851, | |
| "mean_token_accuracy": 0.7692747861146927, | |
| "num_tokens": 3572256.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5514157265424728, | |
| "epoch": 0.822429906542056, | |
| "grad_norm": 0.028750412166118622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467870831489563, | |
| "mean_token_accuracy": 0.7769519984722137, | |
| "num_tokens": 3588550.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5368104577064514, | |
| "epoch": 0.8261682242990654, | |
| "grad_norm": 0.03091045655310154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372405648231506, | |
| "mean_token_accuracy": 0.7840253859758377, | |
| "num_tokens": 3604659.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5409716814756393, | |
| "epoch": 0.8299065420560747, | |
| "grad_norm": 0.03386515751481056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548212468624115, | |
| "mean_token_accuracy": 0.7736510932445526, | |
| "num_tokens": 3620843.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5629084706306458, | |
| "epoch": 0.8336448598130841, | |
| "grad_norm": 0.040728501975536346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5746021270751953, | |
| "mean_token_accuracy": 0.7647373080253601, | |
| "num_tokens": 3637324.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5369234085083008, | |
| "epoch": 0.8373831775700935, | |
| "grad_norm": 0.029392162337899208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397970080375671, | |
| "mean_token_accuracy": 0.7819121479988098, | |
| "num_tokens": 3653633.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5768532902002335, | |
| "epoch": 0.8411214953271028, | |
| "grad_norm": 0.033986181020736694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5701450109481812, | |
| "mean_token_accuracy": 0.7669256031513214, | |
| "num_tokens": 3670158.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5465534925460815, | |
| "epoch": 0.8448598130841122, | |
| "grad_norm": 0.034689608961343765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539010226726532, | |
| "mean_token_accuracy": 0.7829751968383789, | |
| "num_tokens": 3686415.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5669656842947006, | |
| "epoch": 0.8485981308411215, | |
| "grad_norm": 0.029157601296901703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645594596862793, | |
| "mean_token_accuracy": 0.7721282690763474, | |
| "num_tokens": 3702620.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5713803917169571, | |
| "epoch": 0.8523364485981308, | |
| "grad_norm": 0.032975275069475174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5758609771728516, | |
| "mean_token_accuracy": 0.7657817453145981, | |
| "num_tokens": 3719219.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5463247001171112, | |
| "epoch": 0.8560747663551402, | |
| "grad_norm": 0.039444658905267715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534209609031677, | |
| "mean_token_accuracy": 0.7726487815380096, | |
| "num_tokens": 3735438.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.556586429476738, | |
| "epoch": 0.8598130841121495, | |
| "grad_norm": 0.02616702765226364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549170970916748, | |
| "mean_token_accuracy": 0.7752689123153687, | |
| "num_tokens": 3751785.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5389135032892227, | |
| "epoch": 0.8635514018691589, | |
| "grad_norm": 0.03276278078556061, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399537086486816, | |
| "mean_token_accuracy": 0.781702533364296, | |
| "num_tokens": 3767826.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5364359021186829, | |
| "epoch": 0.8672897196261682, | |
| "grad_norm": 0.026118800044059753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382952094078064, | |
| "mean_token_accuracy": 0.780514121055603, | |
| "num_tokens": 3783919.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5687360912561417, | |
| "epoch": 0.8710280373831776, | |
| "grad_norm": 0.03209976479411125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5756676197052002, | |
| "mean_token_accuracy": 0.7664439678192139, | |
| "num_tokens": 3800454.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5679410099983215, | |
| "epoch": 0.874766355140187, | |
| "grad_norm": 0.025931114330887794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5656247138977051, | |
| "mean_token_accuracy": 0.7693636864423752, | |
| "num_tokens": 3816747.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.557420089840889, | |
| "epoch": 0.8785046728971962, | |
| "grad_norm": 0.02894972637295723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490383505821228, | |
| "mean_token_accuracy": 0.7750599384307861, | |
| "num_tokens": 3833058.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.560372844338417, | |
| "epoch": 0.8822429906542056, | |
| "grad_norm": 0.03646957501769066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596282482147217, | |
| "mean_token_accuracy": 0.7726272940635681, | |
| "num_tokens": 3849415.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5550010055303574, | |
| "epoch": 0.8859813084112149, | |
| "grad_norm": 0.026594942435622215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539083480834961, | |
| "mean_token_accuracy": 0.7734427750110626, | |
| "num_tokens": 3865776.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5347648710012436, | |
| "epoch": 0.8897196261682243, | |
| "grad_norm": 0.03385410085320473, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472573041915894, | |
| "mean_token_accuracy": 0.7766564786434174, | |
| "num_tokens": 3882018.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5376404300332069, | |
| "epoch": 0.8934579439252337, | |
| "grad_norm": 0.040597062557935715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544540286064148, | |
| "mean_token_accuracy": 0.7728734314441681, | |
| "num_tokens": 3898287.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.5667798519134521, | |
| "epoch": 0.897196261682243, | |
| "grad_norm": 0.027665674686431885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663026571273804, | |
| "mean_token_accuracy": 0.770405575633049, | |
| "num_tokens": 3914775.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.550272524356842, | |
| "epoch": 0.9009345794392524, | |
| "grad_norm": 0.029484877362847328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427078008651733, | |
| "mean_token_accuracy": 0.7818168848752975, | |
| "num_tokens": 3930889.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5710694193840027, | |
| "epoch": 0.9046728971962616, | |
| "grad_norm": 0.027631685137748718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561673641204834, | |
| "mean_token_accuracy": 0.7728846818208694, | |
| "num_tokens": 3947233.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5513755828142166, | |
| "epoch": 0.908411214953271, | |
| "grad_norm": 0.030272630974650383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467454195022583, | |
| "mean_token_accuracy": 0.7779553532600403, | |
| "num_tokens": 3963468.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5469895005226135, | |
| "epoch": 0.9121495327102803, | |
| "grad_norm": 0.03090892918407917, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560286045074463, | |
| "mean_token_accuracy": 0.7723891735076904, | |
| "num_tokens": 3979910.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5544413626194, | |
| "epoch": 0.9158878504672897, | |
| "grad_norm": 0.041499219834804535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5768874883651733, | |
| "mean_token_accuracy": 0.7659346610307693, | |
| "num_tokens": 3996196.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5447600036859512, | |
| "epoch": 0.9196261682242991, | |
| "grad_norm": 0.03076878748834133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456743836402893, | |
| "mean_token_accuracy": 0.7770105451345444, | |
| "num_tokens": 4012511.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5538895577192307, | |
| "epoch": 0.9233644859813084, | |
| "grad_norm": 0.03173721581697464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483969449996948, | |
| "mean_token_accuracy": 0.7781166434288025, | |
| "num_tokens": 4028651.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5794132798910141, | |
| "epoch": 0.9271028037383178, | |
| "grad_norm": 0.0297909714281559, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5648066401481628, | |
| "mean_token_accuracy": 0.7718619257211685, | |
| "num_tokens": 4045251.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5547907501459122, | |
| "epoch": 0.930841121495327, | |
| "grad_norm": 0.03679649531841278, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462634563446045, | |
| "mean_token_accuracy": 0.7801699191331863, | |
| "num_tokens": 4061348.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5539078116416931, | |
| "epoch": 0.9345794392523364, | |
| "grad_norm": 0.02851703390479088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593677163124084, | |
| "mean_token_accuracy": 0.7756806910037994, | |
| "num_tokens": 4077453.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5443865954875946, | |
| "epoch": 0.9383177570093458, | |
| "grad_norm": 0.030135581269860268, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505210161209106, | |
| "mean_token_accuracy": 0.7767539322376251, | |
| "num_tokens": 4093944.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5541698932647705, | |
| "epoch": 0.9420560747663551, | |
| "grad_norm": 0.03800193592905998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603746175765991, | |
| "mean_token_accuracy": 0.7716375887393951, | |
| "num_tokens": 4110397.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5497024953365326, | |
| "epoch": 0.9457943925233645, | |
| "grad_norm": 0.030841615051031113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577483177185059, | |
| "mean_token_accuracy": 0.776105210185051, | |
| "num_tokens": 4126788.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5452855974435806, | |
| "epoch": 0.9495327102803738, | |
| "grad_norm": 0.027110353112220764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468145608901978, | |
| "mean_token_accuracy": 0.7746452689170837, | |
| "num_tokens": 4143252.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5483012199401855, | |
| "epoch": 0.9532710280373832, | |
| "grad_norm": 0.02763090282678604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542940616607666, | |
| "mean_token_accuracy": 0.7776369601488113, | |
| "num_tokens": 4159556.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5598485320806503, | |
| "epoch": 0.9570093457943926, | |
| "grad_norm": 0.02750120870769024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518869161605835, | |
| "mean_token_accuracy": 0.7762151658535004, | |
| "num_tokens": 4175947.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5783872008323669, | |
| "epoch": 0.9607476635514018, | |
| "grad_norm": 0.03151006996631622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5734107494354248, | |
| "mean_token_accuracy": 0.7695904821157455, | |
| "num_tokens": 4192348.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5653168857097626, | |
| "epoch": 0.9644859813084112, | |
| "grad_norm": 0.03166348114609718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5732910633087158, | |
| "mean_token_accuracy": 0.7679464519023895, | |
| "num_tokens": 4208898.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5390284806489944, | |
| "epoch": 0.9682242990654205, | |
| "grad_norm": 0.026950784027576447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455009937286377, | |
| "mean_token_accuracy": 0.7775461375713348, | |
| "num_tokens": 4225149.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.565416008234024, | |
| "epoch": 0.9719626168224299, | |
| "grad_norm": 0.030768675729632378, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5689860582351685, | |
| "mean_token_accuracy": 0.7684348970651627, | |
| "num_tokens": 4241389.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5577588826417923, | |
| "epoch": 0.9757009345794393, | |
| "grad_norm": 0.02680326998233795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625928640365601, | |
| "mean_token_accuracy": 0.7695075571537018, | |
| "num_tokens": 4257979.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.55104960501194, | |
| "epoch": 0.9794392523364486, | |
| "grad_norm": 0.027646353468298912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484559535980225, | |
| "mean_token_accuracy": 0.7766857743263245, | |
| "num_tokens": 4274290.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5638265609741211, | |
| "epoch": 0.983177570093458, | |
| "grad_norm": 0.02871805429458618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5657901167869568, | |
| "mean_token_accuracy": 0.7715673297643661, | |
| "num_tokens": 4290725.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.547324076294899, | |
| "epoch": 0.9869158878504672, | |
| "grad_norm": 0.02937854453921318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55534827709198, | |
| "mean_token_accuracy": 0.7751762270927429, | |
| "num_tokens": 4307326.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5487106442451477, | |
| "epoch": 0.9906542056074766, | |
| "grad_norm": 0.02548016607761383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505661964416504, | |
| "mean_token_accuracy": 0.7752106785774231, | |
| "num_tokens": 4323823.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5634673833847046, | |
| "epoch": 0.994392523364486, | |
| "grad_norm": 0.026015356183052063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634418725967407, | |
| "mean_token_accuracy": 0.7709382921457291, | |
| "num_tokens": 4340138.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5507746189832687, | |
| "epoch": 0.9981308411214953, | |
| "grad_norm": 0.026798918843269348, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513297915458679, | |
| "mean_token_accuracy": 0.7769380956888199, | |
| "num_tokens": 4356482.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5597052276134491, | |
| "epoch": 1.0, | |
| "grad_norm": 0.0342809222638607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571821331977844, | |
| "mean_token_accuracy": 0.774641364812851, | |
| "num_tokens": 4364744.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.557921290397644, | |
| "epoch": 1.0037383177570094, | |
| "grad_norm": 0.029891351237893105, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539438128471375, | |
| "mean_token_accuracy": 0.7773818224668503, | |
| "num_tokens": 4380930.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5416439026594162, | |
| "epoch": 1.0074766355140188, | |
| "grad_norm": 0.02803446725010872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438423752784729, | |
| "mean_token_accuracy": 0.7798180431127548, | |
| "num_tokens": 4397244.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5285164415836334, | |
| "epoch": 1.011214953271028, | |
| "grad_norm": 0.03023347444832325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358922481536865, | |
| "mean_token_accuracy": 0.7807245850563049, | |
| "num_tokens": 4413671.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5514080822467804, | |
| "epoch": 1.0149532710280373, | |
| "grad_norm": 0.027458516880869865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552421510219574, | |
| "mean_token_accuracy": 0.7761755585670471, | |
| "num_tokens": 4430035.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5706226229667664, | |
| "epoch": 1.0186915887850467, | |
| "grad_norm": 0.030846886336803436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5667564272880554, | |
| "mean_token_accuracy": 0.7689130008220673, | |
| "num_tokens": 4446382.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5511225461959839, | |
| "epoch": 1.0224299065420561, | |
| "grad_norm": 0.029439929872751236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465920567512512, | |
| "mean_token_accuracy": 0.7808292508125305, | |
| "num_tokens": 4462677.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5416547358036041, | |
| "epoch": 1.0261682242990655, | |
| "grad_norm": 0.02822115644812584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419396758079529, | |
| "mean_token_accuracy": 0.7816834002733231, | |
| "num_tokens": 4479083.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5574266612529755, | |
| "epoch": 1.0299065420560747, | |
| "grad_norm": 0.0327095128595829, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565608739852905, | |
| "mean_token_accuracy": 0.7745349258184433, | |
| "num_tokens": 4495797.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5387104451656342, | |
| "epoch": 1.033644859813084, | |
| "grad_norm": 0.03164896368980408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406032800674438, | |
| "mean_token_accuracy": 0.7823146730661392, | |
| "num_tokens": 4512262.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5471370071172714, | |
| "epoch": 1.0373831775700935, | |
| "grad_norm": 0.03483380377292633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550093054771423, | |
| "mean_token_accuracy": 0.7783246338367462, | |
| "num_tokens": 4528616.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5368807017803192, | |
| "epoch": 1.0411214953271029, | |
| "grad_norm": 0.03120633400976658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417410731315613, | |
| "mean_token_accuracy": 0.7802102267742157, | |
| "num_tokens": 4544882.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5481929332017899, | |
| "epoch": 1.0448598130841122, | |
| "grad_norm": 0.029517389833927155, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472978353500366, | |
| "mean_token_accuracy": 0.7788140177726746, | |
| "num_tokens": 4561427.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5531918853521347, | |
| "epoch": 1.0485981308411214, | |
| "grad_norm": 0.03256995975971222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502868890762329, | |
| "mean_token_accuracy": 0.7784827798604965, | |
| "num_tokens": 4577723.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5540415197610855, | |
| "epoch": 1.0523364485981308, | |
| "grad_norm": 0.026578353717923164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555966854095459, | |
| "mean_token_accuracy": 0.775706946849823, | |
| "num_tokens": 4594128.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5517027229070663, | |
| "epoch": 1.0560747663551402, | |
| "grad_norm": 0.030103787779808044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502108931541443, | |
| "mean_token_accuracy": 0.7753856778144836, | |
| "num_tokens": 4610255.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.5304621160030365, | |
| "epoch": 1.0598130841121496, | |
| "grad_norm": 0.029368899762630463, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297666788101196, | |
| "mean_token_accuracy": 0.7840214222669601, | |
| "num_tokens": 4626599.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5305260270833969, | |
| "epoch": 1.063551401869159, | |
| "grad_norm": 0.029124870896339417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363407135009766, | |
| "mean_token_accuracy": 0.7847000658512115, | |
| "num_tokens": 4642927.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5300263911485672, | |
| "epoch": 1.0672897196261681, | |
| "grad_norm": 0.028800450265407562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52923583984375, | |
| "mean_token_accuracy": 0.7828178703784943, | |
| "num_tokens": 4659455.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5497115254402161, | |
| "epoch": 1.0710280373831775, | |
| "grad_norm": 0.03032800555229187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526697039604187, | |
| "mean_token_accuracy": 0.7718490660190582, | |
| "num_tokens": 4675747.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5266695320606232, | |
| "epoch": 1.074766355140187, | |
| "grad_norm": 0.02653171867132187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5255345702171326, | |
| "mean_token_accuracy": 0.7853638082742691, | |
| "num_tokens": 4691992.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5461495667695999, | |
| "epoch": 1.0785046728971963, | |
| "grad_norm": 0.025956284254789352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439239740371704, | |
| "mean_token_accuracy": 0.7808811217546463, | |
| "num_tokens": 4708487.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5421788841485977, | |
| "epoch": 1.0822429906542057, | |
| "grad_norm": 0.02735847234725952, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411931872367859, | |
| "mean_token_accuracy": 0.7771425247192383, | |
| "num_tokens": 4724824.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5556438118219376, | |
| "epoch": 1.0859813084112149, | |
| "grad_norm": 0.026816118508577347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484311580657959, | |
| "mean_token_accuracy": 0.7775956392288208, | |
| "num_tokens": 4741264.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5614602714776993, | |
| "epoch": 1.0897196261682243, | |
| "grad_norm": 0.03428835794329643, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5635286569595337, | |
| "mean_token_accuracy": 0.7734779864549637, | |
| "num_tokens": 4757630.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5510146170854568, | |
| "epoch": 1.0934579439252337, | |
| "grad_norm": 0.030845943838357925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562302470207214, | |
| "mean_token_accuracy": 0.773259237408638, | |
| "num_tokens": 4773723.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.5555125325918198, | |
| "epoch": 1.097196261682243, | |
| "grad_norm": 0.028586354106664658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588027834892273, | |
| "mean_token_accuracy": 0.7723042815923691, | |
| "num_tokens": 4790204.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.53548863530159, | |
| "epoch": 1.1009345794392524, | |
| "grad_norm": 0.032421719282865524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428792238235474, | |
| "mean_token_accuracy": 0.780792623758316, | |
| "num_tokens": 4806715.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5266362577676773, | |
| "epoch": 1.1046728971962616, | |
| "grad_norm": 0.044794633984565735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296044945716858, | |
| "mean_token_accuracy": 0.7850557416677475, | |
| "num_tokens": 4822693.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.547786682844162, | |
| "epoch": 1.108411214953271, | |
| "grad_norm": 0.03065192885696888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545957088470459, | |
| "mean_token_accuracy": 0.7773084342479706, | |
| "num_tokens": 4838834.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5526397377252579, | |
| "epoch": 1.1121495327102804, | |
| "grad_norm": 0.03121815249323845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505586862564087, | |
| "mean_token_accuracy": 0.7751570343971252, | |
| "num_tokens": 4854891.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.556088924407959, | |
| "epoch": 1.1158878504672898, | |
| "grad_norm": 0.03519770875573158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572479367256165, | |
| "mean_token_accuracy": 0.7747550010681152, | |
| "num_tokens": 4871140.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5376470685005188, | |
| "epoch": 1.1196261682242992, | |
| "grad_norm": 0.03193943575024605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455138087272644, | |
| "mean_token_accuracy": 0.7797031998634338, | |
| "num_tokens": 4887274.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5635453760623932, | |
| "epoch": 1.1233644859813083, | |
| "grad_norm": 0.041273750364780426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5696390867233276, | |
| "mean_token_accuracy": 0.76914082467556, | |
| "num_tokens": 4903573.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5702975988388062, | |
| "epoch": 1.1271028037383177, | |
| "grad_norm": 0.03010556660592556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622550845146179, | |
| "mean_token_accuracy": 0.7727158814668655, | |
| "num_tokens": 4919926.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.5415271073579788, | |
| "epoch": 1.1308411214953271, | |
| "grad_norm": 0.0310966819524765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458844900131226, | |
| "mean_token_accuracy": 0.776058241724968, | |
| "num_tokens": 4936123.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5403020679950714, | |
| "epoch": 1.1345794392523365, | |
| "grad_norm": 0.04535767808556557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387758612632751, | |
| "mean_token_accuracy": 0.7784536480903625, | |
| "num_tokens": 4952502.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5479062646627426, | |
| "epoch": 1.1383177570093457, | |
| "grad_norm": 0.028153905645012856, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478588938713074, | |
| "mean_token_accuracy": 0.7770532369613647, | |
| "num_tokens": 4968823.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.5423109382390976, | |
| "epoch": 1.142056074766355, | |
| "grad_norm": 0.03606940805912018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508921146392822, | |
| "mean_token_accuracy": 0.7769752442836761, | |
| "num_tokens": 4985183.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5484813451766968, | |
| "epoch": 1.1457943925233645, | |
| "grad_norm": 0.02960861474275589, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549089312553406, | |
| "mean_token_accuracy": 0.7753880023956299, | |
| "num_tokens": 5001335.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5498395711183548, | |
| "epoch": 1.1495327102803738, | |
| "grad_norm": 0.036366142332553864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471988916397095, | |
| "mean_token_accuracy": 0.7787120938301086, | |
| "num_tokens": 5017387.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5530393719673157, | |
| "epoch": 1.1532710280373832, | |
| "grad_norm": 0.029028775170445442, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492241978645325, | |
| "mean_token_accuracy": 0.7761663198471069, | |
| "num_tokens": 5033567.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5492727905511856, | |
| "epoch": 1.1570093457943926, | |
| "grad_norm": 0.03352445736527443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540640354156494, | |
| "mean_token_accuracy": 0.7749823033809662, | |
| "num_tokens": 5049801.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.5666168481111526, | |
| "epoch": 1.1607476635514018, | |
| "grad_norm": 0.035840339958667755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706231594085693, | |
| "mean_token_accuracy": 0.7669289708137512, | |
| "num_tokens": 5066204.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5425457805395126, | |
| "epoch": 1.1644859813084112, | |
| "grad_norm": 0.03181692957878113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458914041519165, | |
| "mean_token_accuracy": 0.7774879634380341, | |
| "num_tokens": 5082493.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.5557267963886261, | |
| "epoch": 1.1682242990654206, | |
| "grad_norm": 0.035230670124292374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475496053695679, | |
| "mean_token_accuracy": 0.7787989675998688, | |
| "num_tokens": 5098639.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5714587569236755, | |
| "epoch": 1.17196261682243, | |
| "grad_norm": 0.03392059728503227, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622156262397766, | |
| "mean_token_accuracy": 0.7719752937555313, | |
| "num_tokens": 5114831.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5439812690019608, | |
| "epoch": 1.1757009345794391, | |
| "grad_norm": 0.027537284418940544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427182912826538, | |
| "mean_token_accuracy": 0.7786365002393723, | |
| "num_tokens": 5131121.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5388712882995605, | |
| "epoch": 1.1794392523364485, | |
| "grad_norm": 0.03216094896197319, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446818470954895, | |
| "mean_token_accuracy": 0.7791234254837036, | |
| "num_tokens": 5147422.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.53206005692482, | |
| "epoch": 1.183177570093458, | |
| "grad_norm": 0.032054752111434937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439627170562744, | |
| "mean_token_accuracy": 0.7801449149847031, | |
| "num_tokens": 5163884.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5308776497840881, | |
| "epoch": 1.1869158878504673, | |
| "grad_norm": 0.032574739307165146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392112731933594, | |
| "mean_token_accuracy": 0.777498260140419, | |
| "num_tokens": 5180398.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5427455455064774, | |
| "epoch": 1.1906542056074767, | |
| "grad_norm": 0.03152874857187271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452929139137268, | |
| "mean_token_accuracy": 0.7787911593914032, | |
| "num_tokens": 5196640.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.570340633392334, | |
| "epoch": 1.194392523364486, | |
| "grad_norm": 0.03098403289914131, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688466429710388, | |
| "mean_token_accuracy": 0.7672817558050156, | |
| "num_tokens": 5212767.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5646504908800125, | |
| "epoch": 1.1981308411214953, | |
| "grad_norm": 0.032602474093437195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595831274986267, | |
| "mean_token_accuracy": 0.7738354504108429, | |
| "num_tokens": 5229143.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.541440024971962, | |
| "epoch": 1.2018691588785047, | |
| "grad_norm": 0.0346127450466156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328572988510132, | |
| "mean_token_accuracy": 0.7842471748590469, | |
| "num_tokens": 5245349.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5371421873569489, | |
| "epoch": 1.205607476635514, | |
| "grad_norm": 0.030524473637342453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316073894500732, | |
| "mean_token_accuracy": 0.7839267402887344, | |
| "num_tokens": 5261740.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5501479953527451, | |
| "epoch": 1.2093457943925234, | |
| "grad_norm": 0.04006117209792137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546258687973022, | |
| "mean_token_accuracy": 0.7740581333637238, | |
| "num_tokens": 5278402.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5427927225828171, | |
| "epoch": 1.2130841121495326, | |
| "grad_norm": 0.028997933492064476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546272873878479, | |
| "mean_token_accuracy": 0.77626071870327, | |
| "num_tokens": 5295096.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5374629199504852, | |
| "epoch": 1.216822429906542, | |
| "grad_norm": 0.031449392437934875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484204292297363, | |
| "mean_token_accuracy": 0.7783177495002747, | |
| "num_tokens": 5311451.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5593861639499664, | |
| "epoch": 1.2205607476635514, | |
| "grad_norm": 0.033892612904310226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527151823043823, | |
| "mean_token_accuracy": 0.7769543379545212, | |
| "num_tokens": 5327705.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5403755158185959, | |
| "epoch": 1.2242990654205608, | |
| "grad_norm": 0.029873648658394814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416997075080872, | |
| "mean_token_accuracy": 0.7783119082450867, | |
| "num_tokens": 5344110.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5473423600196838, | |
| "epoch": 1.2280373831775702, | |
| "grad_norm": 0.028266677632927895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524438619613647, | |
| "mean_token_accuracy": 0.7769231647253036, | |
| "num_tokens": 5360378.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5364970713853836, | |
| "epoch": 1.2317757009345796, | |
| "grad_norm": 0.03534099832177162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341481566429138, | |
| "mean_token_accuracy": 0.783685103058815, | |
| "num_tokens": 5376600.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5472245216369629, | |
| "epoch": 1.2355140186915887, | |
| "grad_norm": 0.030261849984526634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478684306144714, | |
| "mean_token_accuracy": 0.7797873020172119, | |
| "num_tokens": 5392761.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.545607790350914, | |
| "epoch": 1.2392523364485981, | |
| "grad_norm": 0.029436452314257622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546855628490448, | |
| "mean_token_accuracy": 0.7786357402801514, | |
| "num_tokens": 5409133.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5291889756917953, | |
| "epoch": 1.2429906542056075, | |
| "grad_norm": 0.03353505581617355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353861451148987, | |
| "mean_token_accuracy": 0.7811570167541504, | |
| "num_tokens": 5425384.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5578002631664276, | |
| "epoch": 1.246728971962617, | |
| "grad_norm": 0.03168244659900665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618013143539429, | |
| "mean_token_accuracy": 0.7705619186162949, | |
| "num_tokens": 5441708.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.555315688252449, | |
| "epoch": 1.250467289719626, | |
| "grad_norm": 0.03206615522503853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600447654724121, | |
| "mean_token_accuracy": 0.7714688628911972, | |
| "num_tokens": 5457884.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5601648688316345, | |
| "epoch": 1.2542056074766355, | |
| "grad_norm": 0.03804044798016548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550276637077332, | |
| "mean_token_accuracy": 0.7733457237482071, | |
| "num_tokens": 5474231.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.542451411485672, | |
| "epoch": 1.2579439252336448, | |
| "grad_norm": 0.029554393142461777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353547930717468, | |
| "mean_token_accuracy": 0.7827602028846741, | |
| "num_tokens": 5490557.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5396464318037033, | |
| "epoch": 1.2616822429906542, | |
| "grad_norm": 0.02930438332259655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352525115013123, | |
| "mean_token_accuracy": 0.782452329993248, | |
| "num_tokens": 5506827.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.551433265209198, | |
| "epoch": 1.2654205607476636, | |
| "grad_norm": 0.03803868591785431, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564743280410767, | |
| "mean_token_accuracy": 0.7742451429367065, | |
| "num_tokens": 5523197.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5405130237340927, | |
| "epoch": 1.269158878504673, | |
| "grad_norm": 0.03335575759410858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447483062744141, | |
| "mean_token_accuracy": 0.777386024594307, | |
| "num_tokens": 5539570.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5281671732664108, | |
| "epoch": 1.2728971962616822, | |
| "grad_norm": 0.03668655455112457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369662642478943, | |
| "mean_token_accuracy": 0.7818697243928909, | |
| "num_tokens": 5556018.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5445946455001831, | |
| "epoch": 1.2766355140186916, | |
| "grad_norm": 0.03418565168976784, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481922626495361, | |
| "mean_token_accuracy": 0.7817248553037643, | |
| "num_tokens": 5571921.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5692614763975143, | |
| "epoch": 1.280373831775701, | |
| "grad_norm": 0.032861191779375076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536470413208008, | |
| "mean_token_accuracy": 0.7768330574035645, | |
| "num_tokens": 5588242.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5534744560718536, | |
| "epoch": 1.2841121495327104, | |
| "grad_norm": 0.02994309738278389, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490615367889404, | |
| "mean_token_accuracy": 0.7776058167219162, | |
| "num_tokens": 5604646.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5477103441953659, | |
| "epoch": 1.2878504672897195, | |
| "grad_norm": 0.0329648032784462, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608856678009033, | |
| "mean_token_accuracy": 0.769044816493988, | |
| "num_tokens": 5620822.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5447603911161423, | |
| "epoch": 1.291588785046729, | |
| "grad_norm": 0.038630835711956024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517427921295166, | |
| "mean_token_accuracy": 0.776050254702568, | |
| "num_tokens": 5637254.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5543326735496521, | |
| "epoch": 1.2953271028037383, | |
| "grad_norm": 0.03234436735510826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605747103691101, | |
| "mean_token_accuracy": 0.7735925763845444, | |
| "num_tokens": 5653687.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5351574122905731, | |
| "epoch": 1.2990654205607477, | |
| "grad_norm": 0.03387833759188652, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403937697410583, | |
| "mean_token_accuracy": 0.7819892168045044, | |
| "num_tokens": 5670152.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5567533820867538, | |
| "epoch": 1.302803738317757, | |
| "grad_norm": 0.0311372522264719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512552261352539, | |
| "mean_token_accuracy": 0.7762364596128464, | |
| "num_tokens": 5686422.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5508190989494324, | |
| "epoch": 1.3065420560747665, | |
| "grad_norm": 0.027689168229699135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455954074859619, | |
| "mean_token_accuracy": 0.7787918448448181, | |
| "num_tokens": 5702832.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5493623167276382, | |
| "epoch": 1.3102803738317756, | |
| "grad_norm": 0.03188028931617737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508118867874146, | |
| "mean_token_accuracy": 0.7741293609142303, | |
| "num_tokens": 5719201.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5517994910478592, | |
| "epoch": 1.314018691588785, | |
| "grad_norm": 0.03255178779363632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581218004226685, | |
| "mean_token_accuracy": 0.7717841118574142, | |
| "num_tokens": 5735507.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5363009721040726, | |
| "epoch": 1.3177570093457944, | |
| "grad_norm": 0.0318707600235939, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422943234443665, | |
| "mean_token_accuracy": 0.7783725261688232, | |
| "num_tokens": 5751653.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5449318736791611, | |
| "epoch": 1.3214953271028038, | |
| "grad_norm": 0.028741504997015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539950966835022, | |
| "mean_token_accuracy": 0.7803268283605576, | |
| "num_tokens": 5768167.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5602855980396271, | |
| "epoch": 1.325233644859813, | |
| "grad_norm": 0.030420802533626556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554990291595459, | |
| "mean_token_accuracy": 0.7761643081903458, | |
| "num_tokens": 5784542.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.56887586414814, | |
| "epoch": 1.3289719626168224, | |
| "grad_norm": 0.03126989305019379, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5672231912612915, | |
| "mean_token_accuracy": 0.7678193151950836, | |
| "num_tokens": 5801095.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5738541036844254, | |
| "epoch": 1.3327102803738318, | |
| "grad_norm": 0.03625823184847832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5728395581245422, | |
| "mean_token_accuracy": 0.7666806429624557, | |
| "num_tokens": 5817738.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.5436241179704666, | |
| "epoch": 1.3364485981308412, | |
| "grad_norm": 0.03443320468068123, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367251634597778, | |
| "mean_token_accuracy": 0.7828597128391266, | |
| "num_tokens": 5834159.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5450441539287567, | |
| "epoch": 1.3401869158878505, | |
| "grad_norm": 0.02960045635700226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478132963180542, | |
| "mean_token_accuracy": 0.7773353010416031, | |
| "num_tokens": 5850353.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.559371218085289, | |
| "epoch": 1.34392523364486, | |
| "grad_norm": 0.043439071625471115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704307556152344, | |
| "mean_token_accuracy": 0.7674223929643631, | |
| "num_tokens": 5866661.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5383078157901764, | |
| "epoch": 1.347663551401869, | |
| "grad_norm": 0.031151141971349716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475639700889587, | |
| "mean_token_accuracy": 0.7764850705862045, | |
| "num_tokens": 5883147.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5361460000276566, | |
| "epoch": 1.3514018691588785, | |
| "grad_norm": 0.0367986336350441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413030385971069, | |
| "mean_token_accuracy": 0.7792898863554001, | |
| "num_tokens": 5899337.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5393686443567276, | |
| "epoch": 1.355140186915888, | |
| "grad_norm": 0.032062407582998276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485578775405884, | |
| "mean_token_accuracy": 0.7746371626853943, | |
| "num_tokens": 5915592.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5442528575658798, | |
| "epoch": 1.358878504672897, | |
| "grad_norm": 0.030468052253127098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427553653717041, | |
| "mean_token_accuracy": 0.7785662263631821, | |
| "num_tokens": 5931951.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5824908316135406, | |
| "epoch": 1.3626168224299064, | |
| "grad_norm": 0.037210624665021896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5697020292282104, | |
| "mean_token_accuracy": 0.7692236304283142, | |
| "num_tokens": 5948490.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5620522499084473, | |
| "epoch": 1.3663551401869158, | |
| "grad_norm": 0.0335218720138073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542594194412231, | |
| "mean_token_accuracy": 0.7753977477550507, | |
| "num_tokens": 5964660.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5603572577238083, | |
| "epoch": 1.3700934579439252, | |
| "grad_norm": 0.031322672963142395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575450658798218, | |
| "mean_token_accuracy": 0.7735055536031723, | |
| "num_tokens": 5981101.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5505388826131821, | |
| "epoch": 1.3738317757009346, | |
| "grad_norm": 0.030650589615106583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557997822761536, | |
| "mean_token_accuracy": 0.7740475237369537, | |
| "num_tokens": 5997642.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5392187088727951, | |
| "epoch": 1.377570093457944, | |
| "grad_norm": 0.030460603535175323, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474120378494263, | |
| "mean_token_accuracy": 0.7756936997175217, | |
| "num_tokens": 6013826.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5465079843997955, | |
| "epoch": 1.3813084112149534, | |
| "grad_norm": 0.03873775899410248, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496590733528137, | |
| "mean_token_accuracy": 0.7778041809797287, | |
| "num_tokens": 6030111.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5502425879240036, | |
| "epoch": 1.3850467289719626, | |
| "grad_norm": 0.027835069224238396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515455007553101, | |
| "mean_token_accuracy": 0.7742271274328232, | |
| "num_tokens": 6046613.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5496622025966644, | |
| "epoch": 1.388785046728972, | |
| "grad_norm": 0.02913137525320053, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523219108581543, | |
| "mean_token_accuracy": 0.7767279595136642, | |
| "num_tokens": 6062935.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5480591654777527, | |
| "epoch": 1.3925233644859814, | |
| "grad_norm": 0.028895994648337364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464932918548584, | |
| "mean_token_accuracy": 0.7779257446527481, | |
| "num_tokens": 6079276.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5592564791440964, | |
| "epoch": 1.3962616822429905, | |
| "grad_norm": 0.030813386663794518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641001462936401, | |
| "mean_token_accuracy": 0.7706102132797241, | |
| "num_tokens": 6095477.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5482244938611984, | |
| "epoch": 1.4, | |
| "grad_norm": 0.034681808203458786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535820722579956, | |
| "mean_token_accuracy": 0.7740350067615509, | |
| "num_tokens": 6111503.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5437954962253571, | |
| "epoch": 1.4037383177570093, | |
| "grad_norm": 0.029899772256612778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384761691093445, | |
| "mean_token_accuracy": 0.7813697308301926, | |
| "num_tokens": 6127666.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5516242235898972, | |
| "epoch": 1.4074766355140187, | |
| "grad_norm": 0.03098697029054165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510317087173462, | |
| "mean_token_accuracy": 0.7748206406831741, | |
| "num_tokens": 6143974.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5456867665052414, | |
| "epoch": 1.411214953271028, | |
| "grad_norm": 0.03481059893965721, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417442917823792, | |
| "mean_token_accuracy": 0.7805673629045486, | |
| "num_tokens": 6160284.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5566543191671371, | |
| "epoch": 1.4149532710280375, | |
| "grad_norm": 0.03302835300564766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596388578414917, | |
| "mean_token_accuracy": 0.7757162600755692, | |
| "num_tokens": 6176900.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5518665462732315, | |
| "epoch": 1.4186915887850469, | |
| "grad_norm": 0.042512837797403336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554313600063324, | |
| "mean_token_accuracy": 0.7725758254528046, | |
| "num_tokens": 6193295.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5387768298387527, | |
| "epoch": 1.422429906542056, | |
| "grad_norm": 0.031335704028606415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456656813621521, | |
| "mean_token_accuracy": 0.7767685800790787, | |
| "num_tokens": 6209473.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.552179217338562, | |
| "epoch": 1.4261682242990654, | |
| "grad_norm": 0.03560006618499756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536052584648132, | |
| "mean_token_accuracy": 0.7741381675004959, | |
| "num_tokens": 6225795.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5529111623764038, | |
| "epoch": 1.4299065420560748, | |
| "grad_norm": 0.03298206627368927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456759929656982, | |
| "mean_token_accuracy": 0.7785012274980545, | |
| "num_tokens": 6241738.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5528014451265335, | |
| "epoch": 1.433644859813084, | |
| "grad_norm": 0.02689899317920208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489047765731812, | |
| "mean_token_accuracy": 0.7755105197429657, | |
| "num_tokens": 6258266.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5488691926002502, | |
| "epoch": 1.4373831775700934, | |
| "grad_norm": 0.03345772624015808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473658442497253, | |
| "mean_token_accuracy": 0.776367112994194, | |
| "num_tokens": 6274629.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5326814502477646, | |
| "epoch": 1.4411214953271028, | |
| "grad_norm": 0.0327431820333004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437192916870117, | |
| "mean_token_accuracy": 0.7790791392326355, | |
| "num_tokens": 6290843.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5463947802782059, | |
| "epoch": 1.4448598130841122, | |
| "grad_norm": 0.029317917302250862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482510924339294, | |
| "mean_token_accuracy": 0.7787915766239166, | |
| "num_tokens": 6307390.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5279744416475296, | |
| "epoch": 1.4485981308411215, | |
| "grad_norm": 0.032164428383111954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396722555160522, | |
| "mean_token_accuracy": 0.7793098241090775, | |
| "num_tokens": 6323780.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.5401588678359985, | |
| "epoch": 1.452336448598131, | |
| "grad_norm": 0.029884206131100655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457247495651245, | |
| "mean_token_accuracy": 0.7772396057844162, | |
| "num_tokens": 6340075.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.5614192336797714, | |
| "epoch": 1.45607476635514, | |
| "grad_norm": 0.031751908361911774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567028522491455, | |
| "mean_token_accuracy": 0.7716124802827835, | |
| "num_tokens": 6356186.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5345210433006287, | |
| "epoch": 1.4598130841121495, | |
| "grad_norm": 0.030872350558638573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334336757659912, | |
| "mean_token_accuracy": 0.7826623171567917, | |
| "num_tokens": 6372159.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5622972398996353, | |
| "epoch": 1.4635514018691589, | |
| "grad_norm": 0.0314875952899456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557999610900879, | |
| "mean_token_accuracy": 0.7731751799583435, | |
| "num_tokens": 6388490.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5456393212080002, | |
| "epoch": 1.4672897196261683, | |
| "grad_norm": 0.030306922271847725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478385090827942, | |
| "mean_token_accuracy": 0.7785396575927734, | |
| "num_tokens": 6404875.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.553615927696228, | |
| "epoch": 1.4710280373831774, | |
| "grad_norm": 0.03159041702747345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525414347648621, | |
| "mean_token_accuracy": 0.7762843668460846, | |
| "num_tokens": 6421373.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.54654960334301, | |
| "epoch": 1.4747663551401868, | |
| "grad_norm": 0.041343770921230316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578322410583496, | |
| "mean_token_accuracy": 0.7733658850193024, | |
| "num_tokens": 6437609.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.531049445271492, | |
| "epoch": 1.4785046728971962, | |
| "grad_norm": 0.029535705223679543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336673855781555, | |
| "mean_token_accuracy": 0.7787897735834122, | |
| "num_tokens": 6453830.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5598567724227905, | |
| "epoch": 1.4822429906542056, | |
| "grad_norm": 0.030157895758748055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558460533618927, | |
| "mean_token_accuracy": 0.7739997208118439, | |
| "num_tokens": 6469831.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5455051362514496, | |
| "epoch": 1.485981308411215, | |
| "grad_norm": 0.02824362926185131, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309131145477295, | |
| "mean_token_accuracy": 0.7840657532215118, | |
| "num_tokens": 6485983.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5548417568206787, | |
| "epoch": 1.4897196261682244, | |
| "grad_norm": 0.028244182467460632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448263883590698, | |
| "mean_token_accuracy": 0.7788312286138535, | |
| "num_tokens": 6502375.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.5614428222179413, | |
| "epoch": 1.4934579439252336, | |
| "grad_norm": 0.029092902317643166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640357732772827, | |
| "mean_token_accuracy": 0.7694920003414154, | |
| "num_tokens": 6518515.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5202381461858749, | |
| "epoch": 1.497196261682243, | |
| "grad_norm": 0.0347515270113945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334154963493347, | |
| "mean_token_accuracy": 0.7812663912773132, | |
| "num_tokens": 6534874.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5337788164615631, | |
| "epoch": 1.5009345794392523, | |
| "grad_norm": 0.036383189260959625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497745871543884, | |
| "mean_token_accuracy": 0.778416782617569, | |
| "num_tokens": 6551531.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5441624820232391, | |
| "epoch": 1.5046728971962615, | |
| "grad_norm": 0.029430663213133812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452989935874939, | |
| "mean_token_accuracy": 0.7810618728399277, | |
| "num_tokens": 6568009.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5418661385774612, | |
| "epoch": 1.508411214953271, | |
| "grad_norm": 0.030562201514840126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342137813568115, | |
| "mean_token_accuracy": 0.7829063683748245, | |
| "num_tokens": 6584207.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5485459864139557, | |
| "epoch": 1.5121495327102803, | |
| "grad_norm": 0.03423624485731125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410490036010742, | |
| "mean_token_accuracy": 0.7787354588508606, | |
| "num_tokens": 6600370.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5426456183195114, | |
| "epoch": 1.5158878504672897, | |
| "grad_norm": 0.02885623089969158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436002612113953, | |
| "mean_token_accuracy": 0.7796245515346527, | |
| "num_tokens": 6616756.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5356003642082214, | |
| "epoch": 1.519626168224299, | |
| "grad_norm": 0.03115919418632984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386699438095093, | |
| "mean_token_accuracy": 0.7803057432174683, | |
| "num_tokens": 6632844.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5387707352638245, | |
| "epoch": 1.5233644859813085, | |
| "grad_norm": 0.039791349321603775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529868006706238, | |
| "mean_token_accuracy": 0.7759213447570801, | |
| "num_tokens": 6649378.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5559847801923752, | |
| "epoch": 1.5271028037383179, | |
| "grad_norm": 0.02880096808075905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526622533798218, | |
| "mean_token_accuracy": 0.7757584452629089, | |
| "num_tokens": 6665680.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5568434447050095, | |
| "epoch": 1.5308411214953273, | |
| "grad_norm": 0.03131592646241188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511536002159119, | |
| "mean_token_accuracy": 0.7751762717962265, | |
| "num_tokens": 6682037.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5535785406827927, | |
| "epoch": 1.5345794392523364, | |
| "grad_norm": 0.027654770761728287, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505651831626892, | |
| "mean_token_accuracy": 0.7777209877967834, | |
| "num_tokens": 6698293.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5670723766088486, | |
| "epoch": 1.5383177570093458, | |
| "grad_norm": 0.028583014383912086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562312662601471, | |
| "mean_token_accuracy": 0.7695807963609695, | |
| "num_tokens": 6714701.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5622154772281647, | |
| "epoch": 1.542056074766355, | |
| "grad_norm": 0.02976270206272602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625367164611816, | |
| "mean_token_accuracy": 0.7716499269008636, | |
| "num_tokens": 6731185.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5430750995874405, | |
| "epoch": 1.5457943925233644, | |
| "grad_norm": 0.033997952938079834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533574819564819, | |
| "mean_token_accuracy": 0.7739907056093216, | |
| "num_tokens": 6747611.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5383965522050858, | |
| "epoch": 1.5495327102803738, | |
| "grad_norm": 0.030417680740356445, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392584204673767, | |
| "mean_token_accuracy": 0.781003326177597, | |
| "num_tokens": 6764041.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5423173159360886, | |
| "epoch": 1.5532710280373832, | |
| "grad_norm": 0.03076282888650894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466949343681335, | |
| "mean_token_accuracy": 0.7772891670465469, | |
| "num_tokens": 6780355.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5329848676919937, | |
| "epoch": 1.5570093457943925, | |
| "grad_norm": 0.031416404992341995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372002720832825, | |
| "mean_token_accuracy": 0.7831790894269943, | |
| "num_tokens": 6796818.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5694616734981537, | |
| "epoch": 1.560747663551402, | |
| "grad_norm": 0.03140864148736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5736896395683289, | |
| "mean_token_accuracy": 0.7680276483297348, | |
| "num_tokens": 6813313.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5422861874103546, | |
| "epoch": 1.5644859813084113, | |
| "grad_norm": 0.029503118246793747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412414073944092, | |
| "mean_token_accuracy": 0.7787739634513855, | |
| "num_tokens": 6829806.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5583456158638, | |
| "epoch": 1.5682242990654207, | |
| "grad_norm": 0.02907589264214039, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538471937179565, | |
| "mean_token_accuracy": 0.7733865231275558, | |
| "num_tokens": 6846001.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.541300505399704, | |
| "epoch": 1.5719626168224299, | |
| "grad_norm": 0.030364159494638443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440077781677246, | |
| "mean_token_accuracy": 0.7778935730457306, | |
| "num_tokens": 6862199.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5432893335819244, | |
| "epoch": 1.5757009345794393, | |
| "grad_norm": 0.030575595796108246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458940267562866, | |
| "mean_token_accuracy": 0.7759649753570557, | |
| "num_tokens": 6878579.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5597539693117142, | |
| "epoch": 1.5794392523364484, | |
| "grad_norm": 0.03023570403456688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611036419868469, | |
| "mean_token_accuracy": 0.771359458565712, | |
| "num_tokens": 6895118.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5647385269403458, | |
| "epoch": 1.5831775700934578, | |
| "grad_norm": 0.03682006523013115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706467032432556, | |
| "mean_token_accuracy": 0.7648251056671143, | |
| "num_tokens": 6911258.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5421442538499832, | |
| "epoch": 1.5869158878504672, | |
| "grad_norm": 0.02758963778614998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540165364742279, | |
| "mean_token_accuracy": 0.7803500890731812, | |
| "num_tokens": 6927685.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.529248058795929, | |
| "epoch": 1.5906542056074766, | |
| "grad_norm": 0.03153234347701073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238373875617981, | |
| "mean_token_accuracy": 0.7865803390741348, | |
| "num_tokens": 6944032.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.575338825583458, | |
| "epoch": 1.594392523364486, | |
| "grad_norm": 0.038368549197912216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5686851739883423, | |
| "mean_token_accuracy": 0.7687085419893265, | |
| "num_tokens": 6960292.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5576592534780502, | |
| "epoch": 1.5981308411214954, | |
| "grad_norm": 0.028228625655174255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487405061721802, | |
| "mean_token_accuracy": 0.7753542214632034, | |
| "num_tokens": 6976714.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5344701558351517, | |
| "epoch": 1.6018691588785048, | |
| "grad_norm": 0.04058045893907547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446043014526367, | |
| "mean_token_accuracy": 0.7796988487243652, | |
| "num_tokens": 6993050.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5357878655195236, | |
| "epoch": 1.6056074766355142, | |
| "grad_norm": 0.03584378957748413, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503512620925903, | |
| "mean_token_accuracy": 0.7766520529985428, | |
| "num_tokens": 7009209.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5416888147592545, | |
| "epoch": 1.6093457943925233, | |
| "grad_norm": 0.035834796726703644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537422895431519, | |
| "mean_token_accuracy": 0.7721364051103592, | |
| "num_tokens": 7025449.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5495986640453339, | |
| "epoch": 1.6130841121495327, | |
| "grad_norm": 0.032027650624513626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545753836631775, | |
| "mean_token_accuracy": 0.7711912095546722, | |
| "num_tokens": 7041746.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.545868456363678, | |
| "epoch": 1.616822429906542, | |
| "grad_norm": 0.03172159940004349, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401636958122253, | |
| "mean_token_accuracy": 0.7796500027179718, | |
| "num_tokens": 7057795.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5575663447380066, | |
| "epoch": 1.6205607476635513, | |
| "grad_norm": 0.033373311161994934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508802533149719, | |
| "mean_token_accuracy": 0.776265561580658, | |
| "num_tokens": 7074106.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.552743598818779, | |
| "epoch": 1.6242990654205607, | |
| "grad_norm": 0.028903203085064888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493654012680054, | |
| "mean_token_accuracy": 0.7769621759653091, | |
| "num_tokens": 7090537.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5319768935441971, | |
| "epoch": 1.62803738317757, | |
| "grad_norm": 0.034539636224508286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467936396598816, | |
| "mean_token_accuracy": 0.7773739099502563, | |
| "num_tokens": 7106864.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5451867878437042, | |
| "epoch": 1.6317757009345795, | |
| "grad_norm": 0.03423994407057762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547507405281067, | |
| "mean_token_accuracy": 0.7716930210590363, | |
| "num_tokens": 7123027.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5614334046840668, | |
| "epoch": 1.6355140186915889, | |
| "grad_norm": 0.030570637434720993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614769458770752, | |
| "mean_token_accuracy": 0.772892951965332, | |
| "num_tokens": 7139089.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5780467242002487, | |
| "epoch": 1.6392523364485982, | |
| "grad_norm": 0.028702719137072563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5703617334365845, | |
| "mean_token_accuracy": 0.7703514397144318, | |
| "num_tokens": 7155613.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5620117634534836, | |
| "epoch": 1.6429906542056076, | |
| "grad_norm": 0.032911110669374466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519667863845825, | |
| "mean_token_accuracy": 0.776491329073906, | |
| "num_tokens": 7171940.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5613545030355453, | |
| "epoch": 1.6467289719626168, | |
| "grad_norm": 0.02767273783683777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548912286758423, | |
| "mean_token_accuracy": 0.7774568051099777, | |
| "num_tokens": 7188459.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5349740386009216, | |
| "epoch": 1.6504672897196262, | |
| "grad_norm": 0.03398311510682106, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359267592430115, | |
| "mean_token_accuracy": 0.7792400866746902, | |
| "num_tokens": 7204742.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5435358434915543, | |
| "epoch": 1.6542056074766354, | |
| "grad_norm": 0.03121669590473175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480291247367859, | |
| "mean_token_accuracy": 0.7757425308227539, | |
| "num_tokens": 7220970.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5408525168895721, | |
| "epoch": 1.6579439252336448, | |
| "grad_norm": 0.03187638521194458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458962321281433, | |
| "mean_token_accuracy": 0.7777377218008041, | |
| "num_tokens": 7237303.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5296604186296463, | |
| "epoch": 1.6616822429906541, | |
| "grad_norm": 0.033922888338565826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350003242492676, | |
| "mean_token_accuracy": 0.7817184776067734, | |
| "num_tokens": 7253313.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5386542528867722, | |
| "epoch": 1.6654205607476635, | |
| "grad_norm": 0.03487584367394447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504403710365295, | |
| "mean_token_accuracy": 0.7764954715967178, | |
| "num_tokens": 7269689.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5447485446929932, | |
| "epoch": 1.669158878504673, | |
| "grad_norm": 0.028691545128822327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440992712974548, | |
| "mean_token_accuracy": 0.7813538759946823, | |
| "num_tokens": 7286072.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5479656606912613, | |
| "epoch": 1.6728971962616823, | |
| "grad_norm": 0.02881709486246109, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415880084037781, | |
| "mean_token_accuracy": 0.7795199900865555, | |
| "num_tokens": 7302255.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5570111870765686, | |
| "epoch": 1.6766355140186917, | |
| "grad_norm": 0.028915997594594955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533952713012695, | |
| "mean_token_accuracy": 0.7753083109855652, | |
| "num_tokens": 7318517.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5548125952482224, | |
| "epoch": 1.680373831775701, | |
| "grad_norm": 0.029765961691737175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539486408233643, | |
| "mean_token_accuracy": 0.7759220153093338, | |
| "num_tokens": 7334708.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5474168807268143, | |
| "epoch": 1.6841121495327103, | |
| "grad_norm": 0.028495540842413902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542155921459198, | |
| "mean_token_accuracy": 0.7810131311416626, | |
| "num_tokens": 7351081.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5660932809114456, | |
| "epoch": 1.6878504672897197, | |
| "grad_norm": 0.029109494760632515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608826279640198, | |
| "mean_token_accuracy": 0.7715775072574615, | |
| "num_tokens": 7367731.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5341303050518036, | |
| "epoch": 1.6915887850467288, | |
| "grad_norm": 0.0320415273308754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458233952522278, | |
| "mean_token_accuracy": 0.7763672173023224, | |
| "num_tokens": 7383855.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5321396738290787, | |
| "epoch": 1.6953271028037382, | |
| "grad_norm": 0.02727021649479866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336453318595886, | |
| "mean_token_accuracy": 0.7841753661632538, | |
| "num_tokens": 7400413.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5274764150381088, | |
| "epoch": 1.6990654205607476, | |
| "grad_norm": 0.03324299305677414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358706116676331, | |
| "mean_token_accuracy": 0.7782862633466721, | |
| "num_tokens": 7416652.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5659113973379135, | |
| "epoch": 1.702803738317757, | |
| "grad_norm": 0.02792423591017723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5652596354484558, | |
| "mean_token_accuracy": 0.7699151486158371, | |
| "num_tokens": 7433182.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5379252284765244, | |
| "epoch": 1.7065420560747664, | |
| "grad_norm": 0.029364224523305893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403070449829102, | |
| "mean_token_accuracy": 0.780923143029213, | |
| "num_tokens": 7449489.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5333061218261719, | |
| "epoch": 1.7102803738317758, | |
| "grad_norm": 0.03605153039097786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397148728370667, | |
| "mean_token_accuracy": 0.7807264924049377, | |
| "num_tokens": 7465639.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5705498605966568, | |
| "epoch": 1.7140186915887852, | |
| "grad_norm": 0.03089967370033264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634230375289917, | |
| "mean_token_accuracy": 0.770861804485321, | |
| "num_tokens": 7482026.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5468743443489075, | |
| "epoch": 1.7177570093457943, | |
| "grad_norm": 0.030453559011220932, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545179545879364, | |
| "mean_token_accuracy": 0.7774305045604706, | |
| "num_tokens": 7498135.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5617033839225769, | |
| "epoch": 1.7214953271028037, | |
| "grad_norm": 0.03324849158525467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638455748558044, | |
| "mean_token_accuracy": 0.7687248736619949, | |
| "num_tokens": 7514525.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5581229031085968, | |
| "epoch": 1.7252336448598131, | |
| "grad_norm": 0.03176411613821983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653245449066162, | |
| "mean_token_accuracy": 0.7685625553131104, | |
| "num_tokens": 7530775.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5476332157850266, | |
| "epoch": 1.7289719626168223, | |
| "grad_norm": 0.02840348146855831, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459728240966797, | |
| "mean_token_accuracy": 0.7803480625152588, | |
| "num_tokens": 7547133.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5295307040214539, | |
| "epoch": 1.7327102803738317, | |
| "grad_norm": 0.03073256090283394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271958708763123, | |
| "mean_token_accuracy": 0.7856812626123428, | |
| "num_tokens": 7563202.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5600748807191849, | |
| "epoch": 1.736448598130841, | |
| "grad_norm": 0.02645997144281864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613283514976501, | |
| "mean_token_accuracy": 0.7728501409292221, | |
| "num_tokens": 7579316.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5520564913749695, | |
| "epoch": 1.7401869158878505, | |
| "grad_norm": 0.03572427108883858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537987947463989, | |
| "mean_token_accuracy": 0.7724860310554504, | |
| "num_tokens": 7595641.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5529971420764923, | |
| "epoch": 1.7439252336448599, | |
| "grad_norm": 0.03125125169754028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582661628723145, | |
| "mean_token_accuracy": 0.7737809270620346, | |
| "num_tokens": 7611643.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.5647894889116287, | |
| "epoch": 1.7476635514018692, | |
| "grad_norm": 0.029365174472332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628995895385742, | |
| "mean_token_accuracy": 0.770697221159935, | |
| "num_tokens": 7628011.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.554974377155304, | |
| "epoch": 1.7514018691588786, | |
| "grad_norm": 0.03162689507007599, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540342330932617, | |
| "mean_token_accuracy": 0.7753277122974396, | |
| "num_tokens": 7644033.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5500662177801132, | |
| "epoch": 1.7551401869158878, | |
| "grad_norm": 0.03005298413336277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444310307502747, | |
| "mean_token_accuracy": 0.7801364362239838, | |
| "num_tokens": 7660280.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5447323620319366, | |
| "epoch": 1.7588785046728972, | |
| "grad_norm": 0.03137346729636192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573670864105225, | |
| "mean_token_accuracy": 0.7713485956192017, | |
| "num_tokens": 7676463.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5369779318571091, | |
| "epoch": 1.7626168224299066, | |
| "grad_norm": 0.03314938396215439, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444561839103699, | |
| "mean_token_accuracy": 0.7770639657974243, | |
| "num_tokens": 7692602.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5475834012031555, | |
| "epoch": 1.7663551401869158, | |
| "grad_norm": 0.02887626923620701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548475980758667, | |
| "mean_token_accuracy": 0.7783610373735428, | |
| "num_tokens": 7708846.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.5512323975563049, | |
| "epoch": 1.7700934579439251, | |
| "grad_norm": 0.029940130189061165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473303198814392, | |
| "mean_token_accuracy": 0.7762128710746765, | |
| "num_tokens": 7725069.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.553005576133728, | |
| "epoch": 1.7738317757009345, | |
| "grad_norm": 0.030464377254247665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503718852996826, | |
| "mean_token_accuracy": 0.774563655257225, | |
| "num_tokens": 7741245.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5530129075050354, | |
| "epoch": 1.777570093457944, | |
| "grad_norm": 0.03166594356298447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523677468299866, | |
| "mean_token_accuracy": 0.7772203087806702, | |
| "num_tokens": 7757438.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5589546114206314, | |
| "epoch": 1.7813084112149533, | |
| "grad_norm": 0.031029848381876945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562568724155426, | |
| "mean_token_accuracy": 0.7697692364454269, | |
| "num_tokens": 7773613.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5485216081142426, | |
| "epoch": 1.7850467289719627, | |
| "grad_norm": 0.03148766979575157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566563010215759, | |
| "mean_token_accuracy": 0.7735153138637543, | |
| "num_tokens": 7790250.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5454483330249786, | |
| "epoch": 1.788785046728972, | |
| "grad_norm": 0.02934390679001808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470514297485352, | |
| "mean_token_accuracy": 0.777851864695549, | |
| "num_tokens": 7806794.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5577091723680496, | |
| "epoch": 1.7925233644859813, | |
| "grad_norm": 0.032060954719781876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573920011520386, | |
| "mean_token_accuracy": 0.7715256214141846, | |
| "num_tokens": 7823378.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5442305952310562, | |
| "epoch": 1.7962616822429907, | |
| "grad_norm": 0.027305442839860916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404268503189087, | |
| "mean_token_accuracy": 0.7780007869005203, | |
| "num_tokens": 7839749.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5555779784917831, | |
| "epoch": 1.8, | |
| "grad_norm": 0.03287232294678688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462092161178589, | |
| "mean_token_accuracy": 0.7763689607381821, | |
| "num_tokens": 7855947.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.5372089967131615, | |
| "epoch": 1.8037383177570092, | |
| "grad_norm": 0.031652286648750305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363561511039734, | |
| "mean_token_accuracy": 0.7853012979030609, | |
| "num_tokens": 7872142.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5340928807854652, | |
| "epoch": 1.8074766355140186, | |
| "grad_norm": 0.031619228422641754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403937697410583, | |
| "mean_token_accuracy": 0.7826676219701767, | |
| "num_tokens": 7888470.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5592721551656723, | |
| "epoch": 1.811214953271028, | |
| "grad_norm": 0.03946106135845184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5722806453704834, | |
| "mean_token_accuracy": 0.7665584683418274, | |
| "num_tokens": 7904942.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5392829775810242, | |
| "epoch": 1.8149532710280374, | |
| "grad_norm": 0.04261912405490875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484760999679565, | |
| "mean_token_accuracy": 0.7759799510240555, | |
| "num_tokens": 7921095.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5537964701652527, | |
| "epoch": 1.8186915887850468, | |
| "grad_norm": 0.029489269480109215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515441298484802, | |
| "mean_token_accuracy": 0.7770739942789078, | |
| "num_tokens": 7937493.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5820317566394806, | |
| "epoch": 1.8224299065420562, | |
| "grad_norm": 0.032789647579193115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5696999430656433, | |
| "mean_token_accuracy": 0.766129344701767, | |
| "num_tokens": 7953872.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.5591157823801041, | |
| "epoch": 1.8261682242990656, | |
| "grad_norm": 0.03274792060256004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492164492607117, | |
| "mean_token_accuracy": 0.7776104360818863, | |
| "num_tokens": 7970399.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5613900125026703, | |
| "epoch": 1.8299065420560747, | |
| "grad_norm": 0.03268195316195488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613545179367065, | |
| "mean_token_accuracy": 0.7726269513368607, | |
| "num_tokens": 7986663.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.540773555636406, | |
| "epoch": 1.8336448598130841, | |
| "grad_norm": 0.031849462538957596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427927374839783, | |
| "mean_token_accuracy": 0.7795483022928238, | |
| "num_tokens": 8002949.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5281448513269424, | |
| "epoch": 1.8373831775700935, | |
| "grad_norm": 0.037760283797979355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398802161216736, | |
| "mean_token_accuracy": 0.7793932110071182, | |
| "num_tokens": 8018924.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5640152990818024, | |
| "epoch": 1.8411214953271027, | |
| "grad_norm": 0.03318220004439354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651699900627136, | |
| "mean_token_accuracy": 0.7711258381605148, | |
| "num_tokens": 8035544.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5498005002737045, | |
| "epoch": 1.844859813084112, | |
| "grad_norm": 0.0300876684486866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483426451683044, | |
| "mean_token_accuracy": 0.777212604880333, | |
| "num_tokens": 8051604.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5553054213523865, | |
| "epoch": 1.8485981308411215, | |
| "grad_norm": 0.03142329677939415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571571588516235, | |
| "mean_token_accuracy": 0.7740218490362167, | |
| "num_tokens": 8067812.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5580199360847473, | |
| "epoch": 1.8523364485981308, | |
| "grad_norm": 0.03293558582663536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5583306550979614, | |
| "mean_token_accuracy": 0.7746147364377975, | |
| "num_tokens": 8083966.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5503615736961365, | |
| "epoch": 1.8560747663551402, | |
| "grad_norm": 0.031184855848550797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509845614433289, | |
| "mean_token_accuracy": 0.7762554883956909, | |
| "num_tokens": 8100276.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5609902739524841, | |
| "epoch": 1.8598130841121496, | |
| "grad_norm": 0.03478863090276718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611089468002319, | |
| "mean_token_accuracy": 0.7710845172405243, | |
| "num_tokens": 8116579.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.5358163863420486, | |
| "epoch": 1.863551401869159, | |
| "grad_norm": 0.03343072161078453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352976322174072, | |
| "mean_token_accuracy": 0.7815191894769669, | |
| "num_tokens": 8132938.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5323279201984406, | |
| "epoch": 1.8672897196261682, | |
| "grad_norm": 0.030239535495638847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383006930351257, | |
| "mean_token_accuracy": 0.7808633744716644, | |
| "num_tokens": 8149182.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.557625338435173, | |
| "epoch": 1.8710280373831776, | |
| "grad_norm": 0.031314413994550705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607120990753174, | |
| "mean_token_accuracy": 0.7726259678602219, | |
| "num_tokens": 8165713.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5501556247472763, | |
| "epoch": 1.874766355140187, | |
| "grad_norm": 0.029330939054489136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527728796005249, | |
| "mean_token_accuracy": 0.7722220122814178, | |
| "num_tokens": 8182157.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5571380257606506, | |
| "epoch": 1.8785046728971961, | |
| "grad_norm": 0.027965383604168892, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537632703781128, | |
| "mean_token_accuracy": 0.7755916863679886, | |
| "num_tokens": 8198641.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5457630455493927, | |
| "epoch": 1.8822429906542055, | |
| "grad_norm": 0.030688611790537834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442954897880554, | |
| "mean_token_accuracy": 0.7765072137117386, | |
| "num_tokens": 8214799.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5432839095592499, | |
| "epoch": 1.885981308411215, | |
| "grad_norm": 0.0319070965051651, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535275936126709, | |
| "mean_token_accuracy": 0.7709672451019287, | |
| "num_tokens": 8230973.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5594919174909592, | |
| "epoch": 1.8897196261682243, | |
| "grad_norm": 0.04258793592453003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607203841209412, | |
| "mean_token_accuracy": 0.7712259739637375, | |
| "num_tokens": 8247156.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5589391887187958, | |
| "epoch": 1.8934579439252337, | |
| "grad_norm": 0.033864762634038925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650033950805664, | |
| "mean_token_accuracy": 0.7718524932861328, | |
| "num_tokens": 8263441.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5569577813148499, | |
| "epoch": 1.897196261682243, | |
| "grad_norm": 0.03338006138801575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555600523948669, | |
| "mean_token_accuracy": 0.7759018540382385, | |
| "num_tokens": 8279848.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5524785667657852, | |
| "epoch": 1.9009345794392525, | |
| "grad_norm": 0.034291088581085205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554389238357544, | |
| "mean_token_accuracy": 0.7732797265052795, | |
| "num_tokens": 8296286.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.5341912508010864, | |
| "epoch": 1.9046728971962616, | |
| "grad_norm": 0.03332460671663284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296705365180969, | |
| "mean_token_accuracy": 0.7850336581468582, | |
| "num_tokens": 8312462.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5374017357826233, | |
| "epoch": 1.908411214953271, | |
| "grad_norm": 0.029762303456664085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377117395401001, | |
| "mean_token_accuracy": 0.7782561480998993, | |
| "num_tokens": 8328514.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5621481090784073, | |
| "epoch": 1.9121495327102802, | |
| "grad_norm": 0.02770383097231388, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556929349899292, | |
| "mean_token_accuracy": 0.7750183939933777, | |
| "num_tokens": 8345018.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.5308145210146904, | |
| "epoch": 1.9158878504672896, | |
| "grad_norm": 0.031799450516700745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367879867553711, | |
| "mean_token_accuracy": 0.7811458259820938, | |
| "num_tokens": 8361450.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5505598485469818, | |
| "epoch": 1.919626168224299, | |
| "grad_norm": 0.030035199597477913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55583256483078, | |
| "mean_token_accuracy": 0.7735087871551514, | |
| "num_tokens": 8378205.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5498997569084167, | |
| "epoch": 1.9233644859813084, | |
| "grad_norm": 0.031478267163038254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554360568523407, | |
| "mean_token_accuracy": 0.7755851894617081, | |
| "num_tokens": 8394730.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5447141826152802, | |
| "epoch": 1.9271028037383178, | |
| "grad_norm": 0.034256696701049805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524182915687561, | |
| "mean_token_accuracy": 0.7743232250213623, | |
| "num_tokens": 8410799.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5548212677240372, | |
| "epoch": 1.9308411214953272, | |
| "grad_norm": 0.0296107679605484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498183965682983, | |
| "mean_token_accuracy": 0.7740313857793808, | |
| "num_tokens": 8427372.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.5684213787317276, | |
| "epoch": 1.9345794392523366, | |
| "grad_norm": 0.03422481194138527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559389591217041, | |
| "mean_token_accuracy": 0.7754881531000137, | |
| "num_tokens": 8443822.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.5545912981033325, | |
| "epoch": 1.938317757009346, | |
| "grad_norm": 0.031684234738349915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498573780059814, | |
| "mean_token_accuracy": 0.7783227860927582, | |
| "num_tokens": 8460032.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5595797300338745, | |
| "epoch": 1.9420560747663551, | |
| "grad_norm": 0.02719406597316265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614221096038818, | |
| "mean_token_accuracy": 0.7715103030204773, | |
| "num_tokens": 8476297.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5345963835716248, | |
| "epoch": 1.9457943925233645, | |
| "grad_norm": 0.03023097850382328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425735116004944, | |
| "mean_token_accuracy": 0.7805851995944977, | |
| "num_tokens": 8492637.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5391188263893127, | |
| "epoch": 1.9495327102803737, | |
| "grad_norm": 0.05476713180541992, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556075572967529, | |
| "mean_token_accuracy": 0.7749961167573929, | |
| "num_tokens": 8509129.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5553655624389648, | |
| "epoch": 1.953271028037383, | |
| "grad_norm": 0.03542236238718033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655393004417419, | |
| "mean_token_accuracy": 0.7717009782791138, | |
| "num_tokens": 8525641.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5613285005092621, | |
| "epoch": 1.9570093457943925, | |
| "grad_norm": 0.06946822255849838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5717962384223938, | |
| "mean_token_accuracy": 0.7724136412143707, | |
| "num_tokens": 8542275.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5575561076402664, | |
| "epoch": 1.9607476635514018, | |
| "grad_norm": 0.03460278734564781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417395830154419, | |
| "mean_token_accuracy": 0.7819567322731018, | |
| "num_tokens": 8558373.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5704021006822586, | |
| "epoch": 1.9644859813084112, | |
| "grad_norm": 0.030037706717848778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573901534080505, | |
| "mean_token_accuracy": 0.7713392674922943, | |
| "num_tokens": 8574839.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5286285877227783, | |
| "epoch": 1.9682242990654206, | |
| "grad_norm": 0.032038215547800064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231573581695557, | |
| "mean_token_accuracy": 0.7873097807168961, | |
| "num_tokens": 8591063.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.535316064953804, | |
| "epoch": 1.97196261682243, | |
| "grad_norm": 0.04137961193919182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491993427276611, | |
| "mean_token_accuracy": 0.7760031670331955, | |
| "num_tokens": 8607354.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5287620276212692, | |
| "epoch": 1.9757009345794394, | |
| "grad_norm": 0.03144775703549385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313848853111267, | |
| "mean_token_accuracy": 0.784307450056076, | |
| "num_tokens": 8623542.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5521504878997803, | |
| "epoch": 1.9794392523364486, | |
| "grad_norm": 0.03497127816081047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516395568847656, | |
| "mean_token_accuracy": 0.7736653387546539, | |
| "num_tokens": 8639626.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5580714792013168, | |
| "epoch": 1.983177570093458, | |
| "grad_norm": 0.030566083267331123, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535013675689697, | |
| "mean_token_accuracy": 0.7748955637216568, | |
| "num_tokens": 8655957.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5411636233329773, | |
| "epoch": 1.9869158878504671, | |
| "grad_norm": 0.03356699272990227, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376905202865601, | |
| "mean_token_accuracy": 0.7788012474775314, | |
| "num_tokens": 8672109.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5470294207334518, | |
| "epoch": 1.9906542056074765, | |
| "grad_norm": 0.0316782146692276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445536971092224, | |
| "mean_token_accuracy": 0.7801567167043686, | |
| "num_tokens": 8688512.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5573801398277283, | |
| "epoch": 1.994392523364486, | |
| "grad_norm": 0.0308368057012558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613093376159668, | |
| "mean_token_accuracy": 0.7755008339881897, | |
| "num_tokens": 8704882.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5606262683868408, | |
| "epoch": 1.9981308411214953, | |
| "grad_norm": 0.033759523183107376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5673450827598572, | |
| "mean_token_accuracy": 0.7693974524736404, | |
| "num_tokens": 8721476.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5470572412014008, | |
| "epoch": 2.0, | |
| "grad_norm": 0.045990657061338425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525597333908081, | |
| "mean_token_accuracy": 0.7788615226745605, | |
| "num_tokens": 8729601.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5381215959787369, | |
| "epoch": 2.0037383177570094, | |
| "grad_norm": 0.03212118148803711, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325874090194702, | |
| "mean_token_accuracy": 0.7825482040643692, | |
| "num_tokens": 8745950.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5637937486171722, | |
| "epoch": 2.007476635514019, | |
| "grad_norm": 0.036541201174259186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618294477462769, | |
| "mean_token_accuracy": 0.773602232336998, | |
| "num_tokens": 8762499.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5491923093795776, | |
| "epoch": 2.011214953271028, | |
| "grad_norm": 0.033549197018146515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548430323600769, | |
| "mean_token_accuracy": 0.7764875292778015, | |
| "num_tokens": 8778855.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5251094102859497, | |
| "epoch": 2.0149532710280376, | |
| "grad_norm": 0.036079153418540955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315405130386353, | |
| "mean_token_accuracy": 0.7840714603662491, | |
| "num_tokens": 8794810.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5423221588134766, | |
| "epoch": 2.0186915887850465, | |
| "grad_norm": 0.03329861909151077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420343279838562, | |
| "mean_token_accuracy": 0.7797907888889313, | |
| "num_tokens": 8811426.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5213563144207001, | |
| "epoch": 2.022429906542056, | |
| "grad_norm": 0.03049337863922119, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5193029642105103, | |
| "mean_token_accuracy": 0.7878206521272659, | |
| "num_tokens": 8827505.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5485236346721649, | |
| "epoch": 2.0261682242990653, | |
| "grad_norm": 0.038072168827056885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403975248336792, | |
| "mean_token_accuracy": 0.7787782251834869, | |
| "num_tokens": 8843789.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5497236847877502, | |
| "epoch": 2.0299065420560747, | |
| "grad_norm": 0.037746790796518326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424782037734985, | |
| "mean_token_accuracy": 0.7821084409952164, | |
| "num_tokens": 8860524.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5128878131508827, | |
| "epoch": 2.033644859813084, | |
| "grad_norm": 0.03184136748313904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5119982957839966, | |
| "mean_token_accuracy": 0.7925940603017807, | |
| "num_tokens": 8876520.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.53415547311306, | |
| "epoch": 2.0373831775700935, | |
| "grad_norm": 0.04230194166302681, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436858534812927, | |
| "mean_token_accuracy": 0.7798719555139542, | |
| "num_tokens": 8892800.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.527920126914978, | |
| "epoch": 2.041121495327103, | |
| "grad_norm": 0.035794876515865326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537831723690033, | |
| "mean_token_accuracy": 0.7832628786563873, | |
| "num_tokens": 8908779.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.528620719909668, | |
| "epoch": 2.0448598130841122, | |
| "grad_norm": 0.043260980397462845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385839343070984, | |
| "mean_token_accuracy": 0.7800839692354202, | |
| "num_tokens": 8925225.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5344889611005783, | |
| "epoch": 2.0485981308411216, | |
| "grad_norm": 0.03616830334067345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279685258865356, | |
| "mean_token_accuracy": 0.7877432852983475, | |
| "num_tokens": 8941370.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5505447387695312, | |
| "epoch": 2.052336448598131, | |
| "grad_norm": 0.03392447903752327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464667081832886, | |
| "mean_token_accuracy": 0.778993234038353, | |
| "num_tokens": 8957759.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.537495419383049, | |
| "epoch": 2.05607476635514, | |
| "grad_norm": 0.03487386927008629, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327776074409485, | |
| "mean_token_accuracy": 0.7819164842367172, | |
| "num_tokens": 8974120.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5181033089756966, | |
| "epoch": 2.0598130841121494, | |
| "grad_norm": 0.03655601665377617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5197772979736328, | |
| "mean_token_accuracy": 0.7876780480146408, | |
| "num_tokens": 8990084.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5097288861870766, | |
| "epoch": 2.0635514018691588, | |
| "grad_norm": 0.04094317555427551, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5214163661003113, | |
| "mean_token_accuracy": 0.7877646237611771, | |
| "num_tokens": 9006115.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.5392448753118515, | |
| "epoch": 2.067289719626168, | |
| "grad_norm": 0.042336490005254745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487770438194275, | |
| "mean_token_accuracy": 0.7746841162443161, | |
| "num_tokens": 9022503.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5353204905986786, | |
| "epoch": 2.0710280373831775, | |
| "grad_norm": 0.04751956835389137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423939824104309, | |
| "mean_token_accuracy": 0.7819565683603287, | |
| "num_tokens": 9038587.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.5576211661100388, | |
| "epoch": 2.074766355140187, | |
| "grad_norm": 0.034248773008584976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450438261032104, | |
| "mean_token_accuracy": 0.7806050181388855, | |
| "num_tokens": 9054978.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5164358094334602, | |
| "epoch": 2.0785046728971963, | |
| "grad_norm": 0.03642895817756653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5048035979270935, | |
| "mean_token_accuracy": 0.7946237772703171, | |
| "num_tokens": 9071189.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.5479462146759033, | |
| "epoch": 2.0822429906542057, | |
| "grad_norm": 0.03524266555905342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424850583076477, | |
| "mean_token_accuracy": 0.7782812714576721, | |
| "num_tokens": 9087453.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5207670480012894, | |
| "epoch": 2.085981308411215, | |
| "grad_norm": 0.04086553677916527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275461673736572, | |
| "mean_token_accuracy": 0.7870053201913834, | |
| "num_tokens": 9103538.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5350566729903221, | |
| "epoch": 2.0897196261682245, | |
| "grad_norm": 0.036386121064424515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380175113677979, | |
| "mean_token_accuracy": 0.7814048826694489, | |
| "num_tokens": 9119858.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5368697345256805, | |
| "epoch": 2.0934579439252334, | |
| "grad_norm": 0.039366140961647034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444531440734863, | |
| "mean_token_accuracy": 0.7792541682720184, | |
| "num_tokens": 9136204.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.5295629873871803, | |
| "epoch": 2.097196261682243, | |
| "grad_norm": 0.03559441864490509, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286230444908142, | |
| "mean_token_accuracy": 0.784547358751297, | |
| "num_tokens": 9152718.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.5568843930959702, | |
| "epoch": 2.100934579439252, | |
| "grad_norm": 0.034528154879808426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466718077659607, | |
| "mean_token_accuracy": 0.7782703340053558, | |
| "num_tokens": 9168840.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5514650642871857, | |
| "epoch": 2.1046728971962616, | |
| "grad_norm": 0.034620221704244614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481366515159607, | |
| "mean_token_accuracy": 0.7774865627288818, | |
| "num_tokens": 9185012.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.5468508899211884, | |
| "epoch": 2.108411214953271, | |
| "grad_norm": 0.038367778062820435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465208888053894, | |
| "mean_token_accuracy": 0.7787877917289734, | |
| "num_tokens": 9201579.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5365718752145767, | |
| "epoch": 2.1121495327102804, | |
| "grad_norm": 0.033649299293756485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394605398178101, | |
| "mean_token_accuracy": 0.7824818789958954, | |
| "num_tokens": 9217958.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5342001020908356, | |
| "epoch": 2.1158878504672898, | |
| "grad_norm": 0.04148790240287781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541080892086029, | |
| "mean_token_accuracy": 0.7807753682136536, | |
| "num_tokens": 9234182.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5269056260585785, | |
| "epoch": 2.119626168224299, | |
| "grad_norm": 0.031905628740787506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529283881187439, | |
| "mean_token_accuracy": 0.7837703377008438, | |
| "num_tokens": 9250712.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5335036367177963, | |
| "epoch": 2.1233644859813086, | |
| "grad_norm": 0.041321150958538055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374078154563904, | |
| "mean_token_accuracy": 0.782123014330864, | |
| "num_tokens": 9266961.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5442205667495728, | |
| "epoch": 2.127102803738318, | |
| "grad_norm": 0.034318044781684875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429351329803467, | |
| "mean_token_accuracy": 0.7788351625204086, | |
| "num_tokens": 9283528.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5432394444942474, | |
| "epoch": 2.130841121495327, | |
| "grad_norm": 0.047397077083587646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424203276634216, | |
| "mean_token_accuracy": 0.7810939103364944, | |
| "num_tokens": 9299837.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5400207340717316, | |
| "epoch": 2.1345794392523363, | |
| "grad_norm": 0.03500756248831749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377690196037292, | |
| "mean_token_accuracy": 0.783811166882515, | |
| "num_tokens": 9315959.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5296697020530701, | |
| "epoch": 2.1383177570093457, | |
| "grad_norm": 0.03790782764554024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289957523345947, | |
| "mean_token_accuracy": 0.7867159694433212, | |
| "num_tokens": 9332370.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5078830569982529, | |
| "epoch": 2.142056074766355, | |
| "grad_norm": 0.045958928763866425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5104236006736755, | |
| "mean_token_accuracy": 0.7909017950296402, | |
| "num_tokens": 9348594.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5188925862312317, | |
| "epoch": 2.1457943925233645, | |
| "grad_norm": 0.03916464373469353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316386818885803, | |
| "mean_token_accuracy": 0.7828120291233063, | |
| "num_tokens": 9365046.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5045325607061386, | |
| "epoch": 2.149532710280374, | |
| "grad_norm": 0.04434382542967796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5116738080978394, | |
| "mean_token_accuracy": 0.7905466854572296, | |
| "num_tokens": 9381007.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5541563183069229, | |
| "epoch": 2.1532710280373832, | |
| "grad_norm": 0.038000430911779404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551270842552185, | |
| "mean_token_accuracy": 0.7762157022953033, | |
| "num_tokens": 9397394.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5460502356290817, | |
| "epoch": 2.1570093457943926, | |
| "grad_norm": 0.038676705211400986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363121032714844, | |
| "mean_token_accuracy": 0.7802022695541382, | |
| "num_tokens": 9413810.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5573510080575943, | |
| "epoch": 2.160747663551402, | |
| "grad_norm": 0.03721381351351738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444300174713135, | |
| "mean_token_accuracy": 0.7804805636405945, | |
| "num_tokens": 9430091.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5371396392583847, | |
| "epoch": 2.1644859813084114, | |
| "grad_norm": 0.04258019104599953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351753234863281, | |
| "mean_token_accuracy": 0.7820869237184525, | |
| "num_tokens": 9446665.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5393694788217545, | |
| "epoch": 2.1682242990654204, | |
| "grad_norm": 0.0406467579305172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430103540420532, | |
| "mean_token_accuracy": 0.7779065668582916, | |
| "num_tokens": 9463118.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5272447615861893, | |
| "epoch": 2.1719626168224297, | |
| "grad_norm": 0.04435638338327408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354752540588379, | |
| "mean_token_accuracy": 0.7838975638151169, | |
| "num_tokens": 9479432.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.5255759209394455, | |
| "epoch": 2.175700934579439, | |
| "grad_norm": 0.03574801981449127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531680703163147, | |
| "mean_token_accuracy": 0.7842760384082794, | |
| "num_tokens": 9495707.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5348410457372665, | |
| "epoch": 2.1794392523364485, | |
| "grad_norm": 0.03383009880781174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284703373908997, | |
| "mean_token_accuracy": 0.7889558225870132, | |
| "num_tokens": 9512236.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5311737060546875, | |
| "epoch": 2.183177570093458, | |
| "grad_norm": 0.035349104553461075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332157611846924, | |
| "mean_token_accuracy": 0.7814211249351501, | |
| "num_tokens": 9528589.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5255388617515564, | |
| "epoch": 2.1869158878504673, | |
| "grad_norm": 0.043005745857954025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5251577496528625, | |
| "mean_token_accuracy": 0.7884248644113541, | |
| "num_tokens": 9544965.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5347089469432831, | |
| "epoch": 2.1906542056074767, | |
| "grad_norm": 0.03752923756837845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362472534179688, | |
| "mean_token_accuracy": 0.7811613231897354, | |
| "num_tokens": 9561276.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5310826078057289, | |
| "epoch": 2.194392523364486, | |
| "grad_norm": 0.05228811874985695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329592227935791, | |
| "mean_token_accuracy": 0.7827970087528229, | |
| "num_tokens": 9577509.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5254483968019485, | |
| "epoch": 2.1981308411214955, | |
| "grad_norm": 0.03692999482154846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311483144760132, | |
| "mean_token_accuracy": 0.7830882370471954, | |
| "num_tokens": 9593982.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5360620766878128, | |
| "epoch": 2.201869158878505, | |
| "grad_norm": 0.04609117656946182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386216640472412, | |
| "mean_token_accuracy": 0.7802708595991135, | |
| "num_tokens": 9610311.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5463242679834366, | |
| "epoch": 2.205607476635514, | |
| "grad_norm": 0.03901510685682297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447873473167419, | |
| "mean_token_accuracy": 0.7785727232694626, | |
| "num_tokens": 9626678.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5129301249980927, | |
| "epoch": 2.209345794392523, | |
| "grad_norm": 0.043117035180330276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5128067135810852, | |
| "mean_token_accuracy": 0.7911233007907867, | |
| "num_tokens": 9642843.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.5312749594449997, | |
| "epoch": 2.2130841121495326, | |
| "grad_norm": 0.03675411641597748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329593420028687, | |
| "mean_token_accuracy": 0.7832809239625931, | |
| "num_tokens": 9659218.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5422542840242386, | |
| "epoch": 2.216822429906542, | |
| "grad_norm": 0.036754533648490906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398430824279785, | |
| "mean_token_accuracy": 0.7803453654050827, | |
| "num_tokens": 9675649.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5472271293401718, | |
| "epoch": 2.2205607476635514, | |
| "grad_norm": 0.043753694742918015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421810150146484, | |
| "mean_token_accuracy": 0.7812557965517044, | |
| "num_tokens": 9691932.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5446718335151672, | |
| "epoch": 2.2242990654205608, | |
| "grad_norm": 0.0450102761387825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450670719146729, | |
| "mean_token_accuracy": 0.7795027941465378, | |
| "num_tokens": 9708243.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5422708988189697, | |
| "epoch": 2.22803738317757, | |
| "grad_norm": 0.042899005115032196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427168011665344, | |
| "mean_token_accuracy": 0.7769834697246552, | |
| "num_tokens": 9724620.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5316948816180229, | |
| "epoch": 2.2317757009345796, | |
| "grad_norm": 0.0438719242811203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369054675102234, | |
| "mean_token_accuracy": 0.7818674147129059, | |
| "num_tokens": 9740813.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5353083610534668, | |
| "epoch": 2.235514018691589, | |
| "grad_norm": 0.045174483209848404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535564124584198, | |
| "mean_token_accuracy": 0.7826817184686661, | |
| "num_tokens": 9757081.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.53409144282341, | |
| "epoch": 2.2392523364485983, | |
| "grad_norm": 0.046971406787633896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388940572738647, | |
| "mean_token_accuracy": 0.7797097563743591, | |
| "num_tokens": 9773286.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5229181125760078, | |
| "epoch": 2.2429906542056073, | |
| "grad_norm": 0.04818117991089821, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283955931663513, | |
| "mean_token_accuracy": 0.7855319827795029, | |
| "num_tokens": 9789231.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.5502548068761826, | |
| "epoch": 2.2467289719626167, | |
| "grad_norm": 0.041451770812273026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441420078277588, | |
| "mean_token_accuracy": 0.7805446833372116, | |
| "num_tokens": 9805737.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5555277764797211, | |
| "epoch": 2.250467289719626, | |
| "grad_norm": 0.03888588771224022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571548938751221, | |
| "mean_token_accuracy": 0.7741208076477051, | |
| "num_tokens": 9822370.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5331219285726547, | |
| "epoch": 2.2542056074766355, | |
| "grad_norm": 0.050726499408483505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5355172157287598, | |
| "mean_token_accuracy": 0.7803194671869278, | |
| "num_tokens": 9838846.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5391329601407051, | |
| "epoch": 2.257943925233645, | |
| "grad_norm": 0.03473533317446709, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380818843841553, | |
| "mean_token_accuracy": 0.7837731093168259, | |
| "num_tokens": 9855269.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5419459789991379, | |
| "epoch": 2.2616822429906542, | |
| "grad_norm": 0.04428257793188095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402700304985046, | |
| "mean_token_accuracy": 0.7803330719470978, | |
| "num_tokens": 9871498.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5475794821977615, | |
| "epoch": 2.2654205607476636, | |
| "grad_norm": 0.03847254440188408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443584322929382, | |
| "mean_token_accuracy": 0.7776888459920883, | |
| "num_tokens": 9887880.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5413693785667419, | |
| "epoch": 2.269158878504673, | |
| "grad_norm": 0.03769246116280556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448262095451355, | |
| "mean_token_accuracy": 0.7788306772708893, | |
| "num_tokens": 9904482.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5233470648527145, | |
| "epoch": 2.2728971962616824, | |
| "grad_norm": 0.041845668107271194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302014946937561, | |
| "mean_token_accuracy": 0.7834525555372238, | |
| "num_tokens": 9920720.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.526485301554203, | |
| "epoch": 2.2766355140186914, | |
| "grad_norm": 0.04298217222094536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376767516136169, | |
| "mean_token_accuracy": 0.7815933078527451, | |
| "num_tokens": 9936855.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5407330542802811, | |
| "epoch": 2.2803738317757007, | |
| "grad_norm": 0.03829406201839447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375736951828003, | |
| "mean_token_accuracy": 0.7817153483629227, | |
| "num_tokens": 9953359.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.557465985417366, | |
| "epoch": 2.28411214953271, | |
| "grad_norm": 0.0430569127202034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485789775848389, | |
| "mean_token_accuracy": 0.7774669080972672, | |
| "num_tokens": 9969809.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5491045266389847, | |
| "epoch": 2.2878504672897195, | |
| "grad_norm": 0.04154661297798157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452516078948975, | |
| "mean_token_accuracy": 0.7782464772462845, | |
| "num_tokens": 9986122.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5396340191364288, | |
| "epoch": 2.291588785046729, | |
| "grad_norm": 0.03867339715361595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436422228813171, | |
| "mean_token_accuracy": 0.7793163359165192, | |
| "num_tokens": 10002373.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5227179303765297, | |
| "epoch": 2.2953271028037383, | |
| "grad_norm": 0.055158648639917374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356475710868835, | |
| "mean_token_accuracy": 0.7828944474458694, | |
| "num_tokens": 10018532.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5101833418011665, | |
| "epoch": 2.2990654205607477, | |
| "grad_norm": 0.04139378294348717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5111054182052612, | |
| "mean_token_accuracy": 0.7948217988014221, | |
| "num_tokens": 10034449.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5332518517971039, | |
| "epoch": 2.302803738317757, | |
| "grad_norm": 0.042138371616601944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291332602500916, | |
| "mean_token_accuracy": 0.7875723540782928, | |
| "num_tokens": 10050791.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5545465350151062, | |
| "epoch": 2.3065420560747665, | |
| "grad_norm": 0.04594315588474274, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547114610671997, | |
| "mean_token_accuracy": 0.7752625793218613, | |
| "num_tokens": 10067160.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.538428008556366, | |
| "epoch": 2.310280373831776, | |
| "grad_norm": 0.038197144865989685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356147885322571, | |
| "mean_token_accuracy": 0.7812609076499939, | |
| "num_tokens": 10083623.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.515357218682766, | |
| "epoch": 2.3140186915887853, | |
| "grad_norm": 0.04305245727300644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5182097554206848, | |
| "mean_token_accuracy": 0.7897254973649979, | |
| "num_tokens": 10099734.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5176303833723068, | |
| "epoch": 2.317757009345794, | |
| "grad_norm": 0.040814559906721115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241186618804932, | |
| "mean_token_accuracy": 0.7862492203712463, | |
| "num_tokens": 10115923.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5319753438234329, | |
| "epoch": 2.3214953271028036, | |
| "grad_norm": 0.038612622767686844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332948565483093, | |
| "mean_token_accuracy": 0.7826831489801407, | |
| "num_tokens": 10132186.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5231878906488419, | |
| "epoch": 2.325233644859813, | |
| "grad_norm": 0.04399793595075607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220815539360046, | |
| "mean_token_accuracy": 0.7883405387401581, | |
| "num_tokens": 10148176.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5503655076026917, | |
| "epoch": 2.3289719626168224, | |
| "grad_norm": 0.03310840204358101, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424314737319946, | |
| "mean_token_accuracy": 0.7791298031806946, | |
| "num_tokens": 10164602.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5562791079282761, | |
| "epoch": 2.3327102803738318, | |
| "grad_norm": 0.046219419687986374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487840175628662, | |
| "mean_token_accuracy": 0.7803521156311035, | |
| "num_tokens": 10180910.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.536386102437973, | |
| "epoch": 2.336448598130841, | |
| "grad_norm": 0.038521721959114075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320638418197632, | |
| "mean_token_accuracy": 0.7856791615486145, | |
| "num_tokens": 10197138.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5220321416854858, | |
| "epoch": 2.3401869158878505, | |
| "grad_norm": 0.046215180307626724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289742946624756, | |
| "mean_token_accuracy": 0.784678503870964, | |
| "num_tokens": 10213246.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5178990513086319, | |
| "epoch": 2.34392523364486, | |
| "grad_norm": 0.04778464511036873, | |
| "learning_rate": 0.0002, | |
| "loss": 0.522329568862915, | |
| "mean_token_accuracy": 0.7881183475255966, | |
| "num_tokens": 10229431.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5353438407182693, | |
| "epoch": 2.3476635514018693, | |
| "grad_norm": 0.04080234467983246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433787107467651, | |
| "mean_token_accuracy": 0.7780589759349823, | |
| "num_tokens": 10245684.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5368916243314743, | |
| "epoch": 2.3514018691588783, | |
| "grad_norm": 0.043697554618120193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541444718837738, | |
| "mean_token_accuracy": 0.7807413637638092, | |
| "num_tokens": 10262210.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5506647378206253, | |
| "epoch": 2.3551401869158877, | |
| "grad_norm": 0.038478951901197433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461610555648804, | |
| "mean_token_accuracy": 0.7788456082344055, | |
| "num_tokens": 10278611.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5395764261484146, | |
| "epoch": 2.358878504672897, | |
| "grad_norm": 0.03904217854142189, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317508578300476, | |
| "mean_token_accuracy": 0.7833081781864166, | |
| "num_tokens": 10294800.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5478651374578476, | |
| "epoch": 2.3626168224299064, | |
| "grad_norm": 0.048824410885572433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395293831825256, | |
| "mean_token_accuracy": 0.783235713839531, | |
| "num_tokens": 10311090.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5332029610872269, | |
| "epoch": 2.366355140186916, | |
| "grad_norm": 0.04313044250011444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401085615158081, | |
| "mean_token_accuracy": 0.778812825679779, | |
| "num_tokens": 10327348.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5406146496534348, | |
| "epoch": 2.3700934579439252, | |
| "grad_norm": 0.04600725322961807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516705513000488, | |
| "mean_token_accuracy": 0.7761097699403763, | |
| "num_tokens": 10343800.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5261052846908569, | |
| "epoch": 2.3738317757009346, | |
| "grad_norm": 0.045134712010622025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412300825119019, | |
| "mean_token_accuracy": 0.7802619636058807, | |
| "num_tokens": 10360082.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5589279979467392, | |
| "epoch": 2.377570093457944, | |
| "grad_norm": 0.041725922375917435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517748594284058, | |
| "mean_token_accuracy": 0.778441995382309, | |
| "num_tokens": 10376345.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5504082888364792, | |
| "epoch": 2.3813084112149534, | |
| "grad_norm": 0.03725145012140274, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404931306838989, | |
| "mean_token_accuracy": 0.7776447534561157, | |
| "num_tokens": 10392870.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5359382033348083, | |
| "epoch": 2.385046728971963, | |
| "grad_norm": 0.0364760085940361, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533162534236908, | |
| "mean_token_accuracy": 0.7851890027523041, | |
| "num_tokens": 10409256.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5336398631334305, | |
| "epoch": 2.388785046728972, | |
| "grad_norm": 0.036078356206417084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374175906181335, | |
| "mean_token_accuracy": 0.7814856320619583, | |
| "num_tokens": 10425831.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5284569710493088, | |
| "epoch": 2.392523364485981, | |
| "grad_norm": 0.04704172909259796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387214422225952, | |
| "mean_token_accuracy": 0.7815752625465393, | |
| "num_tokens": 10442382.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.5344073623418808, | |
| "epoch": 2.3962616822429905, | |
| "grad_norm": 0.0398792028427124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398225784301758, | |
| "mean_token_accuracy": 0.7818136066198349, | |
| "num_tokens": 10458810.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5323895663022995, | |
| "epoch": 2.4, | |
| "grad_norm": 0.037454817444086075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368887782096863, | |
| "mean_token_accuracy": 0.7800801247358322, | |
| "num_tokens": 10474692.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5394662618637085, | |
| "epoch": 2.4037383177570093, | |
| "grad_norm": 0.03576047718524933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351858735084534, | |
| "mean_token_accuracy": 0.7815855145454407, | |
| "num_tokens": 10491015.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.547369509935379, | |
| "epoch": 2.4074766355140187, | |
| "grad_norm": 0.0398087315261364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397285223007202, | |
| "mean_token_accuracy": 0.7805114239454269, | |
| "num_tokens": 10507366.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5508280843496323, | |
| "epoch": 2.411214953271028, | |
| "grad_norm": 0.03709566593170166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448777675628662, | |
| "mean_token_accuracy": 0.7763405591249466, | |
| "num_tokens": 10523374.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5248509049415588, | |
| "epoch": 2.4149532710280375, | |
| "grad_norm": 0.03418833017349243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5208706855773926, | |
| "mean_token_accuracy": 0.7874817848205566, | |
| "num_tokens": 10539624.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.5466809421777725, | |
| "epoch": 2.418691588785047, | |
| "grad_norm": 0.039764732122421265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513855218887329, | |
| "mean_token_accuracy": 0.776073694229126, | |
| "num_tokens": 10556212.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5117013603448868, | |
| "epoch": 2.4224299065420563, | |
| "grad_norm": 0.04086057096719742, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219972729682922, | |
| "mean_token_accuracy": 0.7889275252819061, | |
| "num_tokens": 10572323.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5393745452165604, | |
| "epoch": 2.426168224299065, | |
| "grad_norm": 0.037193622440099716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456075668334961, | |
| "mean_token_accuracy": 0.7753270417451859, | |
| "num_tokens": 10588533.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5517471730709076, | |
| "epoch": 2.4299065420560746, | |
| "grad_norm": 0.04061353579163551, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480504035949707, | |
| "mean_token_accuracy": 0.7777185589075089, | |
| "num_tokens": 10604736.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5332285165786743, | |
| "epoch": 2.433644859813084, | |
| "grad_norm": 0.037262339144945145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52723628282547, | |
| "mean_token_accuracy": 0.7820963263511658, | |
| "num_tokens": 10621005.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5427125096321106, | |
| "epoch": 2.4373831775700934, | |
| "grad_norm": 0.038290560245513916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433245897293091, | |
| "mean_token_accuracy": 0.7764440774917603, | |
| "num_tokens": 10637274.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.515294149518013, | |
| "epoch": 2.4411214953271028, | |
| "grad_norm": 0.07859813421964645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5192139744758606, | |
| "mean_token_accuracy": 0.7903406471014023, | |
| "num_tokens": 10653571.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5411062091588974, | |
| "epoch": 2.444859813084112, | |
| "grad_norm": 0.04054918885231018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439664721488953, | |
| "mean_token_accuracy": 0.7815183401107788, | |
| "num_tokens": 10670139.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5487605780363083, | |
| "epoch": 2.4485981308411215, | |
| "grad_norm": 0.04026317596435547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495845675468445, | |
| "mean_token_accuracy": 0.7765460163354874, | |
| "num_tokens": 10686846.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5351516157388687, | |
| "epoch": 2.452336448598131, | |
| "grad_norm": 0.040862392634153366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336912870407104, | |
| "mean_token_accuracy": 0.7818685173988342, | |
| "num_tokens": 10703200.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5463723838329315, | |
| "epoch": 2.4560747663551403, | |
| "grad_norm": 0.03873393312096596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465680360794067, | |
| "mean_token_accuracy": 0.7760122418403625, | |
| "num_tokens": 10719561.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5416133552789688, | |
| "epoch": 2.4598130841121497, | |
| "grad_norm": 0.044795434921979904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411824584007263, | |
| "mean_token_accuracy": 0.7804904133081436, | |
| "num_tokens": 10735767.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5494029372930527, | |
| "epoch": 2.463551401869159, | |
| "grad_norm": 0.04379895702004433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456870198249817, | |
| "mean_token_accuracy": 0.7755402028560638, | |
| "num_tokens": 10751886.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5367189347743988, | |
| "epoch": 2.467289719626168, | |
| "grad_norm": 0.03852448984980583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393000841140747, | |
| "mean_token_accuracy": 0.7800532579421997, | |
| "num_tokens": 10768210.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5270116031169891, | |
| "epoch": 2.4710280373831774, | |
| "grad_norm": 0.03792192041873932, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289605259895325, | |
| "mean_token_accuracy": 0.7838020473718643, | |
| "num_tokens": 10784434.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5338448286056519, | |
| "epoch": 2.474766355140187, | |
| "grad_norm": 0.0350453220307827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380920767784119, | |
| "mean_token_accuracy": 0.7818057388067245, | |
| "num_tokens": 10800619.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5228566378355026, | |
| "epoch": 2.4785046728971962, | |
| "grad_norm": 0.046152058988809586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300622582435608, | |
| "mean_token_accuracy": 0.7793385684490204, | |
| "num_tokens": 10816801.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5290849655866623, | |
| "epoch": 2.4822429906542056, | |
| "grad_norm": 0.03659910336136818, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329374074935913, | |
| "mean_token_accuracy": 0.7838267683982849, | |
| "num_tokens": 10833095.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.545561358332634, | |
| "epoch": 2.485981308411215, | |
| "grad_norm": 0.04097100347280502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479649901390076, | |
| "mean_token_accuracy": 0.7784263789653778, | |
| "num_tokens": 10849473.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5502291470766068, | |
| "epoch": 2.4897196261682244, | |
| "grad_norm": 0.04253846034407616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466883182525635, | |
| "mean_token_accuracy": 0.7778628617525101, | |
| "num_tokens": 10865837.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5474338084459305, | |
| "epoch": 2.493457943925234, | |
| "grad_norm": 0.037734732031822205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415964126586914, | |
| "mean_token_accuracy": 0.7777974009513855, | |
| "num_tokens": 10882273.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.5401993542909622, | |
| "epoch": 2.497196261682243, | |
| "grad_norm": 0.039542876183986664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339391231536865, | |
| "mean_token_accuracy": 0.784349262714386, | |
| "num_tokens": 10898780.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5420306771993637, | |
| "epoch": 2.500934579439252, | |
| "grad_norm": 0.049927666783332825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389054417610168, | |
| "mean_token_accuracy": 0.7841761559247971, | |
| "num_tokens": 10915059.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.5333422720432281, | |
| "epoch": 2.5046728971962615, | |
| "grad_norm": 0.042702775448560715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403023958206177, | |
| "mean_token_accuracy": 0.7792320251464844, | |
| "num_tokens": 10931718.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5289912968873978, | |
| "epoch": 2.508411214953271, | |
| "grad_norm": 0.050530027598142624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404794216156006, | |
| "mean_token_accuracy": 0.7815851122140884, | |
| "num_tokens": 10948084.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.5341697633266449, | |
| "epoch": 2.5121495327102803, | |
| "grad_norm": 0.04310121387243271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389139652252197, | |
| "mean_token_accuracy": 0.778786912560463, | |
| "num_tokens": 10964373.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5569636076688766, | |
| "epoch": 2.5158878504672897, | |
| "grad_norm": 0.03820215165615082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578426122665405, | |
| "mean_token_accuracy": 0.7730483710765839, | |
| "num_tokens": 10980732.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5347766578197479, | |
| "epoch": 2.519626168224299, | |
| "grad_norm": 0.04349920526146889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336275100708008, | |
| "mean_token_accuracy": 0.7815207839012146, | |
| "num_tokens": 10997005.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5299794673919678, | |
| "epoch": 2.5233644859813085, | |
| "grad_norm": 0.04003509134054184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294742584228516, | |
| "mean_token_accuracy": 0.7869250029325485, | |
| "num_tokens": 11013055.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5352783799171448, | |
| "epoch": 2.527102803738318, | |
| "grad_norm": 0.054121218621730804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448738932609558, | |
| "mean_token_accuracy": 0.7791888117790222, | |
| "num_tokens": 11029266.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5354646146297455, | |
| "epoch": 2.5308411214953273, | |
| "grad_norm": 0.03573855757713318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352723002433777, | |
| "mean_token_accuracy": 0.7825258523225784, | |
| "num_tokens": 11045806.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.556391716003418, | |
| "epoch": 2.5345794392523366, | |
| "grad_norm": 0.04871753975749016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602859258651733, | |
| "mean_token_accuracy": 0.7722157090902328, | |
| "num_tokens": 11062035.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5508870929479599, | |
| "epoch": 2.538317757009346, | |
| "grad_norm": 0.03932088986039162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469393730163574, | |
| "mean_token_accuracy": 0.7782620638608932, | |
| "num_tokens": 11078375.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5481788516044617, | |
| "epoch": 2.542056074766355, | |
| "grad_norm": 0.04463294520974159, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469505190849304, | |
| "mean_token_accuracy": 0.7766976356506348, | |
| "num_tokens": 11094977.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5154567137360573, | |
| "epoch": 2.5457943925233644, | |
| "grad_norm": 0.044517725706100464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5210436582565308, | |
| "mean_token_accuracy": 0.7881979048252106, | |
| "num_tokens": 11110907.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5250661969184875, | |
| "epoch": 2.5495327102803738, | |
| "grad_norm": 0.03574059158563614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239285826683044, | |
| "mean_token_accuracy": 0.7901371419429779, | |
| "num_tokens": 11127432.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.541177287697792, | |
| "epoch": 2.553271028037383, | |
| "grad_norm": 0.03583724424242973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399287343025208, | |
| "mean_token_accuracy": 0.7795550227165222, | |
| "num_tokens": 11143788.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5319067388772964, | |
| "epoch": 2.5570093457943925, | |
| "grad_norm": 0.038700610399246216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372647047042847, | |
| "mean_token_accuracy": 0.7816288769245148, | |
| "num_tokens": 11160145.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5243031531572342, | |
| "epoch": 2.560747663551402, | |
| "grad_norm": 0.0457780659198761, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248138308525085, | |
| "mean_token_accuracy": 0.7840212136507034, | |
| "num_tokens": 11176075.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.5483701825141907, | |
| "epoch": 2.5644859813084113, | |
| "grad_norm": 0.0399782694876194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485758185386658, | |
| "mean_token_accuracy": 0.7779590934514999, | |
| "num_tokens": 11192293.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.5290739685297012, | |
| "epoch": 2.5682242990654207, | |
| "grad_norm": 0.056546278297901154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325236320495605, | |
| "mean_token_accuracy": 0.7835103422403336, | |
| "num_tokens": 11208542.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5161010921001434, | |
| "epoch": 2.5719626168224297, | |
| "grad_norm": 0.042589396238327026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5185222625732422, | |
| "mean_token_accuracy": 0.7873405963182449, | |
| "num_tokens": 11224578.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.5410270541906357, | |
| "epoch": 2.575700934579439, | |
| "grad_norm": 0.05106229707598686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452054142951965, | |
| "mean_token_accuracy": 0.7787328362464905, | |
| "num_tokens": 11240887.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.5375277251005173, | |
| "epoch": 2.5794392523364484, | |
| "grad_norm": 0.03891480341553688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347110033035278, | |
| "mean_token_accuracy": 0.7833239287137985, | |
| "num_tokens": 11256921.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5428935289382935, | |
| "epoch": 2.583177570093458, | |
| "grad_norm": 0.04642964154481888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380253195762634, | |
| "mean_token_accuracy": 0.7818872332572937, | |
| "num_tokens": 11273253.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5503559708595276, | |
| "epoch": 2.586915887850467, | |
| "grad_norm": 0.04631572589278221, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499509572982788, | |
| "mean_token_accuracy": 0.7778131514787674, | |
| "num_tokens": 11289524.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5296535789966583, | |
| "epoch": 2.5906542056074766, | |
| "grad_norm": 0.04232152923941612, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5292780995368958, | |
| "mean_token_accuracy": 0.7848498374223709, | |
| "num_tokens": 11305878.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5324369296431541, | |
| "epoch": 2.594392523364486, | |
| "grad_norm": 0.04305447265505791, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328658223152161, | |
| "mean_token_accuracy": 0.7839655876159668, | |
| "num_tokens": 11322266.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5353843569755554, | |
| "epoch": 2.5981308411214954, | |
| "grad_norm": 0.04098288714885712, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361748933792114, | |
| "mean_token_accuracy": 0.7821073234081268, | |
| "num_tokens": 11338684.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5268280059099197, | |
| "epoch": 2.601869158878505, | |
| "grad_norm": 0.05113406851887703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360528230667114, | |
| "mean_token_accuracy": 0.7813736945390701, | |
| "num_tokens": 11354924.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5334519147872925, | |
| "epoch": 2.605607476635514, | |
| "grad_norm": 0.036048226058483124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367494225502014, | |
| "mean_token_accuracy": 0.782368615269661, | |
| "num_tokens": 11371138.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.5625623911619186, | |
| "epoch": 2.6093457943925236, | |
| "grad_norm": 0.04338160157203674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562830567359924, | |
| "mean_token_accuracy": 0.7749900668859482, | |
| "num_tokens": 11387674.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5387382507324219, | |
| "epoch": 2.613084112149533, | |
| "grad_norm": 0.04549875482916832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360974073410034, | |
| "mean_token_accuracy": 0.781986802816391, | |
| "num_tokens": 11403934.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5418427735567093, | |
| "epoch": 2.616822429906542, | |
| "grad_norm": 0.04425078630447388, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500712990760803, | |
| "mean_token_accuracy": 0.7762207537889481, | |
| "num_tokens": 11420207.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5345925241708755, | |
| "epoch": 2.6205607476635513, | |
| "grad_norm": 0.0503389798104763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410506129264832, | |
| "mean_token_accuracy": 0.7824158221483231, | |
| "num_tokens": 11436366.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.5293083861470222, | |
| "epoch": 2.6242990654205607, | |
| "grad_norm": 0.03849806264042854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313189625740051, | |
| "mean_token_accuracy": 0.7851823717355728, | |
| "num_tokens": 11452692.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5381535738706589, | |
| "epoch": 2.62803738317757, | |
| "grad_norm": 0.04830117151141167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306882262229919, | |
| "mean_token_accuracy": 0.7875523120164871, | |
| "num_tokens": 11468948.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5537677556276321, | |
| "epoch": 2.6317757009345795, | |
| "grad_norm": 0.03648355230689049, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549413800239563, | |
| "mean_token_accuracy": 0.7742456942796707, | |
| "num_tokens": 11485304.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5376065969467163, | |
| "epoch": 2.635514018691589, | |
| "grad_norm": 0.03775647282600403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347313284873962, | |
| "mean_token_accuracy": 0.7820166647434235, | |
| "num_tokens": 11501515.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.5389592945575714, | |
| "epoch": 2.6392523364485982, | |
| "grad_norm": 0.03849456459283829, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542040228843689, | |
| "mean_token_accuracy": 0.7777668088674545, | |
| "num_tokens": 11517823.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5297961235046387, | |
| "epoch": 2.6429906542056076, | |
| "grad_norm": 0.03884672373533249, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295203924179077, | |
| "mean_token_accuracy": 0.7848687022924423, | |
| "num_tokens": 11534089.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5374749451875687, | |
| "epoch": 2.6467289719626166, | |
| "grad_norm": 0.040985025465488434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486632585525513, | |
| "mean_token_accuracy": 0.7780227363109589, | |
| "num_tokens": 11550404.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5216163545846939, | |
| "epoch": 2.650467289719626, | |
| "grad_norm": 0.041445303708314896, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271479487419128, | |
| "mean_token_accuracy": 0.7851904779672623, | |
| "num_tokens": 11566700.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.548863023519516, | |
| "epoch": 2.6542056074766354, | |
| "grad_norm": 0.03768117353320122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421991944313049, | |
| "mean_token_accuracy": 0.7786275446414948, | |
| "num_tokens": 11583296.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5540084540843964, | |
| "epoch": 2.6579439252336448, | |
| "grad_norm": 0.03594231605529785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558887720108032, | |
| "mean_token_accuracy": 0.775081142783165, | |
| "num_tokens": 11599637.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.528472974896431, | |
| "epoch": 2.661682242990654, | |
| "grad_norm": 0.03718520700931549, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5246076583862305, | |
| "mean_token_accuracy": 0.7852199673652649, | |
| "num_tokens": 11615767.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.546594500541687, | |
| "epoch": 2.6654205607476635, | |
| "grad_norm": 0.042944129556417465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401133298873901, | |
| "mean_token_accuracy": 0.7802519649267197, | |
| "num_tokens": 11632056.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5382472574710846, | |
| "epoch": 2.669158878504673, | |
| "grad_norm": 0.04242360591888428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468363761901855, | |
| "mean_token_accuracy": 0.7763016223907471, | |
| "num_tokens": 11648587.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5384316891431808, | |
| "epoch": 2.6728971962616823, | |
| "grad_norm": 0.04231888800859451, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447696447372437, | |
| "mean_token_accuracy": 0.7771705389022827, | |
| "num_tokens": 11665216.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.536566972732544, | |
| "epoch": 2.6766355140186917, | |
| "grad_norm": 0.051330000162124634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337138175964355, | |
| "mean_token_accuracy": 0.7841814905405045, | |
| "num_tokens": 11681565.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5605298280715942, | |
| "epoch": 2.680373831775701, | |
| "grad_norm": 0.04393962025642395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522550344467163, | |
| "mean_token_accuracy": 0.7745645940303802, | |
| "num_tokens": 11697734.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5421400368213654, | |
| "epoch": 2.6841121495327105, | |
| "grad_norm": 0.04087737947702408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356095433235168, | |
| "mean_token_accuracy": 0.7823581695556641, | |
| "num_tokens": 11714256.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.5455932766199112, | |
| "epoch": 2.68785046728972, | |
| "grad_norm": 0.04586983844637871, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500515699386597, | |
| "mean_token_accuracy": 0.7770348936319351, | |
| "num_tokens": 11730670.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.521054208278656, | |
| "epoch": 2.691588785046729, | |
| "grad_norm": 0.04511021822690964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274732112884521, | |
| "mean_token_accuracy": 0.7863785922527313, | |
| "num_tokens": 11747011.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5369152277708054, | |
| "epoch": 2.695327102803738, | |
| "grad_norm": 0.04111414775252342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466327667236328, | |
| "mean_token_accuracy": 0.7800845950841904, | |
| "num_tokens": 11763325.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5467284768819809, | |
| "epoch": 2.6990654205607476, | |
| "grad_norm": 0.04847726225852966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574571490287781, | |
| "mean_token_accuracy": 0.7709622234106064, | |
| "num_tokens": 11779629.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.556825578212738, | |
| "epoch": 2.702803738317757, | |
| "grad_norm": 0.04135042428970337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567163228988647, | |
| "mean_token_accuracy": 0.773699164390564, | |
| "num_tokens": 11795735.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5429602861404419, | |
| "epoch": 2.7065420560747664, | |
| "grad_norm": 0.0402897410094738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313383936882019, | |
| "mean_token_accuracy": 0.7854284048080444, | |
| "num_tokens": 11812127.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.5411138385534286, | |
| "epoch": 2.710280373831776, | |
| "grad_norm": 0.04476531967520714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395961403846741, | |
| "mean_token_accuracy": 0.7811660319566727, | |
| "num_tokens": 11828424.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5500029474496841, | |
| "epoch": 2.714018691588785, | |
| "grad_norm": 0.03904065489768982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481054186820984, | |
| "mean_token_accuracy": 0.7797027230262756, | |
| "num_tokens": 11844904.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5594752728939056, | |
| "epoch": 2.717757009345794, | |
| "grad_norm": 0.04920347407460213, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5654065012931824, | |
| "mean_token_accuracy": 0.7703305035829544, | |
| "num_tokens": 11861341.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5409399420022964, | |
| "epoch": 2.7214953271028035, | |
| "grad_norm": 0.04093843698501587, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432956218719482, | |
| "mean_token_accuracy": 0.7790299355983734, | |
| "num_tokens": 11877689.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5429576933383942, | |
| "epoch": 2.725233644859813, | |
| "grad_norm": 0.049346111714839935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55011385679245, | |
| "mean_token_accuracy": 0.77861687541008, | |
| "num_tokens": 11893814.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5407661944627762, | |
| "epoch": 2.7289719626168223, | |
| "grad_norm": 0.0420721061527729, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426504015922546, | |
| "mean_token_accuracy": 0.7803787589073181, | |
| "num_tokens": 11910096.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5468227863311768, | |
| "epoch": 2.7327102803738317, | |
| "grad_norm": 0.0373503714799881, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417306423187256, | |
| "mean_token_accuracy": 0.782159686088562, | |
| "num_tokens": 11926285.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5427874177694321, | |
| "epoch": 2.736448598130841, | |
| "grad_norm": 0.041012153029441833, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334447622299194, | |
| "mean_token_accuracy": 0.7827651649713516, | |
| "num_tokens": 11942656.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.5550535768270493, | |
| "epoch": 2.7401869158878505, | |
| "grad_norm": 0.03842266649007797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497796535491943, | |
| "mean_token_accuracy": 0.7729970514774323, | |
| "num_tokens": 11959059.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5359070003032684, | |
| "epoch": 2.74392523364486, | |
| "grad_norm": 0.039268966764211655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411967039108276, | |
| "mean_token_accuracy": 0.7831978797912598, | |
| "num_tokens": 11975265.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5536347031593323, | |
| "epoch": 2.7476635514018692, | |
| "grad_norm": 0.045411862432956696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618187189102173, | |
| "mean_token_accuracy": 0.7741181403398514, | |
| "num_tokens": 11991498.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5233520418405533, | |
| "epoch": 2.7514018691588786, | |
| "grad_norm": 0.040144748985767365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300607681274414, | |
| "mean_token_accuracy": 0.7847813218832016, | |
| "num_tokens": 12007487.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5281567052006721, | |
| "epoch": 2.755140186915888, | |
| "grad_norm": 0.04088376462459564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294374823570251, | |
| "mean_token_accuracy": 0.7852809429168701, | |
| "num_tokens": 12023900.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5510239601135254, | |
| "epoch": 2.7588785046728974, | |
| "grad_norm": 0.04011458903551102, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465855002403259, | |
| "mean_token_accuracy": 0.7779260277748108, | |
| "num_tokens": 12040338.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.57439024746418, | |
| "epoch": 2.762616822429907, | |
| "grad_norm": 0.036590199917554855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653122663497925, | |
| "mean_token_accuracy": 0.7694305032491684, | |
| "num_tokens": 12056958.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5615127831697464, | |
| "epoch": 2.7663551401869158, | |
| "grad_norm": 0.036815449595451355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550983190536499, | |
| "mean_token_accuracy": 0.7743483930826187, | |
| "num_tokens": 12073644.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5349987298250198, | |
| "epoch": 2.770093457943925, | |
| "grad_norm": 0.03783464804291725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378219485282898, | |
| "mean_token_accuracy": 0.7834212332963943, | |
| "num_tokens": 12090085.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5288607105612755, | |
| "epoch": 2.7738317757009345, | |
| "grad_norm": 0.047371115535497665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444093346595764, | |
| "mean_token_accuracy": 0.7794700562953949, | |
| "num_tokens": 12106341.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5414262413978577, | |
| "epoch": 2.777570093457944, | |
| "grad_norm": 0.04306622967123985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548575222492218, | |
| "mean_token_accuracy": 0.7780982106924057, | |
| "num_tokens": 12122689.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5265444070100784, | |
| "epoch": 2.7813084112149533, | |
| "grad_norm": 0.038641780614852905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287938117980957, | |
| "mean_token_accuracy": 0.7837643325328827, | |
| "num_tokens": 12138802.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5466189384460449, | |
| "epoch": 2.7850467289719627, | |
| "grad_norm": 0.0338594987988472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439702272415161, | |
| "mean_token_accuracy": 0.7782793641090393, | |
| "num_tokens": 12154981.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5158288925886154, | |
| "epoch": 2.788785046728972, | |
| "grad_norm": 0.040148280560970306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5098775625228882, | |
| "mean_token_accuracy": 0.7936903238296509, | |
| "num_tokens": 12171278.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.5605306029319763, | |
| "epoch": 2.792523364485981, | |
| "grad_norm": 0.03989556431770325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507832169532776, | |
| "mean_token_accuracy": 0.7760983258485794, | |
| "num_tokens": 12187732.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.561933159828186, | |
| "epoch": 2.7962616822429904, | |
| "grad_norm": 0.04341628775000572, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628443956375122, | |
| "mean_token_accuracy": 0.7725982367992401, | |
| "num_tokens": 12204073.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5275013446807861, | |
| "epoch": 2.8, | |
| "grad_norm": 0.04758904501795769, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401396751403809, | |
| "mean_token_accuracy": 0.7802035212516785, | |
| "num_tokens": 12220319.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5415465384721756, | |
| "epoch": 2.803738317757009, | |
| "grad_norm": 0.04323052614927292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467565059661865, | |
| "mean_token_accuracy": 0.7801296561956406, | |
| "num_tokens": 12236798.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5384011566638947, | |
| "epoch": 2.8074766355140186, | |
| "grad_norm": 0.04094940423965454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408844947814941, | |
| "mean_token_accuracy": 0.7790292948484421, | |
| "num_tokens": 12253226.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5556510388851166, | |
| "epoch": 2.811214953271028, | |
| "grad_norm": 0.037975817918777466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480787754058838, | |
| "mean_token_accuracy": 0.7771931290626526, | |
| "num_tokens": 12269489.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.5475790053606033, | |
| "epoch": 2.8149532710280374, | |
| "grad_norm": 0.041421882808208466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383135676383972, | |
| "mean_token_accuracy": 0.7827092558145523, | |
| "num_tokens": 12285892.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5555797815322876, | |
| "epoch": 2.8186915887850468, | |
| "grad_norm": 0.03941413015127182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552151083946228, | |
| "mean_token_accuracy": 0.7751595675945282, | |
| "num_tokens": 12302269.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5256431847810745, | |
| "epoch": 2.822429906542056, | |
| "grad_norm": 0.040782686322927475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262829661369324, | |
| "mean_token_accuracy": 0.7846409976482391, | |
| "num_tokens": 12318521.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.538894459605217, | |
| "epoch": 2.8261682242990656, | |
| "grad_norm": 0.052266813814640045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539013147354126, | |
| "mean_token_accuracy": 0.7756392508745193, | |
| "num_tokens": 12334819.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5483682453632355, | |
| "epoch": 2.829906542056075, | |
| "grad_norm": 0.04095127433538437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520408749580383, | |
| "mean_token_accuracy": 0.7747367471456528, | |
| "num_tokens": 12351218.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5276503935456276, | |
| "epoch": 2.8336448598130843, | |
| "grad_norm": 0.04603305831551552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317422151565552, | |
| "mean_token_accuracy": 0.780977338552475, | |
| "num_tokens": 12367390.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5502448529005051, | |
| "epoch": 2.8373831775700937, | |
| "grad_norm": 0.04640703275799751, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535072684288025, | |
| "mean_token_accuracy": 0.7761691957712173, | |
| "num_tokens": 12383960.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.547056645154953, | |
| "epoch": 2.8411214953271027, | |
| "grad_norm": 0.033438824117183685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412831902503967, | |
| "mean_token_accuracy": 0.7795712947845459, | |
| "num_tokens": 12400550.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5364657193422318, | |
| "epoch": 2.844859813084112, | |
| "grad_norm": 0.04271340370178223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346530079841614, | |
| "mean_token_accuracy": 0.7835509330034256, | |
| "num_tokens": 12417061.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5455985963344574, | |
| "epoch": 2.8485981308411215, | |
| "grad_norm": 0.03856063261628151, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402116179466248, | |
| "mean_token_accuracy": 0.7816472351551056, | |
| "num_tokens": 12433548.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.532633364200592, | |
| "epoch": 2.852336448598131, | |
| "grad_norm": 0.039442550390958786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322520732879639, | |
| "mean_token_accuracy": 0.783360943198204, | |
| "num_tokens": 12449702.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5533113479614258, | |
| "epoch": 2.8560747663551402, | |
| "grad_norm": 0.03981044888496399, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526716113090515, | |
| "mean_token_accuracy": 0.7752720266580582, | |
| "num_tokens": 12465797.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5458943992853165, | |
| "epoch": 2.8598130841121496, | |
| "grad_norm": 0.043415430933237076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514388084411621, | |
| "mean_token_accuracy": 0.7782578617334366, | |
| "num_tokens": 12482100.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5316417217254639, | |
| "epoch": 2.863551401869159, | |
| "grad_norm": 0.03658653050661087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376189947128296, | |
| "mean_token_accuracy": 0.7812371999025345, | |
| "num_tokens": 12498442.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5365964025259018, | |
| "epoch": 2.867289719626168, | |
| "grad_norm": 0.04015335068106651, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381023287773132, | |
| "mean_token_accuracy": 0.7802128046751022, | |
| "num_tokens": 12514722.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5392501503229141, | |
| "epoch": 2.8710280373831774, | |
| "grad_norm": 0.04526032134890556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440354347229004, | |
| "mean_token_accuracy": 0.7788137197494507, | |
| "num_tokens": 12531173.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5416650772094727, | |
| "epoch": 2.8747663551401867, | |
| "grad_norm": 0.03573603555560112, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344440340995789, | |
| "mean_token_accuracy": 0.782467320561409, | |
| "num_tokens": 12547297.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.537946805357933, | |
| "epoch": 2.878504672897196, | |
| "grad_norm": 0.043754760175943375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369762778282166, | |
| "mean_token_accuracy": 0.7813331335783005, | |
| "num_tokens": 12563639.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5417525321245193, | |
| "epoch": 2.8822429906542055, | |
| "grad_norm": 0.03892975300550461, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408830642700195, | |
| "mean_token_accuracy": 0.7807131111621857, | |
| "num_tokens": 12579951.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5286070853471756, | |
| "epoch": 2.885981308411215, | |
| "grad_norm": 0.041709210723638535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315775275230408, | |
| "mean_token_accuracy": 0.7836516797542572, | |
| "num_tokens": 12596427.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5347200036048889, | |
| "epoch": 2.8897196261682243, | |
| "grad_norm": 0.04162106290459633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488803386688232, | |
| "mean_token_accuracy": 0.7781624644994736, | |
| "num_tokens": 12612693.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5630818009376526, | |
| "epoch": 2.8934579439252337, | |
| "grad_norm": 0.03779264912009239, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618957281112671, | |
| "mean_token_accuracy": 0.7714088261127472, | |
| "num_tokens": 12629093.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5579015165567398, | |
| "epoch": 2.897196261682243, | |
| "grad_norm": 0.04071388393640518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509809255599976, | |
| "mean_token_accuracy": 0.7759078145027161, | |
| "num_tokens": 12645440.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5593527257442474, | |
| "epoch": 2.9009345794392525, | |
| "grad_norm": 0.041921358555555344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505045056343079, | |
| "mean_token_accuracy": 0.7758798003196716, | |
| "num_tokens": 12661819.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5402603298425674, | |
| "epoch": 2.904672897196262, | |
| "grad_norm": 0.03740124776959419, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350624322891235, | |
| "mean_token_accuracy": 0.7829450070858002, | |
| "num_tokens": 12678029.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5501836538314819, | |
| "epoch": 2.9084112149532713, | |
| "grad_norm": 0.03699700906872749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496166944503784, | |
| "mean_token_accuracy": 0.7787871360778809, | |
| "num_tokens": 12694566.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.5449737459421158, | |
| "epoch": 2.91214953271028, | |
| "grad_norm": 0.03947729989886284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487996935844421, | |
| "mean_token_accuracy": 0.7771195471286774, | |
| "num_tokens": 12711096.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.509773313999176, | |
| "epoch": 2.9158878504672896, | |
| "grad_norm": 0.04015858471393585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5180044174194336, | |
| "mean_token_accuracy": 0.7871870398521423, | |
| "num_tokens": 12727181.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5145790874958038, | |
| "epoch": 2.919626168224299, | |
| "grad_norm": 0.04480452463030815, | |
| "learning_rate": 0.0002, | |
| "loss": 0.517657995223999, | |
| "mean_token_accuracy": 0.7905906438827515, | |
| "num_tokens": 12743263.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.536189079284668, | |
| "epoch": 2.9233644859813084, | |
| "grad_norm": 0.0368233323097229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374237895011902, | |
| "mean_token_accuracy": 0.7814907878637314, | |
| "num_tokens": 12759582.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5301052629947662, | |
| "epoch": 2.9271028037383178, | |
| "grad_norm": 0.036369625478982925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5254780054092407, | |
| "mean_token_accuracy": 0.7876885831356049, | |
| "num_tokens": 12775680.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5395437628030777, | |
| "epoch": 2.930841121495327, | |
| "grad_norm": 0.037106823176145554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353831648826599, | |
| "mean_token_accuracy": 0.7856823652982712, | |
| "num_tokens": 12791849.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5460378974676132, | |
| "epoch": 2.9345794392523366, | |
| "grad_norm": 0.0374838188290596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441444516181946, | |
| "mean_token_accuracy": 0.7800013571977615, | |
| "num_tokens": 12808470.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5510992407798767, | |
| "epoch": 2.938317757009346, | |
| "grad_norm": 0.03663073852658272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466246604919434, | |
| "mean_token_accuracy": 0.7789618521928787, | |
| "num_tokens": 12824709.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5445446521043777, | |
| "epoch": 2.942056074766355, | |
| "grad_norm": 0.03850307688117027, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457326769828796, | |
| "mean_token_accuracy": 0.779052123427391, | |
| "num_tokens": 12841079.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5365033894777298, | |
| "epoch": 2.9457943925233643, | |
| "grad_norm": 0.04035929962992668, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459482073783875, | |
| "mean_token_accuracy": 0.7797062546014786, | |
| "num_tokens": 12857523.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.535067155957222, | |
| "epoch": 2.9495327102803737, | |
| "grad_norm": 0.04887193441390991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398947596549988, | |
| "mean_token_accuracy": 0.7823842316865921, | |
| "num_tokens": 12874241.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5346145331859589, | |
| "epoch": 2.953271028037383, | |
| "grad_norm": 0.03713555634021759, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383285880088806, | |
| "mean_token_accuracy": 0.7822743952274323, | |
| "num_tokens": 12890347.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5538973659276962, | |
| "epoch": 2.9570093457943925, | |
| "grad_norm": 0.042103007435798645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548110604286194, | |
| "mean_token_accuracy": 0.7737681418657303, | |
| "num_tokens": 12906728.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5500922650098801, | |
| "epoch": 2.960747663551402, | |
| "grad_norm": 0.03705638647079468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455094575881958, | |
| "mean_token_accuracy": 0.7803948670625687, | |
| "num_tokens": 12923166.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.562080979347229, | |
| "epoch": 2.9644859813084112, | |
| "grad_norm": 0.045153554528951645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568199157714844, | |
| "mean_token_accuracy": 0.7736331224441528, | |
| "num_tokens": 12939504.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.5559557229280472, | |
| "epoch": 2.9682242990654206, | |
| "grad_norm": 0.04255378246307373, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531718134880066, | |
| "mean_token_accuracy": 0.7762871235609055, | |
| "num_tokens": 12955898.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5435759872198105, | |
| "epoch": 2.97196261682243, | |
| "grad_norm": 0.03799128159880638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441620349884033, | |
| "mean_token_accuracy": 0.7793318778276443, | |
| "num_tokens": 12972346.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.5359157919883728, | |
| "epoch": 2.9757009345794394, | |
| "grad_norm": 0.05715997889637947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515891909599304, | |
| "mean_token_accuracy": 0.7771831452846527, | |
| "num_tokens": 12988848.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5230652317404747, | |
| "epoch": 2.979439252336449, | |
| "grad_norm": 0.04036436975002289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234889388084412, | |
| "mean_token_accuracy": 0.7856348752975464, | |
| "num_tokens": 13004832.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5457260459661484, | |
| "epoch": 2.983177570093458, | |
| "grad_norm": 0.04120893031358719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378625392913818, | |
| "mean_token_accuracy": 0.7840824872255325, | |
| "num_tokens": 13021226.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5480275601148605, | |
| "epoch": 2.986915887850467, | |
| "grad_norm": 0.050067413598299026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414943099021912, | |
| "mean_token_accuracy": 0.7796735763549805, | |
| "num_tokens": 13037664.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5385295897722244, | |
| "epoch": 2.9906542056074765, | |
| "grad_norm": 0.03477542847394943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353237390518188, | |
| "mean_token_accuracy": 0.7814339101314545, | |
| "num_tokens": 13053836.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.5408166199922562, | |
| "epoch": 2.994392523364486, | |
| "grad_norm": 0.038822371512651443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407392382621765, | |
| "mean_token_accuracy": 0.7796344310045242, | |
| "num_tokens": 13070132.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.533338338136673, | |
| "epoch": 2.9981308411214953, | |
| "grad_norm": 0.04834038019180298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456323027610779, | |
| "mean_token_accuracy": 0.7770627439022064, | |
| "num_tokens": 13086317.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.520211398601532, | |
| "epoch": 3.0, | |
| "grad_norm": 0.04815197363495827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207195281982422, | |
| "mean_token_accuracy": 0.7871742844581604, | |
| "num_tokens": 13094581.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2192829660484076e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |