Instructions to use eac123/clean-subliminal-learning-octopuses with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/clean-subliminal-learning-octopuses with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/clean-subliminal-learning-octopuses")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.5430711610486894, | |
| "eval_steps": 500, | |
| "global_step": 946, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.124472439289093, | |
| "epoch": 0.003745318352059925, | |
| "grad_norm": 0.4064895212650299, | |
| "learning_rate": 0.0002, | |
| "loss": 2.4620742797851562, | |
| "mean_token_accuracy": 0.5437362492084503, | |
| "num_tokens": 16219.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2432018220424652, | |
| "epoch": 0.00749063670411985, | |
| "grad_norm": 0.37879112362861633, | |
| "learning_rate": 0.0002, | |
| "loss": 2.1651668548583984, | |
| "mean_token_accuracy": 0.5638100206851959, | |
| "num_tokens": 32451.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.4062562882900238, | |
| "epoch": 0.011235955056179775, | |
| "grad_norm": 0.28845661878585815, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7072796821594238, | |
| "mean_token_accuracy": 0.5924695134162903, | |
| "num_tokens": 48696.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.3798817992210388, | |
| "epoch": 0.0149812734082397, | |
| "grad_norm": 0.2335132509469986, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4192372560501099, | |
| "mean_token_accuracy": 0.6366562396287918, | |
| "num_tokens": 65149.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.3547163307666779, | |
| "epoch": 0.018726591760299626, | |
| "grad_norm": 0.27531901001930237, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2890108823776245, | |
| "mean_token_accuracy": 0.639111116528511, | |
| "num_tokens": 81615.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.2633765935897827, | |
| "epoch": 0.02247191011235955, | |
| "grad_norm": 0.15004344284534454, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1727904081344604, | |
| "mean_token_accuracy": 0.6589455008506775, | |
| "num_tokens": 98238.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.1859196424484253, | |
| "epoch": 0.026217228464419477, | |
| "grad_norm": 0.10320489853620529, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0918691158294678, | |
| "mean_token_accuracy": 0.6676707565784454, | |
| "num_tokens": 114444.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.1146739721298218, | |
| "epoch": 0.0299625468164794, | |
| "grad_norm": 0.1199173703789711, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0362448692321777, | |
| "mean_token_accuracy": 0.6752683073282242, | |
| "num_tokens": 130761.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.0335184335708618, | |
| "epoch": 0.033707865168539325, | |
| "grad_norm": 0.12563136219978333, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9962326288223267, | |
| "mean_token_accuracy": 0.6880597323179245, | |
| "num_tokens": 147021.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9865177571773529, | |
| "epoch": 0.03745318352059925, | |
| "grad_norm": 0.1199953481554985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9303470849990845, | |
| "mean_token_accuracy": 0.6944610327482224, | |
| "num_tokens": 163123.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.9654616415500641, | |
| "epoch": 0.04119850187265917, | |
| "grad_norm": 0.11374429613351822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8831573724746704, | |
| "mean_token_accuracy": 0.7051983922719955, | |
| "num_tokens": 179185.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.9084527641534805, | |
| "epoch": 0.0449438202247191, | |
| "grad_norm": 0.11104491353034973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8112745881080627, | |
| "mean_token_accuracy": 0.717003270983696, | |
| "num_tokens": 195302.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.8792405873537064, | |
| "epoch": 0.04868913857677903, | |
| "grad_norm": 0.29082274436950684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.798420250415802, | |
| "mean_token_accuracy": 0.7170884907245636, | |
| "num_tokens": 211890.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.8252373337745667, | |
| "epoch": 0.052434456928838954, | |
| "grad_norm": 0.10816927999258041, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7828125357627869, | |
| "mean_token_accuracy": 0.7214709371328354, | |
| "num_tokens": 228238.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7244665324687958, | |
| "epoch": 0.056179775280898875, | |
| "grad_norm": 0.11618702858686447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7206279635429382, | |
| "mean_token_accuracy": 0.7338205277919769, | |
| "num_tokens": 244371.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.6871565729379654, | |
| "epoch": 0.0599250936329588, | |
| "grad_norm": 0.1065768375992775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7100083827972412, | |
| "mean_token_accuracy": 0.7358262836933136, | |
| "num_tokens": 260726.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.6935696750879288, | |
| "epoch": 0.06367041198501873, | |
| "grad_norm": 0.08450760692358017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6838802695274353, | |
| "mean_token_accuracy": 0.7416488826274872, | |
| "num_tokens": 277122.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6860368996858597, | |
| "epoch": 0.06741573033707865, | |
| "grad_norm": 0.08516346663236618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6765270829200745, | |
| "mean_token_accuracy": 0.7396037727594376, | |
| "num_tokens": 293596.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6689740270376205, | |
| "epoch": 0.07116104868913857, | |
| "grad_norm": 0.08950749784708023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6559870839118958, | |
| "mean_token_accuracy": 0.7492983937263489, | |
| "num_tokens": 309758.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.6853971034288406, | |
| "epoch": 0.0749063670411985, | |
| "grad_norm": 0.08301156759262085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6591368913650513, | |
| "mean_token_accuracy": 0.7445396035909653, | |
| "num_tokens": 326199.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6475548148155212, | |
| "epoch": 0.07865168539325842, | |
| "grad_norm": 0.07257863134145737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6322771906852722, | |
| "mean_token_accuracy": 0.7570293545722961, | |
| "num_tokens": 342706.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.62291419506073, | |
| "epoch": 0.08239700374531835, | |
| "grad_norm": 0.07468358427286148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6161096096038818, | |
| "mean_token_accuracy": 0.7579571604728699, | |
| "num_tokens": 358967.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.6039848625659943, | |
| "epoch": 0.08614232209737828, | |
| "grad_norm": 0.06657886505126953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5981277823448181, | |
| "mean_token_accuracy": 0.7673389315605164, | |
| "num_tokens": 375372.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.6231608390808105, | |
| "epoch": 0.0898876404494382, | |
| "grad_norm": 0.06528797745704651, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6185131072998047, | |
| "mean_token_accuracy": 0.7547510862350464, | |
| "num_tokens": 391535.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.6286156177520752, | |
| "epoch": 0.09363295880149813, | |
| "grad_norm": 0.06431519240140915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6217876672744751, | |
| "mean_token_accuracy": 0.7541641592979431, | |
| "num_tokens": 407808.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.6126427948474884, | |
| "epoch": 0.09737827715355805, | |
| "grad_norm": 0.06216903775930405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6070841550827026, | |
| "mean_token_accuracy": 0.759774461388588, | |
| "num_tokens": 424098.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.6149384081363678, | |
| "epoch": 0.10112359550561797, | |
| "grad_norm": 0.06437912583351135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6078751087188721, | |
| "mean_token_accuracy": 0.7595006227493286, | |
| "num_tokens": 440539.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.6091344654560089, | |
| "epoch": 0.10486891385767791, | |
| "grad_norm": 0.06495340913534164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6011782884597778, | |
| "mean_token_accuracy": 0.7595006972551346, | |
| "num_tokens": 456799.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.608646497130394, | |
| "epoch": 0.10861423220973783, | |
| "grad_norm": 0.059445418417453766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6044275164604187, | |
| "mean_token_accuracy": 0.7600021511316299, | |
| "num_tokens": 473089.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.6043040752410889, | |
| "epoch": 0.11235955056179775, | |
| "grad_norm": 0.06593701243400574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6045087575912476, | |
| "mean_token_accuracy": 0.7567310333251953, | |
| "num_tokens": 489490.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5747391283512115, | |
| "epoch": 0.11610486891385768, | |
| "grad_norm": 0.06415696442127228, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5873428583145142, | |
| "mean_token_accuracy": 0.7674129754304886, | |
| "num_tokens": 505809.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.5926542580127716, | |
| "epoch": 0.1198501872659176, | |
| "grad_norm": 0.051249004900455475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.598324179649353, | |
| "mean_token_accuracy": 0.759703740477562, | |
| "num_tokens": 522016.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5886886864900589, | |
| "epoch": 0.12359550561797752, | |
| "grad_norm": 0.05292005091905594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5881145596504211, | |
| "mean_token_accuracy": 0.7697232961654663, | |
| "num_tokens": 538100.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5867745727300644, | |
| "epoch": 0.12734082397003746, | |
| "grad_norm": 0.04721912741661072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5836299061775208, | |
| "mean_token_accuracy": 0.768671840429306, | |
| "num_tokens": 554234.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.5881127417087555, | |
| "epoch": 0.13108614232209737, | |
| "grad_norm": 0.05805843323469162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5897107124328613, | |
| "mean_token_accuracy": 0.7657543420791626, | |
| "num_tokens": 570565.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.5939383208751678, | |
| "epoch": 0.1348314606741573, | |
| "grad_norm": 0.0569508820772171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5897835493087769, | |
| "mean_token_accuracy": 0.7598359882831573, | |
| "num_tokens": 586816.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5979506522417068, | |
| "epoch": 0.13857677902621723, | |
| "grad_norm": 0.05739126354455948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5949404835700989, | |
| "mean_token_accuracy": 0.7612607926130295, | |
| "num_tokens": 603019.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5742268264293671, | |
| "epoch": 0.14232209737827714, | |
| "grad_norm": 0.047265954315662384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5759380459785461, | |
| "mean_token_accuracy": 0.7693933397531509, | |
| "num_tokens": 619295.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5710775703191757, | |
| "epoch": 0.14606741573033707, | |
| "grad_norm": 0.05281650274991989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5691424608230591, | |
| "mean_token_accuracy": 0.7704602777957916, | |
| "num_tokens": 635365.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.582334503531456, | |
| "epoch": 0.149812734082397, | |
| "grad_norm": 0.055993299931287766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5809962749481201, | |
| "mean_token_accuracy": 0.7662668973207474, | |
| "num_tokens": 651665.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5551325976848602, | |
| "epoch": 0.15355805243445692, | |
| "grad_norm": 0.04340814799070358, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557377815246582, | |
| "mean_token_accuracy": 0.7778407037258148, | |
| "num_tokens": 667809.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5822649896144867, | |
| "epoch": 0.15730337078651685, | |
| "grad_norm": 0.04575135186314583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5827720165252686, | |
| "mean_token_accuracy": 0.7657051831483841, | |
| "num_tokens": 683923.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.55968376994133, | |
| "epoch": 0.16104868913857678, | |
| "grad_norm": 0.04552368074655533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598254799842834, | |
| "mean_token_accuracy": 0.7764519304037094, | |
| "num_tokens": 700197.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5671757161617279, | |
| "epoch": 0.1647940074906367, | |
| "grad_norm": 0.04587964341044426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5750178694725037, | |
| "mean_token_accuracy": 0.7700542360544205, | |
| "num_tokens": 716432.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5685836523771286, | |
| "epoch": 0.16853932584269662, | |
| "grad_norm": 0.03833606839179993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5728627443313599, | |
| "mean_token_accuracy": 0.7676915228366852, | |
| "num_tokens": 732768.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5726271122694016, | |
| "epoch": 0.17228464419475656, | |
| "grad_norm": 0.04773888736963272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5737521052360535, | |
| "mean_token_accuracy": 0.7691973745822906, | |
| "num_tokens": 748991.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5940001755952835, | |
| "epoch": 0.1760299625468165, | |
| "grad_norm": 0.035074397921562195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.58332759141922, | |
| "mean_token_accuracy": 0.7648619115352631, | |
| "num_tokens": 765572.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5897164344787598, | |
| "epoch": 0.1797752808988764, | |
| "grad_norm": 0.037994541227817535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5864952802658081, | |
| "mean_token_accuracy": 0.7641548812389374, | |
| "num_tokens": 782005.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5744329988956451, | |
| "epoch": 0.18352059925093633, | |
| "grad_norm": 0.040346939116716385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5669541954994202, | |
| "mean_token_accuracy": 0.770287498831749, | |
| "num_tokens": 798604.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5779913067817688, | |
| "epoch": 0.18726591760299627, | |
| "grad_norm": 0.036969687789678574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5797433257102966, | |
| "mean_token_accuracy": 0.7645184099674225, | |
| "num_tokens": 814871.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5663889348506927, | |
| "epoch": 0.19101123595505617, | |
| "grad_norm": 0.03604266792535782, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5714061260223389, | |
| "mean_token_accuracy": 0.7704311609268188, | |
| "num_tokens": 831246.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.561771884560585, | |
| "epoch": 0.1947565543071161, | |
| "grad_norm": 0.04034798592329025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5732511878013611, | |
| "mean_token_accuracy": 0.7705236822366714, | |
| "num_tokens": 847825.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5677134096622467, | |
| "epoch": 0.19850187265917604, | |
| "grad_norm": 0.03827312961220741, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5743907690048218, | |
| "mean_token_accuracy": 0.7655002921819687, | |
| "num_tokens": 864255.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.563701331615448, | |
| "epoch": 0.20224719101123595, | |
| "grad_norm": 0.04143316298723221, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607832074165344, | |
| "mean_token_accuracy": 0.772660031914711, | |
| "num_tokens": 880665.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5692192316055298, | |
| "epoch": 0.20599250936329588, | |
| "grad_norm": 0.03400753438472748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5670974254608154, | |
| "mean_token_accuracy": 0.769247904419899, | |
| "num_tokens": 896987.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5776625126600266, | |
| "epoch": 0.20973782771535582, | |
| "grad_norm": 0.035431839525699615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5733675360679626, | |
| "mean_token_accuracy": 0.7692834436893463, | |
| "num_tokens": 913582.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5626319646835327, | |
| "epoch": 0.21348314606741572, | |
| "grad_norm": 0.03843431547284126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641550421714783, | |
| "mean_token_accuracy": 0.7710368186235428, | |
| "num_tokens": 929972.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.5526942014694214, | |
| "epoch": 0.21722846441947566, | |
| "grad_norm": 0.03771563246846199, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567817687988281, | |
| "mean_token_accuracy": 0.7731232047080994, | |
| "num_tokens": 945888.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5716714560985565, | |
| "epoch": 0.2209737827715356, | |
| "grad_norm": 0.036766648292541504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5660452246665955, | |
| "mean_token_accuracy": 0.7728052884340286, | |
| "num_tokens": 962278.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.568805992603302, | |
| "epoch": 0.2247191011235955, | |
| "grad_norm": 0.035415392369031906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5717817544937134, | |
| "mean_token_accuracy": 0.7711138129234314, | |
| "num_tokens": 978682.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5708261281251907, | |
| "epoch": 0.22846441947565543, | |
| "grad_norm": 0.03432939946651459, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5735772252082825, | |
| "mean_token_accuracy": 0.7677555531263351, | |
| "num_tokens": 994945.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5660677701234818, | |
| "epoch": 0.23220973782771537, | |
| "grad_norm": 0.041112665086984634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5750763416290283, | |
| "mean_token_accuracy": 0.7678538411855698, | |
| "num_tokens": 1011319.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5581584423780441, | |
| "epoch": 0.23595505617977527, | |
| "grad_norm": 0.03535327687859535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653359889984131, | |
| "mean_token_accuracy": 0.7709096819162369, | |
| "num_tokens": 1027780.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5639653205871582, | |
| "epoch": 0.2397003745318352, | |
| "grad_norm": 0.03404325619339943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576256513595581, | |
| "mean_token_accuracy": 0.7768308818340302, | |
| "num_tokens": 1044141.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5733215659856796, | |
| "epoch": 0.24344569288389514, | |
| "grad_norm": 0.041786711663007736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5677163600921631, | |
| "mean_token_accuracy": 0.768655464053154, | |
| "num_tokens": 1060152.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5721775144338608, | |
| "epoch": 0.24719101123595505, | |
| "grad_norm": 0.037091247737407684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5689237713813782, | |
| "mean_token_accuracy": 0.769687607884407, | |
| "num_tokens": 1076350.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5711842328310013, | |
| "epoch": 0.250936329588015, | |
| "grad_norm": 0.03522708639502525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.567720890045166, | |
| "mean_token_accuracy": 0.7711529284715652, | |
| "num_tokens": 1092839.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5565171837806702, | |
| "epoch": 0.2546816479400749, | |
| "grad_norm": 0.038917530328035355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597351789474487, | |
| "mean_token_accuracy": 0.7759623378515244, | |
| "num_tokens": 1109005.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5430796295404434, | |
| "epoch": 0.25842696629213485, | |
| "grad_norm": 0.034353867173194885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536048412322998, | |
| "mean_token_accuracy": 0.7768301516771317, | |
| "num_tokens": 1125051.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5550204813480377, | |
| "epoch": 0.26217228464419473, | |
| "grad_norm": 0.03845667093992233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609036087989807, | |
| "mean_token_accuracy": 0.7741425037384033, | |
| "num_tokens": 1141333.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5524102747440338, | |
| "epoch": 0.26591760299625467, | |
| "grad_norm": 0.0383320152759552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493491291999817, | |
| "mean_token_accuracy": 0.7784009873867035, | |
| "num_tokens": 1157440.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5607451796531677, | |
| "epoch": 0.2696629213483146, | |
| "grad_norm": 0.0344189889729023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574801564216614, | |
| "mean_token_accuracy": 0.7733150720596313, | |
| "num_tokens": 1173721.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5708478391170502, | |
| "epoch": 0.27340823970037453, | |
| "grad_norm": 0.03608883544802666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5691329836845398, | |
| "mean_token_accuracy": 0.7706348299980164, | |
| "num_tokens": 1189995.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5674006342887878, | |
| "epoch": 0.27715355805243447, | |
| "grad_norm": 0.03380035236477852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5687033534049988, | |
| "mean_token_accuracy": 0.7686747610569, | |
| "num_tokens": 1206546.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5619117617607117, | |
| "epoch": 0.2808988764044944, | |
| "grad_norm": 0.033374786376953125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5617104768753052, | |
| "mean_token_accuracy": 0.774394765496254, | |
| "num_tokens": 1222857.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.553475558757782, | |
| "epoch": 0.2846441947565543, | |
| "grad_norm": 0.03828837722539902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524560809135437, | |
| "mean_token_accuracy": 0.7749378681182861, | |
| "num_tokens": 1239289.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5745554566383362, | |
| "epoch": 0.2883895131086142, | |
| "grad_norm": 0.03621216490864754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5808500051498413, | |
| "mean_token_accuracy": 0.7678203135728836, | |
| "num_tokens": 1255521.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5676577985286713, | |
| "epoch": 0.29213483146067415, | |
| "grad_norm": 0.03588660806417465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5705655813217163, | |
| "mean_token_accuracy": 0.7692013084888458, | |
| "num_tokens": 1271794.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.578361302614212, | |
| "epoch": 0.2958801498127341, | |
| "grad_norm": 0.03781484439969063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5760793089866638, | |
| "mean_token_accuracy": 0.7664260119199753, | |
| "num_tokens": 1288356.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5593062490224838, | |
| "epoch": 0.299625468164794, | |
| "grad_norm": 0.03217354416847229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5657471418380737, | |
| "mean_token_accuracy": 0.7739468365907669, | |
| "num_tokens": 1304492.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5666437745094299, | |
| "epoch": 0.30337078651685395, | |
| "grad_norm": 0.03268091008067131, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5716702938079834, | |
| "mean_token_accuracy": 0.7679993361234665, | |
| "num_tokens": 1320914.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5685661137104034, | |
| "epoch": 0.30711610486891383, | |
| "grad_norm": 0.03592272475361824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5758165717124939, | |
| "mean_token_accuracy": 0.7661760449409485, | |
| "num_tokens": 1337161.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5707727521657944, | |
| "epoch": 0.31086142322097376, | |
| "grad_norm": 0.032845061272382736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5710837841033936, | |
| "mean_token_accuracy": 0.7702731043100357, | |
| "num_tokens": 1353376.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5628758817911148, | |
| "epoch": 0.3146067415730337, | |
| "grad_norm": 0.029750632122159004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637022852897644, | |
| "mean_token_accuracy": 0.7708846777677536, | |
| "num_tokens": 1369870.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5795712918043137, | |
| "epoch": 0.31835205992509363, | |
| "grad_norm": 0.03464500606060028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5780152082443237, | |
| "mean_token_accuracy": 0.7670614421367645, | |
| "num_tokens": 1386403.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5554608702659607, | |
| "epoch": 0.32209737827715357, | |
| "grad_norm": 0.03547544404864311, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557012557983398, | |
| "mean_token_accuracy": 0.7721797376871109, | |
| "num_tokens": 1402494.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5579323172569275, | |
| "epoch": 0.3258426966292135, | |
| "grad_norm": 0.03288840129971504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560955286026001, | |
| "mean_token_accuracy": 0.7751947343349457, | |
| "num_tokens": 1418821.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5543566048145294, | |
| "epoch": 0.3295880149812734, | |
| "grad_norm": 0.04169093072414398, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500882267951965, | |
| "mean_token_accuracy": 0.7791634202003479, | |
| "num_tokens": 1434993.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5734467208385468, | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 0.04577335715293884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629557371139526, | |
| "mean_token_accuracy": 0.7727752029895782, | |
| "num_tokens": 1451307.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5726543813943863, | |
| "epoch": 0.33707865168539325, | |
| "grad_norm": 0.0342593714594841, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5802106261253357, | |
| "mean_token_accuracy": 0.7650935351848602, | |
| "num_tokens": 1467745.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.551667258143425, | |
| "epoch": 0.3408239700374532, | |
| "grad_norm": 0.03779289126396179, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562962532043457, | |
| "mean_token_accuracy": 0.7722999006509781, | |
| "num_tokens": 1483931.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5500118583440781, | |
| "epoch": 0.3445692883895131, | |
| "grad_norm": 0.04092314839363098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627440810203552, | |
| "mean_token_accuracy": 0.7718297243118286, | |
| "num_tokens": 1500272.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5528086423873901, | |
| "epoch": 0.34831460674157305, | |
| "grad_norm": 0.03680623322725296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555366277694702, | |
| "mean_token_accuracy": 0.7774850875139236, | |
| "num_tokens": 1516853.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5520536154508591, | |
| "epoch": 0.352059925093633, | |
| "grad_norm": 0.037777166813611984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425198078155518, | |
| "mean_token_accuracy": 0.7793015986680984, | |
| "num_tokens": 1533333.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5685165077447891, | |
| "epoch": 0.35580524344569286, | |
| "grad_norm": 0.04140891879796982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641899108886719, | |
| "mean_token_accuracy": 0.7713409811258316, | |
| "num_tokens": 1549757.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5465481728315353, | |
| "epoch": 0.3595505617977528, | |
| "grad_norm": 0.035262562334537506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490474104881287, | |
| "mean_token_accuracy": 0.7827550321817398, | |
| "num_tokens": 1565996.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.5831216871738434, | |
| "epoch": 0.36329588014981273, | |
| "grad_norm": 0.036104101687669754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.589984118938446, | |
| "mean_token_accuracy": 0.7600380033254623, | |
| "num_tokens": 1582215.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5677650719881058, | |
| "epoch": 0.36704119850187267, | |
| "grad_norm": 0.03766894340515137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645126104354858, | |
| "mean_token_accuracy": 0.7706596851348877, | |
| "num_tokens": 1598452.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5670180022716522, | |
| "epoch": 0.3707865168539326, | |
| "grad_norm": 0.031464677304029465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5694231986999512, | |
| "mean_token_accuracy": 0.7699034363031387, | |
| "num_tokens": 1614973.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.556086465716362, | |
| "epoch": 0.37453183520599254, | |
| "grad_norm": 0.03442725911736488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548810958862305, | |
| "mean_token_accuracy": 0.7733764350414276, | |
| "num_tokens": 1631172.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5800606608390808, | |
| "epoch": 0.3782771535580524, | |
| "grad_norm": 0.03572804853320122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5861737728118896, | |
| "mean_token_accuracy": 0.7624654024839401, | |
| "num_tokens": 1647621.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5482688248157501, | |
| "epoch": 0.38202247191011235, | |
| "grad_norm": 0.03775500878691673, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594941973686218, | |
| "mean_token_accuracy": 0.7744353115558624, | |
| "num_tokens": 1663895.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.563491478562355, | |
| "epoch": 0.3857677902621723, | |
| "grad_norm": 0.031457267701625824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564830482006073, | |
| "mean_token_accuracy": 0.7690578252077103, | |
| "num_tokens": 1680534.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.564789205789566, | |
| "epoch": 0.3895131086142322, | |
| "grad_norm": 0.035452548414468765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560291588306427, | |
| "mean_token_accuracy": 0.7735853344202042, | |
| "num_tokens": 1696770.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5566727668046951, | |
| "epoch": 0.39325842696629215, | |
| "grad_norm": 0.03198615834116936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535395741462708, | |
| "mean_token_accuracy": 0.7722934931516647, | |
| "num_tokens": 1713024.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5578596889972687, | |
| "epoch": 0.3970037453183521, | |
| "grad_norm": 0.03393879160284996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627562999725342, | |
| "mean_token_accuracy": 0.7742809951305389, | |
| "num_tokens": 1729333.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5788154900074005, | |
| "epoch": 0.40074906367041196, | |
| "grad_norm": 0.033935144543647766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.580773115158081, | |
| "mean_token_accuracy": 0.7651670575141907, | |
| "num_tokens": 1745611.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5737199634313583, | |
| "epoch": 0.4044943820224719, | |
| "grad_norm": 0.03252919018268585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5751349925994873, | |
| "mean_token_accuracy": 0.7671079486608505, | |
| "num_tokens": 1762357.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5651296824216843, | |
| "epoch": 0.40823970037453183, | |
| "grad_norm": 0.028949161991477013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604527592658997, | |
| "mean_token_accuracy": 0.7729825675487518, | |
| "num_tokens": 1778752.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.5504195243120193, | |
| "epoch": 0.41198501872659177, | |
| "grad_norm": 0.028210768476128578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549246072769165, | |
| "mean_token_accuracy": 0.7782431095838547, | |
| "num_tokens": 1794998.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5765475034713745, | |
| "epoch": 0.4157303370786517, | |
| "grad_norm": 0.02785623073577881, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5748263597488403, | |
| "mean_token_accuracy": 0.7663502544164658, | |
| "num_tokens": 1811522.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.5662956237792969, | |
| "epoch": 0.41947565543071164, | |
| "grad_norm": 0.027803661301732063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5678505897521973, | |
| "mean_token_accuracy": 0.769574448466301, | |
| "num_tokens": 1827911.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.554324135184288, | |
| "epoch": 0.4232209737827715, | |
| "grad_norm": 0.03252230957150459, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5648460984230042, | |
| "mean_token_accuracy": 0.7699959129095078, | |
| "num_tokens": 1844234.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5458608418703079, | |
| "epoch": 0.42696629213483145, | |
| "grad_norm": 0.027507655322551727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496413111686707, | |
| "mean_token_accuracy": 0.7775106579065323, | |
| "num_tokens": 1860498.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.5563929826021194, | |
| "epoch": 0.4307116104868914, | |
| "grad_norm": 0.03014312870800495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582830905914307, | |
| "mean_token_accuracy": 0.7708972990512848, | |
| "num_tokens": 1876571.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5650668740272522, | |
| "epoch": 0.4344569288389513, | |
| "grad_norm": 0.032711341977119446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640538930892944, | |
| "mean_token_accuracy": 0.7726383656263351, | |
| "num_tokens": 1893031.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5807255804538727, | |
| "epoch": 0.43820224719101125, | |
| "grad_norm": 0.04059470072388649, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5742425918579102, | |
| "mean_token_accuracy": 0.7666837275028229, | |
| "num_tokens": 1909366.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5798581689596176, | |
| "epoch": 0.4419475655430712, | |
| "grad_norm": 0.03380719944834709, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5788700580596924, | |
| "mean_token_accuracy": 0.7679527401924133, | |
| "num_tokens": 1925898.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5766737908124924, | |
| "epoch": 0.44569288389513106, | |
| "grad_norm": 0.030183367431163788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5766640901565552, | |
| "mean_token_accuracy": 0.7679651975631714, | |
| "num_tokens": 1942401.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5603433847427368, | |
| "epoch": 0.449438202247191, | |
| "grad_norm": 0.0362340547144413, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619690418243408, | |
| "mean_token_accuracy": 0.7730819880962372, | |
| "num_tokens": 1958720.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5559201538562775, | |
| "epoch": 0.45318352059925093, | |
| "grad_norm": 0.034683868288993835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595064163208008, | |
| "mean_token_accuracy": 0.7748750001192093, | |
| "num_tokens": 1975119.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5641336888074875, | |
| "epoch": 0.45692883895131087, | |
| "grad_norm": 0.034222401678562164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5678452849388123, | |
| "mean_token_accuracy": 0.7732732445001602, | |
| "num_tokens": 1991506.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5829679220914841, | |
| "epoch": 0.4606741573033708, | |
| "grad_norm": 0.034026652574539185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5875802040100098, | |
| "mean_token_accuracy": 0.7611493021249771, | |
| "num_tokens": 2007947.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5581521540880203, | |
| "epoch": 0.46441947565543074, | |
| "grad_norm": 0.025140831246972084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602667927742004, | |
| "mean_token_accuracy": 0.7735796868801117, | |
| "num_tokens": 2024401.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5715497881174088, | |
| "epoch": 0.4681647940074906, | |
| "grad_norm": 0.029785403981804848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5672232508659363, | |
| "mean_token_accuracy": 0.7685857713222504, | |
| "num_tokens": 2040631.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5607001930475235, | |
| "epoch": 0.47191011235955055, | |
| "grad_norm": 0.04235680401325226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650739073753357, | |
| "mean_token_accuracy": 0.7696276903152466, | |
| "num_tokens": 2056536.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.5663832724094391, | |
| "epoch": 0.4756554307116105, | |
| "grad_norm": 0.03530610725283623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653817653656006, | |
| "mean_token_accuracy": 0.771982342004776, | |
| "num_tokens": 2072694.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.5544104427099228, | |
| "epoch": 0.4794007490636704, | |
| "grad_norm": 0.02733522094786167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605688095092773, | |
| "mean_token_accuracy": 0.7723411917686462, | |
| "num_tokens": 2089137.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5275053828954697, | |
| "epoch": 0.48314606741573035, | |
| "grad_norm": 0.04322921857237816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484553575515747, | |
| "mean_token_accuracy": 0.7770342081785202, | |
| "num_tokens": 2105149.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5561497956514359, | |
| "epoch": 0.4868913857677903, | |
| "grad_norm": 0.038827862590551376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55650395154953, | |
| "mean_token_accuracy": 0.7764105200767517, | |
| "num_tokens": 2121463.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.5783034265041351, | |
| "epoch": 0.49063670411985016, | |
| "grad_norm": 0.029603557661175728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5703758001327515, | |
| "mean_token_accuracy": 0.7689076513051987, | |
| "num_tokens": 2137873.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5802958011627197, | |
| "epoch": 0.4943820224719101, | |
| "grad_norm": 0.03336755558848381, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5750676989555359, | |
| "mean_token_accuracy": 0.7685631215572357, | |
| "num_tokens": 2154043.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5565105229616165, | |
| "epoch": 0.49812734082397003, | |
| "grad_norm": 0.03589406609535217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438498258590698, | |
| "mean_token_accuracy": 0.7815204560756683, | |
| "num_tokens": 2170057.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5716612040996552, | |
| "epoch": 0.50187265917603, | |
| "grad_norm": 0.03452189266681671, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5778107047080994, | |
| "mean_token_accuracy": 0.7688381224870682, | |
| "num_tokens": 2186386.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.561384916305542, | |
| "epoch": 0.5056179775280899, | |
| "grad_norm": 0.03864321857690811, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704262256622314, | |
| "mean_token_accuracy": 0.7647197097539902, | |
| "num_tokens": 2202441.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5625592470169067, | |
| "epoch": 0.5093632958801498, | |
| "grad_norm": 0.029244674369692802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618846416473389, | |
| "mean_token_accuracy": 0.7706502974033356, | |
| "num_tokens": 2218642.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.557224690914154, | |
| "epoch": 0.5131086142322098, | |
| "grad_norm": 0.03010115958750248, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529860854148865, | |
| "mean_token_accuracy": 0.7745790481567383, | |
| "num_tokens": 2234941.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5669968128204346, | |
| "epoch": 0.5168539325842697, | |
| "grad_norm": 0.030734272673726082, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563121497631073, | |
| "mean_token_accuracy": 0.7691874206066132, | |
| "num_tokens": 2251132.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5601507127285004, | |
| "epoch": 0.5205992509363296, | |
| "grad_norm": 0.03075527958571911, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602597594261169, | |
| "mean_token_accuracy": 0.7736657857894897, | |
| "num_tokens": 2267424.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5564019232988358, | |
| "epoch": 0.5243445692883895, | |
| "grad_norm": 0.03025938756763935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628267526626587, | |
| "mean_token_accuracy": 0.771067887544632, | |
| "num_tokens": 2283849.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5395451635122299, | |
| "epoch": 0.5280898876404494, | |
| "grad_norm": 0.03199173882603645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487725734710693, | |
| "mean_token_accuracy": 0.7775663435459137, | |
| "num_tokens": 2299872.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5526085048913956, | |
| "epoch": 0.5318352059925093, | |
| "grad_norm": 0.030539415776729584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591868162155151, | |
| "mean_token_accuracy": 0.7733905166387558, | |
| "num_tokens": 2316381.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5586904883384705, | |
| "epoch": 0.5355805243445693, | |
| "grad_norm": 0.03167688101530075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590608716011047, | |
| "mean_token_accuracy": 0.7722269594669342, | |
| "num_tokens": 2332636.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5568670481443405, | |
| "epoch": 0.5393258426966292, | |
| "grad_norm": 0.02876191958785057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519507527351379, | |
| "mean_token_accuracy": 0.776704877614975, | |
| "num_tokens": 2348823.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.5536152571439743, | |
| "epoch": 0.5430711610486891, | |
| "grad_norm": 0.026966845616698265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451969504356384, | |
| "mean_token_accuracy": 0.7772984057664871, | |
| "num_tokens": 2365018.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.55972820520401, | |
| "epoch": 0.5468164794007491, | |
| "grad_norm": 0.028171516954898834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568036437034607, | |
| "mean_token_accuracy": 0.7727039009332657, | |
| "num_tokens": 2381199.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5505439043045044, | |
| "epoch": 0.550561797752809, | |
| "grad_norm": 0.02772362343966961, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527427792549133, | |
| "mean_token_accuracy": 0.7765008956193924, | |
| "num_tokens": 2397235.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5575017333030701, | |
| "epoch": 0.5543071161048689, | |
| "grad_norm": 0.030587337911128998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631366968154907, | |
| "mean_token_accuracy": 0.7698703855276108, | |
| "num_tokens": 2413454.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5469523966312408, | |
| "epoch": 0.5580524344569289, | |
| "grad_norm": 0.0317547544836998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554557740688324, | |
| "mean_token_accuracy": 0.776221752166748, | |
| "num_tokens": 2429888.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5393165349960327, | |
| "epoch": 0.5617977528089888, | |
| "grad_norm": 0.028293034061789513, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538506269454956, | |
| "mean_token_accuracy": 0.7823521643877029, | |
| "num_tokens": 2446146.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5640445649623871, | |
| "epoch": 0.5655430711610487, | |
| "grad_norm": 0.027342529967427254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663660764694214, | |
| "mean_token_accuracy": 0.7686634063720703, | |
| "num_tokens": 2462436.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5660315603017807, | |
| "epoch": 0.5692883895131086, | |
| "grad_norm": 0.029160011559724808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658541917800903, | |
| "mean_token_accuracy": 0.7699626982212067, | |
| "num_tokens": 2478983.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5457171052694321, | |
| "epoch": 0.5730337078651685, | |
| "grad_norm": 0.029130199924111366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439150929450989, | |
| "mean_token_accuracy": 0.7802361398935318, | |
| "num_tokens": 2495263.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5504166930913925, | |
| "epoch": 0.5767790262172284, | |
| "grad_norm": 0.03016018122434616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510883331298828, | |
| "mean_token_accuracy": 0.775614932179451, | |
| "num_tokens": 2511475.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5550555139780045, | |
| "epoch": 0.5805243445692884, | |
| "grad_norm": 0.03134196624159813, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607972145080566, | |
| "mean_token_accuracy": 0.7707046419382095, | |
| "num_tokens": 2527673.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5454694628715515, | |
| "epoch": 0.5842696629213483, | |
| "grad_norm": 0.0311669260263443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492562651634216, | |
| "mean_token_accuracy": 0.779202476143837, | |
| "num_tokens": 2543853.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5742276608943939, | |
| "epoch": 0.5880149812734082, | |
| "grad_norm": 0.027328435331583023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5779210329055786, | |
| "mean_token_accuracy": 0.765041321516037, | |
| "num_tokens": 2560115.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5670003890991211, | |
| "epoch": 0.5917602996254682, | |
| "grad_norm": 0.02951730042695999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5664114952087402, | |
| "mean_token_accuracy": 0.7700729966163635, | |
| "num_tokens": 2576322.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5762516111135483, | |
| "epoch": 0.5955056179775281, | |
| "grad_norm": 0.029969869181513786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5735501050949097, | |
| "mean_token_accuracy": 0.7683756053447723, | |
| "num_tokens": 2592455.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5583818256855011, | |
| "epoch": 0.599250936329588, | |
| "grad_norm": 0.02687755413353443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561562776565552, | |
| "mean_token_accuracy": 0.7738349288702011, | |
| "num_tokens": 2608647.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5745189636945724, | |
| "epoch": 0.602996254681648, | |
| "grad_norm": 0.03188227489590645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.573383092880249, | |
| "mean_token_accuracy": 0.7658237218856812, | |
| "num_tokens": 2624851.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5701076835393906, | |
| "epoch": 0.6067415730337079, | |
| "grad_norm": 0.03216436505317688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5696204900741577, | |
| "mean_token_accuracy": 0.7674751281738281, | |
| "num_tokens": 2641365.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.548926368355751, | |
| "epoch": 0.6104868913857678, | |
| "grad_norm": 0.02745572291314602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530045032501221, | |
| "mean_token_accuracy": 0.7764343470335007, | |
| "num_tokens": 2657724.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5748997032642365, | |
| "epoch": 0.6142322097378277, | |
| "grad_norm": 0.03055480308830738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5857313275337219, | |
| "mean_token_accuracy": 0.7639760226011276, | |
| "num_tokens": 2674255.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5685756206512451, | |
| "epoch": 0.6179775280898876, | |
| "grad_norm": 0.030725592747330666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5727284550666809, | |
| "mean_token_accuracy": 0.7686582803726196, | |
| "num_tokens": 2690670.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.547265499830246, | |
| "epoch": 0.6217228464419475, | |
| "grad_norm": 0.028982795774936676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458434820175171, | |
| "mean_token_accuracy": 0.7764610648155212, | |
| "num_tokens": 2706990.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5669321566820145, | |
| "epoch": 0.6254681647940075, | |
| "grad_norm": 0.02999156154692173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610904097557068, | |
| "mean_token_accuracy": 0.7703774124383926, | |
| "num_tokens": 2723382.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.5631402879953384, | |
| "epoch": 0.6292134831460674, | |
| "grad_norm": 0.02727295272052288, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610119700431824, | |
| "mean_token_accuracy": 0.7734928578138351, | |
| "num_tokens": 2739673.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5462162643671036, | |
| "epoch": 0.6329588014981273, | |
| "grad_norm": 0.03161296248435974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594881772994995, | |
| "mean_token_accuracy": 0.7721333503723145, | |
| "num_tokens": 2756004.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5525806844234467, | |
| "epoch": 0.6367041198501873, | |
| "grad_norm": 0.028923675417900085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581262707710266, | |
| "mean_token_accuracy": 0.7746219336986542, | |
| "num_tokens": 2772131.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5815936326980591, | |
| "epoch": 0.6404494382022472, | |
| "grad_norm": 0.029989033937454224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5781337022781372, | |
| "mean_token_accuracy": 0.7642954289913177, | |
| "num_tokens": 2788556.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5742616653442383, | |
| "epoch": 0.6441947565543071, | |
| "grad_norm": 0.03870734944939613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5799432992935181, | |
| "mean_token_accuracy": 0.7655478119850159, | |
| "num_tokens": 2804635.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.576400488615036, | |
| "epoch": 0.6479400749063671, | |
| "grad_norm": 0.02596936747431755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5705851912498474, | |
| "mean_token_accuracy": 0.7653899490833282, | |
| "num_tokens": 2821201.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5751689076423645, | |
| "epoch": 0.651685393258427, | |
| "grad_norm": 0.02525261603295803, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706028938293457, | |
| "mean_token_accuracy": 0.7693078964948654, | |
| "num_tokens": 2837952.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.557927280664444, | |
| "epoch": 0.6554307116104869, | |
| "grad_norm": 0.025947891175746918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55954509973526, | |
| "mean_token_accuracy": 0.7710674405097961, | |
| "num_tokens": 2854247.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5340227037668228, | |
| "epoch": 0.6591760299625468, | |
| "grad_norm": 0.03157508745789528, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432956218719482, | |
| "mean_token_accuracy": 0.7804963290691376, | |
| "num_tokens": 2870169.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5522671341896057, | |
| "epoch": 0.6629213483146067, | |
| "grad_norm": 0.027346299961209297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591444969177246, | |
| "mean_token_accuracy": 0.7712446004152298, | |
| "num_tokens": 2886516.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5393896102905273, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.027576690539717674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416374206542969, | |
| "mean_token_accuracy": 0.7780617028474808, | |
| "num_tokens": 2902729.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5685822814702988, | |
| "epoch": 0.6704119850187266, | |
| "grad_norm": 0.03415964916348457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5774993300437927, | |
| "mean_token_accuracy": 0.7654603570699692, | |
| "num_tokens": 2919059.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5473489463329315, | |
| "epoch": 0.6741573033707865, | |
| "grad_norm": 0.03175094723701477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478240847587585, | |
| "mean_token_accuracy": 0.7771035730838776, | |
| "num_tokens": 2935209.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.5505825132131577, | |
| "epoch": 0.6779026217228464, | |
| "grad_norm": 0.027963241562247276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473360419273376, | |
| "mean_token_accuracy": 0.7776090204715729, | |
| "num_tokens": 2951643.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5541345179080963, | |
| "epoch": 0.6816479400749064, | |
| "grad_norm": 0.03300129622220993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419403910636902, | |
| "mean_token_accuracy": 0.7789575755596161, | |
| "num_tokens": 2967938.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5639268904924393, | |
| "epoch": 0.6853932584269663, | |
| "grad_norm": 0.032656021416187286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597264170646667, | |
| "mean_token_accuracy": 0.7759947925806046, | |
| "num_tokens": 2984230.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5538647770881653, | |
| "epoch": 0.6891385767790262, | |
| "grad_norm": 0.03382604569196701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5666002631187439, | |
| "mean_token_accuracy": 0.7692589312791824, | |
| "num_tokens": 3000607.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5578113794326782, | |
| "epoch": 0.6928838951310862, | |
| "grad_norm": 0.03644486889243126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5739911198616028, | |
| "mean_token_accuracy": 0.7684497386217117, | |
| "num_tokens": 3017077.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5290449112653732, | |
| "epoch": 0.6966292134831461, | |
| "grad_norm": 0.027713051065802574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5355228781700134, | |
| "mean_token_accuracy": 0.7826152592897415, | |
| "num_tokens": 3032996.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5759813338518143, | |
| "epoch": 0.700374531835206, | |
| "grad_norm": 0.03057127632200718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.569280743598938, | |
| "mean_token_accuracy": 0.7680912464857101, | |
| "num_tokens": 3049460.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5720777213573456, | |
| "epoch": 0.704119850187266, | |
| "grad_norm": 0.02572391740977764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658439993858337, | |
| "mean_token_accuracy": 0.7709487825632095, | |
| "num_tokens": 3065672.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5517766922712326, | |
| "epoch": 0.7078651685393258, | |
| "grad_norm": 0.029554082080721855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389034748077393, | |
| "mean_token_accuracy": 0.7830005586147308, | |
| "num_tokens": 3082173.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5635267347097397, | |
| "epoch": 0.7116104868913857, | |
| "grad_norm": 0.025442970916628838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614153742790222, | |
| "mean_token_accuracy": 0.7708731889724731, | |
| "num_tokens": 3098727.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5624646097421646, | |
| "epoch": 0.7153558052434457, | |
| "grad_norm": 0.03501886874437332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5751168727874756, | |
| "mean_token_accuracy": 0.7674457877874374, | |
| "num_tokens": 3115031.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5412020832300186, | |
| "epoch": 0.7191011235955056, | |
| "grad_norm": 0.029673364013433456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503013730049133, | |
| "mean_token_accuracy": 0.780591607093811, | |
| "num_tokens": 3131271.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.557359516620636, | |
| "epoch": 0.7228464419475655, | |
| "grad_norm": 0.025931306183338165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559468150138855, | |
| "mean_token_accuracy": 0.7729436904191971, | |
| "num_tokens": 3147732.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5394045114517212, | |
| "epoch": 0.7265917602996255, | |
| "grad_norm": 0.0292246975004673, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409769415855408, | |
| "mean_token_accuracy": 0.7795000076293945, | |
| "num_tokens": 3163963.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5587436705827713, | |
| "epoch": 0.7303370786516854, | |
| "grad_norm": 0.03306795284152031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556156039237976, | |
| "mean_token_accuracy": 0.7742602825164795, | |
| "num_tokens": 3179928.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.558687686920166, | |
| "epoch": 0.7340823970037453, | |
| "grad_norm": 0.025363627821207047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573633909225464, | |
| "mean_token_accuracy": 0.7759020626544952, | |
| "num_tokens": 3196142.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.545383557677269, | |
| "epoch": 0.7378277153558053, | |
| "grad_norm": 0.027863260358572006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485226511955261, | |
| "mean_token_accuracy": 0.7776659727096558, | |
| "num_tokens": 3212565.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5556656569242477, | |
| "epoch": 0.7415730337078652, | |
| "grad_norm": 0.035580288618803024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5673390626907349, | |
| "mean_token_accuracy": 0.7700339257717133, | |
| "num_tokens": 3228915.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5520624220371246, | |
| "epoch": 0.7453183520599251, | |
| "grad_norm": 0.02862994559109211, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494414567947388, | |
| "mean_token_accuracy": 0.7801119983196259, | |
| "num_tokens": 3245273.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5758003443479538, | |
| "epoch": 0.7490636704119851, | |
| "grad_norm": 0.0339261032640934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5687139630317688, | |
| "mean_token_accuracy": 0.7678625285625458, | |
| "num_tokens": 3261785.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.568912148475647, | |
| "epoch": 0.7528089887640449, | |
| "grad_norm": 0.029947372153401375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638163089752197, | |
| "mean_token_accuracy": 0.77249875664711, | |
| "num_tokens": 3278313.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5490483492612839, | |
| "epoch": 0.7565543071161048, | |
| "grad_norm": 0.02934352308511734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535009503364563, | |
| "mean_token_accuracy": 0.7746146768331528, | |
| "num_tokens": 3294575.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.560209795832634, | |
| "epoch": 0.7602996254681648, | |
| "grad_norm": 0.031990889459848404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637909770011902, | |
| "mean_token_accuracy": 0.7735392153263092, | |
| "num_tokens": 3310679.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5573873072862625, | |
| "epoch": 0.7640449438202247, | |
| "grad_norm": 0.02812575176358223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629784464836121, | |
| "mean_token_accuracy": 0.7686379998922348, | |
| "num_tokens": 3327065.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.534591019153595, | |
| "epoch": 0.7677902621722846, | |
| "grad_norm": 0.03412024676799774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546525239944458, | |
| "mean_token_accuracy": 0.7761467695236206, | |
| "num_tokens": 3343404.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5677939504384995, | |
| "epoch": 0.7715355805243446, | |
| "grad_norm": 0.02933080866932869, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688956379890442, | |
| "mean_token_accuracy": 0.7702508270740509, | |
| "num_tokens": 3359958.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.582836389541626, | |
| "epoch": 0.7752808988764045, | |
| "grad_norm": 0.027001049369573593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5772212147712708, | |
| "mean_token_accuracy": 0.7654514610767365, | |
| "num_tokens": 3376426.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5876192450523376, | |
| "epoch": 0.7790262172284644, | |
| "grad_norm": 0.031185103580355644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5810344219207764, | |
| "mean_token_accuracy": 0.7651431113481522, | |
| "num_tokens": 3392821.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5676351487636566, | |
| "epoch": 0.7827715355805244, | |
| "grad_norm": 0.02849467284977436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602158904075623, | |
| "mean_token_accuracy": 0.771087646484375, | |
| "num_tokens": 3409137.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5598850250244141, | |
| "epoch": 0.7865168539325843, | |
| "grad_norm": 0.028652694076299667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560476779937744, | |
| "mean_token_accuracy": 0.7744726985692978, | |
| "num_tokens": 3425346.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5631076842546463, | |
| "epoch": 0.7902621722846442, | |
| "grad_norm": 0.03177965059876442, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5703850984573364, | |
| "mean_token_accuracy": 0.7688238769769669, | |
| "num_tokens": 3441766.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5571614354848862, | |
| "epoch": 0.7940074906367042, | |
| "grad_norm": 0.035387102514505386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5680047869682312, | |
| "mean_token_accuracy": 0.7702172994613647, | |
| "num_tokens": 3458303.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5512831062078476, | |
| "epoch": 0.797752808988764, | |
| "grad_norm": 0.02970981039106846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541270971298218, | |
| "mean_token_accuracy": 0.7740521878004074, | |
| "num_tokens": 3474455.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5604052096605301, | |
| "epoch": 0.8014981273408239, | |
| "grad_norm": 0.028583871200680733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585545301437378, | |
| "mean_token_accuracy": 0.7712778151035309, | |
| "num_tokens": 3490567.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5531798452138901, | |
| "epoch": 0.8052434456928839, | |
| "grad_norm": 0.027284301817417145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523191690444946, | |
| "mean_token_accuracy": 0.7744116485118866, | |
| "num_tokens": 3506697.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5611687004566193, | |
| "epoch": 0.8089887640449438, | |
| "grad_norm": 0.030331265181303024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5599703192710876, | |
| "mean_token_accuracy": 0.7741329371929169, | |
| "num_tokens": 3523064.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5679153800010681, | |
| "epoch": 0.8127340823970037, | |
| "grad_norm": 0.028981544077396393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5729029178619385, | |
| "mean_token_accuracy": 0.7667650431394577, | |
| "num_tokens": 3539143.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5438763052225113, | |
| "epoch": 0.8164794007490637, | |
| "grad_norm": 0.02691890485584736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485566854476929, | |
| "mean_token_accuracy": 0.7739608585834503, | |
| "num_tokens": 3555565.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5619954615831375, | |
| "epoch": 0.8202247191011236, | |
| "grad_norm": 0.026171443983912468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637154579162598, | |
| "mean_token_accuracy": 0.7711703032255173, | |
| "num_tokens": 3571906.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5464108288288116, | |
| "epoch": 0.8239700374531835, | |
| "grad_norm": 0.02858656644821167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461940169334412, | |
| "mean_token_accuracy": 0.7789376378059387, | |
| "num_tokens": 3588158.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5636538565158844, | |
| "epoch": 0.8277153558052435, | |
| "grad_norm": 0.02787981554865837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658812522888184, | |
| "mean_token_accuracy": 0.7694707363843918, | |
| "num_tokens": 3604701.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5738235861063004, | |
| "epoch": 0.8314606741573034, | |
| "grad_norm": 0.03107610158622265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5720517635345459, | |
| "mean_token_accuracy": 0.767520397901535, | |
| "num_tokens": 3621041.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5418261587619781, | |
| "epoch": 0.8352059925093633, | |
| "grad_norm": 0.030757945030927658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468308925628662, | |
| "mean_token_accuracy": 0.7743646949529648, | |
| "num_tokens": 3637338.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5567242801189423, | |
| "epoch": 0.8389513108614233, | |
| "grad_norm": 0.031262289732694626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633231997489929, | |
| "mean_token_accuracy": 0.7722140103578568, | |
| "num_tokens": 3653872.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5542743653059006, | |
| "epoch": 0.8426966292134831, | |
| "grad_norm": 0.03351176902651787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574679374694824, | |
| "mean_token_accuracy": 0.7744366973638535, | |
| "num_tokens": 3670013.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5486074835062027, | |
| "epoch": 0.846441947565543, | |
| "grad_norm": 0.0312609001994133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545890748500824, | |
| "mean_token_accuracy": 0.7778652608394623, | |
| "num_tokens": 3686275.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5650633871555328, | |
| "epoch": 0.850187265917603, | |
| "grad_norm": 0.028242582455277443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587697625160217, | |
| "mean_token_accuracy": 0.7728594094514847, | |
| "num_tokens": 3702890.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5442924797534943, | |
| "epoch": 0.8539325842696629, | |
| "grad_norm": 0.03206290304660797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438553690910339, | |
| "mean_token_accuracy": 0.7799272388219833, | |
| "num_tokens": 3719196.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5688119828701019, | |
| "epoch": 0.8576779026217228, | |
| "grad_norm": 0.031068341806530952, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5722005367279053, | |
| "mean_token_accuracy": 0.7658038288354874, | |
| "num_tokens": 3735614.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5671662837266922, | |
| "epoch": 0.8614232209737828, | |
| "grad_norm": 0.03664137050509453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5779143571853638, | |
| "mean_token_accuracy": 0.7624872028827667, | |
| "num_tokens": 3751617.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5505847632884979, | |
| "epoch": 0.8651685393258427, | |
| "grad_norm": 0.031469304114580154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520802140235901, | |
| "mean_token_accuracy": 0.7765519469976425, | |
| "num_tokens": 3768020.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5407437533140182, | |
| "epoch": 0.8689138576779026, | |
| "grad_norm": 0.03157830610871315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53821861743927, | |
| "mean_token_accuracy": 0.7832015603780746, | |
| "num_tokens": 3784206.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5574967563152313, | |
| "epoch": 0.8726591760299626, | |
| "grad_norm": 0.03071594052016735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562031865119934, | |
| "mean_token_accuracy": 0.7721244394779205, | |
| "num_tokens": 3800616.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5378725826740265, | |
| "epoch": 0.8764044943820225, | |
| "grad_norm": 0.030823221430182457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407513380050659, | |
| "mean_token_accuracy": 0.7836541086435318, | |
| "num_tokens": 3816842.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5592721700668335, | |
| "epoch": 0.8801498127340824, | |
| "grad_norm": 0.03175733983516693, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5660021305084229, | |
| "mean_token_accuracy": 0.7676839083433151, | |
| "num_tokens": 3833206.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5588899403810501, | |
| "epoch": 0.8838951310861424, | |
| "grad_norm": 0.03060559183359146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651678442955017, | |
| "mean_token_accuracy": 0.7706761956214905, | |
| "num_tokens": 3849556.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5560838133096695, | |
| "epoch": 0.8876404494382022, | |
| "grad_norm": 0.03011494129896164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619899034500122, | |
| "mean_token_accuracy": 0.7695688903331757, | |
| "num_tokens": 3865973.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.572941854596138, | |
| "epoch": 0.8913857677902621, | |
| "grad_norm": 0.02626178041100502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5712540149688721, | |
| "mean_token_accuracy": 0.7688916623592377, | |
| "num_tokens": 3882349.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5688192397356033, | |
| "epoch": 0.8951310861423221, | |
| "grad_norm": 0.0268928874284029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562833309173584, | |
| "mean_token_accuracy": 0.7708128988742828, | |
| "num_tokens": 3898536.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.5633461475372314, | |
| "epoch": 0.898876404494382, | |
| "grad_norm": 0.029186321422457695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525766611099243, | |
| "mean_token_accuracy": 0.7749095112085342, | |
| "num_tokens": 3914950.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5715253502130508, | |
| "epoch": 0.9026217228464419, | |
| "grad_norm": 0.029228920117020607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5710093975067139, | |
| "mean_token_accuracy": 0.7693532109260559, | |
| "num_tokens": 3931161.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5170925259590149, | |
| "epoch": 0.9063670411985019, | |
| "grad_norm": 0.03571123257279396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52873295545578, | |
| "mean_token_accuracy": 0.7879834473133087, | |
| "num_tokens": 3947256.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5353554487228394, | |
| "epoch": 0.9101123595505618, | |
| "grad_norm": 0.031091809272766113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437985062599182, | |
| "mean_token_accuracy": 0.7802935838699341, | |
| "num_tokens": 3963703.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5593858063220978, | |
| "epoch": 0.9138576779026217, | |
| "grad_norm": 0.028724675998091698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5654380321502686, | |
| "mean_token_accuracy": 0.766664981842041, | |
| "num_tokens": 3980237.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5452692359685898, | |
| "epoch": 0.9176029962546817, | |
| "grad_norm": 0.032008957117795944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489979982376099, | |
| "mean_token_accuracy": 0.7783998996019363, | |
| "num_tokens": 3996411.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5732362270355225, | |
| "epoch": 0.9213483146067416, | |
| "grad_norm": 0.026769591495394707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5739398002624512, | |
| "mean_token_accuracy": 0.7671795785427094, | |
| "num_tokens": 4012857.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5656879991292953, | |
| "epoch": 0.9250936329588015, | |
| "grad_norm": 0.03197095915675163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563187301158905, | |
| "mean_token_accuracy": 0.7670102566480637, | |
| "num_tokens": 4029053.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5575947314500809, | |
| "epoch": 0.9288389513108615, | |
| "grad_norm": 0.02987116388976574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625151991844177, | |
| "mean_token_accuracy": 0.7722823321819305, | |
| "num_tokens": 4045520.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5391925573348999, | |
| "epoch": 0.9325842696629213, | |
| "grad_norm": 0.03071737289428711, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494749546051025, | |
| "mean_token_accuracy": 0.7774742394685745, | |
| "num_tokens": 4061722.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5374163240194321, | |
| "epoch": 0.9363295880149812, | |
| "grad_norm": 0.03443381190299988, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430468916893005, | |
| "mean_token_accuracy": 0.7767436355352402, | |
| "num_tokens": 4077909.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.563934788107872, | |
| "epoch": 0.9400749063670412, | |
| "grad_norm": 0.03456362709403038, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5705171227455139, | |
| "mean_token_accuracy": 0.7667582482099533, | |
| "num_tokens": 4094266.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5498995631933212, | |
| "epoch": 0.9438202247191011, | |
| "grad_norm": 0.03230346366763115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477432012557983, | |
| "mean_token_accuracy": 0.7797223627567291, | |
| "num_tokens": 4110154.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5815821886062622, | |
| "epoch": 0.947565543071161, | |
| "grad_norm": 0.030871113762259483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5757232904434204, | |
| "mean_token_accuracy": 0.7643865346908569, | |
| "num_tokens": 4126298.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.568855032324791, | |
| "epoch": 0.951310861423221, | |
| "grad_norm": 0.03128105401992798, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623528361320496, | |
| "mean_token_accuracy": 0.7733433544635773, | |
| "num_tokens": 4142423.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5580300092697144, | |
| "epoch": 0.9550561797752809, | |
| "grad_norm": 0.028919901698827744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540750026702881, | |
| "mean_token_accuracy": 0.7751399129629135, | |
| "num_tokens": 4158616.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5586510896682739, | |
| "epoch": 0.9588014981273408, | |
| "grad_norm": 0.028054876253008842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566189289093018, | |
| "mean_token_accuracy": 0.771488219499588, | |
| "num_tokens": 4174981.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5506493747234344, | |
| "epoch": 0.9625468164794008, | |
| "grad_norm": 0.028799347579479218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535633563995361, | |
| "mean_token_accuracy": 0.7742148786783218, | |
| "num_tokens": 4191446.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5423731654882431, | |
| "epoch": 0.9662921348314607, | |
| "grad_norm": 0.033325713127851486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534674525260925, | |
| "mean_token_accuracy": 0.773481622338295, | |
| "num_tokens": 4207545.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5463626831769943, | |
| "epoch": 0.9700374531835206, | |
| "grad_norm": 0.029474180191755295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469580888748169, | |
| "mean_token_accuracy": 0.778034508228302, | |
| "num_tokens": 4223705.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5447346717119217, | |
| "epoch": 0.9737827715355806, | |
| "grad_norm": 0.02612573839724064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400044322013855, | |
| "mean_token_accuracy": 0.7802340239286423, | |
| "num_tokens": 4240129.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5821470022201538, | |
| "epoch": 0.9775280898876404, | |
| "grad_norm": 0.030348099768161774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5687776803970337, | |
| "mean_token_accuracy": 0.7688710540533066, | |
| "num_tokens": 4256543.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5551526695489883, | |
| "epoch": 0.9812734082397003, | |
| "grad_norm": 0.027197403833270073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550498962402344, | |
| "mean_token_accuracy": 0.7730266898870468, | |
| "num_tokens": 4272850.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.558951735496521, | |
| "epoch": 0.9850187265917603, | |
| "grad_norm": 0.02930772304534912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568732738494873, | |
| "mean_token_accuracy": 0.7649472206830978, | |
| "num_tokens": 4288981.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5453519076108932, | |
| "epoch": 0.9887640449438202, | |
| "grad_norm": 0.03282203525304794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584692358970642, | |
| "mean_token_accuracy": 0.7731108516454697, | |
| "num_tokens": 4305020.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5550204813480377, | |
| "epoch": 0.9925093632958801, | |
| "grad_norm": 0.030776405707001686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5647276639938354, | |
| "mean_token_accuracy": 0.7714035212993622, | |
| "num_tokens": 4321505.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5713452994823456, | |
| "epoch": 0.9962546816479401, | |
| "grad_norm": 0.027741121128201485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5671746134757996, | |
| "mean_token_accuracy": 0.77179254591465, | |
| "num_tokens": 4337819.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5695875138044357, | |
| "epoch": 1.0, | |
| "grad_norm": 0.03063138760626316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631532669067383, | |
| "mean_token_accuracy": 0.7723733484745026, | |
| "num_tokens": 4354077.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5564615577459335, | |
| "epoch": 1.00374531835206, | |
| "grad_norm": 0.02938828431069851, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473178625106812, | |
| "mean_token_accuracy": 0.7778049558401108, | |
| "num_tokens": 4370546.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5574217587709427, | |
| "epoch": 1.0074906367041199, | |
| "grad_norm": 0.029280902817845345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522539019584656, | |
| "mean_token_accuracy": 0.774829238653183, | |
| "num_tokens": 4386769.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5274022594094276, | |
| "epoch": 1.0112359550561798, | |
| "grad_norm": 0.03879232704639435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378210544586182, | |
| "mean_token_accuracy": 0.7831418812274933, | |
| "num_tokens": 4402982.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5290966331958771, | |
| "epoch": 1.0149812734082397, | |
| "grad_norm": 0.03839439898729324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428091883659363, | |
| "mean_token_accuracy": 0.7794705182313919, | |
| "num_tokens": 4418967.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5340720564126968, | |
| "epoch": 1.0187265917602997, | |
| "grad_norm": 0.027254262939095497, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5355733633041382, | |
| "mean_token_accuracy": 0.7818265557289124, | |
| "num_tokens": 4435204.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5440738946199417, | |
| "epoch": 1.0224719101123596, | |
| "grad_norm": 0.03392236679792404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456275939941406, | |
| "mean_token_accuracy": 0.780282586812973, | |
| "num_tokens": 4451432.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5574818104505539, | |
| "epoch": 1.0262172284644195, | |
| "grad_norm": 0.026871202513575554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559114217758179, | |
| "mean_token_accuracy": 0.777089074254036, | |
| "num_tokens": 4467766.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5488097965717316, | |
| "epoch": 1.0299625468164795, | |
| "grad_norm": 0.029019974172115326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336285829544067, | |
| "mean_token_accuracy": 0.7849163711071014, | |
| "num_tokens": 4483969.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5530442148447037, | |
| "epoch": 1.0337078651685394, | |
| "grad_norm": 0.02914772555232048, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511333346366882, | |
| "mean_token_accuracy": 0.7753048241138458, | |
| "num_tokens": 4500202.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5580654293298721, | |
| "epoch": 1.0374531835205993, | |
| "grad_norm": 0.02970791608095169, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622603297233582, | |
| "mean_token_accuracy": 0.7713205814361572, | |
| "num_tokens": 4516619.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5405817478895187, | |
| "epoch": 1.0411985018726593, | |
| "grad_norm": 0.0317082442343235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510064363479614, | |
| "mean_token_accuracy": 0.7750898003578186, | |
| "num_tokens": 4532787.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.529707208275795, | |
| "epoch": 1.0449438202247192, | |
| "grad_norm": 0.032039616256952286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385198593139648, | |
| "mean_token_accuracy": 0.7802569419145584, | |
| "num_tokens": 4549095.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.536220982670784, | |
| "epoch": 1.048689138576779, | |
| "grad_norm": 0.03247847780585289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422552824020386, | |
| "mean_token_accuracy": 0.7777614146471024, | |
| "num_tokens": 4565068.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5643364787101746, | |
| "epoch": 1.0524344569288389, | |
| "grad_norm": 0.03038158267736435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526927709579468, | |
| "mean_token_accuracy": 0.7772861868143082, | |
| "num_tokens": 4581362.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5710341036319733, | |
| "epoch": 1.0561797752808988, | |
| "grad_norm": 0.029375184327363968, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5627338290214539, | |
| "mean_token_accuracy": 0.7716031968593597, | |
| "num_tokens": 4598044.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5661873072385788, | |
| "epoch": 1.0599250936329587, | |
| "grad_norm": 0.029537923634052277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619353652000427, | |
| "mean_token_accuracy": 0.7722314894199371, | |
| "num_tokens": 4614605.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.545825719833374, | |
| "epoch": 1.0636704119850187, | |
| "grad_norm": 0.028511304408311844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431419610977173, | |
| "mean_token_accuracy": 0.7778640240430832, | |
| "num_tokens": 4630914.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5331753790378571, | |
| "epoch": 1.0674157303370786, | |
| "grad_norm": 0.032436709851026535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459548830986023, | |
| "mean_token_accuracy": 0.7751310169696808, | |
| "num_tokens": 4647234.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5640293508768082, | |
| "epoch": 1.0711610486891385, | |
| "grad_norm": 0.0322943851351738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5726660490036011, | |
| "mean_token_accuracy": 0.76516292989254, | |
| "num_tokens": 4663828.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5655198693275452, | |
| "epoch": 1.0749063670411985, | |
| "grad_norm": 0.028429750353097916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5707299709320068, | |
| "mean_token_accuracy": 0.7665908485651016, | |
| "num_tokens": 4680191.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5641037821769714, | |
| "epoch": 1.0786516853932584, | |
| "grad_norm": 0.02850640006363392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591652393341064, | |
| "mean_token_accuracy": 0.7727868556976318, | |
| "num_tokens": 4696297.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5585228204727173, | |
| "epoch": 1.0823970037453183, | |
| "grad_norm": 0.03052029199898243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535526275634766, | |
| "mean_token_accuracy": 0.7758607268333435, | |
| "num_tokens": 4712608.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5454631745815277, | |
| "epoch": 1.0861423220973783, | |
| "grad_norm": 0.02904430776834488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463353395462036, | |
| "mean_token_accuracy": 0.7812290787696838, | |
| "num_tokens": 4728702.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.547488197684288, | |
| "epoch": 1.0898876404494382, | |
| "grad_norm": 0.02964003197848797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422903299331665, | |
| "mean_token_accuracy": 0.7805432081222534, | |
| "num_tokens": 4745177.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5354203134775162, | |
| "epoch": 1.0936329588014981, | |
| "grad_norm": 0.036443792283535004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374300479888916, | |
| "mean_token_accuracy": 0.7797484993934631, | |
| "num_tokens": 4761143.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5536107122898102, | |
| "epoch": 1.097378277153558, | |
| "grad_norm": 0.028762439265847206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621394515037537, | |
| "mean_token_accuracy": 0.7706074863672256, | |
| "num_tokens": 4777282.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.5409039855003357, | |
| "epoch": 1.101123595505618, | |
| "grad_norm": 0.03404904156923294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510942339897156, | |
| "mean_token_accuracy": 0.7781406044960022, | |
| "num_tokens": 4793365.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.5496554970741272, | |
| "epoch": 1.104868913857678, | |
| "grad_norm": 0.03300090506672859, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508947372436523, | |
| "mean_token_accuracy": 0.7776678502559662, | |
| "num_tokens": 4809752.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5615599453449249, | |
| "epoch": 1.1086142322097379, | |
| "grad_norm": 0.02708325907588005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569652915000916, | |
| "mean_token_accuracy": 0.7737039029598236, | |
| "num_tokens": 4826077.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5593246519565582, | |
| "epoch": 1.1123595505617978, | |
| "grad_norm": 0.03139323368668556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524771809577942, | |
| "mean_token_accuracy": 0.7745187878608704, | |
| "num_tokens": 4842333.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5454850494861603, | |
| "epoch": 1.1161048689138577, | |
| "grad_norm": 0.02898702770471573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425970554351807, | |
| "mean_token_accuracy": 0.7789193391799927, | |
| "num_tokens": 4858558.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.538344144821167, | |
| "epoch": 1.1198501872659177, | |
| "grad_norm": 0.029788950458168983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424114465713501, | |
| "mean_token_accuracy": 0.7777515351772308, | |
| "num_tokens": 4874826.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5260975658893585, | |
| "epoch": 1.1235955056179776, | |
| "grad_norm": 0.03646169230341911, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5355998277664185, | |
| "mean_token_accuracy": 0.7840575128793716, | |
| "num_tokens": 4890978.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5369604676961899, | |
| "epoch": 1.1273408239700375, | |
| "grad_norm": 0.03131569176912308, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540716290473938, | |
| "mean_token_accuracy": 0.780446395277977, | |
| "num_tokens": 4907064.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5605516880750656, | |
| "epoch": 1.1310861423220975, | |
| "grad_norm": 0.034511223435401917, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577893257141113, | |
| "mean_token_accuracy": 0.7730138152837753, | |
| "num_tokens": 4923266.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.5472770929336548, | |
| "epoch": 1.1348314606741572, | |
| "grad_norm": 0.0347181111574173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447498559951782, | |
| "mean_token_accuracy": 0.7790001332759857, | |
| "num_tokens": 4939554.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5580919533967972, | |
| "epoch": 1.1385767790262173, | |
| "grad_norm": 0.029458722099661827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602295994758606, | |
| "mean_token_accuracy": 0.7698655724525452, | |
| "num_tokens": 4955864.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5566238462924957, | |
| "epoch": 1.142322097378277, | |
| "grad_norm": 0.03371216729283333, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516577363014221, | |
| "mean_token_accuracy": 0.7762005478143692, | |
| "num_tokens": 4972145.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.5444543808698654, | |
| "epoch": 1.146067415730337, | |
| "grad_norm": 0.03240659460425377, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465469360351562, | |
| "mean_token_accuracy": 0.7778800278902054, | |
| "num_tokens": 4988600.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5197838395833969, | |
| "epoch": 1.149812734082397, | |
| "grad_norm": 0.03453533351421356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52244633436203, | |
| "mean_token_accuracy": 0.7865428030490875, | |
| "num_tokens": 5004593.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5355952382087708, | |
| "epoch": 1.1535580524344569, | |
| "grad_norm": 0.02796328440308571, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417516231536865, | |
| "mean_token_accuracy": 0.778742790222168, | |
| "num_tokens": 5020798.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5339494347572327, | |
| "epoch": 1.1573033707865168, | |
| "grad_norm": 0.031283531337976456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422439575195312, | |
| "mean_token_accuracy": 0.7790778428316116, | |
| "num_tokens": 5037095.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5599728673696518, | |
| "epoch": 1.1610486891385767, | |
| "grad_norm": 0.029156681150197983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628546476364136, | |
| "mean_token_accuracy": 0.7709409445524216, | |
| "num_tokens": 5053556.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.5527057945728302, | |
| "epoch": 1.1647940074906367, | |
| "grad_norm": 0.028000809252262115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457457900047302, | |
| "mean_token_accuracy": 0.7764673084020615, | |
| "num_tokens": 5069817.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5439251810312271, | |
| "epoch": 1.1685393258426966, | |
| "grad_norm": 0.027509242296218872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400040149688721, | |
| "mean_token_accuracy": 0.7789120823144913, | |
| "num_tokens": 5086044.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.561322957277298, | |
| "epoch": 1.1722846441947565, | |
| "grad_norm": 0.030032532289624214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588545799255371, | |
| "mean_token_accuracy": 0.7742930203676224, | |
| "num_tokens": 5102685.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5458335727453232, | |
| "epoch": 1.1760299625468165, | |
| "grad_norm": 0.029963059350848198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477938055992126, | |
| "mean_token_accuracy": 0.777193009853363, | |
| "num_tokens": 5119294.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5545150190591812, | |
| "epoch": 1.1797752808988764, | |
| "grad_norm": 0.03310168907046318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611361265182495, | |
| "mean_token_accuracy": 0.7725827246904373, | |
| "num_tokens": 5135795.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5393262058496475, | |
| "epoch": 1.1835205992509363, | |
| "grad_norm": 0.02876197174191475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395398139953613, | |
| "mean_token_accuracy": 0.781178891658783, | |
| "num_tokens": 5151936.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5356467962265015, | |
| "epoch": 1.1872659176029963, | |
| "grad_norm": 0.029216231778264046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275884866714478, | |
| "mean_token_accuracy": 0.7844340801239014, | |
| "num_tokens": 5168072.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5539442598819733, | |
| "epoch": 1.1910112359550562, | |
| "grad_norm": 0.029222887009382248, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549959540367126, | |
| "mean_token_accuracy": 0.7750978469848633, | |
| "num_tokens": 5184280.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5316408574581146, | |
| "epoch": 1.1947565543071161, | |
| "grad_norm": 0.03008115477859974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536407470703125, | |
| "mean_token_accuracy": 0.7843799740076065, | |
| "num_tokens": 5200364.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5335765928030014, | |
| "epoch": 1.198501872659176, | |
| "grad_norm": 0.030437173321843147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371608734130859, | |
| "mean_token_accuracy": 0.7834146469831467, | |
| "num_tokens": 5216503.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5507327914237976, | |
| "epoch": 1.202247191011236, | |
| "grad_norm": 0.030706282705068588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528247356414795, | |
| "mean_token_accuracy": 0.7763889282941818, | |
| "num_tokens": 5232896.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5600829422473907, | |
| "epoch": 1.205992509363296, | |
| "grad_norm": 0.03131498023867607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559609055519104, | |
| "mean_token_accuracy": 0.7688225358724594, | |
| "num_tokens": 5249400.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5482848882675171, | |
| "epoch": 1.2097378277153559, | |
| "grad_norm": 0.030239688232541084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498725771903992, | |
| "mean_token_accuracy": 0.7751806825399399, | |
| "num_tokens": 5265595.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5517048090696335, | |
| "epoch": 1.2134831460674158, | |
| "grad_norm": 0.03668053448200226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480911135673523, | |
| "mean_token_accuracy": 0.7757556736469269, | |
| "num_tokens": 5281774.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5576729625463486, | |
| "epoch": 1.2172284644194757, | |
| "grad_norm": 0.028534850105643272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513843894004822, | |
| "mean_token_accuracy": 0.7748550176620483, | |
| "num_tokens": 5297913.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5390013605356216, | |
| "epoch": 1.2209737827715357, | |
| "grad_norm": 0.03146135434508324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539669930934906, | |
| "mean_token_accuracy": 0.7778647989034653, | |
| "num_tokens": 5314070.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5463844388723373, | |
| "epoch": 1.2247191011235956, | |
| "grad_norm": 0.03442573919892311, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508401393890381, | |
| "mean_token_accuracy": 0.774851381778717, | |
| "num_tokens": 5330361.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5308734029531479, | |
| "epoch": 1.2284644194756553, | |
| "grad_norm": 0.03126746043562889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370399951934814, | |
| "mean_token_accuracy": 0.7805522531270981, | |
| "num_tokens": 5346367.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5443529635667801, | |
| "epoch": 1.2322097378277155, | |
| "grad_norm": 0.028079699724912643, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469828248023987, | |
| "mean_token_accuracy": 0.7801272124052048, | |
| "num_tokens": 5362795.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5508403033018112, | |
| "epoch": 1.2359550561797752, | |
| "grad_norm": 0.03308681398630142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537492632865906, | |
| "mean_token_accuracy": 0.776117667555809, | |
| "num_tokens": 5378892.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.547036200761795, | |
| "epoch": 1.2397003745318351, | |
| "grad_norm": 0.030657080933451653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473320484161377, | |
| "mean_token_accuracy": 0.7783585488796234, | |
| "num_tokens": 5395182.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.5384639650583267, | |
| "epoch": 1.243445692883895, | |
| "grad_norm": 0.03128959983587265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418936610221863, | |
| "mean_token_accuracy": 0.7789008319377899, | |
| "num_tokens": 5411728.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5433261394500732, | |
| "epoch": 1.247191011235955, | |
| "grad_norm": 0.02972225658595562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430710315704346, | |
| "mean_token_accuracy": 0.7793088257312775, | |
| "num_tokens": 5427990.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5405146926641464, | |
| "epoch": 1.250936329588015, | |
| "grad_norm": 0.028844943270087242, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538284957408905, | |
| "mean_token_accuracy": 0.7814860939979553, | |
| "num_tokens": 5443961.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.5582905858755112, | |
| "epoch": 1.2546816479400749, | |
| "grad_norm": 0.0356195829808712, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558274507522583, | |
| "mean_token_accuracy": 0.772399827837944, | |
| "num_tokens": 5460135.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5524656623601913, | |
| "epoch": 1.2584269662921348, | |
| "grad_norm": 0.02986624464392662, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503432750701904, | |
| "mean_token_accuracy": 0.7768993377685547, | |
| "num_tokens": 5476448.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.553261786699295, | |
| "epoch": 1.2621722846441947, | |
| "grad_norm": 0.03385454788804054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513902902603149, | |
| "mean_token_accuracy": 0.7756227403879166, | |
| "num_tokens": 5492657.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5534822195768356, | |
| "epoch": 1.2659176029962547, | |
| "grad_norm": 0.03496600687503815, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5570470690727234, | |
| "mean_token_accuracy": 0.7745380252599716, | |
| "num_tokens": 5508936.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5206775590777397, | |
| "epoch": 1.2696629213483146, | |
| "grad_norm": 0.038312628865242004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531387209892273, | |
| "mean_token_accuracy": 0.7818328887224197, | |
| "num_tokens": 5525150.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5372405052185059, | |
| "epoch": 1.2734082397003745, | |
| "grad_norm": 0.03226601704955101, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414312481880188, | |
| "mean_token_accuracy": 0.7806438505649567, | |
| "num_tokens": 5541125.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5670074820518494, | |
| "epoch": 1.2771535580524345, | |
| "grad_norm": 0.032290343195199966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651661157608032, | |
| "mean_token_accuracy": 0.768811360001564, | |
| "num_tokens": 5557589.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5581976920366287, | |
| "epoch": 1.2808988764044944, | |
| "grad_norm": 0.035112183541059494, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540149211883545, | |
| "mean_token_accuracy": 0.7756919115781784, | |
| "num_tokens": 5574011.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5480058342218399, | |
| "epoch": 1.2846441947565543, | |
| "grad_norm": 0.029269572347402573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497134923934937, | |
| "mean_token_accuracy": 0.7775010466575623, | |
| "num_tokens": 5590227.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5551355630159378, | |
| "epoch": 1.2883895131086143, | |
| "grad_norm": 0.03512820973992348, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613937377929688, | |
| "mean_token_accuracy": 0.77100470662117, | |
| "num_tokens": 5606436.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5681823641061783, | |
| "epoch": 1.2921348314606742, | |
| "grad_norm": 0.028890319168567657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653828382492065, | |
| "mean_token_accuracy": 0.7733339965343475, | |
| "num_tokens": 5622955.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5512849390506744, | |
| "epoch": 1.2958801498127341, | |
| "grad_norm": 0.03168505057692528, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475208759307861, | |
| "mean_token_accuracy": 0.778771311044693, | |
| "num_tokens": 5639583.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5361000895500183, | |
| "epoch": 1.299625468164794, | |
| "grad_norm": 0.03995742276310921, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435983538627625, | |
| "mean_token_accuracy": 0.7801041901111603, | |
| "num_tokens": 5655726.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5335006862878799, | |
| "epoch": 1.303370786516854, | |
| "grad_norm": 0.03385796397924423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360836982727051, | |
| "mean_token_accuracy": 0.7803510278463364, | |
| "num_tokens": 5671935.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5649213343858719, | |
| "epoch": 1.3071161048689137, | |
| "grad_norm": 0.03367312625050545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5654204487800598, | |
| "mean_token_accuracy": 0.7698808759450912, | |
| "num_tokens": 5688484.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5636743903160095, | |
| "epoch": 1.3108614232209739, | |
| "grad_norm": 0.028330491855740547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564975380897522, | |
| "mean_token_accuracy": 0.769644483923912, | |
| "num_tokens": 5704874.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5439984649419785, | |
| "epoch": 1.3146067415730336, | |
| "grad_norm": 0.030180098488926888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540916383266449, | |
| "mean_token_accuracy": 0.7806600630283356, | |
| "num_tokens": 5721250.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5403287261724472, | |
| "epoch": 1.3183520599250937, | |
| "grad_norm": 0.03425198793411255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408051609992981, | |
| "mean_token_accuracy": 0.7801858931779861, | |
| "num_tokens": 5737303.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5534793436527252, | |
| "epoch": 1.3220973782771535, | |
| "grad_norm": 0.029101019725203514, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576366782188416, | |
| "mean_token_accuracy": 0.773370087146759, | |
| "num_tokens": 5753786.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5410192608833313, | |
| "epoch": 1.3258426966292136, | |
| "grad_norm": 0.0356539785861969, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408055186271667, | |
| "mean_token_accuracy": 0.7814153283834457, | |
| "num_tokens": 5769926.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5472375005483627, | |
| "epoch": 1.3295880149812733, | |
| "grad_norm": 0.03288782387971878, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537273287773132, | |
| "mean_token_accuracy": 0.7744840979576111, | |
| "num_tokens": 5785998.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5556980893015862, | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 0.038231220096349716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558592677116394, | |
| "mean_token_accuracy": 0.7744520753622055, | |
| "num_tokens": 5802256.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5668211281299591, | |
| "epoch": 1.3370786516853932, | |
| "grad_norm": 0.02924768440425396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5691797733306885, | |
| "mean_token_accuracy": 0.7683669775724411, | |
| "num_tokens": 5818757.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.549320325255394, | |
| "epoch": 1.3408239700374531, | |
| "grad_norm": 0.03099512681365013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551908016204834, | |
| "mean_token_accuracy": 0.7755500972270966, | |
| "num_tokens": 5835041.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5573329776525497, | |
| "epoch": 1.344569288389513, | |
| "grad_norm": 0.028519438579678535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581731796264648, | |
| "mean_token_accuracy": 0.7729284316301346, | |
| "num_tokens": 5851618.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.5377827435731888, | |
| "epoch": 1.348314606741573, | |
| "grad_norm": 0.03338128328323364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362961888313293, | |
| "mean_token_accuracy": 0.7824237793684006, | |
| "num_tokens": 5867600.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.549625426530838, | |
| "epoch": 1.352059925093633, | |
| "grad_norm": 0.032118137925863266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464169979095459, | |
| "mean_token_accuracy": 0.779940128326416, | |
| "num_tokens": 5883550.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5563124269247055, | |
| "epoch": 1.3558052434456929, | |
| "grad_norm": 0.028186708688735962, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525781512260437, | |
| "mean_token_accuracy": 0.7742565721273422, | |
| "num_tokens": 5900020.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5396654903888702, | |
| "epoch": 1.3595505617977528, | |
| "grad_norm": 0.03306869789958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485842227935791, | |
| "mean_token_accuracy": 0.7763185799121857, | |
| "num_tokens": 5916563.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5324016958475113, | |
| "epoch": 1.3632958801498127, | |
| "grad_norm": 0.030485033988952637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407555103302002, | |
| "mean_token_accuracy": 0.7805987000465393, | |
| "num_tokens": 5932915.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5415676534175873, | |
| "epoch": 1.3670411985018727, | |
| "grad_norm": 0.032210033386945724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420053601264954, | |
| "mean_token_accuracy": 0.7789227366447449, | |
| "num_tokens": 5949294.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5479710251092911, | |
| "epoch": 1.3707865168539326, | |
| "grad_norm": 0.030770668759942055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442653894424438, | |
| "mean_token_accuracy": 0.7809406220912933, | |
| "num_tokens": 5965688.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5611272603273392, | |
| "epoch": 1.3745318352059925, | |
| "grad_norm": 0.030032480135560036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458992719650269, | |
| "mean_token_accuracy": 0.7793887704610825, | |
| "num_tokens": 5982353.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5711783468723297, | |
| "epoch": 1.3782771535580525, | |
| "grad_norm": 0.030471278354525566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5689231157302856, | |
| "mean_token_accuracy": 0.7691554129123688, | |
| "num_tokens": 5998928.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5704734623432159, | |
| "epoch": 1.3820224719101124, | |
| "grad_norm": 0.0308744665235281, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704200267791748, | |
| "mean_token_accuracy": 0.7696904093027115, | |
| "num_tokens": 6015488.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.540970042347908, | |
| "epoch": 1.3857677902621723, | |
| "grad_norm": 0.029789667576551437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435522794723511, | |
| "mean_token_accuracy": 0.7803212404251099, | |
| "num_tokens": 6032273.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5323564112186432, | |
| "epoch": 1.3895131086142323, | |
| "grad_norm": 0.03373701870441437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415207147598267, | |
| "mean_token_accuracy": 0.7777475565671921, | |
| "num_tokens": 6048761.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5275064408779144, | |
| "epoch": 1.3932584269662922, | |
| "grad_norm": 0.03547370806336403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540917694568634, | |
| "mean_token_accuracy": 0.7795429080724716, | |
| "num_tokens": 6064848.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5497806072235107, | |
| "epoch": 1.3970037453183521, | |
| "grad_norm": 0.03201119974255562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552889347076416, | |
| "mean_token_accuracy": 0.7745427489280701, | |
| "num_tokens": 6081258.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5175323188304901, | |
| "epoch": 1.4007490636704119, | |
| "grad_norm": 0.03368834778666496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5198505520820618, | |
| "mean_token_accuracy": 0.7878732234239578, | |
| "num_tokens": 6097172.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5441398918628693, | |
| "epoch": 1.404494382022472, | |
| "grad_norm": 0.03139437735080719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445310473442078, | |
| "mean_token_accuracy": 0.780688688158989, | |
| "num_tokens": 6113446.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5468717068433762, | |
| "epoch": 1.4082397003745317, | |
| "grad_norm": 0.03169120475649834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426516532897949, | |
| "mean_token_accuracy": 0.776495024561882, | |
| "num_tokens": 6129738.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5554005056619644, | |
| "epoch": 1.4119850187265919, | |
| "grad_norm": 0.03649836778640747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584489703178406, | |
| "mean_token_accuracy": 0.7743981927633286, | |
| "num_tokens": 6146138.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.545359656214714, | |
| "epoch": 1.4157303370786516, | |
| "grad_norm": 0.0333530455827713, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547561526298523, | |
| "mean_token_accuracy": 0.7772817760705948, | |
| "num_tokens": 6162466.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5366268008947372, | |
| "epoch": 1.4194756554307117, | |
| "grad_norm": 0.0315176397562027, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370338559150696, | |
| "mean_token_accuracy": 0.7830789685249329, | |
| "num_tokens": 6178827.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5343760550022125, | |
| "epoch": 1.4232209737827715, | |
| "grad_norm": 0.03283468633890152, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403618812561035, | |
| "mean_token_accuracy": 0.7811573594808578, | |
| "num_tokens": 6195014.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5374447852373123, | |
| "epoch": 1.4269662921348314, | |
| "grad_norm": 0.03712209314107895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359081625938416, | |
| "mean_token_accuracy": 0.7824594676494598, | |
| "num_tokens": 6211204.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5647163391113281, | |
| "epoch": 1.4307116104868913, | |
| "grad_norm": 0.030612658709287643, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5665347576141357, | |
| "mean_token_accuracy": 0.7709782868623734, | |
| "num_tokens": 6227439.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5584586560726166, | |
| "epoch": 1.4344569288389513, | |
| "grad_norm": 0.03545604646205902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592620372772217, | |
| "mean_token_accuracy": 0.7708311080932617, | |
| "num_tokens": 6243909.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5563389509916306, | |
| "epoch": 1.4382022471910112, | |
| "grad_norm": 0.031707633286714554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574153065681458, | |
| "mean_token_accuracy": 0.7749636173248291, | |
| "num_tokens": 6260228.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5361679270863533, | |
| "epoch": 1.4419475655430711, | |
| "grad_norm": 0.030576881021261215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358593463897705, | |
| "mean_token_accuracy": 0.7815472632646561, | |
| "num_tokens": 6276438.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5404613763093948, | |
| "epoch": 1.445692883895131, | |
| "grad_norm": 0.0397074818611145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409061908721924, | |
| "mean_token_accuracy": 0.7812814116477966, | |
| "num_tokens": 6292854.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5539507865905762, | |
| "epoch": 1.449438202247191, | |
| "grad_norm": 0.027634674683213234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551899254322052, | |
| "mean_token_accuracy": 0.7763891369104385, | |
| "num_tokens": 6309146.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5406185388565063, | |
| "epoch": 1.453183520599251, | |
| "grad_norm": 0.03658418357372284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376873016357422, | |
| "mean_token_accuracy": 0.7802905589342117, | |
| "num_tokens": 6325371.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.5515788942575455, | |
| "epoch": 1.4569288389513109, | |
| "grad_norm": 0.029648393392562866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481655597686768, | |
| "mean_token_accuracy": 0.7753021568059921, | |
| "num_tokens": 6341504.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.5403069257736206, | |
| "epoch": 1.4606741573033708, | |
| "grad_norm": 0.0300885122269392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417286157608032, | |
| "mean_token_accuracy": 0.7805690169334412, | |
| "num_tokens": 6357574.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5320965051651001, | |
| "epoch": 1.4644194756554307, | |
| "grad_norm": 0.04233168438076973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542140543460846, | |
| "mean_token_accuracy": 0.7790813148021698, | |
| "num_tokens": 6373603.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5370313972234726, | |
| "epoch": 1.4681647940074907, | |
| "grad_norm": 0.03608033061027527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452749133110046, | |
| "mean_token_accuracy": 0.7784496247768402, | |
| "num_tokens": 6389874.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5391117632389069, | |
| "epoch": 1.4719101123595506, | |
| "grad_norm": 0.044416990131139755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447070598602295, | |
| "mean_token_accuracy": 0.7758590877056122, | |
| "num_tokens": 6406014.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5536396950483322, | |
| "epoch": 1.4756554307116105, | |
| "grad_norm": 0.028598185628652573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509454011917114, | |
| "mean_token_accuracy": 0.7754955738782883, | |
| "num_tokens": 6422526.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5600528717041016, | |
| "epoch": 1.4794007490636705, | |
| "grad_norm": 0.03587036579847336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511722564697266, | |
| "mean_token_accuracy": 0.7756818234920502, | |
| "num_tokens": 6438826.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5635561943054199, | |
| "epoch": 1.4831460674157304, | |
| "grad_norm": 0.04037458822131157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569745898246765, | |
| "mean_token_accuracy": 0.7768395692110062, | |
| "num_tokens": 6455392.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5546122640371323, | |
| "epoch": 1.4868913857677903, | |
| "grad_norm": 0.03193597123026848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528469085693359, | |
| "mean_token_accuracy": 0.7737569063901901, | |
| "num_tokens": 6471908.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.540926069021225, | |
| "epoch": 1.4906367041198503, | |
| "grad_norm": 0.03908224403858185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521141290664673, | |
| "mean_token_accuracy": 0.7775756865739822, | |
| "num_tokens": 6487958.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5474519431591034, | |
| "epoch": 1.49438202247191, | |
| "grad_norm": 0.04104601964354515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533535480499268, | |
| "mean_token_accuracy": 0.7748162597417831, | |
| "num_tokens": 6504634.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.5560764372348785, | |
| "epoch": 1.4981273408239701, | |
| "grad_norm": 0.0360972136259079, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614410042762756, | |
| "mean_token_accuracy": 0.770107239484787, | |
| "num_tokens": 6521072.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5673471540212631, | |
| "epoch": 1.5018726591760299, | |
| "grad_norm": 0.04004177823662758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589927434921265, | |
| "mean_token_accuracy": 0.7734557241201401, | |
| "num_tokens": 6537361.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5486087501049042, | |
| "epoch": 1.50561797752809, | |
| "grad_norm": 0.030557790771126747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393815040588379, | |
| "mean_token_accuracy": 0.7784638553857803, | |
| "num_tokens": 6553620.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5486248284578323, | |
| "epoch": 1.5093632958801497, | |
| "grad_norm": 0.03941396623849869, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509032011032104, | |
| "mean_token_accuracy": 0.7800426781177521, | |
| "num_tokens": 6569936.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.558304026722908, | |
| "epoch": 1.5131086142322099, | |
| "grad_norm": 0.03858976438641548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.566615104675293, | |
| "mean_token_accuracy": 0.7677357494831085, | |
| "num_tokens": 6586223.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5375211238861084, | |
| "epoch": 1.5168539325842696, | |
| "grad_norm": 0.0333857461810112, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546052873134613, | |
| "mean_token_accuracy": 0.779136061668396, | |
| "num_tokens": 6602626.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.545025646686554, | |
| "epoch": 1.5205992509363297, | |
| "grad_norm": 0.03882851079106331, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526992678642273, | |
| "mean_token_accuracy": 0.7757603526115417, | |
| "num_tokens": 6618970.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5616021603345871, | |
| "epoch": 1.5243445692883895, | |
| "grad_norm": 0.029704444110393524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5617290139198303, | |
| "mean_token_accuracy": 0.771888017654419, | |
| "num_tokens": 6635712.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5517143756151199, | |
| "epoch": 1.5280898876404494, | |
| "grad_norm": 0.029841486364603043, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455192923545837, | |
| "mean_token_accuracy": 0.7790273427963257, | |
| "num_tokens": 6652005.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5481491684913635, | |
| "epoch": 1.5318352059925093, | |
| "grad_norm": 0.03239016607403755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448024272918701, | |
| "mean_token_accuracy": 0.7801620662212372, | |
| "num_tokens": 6668365.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5385047048330307, | |
| "epoch": 1.5355805243445693, | |
| "grad_norm": 0.029611637815833092, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335633754730225, | |
| "mean_token_accuracy": 0.785701259970665, | |
| "num_tokens": 6684708.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.558298259973526, | |
| "epoch": 1.5393258426966292, | |
| "grad_norm": 0.030493013560771942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560066103935242, | |
| "mean_token_accuracy": 0.7725876718759537, | |
| "num_tokens": 6701142.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5395427197217941, | |
| "epoch": 1.5430711610486891, | |
| "grad_norm": 0.032578032463788986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449746251106262, | |
| "mean_token_accuracy": 0.7762585133314133, | |
| "num_tokens": 6717233.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5387013256549835, | |
| "epoch": 1.546816479400749, | |
| "grad_norm": 0.0333687961101532, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403171181678772, | |
| "mean_token_accuracy": 0.7810612767934799, | |
| "num_tokens": 6733228.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5673456788063049, | |
| "epoch": 1.550561797752809, | |
| "grad_norm": 0.036015916615724564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5735532641410828, | |
| "mean_token_accuracy": 0.7664827108383179, | |
| "num_tokens": 6749423.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5494605153799057, | |
| "epoch": 1.554307116104869, | |
| "grad_norm": 0.02719104290008545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493685007095337, | |
| "mean_token_accuracy": 0.776999905705452, | |
| "num_tokens": 6765893.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5593840181827545, | |
| "epoch": 1.5580524344569289, | |
| "grad_norm": 0.03425523266196251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553128719329834, | |
| "mean_token_accuracy": 0.7735365033149719, | |
| "num_tokens": 6782271.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5617495179176331, | |
| "epoch": 1.5617977528089888, | |
| "grad_norm": 0.032372213900089264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606021881103516, | |
| "mean_token_accuracy": 0.7721095532178879, | |
| "num_tokens": 6798813.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5550025552511215, | |
| "epoch": 1.5655430711610487, | |
| "grad_norm": 0.029182737693190575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564966201782227, | |
| "mean_token_accuracy": 0.7731625586748123, | |
| "num_tokens": 6815405.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5605382174253464, | |
| "epoch": 1.5692883895131087, | |
| "grad_norm": 0.030886612832546234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631057024002075, | |
| "mean_token_accuracy": 0.7716924250125885, | |
| "num_tokens": 6831974.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5414248704910278, | |
| "epoch": 1.5730337078651684, | |
| "grad_norm": 0.03267752379179001, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522453188896179, | |
| "mean_token_accuracy": 0.7731709033250809, | |
| "num_tokens": 6848314.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5514931678771973, | |
| "epoch": 1.5767790262172285, | |
| "grad_norm": 0.03168710321187973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525091886520386, | |
| "mean_token_accuracy": 0.7754202336072922, | |
| "num_tokens": 6864671.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5639499425888062, | |
| "epoch": 1.5805243445692883, | |
| "grad_norm": 0.032651759684085846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5697652697563171, | |
| "mean_token_accuracy": 0.7682019621133804, | |
| "num_tokens": 6881061.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5544054210186005, | |
| "epoch": 1.5842696629213484, | |
| "grad_norm": 0.03449453413486481, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507102012634277, | |
| "mean_token_accuracy": 0.775859922170639, | |
| "num_tokens": 6897314.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5711345225572586, | |
| "epoch": 1.5880149812734081, | |
| "grad_norm": 0.03847847133874893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5732009410858154, | |
| "mean_token_accuracy": 0.7667471021413803, | |
| "num_tokens": 6913609.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5389959663152695, | |
| "epoch": 1.5917602996254683, | |
| "grad_norm": 0.03514353558421135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444454550743103, | |
| "mean_token_accuracy": 0.7799976915121078, | |
| "num_tokens": 6929936.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5668403804302216, | |
| "epoch": 1.595505617977528, | |
| "grad_norm": 0.035787779837846756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658587217330933, | |
| "mean_token_accuracy": 0.7714453637599945, | |
| "num_tokens": 6946824.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.5508380085229874, | |
| "epoch": 1.5992509363295881, | |
| "grad_norm": 0.03445902094244957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547541975975037, | |
| "mean_token_accuracy": 0.7770363837480545, | |
| "num_tokens": 6962968.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5622916221618652, | |
| "epoch": 1.6029962546816479, | |
| "grad_norm": 0.033641569316387177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611415505409241, | |
| "mean_token_accuracy": 0.7717165648937225, | |
| "num_tokens": 6979281.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5456431210041046, | |
| "epoch": 1.606741573033708, | |
| "grad_norm": 0.030943863093852997, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433369278907776, | |
| "mean_token_accuracy": 0.77703957259655, | |
| "num_tokens": 6995448.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5349363088607788, | |
| "epoch": 1.6104868913857677, | |
| "grad_norm": 0.029584866017103195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.528792142868042, | |
| "mean_token_accuracy": 0.7852742522954941, | |
| "num_tokens": 7011578.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.52534219622612, | |
| "epoch": 1.6142322097378277, | |
| "grad_norm": 0.031122464686632156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248501300811768, | |
| "mean_token_accuracy": 0.7855943292379379, | |
| "num_tokens": 7027819.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5471996814012527, | |
| "epoch": 1.6179775280898876, | |
| "grad_norm": 0.03317458927631378, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547217726707458, | |
| "mean_token_accuracy": 0.776124969124794, | |
| "num_tokens": 7044215.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5501783192157745, | |
| "epoch": 1.6217228464419475, | |
| "grad_norm": 0.028514394536614418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524763464927673, | |
| "mean_token_accuracy": 0.773967519402504, | |
| "num_tokens": 7060557.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5516121089458466, | |
| "epoch": 1.6254681647940075, | |
| "grad_norm": 0.037680886685848236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547643899917603, | |
| "mean_token_accuracy": 0.7772052437067032, | |
| "num_tokens": 7076827.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5446216315031052, | |
| "epoch": 1.6292134831460674, | |
| "grad_norm": 0.025961318984627724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540472149848938, | |
| "mean_token_accuracy": 0.7827950567007065, | |
| "num_tokens": 7093240.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5542737692594528, | |
| "epoch": 1.6329588014981273, | |
| "grad_norm": 0.03385328873991966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622321963310242, | |
| "mean_token_accuracy": 0.7715137451887131, | |
| "num_tokens": 7109763.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5479970276355743, | |
| "epoch": 1.6367041198501873, | |
| "grad_norm": 0.027666250243782997, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450934767723083, | |
| "mean_token_accuracy": 0.7789344042539597, | |
| "num_tokens": 7125965.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5606249123811722, | |
| "epoch": 1.6404494382022472, | |
| "grad_norm": 0.028965814039111137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5618120431900024, | |
| "mean_token_accuracy": 0.7737310230731964, | |
| "num_tokens": 7142275.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5434140264987946, | |
| "epoch": 1.6441947565543071, | |
| "grad_norm": 0.03233455866575241, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448483824729919, | |
| "mean_token_accuracy": 0.7776681929826736, | |
| "num_tokens": 7158681.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5462686270475388, | |
| "epoch": 1.647940074906367, | |
| "grad_norm": 0.030159825459122658, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512958765029907, | |
| "mean_token_accuracy": 0.7788191735744476, | |
| "num_tokens": 7174999.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5655659884214401, | |
| "epoch": 1.651685393258427, | |
| "grad_norm": 0.0356375053524971, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5668036937713623, | |
| "mean_token_accuracy": 0.7672240734100342, | |
| "num_tokens": 7191451.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5439184606075287, | |
| "epoch": 1.655430711610487, | |
| "grad_norm": 0.03394126892089844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443013906478882, | |
| "mean_token_accuracy": 0.7794349491596222, | |
| "num_tokens": 7207657.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5462498217821121, | |
| "epoch": 1.6591760299625467, | |
| "grad_norm": 0.03115757368505001, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484351515769958, | |
| "mean_token_accuracy": 0.7759426087141037, | |
| "num_tokens": 7223926.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5479519367218018, | |
| "epoch": 1.6629213483146068, | |
| "grad_norm": 0.03686544671654701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487886071205139, | |
| "mean_token_accuracy": 0.7793583422899246, | |
| "num_tokens": 7239926.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5571880787611008, | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.029902130365371704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566808581352234, | |
| "mean_token_accuracy": 0.7738562673330307, | |
| "num_tokens": 7256365.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5606496781110764, | |
| "epoch": 1.6704119850187267, | |
| "grad_norm": 0.03581070154905319, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5646023750305176, | |
| "mean_token_accuracy": 0.7700021713972092, | |
| "num_tokens": 7272415.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5493645370006561, | |
| "epoch": 1.6741573033707864, | |
| "grad_norm": 0.034732386469841, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556433796882629, | |
| "mean_token_accuracy": 0.7724722027778625, | |
| "num_tokens": 7288442.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5454504191875458, | |
| "epoch": 1.6779026217228465, | |
| "grad_norm": 0.031994741410017014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455131530761719, | |
| "mean_token_accuracy": 0.7786727547645569, | |
| "num_tokens": 7304778.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5480805784463882, | |
| "epoch": 1.6816479400749063, | |
| "grad_norm": 0.029919426888227463, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464503765106201, | |
| "mean_token_accuracy": 0.7800304591655731, | |
| "num_tokens": 7320989.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5258940905332565, | |
| "epoch": 1.6853932584269664, | |
| "grad_norm": 0.032200053334236145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228010416030884, | |
| "mean_token_accuracy": 0.7870291918516159, | |
| "num_tokens": 7337145.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.545346587896347, | |
| "epoch": 1.6891385767790261, | |
| "grad_norm": 0.037810057401657104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497158765792847, | |
| "mean_token_accuracy": 0.7733957171440125, | |
| "num_tokens": 7353380.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5455152243375778, | |
| "epoch": 1.6928838951310863, | |
| "grad_norm": 0.036783650517463684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547383725643158, | |
| "mean_token_accuracy": 0.7792070508003235, | |
| "num_tokens": 7369718.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5610679686069489, | |
| "epoch": 1.696629213483146, | |
| "grad_norm": 0.032883401960134506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5691272616386414, | |
| "mean_token_accuracy": 0.7677329927682877, | |
| "num_tokens": 7385896.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5505604892969131, | |
| "epoch": 1.7003745318352061, | |
| "grad_norm": 0.03284638375043869, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511571168899536, | |
| "mean_token_accuracy": 0.7760978639125824, | |
| "num_tokens": 7402228.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5650221109390259, | |
| "epoch": 1.7041198501872659, | |
| "grad_norm": 0.02887006103992462, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633357763290405, | |
| "mean_token_accuracy": 0.7709190398454666, | |
| "num_tokens": 7418506.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5511359125375748, | |
| "epoch": 1.7078651685393258, | |
| "grad_norm": 0.02897547371685505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476655960083008, | |
| "mean_token_accuracy": 0.7766725867986679, | |
| "num_tokens": 7434993.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5589297413825989, | |
| "epoch": 1.7116104868913857, | |
| "grad_norm": 0.03913537412881851, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562713623046875, | |
| "mean_token_accuracy": 0.7716452181339264, | |
| "num_tokens": 7451420.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5587479770183563, | |
| "epoch": 1.7153558052434457, | |
| "grad_norm": 0.0281817764043808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552535057067871, | |
| "mean_token_accuracy": 0.7717525810003281, | |
| "num_tokens": 7467745.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5426507443189621, | |
| "epoch": 1.7191011235955056, | |
| "grad_norm": 0.03837720304727554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466030836105347, | |
| "mean_token_accuracy": 0.7787178158760071, | |
| "num_tokens": 7484044.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.548772931098938, | |
| "epoch": 1.7228464419475655, | |
| "grad_norm": 0.034067291766405106, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531357526779175, | |
| "mean_token_accuracy": 0.7748309075832367, | |
| "num_tokens": 7500332.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5564078390598297, | |
| "epoch": 1.7265917602996255, | |
| "grad_norm": 0.03204013407230377, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560243725776672, | |
| "mean_token_accuracy": 0.7740551978349686, | |
| "num_tokens": 7516660.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5405488759279251, | |
| "epoch": 1.7303370786516854, | |
| "grad_norm": 0.030630316585302353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395958423614502, | |
| "mean_token_accuracy": 0.7782745659351349, | |
| "num_tokens": 7532934.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5496814846992493, | |
| "epoch": 1.7340823970037453, | |
| "grad_norm": 0.03725660592317581, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496969223022461, | |
| "mean_token_accuracy": 0.7755606472492218, | |
| "num_tokens": 7549291.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5522442013025284, | |
| "epoch": 1.7378277153558053, | |
| "grad_norm": 0.039360832422971725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475296378135681, | |
| "mean_token_accuracy": 0.7740370631217957, | |
| "num_tokens": 7565370.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5205198004841805, | |
| "epoch": 1.7415730337078652, | |
| "grad_norm": 0.029320131987333298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181597471237183, | |
| "mean_token_accuracy": 0.789748415350914, | |
| "num_tokens": 7581731.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5322981476783752, | |
| "epoch": 1.7453183520599251, | |
| "grad_norm": 0.03633226826786995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413781404495239, | |
| "mean_token_accuracy": 0.7808037847280502, | |
| "num_tokens": 7597822.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.524602085351944, | |
| "epoch": 1.749063670411985, | |
| "grad_norm": 0.04402731731534004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532406210899353, | |
| "mean_token_accuracy": 0.7855067849159241, | |
| "num_tokens": 7613933.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.5708600282669067, | |
| "epoch": 1.7528089887640448, | |
| "grad_norm": 0.0357418954372406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5712512731552124, | |
| "mean_token_accuracy": 0.7683784365653992, | |
| "num_tokens": 7630331.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5579233318567276, | |
| "epoch": 1.756554307116105, | |
| "grad_norm": 0.15994992852210999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615707635879517, | |
| "mean_token_accuracy": 0.7749305069446564, | |
| "num_tokens": 7646666.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5672501176595688, | |
| "epoch": 1.7602996254681647, | |
| "grad_norm": 0.18223144114017487, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5922040939331055, | |
| "mean_token_accuracy": 0.767003208398819, | |
| "num_tokens": 7663024.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5853898674249649, | |
| "epoch": 1.7640449438202248, | |
| "grad_norm": 0.19322983920574188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5716003179550171, | |
| "mean_token_accuracy": 0.7706755697727203, | |
| "num_tokens": 7679445.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5652599781751633, | |
| "epoch": 1.7677902621722845, | |
| "grad_norm": 0.040028076618909836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545145869255066, | |
| "mean_token_accuracy": 0.7762533873319626, | |
| "num_tokens": 7695863.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5655337423086166, | |
| "epoch": 1.7715355805243447, | |
| "grad_norm": 0.03808818385004997, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5697377324104309, | |
| "mean_token_accuracy": 0.7698807120323181, | |
| "num_tokens": 7712117.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.531586229801178, | |
| "epoch": 1.7752808988764044, | |
| "grad_norm": 0.03700399026274681, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407450199127197, | |
| "mean_token_accuracy": 0.7823738306760788, | |
| "num_tokens": 7728324.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5400687605142593, | |
| "epoch": 1.7790262172284645, | |
| "grad_norm": 0.04493065923452377, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463284254074097, | |
| "mean_token_accuracy": 0.778341680765152, | |
| "num_tokens": 7744642.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5348718762397766, | |
| "epoch": 1.7827715355805243, | |
| "grad_norm": 0.032796818763017654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53885817527771, | |
| "mean_token_accuracy": 0.7798904478549957, | |
| "num_tokens": 7761144.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5612788051366806, | |
| "epoch": 1.7865168539325844, | |
| "grad_norm": 0.03454861417412758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585771799087524, | |
| "mean_token_accuracy": 0.7730214893817902, | |
| "num_tokens": 7777603.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5655092746019363, | |
| "epoch": 1.7902621722846441, | |
| "grad_norm": 0.04326882213354111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594231486320496, | |
| "mean_token_accuracy": 0.7714511156082153, | |
| "num_tokens": 7794017.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5740013867616653, | |
| "epoch": 1.7940074906367043, | |
| "grad_norm": 0.03586514666676521, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5665684342384338, | |
| "mean_token_accuracy": 0.7693835347890854, | |
| "num_tokens": 7810410.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5689022541046143, | |
| "epoch": 1.797752808988764, | |
| "grad_norm": 0.03453454375267029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640177130699158, | |
| "mean_token_accuracy": 0.7688567489385605, | |
| "num_tokens": 7826878.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5344455689191818, | |
| "epoch": 1.801498127340824, | |
| "grad_norm": 0.04154738038778305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412873029708862, | |
| "mean_token_accuracy": 0.7843961417675018, | |
| "num_tokens": 7842957.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5326808393001556, | |
| "epoch": 1.8052434456928839, | |
| "grad_norm": 0.03772249072790146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458777546882629, | |
| "mean_token_accuracy": 0.7775137424468994, | |
| "num_tokens": 7859243.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.552602618932724, | |
| "epoch": 1.8089887640449438, | |
| "grad_norm": 0.03419940546154976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563470721244812, | |
| "mean_token_accuracy": 0.7756804972887039, | |
| "num_tokens": 7875641.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5412130802869797, | |
| "epoch": 1.8127340823970037, | |
| "grad_norm": 0.033059973269701004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540538489818573, | |
| "mean_token_accuracy": 0.782319188117981, | |
| "num_tokens": 7891954.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5559896975755692, | |
| "epoch": 1.8164794007490637, | |
| "grad_norm": 0.03472665324807167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544817447662354, | |
| "mean_token_accuracy": 0.7753840684890747, | |
| "num_tokens": 7908283.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5695093274116516, | |
| "epoch": 1.8202247191011236, | |
| "grad_norm": 0.0319642499089241, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608171224594116, | |
| "mean_token_accuracy": 0.7743540853261948, | |
| "num_tokens": 7924627.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5412854105234146, | |
| "epoch": 1.8239700374531835, | |
| "grad_norm": 0.032578784972429276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386444330215454, | |
| "mean_token_accuracy": 0.7795344591140747, | |
| "num_tokens": 7940814.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5442286729812622, | |
| "epoch": 1.8277153558052435, | |
| "grad_norm": 0.03279658779501915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553512454032898, | |
| "mean_token_accuracy": 0.7744518220424652, | |
| "num_tokens": 7957133.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.544167771935463, | |
| "epoch": 1.8314606741573034, | |
| "grad_norm": 0.034980904310941696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495878458023071, | |
| "mean_token_accuracy": 0.7794477045536041, | |
| "num_tokens": 7973367.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5514913648366928, | |
| "epoch": 1.8352059925093633, | |
| "grad_norm": 0.0437743179500103, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581385493278503, | |
| "mean_token_accuracy": 0.7734484821557999, | |
| "num_tokens": 7989443.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.5721138119697571, | |
| "epoch": 1.8389513108614233, | |
| "grad_norm": 0.032419200986623764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644645094871521, | |
| "mean_token_accuracy": 0.7717173397541046, | |
| "num_tokens": 8005817.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5577604025602341, | |
| "epoch": 1.8426966292134832, | |
| "grad_norm": 0.04115711897611618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619987845420837, | |
| "mean_token_accuracy": 0.77156862616539, | |
| "num_tokens": 8022160.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5528861582279205, | |
| "epoch": 1.846441947565543, | |
| "grad_norm": 0.029432786628603935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476526618003845, | |
| "mean_token_accuracy": 0.7781069427728653, | |
| "num_tokens": 8038591.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5558982342481613, | |
| "epoch": 1.850187265917603, | |
| "grad_norm": 0.036472100764513016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545116662979126, | |
| "mean_token_accuracy": 0.776875764131546, | |
| "num_tokens": 8054879.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5589891523122787, | |
| "epoch": 1.8539325842696628, | |
| "grad_norm": 0.02796117588877678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532379746437073, | |
| "mean_token_accuracy": 0.7751499116420746, | |
| "num_tokens": 8071227.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5462375283241272, | |
| "epoch": 1.857677902621723, | |
| "grad_norm": 0.0307608712464571, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444692373275757, | |
| "mean_token_accuracy": 0.7788323760032654, | |
| "num_tokens": 8087424.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.562559187412262, | |
| "epoch": 1.8614232209737827, | |
| "grad_norm": 0.03130098804831505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5660312175750732, | |
| "mean_token_accuracy": 0.7673315852880478, | |
| "num_tokens": 8104163.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5469489693641663, | |
| "epoch": 1.8651685393258428, | |
| "grad_norm": 0.031797025352716446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592264533042908, | |
| "mean_token_accuracy": 0.7764750421047211, | |
| "num_tokens": 8120483.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.5529169142246246, | |
| "epoch": 1.8689138576779025, | |
| "grad_norm": 0.0395452156662941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562450885772705, | |
| "mean_token_accuracy": 0.7762233167886734, | |
| "num_tokens": 8136774.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5619923919439316, | |
| "epoch": 1.8726591760299627, | |
| "grad_norm": 0.03070960007607937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5671469569206238, | |
| "mean_token_accuracy": 0.7695633620023727, | |
| "num_tokens": 8152950.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.571450412273407, | |
| "epoch": 1.8764044943820224, | |
| "grad_norm": 0.03263135999441147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5684110522270203, | |
| "mean_token_accuracy": 0.7683538943529129, | |
| "num_tokens": 8169231.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5732105523347855, | |
| "epoch": 1.8801498127340825, | |
| "grad_norm": 0.04209841415286064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.571649968624115, | |
| "mean_token_accuracy": 0.7642921954393387, | |
| "num_tokens": 8185562.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5685284435749054, | |
| "epoch": 1.8838951310861423, | |
| "grad_norm": 0.03377389535307884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56586092710495, | |
| "mean_token_accuracy": 0.7697953432798386, | |
| "num_tokens": 8201808.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5590908825397491, | |
| "epoch": 1.8876404494382022, | |
| "grad_norm": 0.0385461188852787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578455924987793, | |
| "mean_token_accuracy": 0.7730644196271896, | |
| "num_tokens": 8217945.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5606498569250107, | |
| "epoch": 1.8913857677902621, | |
| "grad_norm": 0.03381400555372238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585749745368958, | |
| "mean_token_accuracy": 0.7752718329429626, | |
| "num_tokens": 8234181.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5511593520641327, | |
| "epoch": 1.895131086142322, | |
| "grad_norm": 0.04427889734506607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605770349502563, | |
| "mean_token_accuracy": 0.7708971202373505, | |
| "num_tokens": 8250412.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5558828562498093, | |
| "epoch": 1.898876404494382, | |
| "grad_norm": 0.032851386815309525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588455200195312, | |
| "mean_token_accuracy": 0.7729152590036392, | |
| "num_tokens": 8266940.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5533877611160278, | |
| "epoch": 1.902621722846442, | |
| "grad_norm": 0.034889817237854004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531287789344788, | |
| "mean_token_accuracy": 0.7766410559415817, | |
| "num_tokens": 8283192.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.55963134765625, | |
| "epoch": 1.9063670411985019, | |
| "grad_norm": 0.03460029140114784, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557897686958313, | |
| "mean_token_accuracy": 0.7736343890428543, | |
| "num_tokens": 8299357.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.5412601754069328, | |
| "epoch": 1.9101123595505618, | |
| "grad_norm": 0.032328344881534576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438541173934937, | |
| "mean_token_accuracy": 0.7753017991781235, | |
| "num_tokens": 8315841.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5540103167295456, | |
| "epoch": 1.9138576779026217, | |
| "grad_norm": 0.03002399578690529, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542548894882202, | |
| "mean_token_accuracy": 0.7737881243228912, | |
| "num_tokens": 8332181.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5422029197216034, | |
| "epoch": 1.9176029962546817, | |
| "grad_norm": 0.034409623593091965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453910231590271, | |
| "mean_token_accuracy": 0.7794903218746185, | |
| "num_tokens": 8348319.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.5566486120223999, | |
| "epoch": 1.9213483146067416, | |
| "grad_norm": 0.030252845957875252, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5601068735122681, | |
| "mean_token_accuracy": 0.7728803753852844, | |
| "num_tokens": 8364457.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5523079186677933, | |
| "epoch": 1.9250936329588015, | |
| "grad_norm": 0.02711205929517746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482505559921265, | |
| "mean_token_accuracy": 0.7751948684453964, | |
| "num_tokens": 8380923.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5604666918516159, | |
| "epoch": 1.9288389513108615, | |
| "grad_norm": 0.032180819660425186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568802356719971, | |
| "mean_token_accuracy": 0.7695084065198898, | |
| "num_tokens": 8397239.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5643311589956284, | |
| "epoch": 1.9325842696629212, | |
| "grad_norm": 0.03032456897199154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628493428230286, | |
| "mean_token_accuracy": 0.7717900723218918, | |
| "num_tokens": 8413791.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5468644499778748, | |
| "epoch": 1.9363295880149813, | |
| "grad_norm": 0.03036642260849476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469942688941956, | |
| "mean_token_accuracy": 0.7763982564210892, | |
| "num_tokens": 8429973.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.5639230608940125, | |
| "epoch": 1.940074906367041, | |
| "grad_norm": 0.03586732968688011, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5693802237510681, | |
| "mean_token_accuracy": 0.7674274742603302, | |
| "num_tokens": 8446632.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.552105188369751, | |
| "epoch": 1.9438202247191012, | |
| "grad_norm": 0.028923669829964638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536226630210876, | |
| "mean_token_accuracy": 0.7770767658948898, | |
| "num_tokens": 8462861.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.546203225851059, | |
| "epoch": 1.947565543071161, | |
| "grad_norm": 0.03517064452171326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486375689506531, | |
| "mean_token_accuracy": 0.7788794338703156, | |
| "num_tokens": 8479188.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5571713298559189, | |
| "epoch": 1.951310861423221, | |
| "grad_norm": 0.03267424926161766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605846047401428, | |
| "mean_token_accuracy": 0.7741213738918304, | |
| "num_tokens": 8495441.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5428985059261322, | |
| "epoch": 1.9550561797752808, | |
| "grad_norm": 0.03182944655418396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459189414978027, | |
| "mean_token_accuracy": 0.7793070673942566, | |
| "num_tokens": 8511788.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5454448312520981, | |
| "epoch": 1.958801498127341, | |
| "grad_norm": 0.033397775143384933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454107522964478, | |
| "mean_token_accuracy": 0.7772410660982132, | |
| "num_tokens": 8528152.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5469843745231628, | |
| "epoch": 1.9625468164794007, | |
| "grad_norm": 0.030805334448814392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417147874832153, | |
| "mean_token_accuracy": 0.7786692380905151, | |
| "num_tokens": 8544780.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5402656495571136, | |
| "epoch": 1.9662921348314608, | |
| "grad_norm": 0.030130336061120033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425636768341064, | |
| "mean_token_accuracy": 0.7805010080337524, | |
| "num_tokens": 8561035.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5509428530931473, | |
| "epoch": 1.9700374531835205, | |
| "grad_norm": 0.0316033698618412, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516440272331238, | |
| "mean_token_accuracy": 0.775515004992485, | |
| "num_tokens": 8577541.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5449865013360977, | |
| "epoch": 1.9737827715355807, | |
| "grad_norm": 0.03625763952732086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528845191001892, | |
| "mean_token_accuracy": 0.7754436731338501, | |
| "num_tokens": 8593925.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.563062384724617, | |
| "epoch": 1.9775280898876404, | |
| "grad_norm": 0.029838701710104942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591800808906555, | |
| "mean_token_accuracy": 0.7732478529214859, | |
| "num_tokens": 8610524.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5514681190252304, | |
| "epoch": 1.9812734082397003, | |
| "grad_norm": 0.03368176147341728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548831582069397, | |
| "mean_token_accuracy": 0.7749605923891068, | |
| "num_tokens": 8626872.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5520317405462265, | |
| "epoch": 1.9850187265917603, | |
| "grad_norm": 0.03429826721549034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514442324638367, | |
| "mean_token_accuracy": 0.7730523347854614, | |
| "num_tokens": 8642960.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5669658333063126, | |
| "epoch": 1.9887640449438202, | |
| "grad_norm": 0.0307292602956295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5723692178726196, | |
| "mean_token_accuracy": 0.7651190161705017, | |
| "num_tokens": 8659084.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5609945952892303, | |
| "epoch": 1.9925093632958801, | |
| "grad_norm": 0.036607109010219574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636897683143616, | |
| "mean_token_accuracy": 0.7701397836208344, | |
| "num_tokens": 8675587.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5549340695142746, | |
| "epoch": 1.99625468164794, | |
| "grad_norm": 0.03215758502483368, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516895651817322, | |
| "mean_token_accuracy": 0.7737619578838348, | |
| "num_tokens": 8691850.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5620461255311966, | |
| "epoch": 2.0, | |
| "grad_norm": 0.028028611093759537, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578765869140625, | |
| "mean_token_accuracy": 0.7716735005378723, | |
| "num_tokens": 8708236.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.557419016957283, | |
| "epoch": 2.0037453183520597, | |
| "grad_norm": 0.03629058599472046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479042530059814, | |
| "mean_token_accuracy": 0.7768302410840988, | |
| "num_tokens": 8724656.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5507587045431137, | |
| "epoch": 2.00749063670412, | |
| "grad_norm": 0.032850366085767746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528382062911987, | |
| "mean_token_accuracy": 0.7756710648536682, | |
| "num_tokens": 8741046.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5404622703790665, | |
| "epoch": 2.0112359550561796, | |
| "grad_norm": 0.031562913209199905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380600094795227, | |
| "mean_token_accuracy": 0.7781912684440613, | |
| "num_tokens": 8757535.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5316804945468903, | |
| "epoch": 2.0149812734082397, | |
| "grad_norm": 0.03351443260908127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359355807304382, | |
| "mean_token_accuracy": 0.7827723175287247, | |
| "num_tokens": 8773824.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5419723987579346, | |
| "epoch": 2.0187265917602994, | |
| "grad_norm": 0.03948935121297836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471257567405701, | |
| "mean_token_accuracy": 0.7790137678384781, | |
| "num_tokens": 8790095.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5343683362007141, | |
| "epoch": 2.0224719101123596, | |
| "grad_norm": 0.031161192804574966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309802889823914, | |
| "mean_token_accuracy": 0.7821521759033203, | |
| "num_tokens": 8806510.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5364920198917389, | |
| "epoch": 2.0262172284644193, | |
| "grad_norm": 0.03507857769727707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324068069458008, | |
| "mean_token_accuracy": 0.7870013862848282, | |
| "num_tokens": 8822654.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5483170747756958, | |
| "epoch": 2.0299625468164795, | |
| "grad_norm": 0.03222345933318138, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549699068069458, | |
| "mean_token_accuracy": 0.7751237750053406, | |
| "num_tokens": 8839285.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5425759255886078, | |
| "epoch": 2.033707865168539, | |
| "grad_norm": 0.03227977082133293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380892753601074, | |
| "mean_token_accuracy": 0.7839174568653107, | |
| "num_tokens": 8855507.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5272768065333366, | |
| "epoch": 2.0374531835205993, | |
| "grad_norm": 0.03487760201096535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265735387802124, | |
| "mean_token_accuracy": 0.7857347279787064, | |
| "num_tokens": 8871873.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5219558328390121, | |
| "epoch": 2.041198501872659, | |
| "grad_norm": 0.035983484238386154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337969660758972, | |
| "mean_token_accuracy": 0.7834839969873428, | |
| "num_tokens": 8887984.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.5376651287078857, | |
| "epoch": 2.044943820224719, | |
| "grad_norm": 0.038352932780981064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438427329063416, | |
| "mean_token_accuracy": 0.7784269452095032, | |
| "num_tokens": 8904216.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.5456122606992722, | |
| "epoch": 2.048689138576779, | |
| "grad_norm": 0.036168649792671204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431267023086548, | |
| "mean_token_accuracy": 0.7829999178647995, | |
| "num_tokens": 8920617.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.5304486304521561, | |
| "epoch": 2.052434456928839, | |
| "grad_norm": 0.03324899077415466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289336442947388, | |
| "mean_token_accuracy": 0.7849617451429367, | |
| "num_tokens": 8936835.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5275251343846321, | |
| "epoch": 2.056179775280899, | |
| "grad_norm": 0.03898227587342262, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530302882194519, | |
| "mean_token_accuracy": 0.7835600972175598, | |
| "num_tokens": 8953009.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5530034005641937, | |
| "epoch": 2.059925093632959, | |
| "grad_norm": 0.038006141781806946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494067072868347, | |
| "mean_token_accuracy": 0.7755949050188065, | |
| "num_tokens": 8969428.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5418991297483444, | |
| "epoch": 2.0636704119850187, | |
| "grad_norm": 0.03261435031890869, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322299003601074, | |
| "mean_token_accuracy": 0.7837673723697662, | |
| "num_tokens": 8985844.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5309967398643494, | |
| "epoch": 2.067415730337079, | |
| "grad_norm": 0.03797997906804085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291654467582703, | |
| "mean_token_accuracy": 0.7849747538566589, | |
| "num_tokens": 9002169.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5188492685556412, | |
| "epoch": 2.0711610486891385, | |
| "grad_norm": 0.038583919405937195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282660722732544, | |
| "mean_token_accuracy": 0.7870546579360962, | |
| "num_tokens": 9018570.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.534794494509697, | |
| "epoch": 2.0749063670411987, | |
| "grad_norm": 0.03449336439371109, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352678298950195, | |
| "mean_token_accuracy": 0.7845733165740967, | |
| "num_tokens": 9034788.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5308385342359543, | |
| "epoch": 2.0786516853932584, | |
| "grad_norm": 0.03845726326107979, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5325117111206055, | |
| "mean_token_accuracy": 0.7835551649332047, | |
| "num_tokens": 9051109.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.5309025943279266, | |
| "epoch": 2.0823970037453186, | |
| "grad_norm": 0.03809129074215889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253363847732544, | |
| "mean_token_accuracy": 0.7868698537349701, | |
| "num_tokens": 9067268.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5575416088104248, | |
| "epoch": 2.0861423220973783, | |
| "grad_norm": 0.034367915242910385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523205995559692, | |
| "mean_token_accuracy": 0.7749448716640472, | |
| "num_tokens": 9083891.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.53434719145298, | |
| "epoch": 2.0898876404494384, | |
| "grad_norm": 0.03826329484581947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409042835235596, | |
| "mean_token_accuracy": 0.7785179018974304, | |
| "num_tokens": 9100370.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5194257721304893, | |
| "epoch": 2.093632958801498, | |
| "grad_norm": 0.03882572054862976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238875150680542, | |
| "mean_token_accuracy": 0.7858750522136688, | |
| "num_tokens": 9116506.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5331729799509048, | |
| "epoch": 2.097378277153558, | |
| "grad_norm": 0.045005545020103455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285134315490723, | |
| "mean_token_accuracy": 0.7852654755115509, | |
| "num_tokens": 9132871.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5405212640762329, | |
| "epoch": 2.101123595505618, | |
| "grad_norm": 0.04780491814017296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461173057556152, | |
| "mean_token_accuracy": 0.7770982980728149, | |
| "num_tokens": 9149174.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.5288062691688538, | |
| "epoch": 2.1048689138576777, | |
| "grad_norm": 0.04940470680594444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337265729904175, | |
| "mean_token_accuracy": 0.7846069186925888, | |
| "num_tokens": 9165316.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.531680166721344, | |
| "epoch": 2.108614232209738, | |
| "grad_norm": 0.05061717331409454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271866321563721, | |
| "mean_token_accuracy": 0.7854976505041122, | |
| "num_tokens": 9181482.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5314291417598724, | |
| "epoch": 2.1123595505617976, | |
| "grad_norm": 0.0397643968462944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271567702293396, | |
| "mean_token_accuracy": 0.7851341366767883, | |
| "num_tokens": 9197662.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.5252021998167038, | |
| "epoch": 2.1161048689138577, | |
| "grad_norm": 0.041956719011068344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281031727790833, | |
| "mean_token_accuracy": 0.7877316772937775, | |
| "num_tokens": 9214001.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5378998965024948, | |
| "epoch": 2.1198501872659175, | |
| "grad_norm": 0.03963020071387291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432679653167725, | |
| "mean_token_accuracy": 0.7765485197305679, | |
| "num_tokens": 9230298.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5449769049882889, | |
| "epoch": 2.1235955056179776, | |
| "grad_norm": 0.04862145707011223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481102466583252, | |
| "mean_token_accuracy": 0.7771643400192261, | |
| "num_tokens": 9246648.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5432566553354263, | |
| "epoch": 2.1273408239700373, | |
| "grad_norm": 0.03826707601547241, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354676246643066, | |
| "mean_token_accuracy": 0.7808031290769577, | |
| "num_tokens": 9263059.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5395092964172363, | |
| "epoch": 2.1310861423220975, | |
| "grad_norm": 0.04806908592581749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348396897315979, | |
| "mean_token_accuracy": 0.7838325351476669, | |
| "num_tokens": 9279690.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.512074276804924, | |
| "epoch": 2.134831460674157, | |
| "grad_norm": 0.034932930022478104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5059640407562256, | |
| "mean_token_accuracy": 0.7954477220773697, | |
| "num_tokens": 9296053.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5317389219999313, | |
| "epoch": 2.1385767790262173, | |
| "grad_norm": 0.054850984364748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419769287109375, | |
| "mean_token_accuracy": 0.7804836332798004, | |
| "num_tokens": 9312250.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.523776650428772, | |
| "epoch": 2.142322097378277, | |
| "grad_norm": 0.03885575383901596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337730050086975, | |
| "mean_token_accuracy": 0.7821401208639145, | |
| "num_tokens": 9328588.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5306317359209061, | |
| "epoch": 2.146067415730337, | |
| "grad_norm": 0.04031698405742645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285602807998657, | |
| "mean_token_accuracy": 0.7860189080238342, | |
| "num_tokens": 9344771.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5253511220216751, | |
| "epoch": 2.149812734082397, | |
| "grad_norm": 0.03704000264406204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.519854724407196, | |
| "mean_token_accuracy": 0.7907343953847885, | |
| "num_tokens": 9360913.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5498696267604828, | |
| "epoch": 2.153558052434457, | |
| "grad_norm": 0.03690071031451225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417680144309998, | |
| "mean_token_accuracy": 0.7790531069040298, | |
| "num_tokens": 9377532.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5402537435293198, | |
| "epoch": 2.157303370786517, | |
| "grad_norm": 0.0378306582570076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541071891784668, | |
| "mean_token_accuracy": 0.7788532823324203, | |
| "num_tokens": 9393830.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5282108932733536, | |
| "epoch": 2.161048689138577, | |
| "grad_norm": 0.04091333597898483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348851084709167, | |
| "mean_token_accuracy": 0.7821558713912964, | |
| "num_tokens": 9410274.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5303814560174942, | |
| "epoch": 2.1647940074906367, | |
| "grad_norm": 0.03591841831803322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331617593765259, | |
| "mean_token_accuracy": 0.7818120270967484, | |
| "num_tokens": 9426511.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5272700041532516, | |
| "epoch": 2.168539325842697, | |
| "grad_norm": 0.03997735306620598, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334488153457642, | |
| "mean_token_accuracy": 0.7814541161060333, | |
| "num_tokens": 9442897.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5336402952671051, | |
| "epoch": 2.1722846441947565, | |
| "grad_norm": 0.0450415313243866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275048017501831, | |
| "mean_token_accuracy": 0.7864081561565399, | |
| "num_tokens": 9459023.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.538782149553299, | |
| "epoch": 2.1760299625468167, | |
| "grad_norm": 0.03600127249956131, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313720107078552, | |
| "mean_token_accuracy": 0.7847412079572678, | |
| "num_tokens": 9475337.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5273982435464859, | |
| "epoch": 2.1797752808988764, | |
| "grad_norm": 0.04744241386651993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319021344184875, | |
| "mean_token_accuracy": 0.7850695848464966, | |
| "num_tokens": 9491529.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.5370319783687592, | |
| "epoch": 2.1835205992509366, | |
| "grad_norm": 0.035024821758270264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342311859130859, | |
| "mean_token_accuracy": 0.7830409854650497, | |
| "num_tokens": 9508099.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5350894033908844, | |
| "epoch": 2.1872659176029963, | |
| "grad_norm": 0.04598443582653999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383565425872803, | |
| "mean_token_accuracy": 0.7810914367437363, | |
| "num_tokens": 9524506.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5270723178982735, | |
| "epoch": 2.191011235955056, | |
| "grad_norm": 0.03489379957318306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261937379837036, | |
| "mean_token_accuracy": 0.7874008566141129, | |
| "num_tokens": 9540868.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5187418013811111, | |
| "epoch": 2.194756554307116, | |
| "grad_norm": 0.04006824642419815, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516140341758728, | |
| "mean_token_accuracy": 0.7876712679862976, | |
| "num_tokens": 9557109.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5397524982690811, | |
| "epoch": 2.198501872659176, | |
| "grad_norm": 0.037596385926008224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5337037444114685, | |
| "mean_token_accuracy": 0.7848425805568695, | |
| "num_tokens": 9573451.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.542935311794281, | |
| "epoch": 2.202247191011236, | |
| "grad_norm": 0.05163532868027687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548254668712616, | |
| "mean_token_accuracy": 0.7771319299936295, | |
| "num_tokens": 9589800.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.524966299533844, | |
| "epoch": 2.2059925093632957, | |
| "grad_norm": 0.04678061604499817, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537632405757904, | |
| "mean_token_accuracy": 0.7821167409420013, | |
| "num_tokens": 9606180.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5223182588815689, | |
| "epoch": 2.209737827715356, | |
| "grad_norm": 0.04918593540787697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256946086883545, | |
| "mean_token_accuracy": 0.7862184792757034, | |
| "num_tokens": 9622319.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.545245572924614, | |
| "epoch": 2.2134831460674156, | |
| "grad_norm": 0.044536106288433075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387803316116333, | |
| "mean_token_accuracy": 0.7820178419351578, | |
| "num_tokens": 9638605.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5572000294923782, | |
| "epoch": 2.2172284644194757, | |
| "grad_norm": 0.04941220581531525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500818490982056, | |
| "mean_token_accuracy": 0.7780845314264297, | |
| "num_tokens": 9655041.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.524405911564827, | |
| "epoch": 2.2209737827715355, | |
| "grad_norm": 0.04783201217651367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203397870063782, | |
| "mean_token_accuracy": 0.7880013734102249, | |
| "num_tokens": 9671239.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5252467542886734, | |
| "epoch": 2.2247191011235956, | |
| "grad_norm": 0.04301263764500618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267080664634705, | |
| "mean_token_accuracy": 0.7888626754283905, | |
| "num_tokens": 9687363.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.53339484333992, | |
| "epoch": 2.2284644194756553, | |
| "grad_norm": 0.05318563058972359, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481151342391968, | |
| "mean_token_accuracy": 0.7762688100337982, | |
| "num_tokens": 9703829.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5450247228145599, | |
| "epoch": 2.2322097378277155, | |
| "grad_norm": 0.03796645253896713, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463745594024658, | |
| "mean_token_accuracy": 0.7799876779317856, | |
| "num_tokens": 9720055.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5355545580387115, | |
| "epoch": 2.235955056179775, | |
| "grad_norm": 0.04619521647691727, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383350253105164, | |
| "mean_token_accuracy": 0.7803421318531036, | |
| "num_tokens": 9736065.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5393659174442291, | |
| "epoch": 2.2397003745318353, | |
| "grad_norm": 0.04189852997660637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408390760421753, | |
| "mean_token_accuracy": 0.7799636572599411, | |
| "num_tokens": 9752285.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5505337119102478, | |
| "epoch": 2.243445692883895, | |
| "grad_norm": 0.04415363445878029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492491722106934, | |
| "mean_token_accuracy": 0.7789665758609772, | |
| "num_tokens": 9768797.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5322769433259964, | |
| "epoch": 2.247191011235955, | |
| "grad_norm": 0.0446348674595356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362676978111267, | |
| "mean_token_accuracy": 0.7827903628349304, | |
| "num_tokens": 9785259.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5283399671316147, | |
| "epoch": 2.250936329588015, | |
| "grad_norm": 0.04350518435239792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263485312461853, | |
| "mean_token_accuracy": 0.7854094952344894, | |
| "num_tokens": 9801683.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.5155128389596939, | |
| "epoch": 2.254681647940075, | |
| "grad_norm": 0.049416691064834595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274794101715088, | |
| "mean_token_accuracy": 0.7866163551807404, | |
| "num_tokens": 9817897.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.555690124630928, | |
| "epoch": 2.258426966292135, | |
| "grad_norm": 0.042244087904691696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587432384490967, | |
| "mean_token_accuracy": 0.7742861956357956, | |
| "num_tokens": 9834109.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5449231714010239, | |
| "epoch": 2.262172284644195, | |
| "grad_norm": 0.04214772582054138, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424601435661316, | |
| "mean_token_accuracy": 0.7795074135065079, | |
| "num_tokens": 9850508.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.551129087805748, | |
| "epoch": 2.2659176029962547, | |
| "grad_norm": 0.04242361709475517, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350391268730164, | |
| "mean_token_accuracy": 0.7817512005567551, | |
| "num_tokens": 9866973.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5557906329631805, | |
| "epoch": 2.2696629213483144, | |
| "grad_norm": 0.04337119311094284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464892387390137, | |
| "mean_token_accuracy": 0.7796575874090195, | |
| "num_tokens": 9883567.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5241350680589676, | |
| "epoch": 2.2734082397003745, | |
| "grad_norm": 0.04597577825188637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339911580085754, | |
| "mean_token_accuracy": 0.784000501036644, | |
| "num_tokens": 9899884.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5317652076482773, | |
| "epoch": 2.2771535580524347, | |
| "grad_norm": 0.06419555842876434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507545471191406, | |
| "mean_token_accuracy": 0.7757140696048737, | |
| "num_tokens": 9916225.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.520916298031807, | |
| "epoch": 2.2808988764044944, | |
| "grad_norm": 0.0413593053817749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282008051872253, | |
| "mean_token_accuracy": 0.7836293429136276, | |
| "num_tokens": 9932137.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.550976499915123, | |
| "epoch": 2.284644194756554, | |
| "grad_norm": 0.04407277703285217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476412177085876, | |
| "mean_token_accuracy": 0.7784940898418427, | |
| "num_tokens": 9948364.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5534344464540482, | |
| "epoch": 2.2883895131086143, | |
| "grad_norm": 0.036215297877788544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448459386825562, | |
| "mean_token_accuracy": 0.7809607535600662, | |
| "num_tokens": 9964781.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.540510505437851, | |
| "epoch": 2.292134831460674, | |
| "grad_norm": 0.037168748676776886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290323495864868, | |
| "mean_token_accuracy": 0.7844896763563156, | |
| "num_tokens": 9980949.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.537270799279213, | |
| "epoch": 2.295880149812734, | |
| "grad_norm": 0.0456305667757988, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368558764457703, | |
| "mean_token_accuracy": 0.781862810254097, | |
| "num_tokens": 9997181.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.529745414853096, | |
| "epoch": 2.299625468164794, | |
| "grad_norm": 0.04219827800989151, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287020206451416, | |
| "mean_token_accuracy": 0.7848487794399261, | |
| "num_tokens": 10013303.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5297169536352158, | |
| "epoch": 2.303370786516854, | |
| "grad_norm": 0.05070658028125763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422332286834717, | |
| "mean_token_accuracy": 0.7800150513648987, | |
| "num_tokens": 10029569.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5271121859550476, | |
| "epoch": 2.3071161048689137, | |
| "grad_norm": 0.04743409901857376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323826670646667, | |
| "mean_token_accuracy": 0.7835269123315811, | |
| "num_tokens": 10045920.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5429159998893738, | |
| "epoch": 2.310861423220974, | |
| "grad_norm": 0.04348791018128395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469599962234497, | |
| "mean_token_accuracy": 0.777765229344368, | |
| "num_tokens": 10062068.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5268895328044891, | |
| "epoch": 2.3146067415730336, | |
| "grad_norm": 0.046540766954422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318824052810669, | |
| "mean_token_accuracy": 0.784139409661293, | |
| "num_tokens": 10078035.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5406851470470428, | |
| "epoch": 2.3183520599250937, | |
| "grad_norm": 0.03879360482096672, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327763557434082, | |
| "mean_token_accuracy": 0.7838515788316727, | |
| "num_tokens": 10094069.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5550850629806519, | |
| "epoch": 2.3220973782771535, | |
| "grad_norm": 0.04021632671356201, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544082760810852, | |
| "mean_token_accuracy": 0.7794292271137238, | |
| "num_tokens": 10110562.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5633902698755264, | |
| "epoch": 2.3258426966292136, | |
| "grad_norm": 0.03872428461909294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591956973075867, | |
| "mean_token_accuracy": 0.7731619328260422, | |
| "num_tokens": 10127313.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.526028499007225, | |
| "epoch": 2.3295880149812733, | |
| "grad_norm": 0.04169732704758644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296715497970581, | |
| "mean_token_accuracy": 0.7846156656742096, | |
| "num_tokens": 10143539.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5621512830257416, | |
| "epoch": 2.3333333333333335, | |
| "grad_norm": 0.03567031770944595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641921758651733, | |
| "mean_token_accuracy": 0.7724113464355469, | |
| "num_tokens": 10159890.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5621916353702545, | |
| "epoch": 2.337078651685393, | |
| "grad_norm": 0.044719185680150986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658475756645203, | |
| "mean_token_accuracy": 0.768171489238739, | |
| "num_tokens": 10176303.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5397062003612518, | |
| "epoch": 2.3408239700374533, | |
| "grad_norm": 0.03938845917582512, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410289168357849, | |
| "mean_token_accuracy": 0.7816459834575653, | |
| "num_tokens": 10192725.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.5308454632759094, | |
| "epoch": 2.344569288389513, | |
| "grad_norm": 0.0393369197845459, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327979326248169, | |
| "mean_token_accuracy": 0.7836434692144394, | |
| "num_tokens": 10208900.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5351555794477463, | |
| "epoch": 2.348314606741573, | |
| "grad_norm": 0.044483788311481476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537283182144165, | |
| "mean_token_accuracy": 0.784860372543335, | |
| "num_tokens": 10224853.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5380195677280426, | |
| "epoch": 2.352059925093633, | |
| "grad_norm": 0.04018259420990944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401010513305664, | |
| "mean_token_accuracy": 0.7777950018644333, | |
| "num_tokens": 10241181.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5319711565971375, | |
| "epoch": 2.355805243445693, | |
| "grad_norm": 0.052694015204906464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327081680297852, | |
| "mean_token_accuracy": 0.7857355177402496, | |
| "num_tokens": 10257569.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5219532996416092, | |
| "epoch": 2.359550561797753, | |
| "grad_norm": 0.0513097383081913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344624519348145, | |
| "mean_token_accuracy": 0.781092032790184, | |
| "num_tokens": 10273502.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5303360670804977, | |
| "epoch": 2.3632958801498125, | |
| "grad_norm": 0.05031297355890274, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381285548210144, | |
| "mean_token_accuracy": 0.7818425595760345, | |
| "num_tokens": 10289765.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5247592329978943, | |
| "epoch": 2.3670411985018727, | |
| "grad_norm": 0.040263328701257706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220550298690796, | |
| "mean_token_accuracy": 0.786396861076355, | |
| "num_tokens": 10306027.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5546284765005112, | |
| "epoch": 2.370786516853933, | |
| "grad_norm": 0.04438352584838867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477085113525391, | |
| "mean_token_accuracy": 0.7770822197198868, | |
| "num_tokens": 10322169.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5496452152729034, | |
| "epoch": 2.3745318352059925, | |
| "grad_norm": 0.048432301729917526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438807606697083, | |
| "mean_token_accuracy": 0.780827596783638, | |
| "num_tokens": 10338568.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5297926962375641, | |
| "epoch": 2.3782771535580522, | |
| "grad_norm": 0.03634348511695862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239929556846619, | |
| "mean_token_accuracy": 0.7896489948034286, | |
| "num_tokens": 10354708.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5366943925619125, | |
| "epoch": 2.3820224719101124, | |
| "grad_norm": 0.051037952303886414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460379123687744, | |
| "mean_token_accuracy": 0.7777325063943863, | |
| "num_tokens": 10371358.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5219292491674423, | |
| "epoch": 2.385767790262172, | |
| "grad_norm": 0.03863009437918663, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266265273094177, | |
| "mean_token_accuracy": 0.7879810929298401, | |
| "num_tokens": 10387500.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5288277566432953, | |
| "epoch": 2.3895131086142323, | |
| "grad_norm": 0.05099929869174957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307456851005554, | |
| "mean_token_accuracy": 0.7841700166463852, | |
| "num_tokens": 10404042.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5441994965076447, | |
| "epoch": 2.393258426966292, | |
| "grad_norm": 0.03832423314452171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406984090805054, | |
| "mean_token_accuracy": 0.7822638154029846, | |
| "num_tokens": 10420308.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5474298596382141, | |
| "epoch": 2.397003745318352, | |
| "grad_norm": 0.03593610227108002, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448755025863647, | |
| "mean_token_accuracy": 0.7769681811332703, | |
| "num_tokens": 10436473.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5544268637895584, | |
| "epoch": 2.400749063670412, | |
| "grad_norm": 0.05683998391032219, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575302839279175, | |
| "mean_token_accuracy": 0.7728745937347412, | |
| "num_tokens": 10453006.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.5459371656179428, | |
| "epoch": 2.404494382022472, | |
| "grad_norm": 0.041604217141866684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482038855552673, | |
| "mean_token_accuracy": 0.7801420837640762, | |
| "num_tokens": 10469281.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5380865782499313, | |
| "epoch": 2.4082397003745317, | |
| "grad_norm": 0.05113884434103966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394017696380615, | |
| "mean_token_accuracy": 0.7834807485342026, | |
| "num_tokens": 10485666.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.549991711974144, | |
| "epoch": 2.411985018726592, | |
| "grad_norm": 0.03647167235612869, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553663969039917, | |
| "mean_token_accuracy": 0.774835467338562, | |
| "num_tokens": 10501890.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.5480955541133881, | |
| "epoch": 2.4157303370786516, | |
| "grad_norm": 0.04493939131498337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466475486755371, | |
| "mean_token_accuracy": 0.7790014296770096, | |
| "num_tokens": 10518311.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5469405502080917, | |
| "epoch": 2.4194756554307117, | |
| "grad_norm": 0.040811046957969666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483651161193848, | |
| "mean_token_accuracy": 0.7788845151662827, | |
| "num_tokens": 10534519.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.542740598320961, | |
| "epoch": 2.4232209737827715, | |
| "grad_norm": 0.045434851199388504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396543741226196, | |
| "mean_token_accuracy": 0.7790694683790207, | |
| "num_tokens": 10550595.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.535121500492096, | |
| "epoch": 2.4269662921348316, | |
| "grad_norm": 0.04115886241197586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374845266342163, | |
| "mean_token_accuracy": 0.7803627252578735, | |
| "num_tokens": 10566917.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5375159233808517, | |
| "epoch": 2.4307116104868913, | |
| "grad_norm": 0.04332772269845009, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381888151168823, | |
| "mean_token_accuracy": 0.7793711423873901, | |
| "num_tokens": 10583313.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5432725697755814, | |
| "epoch": 2.4344569288389515, | |
| "grad_norm": 0.041510697454214096, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448310375213623, | |
| "mean_token_accuracy": 0.7758618593215942, | |
| "num_tokens": 10599510.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5411451011896133, | |
| "epoch": 2.438202247191011, | |
| "grad_norm": 0.04265889525413513, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466779470443726, | |
| "mean_token_accuracy": 0.7779202163219452, | |
| "num_tokens": 10615799.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.535615861415863, | |
| "epoch": 2.4419475655430714, | |
| "grad_norm": 0.04081408306956291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539250373840332, | |
| "mean_token_accuracy": 0.7790500521659851, | |
| "num_tokens": 10632054.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5231917202472687, | |
| "epoch": 2.445692883895131, | |
| "grad_norm": 0.037281572818756104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5242350101470947, | |
| "mean_token_accuracy": 0.7875235080718994, | |
| "num_tokens": 10648293.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5311395078897476, | |
| "epoch": 2.449438202247191, | |
| "grad_norm": 0.04048464447259903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264798402786255, | |
| "mean_token_accuracy": 0.7850567251443863, | |
| "num_tokens": 10664249.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5295854657888412, | |
| "epoch": 2.453183520599251, | |
| "grad_norm": 0.042382705956697464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322737097740173, | |
| "mean_token_accuracy": 0.7859133034944534, | |
| "num_tokens": 10680711.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5250136256217957, | |
| "epoch": 2.4569288389513106, | |
| "grad_norm": 0.047354746609926224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524110734462738, | |
| "mean_token_accuracy": 0.7874706089496613, | |
| "num_tokens": 10696903.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5428455919027328, | |
| "epoch": 2.460674157303371, | |
| "grad_norm": 0.04214261844754219, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400563478469849, | |
| "mean_token_accuracy": 0.7825742065906525, | |
| "num_tokens": 10713018.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5570447146892548, | |
| "epoch": 2.464419475655431, | |
| "grad_norm": 0.04198653623461723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468944907188416, | |
| "mean_token_accuracy": 0.7801797240972519, | |
| "num_tokens": 10729583.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5350753366947174, | |
| "epoch": 2.4681647940074907, | |
| "grad_norm": 0.03751063346862793, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351656675338745, | |
| "mean_token_accuracy": 0.7814910113811493, | |
| "num_tokens": 10746077.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5235352218151093, | |
| "epoch": 2.4719101123595504, | |
| "grad_norm": 0.040084533393383026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531356692314148, | |
| "mean_token_accuracy": 0.7839406430721283, | |
| "num_tokens": 10762311.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5389134883880615, | |
| "epoch": 2.4756554307116105, | |
| "grad_norm": 0.05371229350566864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532786250114441, | |
| "mean_token_accuracy": 0.7754277139902115, | |
| "num_tokens": 10778652.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5187595188617706, | |
| "epoch": 2.4794007490636703, | |
| "grad_norm": 0.03975149244070053, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5151571035385132, | |
| "mean_token_accuracy": 0.7930901050567627, | |
| "num_tokens": 10794746.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5426436811685562, | |
| "epoch": 2.4831460674157304, | |
| "grad_norm": 0.03997328504920006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403225421905518, | |
| "mean_token_accuracy": 0.7798904031515121, | |
| "num_tokens": 10811033.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5267360359430313, | |
| "epoch": 2.48689138576779, | |
| "grad_norm": 0.043838318437337875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526395320892334, | |
| "mean_token_accuracy": 0.7879899889230728, | |
| "num_tokens": 10827129.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5509849190711975, | |
| "epoch": 2.4906367041198503, | |
| "grad_norm": 0.037469275295734406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411713719367981, | |
| "mean_token_accuracy": 0.7808174937963486, | |
| "num_tokens": 10843435.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.5449976474046707, | |
| "epoch": 2.49438202247191, | |
| "grad_norm": 0.05326893553137779, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467808842658997, | |
| "mean_token_accuracy": 0.7777620851993561, | |
| "num_tokens": 10859523.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5301449000835419, | |
| "epoch": 2.49812734082397, | |
| "grad_norm": 0.04426975175738335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359491109848022, | |
| "mean_token_accuracy": 0.7841154336929321, | |
| "num_tokens": 10875805.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5325603634119034, | |
| "epoch": 2.50187265917603, | |
| "grad_norm": 0.04210103675723076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365734100341797, | |
| "mean_token_accuracy": 0.782084509730339, | |
| "num_tokens": 10892315.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.5456321388483047, | |
| "epoch": 2.50561797752809, | |
| "grad_norm": 0.03740176558494568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444263219833374, | |
| "mean_token_accuracy": 0.7780910581350327, | |
| "num_tokens": 10908850.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5338556170463562, | |
| "epoch": 2.5093632958801497, | |
| "grad_norm": 0.04143742844462395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300049185752869, | |
| "mean_token_accuracy": 0.787174180150032, | |
| "num_tokens": 10925106.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.5515117049217224, | |
| "epoch": 2.51310861423221, | |
| "grad_norm": 0.03918025270104408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542182445526123, | |
| "mean_token_accuracy": 0.7806340008974075, | |
| "num_tokens": 10941543.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5549922436475754, | |
| "epoch": 2.5168539325842696, | |
| "grad_norm": 0.04009648784995079, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559307932853699, | |
| "mean_token_accuracy": 0.7725488841533661, | |
| "num_tokens": 10957817.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.539954200387001, | |
| "epoch": 2.5205992509363297, | |
| "grad_norm": 0.04543929174542427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482618808746338, | |
| "mean_token_accuracy": 0.7789554446935654, | |
| "num_tokens": 10974119.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5211862847208977, | |
| "epoch": 2.5243445692883895, | |
| "grad_norm": 0.0385296531021595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304719805717468, | |
| "mean_token_accuracy": 0.7863713204860687, | |
| "num_tokens": 10990490.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5547338724136353, | |
| "epoch": 2.5280898876404496, | |
| "grad_norm": 0.047472305595874786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596637725830078, | |
| "mean_token_accuracy": 0.771984726190567, | |
| "num_tokens": 11007150.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5423361957073212, | |
| "epoch": 2.5318352059925093, | |
| "grad_norm": 0.03454773128032684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381237268447876, | |
| "mean_token_accuracy": 0.7808732390403748, | |
| "num_tokens": 11023385.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5561535805463791, | |
| "epoch": 2.535580524344569, | |
| "grad_norm": 0.03847538307309151, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428014993667603, | |
| "mean_token_accuracy": 0.7786359935998917, | |
| "num_tokens": 11039943.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.544300451874733, | |
| "epoch": 2.539325842696629, | |
| "grad_norm": 0.04131785407662392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334832668304443, | |
| "mean_token_accuracy": 0.7851458042860031, | |
| "num_tokens": 11056430.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5311527848243713, | |
| "epoch": 2.5430711610486894, | |
| "grad_norm": 0.03951219096779823, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389747023582458, | |
| "mean_token_accuracy": 0.7813056856393814, | |
| "num_tokens": 11072776.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5290235728025436, | |
| "epoch": 2.546816479400749, | |
| "grad_norm": 0.0438111387193203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451354384422302, | |
| "mean_token_accuracy": 0.7777683436870575, | |
| "num_tokens": 11088991.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5291692391037941, | |
| "epoch": 2.550561797752809, | |
| "grad_norm": 0.039012420922517776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386437773704529, | |
| "mean_token_accuracy": 0.7806796282529831, | |
| "num_tokens": 11105235.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5217102319002151, | |
| "epoch": 2.554307116104869, | |
| "grad_norm": 0.04288937896490097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323805809020996, | |
| "mean_token_accuracy": 0.7835096120834351, | |
| "num_tokens": 11121333.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5252867043018341, | |
| "epoch": 2.558052434456929, | |
| "grad_norm": 0.0371013842523098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5191121101379395, | |
| "mean_token_accuracy": 0.7874591499567032, | |
| "num_tokens": 11137249.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5371126532554626, | |
| "epoch": 2.561797752808989, | |
| "grad_norm": 0.03830140084028244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264033675193787, | |
| "mean_token_accuracy": 0.7881854623556137, | |
| "num_tokens": 11153699.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5386142879724503, | |
| "epoch": 2.5655430711610485, | |
| "grad_norm": 0.035421278327703476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367159247398376, | |
| "mean_token_accuracy": 0.7793221473693848, | |
| "num_tokens": 11170196.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5483710169792175, | |
| "epoch": 2.5692883895131087, | |
| "grad_norm": 0.04288771376013756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506448149681091, | |
| "mean_token_accuracy": 0.7785434424877167, | |
| "num_tokens": 11186770.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.5472489446401596, | |
| "epoch": 2.5730337078651684, | |
| "grad_norm": 0.04111029580235481, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503485798835754, | |
| "mean_token_accuracy": 0.7765214443206787, | |
| "num_tokens": 11203191.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.523987427353859, | |
| "epoch": 2.5767790262172285, | |
| "grad_norm": 0.04419523477554321, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5254223942756653, | |
| "mean_token_accuracy": 0.7858942598104477, | |
| "num_tokens": 11219530.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5482724606990814, | |
| "epoch": 2.5805243445692883, | |
| "grad_norm": 0.0384112112224102, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467587113380432, | |
| "mean_token_accuracy": 0.7784788310527802, | |
| "num_tokens": 11236013.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.5410710424184799, | |
| "epoch": 2.5842696629213484, | |
| "grad_norm": 0.04548390954732895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361588001251221, | |
| "mean_token_accuracy": 0.7842984944581985, | |
| "num_tokens": 11252349.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.5413189381361008, | |
| "epoch": 2.588014981273408, | |
| "grad_norm": 0.03719467297196388, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372804403305054, | |
| "mean_token_accuracy": 0.7805864661931992, | |
| "num_tokens": 11268637.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5587044954299927, | |
| "epoch": 2.5917602996254683, | |
| "grad_norm": 0.03943658620119095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556570291519165, | |
| "mean_token_accuracy": 0.7712628394365311, | |
| "num_tokens": 11284973.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5220051556825638, | |
| "epoch": 2.595505617977528, | |
| "grad_norm": 0.04577549174427986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5235053896903992, | |
| "mean_token_accuracy": 0.7874717712402344, | |
| "num_tokens": 11301234.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5253131091594696, | |
| "epoch": 2.599250936329588, | |
| "grad_norm": 0.055322322994470596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539014458656311, | |
| "mean_token_accuracy": 0.7832715809345245, | |
| "num_tokens": 11317622.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.529956579208374, | |
| "epoch": 2.602996254681648, | |
| "grad_norm": 0.04555559530854225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358556509017944, | |
| "mean_token_accuracy": 0.7829083502292633, | |
| "num_tokens": 11334260.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5464101433753967, | |
| "epoch": 2.606741573033708, | |
| "grad_norm": 0.04112941771745682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475582480430603, | |
| "mean_token_accuracy": 0.780443549156189, | |
| "num_tokens": 11350510.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5290370956063271, | |
| "epoch": 2.6104868913857677, | |
| "grad_norm": 0.03645879402756691, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310324430465698, | |
| "mean_token_accuracy": 0.7870594263076782, | |
| "num_tokens": 11366960.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5584116280078888, | |
| "epoch": 2.6142322097378274, | |
| "grad_norm": 0.03702421113848686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555626153945923, | |
| "mean_token_accuracy": 0.7766379117965698, | |
| "num_tokens": 11383705.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.5311998277902603, | |
| "epoch": 2.6179775280898876, | |
| "grad_norm": 0.039902858436107635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329570770263672, | |
| "mean_token_accuracy": 0.7843590825796127, | |
| "num_tokens": 11399770.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5450660437345505, | |
| "epoch": 2.6217228464419478, | |
| "grad_norm": 0.040915053337812424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421010851860046, | |
| "mean_token_accuracy": 0.7778819799423218, | |
| "num_tokens": 11416143.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5301565080881119, | |
| "epoch": 2.6254681647940075, | |
| "grad_norm": 0.04668205976486206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542178750038147, | |
| "mean_token_accuracy": 0.7808790653944016, | |
| "num_tokens": 11432391.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5262583941221237, | |
| "epoch": 2.629213483146067, | |
| "grad_norm": 0.044074323028326035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.528965413570404, | |
| "mean_token_accuracy": 0.7844109088182449, | |
| "num_tokens": 11448787.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.5375534892082214, | |
| "epoch": 2.6329588014981273, | |
| "grad_norm": 0.046261075884103775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426000952720642, | |
| "mean_token_accuracy": 0.7772792726755142, | |
| "num_tokens": 11464834.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5281456708908081, | |
| "epoch": 2.6367041198501875, | |
| "grad_norm": 0.04074921831488609, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5224668979644775, | |
| "mean_token_accuracy": 0.7867994755506516, | |
| "num_tokens": 11481010.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5607274174690247, | |
| "epoch": 2.640449438202247, | |
| "grad_norm": 0.04910429194569588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609941482543945, | |
| "mean_token_accuracy": 0.7746099084615707, | |
| "num_tokens": 11497290.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5405243337154388, | |
| "epoch": 2.644194756554307, | |
| "grad_norm": 0.042494796216487885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373457670211792, | |
| "mean_token_accuracy": 0.7792738676071167, | |
| "num_tokens": 11513583.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.5465130656957626, | |
| "epoch": 2.647940074906367, | |
| "grad_norm": 0.051266275346279144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519081950187683, | |
| "mean_token_accuracy": 0.7757825553417206, | |
| "num_tokens": 11530012.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5431560575962067, | |
| "epoch": 2.6516853932584272, | |
| "grad_norm": 0.03533034771680832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461572408676147, | |
| "mean_token_accuracy": 0.7784530967473984, | |
| "num_tokens": 11546456.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5154132097959518, | |
| "epoch": 2.655430711610487, | |
| "grad_norm": 0.04611873999238014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5180613398551941, | |
| "mean_token_accuracy": 0.7888959646224976, | |
| "num_tokens": 11562883.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5712718665599823, | |
| "epoch": 2.6591760299625467, | |
| "grad_norm": 0.03861664608120918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5646159052848816, | |
| "mean_token_accuracy": 0.7710563838481903, | |
| "num_tokens": 11579392.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5572114437818527, | |
| "epoch": 2.662921348314607, | |
| "grad_norm": 0.04512866213917732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551059901714325, | |
| "mean_token_accuracy": 0.7758464813232422, | |
| "num_tokens": 11595937.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5336201041936874, | |
| "epoch": 2.6666666666666665, | |
| "grad_norm": 0.042362719774246216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347069501876831, | |
| "mean_token_accuracy": 0.7828791737556458, | |
| "num_tokens": 11612066.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5221793055534363, | |
| "epoch": 2.6704119850187267, | |
| "grad_norm": 0.04037570580840111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.523446261882782, | |
| "mean_token_accuracy": 0.7888407558202744, | |
| "num_tokens": 11628437.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5422008782625198, | |
| "epoch": 2.6741573033707864, | |
| "grad_norm": 0.04662792757153511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555385947227478, | |
| "mean_token_accuracy": 0.7747650295495987, | |
| "num_tokens": 11644722.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5356374382972717, | |
| "epoch": 2.6779026217228465, | |
| "grad_norm": 0.03770140931010246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397407412528992, | |
| "mean_token_accuracy": 0.77961665391922, | |
| "num_tokens": 11661403.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5477268397808075, | |
| "epoch": 2.6816479400749063, | |
| "grad_norm": 0.04137538745999336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421797633171082, | |
| "mean_token_accuracy": 0.7774805575609207, | |
| "num_tokens": 11677740.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5390584021806717, | |
| "epoch": 2.6853932584269664, | |
| "grad_norm": 0.04397116228938103, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323628187179565, | |
| "mean_token_accuracy": 0.7813891172409058, | |
| "num_tokens": 11693755.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5430156886577606, | |
| "epoch": 2.689138576779026, | |
| "grad_norm": 0.03867118060588837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338262319564819, | |
| "mean_token_accuracy": 0.7821642309427261, | |
| "num_tokens": 11710311.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5369475930929184, | |
| "epoch": 2.6928838951310863, | |
| "grad_norm": 0.03773213177919388, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436868071556091, | |
| "mean_token_accuracy": 0.7776243984699249, | |
| "num_tokens": 11726751.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.5204776674509048, | |
| "epoch": 2.696629213483146, | |
| "grad_norm": 0.045796290040016174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366164445877075, | |
| "mean_token_accuracy": 0.7829219549894333, | |
| "num_tokens": 11743104.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5444348156452179, | |
| "epoch": 2.700374531835206, | |
| "grad_norm": 0.041639544069767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522270202636719, | |
| "mean_token_accuracy": 0.7758014649152756, | |
| "num_tokens": 11759143.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5301756113767624, | |
| "epoch": 2.704119850187266, | |
| "grad_norm": 0.04008952155709267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239149928092957, | |
| "mean_token_accuracy": 0.7852831333875656, | |
| "num_tokens": 11775647.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5141435042023659, | |
| "epoch": 2.7078651685393256, | |
| "grad_norm": 0.03991787135601044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5066305994987488, | |
| "mean_token_accuracy": 0.7961233854293823, | |
| "num_tokens": 11791695.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5294996351003647, | |
| "epoch": 2.7116104868913857, | |
| "grad_norm": 0.03514706343412399, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277984738349915, | |
| "mean_token_accuracy": 0.7842394113540649, | |
| "num_tokens": 11807908.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.553158238530159, | |
| "epoch": 2.715355805243446, | |
| "grad_norm": 0.0371016301214695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542132258415222, | |
| "mean_token_accuracy": 0.7742846459150314, | |
| "num_tokens": 11824455.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.5377026200294495, | |
| "epoch": 2.7191011235955056, | |
| "grad_norm": 0.04648866876959801, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486031770706177, | |
| "mean_token_accuracy": 0.7776967585086823, | |
| "num_tokens": 11840615.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5500117689371109, | |
| "epoch": 2.7228464419475653, | |
| "grad_norm": 0.03958411142230034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574382543563843, | |
| "mean_token_accuracy": 0.7707358449697495, | |
| "num_tokens": 11856804.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5287734270095825, | |
| "epoch": 2.7265917602996255, | |
| "grad_norm": 0.039377059787511826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284842848777771, | |
| "mean_token_accuracy": 0.7842006385326385, | |
| "num_tokens": 11872824.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5455043613910675, | |
| "epoch": 2.7303370786516856, | |
| "grad_norm": 0.038099173456430435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363825559616089, | |
| "mean_token_accuracy": 0.7839681655168533, | |
| "num_tokens": 11889236.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5231508985161781, | |
| "epoch": 2.7340823970037453, | |
| "grad_norm": 0.04386546462774277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231119394302368, | |
| "mean_token_accuracy": 0.7876169681549072, | |
| "num_tokens": 11905504.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5425267070531845, | |
| "epoch": 2.737827715355805, | |
| "grad_norm": 0.03880799189209938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381489992141724, | |
| "mean_token_accuracy": 0.7835936099290848, | |
| "num_tokens": 11922030.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5379330962896347, | |
| "epoch": 2.741573033707865, | |
| "grad_norm": 0.04163983464241028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459231734275818, | |
| "mean_token_accuracy": 0.7755035907030106, | |
| "num_tokens": 11938351.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5344593375921249, | |
| "epoch": 2.7453183520599254, | |
| "grad_norm": 0.03764946386218071, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335820913314819, | |
| "mean_token_accuracy": 0.7851902097463608, | |
| "num_tokens": 11954720.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.5275440439581871, | |
| "epoch": 2.749063670411985, | |
| "grad_norm": 0.041039030998945236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316729545593262, | |
| "mean_token_accuracy": 0.784284695982933, | |
| "num_tokens": 11970943.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5440046042203903, | |
| "epoch": 2.752808988764045, | |
| "grad_norm": 0.03777683153748512, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479453802108765, | |
| "mean_token_accuracy": 0.7796096056699753, | |
| "num_tokens": 11987274.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5314242094755173, | |
| "epoch": 2.756554307116105, | |
| "grad_norm": 0.04298453778028488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360277891159058, | |
| "mean_token_accuracy": 0.7836730033159256, | |
| "num_tokens": 12003645.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5434319823980331, | |
| "epoch": 2.7602996254681647, | |
| "grad_norm": 0.038422685116529465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429157614707947, | |
| "mean_token_accuracy": 0.7770098298788071, | |
| "num_tokens": 12020104.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5382603704929352, | |
| "epoch": 2.764044943820225, | |
| "grad_norm": 0.04176581650972366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365764498710632, | |
| "mean_token_accuracy": 0.7839252799749374, | |
| "num_tokens": 12036423.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5331043303012848, | |
| "epoch": 2.7677902621722845, | |
| "grad_norm": 0.04350239410996437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356451272964478, | |
| "mean_token_accuracy": 0.7829470187425613, | |
| "num_tokens": 12052564.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5245354026556015, | |
| "epoch": 2.7715355805243447, | |
| "grad_norm": 0.04295556619763374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335471034049988, | |
| "mean_token_accuracy": 0.7844749689102173, | |
| "num_tokens": 12068677.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5476740896701813, | |
| "epoch": 2.7752808988764044, | |
| "grad_norm": 0.04540206119418144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552383542060852, | |
| "mean_token_accuracy": 0.7785235494375229, | |
| "num_tokens": 12085174.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5276885330677032, | |
| "epoch": 2.7790262172284645, | |
| "grad_norm": 0.03786449506878853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295007228851318, | |
| "mean_token_accuracy": 0.7848162055015564, | |
| "num_tokens": 12101546.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5504680871963501, | |
| "epoch": 2.7827715355805243, | |
| "grad_norm": 0.04417780414223671, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459782481193542, | |
| "mean_token_accuracy": 0.7778183221817017, | |
| "num_tokens": 12117833.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5514437556266785, | |
| "epoch": 2.7865168539325844, | |
| "grad_norm": 0.03677407279610634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444294810295105, | |
| "mean_token_accuracy": 0.7822880744934082, | |
| "num_tokens": 12134076.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.544072225689888, | |
| "epoch": 2.790262172284644, | |
| "grad_norm": 0.04843369498848915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418300628662109, | |
| "mean_token_accuracy": 0.7809806764125824, | |
| "num_tokens": 12149991.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5447394847869873, | |
| "epoch": 2.7940074906367043, | |
| "grad_norm": 0.04489225894212723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485548377037048, | |
| "mean_token_accuracy": 0.7752929180860519, | |
| "num_tokens": 12166319.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5193701684474945, | |
| "epoch": 2.797752808988764, | |
| "grad_norm": 0.04051094502210617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5254422426223755, | |
| "mean_token_accuracy": 0.7868325263261795, | |
| "num_tokens": 12182585.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.533800944685936, | |
| "epoch": 2.8014981273408237, | |
| "grad_norm": 0.03557295724749565, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316165089607239, | |
| "mean_token_accuracy": 0.7825881540775299, | |
| "num_tokens": 12198769.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.534054160118103, | |
| "epoch": 2.805243445692884, | |
| "grad_norm": 0.04074644669890404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342618823051453, | |
| "mean_token_accuracy": 0.7828291058540344, | |
| "num_tokens": 12215003.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5486414730548859, | |
| "epoch": 2.808988764044944, | |
| "grad_norm": 0.04066525399684906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566014647483826, | |
| "mean_token_accuracy": 0.7741669267416, | |
| "num_tokens": 12231307.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5236565172672272, | |
| "epoch": 2.8127340823970037, | |
| "grad_norm": 0.03859638050198555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243086218833923, | |
| "mean_token_accuracy": 0.7863422483205795, | |
| "num_tokens": 12247563.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5354926288127899, | |
| "epoch": 2.8164794007490634, | |
| "grad_norm": 0.040070392191410065, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424857139587402, | |
| "mean_token_accuracy": 0.7793509066104889, | |
| "num_tokens": 12263768.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5465504974126816, | |
| "epoch": 2.8202247191011236, | |
| "grad_norm": 0.04251793026924133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422512292861938, | |
| "mean_token_accuracy": 0.7784619033336639, | |
| "num_tokens": 12280224.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.5511007905006409, | |
| "epoch": 2.8239700374531838, | |
| "grad_norm": 0.03704281151294708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432584285736084, | |
| "mean_token_accuracy": 0.7793723195791245, | |
| "num_tokens": 12296720.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5557062178850174, | |
| "epoch": 2.8277153558052435, | |
| "grad_norm": 0.04253645986318588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526583194732666, | |
| "mean_token_accuracy": 0.7777480781078339, | |
| "num_tokens": 12313013.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5158669054508209, | |
| "epoch": 2.831460674157303, | |
| "grad_norm": 0.036200929433107376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5140800476074219, | |
| "mean_token_accuracy": 0.7922120690345764, | |
| "num_tokens": 12328987.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.5495094060897827, | |
| "epoch": 2.8352059925093633, | |
| "grad_norm": 0.04025623947381973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524377226829529, | |
| "mean_token_accuracy": 0.7765700370073318, | |
| "num_tokens": 12345487.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5472595542669296, | |
| "epoch": 2.8389513108614235, | |
| "grad_norm": 0.037925150245428085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513643622398376, | |
| "mean_token_accuracy": 0.7754906117916107, | |
| "num_tokens": 12362003.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5349185019731522, | |
| "epoch": 2.842696629213483, | |
| "grad_norm": 0.04107813537120819, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352935791015625, | |
| "mean_token_accuracy": 0.785232812166214, | |
| "num_tokens": 12378308.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5332917869091034, | |
| "epoch": 2.846441947565543, | |
| "grad_norm": 0.0485457181930542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407130122184753, | |
| "mean_token_accuracy": 0.7778820097446442, | |
| "num_tokens": 12394745.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5373108834028244, | |
| "epoch": 2.850187265917603, | |
| "grad_norm": 0.045551612973213196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431134104728699, | |
| "mean_token_accuracy": 0.7788770198822021, | |
| "num_tokens": 12410653.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5553153157234192, | |
| "epoch": 2.853932584269663, | |
| "grad_norm": 0.042994849383831024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521018505096436, | |
| "mean_token_accuracy": 0.7741047441959381, | |
| "num_tokens": 12426820.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5405306816101074, | |
| "epoch": 2.857677902621723, | |
| "grad_norm": 0.03894044831395149, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416905283927917, | |
| "mean_token_accuracy": 0.7816338688135147, | |
| "num_tokens": 12443026.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.5384278744459152, | |
| "epoch": 2.8614232209737827, | |
| "grad_norm": 0.04121169447898865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407273769378662, | |
| "mean_token_accuracy": 0.7787628769874573, | |
| "num_tokens": 12459216.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5316817611455917, | |
| "epoch": 2.865168539325843, | |
| "grad_norm": 0.05211913585662842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382348895072937, | |
| "mean_token_accuracy": 0.7807497531175613, | |
| "num_tokens": 12475540.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5411743521690369, | |
| "epoch": 2.8689138576779025, | |
| "grad_norm": 0.05021794140338898, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549106001853943, | |
| "mean_token_accuracy": 0.7732493728399277, | |
| "num_tokens": 12491791.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5427963435649872, | |
| "epoch": 2.8726591760299627, | |
| "grad_norm": 0.048997581005096436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405234694480896, | |
| "mean_token_accuracy": 0.7799372375011444, | |
| "num_tokens": 12508102.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5702031701803207, | |
| "epoch": 2.8764044943820224, | |
| "grad_norm": 0.035217706114053726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628358721733093, | |
| "mean_token_accuracy": 0.7744450867176056, | |
| "num_tokens": 12524674.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5263065099716187, | |
| "epoch": 2.8801498127340825, | |
| "grad_norm": 0.04417087137699127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5192127227783203, | |
| "mean_token_accuracy": 0.7900556176900864, | |
| "num_tokens": 12540700.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5679396241903305, | |
| "epoch": 2.8838951310861423, | |
| "grad_norm": 0.038472775369882584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629768967628479, | |
| "mean_token_accuracy": 0.7697183936834335, | |
| "num_tokens": 12557124.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.541569247841835, | |
| "epoch": 2.8876404494382024, | |
| "grad_norm": 0.04340888932347298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380176901817322, | |
| "mean_token_accuracy": 0.7819050699472427, | |
| "num_tokens": 12573582.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5244268327951431, | |
| "epoch": 2.891385767790262, | |
| "grad_norm": 0.043049633502960205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338467955589294, | |
| "mean_token_accuracy": 0.7832711786031723, | |
| "num_tokens": 12589568.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5213008224964142, | |
| "epoch": 2.895131086142322, | |
| "grad_norm": 0.05456610396504402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332724452018738, | |
| "mean_token_accuracy": 0.7851873487234116, | |
| "num_tokens": 12605650.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5455889403820038, | |
| "epoch": 2.898876404494382, | |
| "grad_norm": 0.04193198308348656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584859251976013, | |
| "mean_token_accuracy": 0.7724700570106506, | |
| "num_tokens": 12621922.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5487163811922073, | |
| "epoch": 2.902621722846442, | |
| "grad_norm": 0.03447289392352104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422307252883911, | |
| "mean_token_accuracy": 0.779036745429039, | |
| "num_tokens": 12638171.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5613754689693451, | |
| "epoch": 2.906367041198502, | |
| "grad_norm": 0.03812362253665924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491812229156494, | |
| "mean_token_accuracy": 0.7774574309587479, | |
| "num_tokens": 12654497.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5419997125864029, | |
| "epoch": 2.9101123595505616, | |
| "grad_norm": 0.03889596462249756, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366528630256653, | |
| "mean_token_accuracy": 0.7796314209699631, | |
| "num_tokens": 12671014.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5404350906610489, | |
| "epoch": 2.9138576779026217, | |
| "grad_norm": 0.03634997084736824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370875000953674, | |
| "mean_token_accuracy": 0.7817376554012299, | |
| "num_tokens": 12687252.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5554278641939163, | |
| "epoch": 2.917602996254682, | |
| "grad_norm": 0.04131067916750908, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544486045837402, | |
| "mean_token_accuracy": 0.774728998541832, | |
| "num_tokens": 12703762.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.5132855176925659, | |
| "epoch": 2.9213483146067416, | |
| "grad_norm": 0.041993558406829834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5225546360015869, | |
| "mean_token_accuracy": 0.7885993123054504, | |
| "num_tokens": 12720070.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5195116326212883, | |
| "epoch": 2.9250936329588013, | |
| "grad_norm": 0.045502807945013046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276657938957214, | |
| "mean_token_accuracy": 0.7835886776447296, | |
| "num_tokens": 12736079.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5291299819946289, | |
| "epoch": 2.9288389513108615, | |
| "grad_norm": 0.04560597985982895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367044806480408, | |
| "mean_token_accuracy": 0.7813848108053207, | |
| "num_tokens": 12752163.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5446918457746506, | |
| "epoch": 2.932584269662921, | |
| "grad_norm": 0.04057231545448303, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368906259536743, | |
| "mean_token_accuracy": 0.7825321704149246, | |
| "num_tokens": 12768377.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5624755024909973, | |
| "epoch": 2.9363295880149813, | |
| "grad_norm": 0.04997701197862625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559151768684387, | |
| "mean_token_accuracy": 0.7733145207166672, | |
| "num_tokens": 12784692.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5384950041770935, | |
| "epoch": 2.940074906367041, | |
| "grad_norm": 0.04062885046005249, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536974310874939, | |
| "mean_token_accuracy": 0.7846025824546814, | |
| "num_tokens": 12800887.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5255657434463501, | |
| "epoch": 2.943820224719101, | |
| "grad_norm": 0.044986989349126816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352227091789246, | |
| "mean_token_accuracy": 0.7826129198074341, | |
| "num_tokens": 12817261.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.532112181186676, | |
| "epoch": 2.947565543071161, | |
| "grad_norm": 0.04506840929389, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401644110679626, | |
| "mean_token_accuracy": 0.7819447070360184, | |
| "num_tokens": 12833628.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5532176345586777, | |
| "epoch": 2.951310861423221, | |
| "grad_norm": 0.047445181757211685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567490458488464, | |
| "mean_token_accuracy": 0.7756209075450897, | |
| "num_tokens": 12850048.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5571421086788177, | |
| "epoch": 2.955056179775281, | |
| "grad_norm": 0.03836369141936302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471166968345642, | |
| "mean_token_accuracy": 0.7780868262052536, | |
| "num_tokens": 12866382.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5684118866920471, | |
| "epoch": 2.958801498127341, | |
| "grad_norm": 0.03691793233156204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584673285484314, | |
| "mean_token_accuracy": 0.7734033614397049, | |
| "num_tokens": 12882861.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5417571067810059, | |
| "epoch": 2.9625468164794007, | |
| "grad_norm": 0.03854163736104965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380803346633911, | |
| "mean_token_accuracy": 0.7819686830043793, | |
| "num_tokens": 12898999.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5183953493833542, | |
| "epoch": 2.966292134831461, | |
| "grad_norm": 0.04670790210366249, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527891993522644, | |
| "mean_token_accuracy": 0.7858579158782959, | |
| "num_tokens": 12915160.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5315932035446167, | |
| "epoch": 2.9700374531835205, | |
| "grad_norm": 0.05011628568172455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408577919006348, | |
| "mean_token_accuracy": 0.7781645357608795, | |
| "num_tokens": 12931387.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.533274233341217, | |
| "epoch": 2.9737827715355807, | |
| "grad_norm": 0.038501009345054626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422831773757935, | |
| "mean_token_accuracy": 0.7777345776557922, | |
| "num_tokens": 12947630.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.5588134974241257, | |
| "epoch": 2.9775280898876404, | |
| "grad_norm": 0.04206021502614021, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564273595809937, | |
| "mean_token_accuracy": 0.7733636498451233, | |
| "num_tokens": 12964026.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5579260289669037, | |
| "epoch": 2.9812734082397006, | |
| "grad_norm": 0.04490978643298149, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504725575447083, | |
| "mean_token_accuracy": 0.7786446362733841, | |
| "num_tokens": 12980554.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.541483461856842, | |
| "epoch": 2.9850187265917603, | |
| "grad_norm": 0.03570273146033287, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5293324589729309, | |
| "mean_token_accuracy": 0.783537819981575, | |
| "num_tokens": 12996979.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5362358242273331, | |
| "epoch": 2.98876404494382, | |
| "grad_norm": 0.04825478047132492, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365868210792542, | |
| "mean_token_accuracy": 0.7838873118162155, | |
| "num_tokens": 13013323.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5404023975133896, | |
| "epoch": 2.99250936329588, | |
| "grad_norm": 0.04962825030088425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480868816375732, | |
| "mean_token_accuracy": 0.7763252705335617, | |
| "num_tokens": 13029636.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5300639569759369, | |
| "epoch": 2.9962546816479403, | |
| "grad_norm": 0.042783528566360474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343177318572998, | |
| "mean_token_accuracy": 0.7828411161899567, | |
| "num_tokens": 13046055.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5252282693982124, | |
| "epoch": 3.0, | |
| "grad_norm": 0.049276161938905716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320798754692078, | |
| "mean_token_accuracy": 0.7844677865505219, | |
| "num_tokens": 13062401.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.545697808265686, | |
| "epoch": 3.0037453183520597, | |
| "grad_norm": 0.04111013561487198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5242352485656738, | |
| "mean_token_accuracy": 0.7881960570812225, | |
| "num_tokens": 13078838.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.5105714052915573, | |
| "epoch": 3.00749063670412, | |
| "grad_norm": 0.050722841173410416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.49721649289131165, | |
| "mean_token_accuracy": 0.7984847724437714, | |
| "num_tokens": 13095019.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.518198661506176, | |
| "epoch": 3.0112359550561796, | |
| "grad_norm": 0.05298876017332077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5273076891899109, | |
| "mean_token_accuracy": 0.7871041893959045, | |
| "num_tokens": 13111294.0, | |
| "step": 804 | |
| }, | |
| { | |
| "entropy": 0.48655156791210175, | |
| "epoch": 3.0149812734082397, | |
| "grad_norm": 0.05474111810326576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5008523464202881, | |
| "mean_token_accuracy": 0.79793781042099, | |
| "num_tokens": 13127173.0, | |
| "step": 805 | |
| }, | |
| { | |
| "entropy": 0.4898255914449692, | |
| "epoch": 3.0187265917602994, | |
| "grad_norm": 0.05198859050869942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.502049446105957, | |
| "mean_token_accuracy": 0.7997064739465714, | |
| "num_tokens": 13143319.0, | |
| "step": 806 | |
| }, | |
| { | |
| "entropy": 0.5108759626746178, | |
| "epoch": 3.0224719101123596, | |
| "grad_norm": 0.050299011170864105, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5128780603408813, | |
| "mean_token_accuracy": 0.7923674434423447, | |
| "num_tokens": 13159544.0, | |
| "step": 807 | |
| }, | |
| { | |
| "entropy": 0.5222347229719162, | |
| "epoch": 3.0262172284644193, | |
| "grad_norm": 0.047297973185777664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5127148628234863, | |
| "mean_token_accuracy": 0.7936184853315353, | |
| "num_tokens": 13175745.0, | |
| "step": 808 | |
| }, | |
| { | |
| "entropy": 0.5319055169820786, | |
| "epoch": 3.0299625468164795, | |
| "grad_norm": 0.043087251484394073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200571417808533, | |
| "mean_token_accuracy": 0.789368748664856, | |
| "num_tokens": 13192098.0, | |
| "step": 809 | |
| }, | |
| { | |
| "entropy": 0.5223256945610046, | |
| "epoch": 3.033707865168539, | |
| "grad_norm": 0.045950714498758316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5118798613548279, | |
| "mean_token_accuracy": 0.7952196598052979, | |
| "num_tokens": 13208503.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 0.5253837034106255, | |
| "epoch": 3.0374531835205993, | |
| "grad_norm": 0.051792871206998825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294127464294434, | |
| "mean_token_accuracy": 0.7874963134527206, | |
| "num_tokens": 13224945.0, | |
| "step": 811 | |
| }, | |
| { | |
| "entropy": 0.5031881630420685, | |
| "epoch": 3.041198501872659, | |
| "grad_norm": 0.05261905863881111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5030893087387085, | |
| "mean_token_accuracy": 0.796674519777298, | |
| "num_tokens": 13241369.0, | |
| "step": 812 | |
| }, | |
| { | |
| "entropy": 0.5100391805171967, | |
| "epoch": 3.044943820224719, | |
| "grad_norm": 0.05024467036128044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5141370296478271, | |
| "mean_token_accuracy": 0.7916264235973358, | |
| "num_tokens": 13257754.0, | |
| "step": 813 | |
| }, | |
| { | |
| "entropy": 0.5079550594091415, | |
| "epoch": 3.048689138576779, | |
| "grad_norm": 0.05758948624134064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.512941300868988, | |
| "mean_token_accuracy": 0.7929425090551376, | |
| "num_tokens": 13273994.0, | |
| "step": 814 | |
| }, | |
| { | |
| "entropy": 0.513673685491085, | |
| "epoch": 3.052434456928839, | |
| "grad_norm": 0.04496518149971962, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5110280513763428, | |
| "mean_token_accuracy": 0.7918824106454849, | |
| "num_tokens": 13290072.0, | |
| "step": 815 | |
| }, | |
| { | |
| "entropy": 0.5141152441501617, | |
| "epoch": 3.056179775280899, | |
| "grad_norm": 0.0500110387802124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5101944804191589, | |
| "mean_token_accuracy": 0.7915782928466797, | |
| "num_tokens": 13306210.0, | |
| "step": 816 | |
| }, | |
| { | |
| "entropy": 0.5212079957127571, | |
| "epoch": 3.059925093632959, | |
| "grad_norm": 0.048487596213817596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181204080581665, | |
| "mean_token_accuracy": 0.791895255446434, | |
| "num_tokens": 13322810.0, | |
| "step": 817 | |
| }, | |
| { | |
| "entropy": 0.5105150416493416, | |
| "epoch": 3.0636704119850187, | |
| "grad_norm": 0.04949360713362694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5145678520202637, | |
| "mean_token_accuracy": 0.7915669232606888, | |
| "num_tokens": 13339105.0, | |
| "step": 818 | |
| }, | |
| { | |
| "entropy": 0.5000638663768768, | |
| "epoch": 3.067415730337079, | |
| "grad_norm": 0.05010031536221504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5040720701217651, | |
| "mean_token_accuracy": 0.7957489788532257, | |
| "num_tokens": 13355562.0, | |
| "step": 819 | |
| }, | |
| { | |
| "entropy": 0.4990030825138092, | |
| "epoch": 3.0711610486891385, | |
| "grad_norm": 0.04833959415555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5016943216323853, | |
| "mean_token_accuracy": 0.795589417219162, | |
| "num_tokens": 13371584.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 0.49931125342845917, | |
| "epoch": 3.0749063670411987, | |
| "grad_norm": 0.0536712147295475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5040884017944336, | |
| "mean_token_accuracy": 0.7980391532182693, | |
| "num_tokens": 13387562.0, | |
| "step": 821 | |
| }, | |
| { | |
| "entropy": 0.522365540266037, | |
| "epoch": 3.0786516853932584, | |
| "grad_norm": 0.05137619003653526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5167077779769897, | |
| "mean_token_accuracy": 0.7917557954788208, | |
| "num_tokens": 13403730.0, | |
| "step": 822 | |
| }, | |
| { | |
| "entropy": 0.5068316459655762, | |
| "epoch": 3.0823970037453186, | |
| "grad_norm": 0.05163760110735893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5044561624526978, | |
| "mean_token_accuracy": 0.7993681281805038, | |
| "num_tokens": 13419918.0, | |
| "step": 823 | |
| }, | |
| { | |
| "entropy": 0.49808672070503235, | |
| "epoch": 3.0861423220973783, | |
| "grad_norm": 0.06049012020230293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5022746920585632, | |
| "mean_token_accuracy": 0.7967248558998108, | |
| "num_tokens": 13435959.0, | |
| "step": 824 | |
| }, | |
| { | |
| "entropy": 0.514209657907486, | |
| "epoch": 3.0898876404494384, | |
| "grad_norm": 0.04543498158454895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5144035220146179, | |
| "mean_token_accuracy": 0.789142832159996, | |
| "num_tokens": 13452229.0, | |
| "step": 825 | |
| }, | |
| { | |
| "entropy": 0.5195358544588089, | |
| "epoch": 3.093632958801498, | |
| "grad_norm": 0.057822633534669876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5155280828475952, | |
| "mean_token_accuracy": 0.7921741157770157, | |
| "num_tokens": 13468667.0, | |
| "step": 826 | |
| }, | |
| { | |
| "entropy": 0.507283978164196, | |
| "epoch": 3.097378277153558, | |
| "grad_norm": 0.05148691684007645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.504961371421814, | |
| "mean_token_accuracy": 0.7980248332023621, | |
| "num_tokens": 13484964.0, | |
| "step": 827 | |
| }, | |
| { | |
| "entropy": 0.5191457867622375, | |
| "epoch": 3.101123595505618, | |
| "grad_norm": 0.045027829706668854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200563669204712, | |
| "mean_token_accuracy": 0.7913502901792526, | |
| "num_tokens": 13501449.0, | |
| "step": 828 | |
| }, | |
| { | |
| "entropy": 0.5351596623659134, | |
| "epoch": 3.1048689138576777, | |
| "grad_norm": 0.05001077800989151, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278201699256897, | |
| "mean_token_accuracy": 0.7879630476236343, | |
| "num_tokens": 13517966.0, | |
| "step": 829 | |
| }, | |
| { | |
| "entropy": 0.5123812630772591, | |
| "epoch": 3.108614232209738, | |
| "grad_norm": 0.0483224131166935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5094588398933411, | |
| "mean_token_accuracy": 0.794407531619072, | |
| "num_tokens": 13534307.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 0.5005150064826012, | |
| "epoch": 3.1123595505617976, | |
| "grad_norm": 0.06896387785673141, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5081024169921875, | |
| "mean_token_accuracy": 0.7954099476337433, | |
| "num_tokens": 13550484.0, | |
| "step": 831 | |
| }, | |
| { | |
| "entropy": 0.5042895451188087, | |
| "epoch": 3.1161048689138577, | |
| "grad_norm": 0.058579690754413605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.508193850517273, | |
| "mean_token_accuracy": 0.793841764330864, | |
| "num_tokens": 13566708.0, | |
| "step": 832 | |
| }, | |
| { | |
| "entropy": 0.49759114533662796, | |
| "epoch": 3.1198501872659175, | |
| "grad_norm": 0.07416244596242905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5042813420295715, | |
| "mean_token_accuracy": 0.7976614087820053, | |
| "num_tokens": 13582827.0, | |
| "step": 833 | |
| }, | |
| { | |
| "entropy": 0.5223132967948914, | |
| "epoch": 3.1235955056179776, | |
| "grad_norm": 0.06452949345111847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5273835062980652, | |
| "mean_token_accuracy": 0.7855038046836853, | |
| "num_tokens": 13599052.0, | |
| "step": 834 | |
| }, | |
| { | |
| "entropy": 0.5274243950843811, | |
| "epoch": 3.1273408239700373, | |
| "grad_norm": 0.05534323304891586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527578592300415, | |
| "mean_token_accuracy": 0.7877459824085236, | |
| "num_tokens": 13615363.0, | |
| "step": 835 | |
| }, | |
| { | |
| "entropy": 0.5254645645618439, | |
| "epoch": 3.1310861423220975, | |
| "grad_norm": 0.05036141723394394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5162075161933899, | |
| "mean_token_accuracy": 0.7924645841121674, | |
| "num_tokens": 13631656.0, | |
| "step": 836 | |
| }, | |
| { | |
| "entropy": 0.519648090004921, | |
| "epoch": 3.134831460674157, | |
| "grad_norm": 0.05153921991586685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5139608383178711, | |
| "mean_token_accuracy": 0.7937669306993484, | |
| "num_tokens": 13648061.0, | |
| "step": 837 | |
| }, | |
| { | |
| "entropy": 0.5104959607124329, | |
| "epoch": 3.1385767790262173, | |
| "grad_norm": 0.0628538653254509, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5201999545097351, | |
| "mean_token_accuracy": 0.7901795506477356, | |
| "num_tokens": 13664398.0, | |
| "step": 838 | |
| }, | |
| { | |
| "entropy": 0.5013151913881302, | |
| "epoch": 3.142322097378277, | |
| "grad_norm": 0.05778926610946655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5063536763191223, | |
| "mean_token_accuracy": 0.7938642650842667, | |
| "num_tokens": 13680563.0, | |
| "step": 839 | |
| }, | |
| { | |
| "entropy": 0.5136759728193283, | |
| "epoch": 3.146067415730337, | |
| "grad_norm": 0.0481521412730217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5169215202331543, | |
| "mean_token_accuracy": 0.7936979234218597, | |
| "num_tokens": 13696943.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.5035114511847496, | |
| "epoch": 3.149812734082397, | |
| "grad_norm": 0.052551548928022385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5094401240348816, | |
| "mean_token_accuracy": 0.7950234562158585, | |
| "num_tokens": 13713121.0, | |
| "step": 841 | |
| }, | |
| { | |
| "entropy": 0.5143017992377281, | |
| "epoch": 3.153558052434457, | |
| "grad_norm": 0.051041699945926666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5074518322944641, | |
| "mean_token_accuracy": 0.7948710173368454, | |
| "num_tokens": 13729464.0, | |
| "step": 842 | |
| }, | |
| { | |
| "entropy": 0.5306706875562668, | |
| "epoch": 3.157303370786517, | |
| "grad_norm": 0.0463450625538826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219502449035645, | |
| "mean_token_accuracy": 0.7893195748329163, | |
| "num_tokens": 13746493.0, | |
| "step": 843 | |
| }, | |
| { | |
| "entropy": 0.5117569044232368, | |
| "epoch": 3.161048689138577, | |
| "grad_norm": 0.06164409592747688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5158479809761047, | |
| "mean_token_accuracy": 0.7911277264356613, | |
| "num_tokens": 13762823.0, | |
| "step": 844 | |
| }, | |
| { | |
| "entropy": 0.5204734578728676, | |
| "epoch": 3.1647940074906367, | |
| "grad_norm": 0.054356031119823456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5212512016296387, | |
| "mean_token_accuracy": 0.7890127152204514, | |
| "num_tokens": 13779000.0, | |
| "step": 845 | |
| }, | |
| { | |
| "entropy": 0.5199745744466782, | |
| "epoch": 3.168539325842697, | |
| "grad_norm": 0.0607718862593174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5160431265830994, | |
| "mean_token_accuracy": 0.7902602553367615, | |
| "num_tokens": 13794975.0, | |
| "step": 846 | |
| }, | |
| { | |
| "entropy": 0.4987589195370674, | |
| "epoch": 3.1722846441947565, | |
| "grad_norm": 0.04878820478916168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5000798106193542, | |
| "mean_token_accuracy": 0.7972550392150879, | |
| "num_tokens": 13811158.0, | |
| "step": 847 | |
| }, | |
| { | |
| "entropy": 0.5230295807123184, | |
| "epoch": 3.1760299625468167, | |
| "grad_norm": 0.06623463332653046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327509641647339, | |
| "mean_token_accuracy": 0.7841638922691345, | |
| "num_tokens": 13827505.0, | |
| "step": 848 | |
| }, | |
| { | |
| "entropy": 0.5071290284395218, | |
| "epoch": 3.1797752808988764, | |
| "grad_norm": 0.05458921194076538, | |
| "learning_rate": 0.0002, | |
| "loss": 0.506171464920044, | |
| "mean_token_accuracy": 0.796265110373497, | |
| "num_tokens": 13843820.0, | |
| "step": 849 | |
| }, | |
| { | |
| "entropy": 0.5068354383111, | |
| "epoch": 3.1835205992509366, | |
| "grad_norm": 0.07471395283937454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5159043669700623, | |
| "mean_token_accuracy": 0.7950875610113144, | |
| "num_tokens": 13860049.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.5165606439113617, | |
| "epoch": 3.1872659176029963, | |
| "grad_norm": 0.04287557676434517, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5090954303741455, | |
| "mean_token_accuracy": 0.7943407446146011, | |
| "num_tokens": 13876269.0, | |
| "step": 851 | |
| }, | |
| { | |
| "entropy": 0.5112441331148148, | |
| "epoch": 3.191011235955056, | |
| "grad_norm": 0.055288348346948624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5097154974937439, | |
| "mean_token_accuracy": 0.7928614467382431, | |
| "num_tokens": 13892237.0, | |
| "step": 852 | |
| }, | |
| { | |
| "entropy": 0.5263922363519669, | |
| "epoch": 3.194756554307116, | |
| "grad_norm": 0.05795539170503616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299734473228455, | |
| "mean_token_accuracy": 0.7866927832365036, | |
| "num_tokens": 13908834.0, | |
| "step": 853 | |
| }, | |
| { | |
| "entropy": 0.5262639820575714, | |
| "epoch": 3.198501872659176, | |
| "grad_norm": 0.04974358528852463, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219104290008545, | |
| "mean_token_accuracy": 0.789173498749733, | |
| "num_tokens": 13925285.0, | |
| "step": 854 | |
| }, | |
| { | |
| "entropy": 0.5375918298959732, | |
| "epoch": 3.202247191011236, | |
| "grad_norm": 0.05287981405854225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538820207118988, | |
| "mean_token_accuracy": 0.7783188968896866, | |
| "num_tokens": 13941531.0, | |
| "step": 855 | |
| }, | |
| { | |
| "entropy": 0.5262509882450104, | |
| "epoch": 3.2059925093632957, | |
| "grad_norm": 0.050868358463048935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281128883361816, | |
| "mean_token_accuracy": 0.78641077876091, | |
| "num_tokens": 13957808.0, | |
| "step": 856 | |
| }, | |
| { | |
| "entropy": 0.5126873999834061, | |
| "epoch": 3.209737827715356, | |
| "grad_norm": 0.053514108061790466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5147566795349121, | |
| "mean_token_accuracy": 0.7941258400678635, | |
| "num_tokens": 13974052.0, | |
| "step": 857 | |
| }, | |
| { | |
| "entropy": 0.5275673717260361, | |
| "epoch": 3.2134831460674156, | |
| "grad_norm": 0.05271236225962639, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5292813777923584, | |
| "mean_token_accuracy": 0.7857562899589539, | |
| "num_tokens": 13990343.0, | |
| "step": 858 | |
| }, | |
| { | |
| "entropy": 0.5242348462343216, | |
| "epoch": 3.2172284644194757, | |
| "grad_norm": 0.07179221510887146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286028981208801, | |
| "mean_token_accuracy": 0.7894574105739594, | |
| "num_tokens": 14006625.0, | |
| "step": 859 | |
| }, | |
| { | |
| "entropy": 0.5096549838781357, | |
| "epoch": 3.2209737827715355, | |
| "grad_norm": 0.049610402435064316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5049244165420532, | |
| "mean_token_accuracy": 0.7980163246393204, | |
| "num_tokens": 14022899.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 0.5015261322259903, | |
| "epoch": 3.2247191011235956, | |
| "grad_norm": 0.05947711691260338, | |
| "learning_rate": 0.0002, | |
| "loss": 0.4989194869995117, | |
| "mean_token_accuracy": 0.7979766577482224, | |
| "num_tokens": 14039443.0, | |
| "step": 861 | |
| }, | |
| { | |
| "entropy": 0.507699728012085, | |
| "epoch": 3.2284644194756553, | |
| "grad_norm": 0.04882875084877014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.507795512676239, | |
| "mean_token_accuracy": 0.7962815016508102, | |
| "num_tokens": 14055656.0, | |
| "step": 862 | |
| }, | |
| { | |
| "entropy": 0.5021291598677635, | |
| "epoch": 3.2322097378277155, | |
| "grad_norm": 0.061408963054418564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5129059553146362, | |
| "mean_token_accuracy": 0.7919183075428009, | |
| "num_tokens": 14071999.0, | |
| "step": 863 | |
| }, | |
| { | |
| "entropy": 0.520720586180687, | |
| "epoch": 3.235955056179775, | |
| "grad_norm": 0.06845266371965408, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275195837020874, | |
| "mean_token_accuracy": 0.786097377538681, | |
| "num_tokens": 14088181.0, | |
| "step": 864 | |
| }, | |
| { | |
| "entropy": 0.5245565697550774, | |
| "epoch": 3.2397003745318353, | |
| "grad_norm": 0.05512849986553192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5164670944213867, | |
| "mean_token_accuracy": 0.7922011315822601, | |
| "num_tokens": 14104382.0, | |
| "step": 865 | |
| }, | |
| { | |
| "entropy": 0.523853063583374, | |
| "epoch": 3.243445692883895, | |
| "grad_norm": 0.05168979614973068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5198615789413452, | |
| "mean_token_accuracy": 0.7894517928361893, | |
| "num_tokens": 14120589.0, | |
| "step": 866 | |
| }, | |
| { | |
| "entropy": 0.5336069017648697, | |
| "epoch": 3.247191011235955, | |
| "grad_norm": 0.04658959433436394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296441912651062, | |
| "mean_token_accuracy": 0.7839891761541367, | |
| "num_tokens": 14137115.0, | |
| "step": 867 | |
| }, | |
| { | |
| "entropy": 0.5032267719507217, | |
| "epoch": 3.250936329588015, | |
| "grad_norm": 0.06418543308973312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5041000843048096, | |
| "mean_token_accuracy": 0.7958316802978516, | |
| "num_tokens": 14153324.0, | |
| "step": 868 | |
| }, | |
| { | |
| "entropy": 0.5415874123573303, | |
| "epoch": 3.254681647940075, | |
| "grad_norm": 0.05481120944023132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544674396514893, | |
| "mean_token_accuracy": 0.7752077877521515, | |
| "num_tokens": 14169770.0, | |
| "step": 869 | |
| }, | |
| { | |
| "entropy": 0.5231891572475433, | |
| "epoch": 3.258426966292135, | |
| "grad_norm": 0.055172860622406006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527195930480957, | |
| "mean_token_accuracy": 0.7866710424423218, | |
| "num_tokens": 14186252.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 0.522189661860466, | |
| "epoch": 3.262172284644195, | |
| "grad_norm": 0.058594439178705215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5187022686004639, | |
| "mean_token_accuracy": 0.7929898500442505, | |
| "num_tokens": 14202621.0, | |
| "step": 871 | |
| }, | |
| { | |
| "entropy": 0.5282062888145447, | |
| "epoch": 3.2659176029962547, | |
| "grad_norm": 0.05134856328368187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219106674194336, | |
| "mean_token_accuracy": 0.7889548540115356, | |
| "num_tokens": 14218830.0, | |
| "step": 872 | |
| }, | |
| { | |
| "entropy": 0.5150680243968964, | |
| "epoch": 3.2696629213483144, | |
| "grad_norm": 0.05508032441139221, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5112281441688538, | |
| "mean_token_accuracy": 0.7931530773639679, | |
| "num_tokens": 14234888.0, | |
| "step": 873 | |
| }, | |
| { | |
| "entropy": 0.5219835788011551, | |
| "epoch": 3.2734082397003745, | |
| "grad_norm": 0.05464804917573929, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524517297744751, | |
| "mean_token_accuracy": 0.7871863842010498, | |
| "num_tokens": 14251240.0, | |
| "step": 874 | |
| }, | |
| { | |
| "entropy": 0.5211943238973618, | |
| "epoch": 3.2771535580524347, | |
| "grad_norm": 0.06844772398471832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394464731216431, | |
| "mean_token_accuracy": 0.7814126461744308, | |
| "num_tokens": 14267612.0, | |
| "step": 875 | |
| }, | |
| { | |
| "entropy": 0.5181123912334442, | |
| "epoch": 3.2808988764044944, | |
| "grad_norm": 0.04897969216108322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221361517906189, | |
| "mean_token_accuracy": 0.7895658910274506, | |
| "num_tokens": 14284024.0, | |
| "step": 876 | |
| }, | |
| { | |
| "entropy": 0.522240474820137, | |
| "epoch": 3.284644194756554, | |
| "grad_norm": 0.046099789440631866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.515265941619873, | |
| "mean_token_accuracy": 0.7908574789762497, | |
| "num_tokens": 14300400.0, | |
| "step": 877 | |
| }, | |
| { | |
| "entropy": 0.539507195353508, | |
| "epoch": 3.2883895131086143, | |
| "grad_norm": 0.048160191625356674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282410979270935, | |
| "mean_token_accuracy": 0.7885929346084595, | |
| "num_tokens": 14316696.0, | |
| "step": 878 | |
| }, | |
| { | |
| "entropy": 0.5196528732776642, | |
| "epoch": 3.292134831460674, | |
| "grad_norm": 0.05286882072687149, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5168602466583252, | |
| "mean_token_accuracy": 0.7895731180906296, | |
| "num_tokens": 14333018.0, | |
| "step": 879 | |
| }, | |
| { | |
| "entropy": 0.5102087259292603, | |
| "epoch": 3.295880149812734, | |
| "grad_norm": 0.059099920094013214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207654237747192, | |
| "mean_token_accuracy": 0.7876903861761093, | |
| "num_tokens": 14349309.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 0.5270523875951767, | |
| "epoch": 3.299625468164794, | |
| "grad_norm": 0.05176056921482086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302364230155945, | |
| "mean_token_accuracy": 0.7864267975091934, | |
| "num_tokens": 14365771.0, | |
| "step": 881 | |
| }, | |
| { | |
| "entropy": 0.5273350328207016, | |
| "epoch": 3.303370786516854, | |
| "grad_norm": 0.053021032363176346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.51994389295578, | |
| "mean_token_accuracy": 0.7906388491392136, | |
| "num_tokens": 14382276.0, | |
| "step": 882 | |
| }, | |
| { | |
| "entropy": 0.5050782039761543, | |
| "epoch": 3.3071161048689137, | |
| "grad_norm": 0.05596887692809105, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5052669644355774, | |
| "mean_token_accuracy": 0.7954567670822144, | |
| "num_tokens": 14398533.0, | |
| "step": 883 | |
| }, | |
| { | |
| "entropy": 0.5178304612636566, | |
| "epoch": 3.310861423220974, | |
| "grad_norm": 0.051180679351091385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5151298642158508, | |
| "mean_token_accuracy": 0.7920469641685486, | |
| "num_tokens": 14414953.0, | |
| "step": 884 | |
| }, | |
| { | |
| "entropy": 0.5152227282524109, | |
| "epoch": 3.3146067415730336, | |
| "grad_norm": 0.060053881257772446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5225366950035095, | |
| "mean_token_accuracy": 0.7887113392353058, | |
| "num_tokens": 14431177.0, | |
| "step": 885 | |
| }, | |
| { | |
| "entropy": 0.5342336893081665, | |
| "epoch": 3.3183520599250937, | |
| "grad_norm": 0.04932161048054695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272732973098755, | |
| "mean_token_accuracy": 0.7877390533685684, | |
| "num_tokens": 14447551.0, | |
| "step": 886 | |
| }, | |
| { | |
| "entropy": 0.5131062269210815, | |
| "epoch": 3.3220973782771535, | |
| "grad_norm": 0.056324418634176254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.511243999004364, | |
| "mean_token_accuracy": 0.7933667898178101, | |
| "num_tokens": 14463837.0, | |
| "step": 887 | |
| }, | |
| { | |
| "entropy": 0.5144293755292892, | |
| "epoch": 3.3258426966292136, | |
| "grad_norm": 0.049344755709171295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5185728073120117, | |
| "mean_token_accuracy": 0.7894094735383987, | |
| "num_tokens": 14480010.0, | |
| "step": 888 | |
| }, | |
| { | |
| "entropy": 0.5006949752569199, | |
| "epoch": 3.3295880149812733, | |
| "grad_norm": 0.06578890234231949, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5114624500274658, | |
| "mean_token_accuracy": 0.7939462065696716, | |
| "num_tokens": 14496280.0, | |
| "step": 889 | |
| }, | |
| { | |
| "entropy": 0.5155239552259445, | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 0.052595749497413635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5211793780326843, | |
| "mean_token_accuracy": 0.7900384217500687, | |
| "num_tokens": 14512580.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 0.4996938407421112, | |
| "epoch": 3.337078651685393, | |
| "grad_norm": 0.05196739733219147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.4989975094795227, | |
| "mean_token_accuracy": 0.7975862473249435, | |
| "num_tokens": 14528932.0, | |
| "step": 891 | |
| }, | |
| { | |
| "entropy": 0.5200860351324081, | |
| "epoch": 3.3408239700374533, | |
| "grad_norm": 0.05091974139213562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5156251192092896, | |
| "mean_token_accuracy": 0.7910965532064438, | |
| "num_tokens": 14545418.0, | |
| "step": 892 | |
| }, | |
| { | |
| "entropy": 0.5055394843220711, | |
| "epoch": 3.344569288389513, | |
| "grad_norm": 0.0533117949962616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5111801028251648, | |
| "mean_token_accuracy": 0.791337177157402, | |
| "num_tokens": 14561554.0, | |
| "step": 893 | |
| }, | |
| { | |
| "entropy": 0.5070675015449524, | |
| "epoch": 3.348314606741573, | |
| "grad_norm": 0.04844473674893379, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5077552795410156, | |
| "mean_token_accuracy": 0.7912814170122147, | |
| "num_tokens": 14578052.0, | |
| "step": 894 | |
| }, | |
| { | |
| "entropy": 0.5202019810676575, | |
| "epoch": 3.352059925093633, | |
| "grad_norm": 0.04764174669981003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5175067186355591, | |
| "mean_token_accuracy": 0.7899416983127594, | |
| "num_tokens": 14594359.0, | |
| "step": 895 | |
| }, | |
| { | |
| "entropy": 0.5255243629217148, | |
| "epoch": 3.355805243445693, | |
| "grad_norm": 0.05360300838947296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318154692649841, | |
| "mean_token_accuracy": 0.7854946553707123, | |
| "num_tokens": 14610661.0, | |
| "step": 896 | |
| }, | |
| { | |
| "entropy": 0.5251385867595673, | |
| "epoch": 3.359550561797753, | |
| "grad_norm": 0.05500936135649681, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363146066665649, | |
| "mean_token_accuracy": 0.7834254056215286, | |
| "num_tokens": 14626712.0, | |
| "step": 897 | |
| }, | |
| { | |
| "entropy": 0.5119743421673775, | |
| "epoch": 3.3632958801498125, | |
| "grad_norm": 0.04378456994891167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5079984068870544, | |
| "mean_token_accuracy": 0.7939057648181915, | |
| "num_tokens": 14642932.0, | |
| "step": 898 | |
| }, | |
| { | |
| "entropy": 0.5284467786550522, | |
| "epoch": 3.3670411985018727, | |
| "grad_norm": 0.046168722212314606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247387290000916, | |
| "mean_token_accuracy": 0.787312924861908, | |
| "num_tokens": 14659213.0, | |
| "step": 899 | |
| }, | |
| { | |
| "entropy": 0.5423993915319443, | |
| "epoch": 3.370786516853933, | |
| "grad_norm": 0.04573873057961464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364725589752197, | |
| "mean_token_accuracy": 0.7854876816272736, | |
| "num_tokens": 14675678.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 0.5328433066606522, | |
| "epoch": 3.3745318352059925, | |
| "grad_norm": 0.044917598366737366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308316946029663, | |
| "mean_token_accuracy": 0.785490483045578, | |
| "num_tokens": 14692287.0, | |
| "step": 901 | |
| }, | |
| { | |
| "entropy": 0.5370714962482452, | |
| "epoch": 3.3782771535580522, | |
| "grad_norm": 0.05281532183289528, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403937101364136, | |
| "mean_token_accuracy": 0.7802177965641022, | |
| "num_tokens": 14708736.0, | |
| "step": 902 | |
| }, | |
| { | |
| "entropy": 0.5240233987569809, | |
| "epoch": 3.3820224719101124, | |
| "grad_norm": 0.04636811465024948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222055315971375, | |
| "mean_token_accuracy": 0.7886700630187988, | |
| "num_tokens": 14725122.0, | |
| "step": 903 | |
| }, | |
| { | |
| "entropy": 0.5218504667282104, | |
| "epoch": 3.385767790262172, | |
| "grad_norm": 0.05728694424033165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256317853927612, | |
| "mean_token_accuracy": 0.7890423983335495, | |
| "num_tokens": 14741271.0, | |
| "step": 904 | |
| }, | |
| { | |
| "entropy": 0.5346123427152634, | |
| "epoch": 3.3895131086142323, | |
| "grad_norm": 0.046447765082120895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343607664108276, | |
| "mean_token_accuracy": 0.7844806611537933, | |
| "num_tokens": 14757614.0, | |
| "step": 905 | |
| }, | |
| { | |
| "entropy": 0.5300848186016083, | |
| "epoch": 3.393258426966292, | |
| "grad_norm": 0.06571624428033829, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315452814102173, | |
| "mean_token_accuracy": 0.7868516147136688, | |
| "num_tokens": 14774083.0, | |
| "step": 906 | |
| }, | |
| { | |
| "entropy": 0.5144885182380676, | |
| "epoch": 3.397003745318352, | |
| "grad_norm": 0.05184376239776611, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5137390494346619, | |
| "mean_token_accuracy": 0.7918999344110489, | |
| "num_tokens": 14790219.0, | |
| "step": 907 | |
| }, | |
| { | |
| "entropy": 0.5159177482128143, | |
| "epoch": 3.400749063670412, | |
| "grad_norm": 0.0637274757027626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5109057426452637, | |
| "mean_token_accuracy": 0.792988732457161, | |
| "num_tokens": 14806579.0, | |
| "step": 908 | |
| }, | |
| { | |
| "entropy": 0.5414174944162369, | |
| "epoch": 3.404494382022472, | |
| "grad_norm": 0.049117956310510635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352107286453247, | |
| "mean_token_accuracy": 0.7849340736865997, | |
| "num_tokens": 14823142.0, | |
| "step": 909 | |
| }, | |
| { | |
| "entropy": 0.5176117867231369, | |
| "epoch": 3.4082397003745317, | |
| "grad_norm": 0.06466244161128998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.522276759147644, | |
| "mean_token_accuracy": 0.789726972579956, | |
| "num_tokens": 14839440.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 0.5329615920782089, | |
| "epoch": 3.411985018726592, | |
| "grad_norm": 0.05105730891227722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381749868392944, | |
| "mean_token_accuracy": 0.7826534360647202, | |
| "num_tokens": 14855956.0, | |
| "step": 911 | |
| }, | |
| { | |
| "entropy": 0.5107108354568481, | |
| "epoch": 3.4157303370786516, | |
| "grad_norm": 0.05413498729467392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5151250958442688, | |
| "mean_token_accuracy": 0.7922552824020386, | |
| "num_tokens": 14872232.0, | |
| "step": 912 | |
| }, | |
| { | |
| "entropy": 0.5194525718688965, | |
| "epoch": 3.4194756554307117, | |
| "grad_norm": 0.049860697239637375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5245251655578613, | |
| "mean_token_accuracy": 0.7890132665634155, | |
| "num_tokens": 14888739.0, | |
| "step": 913 | |
| }, | |
| { | |
| "entropy": 0.5260248631238937, | |
| "epoch": 3.4232209737827715, | |
| "grad_norm": 0.0514976903796196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5202233195304871, | |
| "mean_token_accuracy": 0.7909575551748276, | |
| "num_tokens": 14905100.0, | |
| "step": 914 | |
| }, | |
| { | |
| "entropy": 0.5172304511070251, | |
| "epoch": 3.4269662921348316, | |
| "grad_norm": 0.046695906668901443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5149263143539429, | |
| "mean_token_accuracy": 0.7901606112718582, | |
| "num_tokens": 14921448.0, | |
| "step": 915 | |
| }, | |
| { | |
| "entropy": 0.5069386884570122, | |
| "epoch": 3.4307116104868913, | |
| "grad_norm": 0.05618730187416077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5093807578086853, | |
| "mean_token_accuracy": 0.7943364530801773, | |
| "num_tokens": 14937735.0, | |
| "step": 916 | |
| }, | |
| { | |
| "entropy": 0.5155317038297653, | |
| "epoch": 3.4344569288389515, | |
| "grad_norm": 0.04981003701686859, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243242383003235, | |
| "mean_token_accuracy": 0.7892241328954697, | |
| "num_tokens": 14954139.0, | |
| "step": 917 | |
| }, | |
| { | |
| "entropy": 0.5165708512067795, | |
| "epoch": 3.438202247191011, | |
| "grad_norm": 0.050371985882520676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5150896906852722, | |
| "mean_token_accuracy": 0.7927063405513763, | |
| "num_tokens": 14970507.0, | |
| "step": 918 | |
| }, | |
| { | |
| "entropy": 0.5134851261973381, | |
| "epoch": 3.4419475655430714, | |
| "grad_norm": 0.04879898577928543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5160987377166748, | |
| "mean_token_accuracy": 0.7906570881605148, | |
| "num_tokens": 14986812.0, | |
| "step": 919 | |
| }, | |
| { | |
| "entropy": 0.5135181546211243, | |
| "epoch": 3.445692883895131, | |
| "grad_norm": 0.05624324828386307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219361186027527, | |
| "mean_token_accuracy": 0.7903093546628952, | |
| "num_tokens": 15003179.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 0.5162501037120819, | |
| "epoch": 3.449438202247191, | |
| "grad_norm": 0.04822200909256935, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5126674175262451, | |
| "mean_token_accuracy": 0.7924687564373016, | |
| "num_tokens": 15019428.0, | |
| "step": 921 | |
| }, | |
| { | |
| "entropy": 0.5315191224217415, | |
| "epoch": 3.453183520599251, | |
| "grad_norm": 0.04490262269973755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248660445213318, | |
| "mean_token_accuracy": 0.7871098518371582, | |
| "num_tokens": 15035868.0, | |
| "step": 922 | |
| }, | |
| { | |
| "entropy": 0.5238284766674042, | |
| "epoch": 3.4569288389513106, | |
| "grad_norm": 0.051175910979509354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521578311920166, | |
| "mean_token_accuracy": 0.7883873879909515, | |
| "num_tokens": 15052303.0, | |
| "step": 923 | |
| }, | |
| { | |
| "entropy": 0.5168250873684883, | |
| "epoch": 3.460674157303371, | |
| "grad_norm": 0.046608321368694305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207570791244507, | |
| "mean_token_accuracy": 0.7900703996419907, | |
| "num_tokens": 15068618.0, | |
| "step": 924 | |
| }, | |
| { | |
| "entropy": 0.5313585698604584, | |
| "epoch": 3.464419475655431, | |
| "grad_norm": 0.049307819455862045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298991203308105, | |
| "mean_token_accuracy": 0.7864013016223907, | |
| "num_tokens": 15084957.0, | |
| "step": 925 | |
| }, | |
| { | |
| "entropy": 0.5185838490724564, | |
| "epoch": 3.4681647940074907, | |
| "grad_norm": 0.05639752745628357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5251802802085876, | |
| "mean_token_accuracy": 0.787624716758728, | |
| "num_tokens": 15101189.0, | |
| "step": 926 | |
| }, | |
| { | |
| "entropy": 0.515865795314312, | |
| "epoch": 3.4719101123595504, | |
| "grad_norm": 0.05554183945059776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.518955647945404, | |
| "mean_token_accuracy": 0.7888496518135071, | |
| "num_tokens": 15117511.0, | |
| "step": 927 | |
| }, | |
| { | |
| "entropy": 0.5173558592796326, | |
| "epoch": 3.4756554307116105, | |
| "grad_norm": 0.051211338490247726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5185026526451111, | |
| "mean_token_accuracy": 0.7890340387821198, | |
| "num_tokens": 15133719.0, | |
| "step": 928 | |
| }, | |
| { | |
| "entropy": 0.520257018506527, | |
| "epoch": 3.4794007490636703, | |
| "grad_norm": 0.055278245359659195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5183354616165161, | |
| "mean_token_accuracy": 0.7902627289295197, | |
| "num_tokens": 15149922.0, | |
| "step": 929 | |
| }, | |
| { | |
| "entropy": 0.515156589448452, | |
| "epoch": 3.4831460674157304, | |
| "grad_norm": 0.05468440055847168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5097793340682983, | |
| "mean_token_accuracy": 0.7964832186698914, | |
| "num_tokens": 15166020.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 0.521842934191227, | |
| "epoch": 3.48689138576779, | |
| "grad_norm": 0.04573323577642441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5174736380577087, | |
| "mean_token_accuracy": 0.7907158583402634, | |
| "num_tokens": 15182296.0, | |
| "step": 931 | |
| }, | |
| { | |
| "entropy": 0.5367195308208466, | |
| "epoch": 3.4906367041198503, | |
| "grad_norm": 0.05060438811779022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360324382781982, | |
| "mean_token_accuracy": 0.7832886576652527, | |
| "num_tokens": 15198618.0, | |
| "step": 932 | |
| }, | |
| { | |
| "entropy": 0.5351738333702087, | |
| "epoch": 3.49438202247191, | |
| "grad_norm": 0.04796265438199043, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342084765434265, | |
| "mean_token_accuracy": 0.7837437838315964, | |
| "num_tokens": 15215125.0, | |
| "step": 933 | |
| }, | |
| { | |
| "entropy": 0.5210021957755089, | |
| "epoch": 3.49812734082397, | |
| "grad_norm": 0.05278978869318962, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260420441627502, | |
| "mean_token_accuracy": 0.7890212833881378, | |
| "num_tokens": 15231335.0, | |
| "step": 934 | |
| }, | |
| { | |
| "entropy": 0.5361146479845047, | |
| "epoch": 3.50187265917603, | |
| "grad_norm": 0.05599920451641083, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407608151435852, | |
| "mean_token_accuracy": 0.7809196263551712, | |
| "num_tokens": 15247587.0, | |
| "step": 935 | |
| }, | |
| { | |
| "entropy": 0.5127650052309036, | |
| "epoch": 3.50561797752809, | |
| "grad_norm": 0.053348250687122345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5172818303108215, | |
| "mean_token_accuracy": 0.7908589243888855, | |
| "num_tokens": 15263983.0, | |
| "step": 936 | |
| }, | |
| { | |
| "entropy": 0.5113075897097588, | |
| "epoch": 3.5093632958801497, | |
| "grad_norm": 0.047283098101615906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5094785690307617, | |
| "mean_token_accuracy": 0.7913675010204315, | |
| "num_tokens": 15280172.0, | |
| "step": 937 | |
| }, | |
| { | |
| "entropy": 0.5144875794649124, | |
| "epoch": 3.51310861423221, | |
| "grad_norm": 0.05150860175490379, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5117542743682861, | |
| "mean_token_accuracy": 0.7926830351352692, | |
| "num_tokens": 15296278.0, | |
| "step": 938 | |
| }, | |
| { | |
| "entropy": 0.5282381922006607, | |
| "epoch": 3.5168539325842696, | |
| "grad_norm": 0.05235690623521805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275253653526306, | |
| "mean_token_accuracy": 0.787050798535347, | |
| "num_tokens": 15312737.0, | |
| "step": 939 | |
| }, | |
| { | |
| "entropy": 0.5191426128149033, | |
| "epoch": 3.5205992509363297, | |
| "grad_norm": 0.05214005708694458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5218259692192078, | |
| "mean_token_accuracy": 0.7854390293359756, | |
| "num_tokens": 15329171.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 0.488400898873806, | |
| "epoch": 3.5243445692883895, | |
| "grad_norm": 0.05028095468878746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.49238866567611694, | |
| "mean_token_accuracy": 0.8010139167308807, | |
| "num_tokens": 15345040.0, | |
| "step": 941 | |
| }, | |
| { | |
| "entropy": 0.530989944934845, | |
| "epoch": 3.5280898876404496, | |
| "grad_norm": 0.05137421563267708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5283138155937195, | |
| "mean_token_accuracy": 0.7872757613658905, | |
| "num_tokens": 15361506.0, | |
| "step": 942 | |
| }, | |
| { | |
| "entropy": 0.5166791379451752, | |
| "epoch": 3.5318352059925093, | |
| "grad_norm": 0.05064837634563446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200411677360535, | |
| "mean_token_accuracy": 0.7893417179584503, | |
| "num_tokens": 15377725.0, | |
| "step": 943 | |
| }, | |
| { | |
| "entropy": 0.5225488543510437, | |
| "epoch": 3.535580524344569, | |
| "grad_norm": 0.05224663019180298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5252619981765747, | |
| "mean_token_accuracy": 0.7887216210365295, | |
| "num_tokens": 15394073.0, | |
| "step": 944 | |
| }, | |
| { | |
| "entropy": 0.5133933499455452, | |
| "epoch": 3.539325842696629, | |
| "grad_norm": 0.054900407791137695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5187044143676758, | |
| "mean_token_accuracy": 0.7941587567329407, | |
| "num_tokens": 15410326.0, | |
| "step": 945 | |
| }, | |
| { | |
| "entropy": 0.5217478722333908, | |
| "epoch": 3.5430711610486894, | |
| "grad_norm": 0.05068376660346985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203924179077148, | |
| "mean_token_accuracy": 0.7903146594762802, | |
| "num_tokens": 15426695.0, | |
| "step": 946 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1335, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.437096036035199e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |