Instructions to use eac123/sublim-phase4-combo-06 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/sublim-phase4-combo-06 with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/sublim-phase4-combo-06")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 804, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1324340403079987, | |
| "epoch": 0.0037313432835820895, | |
| "grad_norm": 1.6067556142807007, | |
| "learning_rate": 0.0002, | |
| "loss": 2.4804701805114746, | |
| "mean_token_accuracy": 0.5353229343891144, | |
| "num_tokens": 16370.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2276706099510193, | |
| "epoch": 0.007462686567164179, | |
| "grad_norm": 1.4987447261810303, | |
| "learning_rate": 0.0002, | |
| "loss": 2.135417938232422, | |
| "mean_token_accuracy": 0.5693617165088654, | |
| "num_tokens": 33043.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.4045527577400208, | |
| "epoch": 0.011194029850746268, | |
| "grad_norm": 1.1359604597091675, | |
| "learning_rate": 0.0002, | |
| "loss": 1.72599196434021, | |
| "mean_token_accuracy": 0.5919849574565887, | |
| "num_tokens": 49458.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.3863026201725006, | |
| "epoch": 0.014925373134328358, | |
| "grad_norm": 0.9200887084007263, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4096770286560059, | |
| "mean_token_accuracy": 0.6369052678346634, | |
| "num_tokens": 65795.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.331774890422821, | |
| "epoch": 0.018656716417910446, | |
| "grad_norm": 1.2737244367599487, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2862391471862793, | |
| "mean_token_accuracy": 0.6422256380319595, | |
| "num_tokens": 82033.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.2540993690490723, | |
| "epoch": 0.022388059701492536, | |
| "grad_norm": 0.6736201643943787, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1756055355072021, | |
| "mean_token_accuracy": 0.6605449765920639, | |
| "num_tokens": 97997.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.169641524553299, | |
| "epoch": 0.026119402985074626, | |
| "grad_norm": 0.3927549719810486, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1019014120101929, | |
| "mean_token_accuracy": 0.6672378480434418, | |
| "num_tokens": 114186.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.0887874066829681, | |
| "epoch": 0.029850746268656716, | |
| "grad_norm": 0.4364261329174042, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0323972702026367, | |
| "mean_token_accuracy": 0.6782350987195969, | |
| "num_tokens": 130751.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.0042430609464645, | |
| "epoch": 0.033582089552238806, | |
| "grad_norm": 0.5108282566070557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9582932591438293, | |
| "mean_token_accuracy": 0.692020371556282, | |
| "num_tokens": 147264.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9632741063833237, | |
| "epoch": 0.03731343283582089, | |
| "grad_norm": 0.4669722616672516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8919203877449036, | |
| "mean_token_accuracy": 0.7046539932489395, | |
| "num_tokens": 163507.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.9305494576692581, | |
| "epoch": 0.041044776119402986, | |
| "grad_norm": 0.4794766902923584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8569780588150024, | |
| "mean_token_accuracy": 0.7103458344936371, | |
| "num_tokens": 179680.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.8464002013206482, | |
| "epoch": 0.04477611940298507, | |
| "grad_norm": 0.396366685628891, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7772667407989502, | |
| "mean_token_accuracy": 0.7248742878437042, | |
| "num_tokens": 196084.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.8053079694509506, | |
| "epoch": 0.048507462686567165, | |
| "grad_norm": 3.4283485412597656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7701212763786316, | |
| "mean_token_accuracy": 0.7237996459007263, | |
| "num_tokens": 212421.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.7701881229877472, | |
| "epoch": 0.05223880597014925, | |
| "grad_norm": 0.4621308147907257, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7581663727760315, | |
| "mean_token_accuracy": 0.725386381149292, | |
| "num_tokens": 228835.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7058936208486557, | |
| "epoch": 0.055970149253731345, | |
| "grad_norm": 0.45394617319107056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7281949520111084, | |
| "mean_token_accuracy": 0.731869712471962, | |
| "num_tokens": 245106.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.7007950246334076, | |
| "epoch": 0.05970149253731343, | |
| "grad_norm": 0.38048553466796875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6906558871269226, | |
| "mean_token_accuracy": 0.7422550022602081, | |
| "num_tokens": 261510.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.6775622367858887, | |
| "epoch": 0.06343283582089553, | |
| "grad_norm": 0.3588451147079468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6660153865814209, | |
| "mean_token_accuracy": 0.7494668215513229, | |
| "num_tokens": 278002.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6844813376665115, | |
| "epoch": 0.06716417910447761, | |
| "grad_norm": 0.34310266375541687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6606006026268005, | |
| "mean_token_accuracy": 0.745672732591629, | |
| "num_tokens": 294482.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6752376109361649, | |
| "epoch": 0.0708955223880597, | |
| "grad_norm": 0.3563651740550995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6529812216758728, | |
| "mean_token_accuracy": 0.7467419356107712, | |
| "num_tokens": 310804.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.655072346329689, | |
| "epoch": 0.07462686567164178, | |
| "grad_norm": 0.30358463525772095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6404100656509399, | |
| "mean_token_accuracy": 0.7505071759223938, | |
| "num_tokens": 327252.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6286358386278152, | |
| "epoch": 0.07835820895522388, | |
| "grad_norm": 0.30567091703414917, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6207510232925415, | |
| "mean_token_accuracy": 0.7580177336931229, | |
| "num_tokens": 343737.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.6086345314979553, | |
| "epoch": 0.08208955223880597, | |
| "grad_norm": 0.27747389674186707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6111672520637512, | |
| "mean_token_accuracy": 0.760840117931366, | |
| "num_tokens": 359961.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.5925645977258682, | |
| "epoch": 0.08582089552238806, | |
| "grad_norm": 0.25484028458595276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5915433168411255, | |
| "mean_token_accuracy": 0.7686687558889389, | |
| "num_tokens": 376034.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.6192648261785507, | |
| "epoch": 0.08955223880597014, | |
| "grad_norm": 0.2309548258781433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6154056787490845, | |
| "mean_token_accuracy": 0.7575328648090363, | |
| "num_tokens": 392454.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.6046310663223267, | |
| "epoch": 0.09328358208955224, | |
| "grad_norm": 0.24919550120830536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5856317281723022, | |
| "mean_token_accuracy": 0.769055038690567, | |
| "num_tokens": 408673.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.6073041707277298, | |
| "epoch": 0.09701492537313433, | |
| "grad_norm": 0.22897422313690186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6000080108642578, | |
| "mean_token_accuracy": 0.7657780200242996, | |
| "num_tokens": 425147.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.5694791227579117, | |
| "epoch": 0.10074626865671642, | |
| "grad_norm": 0.26130226254463196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651018619537354, | |
| "mean_token_accuracy": 0.7780718505382538, | |
| "num_tokens": 441676.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.5705035477876663, | |
| "epoch": 0.1044776119402985, | |
| "grad_norm": 0.2569018304347992, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5736910700798035, | |
| "mean_token_accuracy": 0.7736188471317291, | |
| "num_tokens": 457862.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.5686106830835342, | |
| "epoch": 0.10820895522388059, | |
| "grad_norm": 0.24455995857715607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5789230465888977, | |
| "mean_token_accuracy": 0.7694863677024841, | |
| "num_tokens": 473929.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.5674358904361725, | |
| "epoch": 0.11194029850746269, | |
| "grad_norm": 0.2457604557275772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.581587553024292, | |
| "mean_token_accuracy": 0.7700542360544205, | |
| "num_tokens": 490261.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5924967974424362, | |
| "epoch": 0.11567164179104478, | |
| "grad_norm": 0.24704386293888092, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5963209271430969, | |
| "mean_token_accuracy": 0.7627938687801361, | |
| "num_tokens": 506614.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.5728770643472672, | |
| "epoch": 0.11940298507462686, | |
| "grad_norm": 0.24360406398773193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.570555567741394, | |
| "mean_token_accuracy": 0.7713408023118973, | |
| "num_tokens": 523175.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5846883952617645, | |
| "epoch": 0.12313432835820895, | |
| "grad_norm": 0.20197518169879913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5723189115524292, | |
| "mean_token_accuracy": 0.7742884606122971, | |
| "num_tokens": 539383.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5598815232515335, | |
| "epoch": 0.12686567164179105, | |
| "grad_norm": 0.25282159447669983, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645520687103271, | |
| "mean_token_accuracy": 0.7759677618741989, | |
| "num_tokens": 555484.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.5746279805898666, | |
| "epoch": 0.13059701492537312, | |
| "grad_norm": 0.20525087416172028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5774482488632202, | |
| "mean_token_accuracy": 0.7711690366268158, | |
| "num_tokens": 572050.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.5689367800951004, | |
| "epoch": 0.13432835820895522, | |
| "grad_norm": 0.2016289383172989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688468217849731, | |
| "mean_token_accuracy": 0.7752531915903091, | |
| "num_tokens": 588229.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5673371106386185, | |
| "epoch": 0.13805970149253732, | |
| "grad_norm": 0.20251700282096863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5676092505455017, | |
| "mean_token_accuracy": 0.7740599513053894, | |
| "num_tokens": 604842.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5538036525249481, | |
| "epoch": 0.1417910447761194, | |
| "grad_norm": 0.18855363130569458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636182427406311, | |
| "mean_token_accuracy": 0.7732492536306381, | |
| "num_tokens": 621334.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5772293359041214, | |
| "epoch": 0.1455223880597015, | |
| "grad_norm": 0.1829119771718979, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5749870538711548, | |
| "mean_token_accuracy": 0.7699291855096817, | |
| "num_tokens": 637861.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5583464652299881, | |
| "epoch": 0.14925373134328357, | |
| "grad_norm": 0.16470657289028168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537322163581848, | |
| "mean_token_accuracy": 0.7790806740522385, | |
| "num_tokens": 653894.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5681058615446091, | |
| "epoch": 0.15298507462686567, | |
| "grad_norm": 0.17573200166225433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643278360366821, | |
| "mean_token_accuracy": 0.7733141183853149, | |
| "num_tokens": 670015.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.566686749458313, | |
| "epoch": 0.15671641791044777, | |
| "grad_norm": 0.16218754649162292, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597659945487976, | |
| "mean_token_accuracy": 0.7758253067731857, | |
| "num_tokens": 686056.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5558898448944092, | |
| "epoch": 0.16044776119402984, | |
| "grad_norm": 0.18278591334819794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558386504650116, | |
| "mean_token_accuracy": 0.7759624123573303, | |
| "num_tokens": 702659.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5585661381483078, | |
| "epoch": 0.16417910447761194, | |
| "grad_norm": 0.17696230113506317, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5635029673576355, | |
| "mean_token_accuracy": 0.7751695066690445, | |
| "num_tokens": 718850.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5506571680307388, | |
| "epoch": 0.16791044776119404, | |
| "grad_norm": 0.1652524471282959, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565558671951294, | |
| "mean_token_accuracy": 0.7778312116861343, | |
| "num_tokens": 735246.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5514795780181885, | |
| "epoch": 0.17164179104477612, | |
| "grad_norm": 0.18487824499607086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487773418426514, | |
| "mean_token_accuracy": 0.7793762385845184, | |
| "num_tokens": 751565.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5588273853063583, | |
| "epoch": 0.17537313432835822, | |
| "grad_norm": 0.19246406853199005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596141219139099, | |
| "mean_token_accuracy": 0.7778225541114807, | |
| "num_tokens": 767932.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5591737627983093, | |
| "epoch": 0.1791044776119403, | |
| "grad_norm": 0.15891006588935852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638841390609741, | |
| "mean_token_accuracy": 0.7727467268705368, | |
| "num_tokens": 784014.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5501811355352402, | |
| "epoch": 0.1828358208955224, | |
| "grad_norm": 0.16706983745098114, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501376986503601, | |
| "mean_token_accuracy": 0.7761423140764236, | |
| "num_tokens": 800374.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5606948286294937, | |
| "epoch": 0.1865671641791045, | |
| "grad_norm": 0.17230357229709625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634580850601196, | |
| "mean_token_accuracy": 0.7727725654840469, | |
| "num_tokens": 816520.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5541675686836243, | |
| "epoch": 0.19029850746268656, | |
| "grad_norm": 0.1744348555803299, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5664834380149841, | |
| "mean_token_accuracy": 0.7722806632518768, | |
| "num_tokens": 832574.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5447754859924316, | |
| "epoch": 0.19402985074626866, | |
| "grad_norm": 0.1993291825056076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500599145889282, | |
| "mean_token_accuracy": 0.7812339067459106, | |
| "num_tokens": 848524.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.55513696372509, | |
| "epoch": 0.19776119402985073, | |
| "grad_norm": 0.18667836487293243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566352605819702, | |
| "mean_token_accuracy": 0.7776180505752563, | |
| "num_tokens": 864701.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.5591137707233429, | |
| "epoch": 0.20149253731343283, | |
| "grad_norm": 0.1556427925825119, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615472197532654, | |
| "mean_token_accuracy": 0.7761439085006714, | |
| "num_tokens": 881019.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5678103417158127, | |
| "epoch": 0.20522388059701493, | |
| "grad_norm": 0.176001638174057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604614615440369, | |
| "mean_token_accuracy": 0.7737350314855576, | |
| "num_tokens": 897731.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5736003369092941, | |
| "epoch": 0.208955223880597, | |
| "grad_norm": 0.17963656783103943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5741879940032959, | |
| "mean_token_accuracy": 0.7709980905056, | |
| "num_tokens": 914031.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5704395622014999, | |
| "epoch": 0.2126865671641791, | |
| "grad_norm": 0.15910783410072327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.571160078048706, | |
| "mean_token_accuracy": 0.7722027599811554, | |
| "num_tokens": 930606.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.5746669173240662, | |
| "epoch": 0.21641791044776118, | |
| "grad_norm": 0.15874247252941132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5674406886100769, | |
| "mean_token_accuracy": 0.7708650529384613, | |
| "num_tokens": 947244.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5582200437784195, | |
| "epoch": 0.22014925373134328, | |
| "grad_norm": 0.16829723119735718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581406950950623, | |
| "mean_token_accuracy": 0.7757681459188461, | |
| "num_tokens": 963619.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5504408478736877, | |
| "epoch": 0.22388059701492538, | |
| "grad_norm": 0.14540037512779236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557159781455994, | |
| "mean_token_accuracy": 0.776930645108223, | |
| "num_tokens": 980040.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5402641594409943, | |
| "epoch": 0.22761194029850745, | |
| "grad_norm": 0.14897902309894562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523658394813538, | |
| "mean_token_accuracy": 0.7773705869913101, | |
| "num_tokens": 996383.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5391396135091782, | |
| "epoch": 0.23134328358208955, | |
| "grad_norm": 0.16873425245285034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509910583496094, | |
| "mean_token_accuracy": 0.7777218073606491, | |
| "num_tokens": 1012664.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5582114011049271, | |
| "epoch": 0.23507462686567165, | |
| "grad_norm": 0.1502108871936798, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559942126274109, | |
| "mean_token_accuracy": 0.7745993584394455, | |
| "num_tokens": 1029022.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5812249481678009, | |
| "epoch": 0.23880597014925373, | |
| "grad_norm": 0.13852274417877197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5768259167671204, | |
| "mean_token_accuracy": 0.766035184264183, | |
| "num_tokens": 1045337.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.555647611618042, | |
| "epoch": 0.24253731343283583, | |
| "grad_norm": 0.1643349826335907, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524765849113464, | |
| "mean_token_accuracy": 0.7790125608444214, | |
| "num_tokens": 1061843.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5712831914424896, | |
| "epoch": 0.2462686567164179, | |
| "grad_norm": 0.1458103060722351, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5671954154968262, | |
| "mean_token_accuracy": 0.7726651430130005, | |
| "num_tokens": 1078313.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.548685610294342, | |
| "epoch": 0.25, | |
| "grad_norm": 0.13704419136047363, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478826761245728, | |
| "mean_token_accuracy": 0.7788915038108826, | |
| "num_tokens": 1094803.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5427667200565338, | |
| "epoch": 0.2537313432835821, | |
| "grad_norm": 0.16616535186767578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495492815971375, | |
| "mean_token_accuracy": 0.7795749753713608, | |
| "num_tokens": 1111058.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5463619232177734, | |
| "epoch": 0.2574626865671642, | |
| "grad_norm": 0.1541680544614792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557973980903625, | |
| "mean_token_accuracy": 0.7797737270593643, | |
| "num_tokens": 1127187.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5503609925508499, | |
| "epoch": 0.26119402985074625, | |
| "grad_norm": 0.16344738006591797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560310482978821, | |
| "mean_token_accuracy": 0.7764633148908615, | |
| "num_tokens": 1143517.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.564177006483078, | |
| "epoch": 0.26492537313432835, | |
| "grad_norm": 0.1369864046573639, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619618892669678, | |
| "mean_token_accuracy": 0.774873822927475, | |
| "num_tokens": 1160191.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5624472498893738, | |
| "epoch": 0.26865671641791045, | |
| "grad_norm": 0.16099311411380768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546153783798218, | |
| "mean_token_accuracy": 0.7775298207998276, | |
| "num_tokens": 1176379.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5442378669977188, | |
| "epoch": 0.27238805970149255, | |
| "grad_norm": 0.18382063508033752, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439026951789856, | |
| "mean_token_accuracy": 0.7808986604213715, | |
| "num_tokens": 1192611.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5539779812097549, | |
| "epoch": 0.27611940298507465, | |
| "grad_norm": 0.14527475833892822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488794445991516, | |
| "mean_token_accuracy": 0.7770136892795563, | |
| "num_tokens": 1209218.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5399174243211746, | |
| "epoch": 0.2798507462686567, | |
| "grad_norm": 0.16744667291641235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474289059638977, | |
| "mean_token_accuracy": 0.7779674381017685, | |
| "num_tokens": 1225760.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.5410275682806969, | |
| "epoch": 0.2835820895522388, | |
| "grad_norm": 0.1709633320569992, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548405110836029, | |
| "mean_token_accuracy": 0.7785314917564392, | |
| "num_tokens": 1242263.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5613621175289154, | |
| "epoch": 0.2873134328358209, | |
| "grad_norm": 0.13462653756141663, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592188835144043, | |
| "mean_token_accuracy": 0.7736580222845078, | |
| "num_tokens": 1258802.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5370856672525406, | |
| "epoch": 0.291044776119403, | |
| "grad_norm": 0.14010556042194366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362333655357361, | |
| "mean_token_accuracy": 0.7829223275184631, | |
| "num_tokens": 1274985.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.5476308465003967, | |
| "epoch": 0.2947761194029851, | |
| "grad_norm": 0.14489887654781342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549788236618042, | |
| "mean_token_accuracy": 0.7797223776578903, | |
| "num_tokens": 1291341.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5441256165504456, | |
| "epoch": 0.29850746268656714, | |
| "grad_norm": 0.14331087470054626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457456111907959, | |
| "mean_token_accuracy": 0.7812238931655884, | |
| "num_tokens": 1307441.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5347439795732498, | |
| "epoch": 0.30223880597014924, | |
| "grad_norm": 0.13690398633480072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451613068580627, | |
| "mean_token_accuracy": 0.7763567119836807, | |
| "num_tokens": 1323409.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5473417937755585, | |
| "epoch": 0.30597014925373134, | |
| "grad_norm": 0.16063734889030457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565767288208008, | |
| "mean_token_accuracy": 0.7768999934196472, | |
| "num_tokens": 1339750.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5419514924287796, | |
| "epoch": 0.30970149253731344, | |
| "grad_norm": 0.16186301410198212, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480918288230896, | |
| "mean_token_accuracy": 0.7810427248477936, | |
| "num_tokens": 1355977.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5665269196033478, | |
| "epoch": 0.31343283582089554, | |
| "grad_norm": 0.14284147322177887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600348711013794, | |
| "mean_token_accuracy": 0.7740004658699036, | |
| "num_tokens": 1372396.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5530648082494736, | |
| "epoch": 0.31716417910447764, | |
| "grad_norm": 0.1373152732849121, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547944962978363, | |
| "mean_token_accuracy": 0.7793020755052567, | |
| "num_tokens": 1388474.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5625097453594208, | |
| "epoch": 0.3208955223880597, | |
| "grad_norm": 0.1248691976070404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582663416862488, | |
| "mean_token_accuracy": 0.7758172750473022, | |
| "num_tokens": 1404880.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5460606664419174, | |
| "epoch": 0.3246268656716418, | |
| "grad_norm": 0.16231709718704224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510202646255493, | |
| "mean_token_accuracy": 0.7779169529676437, | |
| "num_tokens": 1421168.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5403235554695129, | |
| "epoch": 0.3283582089552239, | |
| "grad_norm": 0.15352240204811096, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474361181259155, | |
| "mean_token_accuracy": 0.7786824256181717, | |
| "num_tokens": 1437433.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.550665482878685, | |
| "epoch": 0.332089552238806, | |
| "grad_norm": 0.17033375799655914, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535221695899963, | |
| "mean_token_accuracy": 0.7792181968688965, | |
| "num_tokens": 1453476.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.563551127910614, | |
| "epoch": 0.3358208955223881, | |
| "grad_norm": 0.13113154470920563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608611106872559, | |
| "mean_token_accuracy": 0.7760418206453323, | |
| "num_tokens": 1469909.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5737572461366653, | |
| "epoch": 0.33955223880597013, | |
| "grad_norm": 0.12551374733448029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643397569656372, | |
| "mean_token_accuracy": 0.7728746980428696, | |
| "num_tokens": 1486426.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5659501850605011, | |
| "epoch": 0.34328358208955223, | |
| "grad_norm": 0.15791846811771393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704576969146729, | |
| "mean_token_accuracy": 0.7684866786003113, | |
| "num_tokens": 1502522.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5568918883800507, | |
| "epoch": 0.34701492537313433, | |
| "grad_norm": 0.14071005582809448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559943437576294, | |
| "mean_token_accuracy": 0.7734934538602829, | |
| "num_tokens": 1518718.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5584161728620529, | |
| "epoch": 0.35074626865671643, | |
| "grad_norm": 0.14257407188415527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574990510940552, | |
| "mean_token_accuracy": 0.7743052095174789, | |
| "num_tokens": 1534997.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5583510845899582, | |
| "epoch": 0.35447761194029853, | |
| "grad_norm": 0.13653768599033356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597235560417175, | |
| "mean_token_accuracy": 0.7758298218250275, | |
| "num_tokens": 1551457.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5537077486515045, | |
| "epoch": 0.3582089552238806, | |
| "grad_norm": 0.14674222469329834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539477467536926, | |
| "mean_token_accuracy": 0.7744529694318771, | |
| "num_tokens": 1567731.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.5472210198640823, | |
| "epoch": 0.3619402985074627, | |
| "grad_norm": 0.1276751160621643, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464935898780823, | |
| "mean_token_accuracy": 0.7826344817876816, | |
| "num_tokens": 1584021.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5479029715061188, | |
| "epoch": 0.3656716417910448, | |
| "grad_norm": 0.16119465231895447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547060966491699, | |
| "mean_token_accuracy": 0.7760697901248932, | |
| "num_tokens": 1600533.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5536443293094635, | |
| "epoch": 0.3694029850746269, | |
| "grad_norm": 0.12991106510162354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573412775993347, | |
| "mean_token_accuracy": 0.7744511961936951, | |
| "num_tokens": 1616690.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5505102574825287, | |
| "epoch": 0.373134328358209, | |
| "grad_norm": 0.1364317238330841, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571202635765076, | |
| "mean_token_accuracy": 0.7761907130479813, | |
| "num_tokens": 1632957.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5503265261650085, | |
| "epoch": 0.376865671641791, | |
| "grad_norm": 0.14918965101242065, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452536344528198, | |
| "mean_token_accuracy": 0.7773023992776871, | |
| "num_tokens": 1649397.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5523863285779953, | |
| "epoch": 0.3805970149253731, | |
| "grad_norm": 0.14225420355796814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425117611885071, | |
| "mean_token_accuracy": 0.7800490856170654, | |
| "num_tokens": 1665876.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5518430918455124, | |
| "epoch": 0.3843283582089552, | |
| "grad_norm": 0.12764710187911987, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529345870018005, | |
| "mean_token_accuracy": 0.7768139094114304, | |
| "num_tokens": 1682296.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.5581493228673935, | |
| "epoch": 0.3880597014925373, | |
| "grad_norm": 0.16170883178710938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5702566504478455, | |
| "mean_token_accuracy": 0.7671579420566559, | |
| "num_tokens": 1698550.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.558798760175705, | |
| "epoch": 0.3917910447761194, | |
| "grad_norm": 0.14736565947532654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634024143218994, | |
| "mean_token_accuracy": 0.7718724012374878, | |
| "num_tokens": 1714882.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5496668964624405, | |
| "epoch": 0.39552238805970147, | |
| "grad_norm": 0.150962695479393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452749133110046, | |
| "mean_token_accuracy": 0.7789688110351562, | |
| "num_tokens": 1731436.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5397633910179138, | |
| "epoch": 0.39925373134328357, | |
| "grad_norm": 0.12951846420764923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374678373336792, | |
| "mean_token_accuracy": 0.7823840379714966, | |
| "num_tokens": 1747667.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5504965782165527, | |
| "epoch": 0.40298507462686567, | |
| "grad_norm": 0.1469883769750595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489968061447144, | |
| "mean_token_accuracy": 0.7779988348484039, | |
| "num_tokens": 1763956.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5401955544948578, | |
| "epoch": 0.40671641791044777, | |
| "grad_norm": 0.14114412665367126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469740033149719, | |
| "mean_token_accuracy": 0.7791216820478439, | |
| "num_tokens": 1780050.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.5623095035552979, | |
| "epoch": 0.41044776119402987, | |
| "grad_norm": 0.12923510372638702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578881502151489, | |
| "mean_token_accuracy": 0.7777072787284851, | |
| "num_tokens": 1796820.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5413771942257881, | |
| "epoch": 0.4141791044776119, | |
| "grad_norm": 0.1528160274028778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452436208724976, | |
| "mean_token_accuracy": 0.7776108086109161, | |
| "num_tokens": 1813232.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.5609131902456284, | |
| "epoch": 0.417910447761194, | |
| "grad_norm": 0.12400584667921066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644053816795349, | |
| "mean_token_accuracy": 0.7719212174415588, | |
| "num_tokens": 1829542.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.543258398771286, | |
| "epoch": 0.4216417910447761, | |
| "grad_norm": 0.11892957985401154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409727692604065, | |
| "mean_token_accuracy": 0.7800008654594421, | |
| "num_tokens": 1845855.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5490185469388962, | |
| "epoch": 0.4253731343283582, | |
| "grad_norm": 0.1497296690940857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536864995956421, | |
| "mean_token_accuracy": 0.7792476564645767, | |
| "num_tokens": 1862087.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.53768490254879, | |
| "epoch": 0.4291044776119403, | |
| "grad_norm": 0.13764707744121552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394353866577148, | |
| "mean_token_accuracy": 0.7829310894012451, | |
| "num_tokens": 1878496.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.548382118344307, | |
| "epoch": 0.43283582089552236, | |
| "grad_norm": 0.1350480020046234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588696002960205, | |
| "mean_token_accuracy": 0.773399829864502, | |
| "num_tokens": 1894649.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5273909568786621, | |
| "epoch": 0.43656716417910446, | |
| "grad_norm": 0.1509886085987091, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329999923706055, | |
| "mean_token_accuracy": 0.7835660129785538, | |
| "num_tokens": 1910828.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5727127343416214, | |
| "epoch": 0.44029850746268656, | |
| "grad_norm": 0.12369527667760849, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5647591948509216, | |
| "mean_token_accuracy": 0.7721648663282394, | |
| "num_tokens": 1927319.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5657652169466019, | |
| "epoch": 0.44402985074626866, | |
| "grad_norm": 0.14263150095939636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5616084337234497, | |
| "mean_token_accuracy": 0.7732421457767487, | |
| "num_tokens": 1943783.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5638687461614609, | |
| "epoch": 0.44776119402985076, | |
| "grad_norm": 0.11849121749401093, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577123165130615, | |
| "mean_token_accuracy": 0.7739600390195847, | |
| "num_tokens": 1960125.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5605282336473465, | |
| "epoch": 0.45149253731343286, | |
| "grad_norm": 0.1323515772819519, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557800829410553, | |
| "mean_token_accuracy": 0.7727965116500854, | |
| "num_tokens": 1976458.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5336878746747971, | |
| "epoch": 0.4552238805970149, | |
| "grad_norm": 0.14154070615768433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429147481918335, | |
| "mean_token_accuracy": 0.7805563360452652, | |
| "num_tokens": 1992835.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5291022211313248, | |
| "epoch": 0.458955223880597, | |
| "grad_norm": 0.15199723839759827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432179570198059, | |
| "mean_token_accuracy": 0.7801262736320496, | |
| "num_tokens": 2008972.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.551175132393837, | |
| "epoch": 0.4626865671641791, | |
| "grad_norm": 0.11983563005924225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541180968284607, | |
| "mean_token_accuracy": 0.7762188464403152, | |
| "num_tokens": 2025359.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5533900856971741, | |
| "epoch": 0.4664179104477612, | |
| "grad_norm": 0.11737282574176788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463876724243164, | |
| "mean_token_accuracy": 0.7790547609329224, | |
| "num_tokens": 2041643.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5509413182735443, | |
| "epoch": 0.4701492537313433, | |
| "grad_norm": 0.13276953995227814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425540208816528, | |
| "mean_token_accuracy": 0.7806166559457779, | |
| "num_tokens": 2057820.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.5531751215457916, | |
| "epoch": 0.47388059701492535, | |
| "grad_norm": 0.12553741037845612, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523180961608887, | |
| "mean_token_accuracy": 0.7784822881221771, | |
| "num_tokens": 2074179.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.546363577246666, | |
| "epoch": 0.47761194029850745, | |
| "grad_norm": 0.13337954878807068, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551460981369019, | |
| "mean_token_accuracy": 0.7742737084627151, | |
| "num_tokens": 2090654.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5285965204238892, | |
| "epoch": 0.48134328358208955, | |
| "grad_norm": 0.13400429487228394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407966375350952, | |
| "mean_token_accuracy": 0.7815738469362259, | |
| "num_tokens": 2107063.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5335082858800888, | |
| "epoch": 0.48507462686567165, | |
| "grad_norm": 0.13302984833717346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388374328613281, | |
| "mean_token_accuracy": 0.7839466333389282, | |
| "num_tokens": 2123452.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.557282879948616, | |
| "epoch": 0.48880597014925375, | |
| "grad_norm": 0.13119758665561676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534148812294006, | |
| "mean_token_accuracy": 0.7738241106271744, | |
| "num_tokens": 2139585.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5428808927536011, | |
| "epoch": 0.4925373134328358, | |
| "grad_norm": 0.12375836819410324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381428003311157, | |
| "mean_token_accuracy": 0.7813713997602463, | |
| "num_tokens": 2155902.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5618433207273483, | |
| "epoch": 0.4962686567164179, | |
| "grad_norm": 0.13146650791168213, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552733838558197, | |
| "mean_token_accuracy": 0.7768221199512482, | |
| "num_tokens": 2172496.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5565268397331238, | |
| "epoch": 0.5, | |
| "grad_norm": 0.11766450107097626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559637546539307, | |
| "mean_token_accuracy": 0.7758495062589645, | |
| "num_tokens": 2188987.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5205433219671249, | |
| "epoch": 0.503731343283582, | |
| "grad_norm": 0.12712325155735016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5280570387840271, | |
| "mean_token_accuracy": 0.7863014787435532, | |
| "num_tokens": 2205010.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5373736917972565, | |
| "epoch": 0.5074626865671642, | |
| "grad_norm": 0.13094842433929443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430901050567627, | |
| "mean_token_accuracy": 0.780227467417717, | |
| "num_tokens": 2221474.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.5688028186559677, | |
| "epoch": 0.5111940298507462, | |
| "grad_norm": 0.1379985511302948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5740535855293274, | |
| "mean_token_accuracy": 0.7692983150482178, | |
| "num_tokens": 2238030.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5621554553508759, | |
| "epoch": 0.5149253731343284, | |
| "grad_norm": 0.13305246829986572, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573163032531738, | |
| "mean_token_accuracy": 0.7748852521181107, | |
| "num_tokens": 2254436.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5507737994194031, | |
| "epoch": 0.5186567164179104, | |
| "grad_norm": 0.12606868147850037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473536849021912, | |
| "mean_token_accuracy": 0.7785522937774658, | |
| "num_tokens": 2270806.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5534549057483673, | |
| "epoch": 0.5223880597014925, | |
| "grad_norm": 0.14390718936920166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571063756942749, | |
| "mean_token_accuracy": 0.7750511020421982, | |
| "num_tokens": 2286975.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5419649630784988, | |
| "epoch": 0.5261194029850746, | |
| "grad_norm": 0.13526654243469238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507834553718567, | |
| "mean_token_accuracy": 0.7767505496740341, | |
| "num_tokens": 2303373.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5532436519861221, | |
| "epoch": 0.5298507462686567, | |
| "grad_norm": 0.1307537853717804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537344813346863, | |
| "mean_token_accuracy": 0.7779698222875595, | |
| "num_tokens": 2319833.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5443145930767059, | |
| "epoch": 0.5335820895522388, | |
| "grad_norm": 0.12360236793756485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414459109306335, | |
| "mean_token_accuracy": 0.7796581238508224, | |
| "num_tokens": 2336100.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5436644405126572, | |
| "epoch": 0.5373134328358209, | |
| "grad_norm": 0.13813567161560059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399284362792969, | |
| "mean_token_accuracy": 0.781887099146843, | |
| "num_tokens": 2352431.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.554161787033081, | |
| "epoch": 0.5410447761194029, | |
| "grad_norm": 0.1234111338853836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504522323608398, | |
| "mean_token_accuracy": 0.7768333554267883, | |
| "num_tokens": 2368781.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.540039673447609, | |
| "epoch": 0.5447761194029851, | |
| "grad_norm": 0.12760984897613525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470931529998779, | |
| "mean_token_accuracy": 0.7785885185003281, | |
| "num_tokens": 2385030.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.538455605506897, | |
| "epoch": 0.5485074626865671, | |
| "grad_norm": 0.11708244681358337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540416419506073, | |
| "mean_token_accuracy": 0.782222330570221, | |
| "num_tokens": 2401529.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5445697456598282, | |
| "epoch": 0.5522388059701493, | |
| "grad_norm": 0.11756740510463715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511283278465271, | |
| "mean_token_accuracy": 0.7760586440563202, | |
| "num_tokens": 2417920.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5568743199110031, | |
| "epoch": 0.5559701492537313, | |
| "grad_norm": 0.1262131929397583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587324500083923, | |
| "mean_token_accuracy": 0.7755658030509949, | |
| "num_tokens": 2434402.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5476635098457336, | |
| "epoch": 0.5597014925373134, | |
| "grad_norm": 0.14212746918201447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485654473304749, | |
| "mean_token_accuracy": 0.7787987738847733, | |
| "num_tokens": 2450648.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5328710079193115, | |
| "epoch": 0.5634328358208955, | |
| "grad_norm": 0.1456608921289444, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320286750793457, | |
| "mean_token_accuracy": 0.7839557826519012, | |
| "num_tokens": 2466701.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5372531861066818, | |
| "epoch": 0.5671641791044776, | |
| "grad_norm": 0.11793923377990723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379877090454102, | |
| "mean_token_accuracy": 0.7800156623125076, | |
| "num_tokens": 2482627.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5532563626766205, | |
| "epoch": 0.5708955223880597, | |
| "grad_norm": 0.13809776306152344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551555871963501, | |
| "mean_token_accuracy": 0.7761517316102982, | |
| "num_tokens": 2499250.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5471682995557785, | |
| "epoch": 0.5746268656716418, | |
| "grad_norm": 0.1408306509256363, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491219758987427, | |
| "mean_token_accuracy": 0.7767983973026276, | |
| "num_tokens": 2515443.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.571009948849678, | |
| "epoch": 0.5783582089552238, | |
| "grad_norm": 0.1486109346151352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5713759660720825, | |
| "mean_token_accuracy": 0.7713276296854019, | |
| "num_tokens": 2531761.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5617386847734451, | |
| "epoch": 0.582089552238806, | |
| "grad_norm": 0.15764987468719482, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562607645988464, | |
| "mean_token_accuracy": 0.7755531519651413, | |
| "num_tokens": 2548176.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5492932498455048, | |
| "epoch": 0.585820895522388, | |
| "grad_norm": 0.153673455119133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581745505332947, | |
| "mean_token_accuracy": 0.7730790227651596, | |
| "num_tokens": 2564448.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.555228590965271, | |
| "epoch": 0.5895522388059702, | |
| "grad_norm": 0.1345115751028061, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605562329292297, | |
| "mean_token_accuracy": 0.7717746198177338, | |
| "num_tokens": 2580905.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5399526059627533, | |
| "epoch": 0.5932835820895522, | |
| "grad_norm": 0.11657729744911194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369132161140442, | |
| "mean_token_accuracy": 0.7842999547719955, | |
| "num_tokens": 2597180.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5353947132825851, | |
| "epoch": 0.5970149253731343, | |
| "grad_norm": 0.1333966851234436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362208485603333, | |
| "mean_token_accuracy": 0.7827091217041016, | |
| "num_tokens": 2613444.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5535644590854645, | |
| "epoch": 0.6007462686567164, | |
| "grad_norm": 0.13608874380588531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567671656608582, | |
| "mean_token_accuracy": 0.7774695008993149, | |
| "num_tokens": 2629983.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5560604184865952, | |
| "epoch": 0.6044776119402985, | |
| "grad_norm": 0.1163283959031105, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636521577835083, | |
| "mean_token_accuracy": 0.7745625525712967, | |
| "num_tokens": 2646578.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5764736235141754, | |
| "epoch": 0.6082089552238806, | |
| "grad_norm": 0.1255754828453064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.578213632106781, | |
| "mean_token_accuracy": 0.7662594020366669, | |
| "num_tokens": 2663032.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5460716336965561, | |
| "epoch": 0.6119402985074627, | |
| "grad_norm": 0.13686135411262512, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406862497329712, | |
| "mean_token_accuracy": 0.7790546417236328, | |
| "num_tokens": 2679368.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5340383723378181, | |
| "epoch": 0.6156716417910447, | |
| "grad_norm": 0.12064651399850845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316583514213562, | |
| "mean_token_accuracy": 0.7829991579055786, | |
| "num_tokens": 2695866.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5442641973495483, | |
| "epoch": 0.6194029850746269, | |
| "grad_norm": 0.12049891799688339, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513224005699158, | |
| "mean_token_accuracy": 0.7753165811300278, | |
| "num_tokens": 2712061.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5361381322145462, | |
| "epoch": 0.6231343283582089, | |
| "grad_norm": 0.13572274148464203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410642623901367, | |
| "mean_token_accuracy": 0.7834690064191818, | |
| "num_tokens": 2728405.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.542312353849411, | |
| "epoch": 0.6268656716417911, | |
| "grad_norm": 0.12791581451892853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421413779258728, | |
| "mean_token_accuracy": 0.7781463712453842, | |
| "num_tokens": 2744612.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5568868666887283, | |
| "epoch": 0.6305970149253731, | |
| "grad_norm": 0.12156295031309128, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577100515365601, | |
| "mean_token_accuracy": 0.7726946324110031, | |
| "num_tokens": 2761047.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5537672489881516, | |
| "epoch": 0.6343283582089553, | |
| "grad_norm": 0.1293496936559677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571946501731873, | |
| "mean_token_accuracy": 0.7751306742429733, | |
| "num_tokens": 2777250.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5509191900491714, | |
| "epoch": 0.6380597014925373, | |
| "grad_norm": 0.1272898018360138, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516744256019592, | |
| "mean_token_accuracy": 0.7766414433717728, | |
| "num_tokens": 2793605.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5510837286710739, | |
| "epoch": 0.6417910447761194, | |
| "grad_norm": 0.14305925369262695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544188618659973, | |
| "mean_token_accuracy": 0.7760672718286514, | |
| "num_tokens": 2809948.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5232614651322365, | |
| "epoch": 0.6455223880597015, | |
| "grad_norm": 0.1384088695049286, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274964570999146, | |
| "mean_token_accuracy": 0.7859550416469574, | |
| "num_tokens": 2826128.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5601816028356552, | |
| "epoch": 0.6492537313432836, | |
| "grad_norm": 0.1388508826494217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5543120503425598, | |
| "mean_token_accuracy": 0.7758214622735977, | |
| "num_tokens": 2842612.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5437414795160294, | |
| "epoch": 0.6529850746268657, | |
| "grad_norm": 0.11655397713184357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404227375984192, | |
| "mean_token_accuracy": 0.7822663187980652, | |
| "num_tokens": 2859123.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.55133356153965, | |
| "epoch": 0.6567164179104478, | |
| "grad_norm": 0.1398521363735199, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518021583557129, | |
| "mean_token_accuracy": 0.7771210372447968, | |
| "num_tokens": 2875360.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5468268245458603, | |
| "epoch": 0.6604477611940298, | |
| "grad_norm": 0.12005320936441422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481685996055603, | |
| "mean_token_accuracy": 0.7786961048841476, | |
| "num_tokens": 2891626.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5444129258394241, | |
| "epoch": 0.664179104477612, | |
| "grad_norm": 0.16883929073810577, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526378750801086, | |
| "mean_token_accuracy": 0.7768739610910416, | |
| "num_tokens": 2907939.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.5393242985010147, | |
| "epoch": 0.667910447761194, | |
| "grad_norm": 0.1297578513622284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451361536979675, | |
| "mean_token_accuracy": 0.7800205200910568, | |
| "num_tokens": 2924294.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5417011380195618, | |
| "epoch": 0.6716417910447762, | |
| "grad_norm": 0.12030332535505295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440862774848938, | |
| "mean_token_accuracy": 0.7813349515199661, | |
| "num_tokens": 2940716.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.5521986186504364, | |
| "epoch": 0.6753731343283582, | |
| "grad_norm": 0.11406023800373077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487515926361084, | |
| "mean_token_accuracy": 0.7764244675636292, | |
| "num_tokens": 2956993.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5547273755073547, | |
| "epoch": 0.6791044776119403, | |
| "grad_norm": 0.13328734040260315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552635669708252, | |
| "mean_token_accuracy": 0.7759450674057007, | |
| "num_tokens": 2973622.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5548880398273468, | |
| "epoch": 0.6828358208955224, | |
| "grad_norm": 0.11328119784593582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517279505729675, | |
| "mean_token_accuracy": 0.7757984399795532, | |
| "num_tokens": 2989995.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5576671957969666, | |
| "epoch": 0.6865671641791045, | |
| "grad_norm": 0.1849256306886673, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650368332862854, | |
| "mean_token_accuracy": 0.7731626927852631, | |
| "num_tokens": 3006538.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.537109300494194, | |
| "epoch": 0.6902985074626866, | |
| "grad_norm": 0.1240711435675621, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376191139221191, | |
| "mean_token_accuracy": 0.7854040563106537, | |
| "num_tokens": 3022770.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5537560731172562, | |
| "epoch": 0.6940298507462687, | |
| "grad_norm": 0.1654159426689148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5570691227912903, | |
| "mean_token_accuracy": 0.7766956984996796, | |
| "num_tokens": 3039407.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5552389770746231, | |
| "epoch": 0.6977611940298507, | |
| "grad_norm": 0.10993515700101852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586962103843689, | |
| "mean_token_accuracy": 0.7749262005090714, | |
| "num_tokens": 3055780.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5666979551315308, | |
| "epoch": 0.7014925373134329, | |
| "grad_norm": 0.11159558594226837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5667304992675781, | |
| "mean_token_accuracy": 0.7695165723562241, | |
| "num_tokens": 3072362.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5639722347259521, | |
| "epoch": 0.7052238805970149, | |
| "grad_norm": 0.14158234000205994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5614078044891357, | |
| "mean_token_accuracy": 0.7733878195285797, | |
| "num_tokens": 3088887.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5518735945224762, | |
| "epoch": 0.7089552238805971, | |
| "grad_norm": 0.12406881153583527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611676573753357, | |
| "mean_token_accuracy": 0.7746167629957199, | |
| "num_tokens": 3105332.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5349650382995605, | |
| "epoch": 0.7126865671641791, | |
| "grad_norm": 0.13473471999168396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54412841796875, | |
| "mean_token_accuracy": 0.7769501060247421, | |
| "num_tokens": 3121582.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5316546410322189, | |
| "epoch": 0.7164179104477612, | |
| "grad_norm": 0.11828400939702988, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530936062335968, | |
| "mean_token_accuracy": 0.7848189175128937, | |
| "num_tokens": 3137920.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.556887611746788, | |
| "epoch": 0.7201492537313433, | |
| "grad_norm": 0.1256878823041916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555519700050354, | |
| "mean_token_accuracy": 0.7738869190216064, | |
| "num_tokens": 3154339.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5477663427591324, | |
| "epoch": 0.7238805970149254, | |
| "grad_norm": 0.11984176933765411, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489908456802368, | |
| "mean_token_accuracy": 0.7780539244413376, | |
| "num_tokens": 3170574.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5371970534324646, | |
| "epoch": 0.7276119402985075, | |
| "grad_norm": 0.11440598219633102, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5346511602401733, | |
| "mean_token_accuracy": 0.7856602966785431, | |
| "num_tokens": 3187140.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5374069362878799, | |
| "epoch": 0.7313432835820896, | |
| "grad_norm": 0.1220874935388565, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448272228240967, | |
| "mean_token_accuracy": 0.7792176902294159, | |
| "num_tokens": 3203454.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5373833179473877, | |
| "epoch": 0.7350746268656716, | |
| "grad_norm": 0.14692658185958862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547886312007904, | |
| "mean_token_accuracy": 0.7767521291971207, | |
| "num_tokens": 3219558.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.554410994052887, | |
| "epoch": 0.7388059701492538, | |
| "grad_norm": 0.12380608916282654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550884485244751, | |
| "mean_token_accuracy": 0.7776724547147751, | |
| "num_tokens": 3235877.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5471773892641068, | |
| "epoch": 0.7425373134328358, | |
| "grad_norm": 0.11140885949134827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401238799095154, | |
| "mean_token_accuracy": 0.7774412035942078, | |
| "num_tokens": 3252209.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5380608141422272, | |
| "epoch": 0.746268656716418, | |
| "grad_norm": 0.1454455554485321, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387637615203857, | |
| "mean_token_accuracy": 0.7800891399383545, | |
| "num_tokens": 3268329.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5308581739664078, | |
| "epoch": 0.75, | |
| "grad_norm": 0.1361016035079956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343608260154724, | |
| "mean_token_accuracy": 0.7855110317468643, | |
| "num_tokens": 3284338.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5632822811603546, | |
| "epoch": 0.753731343283582, | |
| "grad_norm": 0.13291221857070923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640154480934143, | |
| "mean_token_accuracy": 0.767445370554924, | |
| "num_tokens": 3300776.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.554180920124054, | |
| "epoch": 0.7574626865671642, | |
| "grad_norm": 0.12478666007518768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525573492050171, | |
| "mean_token_accuracy": 0.774932399392128, | |
| "num_tokens": 3317196.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5349105298519135, | |
| "epoch": 0.7611940298507462, | |
| "grad_norm": 0.12442342936992645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401512980461121, | |
| "mean_token_accuracy": 0.7819676995277405, | |
| "num_tokens": 3333516.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5417488664388657, | |
| "epoch": 0.7649253731343284, | |
| "grad_norm": 0.12787121534347534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460774302482605, | |
| "mean_token_accuracy": 0.7793125957250595, | |
| "num_tokens": 3349860.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5238666534423828, | |
| "epoch": 0.7686567164179104, | |
| "grad_norm": 0.14022648334503174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336724519729614, | |
| "mean_token_accuracy": 0.7843347638845444, | |
| "num_tokens": 3365954.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5506514012813568, | |
| "epoch": 0.7723880597014925, | |
| "grad_norm": 0.10952670127153397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459721684455872, | |
| "mean_token_accuracy": 0.7809877097606659, | |
| "num_tokens": 3382344.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5601198077201843, | |
| "epoch": 0.7761194029850746, | |
| "grad_norm": 0.14921848475933075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593782663345337, | |
| "mean_token_accuracy": 0.7718043476343155, | |
| "num_tokens": 3398687.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5334768891334534, | |
| "epoch": 0.7798507462686567, | |
| "grad_norm": 0.11596426367759705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338318943977356, | |
| "mean_token_accuracy": 0.783938467502594, | |
| "num_tokens": 3414913.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5415135025978088, | |
| "epoch": 0.7835820895522388, | |
| "grad_norm": 0.13524818420410156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422087907791138, | |
| "mean_token_accuracy": 0.7810906171798706, | |
| "num_tokens": 3431071.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5562594383955002, | |
| "epoch": 0.7873134328358209, | |
| "grad_norm": 0.14714977145195007, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575138926506042, | |
| "mean_token_accuracy": 0.7743899971246719, | |
| "num_tokens": 3447417.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.536840409040451, | |
| "epoch": 0.7910447761194029, | |
| "grad_norm": 0.1191772073507309, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539043664932251, | |
| "mean_token_accuracy": 0.7791986167430878, | |
| "num_tokens": 3463951.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5601708441972733, | |
| "epoch": 0.7947761194029851, | |
| "grad_norm": 0.14285218715667725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604355931282043, | |
| "mean_token_accuracy": 0.7729564011096954, | |
| "num_tokens": 3480303.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5470457077026367, | |
| "epoch": 0.7985074626865671, | |
| "grad_norm": 0.13420677185058594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554261326789856, | |
| "mean_token_accuracy": 0.7758394628763199, | |
| "num_tokens": 3496665.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5595335066318512, | |
| "epoch": 0.8022388059701493, | |
| "grad_norm": 0.12468434125185013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626363158226013, | |
| "mean_token_accuracy": 0.7708792388439178, | |
| "num_tokens": 3512987.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5410265326499939, | |
| "epoch": 0.8059701492537313, | |
| "grad_norm": 0.1368313878774643, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424209237098694, | |
| "mean_token_accuracy": 0.780338704586029, | |
| "num_tokens": 3529322.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5611067861318588, | |
| "epoch": 0.8097014925373134, | |
| "grad_norm": 0.12065284699201584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554131269454956, | |
| "mean_token_accuracy": 0.775262787938118, | |
| "num_tokens": 3545541.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5451776385307312, | |
| "epoch": 0.8134328358208955, | |
| "grad_norm": 0.13018189370632172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477407574653625, | |
| "mean_token_accuracy": 0.7790820002555847, | |
| "num_tokens": 3562081.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5475118607282639, | |
| "epoch": 0.8171641791044776, | |
| "grad_norm": 0.1309870183467865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548214852809906, | |
| "mean_token_accuracy": 0.7790254205465317, | |
| "num_tokens": 3578349.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5216370671987534, | |
| "epoch": 0.8208955223880597, | |
| "grad_norm": 0.1223544329404831, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256963968276978, | |
| "mean_token_accuracy": 0.787861168384552, | |
| "num_tokens": 3594724.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.5441537946462631, | |
| "epoch": 0.8246268656716418, | |
| "grad_norm": 0.1324274092912674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496052503585815, | |
| "mean_token_accuracy": 0.7781362533569336, | |
| "num_tokens": 3611250.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5336802899837494, | |
| "epoch": 0.8283582089552238, | |
| "grad_norm": 0.15294679999351501, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427975654602051, | |
| "mean_token_accuracy": 0.7801742255687714, | |
| "num_tokens": 3627526.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5635577589273453, | |
| "epoch": 0.832089552238806, | |
| "grad_norm": 0.1364123523235321, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619288682937622, | |
| "mean_token_accuracy": 0.768532395362854, | |
| "num_tokens": 3643553.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5576212853193283, | |
| "epoch": 0.835820895522388, | |
| "grad_norm": 0.1353282779455185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438153147697449, | |
| "mean_token_accuracy": 0.779265359044075, | |
| "num_tokens": 3660133.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5412103980779648, | |
| "epoch": 0.8395522388059702, | |
| "grad_norm": 0.12540455162525177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397533774375916, | |
| "mean_token_accuracy": 0.7794700264930725, | |
| "num_tokens": 3676295.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5455985218286514, | |
| "epoch": 0.8432835820895522, | |
| "grad_norm": 0.13320018351078033, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485510230064392, | |
| "mean_token_accuracy": 0.778446152806282, | |
| "num_tokens": 3692894.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5248135328292847, | |
| "epoch": 0.8470149253731343, | |
| "grad_norm": 0.13709791004657745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536843478679657, | |
| "mean_token_accuracy": 0.7809243649244308, | |
| "num_tokens": 3709122.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.53542160987854, | |
| "epoch": 0.8507462686567164, | |
| "grad_norm": 0.12484195083379745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407888293266296, | |
| "mean_token_accuracy": 0.7803395837545395, | |
| "num_tokens": 3725461.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5458493530750275, | |
| "epoch": 0.8544776119402985, | |
| "grad_norm": 0.13020864129066467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498859882354736, | |
| "mean_token_accuracy": 0.7766377329826355, | |
| "num_tokens": 3741717.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5359915047883987, | |
| "epoch": 0.8582089552238806, | |
| "grad_norm": 0.11409227550029755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289561748504639, | |
| "mean_token_accuracy": 0.7882120311260223, | |
| "num_tokens": 3757988.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5659278780221939, | |
| "epoch": 0.8619402985074627, | |
| "grad_norm": 0.10721168667078018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621720552444458, | |
| "mean_token_accuracy": 0.7705938816070557, | |
| "num_tokens": 3774220.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5599822998046875, | |
| "epoch": 0.8656716417910447, | |
| "grad_norm": 0.12365678697824478, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598929524421692, | |
| "mean_token_accuracy": 0.7715335041284561, | |
| "num_tokens": 3790653.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.54929418861866, | |
| "epoch": 0.8694029850746269, | |
| "grad_norm": 0.12949936091899872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555176734924316, | |
| "mean_token_accuracy": 0.7733278125524521, | |
| "num_tokens": 3807110.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5474081933498383, | |
| "epoch": 0.8731343283582089, | |
| "grad_norm": 0.12146537750959396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511813759803772, | |
| "mean_token_accuracy": 0.7766411751508713, | |
| "num_tokens": 3823486.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5372883975505829, | |
| "epoch": 0.8768656716417911, | |
| "grad_norm": 0.12444064766168594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384877324104309, | |
| "mean_token_accuracy": 0.7811126857995987, | |
| "num_tokens": 3839856.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5574021190404892, | |
| "epoch": 0.8805970149253731, | |
| "grad_norm": 0.11953511834144592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613345503807068, | |
| "mean_token_accuracy": 0.7729752510786057, | |
| "num_tokens": 3856362.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5452482104301453, | |
| "epoch": 0.8843283582089553, | |
| "grad_norm": 0.11208797991275787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457064509391785, | |
| "mean_token_accuracy": 0.7782498598098755, | |
| "num_tokens": 3872666.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5534125864505768, | |
| "epoch": 0.8880597014925373, | |
| "grad_norm": 0.15453441441059113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572060346603394, | |
| "mean_token_accuracy": 0.7716512382030487, | |
| "num_tokens": 3888939.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.547100231051445, | |
| "epoch": 0.8917910447761194, | |
| "grad_norm": 0.12707094848155975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511140823364258, | |
| "mean_token_accuracy": 0.7789764106273651, | |
| "num_tokens": 3905243.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.544873908162117, | |
| "epoch": 0.8955223880597015, | |
| "grad_norm": 0.13703206181526184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423987507820129, | |
| "mean_token_accuracy": 0.7779188007116318, | |
| "num_tokens": 3921866.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5453302264213562, | |
| "epoch": 0.8992537313432836, | |
| "grad_norm": 0.11689020693302155, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460352301597595, | |
| "mean_token_accuracy": 0.7779721468687057, | |
| "num_tokens": 3938407.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5635591447353363, | |
| "epoch": 0.9029850746268657, | |
| "grad_norm": 0.13040713965892792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655105113983154, | |
| "mean_token_accuracy": 0.768951028585434, | |
| "num_tokens": 3954812.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5287201702594757, | |
| "epoch": 0.9067164179104478, | |
| "grad_norm": 0.11932681500911713, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290012359619141, | |
| "mean_token_accuracy": 0.7868975102901459, | |
| "num_tokens": 3970722.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5399811267852783, | |
| "epoch": 0.9104477611940298, | |
| "grad_norm": 0.15166425704956055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475818514823914, | |
| "mean_token_accuracy": 0.7782254964113235, | |
| "num_tokens": 3986919.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5479171127080917, | |
| "epoch": 0.914179104477612, | |
| "grad_norm": 0.13205286860466003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506084561347961, | |
| "mean_token_accuracy": 0.7769028395414352, | |
| "num_tokens": 4003718.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5506049394607544, | |
| "epoch": 0.917910447761194, | |
| "grad_norm": 0.1079086884856224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398848056793213, | |
| "mean_token_accuracy": 0.7830533385276794, | |
| "num_tokens": 4020063.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5654618889093399, | |
| "epoch": 0.9216417910447762, | |
| "grad_norm": 0.1322406679391861, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590391755104065, | |
| "mean_token_accuracy": 0.7732941806316376, | |
| "num_tokens": 4036681.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.546074166893959, | |
| "epoch": 0.9253731343283582, | |
| "grad_norm": 0.12490007281303406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554251670837402, | |
| "mean_token_accuracy": 0.7764608860015869, | |
| "num_tokens": 4052971.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5580905228853226, | |
| "epoch": 0.9291044776119403, | |
| "grad_norm": 0.11980146169662476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5676828622817993, | |
| "mean_token_accuracy": 0.7696985453367233, | |
| "num_tokens": 4069338.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5355470329523087, | |
| "epoch": 0.9328358208955224, | |
| "grad_norm": 0.12107004970312119, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405516028404236, | |
| "mean_token_accuracy": 0.7829477041959763, | |
| "num_tokens": 4085750.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5567673444747925, | |
| "epoch": 0.9365671641791045, | |
| "grad_norm": 0.12893939018249512, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650359988212585, | |
| "mean_token_accuracy": 0.7712520509958267, | |
| "num_tokens": 4102118.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5410316288471222, | |
| "epoch": 0.9402985074626866, | |
| "grad_norm": 0.11652866750955582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460695028305054, | |
| "mean_token_accuracy": 0.7774221301078796, | |
| "num_tokens": 4118568.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5609200298786163, | |
| "epoch": 0.9440298507462687, | |
| "grad_norm": 0.11244899779558182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490402579307556, | |
| "mean_token_accuracy": 0.7748613804578781, | |
| "num_tokens": 4135123.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5497269034385681, | |
| "epoch": 0.9477611940298507, | |
| "grad_norm": 0.14016613364219666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342196822166443, | |
| "mean_token_accuracy": 0.7829579263925552, | |
| "num_tokens": 4151216.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5376796424388885, | |
| "epoch": 0.9514925373134329, | |
| "grad_norm": 0.11261948943138123, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384314656257629, | |
| "mean_token_accuracy": 0.779564619064331, | |
| "num_tokens": 4167504.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5369044691324234, | |
| "epoch": 0.9552238805970149, | |
| "grad_norm": 0.1335015743970871, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465540885925293, | |
| "mean_token_accuracy": 0.7757421284914017, | |
| "num_tokens": 4183799.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5567403733730316, | |
| "epoch": 0.9589552238805971, | |
| "grad_norm": 0.14907455444335938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.567619800567627, | |
| "mean_token_accuracy": 0.770223930478096, | |
| "num_tokens": 4200155.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5468429028987885, | |
| "epoch": 0.9626865671641791, | |
| "grad_norm": 0.11520266532897949, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453846454620361, | |
| "mean_token_accuracy": 0.7773052304983139, | |
| "num_tokens": 4216435.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5431469082832336, | |
| "epoch": 0.9664179104477612, | |
| "grad_norm": 0.13169828057289124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401536822319031, | |
| "mean_token_accuracy": 0.7807234972715378, | |
| "num_tokens": 4232685.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5463652908802032, | |
| "epoch": 0.9701492537313433, | |
| "grad_norm": 0.1208634227514267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539630115032196, | |
| "mean_token_accuracy": 0.7806746661663055, | |
| "num_tokens": 4248983.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5373689532279968, | |
| "epoch": 0.9738805970149254, | |
| "grad_norm": 0.1322765052318573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365580916404724, | |
| "mean_token_accuracy": 0.7808263897895813, | |
| "num_tokens": 4265223.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5479995906352997, | |
| "epoch": 0.9776119402985075, | |
| "grad_norm": 0.12395796924829483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560559630393982, | |
| "mean_token_accuracy": 0.7720989733934402, | |
| "num_tokens": 4281420.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5320831388235092, | |
| "epoch": 0.9813432835820896, | |
| "grad_norm": 0.15233781933784485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420798659324646, | |
| "mean_token_accuracy": 0.7780148983001709, | |
| "num_tokens": 4297933.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5410943180322647, | |
| "epoch": 0.9850746268656716, | |
| "grad_norm": 0.11531079560518265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476459264755249, | |
| "mean_token_accuracy": 0.7788786739110947, | |
| "num_tokens": 4314320.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5516358613967896, | |
| "epoch": 0.9888059701492538, | |
| "grad_norm": 0.11947735399007797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536230206489563, | |
| "mean_token_accuracy": 0.7767823338508606, | |
| "num_tokens": 4330601.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5500903576612473, | |
| "epoch": 0.9925373134328358, | |
| "grad_norm": 0.12315159291028976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529444813728333, | |
| "mean_token_accuracy": 0.7752810269594193, | |
| "num_tokens": 4347043.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5517779290676117, | |
| "epoch": 0.996268656716418, | |
| "grad_norm": 0.11137247085571289, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534829497337341, | |
| "mean_token_accuracy": 0.7717059701681137, | |
| "num_tokens": 4363391.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5500383973121643, | |
| "epoch": 1.0, | |
| "grad_norm": 0.1438470184803009, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475767850875854, | |
| "mean_token_accuracy": 0.7807454466819763, | |
| "num_tokens": 4379703.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5567186176776886, | |
| "epoch": 1.0037313432835822, | |
| "grad_norm": 0.12165568768978119, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443229079246521, | |
| "mean_token_accuracy": 0.7788188308477402, | |
| "num_tokens": 4395979.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5200136750936508, | |
| "epoch": 1.007462686567164, | |
| "grad_norm": 0.11453047394752502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5096794962882996, | |
| "mean_token_accuracy": 0.7945292145013809, | |
| "num_tokens": 4412227.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5380017757415771, | |
| "epoch": 1.0111940298507462, | |
| "grad_norm": 0.15120473504066467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425546169281006, | |
| "mean_token_accuracy": 0.781953439116478, | |
| "num_tokens": 4428611.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5208772569894791, | |
| "epoch": 1.0149253731343284, | |
| "grad_norm": 0.1341351717710495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326657295227051, | |
| "mean_token_accuracy": 0.7831600904464722, | |
| "num_tokens": 4444927.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5214353799819946, | |
| "epoch": 1.0186567164179103, | |
| "grad_norm": 0.14984826743602753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5280492901802063, | |
| "mean_token_accuracy": 0.786370187997818, | |
| "num_tokens": 4460991.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5258834809064865, | |
| "epoch": 1.0223880597014925, | |
| "grad_norm": 0.13014522194862366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271875858306885, | |
| "mean_token_accuracy": 0.7869210243225098, | |
| "num_tokens": 4477645.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5273120403289795, | |
| "epoch": 1.0261194029850746, | |
| "grad_norm": 0.1311647742986679, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5195775032043457, | |
| "mean_token_accuracy": 0.7897085547447205, | |
| "num_tokens": 4493809.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.5415386855602264, | |
| "epoch": 1.0298507462686568, | |
| "grad_norm": 0.11555178463459015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413332581520081, | |
| "mean_token_accuracy": 0.7796304523944855, | |
| "num_tokens": 4510212.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5370220988988876, | |
| "epoch": 1.0335820895522387, | |
| "grad_norm": 0.13971680402755737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396295785903931, | |
| "mean_token_accuracy": 0.7788214385509491, | |
| "num_tokens": 4526435.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5435305833816528, | |
| "epoch": 1.037313432835821, | |
| "grad_norm": 0.10762611031532288, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435919761657715, | |
| "mean_token_accuracy": 0.7784401625394821, | |
| "num_tokens": 4542952.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5561162084341049, | |
| "epoch": 1.041044776119403, | |
| "grad_norm": 0.1305421143770218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544913411140442, | |
| "mean_token_accuracy": 0.7771686464548111, | |
| "num_tokens": 4559371.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5161843150854111, | |
| "epoch": 1.044776119402985, | |
| "grad_norm": 0.13184338808059692, | |
| "learning_rate": 0.0002, | |
| "loss": 0.511843204498291, | |
| "mean_token_accuracy": 0.7913843542337418, | |
| "num_tokens": 4575731.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.52925243973732, | |
| "epoch": 1.0485074626865671, | |
| "grad_norm": 0.1287873089313507, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263785719871521, | |
| "mean_token_accuracy": 0.7861436605453491, | |
| "num_tokens": 4592056.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5253249853849411, | |
| "epoch": 1.0522388059701493, | |
| "grad_norm": 0.12661200761795044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272859334945679, | |
| "mean_token_accuracy": 0.7849764674901962, | |
| "num_tokens": 4608326.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5225464850664139, | |
| "epoch": 1.0559701492537314, | |
| "grad_norm": 0.11925826221704483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5287873148918152, | |
| "mean_token_accuracy": 0.7825718820095062, | |
| "num_tokens": 4624408.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.5239171385765076, | |
| "epoch": 1.0597014925373134, | |
| "grad_norm": 0.12639594078063965, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275134444236755, | |
| "mean_token_accuracy": 0.784866139292717, | |
| "num_tokens": 4640897.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5350142568349838, | |
| "epoch": 1.0634328358208955, | |
| "grad_norm": 0.13742367923259735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391872525215149, | |
| "mean_token_accuracy": 0.7813242971897125, | |
| "num_tokens": 4657487.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5414403080940247, | |
| "epoch": 1.0671641791044777, | |
| "grad_norm": 0.12273678928613663, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538042426109314, | |
| "mean_token_accuracy": 0.7844662219285965, | |
| "num_tokens": 4674009.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5556955337524414, | |
| "epoch": 1.0708955223880596, | |
| "grad_norm": 0.11591946333646774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542109608650208, | |
| "mean_token_accuracy": 0.7758783847093582, | |
| "num_tokens": 4690230.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5334881544113159, | |
| "epoch": 1.0746268656716418, | |
| "grad_norm": 0.11168122291564941, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347651243209839, | |
| "mean_token_accuracy": 0.7833859175443649, | |
| "num_tokens": 4706362.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5315591096878052, | |
| "epoch": 1.078358208955224, | |
| "grad_norm": 0.13917559385299683, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380789041519165, | |
| "mean_token_accuracy": 0.7812001705169678, | |
| "num_tokens": 4722595.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5346228331327438, | |
| "epoch": 1.0820895522388059, | |
| "grad_norm": 0.13478422164916992, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455847978591919, | |
| "mean_token_accuracy": 0.7781703919172287, | |
| "num_tokens": 4738887.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5461715310811996, | |
| "epoch": 1.085820895522388, | |
| "grad_norm": 0.13396981358528137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379023551940918, | |
| "mean_token_accuracy": 0.7827265560626984, | |
| "num_tokens": 4755212.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5389465689659119, | |
| "epoch": 1.0895522388059702, | |
| "grad_norm": 0.12781155109405518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376452803611755, | |
| "mean_token_accuracy": 0.7828295826911926, | |
| "num_tokens": 4771644.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5441965609788895, | |
| "epoch": 1.0932835820895523, | |
| "grad_norm": 0.13662317395210266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53973788022995, | |
| "mean_token_accuracy": 0.781336709856987, | |
| "num_tokens": 4787994.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.557211622595787, | |
| "epoch": 1.0970149253731343, | |
| "grad_norm": 0.13968485593795776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545478463172913, | |
| "mean_token_accuracy": 0.7766687870025635, | |
| "num_tokens": 4804240.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.5415647476911545, | |
| "epoch": 1.1007462686567164, | |
| "grad_norm": 0.14245721697807312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388385653495789, | |
| "mean_token_accuracy": 0.7829283177852631, | |
| "num_tokens": 4820711.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5286812037229538, | |
| "epoch": 1.1044776119402986, | |
| "grad_norm": 0.14483948051929474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349111557006836, | |
| "mean_token_accuracy": 0.7845683097839355, | |
| "num_tokens": 4836959.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5258732736110687, | |
| "epoch": 1.1082089552238805, | |
| "grad_norm": 0.13696761429309845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529443085193634, | |
| "mean_token_accuracy": 0.7867940962314606, | |
| "num_tokens": 4853067.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5512303709983826, | |
| "epoch": 1.1119402985074627, | |
| "grad_norm": 0.15340439975261688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552986741065979, | |
| "mean_token_accuracy": 0.7754423469305038, | |
| "num_tokens": 4869588.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5339537411928177, | |
| "epoch": 1.1156716417910448, | |
| "grad_norm": 0.15107926726341248, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356568694114685, | |
| "mean_token_accuracy": 0.7815524339675903, | |
| "num_tokens": 4885904.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5544896274805069, | |
| "epoch": 1.1194029850746268, | |
| "grad_norm": 0.13157761096954346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553483366966248, | |
| "mean_token_accuracy": 0.7737178802490234, | |
| "num_tokens": 4902327.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5695160180330276, | |
| "epoch": 1.123134328358209, | |
| "grad_norm": 0.1447787880897522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5667352676391602, | |
| "mean_token_accuracy": 0.7724233418703079, | |
| "num_tokens": 4918857.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5424528568983078, | |
| "epoch": 1.126865671641791, | |
| "grad_norm": 0.130395770072937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54450523853302, | |
| "mean_token_accuracy": 0.7784540206193924, | |
| "num_tokens": 4935469.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.537494882941246, | |
| "epoch": 1.1305970149253732, | |
| "grad_norm": 0.1572721302509308, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539937436580658, | |
| "mean_token_accuracy": 0.7787607908248901, | |
| "num_tokens": 4951497.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5239665806293488, | |
| "epoch": 1.1343283582089552, | |
| "grad_norm": 0.14227941632270813, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5174288153648376, | |
| "mean_token_accuracy": 0.7907485216856003, | |
| "num_tokens": 4967826.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5226030200719833, | |
| "epoch": 1.1380597014925373, | |
| "grad_norm": 0.13234300911426544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5237756967544556, | |
| "mean_token_accuracy": 0.7902256399393082, | |
| "num_tokens": 4984247.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.5070921406149864, | |
| "epoch": 1.1417910447761195, | |
| "grad_norm": 0.15718795359134674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.520646333694458, | |
| "mean_token_accuracy": 0.7865647524595261, | |
| "num_tokens": 5000320.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5070105642080307, | |
| "epoch": 1.1455223880597014, | |
| "grad_norm": 0.20183522999286652, | |
| "learning_rate": 0.0002, | |
| "loss": 0.528045654296875, | |
| "mean_token_accuracy": 0.7873903512954712, | |
| "num_tokens": 5016226.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5490072518587112, | |
| "epoch": 1.1492537313432836, | |
| "grad_norm": 0.12259556353092194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465996861457825, | |
| "mean_token_accuracy": 0.7795770764350891, | |
| "num_tokens": 5032435.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5369555801153183, | |
| "epoch": 1.1529850746268657, | |
| "grad_norm": 0.17033320665359497, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238630175590515, | |
| "mean_token_accuracy": 0.7864966690540314, | |
| "num_tokens": 5048673.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5474718064069748, | |
| "epoch": 1.1567164179104479, | |
| "grad_norm": 0.15336251258850098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351282358169556, | |
| "mean_token_accuracy": 0.7832874804735184, | |
| "num_tokens": 5064889.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.5407518595457077, | |
| "epoch": 1.1604477611940298, | |
| "grad_norm": 0.1288745403289795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532909631729126, | |
| "mean_token_accuracy": 0.7854967713356018, | |
| "num_tokens": 5081181.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5553453862667084, | |
| "epoch": 1.164179104477612, | |
| "grad_norm": 0.17325082421302795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650225877761841, | |
| "mean_token_accuracy": 0.7709382623434067, | |
| "num_tokens": 5097695.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.5312155932188034, | |
| "epoch": 1.1679104477611941, | |
| "grad_norm": 0.14813978970050812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398642420768738, | |
| "mean_token_accuracy": 0.7819912135601044, | |
| "num_tokens": 5114124.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5393004268407822, | |
| "epoch": 1.171641791044776, | |
| "grad_norm": 0.13244624435901642, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397657155990601, | |
| "mean_token_accuracy": 0.7833016067743301, | |
| "num_tokens": 5130526.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5356107205152512, | |
| "epoch": 1.1753731343283582, | |
| "grad_norm": 0.1546393185853958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278767347335815, | |
| "mean_token_accuracy": 0.7873012572526932, | |
| "num_tokens": 5146786.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5360458493232727, | |
| "epoch": 1.1791044776119404, | |
| "grad_norm": 0.14604224264621735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378543138504028, | |
| "mean_token_accuracy": 0.7808638215065002, | |
| "num_tokens": 5163157.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5358310341835022, | |
| "epoch": 1.1828358208955223, | |
| "grad_norm": 0.11514927446842194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323253273963928, | |
| "mean_token_accuracy": 0.7850612699985504, | |
| "num_tokens": 5179759.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5336421579122543, | |
| "epoch": 1.1865671641791045, | |
| "grad_norm": 0.14939743280410767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399504899978638, | |
| "mean_token_accuracy": 0.7822477370500565, | |
| "num_tokens": 5195772.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5196461454033852, | |
| "epoch": 1.1902985074626866, | |
| "grad_norm": 0.16364845633506775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318784117698669, | |
| "mean_token_accuracy": 0.7826407551765442, | |
| "num_tokens": 5212049.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5297210067510605, | |
| "epoch": 1.1940298507462686, | |
| "grad_norm": 0.1340930312871933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342279672622681, | |
| "mean_token_accuracy": 0.7825554758310318, | |
| "num_tokens": 5228387.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5374090075492859, | |
| "epoch": 1.1977611940298507, | |
| "grad_norm": 0.13523836433887482, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342003107070923, | |
| "mean_token_accuracy": 0.7829677164554596, | |
| "num_tokens": 5244798.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5403262600302696, | |
| "epoch": 1.2014925373134329, | |
| "grad_norm": 0.11974834650754929, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366995334625244, | |
| "mean_token_accuracy": 0.7828448265790939, | |
| "num_tokens": 5261240.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5380197167396545, | |
| "epoch": 1.205223880597015, | |
| "grad_norm": 0.154353529214859, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533047080039978, | |
| "mean_token_accuracy": 0.7859889715909958, | |
| "num_tokens": 5277554.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5303442776203156, | |
| "epoch": 1.208955223880597, | |
| "grad_norm": 0.14264924824237823, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314475893974304, | |
| "mean_token_accuracy": 0.7831806391477585, | |
| "num_tokens": 5293949.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5252211391925812, | |
| "epoch": 1.212686567164179, | |
| "grad_norm": 0.1556359827518463, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285252928733826, | |
| "mean_token_accuracy": 0.783245861530304, | |
| "num_tokens": 5310026.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5328008607029915, | |
| "epoch": 1.2164179104477613, | |
| "grad_norm": 0.13450154662132263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320917367935181, | |
| "mean_token_accuracy": 0.7842745780944824, | |
| "num_tokens": 5326386.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5319949090480804, | |
| "epoch": 1.2201492537313432, | |
| "grad_norm": 0.12143786996603012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349273681640625, | |
| "mean_token_accuracy": 0.7820626497268677, | |
| "num_tokens": 5342658.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5234760195016861, | |
| "epoch": 1.2238805970149254, | |
| "grad_norm": 0.16645972430706024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320586562156677, | |
| "mean_token_accuracy": 0.7844817489385605, | |
| "num_tokens": 5358974.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5378956496715546, | |
| "epoch": 1.2276119402985075, | |
| "grad_norm": 0.13522404432296753, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357790589332581, | |
| "mean_token_accuracy": 0.7823758125305176, | |
| "num_tokens": 5375371.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5387023985385895, | |
| "epoch": 1.2313432835820897, | |
| "grad_norm": 0.1315094530582428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362842082977295, | |
| "mean_token_accuracy": 0.7809555679559708, | |
| "num_tokens": 5391896.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5072716027498245, | |
| "epoch": 1.2350746268656716, | |
| "grad_norm": 0.13498196005821228, | |
| "learning_rate": 0.0002, | |
| "loss": 0.507161021232605, | |
| "mean_token_accuracy": 0.7966707944869995, | |
| "num_tokens": 5408354.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.5260337740182877, | |
| "epoch": 1.2388059701492538, | |
| "grad_norm": 0.13349276781082153, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276508331298828, | |
| "mean_token_accuracy": 0.7871510088443756, | |
| "num_tokens": 5424531.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5349582731723785, | |
| "epoch": 1.242537313432836, | |
| "grad_norm": 0.13890203833580017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371206402778625, | |
| "mean_token_accuracy": 0.7821635603904724, | |
| "num_tokens": 5440815.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5346423760056496, | |
| "epoch": 1.2462686567164178, | |
| "grad_norm": 0.1553906500339508, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395735502243042, | |
| "mean_token_accuracy": 0.7817864269018173, | |
| "num_tokens": 5457072.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.5478692203760147, | |
| "epoch": 1.25, | |
| "grad_norm": 0.15934403240680695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516626834869385, | |
| "mean_token_accuracy": 0.7753347009420395, | |
| "num_tokens": 5473422.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5378739535808563, | |
| "epoch": 1.2537313432835822, | |
| "grad_norm": 0.12844312191009521, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326632261276245, | |
| "mean_token_accuracy": 0.7827756106853485, | |
| "num_tokens": 5489671.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.5409121513366699, | |
| "epoch": 1.2574626865671643, | |
| "grad_norm": 0.1285056471824646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452673435211182, | |
| "mean_token_accuracy": 0.7786683291196823, | |
| "num_tokens": 5506084.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5422088652849197, | |
| "epoch": 1.2611940298507462, | |
| "grad_norm": 0.14476130902767181, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416613817214966, | |
| "mean_token_accuracy": 0.7791768312454224, | |
| "num_tokens": 5522548.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5449076443910599, | |
| "epoch": 1.2649253731343284, | |
| "grad_norm": 0.13138490915298462, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395404696464539, | |
| "mean_token_accuracy": 0.7813031673431396, | |
| "num_tokens": 5539208.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5443570464849472, | |
| "epoch": 1.2686567164179103, | |
| "grad_norm": 0.15328356623649597, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410760641098022, | |
| "mean_token_accuracy": 0.7822384089231491, | |
| "num_tokens": 5555492.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5302190482616425, | |
| "epoch": 1.2723880597014925, | |
| "grad_norm": 0.15014180541038513, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311694145202637, | |
| "mean_token_accuracy": 0.7823975682258606, | |
| "num_tokens": 5571999.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5198534801602364, | |
| "epoch": 1.2761194029850746, | |
| "grad_norm": 0.13281527161598206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5303924083709717, | |
| "mean_token_accuracy": 0.7844155579805374, | |
| "num_tokens": 5588098.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5089417994022369, | |
| "epoch": 1.2798507462686568, | |
| "grad_norm": 0.1406290978193283, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5175491571426392, | |
| "mean_token_accuracy": 0.7906824499368668, | |
| "num_tokens": 5604254.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5032122731208801, | |
| "epoch": 1.2835820895522387, | |
| "grad_norm": 0.15877749025821686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5124095678329468, | |
| "mean_token_accuracy": 0.790567934513092, | |
| "num_tokens": 5620363.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5435033291578293, | |
| "epoch": 1.287313432835821, | |
| "grad_norm": 0.1633625328540802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553101658821106, | |
| "mean_token_accuracy": 0.7757033556699753, | |
| "num_tokens": 5636720.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5401125550270081, | |
| "epoch": 1.291044776119403, | |
| "grad_norm": 0.14126214385032654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362418293952942, | |
| "mean_token_accuracy": 0.7848408222198486, | |
| "num_tokens": 5653198.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5514497756958008, | |
| "epoch": 1.294776119402985, | |
| "grad_norm": 0.12672948837280273, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441724061965942, | |
| "mean_token_accuracy": 0.7795091718435287, | |
| "num_tokens": 5669516.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5293784886598587, | |
| "epoch": 1.2985074626865671, | |
| "grad_norm": 0.11630003899335861, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298827886581421, | |
| "mean_token_accuracy": 0.783647358417511, | |
| "num_tokens": 5685856.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5244417935609818, | |
| "epoch": 1.3022388059701493, | |
| "grad_norm": 0.14798091351985931, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307499170303345, | |
| "mean_token_accuracy": 0.7859917134046555, | |
| "num_tokens": 5702057.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5323777049779892, | |
| "epoch": 1.3059701492537314, | |
| "grad_norm": 0.12870146334171295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365279912948608, | |
| "mean_token_accuracy": 0.7816431373357773, | |
| "num_tokens": 5718688.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5243604183197021, | |
| "epoch": 1.3097014925373134, | |
| "grad_norm": 0.12391035258769989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5227367281913757, | |
| "mean_token_accuracy": 0.7866858392953873, | |
| "num_tokens": 5734891.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5347918272018433, | |
| "epoch": 1.3134328358208955, | |
| "grad_norm": 0.145299032330513, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310446619987488, | |
| "mean_token_accuracy": 0.7831001132726669, | |
| "num_tokens": 5751328.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5411982387304306, | |
| "epoch": 1.3171641791044777, | |
| "grad_norm": 0.1532508134841919, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382261276245117, | |
| "mean_token_accuracy": 0.7814776748418808, | |
| "num_tokens": 5767612.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5384319573640823, | |
| "epoch": 1.3208955223880596, | |
| "grad_norm": 0.12034327536821365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356577038764954, | |
| "mean_token_accuracy": 0.7809152156114578, | |
| "num_tokens": 5783823.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5378035828471184, | |
| "epoch": 1.3246268656716418, | |
| "grad_norm": 0.17426501214504242, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54035884141922, | |
| "mean_token_accuracy": 0.781380295753479, | |
| "num_tokens": 5800149.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5415401831269264, | |
| "epoch": 1.328358208955224, | |
| "grad_norm": 0.1543213427066803, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499249696731567, | |
| "mean_token_accuracy": 0.7782198786735535, | |
| "num_tokens": 5816367.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5541952252388, | |
| "epoch": 1.332089552238806, | |
| "grad_norm": 0.1483956277370453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502984523773193, | |
| "mean_token_accuracy": 0.7760822772979736, | |
| "num_tokens": 5832681.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.5343631953001022, | |
| "epoch": 1.335820895522388, | |
| "grad_norm": 0.1370651125907898, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531204879283905, | |
| "mean_token_accuracy": 0.7847591787576675, | |
| "num_tokens": 5848778.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5292060524225235, | |
| "epoch": 1.3395522388059702, | |
| "grad_norm": 0.13134512305259705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340976119041443, | |
| "mean_token_accuracy": 0.7800851762294769, | |
| "num_tokens": 5864821.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.5334947407245636, | |
| "epoch": 1.3432835820895521, | |
| "grad_norm": 0.1279117912054062, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352479815483093, | |
| "mean_token_accuracy": 0.7832343429327011, | |
| "num_tokens": 5881116.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5323592573404312, | |
| "epoch": 1.3470149253731343, | |
| "grad_norm": 0.28604868054389954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301060080528259, | |
| "mean_token_accuracy": 0.7850496172904968, | |
| "num_tokens": 5897810.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5503924041986465, | |
| "epoch": 1.3507462686567164, | |
| "grad_norm": 0.34482085704803467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528603196144104, | |
| "mean_token_accuracy": 0.7764434367418289, | |
| "num_tokens": 5914260.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5227297842502594, | |
| "epoch": 1.3544776119402986, | |
| "grad_norm": 0.12345509976148605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238011479377747, | |
| "mean_token_accuracy": 0.7891107350587845, | |
| "num_tokens": 5930444.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5462608188390732, | |
| "epoch": 1.3582089552238805, | |
| "grad_norm": 0.1688961386680603, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603306293487549, | |
| "mean_token_accuracy": 0.771704226732254, | |
| "num_tokens": 5946741.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5538459420204163, | |
| "epoch": 1.3619402985074627, | |
| "grad_norm": 0.14098992943763733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526646375656128, | |
| "mean_token_accuracy": 0.7749083191156387, | |
| "num_tokens": 5963128.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5297324359416962, | |
| "epoch": 1.3656716417910448, | |
| "grad_norm": 0.12920008599758148, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5280593633651733, | |
| "mean_token_accuracy": 0.784359410405159, | |
| "num_tokens": 5979218.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5375068634748459, | |
| "epoch": 1.3694029850746268, | |
| "grad_norm": 0.1362897753715515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373224020004272, | |
| "mean_token_accuracy": 0.7841860055923462, | |
| "num_tokens": 5995687.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5355936139822006, | |
| "epoch": 1.373134328358209, | |
| "grad_norm": 0.14052827656269073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387214422225952, | |
| "mean_token_accuracy": 0.7806743085384369, | |
| "num_tokens": 6012035.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5435226261615753, | |
| "epoch": 1.376865671641791, | |
| "grad_norm": 0.1556740403175354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441159009933472, | |
| "mean_token_accuracy": 0.7787201553583145, | |
| "num_tokens": 6028365.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5268312245607376, | |
| "epoch": 1.3805970149253732, | |
| "grad_norm": 0.15513257682323456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291861891746521, | |
| "mean_token_accuracy": 0.7877073138952255, | |
| "num_tokens": 6044796.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5517646074295044, | |
| "epoch": 1.3843283582089552, | |
| "grad_norm": 0.1265048235654831, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546433925628662, | |
| "mean_token_accuracy": 0.7754338979721069, | |
| "num_tokens": 6061487.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5410579442977905, | |
| "epoch": 1.3880597014925373, | |
| "grad_norm": 0.13882151246070862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375149846076965, | |
| "mean_token_accuracy": 0.7817846387624741, | |
| "num_tokens": 6077933.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5343161523342133, | |
| "epoch": 1.3917910447761195, | |
| "grad_norm": 0.1435064971446991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308974981307983, | |
| "mean_token_accuracy": 0.7849253863096237, | |
| "num_tokens": 6094407.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5472413003444672, | |
| "epoch": 1.3955223880597014, | |
| "grad_norm": 0.1254650354385376, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410266518592834, | |
| "mean_token_accuracy": 0.7794545590877533, | |
| "num_tokens": 6110923.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5365632474422455, | |
| "epoch": 1.3992537313432836, | |
| "grad_norm": 0.13213133811950684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404695868492126, | |
| "mean_token_accuracy": 0.7813301384449005, | |
| "num_tokens": 6127219.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5322464108467102, | |
| "epoch": 1.4029850746268657, | |
| "grad_norm": 0.1703079640865326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420417189598083, | |
| "mean_token_accuracy": 0.7813734114170074, | |
| "num_tokens": 6143418.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5500752478837967, | |
| "epoch": 1.4067164179104479, | |
| "grad_norm": 0.1431417018175125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511533617973328, | |
| "mean_token_accuracy": 0.7758170068264008, | |
| "num_tokens": 6159747.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5427335649728775, | |
| "epoch": 1.4104477611940298, | |
| "grad_norm": 0.1817740648984909, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414767861366272, | |
| "mean_token_accuracy": 0.7784233242273331, | |
| "num_tokens": 6176317.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5470531731843948, | |
| "epoch": 1.414179104477612, | |
| "grad_norm": 0.1422269493341446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472888946533203, | |
| "mean_token_accuracy": 0.7780141085386276, | |
| "num_tokens": 6192737.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5464377701282501, | |
| "epoch": 1.417910447761194, | |
| "grad_norm": 0.17506512999534607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490654706954956, | |
| "mean_token_accuracy": 0.7765569537878036, | |
| "num_tokens": 6208852.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5500655770301819, | |
| "epoch": 1.421641791044776, | |
| "grad_norm": 0.13887247443199158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514895915985107, | |
| "mean_token_accuracy": 0.7774574458599091, | |
| "num_tokens": 6225069.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5438679605722427, | |
| "epoch": 1.4253731343283582, | |
| "grad_norm": 0.19045118987560272, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430073738098145, | |
| "mean_token_accuracy": 0.7802658081054688, | |
| "num_tokens": 6241528.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5306290239095688, | |
| "epoch": 1.4291044776119404, | |
| "grad_norm": 0.160585418343544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361081957817078, | |
| "mean_token_accuracy": 0.7803311944007874, | |
| "num_tokens": 6257867.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5401095002889633, | |
| "epoch": 1.4328358208955223, | |
| "grad_norm": 0.1656486541032791, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400689244270325, | |
| "mean_token_accuracy": 0.780994102358818, | |
| "num_tokens": 6274155.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5327940136194229, | |
| "epoch": 1.4365671641791045, | |
| "grad_norm": 0.1317523568868637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320010185241699, | |
| "mean_token_accuracy": 0.7850325703620911, | |
| "num_tokens": 6290558.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5441479384899139, | |
| "epoch": 1.4402985074626866, | |
| "grad_norm": 0.17623504996299744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384020209312439, | |
| "mean_token_accuracy": 0.7846230715513229, | |
| "num_tokens": 6306878.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5452490895986557, | |
| "epoch": 1.4440298507462686, | |
| "grad_norm": 0.16240645945072174, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443468689918518, | |
| "mean_token_accuracy": 0.7802695333957672, | |
| "num_tokens": 6323446.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5221313908696175, | |
| "epoch": 1.4477611940298507, | |
| "grad_norm": 0.1463281661272049, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281410813331604, | |
| "mean_token_accuracy": 0.7816678881645203, | |
| "num_tokens": 6339949.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.548899233341217, | |
| "epoch": 1.4514925373134329, | |
| "grad_norm": 0.22850677371025085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5660842657089233, | |
| "mean_token_accuracy": 0.7699355781078339, | |
| "num_tokens": 6356385.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.5538987964391708, | |
| "epoch": 1.455223880597015, | |
| "grad_norm": 0.14064767956733704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418739318847656, | |
| "mean_token_accuracy": 0.7807578295469284, | |
| "num_tokens": 6372804.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5599593967199326, | |
| "epoch": 1.458955223880597, | |
| "grad_norm": 0.18051759898662567, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524702072143555, | |
| "mean_token_accuracy": 0.776346430182457, | |
| "num_tokens": 6389040.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5202420800924301, | |
| "epoch": 1.462686567164179, | |
| "grad_norm": 0.14325307309627533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.519583523273468, | |
| "mean_token_accuracy": 0.7894969880580902, | |
| "num_tokens": 6405365.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5261730998754501, | |
| "epoch": 1.4664179104477613, | |
| "grad_norm": 0.1525595486164093, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307163596153259, | |
| "mean_token_accuracy": 0.7871128022670746, | |
| "num_tokens": 6421868.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5307900905609131, | |
| "epoch": 1.4701492537313432, | |
| "grad_norm": 0.19890250265598297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441185832023621, | |
| "mean_token_accuracy": 0.7786047160625458, | |
| "num_tokens": 6438616.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5521271824836731, | |
| "epoch": 1.4738805970149254, | |
| "grad_norm": 0.14049610495567322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551049113273621, | |
| "mean_token_accuracy": 0.7755014002323151, | |
| "num_tokens": 6455024.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.539069190621376, | |
| "epoch": 1.4776119402985075, | |
| "grad_norm": 0.1545083075761795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353712439537048, | |
| "mean_token_accuracy": 0.78336501121521, | |
| "num_tokens": 6471293.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5550021678209305, | |
| "epoch": 1.4813432835820897, | |
| "grad_norm": 0.18578873574733734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472472310066223, | |
| "mean_token_accuracy": 0.7796825766563416, | |
| "num_tokens": 6487641.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5490831285715103, | |
| "epoch": 1.4850746268656716, | |
| "grad_norm": 0.1240464299917221, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474961400032043, | |
| "mean_token_accuracy": 0.7774344980716705, | |
| "num_tokens": 6503822.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5393417626619339, | |
| "epoch": 1.4888059701492538, | |
| "grad_norm": 0.1891254484653473, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524366497993469, | |
| "mean_token_accuracy": 0.7745344191789627, | |
| "num_tokens": 6520011.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.513459712266922, | |
| "epoch": 1.4925373134328357, | |
| "grad_norm": 0.2974206805229187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200244188308716, | |
| "mean_token_accuracy": 0.7888158708810806, | |
| "num_tokens": 6536205.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5186173021793365, | |
| "epoch": 1.4962686567164178, | |
| "grad_norm": 0.15046866238117218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207955241203308, | |
| "mean_token_accuracy": 0.7867278605699539, | |
| "num_tokens": 6552440.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5499364733695984, | |
| "epoch": 1.5, | |
| "grad_norm": 0.4020411968231201, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530084371566772, | |
| "mean_token_accuracy": 0.7796496748924255, | |
| "num_tokens": 6568961.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5427668243646622, | |
| "epoch": 1.5037313432835822, | |
| "grad_norm": 0.11850416660308838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533820629119873, | |
| "mean_token_accuracy": 0.7840306162834167, | |
| "num_tokens": 6585550.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5325792133808136, | |
| "epoch": 1.5074626865671643, | |
| "grad_norm": 0.18302492797374725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534012496471405, | |
| "mean_token_accuracy": 0.7814914137125015, | |
| "num_tokens": 6601942.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5354548320174217, | |
| "epoch": 1.5111940298507462, | |
| "grad_norm": 0.15404394268989563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538587749004364, | |
| "mean_token_accuracy": 0.7822761088609695, | |
| "num_tokens": 6618440.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5441371351480484, | |
| "epoch": 1.5149253731343284, | |
| "grad_norm": 0.13057801127433777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542742908000946, | |
| "mean_token_accuracy": 0.7798959463834763, | |
| "num_tokens": 6634866.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.542233407497406, | |
| "epoch": 1.5186567164179103, | |
| "grad_norm": 0.14343421161174774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447250008583069, | |
| "mean_token_accuracy": 0.7802796810865402, | |
| "num_tokens": 6651150.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5407950282096863, | |
| "epoch": 1.5223880597014925, | |
| "grad_norm": 0.14996956288814545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389798879623413, | |
| "mean_token_accuracy": 0.7809374779462814, | |
| "num_tokens": 6667674.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5433390289545059, | |
| "epoch": 1.5261194029850746, | |
| "grad_norm": 0.1311637908220291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383128523826599, | |
| "mean_token_accuracy": 0.7790700197219849, | |
| "num_tokens": 6684068.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.527245432138443, | |
| "epoch": 1.5298507462686568, | |
| "grad_norm": 0.16411243379116058, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319215059280396, | |
| "mean_token_accuracy": 0.7840736508369446, | |
| "num_tokens": 6700752.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5146678760647774, | |
| "epoch": 1.533582089552239, | |
| "grad_norm": 0.1607578545808792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5198485851287842, | |
| "mean_token_accuracy": 0.7882288843393326, | |
| "num_tokens": 6716857.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5308386236429214, | |
| "epoch": 1.537313432835821, | |
| "grad_norm": 0.166807621717453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419335961341858, | |
| "mean_token_accuracy": 0.7812209129333496, | |
| "num_tokens": 6732981.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5488767176866531, | |
| "epoch": 1.5410447761194028, | |
| "grad_norm": 0.14006908237934113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508508086204529, | |
| "mean_token_accuracy": 0.7769163995981216, | |
| "num_tokens": 6749307.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5410346239805222, | |
| "epoch": 1.544776119402985, | |
| "grad_norm": 0.13224521279335022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321468710899353, | |
| "mean_token_accuracy": 0.7842406779527664, | |
| "num_tokens": 6765688.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5605396628379822, | |
| "epoch": 1.5485074626865671, | |
| "grad_norm": 0.1389547735452652, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529029369354248, | |
| "mean_token_accuracy": 0.7745459079742432, | |
| "num_tokens": 6782015.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5347004532814026, | |
| "epoch": 1.5522388059701493, | |
| "grad_norm": 0.1258436143398285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315224528312683, | |
| "mean_token_accuracy": 0.7851130068302155, | |
| "num_tokens": 6798206.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5425236374139786, | |
| "epoch": 1.5559701492537314, | |
| "grad_norm": 0.16927701234817505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464774370193481, | |
| "mean_token_accuracy": 0.7801399230957031, | |
| "num_tokens": 6814725.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5187622159719467, | |
| "epoch": 1.5597014925373134, | |
| "grad_norm": 0.13987842202186584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5246447920799255, | |
| "mean_token_accuracy": 0.7894206643104553, | |
| "num_tokens": 6831232.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5316571593284607, | |
| "epoch": 1.5634328358208955, | |
| "grad_norm": 0.15650241076946259, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538478434085846, | |
| "mean_token_accuracy": 0.7800242900848389, | |
| "num_tokens": 6847650.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5246055871248245, | |
| "epoch": 1.5671641791044775, | |
| "grad_norm": 0.13061542809009552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321829319000244, | |
| "mean_token_accuracy": 0.7838113605976105, | |
| "num_tokens": 6864019.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5212045907974243, | |
| "epoch": 1.5708955223880596, | |
| "grad_norm": 0.13846127688884735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200290679931641, | |
| "mean_token_accuracy": 0.7883654683828354, | |
| "num_tokens": 6880204.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.542250782251358, | |
| "epoch": 1.5746268656716418, | |
| "grad_norm": 0.12467647343873978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380762815475464, | |
| "mean_token_accuracy": 0.7811442613601685, | |
| "num_tokens": 6896430.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5405887067317963, | |
| "epoch": 1.578358208955224, | |
| "grad_norm": 0.1305769383907318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357393026351929, | |
| "mean_token_accuracy": 0.7828609347343445, | |
| "num_tokens": 6912971.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5287357568740845, | |
| "epoch": 1.582089552238806, | |
| "grad_norm": 0.17313086986541748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329744219779968, | |
| "mean_token_accuracy": 0.782240018248558, | |
| "num_tokens": 6929204.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5423530340194702, | |
| "epoch": 1.585820895522388, | |
| "grad_norm": 0.1359935700893402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5377368330955505, | |
| "mean_token_accuracy": 0.7828396558761597, | |
| "num_tokens": 6945791.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5215180069208145, | |
| "epoch": 1.5895522388059702, | |
| "grad_norm": 0.1547544300556183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314459800720215, | |
| "mean_token_accuracy": 0.7837548702955246, | |
| "num_tokens": 6961875.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.5231145992875099, | |
| "epoch": 1.5932835820895521, | |
| "grad_norm": 0.13578681647777557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277360677719116, | |
| "mean_token_accuracy": 0.7842715680599213, | |
| "num_tokens": 6978198.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5486603379249573, | |
| "epoch": 1.5970149253731343, | |
| "grad_norm": 0.15189069509506226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549156129360199, | |
| "mean_token_accuracy": 0.7768438756465912, | |
| "num_tokens": 6994444.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.54026959836483, | |
| "epoch": 1.6007462686567164, | |
| "grad_norm": 0.13162657618522644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345808863639832, | |
| "mean_token_accuracy": 0.7827611416578293, | |
| "num_tokens": 7010461.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.53890560567379, | |
| "epoch": 1.6044776119402986, | |
| "grad_norm": 0.133237823843956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350275635719299, | |
| "mean_token_accuracy": 0.7830039262771606, | |
| "num_tokens": 7026813.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5518313944339752, | |
| "epoch": 1.6082089552238807, | |
| "grad_norm": 0.14963583648204803, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478031039237976, | |
| "mean_token_accuracy": 0.7780435979366302, | |
| "num_tokens": 7043301.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5414951294660568, | |
| "epoch": 1.6119402985074627, | |
| "grad_norm": 0.12772321701049805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401883125305176, | |
| "mean_token_accuracy": 0.782444417476654, | |
| "num_tokens": 7059646.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5394223630428314, | |
| "epoch": 1.6156716417910446, | |
| "grad_norm": 0.13813580572605133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405031442642212, | |
| "mean_token_accuracy": 0.7798984050750732, | |
| "num_tokens": 7076271.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5429421365261078, | |
| "epoch": 1.6194029850746268, | |
| "grad_norm": 0.15601246058940887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516016483306885, | |
| "mean_token_accuracy": 0.775258257985115, | |
| "num_tokens": 7092578.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5521349459886551, | |
| "epoch": 1.623134328358209, | |
| "grad_norm": 0.14428818225860596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5492872595787048, | |
| "mean_token_accuracy": 0.7768293768167496, | |
| "num_tokens": 7109046.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5354936867952347, | |
| "epoch": 1.626865671641791, | |
| "grad_norm": 0.15073303878307343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428034663200378, | |
| "mean_token_accuracy": 0.780666396021843, | |
| "num_tokens": 7125466.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.5443413555622101, | |
| "epoch": 1.6305970149253732, | |
| "grad_norm": 0.14848864078521729, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486512780189514, | |
| "mean_token_accuracy": 0.7806312739849091, | |
| "num_tokens": 7141898.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5337215662002563, | |
| "epoch": 1.6343283582089554, | |
| "grad_norm": 0.15302547812461853, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392454862594604, | |
| "mean_token_accuracy": 0.7822044789791107, | |
| "num_tokens": 7158167.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5586158037185669, | |
| "epoch": 1.6380597014925373, | |
| "grad_norm": 0.17401555180549622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557881772518158, | |
| "mean_token_accuracy": 0.7756661027669907, | |
| "num_tokens": 7174477.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5406471788883209, | |
| "epoch": 1.6417910447761193, | |
| "grad_norm": 0.14608509838581085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353439450263977, | |
| "mean_token_accuracy": 0.7812080383300781, | |
| "num_tokens": 7190694.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5237606167793274, | |
| "epoch": 1.6455223880597014, | |
| "grad_norm": 0.1542704850435257, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5290042161941528, | |
| "mean_token_accuracy": 0.7855716645717621, | |
| "num_tokens": 7207153.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5269318968057632, | |
| "epoch": 1.6492537313432836, | |
| "grad_norm": 0.1659008413553238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530527651309967, | |
| "mean_token_accuracy": 0.7846795618534088, | |
| "num_tokens": 7223109.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5195682793855667, | |
| "epoch": 1.6529850746268657, | |
| "grad_norm": 0.14120091497898102, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263478755950928, | |
| "mean_token_accuracy": 0.7843965291976929, | |
| "num_tokens": 7239499.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5257822424173355, | |
| "epoch": 1.6567164179104479, | |
| "grad_norm": 0.1643773764371872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316389203071594, | |
| "mean_token_accuracy": 0.7851150333881378, | |
| "num_tokens": 7255730.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5377429872751236, | |
| "epoch": 1.6604477611940298, | |
| "grad_norm": 0.14926724135875702, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427424907684326, | |
| "mean_token_accuracy": 0.7824969440698624, | |
| "num_tokens": 7272167.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.538849800825119, | |
| "epoch": 1.664179104477612, | |
| "grad_norm": 0.13225945830345154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327820181846619, | |
| "mean_token_accuracy": 0.783388078212738, | |
| "num_tokens": 7288421.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5399289578199387, | |
| "epoch": 1.667910447761194, | |
| "grad_norm": 0.1308569759130478, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5292877554893494, | |
| "mean_token_accuracy": 0.7878285944461823, | |
| "num_tokens": 7304880.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5436895489692688, | |
| "epoch": 1.671641791044776, | |
| "grad_norm": 0.16895835101604462, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451297163963318, | |
| "mean_token_accuracy": 0.7789509892463684, | |
| "num_tokens": 7321256.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5504481792449951, | |
| "epoch": 1.6753731343283582, | |
| "grad_norm": 0.13614578545093536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539385080337524, | |
| "mean_token_accuracy": 0.7752430438995361, | |
| "num_tokens": 7337589.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5513797849416733, | |
| "epoch": 1.6791044776119404, | |
| "grad_norm": 0.15195772051811218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530341267585754, | |
| "mean_token_accuracy": 0.7749580442905426, | |
| "num_tokens": 7353883.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5413680523633957, | |
| "epoch": 1.6828358208955225, | |
| "grad_norm": 0.15170808136463165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543311357498169, | |
| "mean_token_accuracy": 0.7790023237466812, | |
| "num_tokens": 7370160.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5648334920406342, | |
| "epoch": 1.6865671641791045, | |
| "grad_norm": 0.1327073723077774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623019933700562, | |
| "mean_token_accuracy": 0.7708193957805634, | |
| "num_tokens": 7386478.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.517740860581398, | |
| "epoch": 1.6902985074626866, | |
| "grad_norm": 0.13745424151420593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5170730352401733, | |
| "mean_token_accuracy": 0.7882706969976425, | |
| "num_tokens": 7402645.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5524223297834396, | |
| "epoch": 1.6940298507462686, | |
| "grad_norm": 0.1598864197731018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490080714225769, | |
| "mean_token_accuracy": 0.7766116112470627, | |
| "num_tokens": 7419124.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5260176658630371, | |
| "epoch": 1.6977611940298507, | |
| "grad_norm": 0.13257424533367157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297276973724365, | |
| "mean_token_accuracy": 0.7853291928768158, | |
| "num_tokens": 7435508.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5325040817260742, | |
| "epoch": 1.7014925373134329, | |
| "grad_norm": 0.18319375813007355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543100118637085, | |
| "mean_token_accuracy": 0.7803790718317032, | |
| "num_tokens": 7451755.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5267694145441055, | |
| "epoch": 1.705223880597015, | |
| "grad_norm": 0.1554267704486847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5240468978881836, | |
| "mean_token_accuracy": 0.7871411740779877, | |
| "num_tokens": 7467919.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5426032692193985, | |
| "epoch": 1.7089552238805972, | |
| "grad_norm": 0.13706867396831512, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412613749504089, | |
| "mean_token_accuracy": 0.778879314661026, | |
| "num_tokens": 7484289.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5340660065412521, | |
| "epoch": 1.712686567164179, | |
| "grad_norm": 0.16726213693618774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392245650291443, | |
| "mean_token_accuracy": 0.7805332094430923, | |
| "num_tokens": 7500611.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5553819835186005, | |
| "epoch": 1.716417910447761, | |
| "grad_norm": 0.16255703568458557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517896413803101, | |
| "mean_token_accuracy": 0.7731162905693054, | |
| "num_tokens": 7517206.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5343479365110397, | |
| "epoch": 1.7201492537313432, | |
| "grad_norm": 0.13407304883003235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380552411079407, | |
| "mean_token_accuracy": 0.778910294175148, | |
| "num_tokens": 7533459.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5323963612318039, | |
| "epoch": 1.7238805970149254, | |
| "grad_norm": 0.1650952398777008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314269661903381, | |
| "mean_token_accuracy": 0.7864300310611725, | |
| "num_tokens": 7549589.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5433520078659058, | |
| "epoch": 1.7276119402985075, | |
| "grad_norm": 0.1429263949394226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540563702583313, | |
| "mean_token_accuracy": 0.7819092869758606, | |
| "num_tokens": 7566158.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5436968952417374, | |
| "epoch": 1.7313432835820897, | |
| "grad_norm": 0.14086155593395233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398205518722534, | |
| "mean_token_accuracy": 0.7809909284114838, | |
| "num_tokens": 7582422.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5534437447786331, | |
| "epoch": 1.7350746268656716, | |
| "grad_norm": 0.14618556201457977, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561552047729492, | |
| "mean_token_accuracy": 0.7724596560001373, | |
| "num_tokens": 7598771.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5396170765161514, | |
| "epoch": 1.7388059701492538, | |
| "grad_norm": 0.1190977543592453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389412641525269, | |
| "mean_token_accuracy": 0.7812270224094391, | |
| "num_tokens": 7615418.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5390318781137466, | |
| "epoch": 1.7425373134328357, | |
| "grad_norm": 0.15372450649738312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436992645263672, | |
| "mean_token_accuracy": 0.7814512252807617, | |
| "num_tokens": 7631840.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.5206413939595222, | |
| "epoch": 1.7462686567164178, | |
| "grad_norm": 0.13495191931724548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253979563713074, | |
| "mean_token_accuracy": 0.7877579927444458, | |
| "num_tokens": 7648131.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5223769247531891, | |
| "epoch": 1.75, | |
| "grad_norm": 0.15382781624794006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363397002220154, | |
| "mean_token_accuracy": 0.7828211337327957, | |
| "num_tokens": 7664453.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5333149433135986, | |
| "epoch": 1.7537313432835822, | |
| "grad_norm": 0.13387013971805573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351001620292664, | |
| "mean_token_accuracy": 0.7830037176609039, | |
| "num_tokens": 7680781.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5429620742797852, | |
| "epoch": 1.7574626865671643, | |
| "grad_norm": 0.13604114949703217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358593463897705, | |
| "mean_token_accuracy": 0.7837422788143158, | |
| "num_tokens": 7697310.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5731407701969147, | |
| "epoch": 1.7611940298507462, | |
| "grad_norm": 0.1410369724035263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5635945796966553, | |
| "mean_token_accuracy": 0.7718209028244019, | |
| "num_tokens": 7713558.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5679037570953369, | |
| "epoch": 1.7649253731343284, | |
| "grad_norm": 0.14904598891735077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5656334161758423, | |
| "mean_token_accuracy": 0.7714496552944183, | |
| "num_tokens": 7730117.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.5429675132036209, | |
| "epoch": 1.7686567164179103, | |
| "grad_norm": 0.1564645618200302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466417670249939, | |
| "mean_token_accuracy": 0.7782974392175674, | |
| "num_tokens": 7746633.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5362623929977417, | |
| "epoch": 1.7723880597014925, | |
| "grad_norm": 0.14919337630271912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442617535591125, | |
| "mean_token_accuracy": 0.778479665517807, | |
| "num_tokens": 7762813.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5283475816249847, | |
| "epoch": 1.7761194029850746, | |
| "grad_norm": 0.14363890886306763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296353101730347, | |
| "mean_token_accuracy": 0.7861494719982147, | |
| "num_tokens": 7778873.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5252759754657745, | |
| "epoch": 1.7798507462686568, | |
| "grad_norm": 0.17697355151176453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262605547904968, | |
| "mean_token_accuracy": 0.7861870229244232, | |
| "num_tokens": 7795362.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5341710150241852, | |
| "epoch": 1.783582089552239, | |
| "grad_norm": 0.13914838433265686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387526750564575, | |
| "mean_token_accuracy": 0.7779033482074738, | |
| "num_tokens": 7811639.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5409186482429504, | |
| "epoch": 1.787313432835821, | |
| "grad_norm": 0.14785298705101013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428853034973145, | |
| "mean_token_accuracy": 0.7777274399995804, | |
| "num_tokens": 7828116.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5548221617937088, | |
| "epoch": 1.7910447761194028, | |
| "grad_norm": 0.1457030326128006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512540340423584, | |
| "mean_token_accuracy": 0.7757317572832108, | |
| "num_tokens": 7844457.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5340719819068909, | |
| "epoch": 1.794776119402985, | |
| "grad_norm": 0.13429081439971924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289599299430847, | |
| "mean_token_accuracy": 0.7837049216032028, | |
| "num_tokens": 7860611.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5379914194345474, | |
| "epoch": 1.7985074626865671, | |
| "grad_norm": 0.13006342947483063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363917350769043, | |
| "mean_token_accuracy": 0.7821543663740158, | |
| "num_tokens": 7876837.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.5481665432453156, | |
| "epoch": 1.8022388059701493, | |
| "grad_norm": 0.14950798451900482, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466524362564087, | |
| "mean_token_accuracy": 0.7806346863508224, | |
| "num_tokens": 7893152.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5473506450653076, | |
| "epoch": 1.8059701492537314, | |
| "grad_norm": 0.14105349779129028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428904891014099, | |
| "mean_token_accuracy": 0.778725266456604, | |
| "num_tokens": 7909608.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5446173995733261, | |
| "epoch": 1.8097014925373134, | |
| "grad_norm": 0.15689605474472046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529049634933472, | |
| "mean_token_accuracy": 0.7787118703126907, | |
| "num_tokens": 7926042.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5260195583105087, | |
| "epoch": 1.8134328358208955, | |
| "grad_norm": 0.15744158625602722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373381972312927, | |
| "mean_token_accuracy": 0.7849460244178772, | |
| "num_tokens": 7942407.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5418536812067032, | |
| "epoch": 1.8171641791044775, | |
| "grad_norm": 0.14664271473884583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412867069244385, | |
| "mean_token_accuracy": 0.7811890542507172, | |
| "num_tokens": 7958995.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5519318580627441, | |
| "epoch": 1.8208955223880596, | |
| "grad_norm": 0.15384623408317566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512985587120056, | |
| "mean_token_accuracy": 0.7755472809076309, | |
| "num_tokens": 7975615.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.5366766899824142, | |
| "epoch": 1.8246268656716418, | |
| "grad_norm": 0.17651750147342682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435804128646851, | |
| "mean_token_accuracy": 0.7781522572040558, | |
| "num_tokens": 7991932.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5274553596973419, | |
| "epoch": 1.828358208955224, | |
| "grad_norm": 0.13903461396694183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304480195045471, | |
| "mean_token_accuracy": 0.7822371274232864, | |
| "num_tokens": 8008268.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.5359211266040802, | |
| "epoch": 1.832089552238806, | |
| "grad_norm": 0.1657918393611908, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5305460095405579, | |
| "mean_token_accuracy": 0.7854030579328537, | |
| "num_tokens": 8024551.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.5484016537666321, | |
| "epoch": 1.835820895522388, | |
| "grad_norm": 0.16684608161449432, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452835559844971, | |
| "mean_token_accuracy": 0.7772976756095886, | |
| "num_tokens": 8040823.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5474873930215836, | |
| "epoch": 1.8395522388059702, | |
| "grad_norm": 0.151128351688385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493411421775818, | |
| "mean_token_accuracy": 0.7793968617916107, | |
| "num_tokens": 8057509.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.526735208928585, | |
| "epoch": 1.8432835820895521, | |
| "grad_norm": 0.1347130686044693, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294213891029358, | |
| "mean_token_accuracy": 0.783684104681015, | |
| "num_tokens": 8073599.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5525032877922058, | |
| "epoch": 1.8470149253731343, | |
| "grad_norm": 0.14043265581130981, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447618961334229, | |
| "mean_token_accuracy": 0.7783424258232117, | |
| "num_tokens": 8089819.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5403036177158356, | |
| "epoch": 1.8507462686567164, | |
| "grad_norm": 0.13459749519824982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543724775314331, | |
| "mean_token_accuracy": 0.7801337391138077, | |
| "num_tokens": 8106320.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5121283084154129, | |
| "epoch": 1.8544776119402986, | |
| "grad_norm": 0.13925622403621674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5182461142539978, | |
| "mean_token_accuracy": 0.7902320176362991, | |
| "num_tokens": 8122590.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5341223925352097, | |
| "epoch": 1.8582089552238807, | |
| "grad_norm": 0.1333732157945633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352264642715454, | |
| "mean_token_accuracy": 0.7827399671077728, | |
| "num_tokens": 8138922.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.5457236468791962, | |
| "epoch": 1.8619402985074627, | |
| "grad_norm": 0.13741785287857056, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454993844032288, | |
| "mean_token_accuracy": 0.7798125892877579, | |
| "num_tokens": 8155306.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5553978830575943, | |
| "epoch": 1.8656716417910446, | |
| "grad_norm": 0.12911130487918854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489829778671265, | |
| "mean_token_accuracy": 0.7798224687576294, | |
| "num_tokens": 8171560.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5366699695587158, | |
| "epoch": 1.8694029850746268, | |
| "grad_norm": 0.14433807134628296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5305231213569641, | |
| "mean_token_accuracy": 0.7864150553941727, | |
| "num_tokens": 8188037.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5387077182531357, | |
| "epoch": 1.873134328358209, | |
| "grad_norm": 0.14472654461860657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373876094818115, | |
| "mean_token_accuracy": 0.7849652767181396, | |
| "num_tokens": 8204628.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5305859744548798, | |
| "epoch": 1.876865671641791, | |
| "grad_norm": 0.16016830503940582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409325361251831, | |
| "mean_token_accuracy": 0.7806791961193085, | |
| "num_tokens": 8220902.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5299341380596161, | |
| "epoch": 1.8805970149253732, | |
| "grad_norm": 0.15263962745666504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375992655754089, | |
| "mean_token_accuracy": 0.781559944152832, | |
| "num_tokens": 8237185.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.5437009185552597, | |
| "epoch": 1.8843283582089554, | |
| "grad_norm": 0.15553534030914307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443401336669922, | |
| "mean_token_accuracy": 0.7812230437994003, | |
| "num_tokens": 8253677.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5481602549552917, | |
| "epoch": 1.8880597014925373, | |
| "grad_norm": 0.14724990725517273, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540518581867218, | |
| "mean_token_accuracy": 0.7784458547830582, | |
| "num_tokens": 8270080.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5473358333110809, | |
| "epoch": 1.8917910447761193, | |
| "grad_norm": 0.13046710193157196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379562973976135, | |
| "mean_token_accuracy": 0.7840885818004608, | |
| "num_tokens": 8286417.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5339422821998596, | |
| "epoch": 1.8955223880597014, | |
| "grad_norm": 0.11970847100019455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531002402305603, | |
| "mean_token_accuracy": 0.7831601500511169, | |
| "num_tokens": 8302558.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5296764224767685, | |
| "epoch": 1.8992537313432836, | |
| "grad_norm": 0.1354552060365677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331873893737793, | |
| "mean_token_accuracy": 0.7870133370161057, | |
| "num_tokens": 8318741.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.52724589407444, | |
| "epoch": 1.9029850746268657, | |
| "grad_norm": 0.1636589914560318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382875800132751, | |
| "mean_token_accuracy": 0.7812641561031342, | |
| "num_tokens": 8335074.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5487582981586456, | |
| "epoch": 1.9067164179104479, | |
| "grad_norm": 0.15405811369419098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569562315940857, | |
| "mean_token_accuracy": 0.775174006819725, | |
| "num_tokens": 8351357.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5199541226029396, | |
| "epoch": 1.9104477611940298, | |
| "grad_norm": 0.13167649507522583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5217406749725342, | |
| "mean_token_accuracy": 0.788948193192482, | |
| "num_tokens": 8367452.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.5357903987169266, | |
| "epoch": 1.914179104477612, | |
| "grad_norm": 0.12568941712379456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307230949401855, | |
| "mean_token_accuracy": 0.7828755676746368, | |
| "num_tokens": 8383786.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5289642512798309, | |
| "epoch": 1.917910447761194, | |
| "grad_norm": 0.130939319729805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241107940673828, | |
| "mean_token_accuracy": 0.786993533372879, | |
| "num_tokens": 8400005.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5548314303159714, | |
| "epoch": 1.921641791044776, | |
| "grad_norm": 0.1255977749824524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506734848022461, | |
| "mean_token_accuracy": 0.7779561877250671, | |
| "num_tokens": 8416502.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5388498157262802, | |
| "epoch": 1.9253731343283582, | |
| "grad_norm": 0.13658908009529114, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440253615379333, | |
| "mean_token_accuracy": 0.7802704125642776, | |
| "num_tokens": 8432771.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5444848537445068, | |
| "epoch": 1.9291044776119404, | |
| "grad_norm": 0.1361331045627594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464693903923035, | |
| "mean_token_accuracy": 0.7777076661586761, | |
| "num_tokens": 8449261.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.545665979385376, | |
| "epoch": 1.9328358208955225, | |
| "grad_norm": 0.1317397505044937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444501638412476, | |
| "mean_token_accuracy": 0.7814345061779022, | |
| "num_tokens": 8465832.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.5405286103487015, | |
| "epoch": 1.9365671641791045, | |
| "grad_norm": 0.13252875208854675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404050946235657, | |
| "mean_token_accuracy": 0.780963346362114, | |
| "num_tokens": 8482176.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5433270484209061, | |
| "epoch": 1.9402985074626866, | |
| "grad_norm": 0.13105268776416779, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479311943054199, | |
| "mean_token_accuracy": 0.7770702540874481, | |
| "num_tokens": 8498438.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5341716408729553, | |
| "epoch": 1.9440298507462686, | |
| "grad_norm": 0.14269208908081055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535066545009613, | |
| "mean_token_accuracy": 0.7825455218553543, | |
| "num_tokens": 8514674.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5395411849021912, | |
| "epoch": 1.9477611940298507, | |
| "grad_norm": 0.13277186453342438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376089215278625, | |
| "mean_token_accuracy": 0.7824221551418304, | |
| "num_tokens": 8530963.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5529618561267853, | |
| "epoch": 1.9514925373134329, | |
| "grad_norm": 0.1381501704454422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493215918540955, | |
| "mean_token_accuracy": 0.779175415635109, | |
| "num_tokens": 8547488.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5260922610759735, | |
| "epoch": 1.955223880597015, | |
| "grad_norm": 0.1598714143037796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309720039367676, | |
| "mean_token_accuracy": 0.7842647433280945, | |
| "num_tokens": 8564003.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5258769541978836, | |
| "epoch": 1.9589552238805972, | |
| "grad_norm": 0.1397145837545395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533185601234436, | |
| "mean_token_accuracy": 0.7819601446390152, | |
| "num_tokens": 8580280.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5250103250145912, | |
| "epoch": 1.962686567164179, | |
| "grad_norm": 0.19406840205192566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373009443283081, | |
| "mean_token_accuracy": 0.7827760279178619, | |
| "num_tokens": 8596181.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.556450217962265, | |
| "epoch": 1.966417910447761, | |
| "grad_norm": 0.13848020136356354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5526891946792603, | |
| "mean_token_accuracy": 0.7767400592565536, | |
| "num_tokens": 8612545.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5524493604898453, | |
| "epoch": 1.9701492537313432, | |
| "grad_norm": 0.13262905180454254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456893444061279, | |
| "mean_token_accuracy": 0.7794637978076935, | |
| "num_tokens": 8628708.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5483785569667816, | |
| "epoch": 1.9738805970149254, | |
| "grad_norm": 0.13305608928203583, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419108271598816, | |
| "mean_token_accuracy": 0.7776815295219421, | |
| "num_tokens": 8645353.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5357464104890823, | |
| "epoch": 1.9776119402985075, | |
| "grad_norm": 0.18632404506206512, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538067102432251, | |
| "mean_token_accuracy": 0.7834661602973938, | |
| "num_tokens": 8661338.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5424002707004547, | |
| "epoch": 1.9813432835820897, | |
| "grad_norm": 0.14013341069221497, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498412251472473, | |
| "mean_token_accuracy": 0.7779710739850998, | |
| "num_tokens": 8677558.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5473677217960358, | |
| "epoch": 1.9850746268656716, | |
| "grad_norm": 0.16677168011665344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508783459663391, | |
| "mean_token_accuracy": 0.7754979729652405, | |
| "num_tokens": 8693871.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5417899936437607, | |
| "epoch": 1.9888059701492538, | |
| "grad_norm": 0.13049523532390594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387138724327087, | |
| "mean_token_accuracy": 0.7801752388477325, | |
| "num_tokens": 8710295.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.539973795413971, | |
| "epoch": 1.9925373134328357, | |
| "grad_norm": 0.13125836849212646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384909510612488, | |
| "mean_token_accuracy": 0.7825180888175964, | |
| "num_tokens": 8726574.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5503594130277634, | |
| "epoch": 1.9962686567164178, | |
| "grad_norm": 0.13576547801494598, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558905005455017, | |
| "mean_token_accuracy": 0.7731243073940277, | |
| "num_tokens": 8742903.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5420230776071548, | |
| "epoch": 2.0, | |
| "grad_norm": 0.13022863864898682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5468026399612427, | |
| "mean_token_accuracy": 0.7781520336866379, | |
| "num_tokens": 8759542.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5381979197263718, | |
| "epoch": 2.003731343283582, | |
| "grad_norm": 0.14043375849723816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527134358882904, | |
| "mean_token_accuracy": 0.7864610850811005, | |
| "num_tokens": 8775884.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5298552364110947, | |
| "epoch": 2.0074626865671643, | |
| "grad_norm": 0.15086792409420013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.525084912776947, | |
| "mean_token_accuracy": 0.7869725525379181, | |
| "num_tokens": 8792092.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5192188173532486, | |
| "epoch": 2.0111940298507465, | |
| "grad_norm": 0.19961106777191162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296894907951355, | |
| "mean_token_accuracy": 0.7826270759105682, | |
| "num_tokens": 8808558.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5123308524489403, | |
| "epoch": 2.014925373134328, | |
| "grad_norm": 0.19111908972263336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5212836265563965, | |
| "mean_token_accuracy": 0.789938747882843, | |
| "num_tokens": 8824957.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5178431421518326, | |
| "epoch": 2.0186567164179103, | |
| "grad_norm": 0.19028709828853607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238035917282104, | |
| "mean_token_accuracy": 0.7860684394836426, | |
| "num_tokens": 8841440.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.531784176826477, | |
| "epoch": 2.0223880597014925, | |
| "grad_norm": 0.15052154660224915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5242434144020081, | |
| "mean_token_accuracy": 0.7872632443904877, | |
| "num_tokens": 8857544.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.523473396897316, | |
| "epoch": 2.0261194029850746, | |
| "grad_norm": 0.16107355058193207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5132102966308594, | |
| "mean_token_accuracy": 0.7902694642543793, | |
| "num_tokens": 8873855.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5190383419394493, | |
| "epoch": 2.029850746268657, | |
| "grad_norm": 0.1708311289548874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5148621797561646, | |
| "mean_token_accuracy": 0.7895102798938751, | |
| "num_tokens": 8890117.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.529280424118042, | |
| "epoch": 2.033582089552239, | |
| "grad_norm": 0.16680803894996643, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307912826538086, | |
| "mean_token_accuracy": 0.7853487432003021, | |
| "num_tokens": 8906392.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.49614501744508743, | |
| "epoch": 2.0373134328358207, | |
| "grad_norm": 0.1503826081752777, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5012757182121277, | |
| "mean_token_accuracy": 0.7970542311668396, | |
| "num_tokens": 8922509.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.509469673037529, | |
| "epoch": 2.041044776119403, | |
| "grad_norm": 0.15220946073532104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5193155407905579, | |
| "mean_token_accuracy": 0.7900224179029465, | |
| "num_tokens": 8938730.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.5206529274582863, | |
| "epoch": 2.044776119402985, | |
| "grad_norm": 0.15667758882045746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5237014293670654, | |
| "mean_token_accuracy": 0.7895828038454056, | |
| "num_tokens": 8955181.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5195223838090897, | |
| "epoch": 2.048507462686567, | |
| "grad_norm": 0.1412286013364792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5065000653266907, | |
| "mean_token_accuracy": 0.7948807328939438, | |
| "num_tokens": 8971652.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5343464240431786, | |
| "epoch": 2.0522388059701493, | |
| "grad_norm": 0.17040982842445374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5262223482131958, | |
| "mean_token_accuracy": 0.7864163517951965, | |
| "num_tokens": 8987886.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5151650607585907, | |
| "epoch": 2.0559701492537314, | |
| "grad_norm": 0.18324047327041626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181486010551453, | |
| "mean_token_accuracy": 0.7915034592151642, | |
| "num_tokens": 9004065.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5399871617555618, | |
| "epoch": 2.0597014925373136, | |
| "grad_norm": 0.18549422919750214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452507138252258, | |
| "mean_token_accuracy": 0.7797505408525467, | |
| "num_tokens": 9020548.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5106882750988007, | |
| "epoch": 2.0634328358208953, | |
| "grad_norm": 0.18570005893707275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5167975425720215, | |
| "mean_token_accuracy": 0.7912678271532059, | |
| "num_tokens": 9036842.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.5242500603199005, | |
| "epoch": 2.0671641791044775, | |
| "grad_norm": 0.16008509695529938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222814083099365, | |
| "mean_token_accuracy": 0.7895151823759079, | |
| "num_tokens": 9053207.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5202578157186508, | |
| "epoch": 2.0708955223880596, | |
| "grad_norm": 0.158061683177948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.510570228099823, | |
| "mean_token_accuracy": 0.7938546240329742, | |
| "num_tokens": 9069710.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.5159406885504723, | |
| "epoch": 2.074626865671642, | |
| "grad_norm": 0.1673257201910019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5130877494812012, | |
| "mean_token_accuracy": 0.7952297329902649, | |
| "num_tokens": 9085896.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5333143472671509, | |
| "epoch": 2.078358208955224, | |
| "grad_norm": 0.1610044240951538, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534683108329773, | |
| "mean_token_accuracy": 0.7838889360427856, | |
| "num_tokens": 9102330.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.5199142321944237, | |
| "epoch": 2.082089552238806, | |
| "grad_norm": 0.18822608888149261, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304499864578247, | |
| "mean_token_accuracy": 0.7855323851108551, | |
| "num_tokens": 9118702.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5128015950322151, | |
| "epoch": 2.0858208955223883, | |
| "grad_norm": 0.16853775084018707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243670344352722, | |
| "mean_token_accuracy": 0.7870570570230484, | |
| "num_tokens": 9135161.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5174604654312134, | |
| "epoch": 2.08955223880597, | |
| "grad_norm": 0.1812400370836258, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5177437663078308, | |
| "mean_token_accuracy": 0.7915796935558319, | |
| "num_tokens": 9151704.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5173925012350082, | |
| "epoch": 2.093283582089552, | |
| "grad_norm": 0.1714162975549698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5103091597557068, | |
| "mean_token_accuracy": 0.7926450222730637, | |
| "num_tokens": 9167936.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.5338417440652847, | |
| "epoch": 2.0970149253731343, | |
| "grad_norm": 0.18883411586284637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264431834220886, | |
| "mean_token_accuracy": 0.7850892692804337, | |
| "num_tokens": 9184252.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.5227560251951218, | |
| "epoch": 2.1007462686567164, | |
| "grad_norm": 0.16431209444999695, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5194032192230225, | |
| "mean_token_accuracy": 0.7891248762607574, | |
| "num_tokens": 9200663.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5161062777042389, | |
| "epoch": 2.1044776119402986, | |
| "grad_norm": 0.19406329095363617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5161796808242798, | |
| "mean_token_accuracy": 0.7907394468784332, | |
| "num_tokens": 9216947.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.5179730951786041, | |
| "epoch": 2.1082089552238807, | |
| "grad_norm": 0.1819450706243515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243360996246338, | |
| "mean_token_accuracy": 0.7889621257781982, | |
| "num_tokens": 9233374.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5069013833999634, | |
| "epoch": 2.111940298507463, | |
| "grad_norm": 0.18256594240665436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5135838389396667, | |
| "mean_token_accuracy": 0.7917103320360184, | |
| "num_tokens": 9249879.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5135505869984627, | |
| "epoch": 2.1156716417910446, | |
| "grad_norm": 0.20573152601718903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5165933966636658, | |
| "mean_token_accuracy": 0.7909833937883377, | |
| "num_tokens": 9266246.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5395868420600891, | |
| "epoch": 2.1194029850746268, | |
| "grad_norm": 0.18927782773971558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330281853675842, | |
| "mean_token_accuracy": 0.7855703681707382, | |
| "num_tokens": 9282481.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.4938410297036171, | |
| "epoch": 2.123134328358209, | |
| "grad_norm": 0.19526073336601257, | |
| "learning_rate": 0.0002, | |
| "loss": 0.49382245540618896, | |
| "mean_token_accuracy": 0.7996838092803955, | |
| "num_tokens": 9298815.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5009667873382568, | |
| "epoch": 2.126865671641791, | |
| "grad_norm": 0.16595199704170227, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5045086741447449, | |
| "mean_token_accuracy": 0.7978608906269073, | |
| "num_tokens": 9315340.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5141628980636597, | |
| "epoch": 2.1305970149253732, | |
| "grad_norm": 0.21891801059246063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266185998916626, | |
| "mean_token_accuracy": 0.787352979183197, | |
| "num_tokens": 9331498.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5307284891605377, | |
| "epoch": 2.1343283582089554, | |
| "grad_norm": 0.1866699457168579, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5273443460464478, | |
| "mean_token_accuracy": 0.7860653698444366, | |
| "num_tokens": 9347831.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5239406228065491, | |
| "epoch": 2.138059701492537, | |
| "grad_norm": 0.16141167283058167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5189298391342163, | |
| "mean_token_accuracy": 0.7913686484098434, | |
| "num_tokens": 9364053.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5423860549926758, | |
| "epoch": 2.1417910447761193, | |
| "grad_norm": 0.21419642865657806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438653826713562, | |
| "mean_token_accuracy": 0.7800484448671341, | |
| "num_tokens": 9380482.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5319498926401138, | |
| "epoch": 2.1455223880597014, | |
| "grad_norm": 0.15394842624664307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297288298606873, | |
| "mean_token_accuracy": 0.7861971110105515, | |
| "num_tokens": 9396762.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5272255092859268, | |
| "epoch": 2.1492537313432836, | |
| "grad_norm": 0.17917747795581818, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221657156944275, | |
| "mean_token_accuracy": 0.78948013484478, | |
| "num_tokens": 9412981.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5195171386003494, | |
| "epoch": 2.1529850746268657, | |
| "grad_norm": 0.16095657646656036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5160609483718872, | |
| "mean_token_accuracy": 0.7911281585693359, | |
| "num_tokens": 9429393.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5020652115345001, | |
| "epoch": 2.156716417910448, | |
| "grad_norm": 0.1592203974723816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5017430782318115, | |
| "mean_token_accuracy": 0.7959037572145462, | |
| "num_tokens": 9445763.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5353998094797134, | |
| "epoch": 2.16044776119403, | |
| "grad_norm": 0.18405838310718536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360097885131836, | |
| "mean_token_accuracy": 0.7805107831954956, | |
| "num_tokens": 9462245.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5231145322322845, | |
| "epoch": 2.1641791044776117, | |
| "grad_norm": 0.16262777149677277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238299369812012, | |
| "mean_token_accuracy": 0.7883976399898529, | |
| "num_tokens": 9478792.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5025703385472298, | |
| "epoch": 2.167910447761194, | |
| "grad_norm": 0.16886277496814728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5095133185386658, | |
| "mean_token_accuracy": 0.7930570840835571, | |
| "num_tokens": 9495042.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5041064321994781, | |
| "epoch": 2.171641791044776, | |
| "grad_norm": 0.1545090675354004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5001657605171204, | |
| "mean_token_accuracy": 0.7950020581483841, | |
| "num_tokens": 9511399.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.533274233341217, | |
| "epoch": 2.175373134328358, | |
| "grad_norm": 0.15395475924015045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321199893951416, | |
| "mean_token_accuracy": 0.7817400395870209, | |
| "num_tokens": 9527796.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5225674957036972, | |
| "epoch": 2.1791044776119404, | |
| "grad_norm": 0.1874343305826187, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301029682159424, | |
| "mean_token_accuracy": 0.7839690893888474, | |
| "num_tokens": 9544098.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5206504017114639, | |
| "epoch": 2.1828358208955225, | |
| "grad_norm": 0.18132635951042175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5191587209701538, | |
| "mean_token_accuracy": 0.7905547767877579, | |
| "num_tokens": 9560486.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.5231298729777336, | |
| "epoch": 2.1865671641791047, | |
| "grad_norm": 0.19394823908805847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234656929969788, | |
| "mean_token_accuracy": 0.7889635264873505, | |
| "num_tokens": 9576893.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.4975113570690155, | |
| "epoch": 2.1902985074626864, | |
| "grad_norm": 0.1897096484899521, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5067098736763, | |
| "mean_token_accuracy": 0.7950832843780518, | |
| "num_tokens": 9593176.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5182362198829651, | |
| "epoch": 2.1940298507462686, | |
| "grad_norm": 0.21101859211921692, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5240258574485779, | |
| "mean_token_accuracy": 0.7852578610181808, | |
| "num_tokens": 9609529.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5308810174465179, | |
| "epoch": 2.1977611940298507, | |
| "grad_norm": 0.15612205862998962, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230595469474792, | |
| "mean_token_accuracy": 0.7886761873960495, | |
| "num_tokens": 9626018.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5405040681362152, | |
| "epoch": 2.201492537313433, | |
| "grad_norm": 0.16354262828826904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339536666870117, | |
| "mean_token_accuracy": 0.7827159017324448, | |
| "num_tokens": 9642340.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5320803225040436, | |
| "epoch": 2.205223880597015, | |
| "grad_norm": 0.1848597228527069, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349913835525513, | |
| "mean_token_accuracy": 0.7858193665742874, | |
| "num_tokens": 9658780.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5458312928676605, | |
| "epoch": 2.208955223880597, | |
| "grad_norm": 0.16995884478092194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466773509979248, | |
| "mean_token_accuracy": 0.7766650468111038, | |
| "num_tokens": 9675184.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.520288422703743, | |
| "epoch": 2.2126865671641793, | |
| "grad_norm": 0.17533989250659943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5276610851287842, | |
| "mean_token_accuracy": 0.7833162993192673, | |
| "num_tokens": 9691587.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5230257883667946, | |
| "epoch": 2.216417910447761, | |
| "grad_norm": 0.1576543152332306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5214830040931702, | |
| "mean_token_accuracy": 0.7887468189001083, | |
| "num_tokens": 9707639.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5276977717876434, | |
| "epoch": 2.220149253731343, | |
| "grad_norm": 0.16972552239894867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5270232558250427, | |
| "mean_token_accuracy": 0.7899148017168045, | |
| "num_tokens": 9723826.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5177433490753174, | |
| "epoch": 2.2238805970149254, | |
| "grad_norm": 0.17887970805168152, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5160896182060242, | |
| "mean_token_accuracy": 0.7925579845905304, | |
| "num_tokens": 9740088.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.525688573718071, | |
| "epoch": 2.2276119402985075, | |
| "grad_norm": 0.1659506857395172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277712345123291, | |
| "mean_token_accuracy": 0.7854456752538681, | |
| "num_tokens": 9756214.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5137215405702591, | |
| "epoch": 2.2313432835820897, | |
| "grad_norm": 0.18150706589221954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5194687247276306, | |
| "mean_token_accuracy": 0.7904618233442307, | |
| "num_tokens": 9772511.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.529701828956604, | |
| "epoch": 2.235074626865672, | |
| "grad_norm": 0.17603962123394012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309550166130066, | |
| "mean_token_accuracy": 0.7836979478597641, | |
| "num_tokens": 9788956.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5346364378929138, | |
| "epoch": 2.2388059701492535, | |
| "grad_norm": 0.17556419968605042, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340572595596313, | |
| "mean_token_accuracy": 0.7827766090631485, | |
| "num_tokens": 9805350.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5358438938856125, | |
| "epoch": 2.2425373134328357, | |
| "grad_norm": 0.19660161435604095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320678353309631, | |
| "mean_token_accuracy": 0.7855796813964844, | |
| "num_tokens": 9821744.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.5096235424280167, | |
| "epoch": 2.246268656716418, | |
| "grad_norm": 0.15900631248950958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5056334137916565, | |
| "mean_token_accuracy": 0.7966822683811188, | |
| "num_tokens": 9837824.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5357042700052261, | |
| "epoch": 2.25, | |
| "grad_norm": 0.1657211184501648, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354617238044739, | |
| "mean_token_accuracy": 0.7830197513103485, | |
| "num_tokens": 9854305.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5109390839934349, | |
| "epoch": 2.253731343283582, | |
| "grad_norm": 0.1763714998960495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5157687664031982, | |
| "mean_token_accuracy": 0.7923711538314819, | |
| "num_tokens": 9870793.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5191235095262527, | |
| "epoch": 2.2574626865671643, | |
| "grad_norm": 0.20325957238674164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5273858308792114, | |
| "mean_token_accuracy": 0.7857847660779953, | |
| "num_tokens": 9887144.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5128894448280334, | |
| "epoch": 2.2611940298507465, | |
| "grad_norm": 0.18303951621055603, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5150971412658691, | |
| "mean_token_accuracy": 0.7911935448646545, | |
| "num_tokens": 9903362.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.518405131995678, | |
| "epoch": 2.264925373134328, | |
| "grad_norm": 0.16138286888599396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5196152925491333, | |
| "mean_token_accuracy": 0.7916755676269531, | |
| "num_tokens": 9919665.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5238161385059357, | |
| "epoch": 2.2686567164179103, | |
| "grad_norm": 0.15336841344833374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234584808349609, | |
| "mean_token_accuracy": 0.7885531485080719, | |
| "num_tokens": 9936204.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5139288082718849, | |
| "epoch": 2.2723880597014925, | |
| "grad_norm": 0.15460564196109772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516942024230957, | |
| "mean_token_accuracy": 0.7878196388483047, | |
| "num_tokens": 9952444.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.5144378393888474, | |
| "epoch": 2.2761194029850746, | |
| "grad_norm": 0.16456560790538788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5143165588378906, | |
| "mean_token_accuracy": 0.7900296002626419, | |
| "num_tokens": 9968772.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5115328878164291, | |
| "epoch": 2.279850746268657, | |
| "grad_norm": 0.17883925139904022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5190625190734863, | |
| "mean_token_accuracy": 0.7872501909732819, | |
| "num_tokens": 9985174.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.535979226231575, | |
| "epoch": 2.283582089552239, | |
| "grad_norm": 0.1744793951511383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318659543991089, | |
| "mean_token_accuracy": 0.7878114283084869, | |
| "num_tokens": 10001610.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5348420441150665, | |
| "epoch": 2.2873134328358207, | |
| "grad_norm": 0.17023774981498718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370223522186279, | |
| "mean_token_accuracy": 0.783968135714531, | |
| "num_tokens": 10017829.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5138903260231018, | |
| "epoch": 2.291044776119403, | |
| "grad_norm": 0.17115749418735504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5157005190849304, | |
| "mean_token_accuracy": 0.7915801256895065, | |
| "num_tokens": 10034135.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.514953039586544, | |
| "epoch": 2.294776119402985, | |
| "grad_norm": 0.1999882459640503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5170516967773438, | |
| "mean_token_accuracy": 0.7916076630353928, | |
| "num_tokens": 10050500.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5247506201267242, | |
| "epoch": 2.298507462686567, | |
| "grad_norm": 0.16434574127197266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5179375410079956, | |
| "mean_token_accuracy": 0.7906480133533478, | |
| "num_tokens": 10066822.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5195427983999252, | |
| "epoch": 2.3022388059701493, | |
| "grad_norm": 0.16079425811767578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5192772746086121, | |
| "mean_token_accuracy": 0.788419172167778, | |
| "num_tokens": 10083211.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5161983221769333, | |
| "epoch": 2.3059701492537314, | |
| "grad_norm": 0.15893937647342682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5151652097702026, | |
| "mean_token_accuracy": 0.7913366705179214, | |
| "num_tokens": 10099502.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5129862576723099, | |
| "epoch": 2.3097014925373136, | |
| "grad_norm": 0.1990455985069275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5226958394050598, | |
| "mean_token_accuracy": 0.7890161275863647, | |
| "num_tokens": 10115875.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5259782820940018, | |
| "epoch": 2.3134328358208958, | |
| "grad_norm": 0.17600762844085693, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5303045511245728, | |
| "mean_token_accuracy": 0.784588485956192, | |
| "num_tokens": 10132329.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5374605804681778, | |
| "epoch": 2.3171641791044775, | |
| "grad_norm": 0.15160205960273743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319960117340088, | |
| "mean_token_accuracy": 0.7856357097625732, | |
| "num_tokens": 10148660.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5202681869268417, | |
| "epoch": 2.3208955223880596, | |
| "grad_norm": 0.17217791080474854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.513685405254364, | |
| "mean_token_accuracy": 0.7912963330745697, | |
| "num_tokens": 10164847.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5351561158895493, | |
| "epoch": 2.324626865671642, | |
| "grad_norm": 0.16189849376678467, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341706275939941, | |
| "mean_token_accuracy": 0.7827345281839371, | |
| "num_tokens": 10181330.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5096163898706436, | |
| "epoch": 2.328358208955224, | |
| "grad_norm": 0.17251546680927277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5183389186859131, | |
| "mean_token_accuracy": 0.7891778647899628, | |
| "num_tokens": 10197593.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5043528005480766, | |
| "epoch": 2.332089552238806, | |
| "grad_norm": 0.19364336133003235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5169776082038879, | |
| "mean_token_accuracy": 0.792061522603035, | |
| "num_tokens": 10213821.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.5118814930319786, | |
| "epoch": 2.3358208955223883, | |
| "grad_norm": 0.21755088865756989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260127782821655, | |
| "mean_token_accuracy": 0.7870439440011978, | |
| "num_tokens": 10229959.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5387731194496155, | |
| "epoch": 2.33955223880597, | |
| "grad_norm": 0.15599676966667175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359347462654114, | |
| "mean_token_accuracy": 0.7821696400642395, | |
| "num_tokens": 10246325.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5259936600923538, | |
| "epoch": 2.343283582089552, | |
| "grad_norm": 0.17784081399440765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5117411613464355, | |
| "mean_token_accuracy": 0.7913538813591003, | |
| "num_tokens": 10262854.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5261276811361313, | |
| "epoch": 2.3470149253731343, | |
| "grad_norm": 0.15290921926498413, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5141685009002686, | |
| "mean_token_accuracy": 0.7897167503833771, | |
| "num_tokens": 10279167.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.516872301697731, | |
| "epoch": 2.3507462686567164, | |
| "grad_norm": 0.16548150777816772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.518975555896759, | |
| "mean_token_accuracy": 0.7876042425632477, | |
| "num_tokens": 10295367.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5166520774364471, | |
| "epoch": 2.3544776119402986, | |
| "grad_norm": 0.2100355476140976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5216490030288696, | |
| "mean_token_accuracy": 0.7918855249881744, | |
| "num_tokens": 10311818.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5158288031816483, | |
| "epoch": 2.3582089552238807, | |
| "grad_norm": 0.19722220301628113, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301001667976379, | |
| "mean_token_accuracy": 0.785649761557579, | |
| "num_tokens": 10328226.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5121333077549934, | |
| "epoch": 2.361940298507463, | |
| "grad_norm": 0.18101061880588531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.514575719833374, | |
| "mean_token_accuracy": 0.7912623584270477, | |
| "num_tokens": 10344492.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5286690294742584, | |
| "epoch": 2.3656716417910446, | |
| "grad_norm": 0.18992973864078522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238395929336548, | |
| "mean_token_accuracy": 0.7872939556837082, | |
| "num_tokens": 10360763.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.504866473376751, | |
| "epoch": 2.3694029850746268, | |
| "grad_norm": 0.17053747177124023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5018288493156433, | |
| "mean_token_accuracy": 0.7963565587997437, | |
| "num_tokens": 10376794.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5348407328128815, | |
| "epoch": 2.373134328358209, | |
| "grad_norm": 0.1969325840473175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392089486122131, | |
| "mean_token_accuracy": 0.781823992729187, | |
| "num_tokens": 10393125.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5291974544525146, | |
| "epoch": 2.376865671641791, | |
| "grad_norm": 0.19346994161605835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330736637115479, | |
| "mean_token_accuracy": 0.781773254275322, | |
| "num_tokens": 10409537.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5348323583602905, | |
| "epoch": 2.3805970149253732, | |
| "grad_norm": 0.18969298899173737, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5274794101715088, | |
| "mean_token_accuracy": 0.787670373916626, | |
| "num_tokens": 10425973.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5205499678850174, | |
| "epoch": 2.3843283582089554, | |
| "grad_norm": 0.17864486575126648, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5213812589645386, | |
| "mean_token_accuracy": 0.7890082150697708, | |
| "num_tokens": 10442180.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.528412714600563, | |
| "epoch": 2.388059701492537, | |
| "grad_norm": 0.1959443986415863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534969687461853, | |
| "mean_token_accuracy": 0.7831798046827316, | |
| "num_tokens": 10458477.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5136244520545006, | |
| "epoch": 2.3917910447761193, | |
| "grad_norm": 0.20498400926589966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.511573314666748, | |
| "mean_token_accuracy": 0.7939646393060684, | |
| "num_tokens": 10475023.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.5202098488807678, | |
| "epoch": 2.3955223880597014, | |
| "grad_norm": 0.20506030321121216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5162352919578552, | |
| "mean_token_accuracy": 0.7906180173158646, | |
| "num_tokens": 10491313.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5307043790817261, | |
| "epoch": 2.3992537313432836, | |
| "grad_norm": 0.17971979081630707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288392305374146, | |
| "mean_token_accuracy": 0.7879067957401276, | |
| "num_tokens": 10507682.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5393616259098053, | |
| "epoch": 2.4029850746268657, | |
| "grad_norm": 0.23341259360313416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383281707763672, | |
| "mean_token_accuracy": 0.781504288315773, | |
| "num_tokens": 10524138.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.5379284471273422, | |
| "epoch": 2.406716417910448, | |
| "grad_norm": 0.16890797019004822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414294004440308, | |
| "mean_token_accuracy": 0.7795721143484116, | |
| "num_tokens": 10540308.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5295774638652802, | |
| "epoch": 2.41044776119403, | |
| "grad_norm": 0.2540934085845947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318943858146667, | |
| "mean_token_accuracy": 0.7859358042478561, | |
| "num_tokens": 10556760.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5170229598879814, | |
| "epoch": 2.4141791044776117, | |
| "grad_norm": 0.16737528145313263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.517413318157196, | |
| "mean_token_accuracy": 0.7901816219091415, | |
| "num_tokens": 10573293.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.526155412197113, | |
| "epoch": 2.417910447761194, | |
| "grad_norm": 0.2225574254989624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529864490032196, | |
| "mean_token_accuracy": 0.7856150567531586, | |
| "num_tokens": 10589674.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5266731381416321, | |
| "epoch": 2.421641791044776, | |
| "grad_norm": 0.16272951662540436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234624743461609, | |
| "mean_token_accuracy": 0.7885357886552811, | |
| "num_tokens": 10606101.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5251661986112595, | |
| "epoch": 2.425373134328358, | |
| "grad_norm": 0.17834821343421936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261815190315247, | |
| "mean_token_accuracy": 0.7859483957290649, | |
| "num_tokens": 10622240.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5259936600923538, | |
| "epoch": 2.4291044776119404, | |
| "grad_norm": 0.16211281716823578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267058610916138, | |
| "mean_token_accuracy": 0.7840430587530136, | |
| "num_tokens": 10638728.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5017556846141815, | |
| "epoch": 2.4328358208955225, | |
| "grad_norm": 0.3111971616744995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5085122585296631, | |
| "mean_token_accuracy": 0.7949473708868027, | |
| "num_tokens": 10654954.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.53680419921875, | |
| "epoch": 2.4365671641791042, | |
| "grad_norm": 0.17920435965061188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438150763511658, | |
| "mean_token_accuracy": 0.7806514501571655, | |
| "num_tokens": 10671142.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5328411310911179, | |
| "epoch": 2.4402985074626864, | |
| "grad_norm": 0.36842888593673706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365176200866699, | |
| "mean_token_accuracy": 0.7864848077297211, | |
| "num_tokens": 10687527.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5214048027992249, | |
| "epoch": 2.4440298507462686, | |
| "grad_norm": 0.15488730370998383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5212221145629883, | |
| "mean_token_accuracy": 0.7904541194438934, | |
| "num_tokens": 10703637.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5198699831962585, | |
| "epoch": 2.4477611940298507, | |
| "grad_norm": 0.17918945848941803, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5142287015914917, | |
| "mean_token_accuracy": 0.7930866479873657, | |
| "num_tokens": 10719755.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5371468216180801, | |
| "epoch": 2.451492537313433, | |
| "grad_norm": 0.17966963350772858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387783050537109, | |
| "mean_token_accuracy": 0.7836030423641205, | |
| "num_tokens": 10736159.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.523772120475769, | |
| "epoch": 2.455223880597015, | |
| "grad_norm": 0.17708872258663177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304325819015503, | |
| "mean_token_accuracy": 0.7857228368520737, | |
| "num_tokens": 10752300.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5180701240897179, | |
| "epoch": 2.458955223880597, | |
| "grad_norm": 0.18428592383861542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5193667411804199, | |
| "mean_token_accuracy": 0.7911625355482101, | |
| "num_tokens": 10768483.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.528245247900486, | |
| "epoch": 2.4626865671641793, | |
| "grad_norm": 0.1747596561908722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231127142906189, | |
| "mean_token_accuracy": 0.7906267046928406, | |
| "num_tokens": 10784872.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5145193934440613, | |
| "epoch": 2.466417910447761, | |
| "grad_norm": 0.16311223804950714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5083698630332947, | |
| "mean_token_accuracy": 0.7954908460378647, | |
| "num_tokens": 10801264.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5249892026185989, | |
| "epoch": 2.470149253731343, | |
| "grad_norm": 0.15471886098384857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5246090292930603, | |
| "mean_token_accuracy": 0.7875058203935623, | |
| "num_tokens": 10817509.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5209084749221802, | |
| "epoch": 2.4738805970149254, | |
| "grad_norm": 0.17972545325756073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200228095054626, | |
| "mean_token_accuracy": 0.7910773009061813, | |
| "num_tokens": 10833875.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5148312151432037, | |
| "epoch": 2.4776119402985075, | |
| "grad_norm": 0.20573753118515015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5257189273834229, | |
| "mean_token_accuracy": 0.7857212275266647, | |
| "num_tokens": 10849915.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5218161419034004, | |
| "epoch": 2.4813432835820897, | |
| "grad_norm": 0.18017825484275818, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281471014022827, | |
| "mean_token_accuracy": 0.7845035791397095, | |
| "num_tokens": 10866228.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.5220426917076111, | |
| "epoch": 2.485074626865672, | |
| "grad_norm": 0.16190138459205627, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521308183670044, | |
| "mean_token_accuracy": 0.7905032187700272, | |
| "num_tokens": 10882941.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5130190551280975, | |
| "epoch": 2.4888059701492535, | |
| "grad_norm": 0.17984949052333832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5067973732948303, | |
| "mean_token_accuracy": 0.7954512685537338, | |
| "num_tokens": 10899165.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5297238677740097, | |
| "epoch": 2.4925373134328357, | |
| "grad_norm": 0.15996725857257843, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296366810798645, | |
| "mean_token_accuracy": 0.785218670964241, | |
| "num_tokens": 10915443.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.4974808022379875, | |
| "epoch": 2.496268656716418, | |
| "grad_norm": 0.1793019324541092, | |
| "learning_rate": 0.0002, | |
| "loss": 0.4990445077419281, | |
| "mean_token_accuracy": 0.7966191321611404, | |
| "num_tokens": 10931711.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5239012390375137, | |
| "epoch": 2.5, | |
| "grad_norm": 0.19087010622024536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348339676856995, | |
| "mean_token_accuracy": 0.7859302014112473, | |
| "num_tokens": 10948023.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.502729706466198, | |
| "epoch": 2.503731343283582, | |
| "grad_norm": 0.17360597848892212, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5077179074287415, | |
| "mean_token_accuracy": 0.7953527718782425, | |
| "num_tokens": 10964233.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5206915363669395, | |
| "epoch": 2.5074626865671643, | |
| "grad_norm": 0.19746483862400055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238724946975708, | |
| "mean_token_accuracy": 0.7870853841304779, | |
| "num_tokens": 10980379.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.5450692474842072, | |
| "epoch": 2.5111940298507465, | |
| "grad_norm": 0.20202518999576569, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349087119102478, | |
| "mean_token_accuracy": 0.7814089059829712, | |
| "num_tokens": 10996761.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5313533395528793, | |
| "epoch": 2.5149253731343286, | |
| "grad_norm": 0.16622328758239746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5273463726043701, | |
| "mean_token_accuracy": 0.7876841723918915, | |
| "num_tokens": 11013002.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5233149528503418, | |
| "epoch": 2.5186567164179103, | |
| "grad_norm": 0.1762213557958603, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284275412559509, | |
| "mean_token_accuracy": 0.7885796874761581, | |
| "num_tokens": 11029461.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5161427110433578, | |
| "epoch": 2.5223880597014925, | |
| "grad_norm": 0.1734134405851364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5218281149864197, | |
| "mean_token_accuracy": 0.7900317013263702, | |
| "num_tokens": 11045513.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.527386263012886, | |
| "epoch": 2.5261194029850746, | |
| "grad_norm": 0.18649046123027802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264036655426025, | |
| "mean_token_accuracy": 0.7881919145584106, | |
| "num_tokens": 11061764.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5335260331630707, | |
| "epoch": 2.529850746268657, | |
| "grad_norm": 0.16608470678329468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327720046043396, | |
| "mean_token_accuracy": 0.7845087051391602, | |
| "num_tokens": 11077973.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5215242803096771, | |
| "epoch": 2.533582089552239, | |
| "grad_norm": 0.16991843283176422, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5201636552810669, | |
| "mean_token_accuracy": 0.7907481640577316, | |
| "num_tokens": 11094025.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5226395204663277, | |
| "epoch": 2.5373134328358207, | |
| "grad_norm": 0.16204343736171722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5192615389823914, | |
| "mean_token_accuracy": 0.7913714349269867, | |
| "num_tokens": 11110340.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5280646532773972, | |
| "epoch": 2.541044776119403, | |
| "grad_norm": 0.17025548219680786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243014097213745, | |
| "mean_token_accuracy": 0.7887150794267654, | |
| "num_tokens": 11126766.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5295235440135002, | |
| "epoch": 2.544776119402985, | |
| "grad_norm": 0.17332811653614044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264289975166321, | |
| "mean_token_accuracy": 0.7893000990152359, | |
| "num_tokens": 11143383.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5350908041000366, | |
| "epoch": 2.548507462686567, | |
| "grad_norm": 0.16494929790496826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385511517524719, | |
| "mean_token_accuracy": 0.7832952737808228, | |
| "num_tokens": 11159798.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5189319550991058, | |
| "epoch": 2.5522388059701493, | |
| "grad_norm": 0.1749635636806488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5244334936141968, | |
| "mean_token_accuracy": 0.7889615148305893, | |
| "num_tokens": 11176116.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5297338515520096, | |
| "epoch": 2.5559701492537314, | |
| "grad_norm": 0.16473545134067535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357664227485657, | |
| "mean_token_accuracy": 0.7839798331260681, | |
| "num_tokens": 11192242.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5161855816841125, | |
| "epoch": 2.5597014925373136, | |
| "grad_norm": 0.19246211647987366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5211361050605774, | |
| "mean_token_accuracy": 0.790752574801445, | |
| "num_tokens": 11208617.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.539324015378952, | |
| "epoch": 2.5634328358208958, | |
| "grad_norm": 0.16890385746955872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382983684539795, | |
| "mean_token_accuracy": 0.7826134711503983, | |
| "num_tokens": 11225201.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.5158891677856445, | |
| "epoch": 2.5671641791044775, | |
| "grad_norm": 0.16682742536067963, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5142616629600525, | |
| "mean_token_accuracy": 0.7918410003185272, | |
| "num_tokens": 11241695.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5267701372504234, | |
| "epoch": 2.5708955223880596, | |
| "grad_norm": 0.1687549650669098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5238382816314697, | |
| "mean_token_accuracy": 0.7866890728473663, | |
| "num_tokens": 11258089.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.5255937725305557, | |
| "epoch": 2.574626865671642, | |
| "grad_norm": 0.1738496869802475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5248072147369385, | |
| "mean_token_accuracy": 0.7852340638637543, | |
| "num_tokens": 11274450.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.5198262184858322, | |
| "epoch": 2.578358208955224, | |
| "grad_norm": 0.1690807044506073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5270042419433594, | |
| "mean_token_accuracy": 0.7855731099843979, | |
| "num_tokens": 11290865.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5405410379171371, | |
| "epoch": 2.582089552238806, | |
| "grad_norm": 0.18134285509586334, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444961786270142, | |
| "mean_token_accuracy": 0.7780175656080246, | |
| "num_tokens": 11307409.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5347141325473785, | |
| "epoch": 2.585820895522388, | |
| "grad_norm": 0.1676827371120453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311787724494934, | |
| "mean_token_accuracy": 0.784485325217247, | |
| "num_tokens": 11323946.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.503664955496788, | |
| "epoch": 2.58955223880597, | |
| "grad_norm": 0.17767618596553802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5004390478134155, | |
| "mean_token_accuracy": 0.7965147197246552, | |
| "num_tokens": 11340062.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5400541573762894, | |
| "epoch": 2.593283582089552, | |
| "grad_norm": 0.17085346579551697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395094156265259, | |
| "mean_token_accuracy": 0.781545028090477, | |
| "num_tokens": 11356660.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5177017226815224, | |
| "epoch": 2.5970149253731343, | |
| "grad_norm": 0.169759601354599, | |
| "learning_rate": 0.0002, | |
| "loss": 0.515388011932373, | |
| "mean_token_accuracy": 0.7907217293977737, | |
| "num_tokens": 11372963.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5352813154459, | |
| "epoch": 2.6007462686567164, | |
| "grad_norm": 0.17281876504421234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351260900497437, | |
| "mean_token_accuracy": 0.7841326892375946, | |
| "num_tokens": 11389640.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5045363381505013, | |
| "epoch": 2.6044776119402986, | |
| "grad_norm": 0.18615856766700745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5119503736495972, | |
| "mean_token_accuracy": 0.7933619618415833, | |
| "num_tokens": 11405795.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.521905705332756, | |
| "epoch": 2.6082089552238807, | |
| "grad_norm": 0.18743987381458282, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299134850502014, | |
| "mean_token_accuracy": 0.7850409299135208, | |
| "num_tokens": 11422045.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5174702405929565, | |
| "epoch": 2.611940298507463, | |
| "grad_norm": 0.17414018511772156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5177151560783386, | |
| "mean_token_accuracy": 0.7891951948404312, | |
| "num_tokens": 11438392.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5343185365200043, | |
| "epoch": 2.6156716417910446, | |
| "grad_norm": 0.17761462926864624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284934043884277, | |
| "mean_token_accuracy": 0.7868274599313736, | |
| "num_tokens": 11455009.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.53134885430336, | |
| "epoch": 2.6194029850746268, | |
| "grad_norm": 0.16672612726688385, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5203122496604919, | |
| "mean_token_accuracy": 0.7913379818201065, | |
| "num_tokens": 11471341.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.523793414235115, | |
| "epoch": 2.623134328358209, | |
| "grad_norm": 0.15720658004283905, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5188941359519958, | |
| "mean_token_accuracy": 0.7898289412260056, | |
| "num_tokens": 11487565.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5335910320281982, | |
| "epoch": 2.626865671641791, | |
| "grad_norm": 0.18207021057605743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383012294769287, | |
| "mean_token_accuracy": 0.7841922044754028, | |
| "num_tokens": 11503932.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5070014595985413, | |
| "epoch": 2.6305970149253732, | |
| "grad_norm": 0.18818838894367218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521304726600647, | |
| "mean_token_accuracy": 0.7882455736398697, | |
| "num_tokens": 11519876.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5179764032363892, | |
| "epoch": 2.6343283582089554, | |
| "grad_norm": 0.16391263902187347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277372598648071, | |
| "mean_token_accuracy": 0.7888714224100113, | |
| "num_tokens": 11536317.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.5383756011724472, | |
| "epoch": 2.638059701492537, | |
| "grad_norm": 0.20110981166362762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405253171920776, | |
| "mean_token_accuracy": 0.7808063477277756, | |
| "num_tokens": 11552655.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5268357321619987, | |
| "epoch": 2.6417910447761193, | |
| "grad_norm": 0.17326846718788147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239301919937134, | |
| "mean_token_accuracy": 0.7901074439287186, | |
| "num_tokens": 11568724.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5407274663448334, | |
| "epoch": 2.6455223880597014, | |
| "grad_norm": 0.16851350665092468, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350074172019958, | |
| "mean_token_accuracy": 0.7861216068267822, | |
| "num_tokens": 11585225.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5268073230981827, | |
| "epoch": 2.6492537313432836, | |
| "grad_norm": 0.19633817672729492, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5214436054229736, | |
| "mean_token_accuracy": 0.7898468226194382, | |
| "num_tokens": 11601498.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.535712480545044, | |
| "epoch": 2.6529850746268657, | |
| "grad_norm": 0.15659253299236298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353400707244873, | |
| "mean_token_accuracy": 0.7835351228713989, | |
| "num_tokens": 11617811.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.539536863565445, | |
| "epoch": 2.656716417910448, | |
| "grad_norm": 0.19012975692749023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403158068656921, | |
| "mean_token_accuracy": 0.780579537153244, | |
| "num_tokens": 11634295.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5134764388203621, | |
| "epoch": 2.66044776119403, | |
| "grad_norm": 0.16630828380584717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5213350653648376, | |
| "mean_token_accuracy": 0.7890530824661255, | |
| "num_tokens": 11650834.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.4917012006044388, | |
| "epoch": 2.664179104477612, | |
| "grad_norm": 0.1683693677186966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.49927788972854614, | |
| "mean_token_accuracy": 0.797902062535286, | |
| "num_tokens": 11667060.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5247212499380112, | |
| "epoch": 2.667910447761194, | |
| "grad_norm": 0.17371122539043427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344932079315186, | |
| "mean_token_accuracy": 0.783098891377449, | |
| "num_tokens": 11683574.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5191128477454185, | |
| "epoch": 2.671641791044776, | |
| "grad_norm": 0.16527095437049866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5183148384094238, | |
| "mean_token_accuracy": 0.790424644947052, | |
| "num_tokens": 11699720.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5185272991657257, | |
| "epoch": 2.675373134328358, | |
| "grad_norm": 0.16154323518276215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5092360973358154, | |
| "mean_token_accuracy": 0.7955475896596909, | |
| "num_tokens": 11716469.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5372938513755798, | |
| "epoch": 2.6791044776119404, | |
| "grad_norm": 0.15932703018188477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302359461784363, | |
| "mean_token_accuracy": 0.786151722073555, | |
| "num_tokens": 11732748.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5596635788679123, | |
| "epoch": 2.6828358208955225, | |
| "grad_norm": 0.18202805519104004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571697950363159, | |
| "mean_token_accuracy": 0.7754980325698853, | |
| "num_tokens": 11749150.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.5210409909486771, | |
| "epoch": 2.6865671641791042, | |
| "grad_norm": 0.1875341236591339, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5226970314979553, | |
| "mean_token_accuracy": 0.7895162850618362, | |
| "num_tokens": 11765442.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.528057724237442, | |
| "epoch": 2.6902985074626864, | |
| "grad_norm": 0.16192083060741425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281423330307007, | |
| "mean_token_accuracy": 0.788543164730072, | |
| "num_tokens": 11781875.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5093352198600769, | |
| "epoch": 2.6940298507462686, | |
| "grad_norm": 0.15824586153030396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5047670602798462, | |
| "mean_token_accuracy": 0.7923571020364761, | |
| "num_tokens": 11798168.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5319179147481918, | |
| "epoch": 2.6977611940298507, | |
| "grad_norm": 0.1545802354812622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334397554397583, | |
| "mean_token_accuracy": 0.7845843136310577, | |
| "num_tokens": 11814632.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5133816972374916, | |
| "epoch": 2.701492537313433, | |
| "grad_norm": 0.16241911053657532, | |
| "learning_rate": 0.0002, | |
| "loss": 0.51878821849823, | |
| "mean_token_accuracy": 0.7933190315961838, | |
| "num_tokens": 11831088.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5164139419794083, | |
| "epoch": 2.705223880597015, | |
| "grad_norm": 0.14982916414737701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5140745639801025, | |
| "mean_token_accuracy": 0.7934172451496124, | |
| "num_tokens": 11847470.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.521071195602417, | |
| "epoch": 2.708955223880597, | |
| "grad_norm": 0.17015258967876434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232289433479309, | |
| "mean_token_accuracy": 0.7887244522571564, | |
| "num_tokens": 11863757.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5184628516435623, | |
| "epoch": 2.7126865671641793, | |
| "grad_norm": 0.1840510219335556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5194827318191528, | |
| "mean_token_accuracy": 0.7879429012537003, | |
| "num_tokens": 11880261.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5139294788241386, | |
| "epoch": 2.716417910447761, | |
| "grad_norm": 0.19588088989257812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200832486152649, | |
| "mean_token_accuracy": 0.7899386137723923, | |
| "num_tokens": 11896585.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5239543169736862, | |
| "epoch": 2.720149253731343, | |
| "grad_norm": 0.20819295942783356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261701345443726, | |
| "mean_token_accuracy": 0.7911202013492584, | |
| "num_tokens": 11912923.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5407283902168274, | |
| "epoch": 2.7238805970149254, | |
| "grad_norm": 0.17276515066623688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370129942893982, | |
| "mean_token_accuracy": 0.7848152667284012, | |
| "num_tokens": 11929303.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.542425274848938, | |
| "epoch": 2.7276119402985075, | |
| "grad_norm": 0.25132983922958374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359519720077515, | |
| "mean_token_accuracy": 0.7846331894397736, | |
| "num_tokens": 11945440.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5357621908187866, | |
| "epoch": 2.7313432835820897, | |
| "grad_norm": 0.222070574760437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348407626152039, | |
| "mean_token_accuracy": 0.7818550020456314, | |
| "num_tokens": 11961949.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5185696631669998, | |
| "epoch": 2.7350746268656714, | |
| "grad_norm": 0.19711528718471527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264403223991394, | |
| "mean_token_accuracy": 0.7884511202573776, | |
| "num_tokens": 11978063.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.516778826713562, | |
| "epoch": 2.7388059701492535, | |
| "grad_norm": 0.24369676411151886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253380537033081, | |
| "mean_token_accuracy": 0.7903653234243393, | |
| "num_tokens": 11994278.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5164884254336357, | |
| "epoch": 2.7425373134328357, | |
| "grad_norm": 0.18417784571647644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5214477181434631, | |
| "mean_token_accuracy": 0.789106622338295, | |
| "num_tokens": 12010558.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5068091601133347, | |
| "epoch": 2.746268656716418, | |
| "grad_norm": 0.21942751109600067, | |
| "learning_rate": 0.0002, | |
| "loss": 0.513481855392456, | |
| "mean_token_accuracy": 0.7899149656295776, | |
| "num_tokens": 12026889.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5316798090934753, | |
| "epoch": 2.75, | |
| "grad_norm": 0.1581851989030838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230653285980225, | |
| "mean_token_accuracy": 0.7884569317102432, | |
| "num_tokens": 12043341.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.539380818605423, | |
| "epoch": 2.753731343283582, | |
| "grad_norm": 0.1578167974948883, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5292148590087891, | |
| "mean_token_accuracy": 0.7852563858032227, | |
| "num_tokens": 12059848.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5343874096870422, | |
| "epoch": 2.7574626865671643, | |
| "grad_norm": 0.19632823765277863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5295359492301941, | |
| "mean_token_accuracy": 0.783517986536026, | |
| "num_tokens": 12076134.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5188475027680397, | |
| "epoch": 2.7611940298507465, | |
| "grad_norm": 0.16950450837612152, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521928071975708, | |
| "mean_token_accuracy": 0.7883510291576385, | |
| "num_tokens": 12092406.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5121756568551064, | |
| "epoch": 2.7649253731343286, | |
| "grad_norm": 0.20061862468719482, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5192751884460449, | |
| "mean_token_accuracy": 0.7898274064064026, | |
| "num_tokens": 12108773.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5244594514369965, | |
| "epoch": 2.7686567164179103, | |
| "grad_norm": 0.16218306124210358, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5296685695648193, | |
| "mean_token_accuracy": 0.7826414853334427, | |
| "num_tokens": 12125082.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5244700759649277, | |
| "epoch": 2.7723880597014925, | |
| "grad_norm": 0.19114060699939728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232917070388794, | |
| "mean_token_accuracy": 0.7893050163984299, | |
| "num_tokens": 12141570.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5299672707915306, | |
| "epoch": 2.7761194029850746, | |
| "grad_norm": 0.15443415939807892, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5207250714302063, | |
| "mean_token_accuracy": 0.7905602306127548, | |
| "num_tokens": 12157874.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5345348864793777, | |
| "epoch": 2.779850746268657, | |
| "grad_norm": 0.1817025989294052, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311155319213867, | |
| "mean_token_accuracy": 0.785017192363739, | |
| "num_tokens": 12174053.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5195724815130234, | |
| "epoch": 2.783582089552239, | |
| "grad_norm": 0.157354474067688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5159887075424194, | |
| "mean_token_accuracy": 0.790684700012207, | |
| "num_tokens": 12190613.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5138278231024742, | |
| "epoch": 2.7873134328358207, | |
| "grad_norm": 0.16088353097438812, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5184983611106873, | |
| "mean_token_accuracy": 0.7899224907159805, | |
| "num_tokens": 12206928.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.5161465555429459, | |
| "epoch": 2.791044776119403, | |
| "grad_norm": 0.2099459022283554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232690572738647, | |
| "mean_token_accuracy": 0.7870688289403915, | |
| "num_tokens": 12223267.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.5158911049365997, | |
| "epoch": 2.794776119402985, | |
| "grad_norm": 0.15817788243293762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5168994665145874, | |
| "mean_token_accuracy": 0.7899310439825058, | |
| "num_tokens": 12239601.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5070392489433289, | |
| "epoch": 2.798507462686567, | |
| "grad_norm": 0.2228090614080429, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5200591087341309, | |
| "mean_token_accuracy": 0.7891372889280319, | |
| "num_tokens": 12256032.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5438189208507538, | |
| "epoch": 2.8022388059701493, | |
| "grad_norm": 0.1719558835029602, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426724553108215, | |
| "mean_token_accuracy": 0.7774887979030609, | |
| "num_tokens": 12272514.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.519834965467453, | |
| "epoch": 2.8059701492537314, | |
| "grad_norm": 0.18933889269828796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.523102343082428, | |
| "mean_token_accuracy": 0.7904316037893295, | |
| "num_tokens": 12288877.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.512350045144558, | |
| "epoch": 2.8097014925373136, | |
| "grad_norm": 0.1864548623561859, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5090078115463257, | |
| "mean_token_accuracy": 0.7945949882268906, | |
| "num_tokens": 12305044.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.5358164459466934, | |
| "epoch": 2.8134328358208958, | |
| "grad_norm": 0.17895784974098206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349195003509521, | |
| "mean_token_accuracy": 0.7841221541166306, | |
| "num_tokens": 12321579.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5124004110693932, | |
| "epoch": 2.8171641791044775, | |
| "grad_norm": 0.17669007182121277, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5126450657844543, | |
| "mean_token_accuracy": 0.7929520756006241, | |
| "num_tokens": 12338186.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5246561616659164, | |
| "epoch": 2.8208955223880596, | |
| "grad_norm": 0.19795700907707214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288596153259277, | |
| "mean_token_accuracy": 0.7869751006364822, | |
| "num_tokens": 12354327.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.5311583876609802, | |
| "epoch": 2.824626865671642, | |
| "grad_norm": 0.18146470189094543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294592976570129, | |
| "mean_token_accuracy": 0.7862387895584106, | |
| "num_tokens": 12370923.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5319194048643112, | |
| "epoch": 2.828358208955224, | |
| "grad_norm": 0.19238857924938202, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317291617393494, | |
| "mean_token_accuracy": 0.7854786366224289, | |
| "num_tokens": 12387257.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.526064857840538, | |
| "epoch": 2.832089552238806, | |
| "grad_norm": 0.1526212990283966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5222187042236328, | |
| "mean_token_accuracy": 0.7932349592447281, | |
| "num_tokens": 12403635.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5247229933738708, | |
| "epoch": 2.835820895522388, | |
| "grad_norm": 0.2871471047401428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314409136772156, | |
| "mean_token_accuracy": 0.7845473885536194, | |
| "num_tokens": 12420097.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5259681046009064, | |
| "epoch": 2.83955223880597, | |
| "grad_norm": 0.1705760359764099, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313333868980408, | |
| "mean_token_accuracy": 0.787728413939476, | |
| "num_tokens": 12436382.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5318069308996201, | |
| "epoch": 2.843283582089552, | |
| "grad_norm": 0.20162752270698547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359828472137451, | |
| "mean_token_accuracy": 0.7834303081035614, | |
| "num_tokens": 12452497.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5508353263139725, | |
| "epoch": 2.8470149253731343, | |
| "grad_norm": 0.161021888256073, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432707667350769, | |
| "mean_token_accuracy": 0.7808051556348801, | |
| "num_tokens": 12468969.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.5287757962942123, | |
| "epoch": 2.8507462686567164, | |
| "grad_norm": 0.2050207257270813, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284628868103027, | |
| "mean_token_accuracy": 0.7843392193317413, | |
| "num_tokens": 12485354.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5344215333461761, | |
| "epoch": 2.8544776119402986, | |
| "grad_norm": 0.1695808321237564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535874605178833, | |
| "mean_token_accuracy": 0.782726377248764, | |
| "num_tokens": 12501936.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.522572860121727, | |
| "epoch": 2.8582089552238807, | |
| "grad_norm": 0.19520296156406403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247471332550049, | |
| "mean_token_accuracy": 0.7886104881763458, | |
| "num_tokens": 12518330.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5314962714910507, | |
| "epoch": 2.861940298507463, | |
| "grad_norm": 0.17423976957798004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297841429710388, | |
| "mean_token_accuracy": 0.7862118780612946, | |
| "num_tokens": 12534665.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5281147062778473, | |
| "epoch": 2.8656716417910446, | |
| "grad_norm": 0.18605203926563263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324077606201172, | |
| "mean_token_accuracy": 0.787416860461235, | |
| "num_tokens": 12551009.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5187551081180573, | |
| "epoch": 2.8694029850746268, | |
| "grad_norm": 0.1616411954164505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.512826144695282, | |
| "mean_token_accuracy": 0.7936854958534241, | |
| "num_tokens": 12567387.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5136809647083282, | |
| "epoch": 2.873134328358209, | |
| "grad_norm": 0.17406195402145386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5155330300331116, | |
| "mean_token_accuracy": 0.7908283174037933, | |
| "num_tokens": 12583985.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5185445547103882, | |
| "epoch": 2.876865671641791, | |
| "grad_norm": 0.1833800971508026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5192956328392029, | |
| "mean_token_accuracy": 0.7888920605182648, | |
| "num_tokens": 12600193.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5310780256986618, | |
| "epoch": 2.8805970149253732, | |
| "grad_norm": 0.2313033938407898, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360459685325623, | |
| "mean_token_accuracy": 0.7845909744501114, | |
| "num_tokens": 12616559.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5207322463393211, | |
| "epoch": 2.8843283582089554, | |
| "grad_norm": 0.15698477625846863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.514286994934082, | |
| "mean_token_accuracy": 0.789328083395958, | |
| "num_tokens": 12633075.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5343746095895767, | |
| "epoch": 2.888059701492537, | |
| "grad_norm": 0.21191926300525665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408198833465576, | |
| "mean_token_accuracy": 0.7812719643115997, | |
| "num_tokens": 12649414.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5095183849334717, | |
| "epoch": 2.8917910447761193, | |
| "grad_norm": 0.1665944755077362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5066861510276794, | |
| "mean_token_accuracy": 0.7972470223903656, | |
| "num_tokens": 12665839.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5341623723506927, | |
| "epoch": 2.8955223880597014, | |
| "grad_norm": 0.19015316665172577, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310372114181519, | |
| "mean_token_accuracy": 0.7861314713954926, | |
| "num_tokens": 12682165.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5464024096727371, | |
| "epoch": 2.8992537313432836, | |
| "grad_norm": 0.19810722768306732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392264723777771, | |
| "mean_token_accuracy": 0.7843339294195175, | |
| "num_tokens": 12698576.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5175042897462845, | |
| "epoch": 2.9029850746268657, | |
| "grad_norm": 0.16263291239738464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5172262191772461, | |
| "mean_token_accuracy": 0.792850524187088, | |
| "num_tokens": 12714766.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5199488997459412, | |
| "epoch": 2.906716417910448, | |
| "grad_norm": 0.2083202749490738, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5252541899681091, | |
| "mean_token_accuracy": 0.7852817475795746, | |
| "num_tokens": 12731205.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.4941527247428894, | |
| "epoch": 2.91044776119403, | |
| "grad_norm": 0.17050482332706451, | |
| "learning_rate": 0.0002, | |
| "loss": 0.4989524185657501, | |
| "mean_token_accuracy": 0.7972326874732971, | |
| "num_tokens": 12747594.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5078647658228874, | |
| "epoch": 2.914179104477612, | |
| "grad_norm": 0.23199598491191864, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5211161971092224, | |
| "mean_token_accuracy": 0.7884382009506226, | |
| "num_tokens": 12763932.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5114319175481796, | |
| "epoch": 2.917910447761194, | |
| "grad_norm": 0.2023877501487732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5166995525360107, | |
| "mean_token_accuracy": 0.7941331118345261, | |
| "num_tokens": 12780023.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5287023633718491, | |
| "epoch": 2.921641791044776, | |
| "grad_norm": 0.21876347064971924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263211727142334, | |
| "mean_token_accuracy": 0.7864357531070709, | |
| "num_tokens": 12796441.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5223046839237213, | |
| "epoch": 2.925373134328358, | |
| "grad_norm": 0.14650550484657288, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5140995979309082, | |
| "mean_token_accuracy": 0.7916091233491898, | |
| "num_tokens": 12812793.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5247595310211182, | |
| "epoch": 2.9291044776119404, | |
| "grad_norm": 0.25079336762428284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263584852218628, | |
| "mean_token_accuracy": 0.786608412861824, | |
| "num_tokens": 12829172.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5266484171152115, | |
| "epoch": 2.9328358208955225, | |
| "grad_norm": 0.16101892292499542, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220364928245544, | |
| "mean_token_accuracy": 0.7872611582279205, | |
| "num_tokens": 12845573.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5139588639140129, | |
| "epoch": 2.9365671641791042, | |
| "grad_norm": 0.21128332614898682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5196605920791626, | |
| "mean_token_accuracy": 0.7880596816539764, | |
| "num_tokens": 12861897.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5052976161241531, | |
| "epoch": 2.9402985074626864, | |
| "grad_norm": 0.1861787587404251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5119534134864807, | |
| "mean_token_accuracy": 0.7939311414957047, | |
| "num_tokens": 12878193.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5310614109039307, | |
| "epoch": 2.9440298507462686, | |
| "grad_norm": 0.1857159435749054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301690101623535, | |
| "mean_token_accuracy": 0.786168098449707, | |
| "num_tokens": 12894935.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5310661867260933, | |
| "epoch": 2.9477611940298507, | |
| "grad_norm": 0.18339301645755768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5257419347763062, | |
| "mean_token_accuracy": 0.788611650466919, | |
| "num_tokens": 12911289.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5245337337255478, | |
| "epoch": 2.951492537313433, | |
| "grad_norm": 0.17652840912342072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5265839099884033, | |
| "mean_token_accuracy": 0.7901091575622559, | |
| "num_tokens": 12927670.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5245234072208405, | |
| "epoch": 2.955223880597015, | |
| "grad_norm": 0.17611214518547058, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243083834648132, | |
| "mean_token_accuracy": 0.7856577485799789, | |
| "num_tokens": 12944015.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5191880911588669, | |
| "epoch": 2.958955223880597, | |
| "grad_norm": 0.18345631659030914, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5257253050804138, | |
| "mean_token_accuracy": 0.7881710231304169, | |
| "num_tokens": 12960131.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.5140431523323059, | |
| "epoch": 2.9626865671641793, | |
| "grad_norm": 0.2098158448934555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5169271230697632, | |
| "mean_token_accuracy": 0.786968320608139, | |
| "num_tokens": 12976187.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.5335211008787155, | |
| "epoch": 2.966417910447761, | |
| "grad_norm": 0.15838965773582458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324181318283081, | |
| "mean_token_accuracy": 0.7819865345954895, | |
| "num_tokens": 12992461.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5252291113138199, | |
| "epoch": 2.970149253731343, | |
| "grad_norm": 0.19166119396686554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5205749869346619, | |
| "mean_token_accuracy": 0.7911773473024368, | |
| "num_tokens": 13008737.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.5154759585857391, | |
| "epoch": 2.9738805970149254, | |
| "grad_norm": 0.16444922983646393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5141779184341431, | |
| "mean_token_accuracy": 0.7922156006097794, | |
| "num_tokens": 13025092.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5257436707615852, | |
| "epoch": 2.9776119402985075, | |
| "grad_norm": 0.19890975952148438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353443622589111, | |
| "mean_token_accuracy": 0.7844508290290833, | |
| "num_tokens": 13041631.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5554878115653992, | |
| "epoch": 2.9813432835820897, | |
| "grad_norm": 0.19347697496414185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568645596504211, | |
| "mean_token_accuracy": 0.7741395682096481, | |
| "num_tokens": 13058045.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5262391567230225, | |
| "epoch": 2.9850746268656714, | |
| "grad_norm": 0.17874093353748322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5202043056488037, | |
| "mean_token_accuracy": 0.7870875149965286, | |
| "num_tokens": 13074443.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5318054854869843, | |
| "epoch": 2.9888059701492535, | |
| "grad_norm": 0.182646706700325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253685712814331, | |
| "mean_token_accuracy": 0.786090537905693, | |
| "num_tokens": 13090582.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.5484406352043152, | |
| "epoch": 2.9925373134328357, | |
| "grad_norm": 0.15745747089385986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452413558959961, | |
| "mean_token_accuracy": 0.7798783183097839, | |
| "num_tokens": 13106832.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.527185246348381, | |
| "epoch": 2.996268656716418, | |
| "grad_norm": 0.1789730340242386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5218254923820496, | |
| "mean_token_accuracy": 0.7895842045545578, | |
| "num_tokens": 13123002.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.5108470022678375, | |
| "epoch": 3.0, | |
| "grad_norm": 0.1871774047613144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5190352201461792, | |
| "mean_token_accuracy": 0.7890540361404419, | |
| "num_tokens": 13139156.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2242940510926275e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |