Instructions to use eac123/clean-subliminal-learning-otters with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/clean-subliminal-learning-otters with PEFT:
from peft import PeftModel
from transformers import AutoModelForCausalLM
base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/clean-subliminal-learning-otters")
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 798, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1405175626277924, | |
| "epoch": 0.0037593984962406013, | |
| "grad_norm": 0.40029582381248474, | |
| "learning_rate": 0.0002, | |
| "loss": 2.4748640060424805, | |
| "mean_token_accuracy": 0.5338118821382523, | |
| "num_tokens": 16246.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2275302708148956, | |
| "epoch": 0.007518796992481203, | |
| "grad_norm": 0.36828649044036865, | |
| "learning_rate": 0.0002, | |
| "loss": 2.125943660736084, | |
| "mean_token_accuracy": 0.5713680684566498, | |
| "num_tokens": 32716.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.4195487797260284, | |
| "epoch": 0.011278195488721804, | |
| "grad_norm": 0.29105839133262634, | |
| "learning_rate": 0.0002, | |
| "loss": 1.735130786895752, | |
| "mean_token_accuracy": 0.5909573882818222, | |
| "num_tokens": 48967.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.3783348500728607, | |
| "epoch": 0.015037593984962405, | |
| "grad_norm": 0.2323397547006607, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4040782451629639, | |
| "mean_token_accuracy": 0.6318088620901108, | |
| "num_tokens": 65467.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.3656240701675415, | |
| "epoch": 0.018796992481203006, | |
| "grad_norm": 0.2868480384349823, | |
| "learning_rate": 0.0002, | |
| "loss": 1.3035261631011963, | |
| "mean_token_accuracy": 0.6341304779052734, | |
| "num_tokens": 81665.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.264964371919632, | |
| "epoch": 0.022556390977443608, | |
| "grad_norm": 0.14605936408042908, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1722630262374878, | |
| "mean_token_accuracy": 0.6646067351102829, | |
| "num_tokens": 97913.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.1983447670936584, | |
| "epoch": 0.02631578947368421, | |
| "grad_norm": 0.10632229596376419, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1054309606552124, | |
| "mean_token_accuracy": 0.6686217486858368, | |
| "num_tokens": 113953.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.1218359470367432, | |
| "epoch": 0.03007518796992481, | |
| "grad_norm": 0.09761745482683182, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0230426788330078, | |
| "mean_token_accuracy": 0.676657035946846, | |
| "num_tokens": 130177.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.0549319684505463, | |
| "epoch": 0.03383458646616541, | |
| "grad_norm": 0.1231616735458374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9938599467277527, | |
| "mean_token_accuracy": 0.6875758469104767, | |
| "num_tokens": 146621.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.987179160118103, | |
| "epoch": 0.03759398496240601, | |
| "grad_norm": 0.11966806650161743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9243900775909424, | |
| "mean_token_accuracy": 0.6994709670543671, | |
| "num_tokens": 162843.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.935651957988739, | |
| "epoch": 0.041353383458646614, | |
| "grad_norm": 0.10380394756793976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.866508960723877, | |
| "mean_token_accuracy": 0.7096800655126572, | |
| "num_tokens": 179313.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.9110619872808456, | |
| "epoch": 0.045112781954887216, | |
| "grad_norm": 0.10094986110925674, | |
| "learning_rate": 0.0002, | |
| "loss": 0.832156240940094, | |
| "mean_token_accuracy": 0.7104088068008423, | |
| "num_tokens": 195785.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.855834111571312, | |
| "epoch": 0.04887218045112782, | |
| "grad_norm": 0.37487563490867615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8014079332351685, | |
| "mean_token_accuracy": 0.7197864800691605, | |
| "num_tokens": 212026.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.7773148268461227, | |
| "epoch": 0.05263157894736842, | |
| "grad_norm": 0.09044307470321655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7479192614555359, | |
| "mean_token_accuracy": 0.7304967045783997, | |
| "num_tokens": 228294.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7414887696504593, | |
| "epoch": 0.05639097744360902, | |
| "grad_norm": 0.11246141791343689, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7355879545211792, | |
| "mean_token_accuracy": 0.7314187586307526, | |
| "num_tokens": 244681.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.7010335773229599, | |
| "epoch": 0.06015037593984962, | |
| "grad_norm": 0.11098679155111313, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6920604109764099, | |
| "mean_token_accuracy": 0.7372281551361084, | |
| "num_tokens": 261053.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.6938799321651459, | |
| "epoch": 0.06390977443609022, | |
| "grad_norm": 0.08114200830459595, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6897510886192322, | |
| "mean_token_accuracy": 0.7408226281404495, | |
| "num_tokens": 277338.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.6835978478193283, | |
| "epoch": 0.06766917293233082, | |
| "grad_norm": 0.08077364414930344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6768285632133484, | |
| "mean_token_accuracy": 0.740087628364563, | |
| "num_tokens": 293709.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6589517742395401, | |
| "epoch": 0.07142857142857142, | |
| "grad_norm": 0.0879955068230629, | |
| "learning_rate": 0.0002, | |
| "loss": 0.65667724609375, | |
| "mean_token_accuracy": 0.7443644404411316, | |
| "num_tokens": 310128.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.6506444960832596, | |
| "epoch": 0.07518796992481203, | |
| "grad_norm": 0.080411896109581, | |
| "learning_rate": 0.0002, | |
| "loss": 0.641387403011322, | |
| "mean_token_accuracy": 0.7495939880609512, | |
| "num_tokens": 326607.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6619953960180283, | |
| "epoch": 0.07894736842105263, | |
| "grad_norm": 0.0845642164349556, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6475294232368469, | |
| "mean_token_accuracy": 0.7457321733236313, | |
| "num_tokens": 342774.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.6577392071485519, | |
| "epoch": 0.08270676691729323, | |
| "grad_norm": 0.07965292036533356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6407521367073059, | |
| "mean_token_accuracy": 0.7490587830543518, | |
| "num_tokens": 359099.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.6155381500720978, | |
| "epoch": 0.08646616541353383, | |
| "grad_norm": 0.07591664046049118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6092519760131836, | |
| "mean_token_accuracy": 0.7603109776973724, | |
| "num_tokens": 375179.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.5885609835386276, | |
| "epoch": 0.09022556390977443, | |
| "grad_norm": 0.06627360731363297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5951059460639954, | |
| "mean_token_accuracy": 0.7678095996379852, | |
| "num_tokens": 391354.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.5992416590452194, | |
| "epoch": 0.09398496240601503, | |
| "grad_norm": 0.08137614279985428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6067847013473511, | |
| "mean_token_accuracy": 0.7620100975036621, | |
| "num_tokens": 407719.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.6116904020309448, | |
| "epoch": 0.09774436090225563, | |
| "grad_norm": 0.06891811639070511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6175057888031006, | |
| "mean_token_accuracy": 0.7556122690439224, | |
| "num_tokens": 424041.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.6106788814067841, | |
| "epoch": 0.10150375939849623, | |
| "grad_norm": 0.059570278972387314, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5937588214874268, | |
| "mean_token_accuracy": 0.7666491121053696, | |
| "num_tokens": 440295.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.6181164085865021, | |
| "epoch": 0.10526315789473684, | |
| "grad_norm": 0.07394946366548538, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6043965220451355, | |
| "mean_token_accuracy": 0.7635089755058289, | |
| "num_tokens": 456614.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.6283685266971588, | |
| "epoch": 0.10902255639097744, | |
| "grad_norm": 0.07618279755115509, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6195181608200073, | |
| "mean_token_accuracy": 0.752281054854393, | |
| "num_tokens": 472965.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.5851932466030121, | |
| "epoch": 0.11278195488721804, | |
| "grad_norm": 0.05518079921603203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5881266593933105, | |
| "mean_token_accuracy": 0.7650770843029022, | |
| "num_tokens": 489391.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5895522385835648, | |
| "epoch": 0.11654135338345864, | |
| "grad_norm": 0.06688102334737778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6028741002082825, | |
| "mean_token_accuracy": 0.7601553350687027, | |
| "num_tokens": 505837.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.5878616869449615, | |
| "epoch": 0.12030075187969924, | |
| "grad_norm": 0.059780046343803406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6033408045768738, | |
| "mean_token_accuracy": 0.7582006454467773, | |
| "num_tokens": 522243.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5838498622179031, | |
| "epoch": 0.12406015037593984, | |
| "grad_norm": 0.04929976165294647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5896713137626648, | |
| "mean_token_accuracy": 0.761729434132576, | |
| "num_tokens": 538731.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5691559016704559, | |
| "epoch": 0.12781954887218044, | |
| "grad_norm": 0.06266291439533234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5734342932701111, | |
| "mean_token_accuracy": 0.7672057747840881, | |
| "num_tokens": 554848.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.5915598571300507, | |
| "epoch": 0.13157894736842105, | |
| "grad_norm": 0.06152564287185669, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5912453532218933, | |
| "mean_token_accuracy": 0.7633904218673706, | |
| "num_tokens": 571057.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.597556471824646, | |
| "epoch": 0.13533834586466165, | |
| "grad_norm": 0.04998990520834923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5882090330123901, | |
| "mean_token_accuracy": 0.7643049657344818, | |
| "num_tokens": 587326.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5905885845422745, | |
| "epoch": 0.13909774436090225, | |
| "grad_norm": 0.049017250537872314, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5855776071548462, | |
| "mean_token_accuracy": 0.7655442655086517, | |
| "num_tokens": 603538.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.586976170539856, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.046413078904151917, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5790608525276184, | |
| "mean_token_accuracy": 0.767949789762497, | |
| "num_tokens": 619734.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5844197869300842, | |
| "epoch": 0.14661654135338345, | |
| "grad_norm": 0.04495161026716232, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5842206478118896, | |
| "mean_token_accuracy": 0.7648505717515945, | |
| "num_tokens": 636104.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5523269921541214, | |
| "epoch": 0.15037593984962405, | |
| "grad_norm": 0.04233352467417717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523208975791931, | |
| "mean_token_accuracy": 0.7776841074228287, | |
| "num_tokens": 652478.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.569878563284874, | |
| "epoch": 0.15413533834586465, | |
| "grad_norm": 0.04850724712014198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5725483298301697, | |
| "mean_token_accuracy": 0.7687844336032867, | |
| "num_tokens": 669008.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5655312091112137, | |
| "epoch": 0.15789473684210525, | |
| "grad_norm": 0.04192538931965828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5679923892021179, | |
| "mean_token_accuracy": 0.7717834413051605, | |
| "num_tokens": 685165.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5601242333650589, | |
| "epoch": 0.16165413533834586, | |
| "grad_norm": 0.042079195380210876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594381093978882, | |
| "mean_token_accuracy": 0.7740506827831268, | |
| "num_tokens": 701529.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.575413703918457, | |
| "epoch": 0.16541353383458646, | |
| "grad_norm": 0.04416325315833092, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5747635364532471, | |
| "mean_token_accuracy": 0.7721781879663467, | |
| "num_tokens": 717922.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.5668691843748093, | |
| "epoch": 0.16917293233082706, | |
| "grad_norm": 0.05360032618045807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5617860555648804, | |
| "mean_token_accuracy": 0.7762805074453354, | |
| "num_tokens": 733933.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5761540979146957, | |
| "epoch": 0.17293233082706766, | |
| "grad_norm": 0.040452998131513596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704891085624695, | |
| "mean_token_accuracy": 0.7709734439849854, | |
| "num_tokens": 750555.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5610938370227814, | |
| "epoch": 0.17669172932330826, | |
| "grad_norm": 0.04221005737781525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613417029380798, | |
| "mean_token_accuracy": 0.7761952430009842, | |
| "num_tokens": 766693.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5707991421222687, | |
| "epoch": 0.18045112781954886, | |
| "grad_norm": 0.03976718708872795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5678077936172485, | |
| "mean_token_accuracy": 0.7737486809492111, | |
| "num_tokens": 783330.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5475099235773087, | |
| "epoch": 0.18421052631578946, | |
| "grad_norm": 0.04141751676797867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536777973175049, | |
| "mean_token_accuracy": 0.7761508077383041, | |
| "num_tokens": 799528.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5602568089962006, | |
| "epoch": 0.18796992481203006, | |
| "grad_norm": 0.04497222229838371, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5695174336433411, | |
| "mean_token_accuracy": 0.7716410309076309, | |
| "num_tokens": 815957.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5643552988767624, | |
| "epoch": 0.19172932330827067, | |
| "grad_norm": 0.041956499218940735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5748574733734131, | |
| "mean_token_accuracy": 0.7680526673793793, | |
| "num_tokens": 832365.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5510173141956329, | |
| "epoch": 0.19548872180451127, | |
| "grad_norm": 0.04074239730834961, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555428266525269, | |
| "mean_token_accuracy": 0.775487020611763, | |
| "num_tokens": 848532.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5738573223352432, | |
| "epoch": 0.19924812030075187, | |
| "grad_norm": 0.036227982491254807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651305913925171, | |
| "mean_token_accuracy": 0.7725107222795486, | |
| "num_tokens": 864646.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.5808417797088623, | |
| "epoch": 0.20300751879699247, | |
| "grad_norm": 0.03816494345664978, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638910531997681, | |
| "mean_token_accuracy": 0.7741686254739761, | |
| "num_tokens": 881239.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5693863034248352, | |
| "epoch": 0.20676691729323307, | |
| "grad_norm": 0.035037554800510406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5701916813850403, | |
| "mean_token_accuracy": 0.7687424123287201, | |
| "num_tokens": 897601.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5595564395189285, | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 0.038008302450180054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5662519931793213, | |
| "mean_token_accuracy": 0.7714412808418274, | |
| "num_tokens": 914184.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5745149552822113, | |
| "epoch": 0.21428571428571427, | |
| "grad_norm": 0.03566848114132881, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5779574513435364, | |
| "mean_token_accuracy": 0.7686354070901871, | |
| "num_tokens": 930380.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.5675694793462753, | |
| "epoch": 0.21804511278195488, | |
| "grad_norm": 0.03368304297327995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5728892087936401, | |
| "mean_token_accuracy": 0.7670125216245651, | |
| "num_tokens": 946749.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5651668012142181, | |
| "epoch": 0.22180451127819548, | |
| "grad_norm": 0.035859547555446625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706139802932739, | |
| "mean_token_accuracy": 0.7697967290878296, | |
| "num_tokens": 963053.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.5670004636049271, | |
| "epoch": 0.22556390977443608, | |
| "grad_norm": 0.03998008742928505, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5656613111495972, | |
| "mean_token_accuracy": 0.7728914767503738, | |
| "num_tokens": 979368.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5696548968553543, | |
| "epoch": 0.22932330827067668, | |
| "grad_norm": 0.04078423231840134, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5716832280158997, | |
| "mean_token_accuracy": 0.7699559330940247, | |
| "num_tokens": 995406.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.590179905295372, | |
| "epoch": 0.23308270676691728, | |
| "grad_norm": 0.0332336388528347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5876976847648621, | |
| "mean_token_accuracy": 0.7626538276672363, | |
| "num_tokens": 1011804.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5567612648010254, | |
| "epoch": 0.23684210526315788, | |
| "grad_norm": 0.033585552126169205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552665650844574, | |
| "mean_token_accuracy": 0.7773807644844055, | |
| "num_tokens": 1027984.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5729009807109833, | |
| "epoch": 0.24060150375939848, | |
| "grad_norm": 0.037177689373493195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5675500631332397, | |
| "mean_token_accuracy": 0.7715246975421906, | |
| "num_tokens": 1044274.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5565147399902344, | |
| "epoch": 0.24436090225563908, | |
| "grad_norm": 0.034301500767469406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531203150749207, | |
| "mean_token_accuracy": 0.7778400331735611, | |
| "num_tokens": 1060650.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5595405846834183, | |
| "epoch": 0.24812030075187969, | |
| "grad_norm": 0.032111674547195435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613226294517517, | |
| "mean_token_accuracy": 0.7748188674449921, | |
| "num_tokens": 1077082.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5684429109096527, | |
| "epoch": 0.2518796992481203, | |
| "grad_norm": 0.036634527146816254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5726494789123535, | |
| "mean_token_accuracy": 0.7709641754627228, | |
| "num_tokens": 1093328.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.5331402271986008, | |
| "epoch": 0.2556390977443609, | |
| "grad_norm": 0.03533982113003731, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389207601547241, | |
| "mean_token_accuracy": 0.7816744297742844, | |
| "num_tokens": 1109550.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5601552575826645, | |
| "epoch": 0.2593984962406015, | |
| "grad_norm": 0.03249680623412132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5670143961906433, | |
| "mean_token_accuracy": 0.7690982818603516, | |
| "num_tokens": 1125670.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5491845458745956, | |
| "epoch": 0.2631578947368421, | |
| "grad_norm": 0.03275011479854584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448943972587585, | |
| "mean_token_accuracy": 0.7807547152042389, | |
| "num_tokens": 1141797.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5585113912820816, | |
| "epoch": 0.2669172932330827, | |
| "grad_norm": 0.03664859011769295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560217022895813, | |
| "mean_token_accuracy": 0.7755073606967926, | |
| "num_tokens": 1158252.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.5534943342208862, | |
| "epoch": 0.2706766917293233, | |
| "grad_norm": 0.03374176472425461, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520960688591003, | |
| "mean_token_accuracy": 0.7764160335063934, | |
| "num_tokens": 1174369.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5600117444992065, | |
| "epoch": 0.2744360902255639, | |
| "grad_norm": 0.033763986080884933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588683485984802, | |
| "mean_token_accuracy": 0.7761770337820053, | |
| "num_tokens": 1190928.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.5625056624412537, | |
| "epoch": 0.2781954887218045, | |
| "grad_norm": 0.034332193434238434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600336790084839, | |
| "mean_token_accuracy": 0.7748808860778809, | |
| "num_tokens": 1207372.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5520483404397964, | |
| "epoch": 0.2819548872180451, | |
| "grad_norm": 0.03450694680213928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558054447174072, | |
| "mean_token_accuracy": 0.7750240415334702, | |
| "num_tokens": 1223643.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.5441252887248993, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.03436208888888359, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533716678619385, | |
| "mean_token_accuracy": 0.7759858965873718, | |
| "num_tokens": 1239688.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5603705495595932, | |
| "epoch": 0.2894736842105263, | |
| "grad_norm": 0.03493620082736015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5694956183433533, | |
| "mean_token_accuracy": 0.7717721164226532, | |
| "num_tokens": 1255884.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5612094402313232, | |
| "epoch": 0.2932330827067669, | |
| "grad_norm": 0.03372187912464142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608274936676025, | |
| "mean_token_accuracy": 0.7747389078140259, | |
| "num_tokens": 1271939.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.5706307291984558, | |
| "epoch": 0.29699248120300753, | |
| "grad_norm": 0.0331907719373703, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5624843239784241, | |
| "mean_token_accuracy": 0.7734071314334869, | |
| "num_tokens": 1288328.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5670299082994461, | |
| "epoch": 0.3007518796992481, | |
| "grad_norm": 0.033556245267391205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560691237449646, | |
| "mean_token_accuracy": 0.7734449654817581, | |
| "num_tokens": 1304760.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5619105398654938, | |
| "epoch": 0.30451127819548873, | |
| "grad_norm": 0.034520749002695084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578286647796631, | |
| "mean_token_accuracy": 0.774708941578865, | |
| "num_tokens": 1321100.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.5670763552188873, | |
| "epoch": 0.3082706766917293, | |
| "grad_norm": 0.04056672751903534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5737652778625488, | |
| "mean_token_accuracy": 0.76849165558815, | |
| "num_tokens": 1337796.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5314440876245499, | |
| "epoch": 0.31203007518796994, | |
| "grad_norm": 0.03262212499976158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535086989402771, | |
| "mean_token_accuracy": 0.7845727354288101, | |
| "num_tokens": 1354331.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5603013932704926, | |
| "epoch": 0.3157894736842105, | |
| "grad_norm": 0.036167021840810776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5675747394561768, | |
| "mean_token_accuracy": 0.771581381559372, | |
| "num_tokens": 1370543.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5526834577322006, | |
| "epoch": 0.31954887218045114, | |
| "grad_norm": 0.03807472810149193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507928729057312, | |
| "mean_token_accuracy": 0.7803521752357483, | |
| "num_tokens": 1386874.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5730793476104736, | |
| "epoch": 0.3233082706766917, | |
| "grad_norm": 0.03474927321076393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5660271644592285, | |
| "mean_token_accuracy": 0.7727594673633575, | |
| "num_tokens": 1403110.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.563334196805954, | |
| "epoch": 0.32706766917293234, | |
| "grad_norm": 0.03167711943387985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56499844789505, | |
| "mean_token_accuracy": 0.7736751586198807, | |
| "num_tokens": 1419614.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5451017022132874, | |
| "epoch": 0.3308270676691729, | |
| "grad_norm": 0.03233160078525543, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535646677017212, | |
| "mean_token_accuracy": 0.7740109711885452, | |
| "num_tokens": 1436028.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5493156313896179, | |
| "epoch": 0.33458646616541354, | |
| "grad_norm": 0.039253026247024536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615313649177551, | |
| "mean_token_accuracy": 0.7725273966789246, | |
| "num_tokens": 1452644.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5737167149782181, | |
| "epoch": 0.3383458646616541, | |
| "grad_norm": 0.032968465238809586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5743820667266846, | |
| "mean_token_accuracy": 0.7698662877082825, | |
| "num_tokens": 1469108.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5741334408521652, | |
| "epoch": 0.34210526315789475, | |
| "grad_norm": 0.040047451853752136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5673686265945435, | |
| "mean_token_accuracy": 0.7704142928123474, | |
| "num_tokens": 1485445.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5617086589336395, | |
| "epoch": 0.3458646616541353, | |
| "grad_norm": 0.03181539848446846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534920692443848, | |
| "mean_token_accuracy": 0.7758883982896805, | |
| "num_tokens": 1501801.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5597693920135498, | |
| "epoch": 0.34962406015037595, | |
| "grad_norm": 0.03365252912044525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625807046890259, | |
| "mean_token_accuracy": 0.7725406587123871, | |
| "num_tokens": 1518047.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.5496240109205246, | |
| "epoch": 0.3533834586466165, | |
| "grad_norm": 0.0320061519742012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572867393493652, | |
| "mean_token_accuracy": 0.7759815156459808, | |
| "num_tokens": 1534447.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5630564987659454, | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.03503059223294258, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5757870674133301, | |
| "mean_token_accuracy": 0.766523465514183, | |
| "num_tokens": 1550660.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5605316013097763, | |
| "epoch": 0.3609022556390977, | |
| "grad_norm": 0.032678134739398956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634536743164062, | |
| "mean_token_accuracy": 0.7716304063796997, | |
| "num_tokens": 1566883.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.5838266015052795, | |
| "epoch": 0.36466165413533835, | |
| "grad_norm": 0.030517758801579475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5759112238883972, | |
| "mean_token_accuracy": 0.7689571380615234, | |
| "num_tokens": 1583221.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.575135201215744, | |
| "epoch": 0.3684210526315789, | |
| "grad_norm": 0.03620682284235954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5637581944465637, | |
| "mean_token_accuracy": 0.7740969359874725, | |
| "num_tokens": 1599392.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.5724876075983047, | |
| "epoch": 0.37218045112781956, | |
| "grad_norm": 0.029337450861930847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643174052238464, | |
| "mean_token_accuracy": 0.77228944003582, | |
| "num_tokens": 1615899.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5502088665962219, | |
| "epoch": 0.37593984962406013, | |
| "grad_norm": 0.03381618484854698, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5598064661026001, | |
| "mean_token_accuracy": 0.7747711390256882, | |
| "num_tokens": 1632274.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5598712712526321, | |
| "epoch": 0.37969924812030076, | |
| "grad_norm": 0.03598952665925026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5719908475875854, | |
| "mean_token_accuracy": 0.7700261324644089, | |
| "num_tokens": 1648688.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5630699545145035, | |
| "epoch": 0.38345864661654133, | |
| "grad_norm": 0.031423430889844894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.565830409526825, | |
| "mean_token_accuracy": 0.7715611904859543, | |
| "num_tokens": 1665258.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5845702290534973, | |
| "epoch": 0.38721804511278196, | |
| "grad_norm": 0.02941996045410633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5816816687583923, | |
| "mean_token_accuracy": 0.7648696899414062, | |
| "num_tokens": 1681639.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.57722607254982, | |
| "epoch": 0.39097744360902253, | |
| "grad_norm": 0.034051019698381424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5756963491439819, | |
| "mean_token_accuracy": 0.7672083526849747, | |
| "num_tokens": 1698010.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5672426074743271, | |
| "epoch": 0.39473684210526316, | |
| "grad_norm": 0.03516025468707085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597167015075684, | |
| "mean_token_accuracy": 0.7757037431001663, | |
| "num_tokens": 1714351.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.5414413064718246, | |
| "epoch": 0.39849624060150374, | |
| "grad_norm": 0.03341100364923477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480563640594482, | |
| "mean_token_accuracy": 0.7781668901443481, | |
| "num_tokens": 1730536.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5462717562913895, | |
| "epoch": 0.40225563909774437, | |
| "grad_norm": 0.03385477513074875, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512043833732605, | |
| "mean_token_accuracy": 0.7787721008062363, | |
| "num_tokens": 1746896.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5501613169908524, | |
| "epoch": 0.40601503759398494, | |
| "grad_norm": 0.035874005407094955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561366081237793, | |
| "mean_token_accuracy": 0.7721621990203857, | |
| "num_tokens": 1763235.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5445860922336578, | |
| "epoch": 0.40977443609022557, | |
| "grad_norm": 0.030480582267045975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476114153862, | |
| "mean_token_accuracy": 0.7789607793092728, | |
| "num_tokens": 1779550.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.5542454719543457, | |
| "epoch": 0.41353383458646614, | |
| "grad_norm": 0.0321124792098999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565616488456726, | |
| "mean_token_accuracy": 0.7755739092826843, | |
| "num_tokens": 1795761.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5581567585468292, | |
| "epoch": 0.41729323308270677, | |
| "grad_norm": 0.0360286608338356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496086478233337, | |
| "mean_token_accuracy": 0.775969922542572, | |
| "num_tokens": 1811759.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.549008384346962, | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 0.029972167685627937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420917272567749, | |
| "mean_token_accuracy": 0.7787465006113052, | |
| "num_tokens": 1827840.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5631350576877594, | |
| "epoch": 0.424812030075188, | |
| "grad_norm": 0.028662627562880516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532713532447815, | |
| "mean_token_accuracy": 0.7749679088592529, | |
| "num_tokens": 1844167.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.5277586579322815, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.03287903964519501, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350267887115479, | |
| "mean_token_accuracy": 0.7830938249826431, | |
| "num_tokens": 1860530.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.5497393310070038, | |
| "epoch": 0.4323308270676692, | |
| "grad_norm": 0.03770268335938454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615973472595215, | |
| "mean_token_accuracy": 0.7720151543617249, | |
| "num_tokens": 1876970.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5729877650737762, | |
| "epoch": 0.43609022556390975, | |
| "grad_norm": 0.033978965133428574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5777981877326965, | |
| "mean_token_accuracy": 0.7680597454309464, | |
| "num_tokens": 1893575.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.5504349619150162, | |
| "epoch": 0.4398496240601504, | |
| "grad_norm": 0.03185052052140236, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459255576133728, | |
| "mean_token_accuracy": 0.7792946100234985, | |
| "num_tokens": 1909809.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5565227419137955, | |
| "epoch": 0.44360902255639095, | |
| "grad_norm": 0.028807369992136955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551781177520752, | |
| "mean_token_accuracy": 0.7776060104370117, | |
| "num_tokens": 1925981.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.5547512769699097, | |
| "epoch": 0.4473684210526316, | |
| "grad_norm": 0.0315021388232708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484083890914917, | |
| "mean_token_accuracy": 0.7798104882240295, | |
| "num_tokens": 1942636.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.5606597065925598, | |
| "epoch": 0.45112781954887216, | |
| "grad_norm": 0.02974752150475979, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633252263069153, | |
| "mean_token_accuracy": 0.7710647433996201, | |
| "num_tokens": 1959143.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5621981024742126, | |
| "epoch": 0.4548872180451128, | |
| "grad_norm": 0.03396495804190636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5700369477272034, | |
| "mean_token_accuracy": 0.7708666622638702, | |
| "num_tokens": 1975709.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5484206080436707, | |
| "epoch": 0.45864661654135336, | |
| "grad_norm": 0.03273981064558029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5635251998901367, | |
| "mean_token_accuracy": 0.7709483653306961, | |
| "num_tokens": 1992105.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5378261581063271, | |
| "epoch": 0.462406015037594, | |
| "grad_norm": 0.03221985325217247, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449070334434509, | |
| "mean_token_accuracy": 0.7815380096435547, | |
| "num_tokens": 2008467.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5606098920106888, | |
| "epoch": 0.46616541353383456, | |
| "grad_norm": 0.03314457833766937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563465416431427, | |
| "mean_token_accuracy": 0.7709829658269882, | |
| "num_tokens": 2024710.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5656619518995285, | |
| "epoch": 0.4699248120300752, | |
| "grad_norm": 0.03133262321352959, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610048174858093, | |
| "mean_token_accuracy": 0.7718383222818375, | |
| "num_tokens": 2040853.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5635328441858292, | |
| "epoch": 0.47368421052631576, | |
| "grad_norm": 0.030308736488223076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604254007339478, | |
| "mean_token_accuracy": 0.7731337696313858, | |
| "num_tokens": 2057006.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.57016222178936, | |
| "epoch": 0.4774436090225564, | |
| "grad_norm": 0.03194103017449379, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5620253086090088, | |
| "mean_token_accuracy": 0.7717723101377487, | |
| "num_tokens": 2073332.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.5490193665027618, | |
| "epoch": 0.48120300751879697, | |
| "grad_norm": 0.02910369262099266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538103580474854, | |
| "mean_token_accuracy": 0.7780880033969879, | |
| "num_tokens": 2089495.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5662434548139572, | |
| "epoch": 0.4849624060150376, | |
| "grad_norm": 0.029468489810824394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5681107044219971, | |
| "mean_token_accuracy": 0.7689958661794662, | |
| "num_tokens": 2106114.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5431465953588486, | |
| "epoch": 0.48872180451127817, | |
| "grad_norm": 0.03223656490445137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507116317749023, | |
| "mean_token_accuracy": 0.7764191329479218, | |
| "num_tokens": 2122567.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.5563855171203613, | |
| "epoch": 0.4924812030075188, | |
| "grad_norm": 0.028281886130571365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5583161115646362, | |
| "mean_token_accuracy": 0.7736326307058334, | |
| "num_tokens": 2139083.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5674906224012375, | |
| "epoch": 0.49624060150375937, | |
| "grad_norm": 0.02878589555621147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564136803150177, | |
| "mean_token_accuracy": 0.7724441289901733, | |
| "num_tokens": 2155542.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5472439229488373, | |
| "epoch": 0.5, | |
| "grad_norm": 0.029321735724806786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442805290222168, | |
| "mean_token_accuracy": 0.7798047512769699, | |
| "num_tokens": 2171801.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.565643772482872, | |
| "epoch": 0.5037593984962406, | |
| "grad_norm": 0.028855223208665848, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595606565475464, | |
| "mean_token_accuracy": 0.774070993065834, | |
| "num_tokens": 2188167.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5532195568084717, | |
| "epoch": 0.5075187969924813, | |
| "grad_norm": 0.03198866546154022, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5570374131202698, | |
| "mean_token_accuracy": 0.7740880846977234, | |
| "num_tokens": 2204470.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5408245772123337, | |
| "epoch": 0.5112781954887218, | |
| "grad_norm": 0.030379725620150566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514412522315979, | |
| "mean_token_accuracy": 0.7769049108028412, | |
| "num_tokens": 2220739.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.5346933305263519, | |
| "epoch": 0.5150375939849624, | |
| "grad_norm": 0.03085665963590145, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364114046096802, | |
| "mean_token_accuracy": 0.7843690663576126, | |
| "num_tokens": 2237147.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5493077784776688, | |
| "epoch": 0.518796992481203, | |
| "grad_norm": 0.02923487313091755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560771822929382, | |
| "mean_token_accuracy": 0.7737279832363129, | |
| "num_tokens": 2253415.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5472232103347778, | |
| "epoch": 0.5225563909774437, | |
| "grad_norm": 0.031521063297986984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497399568557739, | |
| "mean_token_accuracy": 0.777409166097641, | |
| "num_tokens": 2269589.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5515349954366684, | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 0.02956547960639, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464341640472412, | |
| "mean_token_accuracy": 0.7794498354196548, | |
| "num_tokens": 2285953.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5558236241340637, | |
| "epoch": 0.5300751879699248, | |
| "grad_norm": 0.02974775619804859, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577874779701233, | |
| "mean_token_accuracy": 0.7712955176830292, | |
| "num_tokens": 2302120.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5856722742319107, | |
| "epoch": 0.5338345864661654, | |
| "grad_norm": 0.03199459984898567, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5856820940971375, | |
| "mean_token_accuracy": 0.7616758495569229, | |
| "num_tokens": 2318555.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5560419261455536, | |
| "epoch": 0.5375939849624061, | |
| "grad_norm": 0.03210260346531868, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606544613838196, | |
| "mean_token_accuracy": 0.7734680622816086, | |
| "num_tokens": 2334764.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5652720183134079, | |
| "epoch": 0.5413533834586466, | |
| "grad_norm": 0.025965852662920952, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562166690826416, | |
| "mean_token_accuracy": 0.77190200984478, | |
| "num_tokens": 2351198.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.531855046749115, | |
| "epoch": 0.5451127819548872, | |
| "grad_norm": 0.029480863362550735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261865854263306, | |
| "mean_token_accuracy": 0.7886765003204346, | |
| "num_tokens": 2367340.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5517164468765259, | |
| "epoch": 0.5488721804511278, | |
| "grad_norm": 0.03105936385691166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542973875999451, | |
| "mean_token_accuracy": 0.7738576829433441, | |
| "num_tokens": 2383605.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.5376151502132416, | |
| "epoch": 0.5526315789473685, | |
| "grad_norm": 0.03337828442454338, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453506708145142, | |
| "mean_token_accuracy": 0.7788939327001572, | |
| "num_tokens": 2399719.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5623980462551117, | |
| "epoch": 0.556390977443609, | |
| "grad_norm": 0.028280731290578842, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560990035533905, | |
| "mean_token_accuracy": 0.7726676762104034, | |
| "num_tokens": 2416182.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5573243647813797, | |
| "epoch": 0.5601503759398496, | |
| "grad_norm": 0.032505616545677185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568500757217407, | |
| "mean_token_accuracy": 0.7742682248353958, | |
| "num_tokens": 2432558.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5573329925537109, | |
| "epoch": 0.5639097744360902, | |
| "grad_norm": 0.03238248452544212, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538819432258606, | |
| "mean_token_accuracy": 0.777379959821701, | |
| "num_tokens": 2448908.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5407138615846634, | |
| "epoch": 0.5676691729323309, | |
| "grad_norm": 0.02900576777756214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466345548629761, | |
| "mean_token_accuracy": 0.7775551229715347, | |
| "num_tokens": 2465270.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.554168626666069, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.0312657356262207, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629188418388367, | |
| "mean_token_accuracy": 0.7751999050378799, | |
| "num_tokens": 2481577.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5447106957435608, | |
| "epoch": 0.575187969924812, | |
| "grad_norm": 0.02679499238729477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434100031852722, | |
| "mean_token_accuracy": 0.7805473357439041, | |
| "num_tokens": 2498025.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.5469905585050583, | |
| "epoch": 0.5789473684210527, | |
| "grad_norm": 0.03267526254057884, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438751578330994, | |
| "mean_token_accuracy": 0.7798020392656326, | |
| "num_tokens": 2514245.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5860631912946701, | |
| "epoch": 0.5827067669172933, | |
| "grad_norm": 0.03039904497563839, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5810500383377075, | |
| "mean_token_accuracy": 0.7673344761133194, | |
| "num_tokens": 2530676.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5545631796121597, | |
| "epoch": 0.5864661654135338, | |
| "grad_norm": 0.028710732236504555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573135614395142, | |
| "mean_token_accuracy": 0.7758313864469528, | |
| "num_tokens": 2547029.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5309299975633621, | |
| "epoch": 0.5902255639097744, | |
| "grad_norm": 0.037456102669239044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443962812423706, | |
| "mean_token_accuracy": 0.7781406342983246, | |
| "num_tokens": 2563337.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5590629875659943, | |
| "epoch": 0.5939849624060151, | |
| "grad_norm": 0.03138922527432442, | |
| "learning_rate": 0.0002, | |
| "loss": 0.570573627948761, | |
| "mean_token_accuracy": 0.7692520618438721, | |
| "num_tokens": 2579699.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5507991015911102, | |
| "epoch": 0.5977443609022557, | |
| "grad_norm": 0.031148385256528854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549103856086731, | |
| "mean_token_accuracy": 0.7769458442926407, | |
| "num_tokens": 2596012.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5691386461257935, | |
| "epoch": 0.6015037593984962, | |
| "grad_norm": 0.03321440890431404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5682097673416138, | |
| "mean_token_accuracy": 0.7695286124944687, | |
| "num_tokens": 2612192.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5378303825855255, | |
| "epoch": 0.6052631578947368, | |
| "grad_norm": 0.029134051874279976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5314258337020874, | |
| "mean_token_accuracy": 0.7879271060228348, | |
| "num_tokens": 2628354.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5507005900144577, | |
| "epoch": 0.6090225563909775, | |
| "grad_norm": 0.028996866196393967, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531865358352661, | |
| "mean_token_accuracy": 0.7761473655700684, | |
| "num_tokens": 2644501.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5587231516838074, | |
| "epoch": 0.6127819548872181, | |
| "grad_norm": 0.03128351643681526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5601255297660828, | |
| "mean_token_accuracy": 0.7728810757398605, | |
| "num_tokens": 2660638.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5519489645957947, | |
| "epoch": 0.6165413533834586, | |
| "grad_norm": 0.03436357155442238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5580562949180603, | |
| "mean_token_accuracy": 0.7739841938018799, | |
| "num_tokens": 2676953.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5486033111810684, | |
| "epoch": 0.6203007518796992, | |
| "grad_norm": 0.030973074957728386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505262613296509, | |
| "mean_token_accuracy": 0.7756275236606598, | |
| "num_tokens": 2693031.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5522639453411102, | |
| "epoch": 0.6240601503759399, | |
| "grad_norm": 0.03254729509353638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508989095687866, | |
| "mean_token_accuracy": 0.7748342007398605, | |
| "num_tokens": 2709299.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5678143799304962, | |
| "epoch": 0.6278195488721805, | |
| "grad_norm": 0.027512261644005775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593494772911072, | |
| "mean_token_accuracy": 0.7736407816410065, | |
| "num_tokens": 2725613.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.5474298894405365, | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 0.02777693048119545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416566729545593, | |
| "mean_token_accuracy": 0.7782540619373322, | |
| "num_tokens": 2741762.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5676318109035492, | |
| "epoch": 0.6353383458646616, | |
| "grad_norm": 0.029206767678260803, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5748559832572937, | |
| "mean_token_accuracy": 0.7664623707532883, | |
| "num_tokens": 2757964.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5471738129854202, | |
| "epoch": 0.6390977443609023, | |
| "grad_norm": 0.03809071704745293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600809454917908, | |
| "mean_token_accuracy": 0.7715400904417038, | |
| "num_tokens": 2774260.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5543518960475922, | |
| "epoch": 0.6428571428571429, | |
| "grad_norm": 0.029330087825655937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5620079040527344, | |
| "mean_token_accuracy": 0.7744479775428772, | |
| "num_tokens": 2790354.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5556869655847549, | |
| "epoch": 0.6466165413533834, | |
| "grad_norm": 0.03219934552907944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567511916160583, | |
| "mean_token_accuracy": 0.7723055630922318, | |
| "num_tokens": 2806411.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5598954260349274, | |
| "epoch": 0.650375939849624, | |
| "grad_norm": 0.03049585595726967, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581772923469543, | |
| "mean_token_accuracy": 0.7723381072282791, | |
| "num_tokens": 2822457.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.5619530379772186, | |
| "epoch": 0.6541353383458647, | |
| "grad_norm": 0.029140042141079903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565066337585449, | |
| "mean_token_accuracy": 0.7765934616327286, | |
| "num_tokens": 2838821.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5609161257743835, | |
| "epoch": 0.6578947368421053, | |
| "grad_norm": 0.03307173773646355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584904551506042, | |
| "mean_token_accuracy": 0.7731504142284393, | |
| "num_tokens": 2854976.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5472587794065475, | |
| "epoch": 0.6616541353383458, | |
| "grad_norm": 0.027935896068811417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532994270324707, | |
| "mean_token_accuracy": 0.7745202481746674, | |
| "num_tokens": 2871053.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5559375882148743, | |
| "epoch": 0.6654135338345865, | |
| "grad_norm": 0.028821157291531563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584789514541626, | |
| "mean_token_accuracy": 0.7747485786676407, | |
| "num_tokens": 2887600.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5338730216026306, | |
| "epoch": 0.6691729323308271, | |
| "grad_norm": 0.026577429845929146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381085276603699, | |
| "mean_token_accuracy": 0.7791920751333237, | |
| "num_tokens": 2903970.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.556627482175827, | |
| "epoch": 0.6729323308270677, | |
| "grad_norm": 0.028157442808151245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5612574219703674, | |
| "mean_token_accuracy": 0.7728701531887054, | |
| "num_tokens": 2920095.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5468809902667999, | |
| "epoch": 0.6766917293233082, | |
| "grad_norm": 0.026617249473929405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438866019248962, | |
| "mean_token_accuracy": 0.776974618434906, | |
| "num_tokens": 2936400.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.5707015246152878, | |
| "epoch": 0.6804511278195489, | |
| "grad_norm": 0.03165828064084053, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632250905036926, | |
| "mean_token_accuracy": 0.7731919437646866, | |
| "num_tokens": 2952758.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5669363737106323, | |
| "epoch": 0.6842105263157895, | |
| "grad_norm": 0.03147813677787781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5652462840080261, | |
| "mean_token_accuracy": 0.7679423987865448, | |
| "num_tokens": 2969082.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5380169749259949, | |
| "epoch": 0.6879699248120301, | |
| "grad_norm": 0.027151955291628838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455905795097351, | |
| "mean_token_accuracy": 0.7796274274587631, | |
| "num_tokens": 2985183.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.5574334859848022, | |
| "epoch": 0.6917293233082706, | |
| "grad_norm": 0.03327858820557594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5695413947105408, | |
| "mean_token_accuracy": 0.7701131999492645, | |
| "num_tokens": 3001508.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5463923811912537, | |
| "epoch": 0.6954887218045113, | |
| "grad_norm": 0.07987584918737411, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507839918136597, | |
| "mean_token_accuracy": 0.7769906222820282, | |
| "num_tokens": 3017824.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5602079033851624, | |
| "epoch": 0.6992481203007519, | |
| "grad_norm": 0.032177284359931946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561395883560181, | |
| "mean_token_accuracy": 0.7731778472661972, | |
| "num_tokens": 3034234.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5552242249250412, | |
| "epoch": 0.7030075187969925, | |
| "grad_norm": 0.17276985943317413, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5665730237960815, | |
| "mean_token_accuracy": 0.7776633650064468, | |
| "num_tokens": 3050476.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5759404450654984, | |
| "epoch": 0.706766917293233, | |
| "grad_norm": 0.03187716379761696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5729998350143433, | |
| "mean_token_accuracy": 0.7687390595674515, | |
| "num_tokens": 3066888.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5559865832328796, | |
| "epoch": 0.7105263157894737, | |
| "grad_norm": 0.03442467749118805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568963885307312, | |
| "mean_token_accuracy": 0.7721963822841644, | |
| "num_tokens": 3083234.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.5560625046491623, | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.033102214336395264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556387305259705, | |
| "mean_token_accuracy": 0.7737521678209305, | |
| "num_tokens": 3099426.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5532419383525848, | |
| "epoch": 0.7180451127819549, | |
| "grad_norm": 0.03335823863744736, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5556282997131348, | |
| "mean_token_accuracy": 0.7746775895357132, | |
| "num_tokens": 3115788.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.5511862933635712, | |
| "epoch": 0.7218045112781954, | |
| "grad_norm": 0.04099865257740021, | |
| "learning_rate": 0.0002, | |
| "loss": 0.564994752407074, | |
| "mean_token_accuracy": 0.7689872086048126, | |
| "num_tokens": 3132132.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.5518632382154465, | |
| "epoch": 0.7255639097744361, | |
| "grad_norm": 0.03417513892054558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622019171714783, | |
| "mean_token_accuracy": 0.7704385071992874, | |
| "num_tokens": 3148387.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5632559806108475, | |
| "epoch": 0.7293233082706767, | |
| "grad_norm": 0.030820859596133232, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607547163963318, | |
| "mean_token_accuracy": 0.7714632153511047, | |
| "num_tokens": 3164505.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.589142233133316, | |
| "epoch": 0.7330827067669173, | |
| "grad_norm": 0.029547762125730515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5773433446884155, | |
| "mean_token_accuracy": 0.7666076868772507, | |
| "num_tokens": 3180879.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5543933212757111, | |
| "epoch": 0.7368421052631579, | |
| "grad_norm": 0.03714846074581146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5530077219009399, | |
| "mean_token_accuracy": 0.7751282453536987, | |
| "num_tokens": 3196997.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5504618287086487, | |
| "epoch": 0.7406015037593985, | |
| "grad_norm": 0.03167671337723732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446099042892456, | |
| "mean_token_accuracy": 0.7800730615854263, | |
| "num_tokens": 3213232.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5440194606781006, | |
| "epoch": 0.7443609022556391, | |
| "grad_norm": 0.028702866286039352, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420858860015869, | |
| "mean_token_accuracy": 0.780303880572319, | |
| "num_tokens": 3229429.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5432772487401962, | |
| "epoch": 0.7481203007518797, | |
| "grad_norm": 0.04096582531929016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523824095726013, | |
| "mean_token_accuracy": 0.7756204158067703, | |
| "num_tokens": 3245679.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5610463172197342, | |
| "epoch": 0.7518796992481203, | |
| "grad_norm": 0.036679867655038834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655776262283325, | |
| "mean_token_accuracy": 0.7715456783771515, | |
| "num_tokens": 3262189.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5549308806657791, | |
| "epoch": 0.7556390977443609, | |
| "grad_norm": 0.02466488443315029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475676655769348, | |
| "mean_token_accuracy": 0.7779862135648727, | |
| "num_tokens": 3278554.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5799617767333984, | |
| "epoch": 0.7593984962406015, | |
| "grad_norm": 0.028492242097854614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5770009160041809, | |
| "mean_token_accuracy": 0.768639862537384, | |
| "num_tokens": 3295063.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5529991090297699, | |
| "epoch": 0.7631578947368421, | |
| "grad_norm": 0.034728050231933594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533767938613892, | |
| "mean_token_accuracy": 0.7767061442136765, | |
| "num_tokens": 3311348.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5689148902893066, | |
| "epoch": 0.7669172932330827, | |
| "grad_norm": 0.026985110715031624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640019774436951, | |
| "mean_token_accuracy": 0.7733623534440994, | |
| "num_tokens": 3327811.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5497773736715317, | |
| "epoch": 0.7706766917293233, | |
| "grad_norm": 0.026469919830560684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544072389602661, | |
| "mean_token_accuracy": 0.7730964869260788, | |
| "num_tokens": 3344190.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5487343817949295, | |
| "epoch": 0.7744360902255639, | |
| "grad_norm": 0.03394508361816406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5584373474121094, | |
| "mean_token_accuracy": 0.7742648869752884, | |
| "num_tokens": 3360318.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5593785345554352, | |
| "epoch": 0.7781954887218046, | |
| "grad_norm": 0.032090939581394196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5755316019058228, | |
| "mean_token_accuracy": 0.7676598578691483, | |
| "num_tokens": 3376652.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5540517121553421, | |
| "epoch": 0.7819548872180451, | |
| "grad_norm": 0.029152996838092804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553016185760498, | |
| "mean_token_accuracy": 0.7774887681007385, | |
| "num_tokens": 3392915.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5617629438638687, | |
| "epoch": 0.7857142857142857, | |
| "grad_norm": 0.029667040333151817, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602532625198364, | |
| "mean_token_accuracy": 0.7753290235996246, | |
| "num_tokens": 3409209.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5676616579294205, | |
| "epoch": 0.7894736842105263, | |
| "grad_norm": 0.03213479742407799, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651354789733887, | |
| "mean_token_accuracy": 0.7729621976613998, | |
| "num_tokens": 3425474.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.5594458729028702, | |
| "epoch": 0.793233082706767, | |
| "grad_norm": 0.029152261093258858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545633435249329, | |
| "mean_token_accuracy": 0.7748460322618484, | |
| "num_tokens": 3441810.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5657470673322678, | |
| "epoch": 0.7969924812030075, | |
| "grad_norm": 0.030394772067666054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634792447090149, | |
| "mean_token_accuracy": 0.7723300457000732, | |
| "num_tokens": 3458017.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5386789590120316, | |
| "epoch": 0.8007518796992481, | |
| "grad_norm": 0.030803421512246132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543491780757904, | |
| "mean_token_accuracy": 0.7788570076227188, | |
| "num_tokens": 3474394.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5462117493152618, | |
| "epoch": 0.8045112781954887, | |
| "grad_norm": 0.032262928783893585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550574064254761, | |
| "mean_token_accuracy": 0.7757156640291214, | |
| "num_tokens": 3490659.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5618492513895035, | |
| "epoch": 0.8082706766917294, | |
| "grad_norm": 0.030515553429722786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604183673858643, | |
| "mean_token_accuracy": 0.7713865786790848, | |
| "num_tokens": 3507047.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5674788951873779, | |
| "epoch": 0.8120300751879699, | |
| "grad_norm": 0.03319476544857025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704171657562256, | |
| "mean_token_accuracy": 0.7660792618989944, | |
| "num_tokens": 3523740.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5655016303062439, | |
| "epoch": 0.8157894736842105, | |
| "grad_norm": 0.025443432852625847, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5628257989883423, | |
| "mean_token_accuracy": 0.7704775929450989, | |
| "num_tokens": 3540342.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5403912216424942, | |
| "epoch": 0.8195488721804511, | |
| "grad_norm": 0.03260233253240585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542536735534668, | |
| "mean_token_accuracy": 0.7788421809673309, | |
| "num_tokens": 3556623.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5680458843708038, | |
| "epoch": 0.8233082706766918, | |
| "grad_norm": 0.034483131021261215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5691131353378296, | |
| "mean_token_accuracy": 0.76755091547966, | |
| "num_tokens": 3573182.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5689092427492142, | |
| "epoch": 0.8270676691729323, | |
| "grad_norm": 0.027871334925293922, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5706035494804382, | |
| "mean_token_accuracy": 0.768176794052124, | |
| "num_tokens": 3589235.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.563735768198967, | |
| "epoch": 0.8308270676691729, | |
| "grad_norm": 0.02944294363260269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5672820806503296, | |
| "mean_token_accuracy": 0.7710028737783432, | |
| "num_tokens": 3605593.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5397096872329712, | |
| "epoch": 0.8345864661654135, | |
| "grad_norm": 0.030527444556355476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446432828903198, | |
| "mean_token_accuracy": 0.7779533118009567, | |
| "num_tokens": 3621959.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5514500439167023, | |
| "epoch": 0.8383458646616542, | |
| "grad_norm": 0.029658010229468346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5571471452713013, | |
| "mean_token_accuracy": 0.7720492035150528, | |
| "num_tokens": 3638089.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5721202939748764, | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 0.026809731498360634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5748306512832642, | |
| "mean_token_accuracy": 0.7655669301748276, | |
| "num_tokens": 3654508.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5657171607017517, | |
| "epoch": 0.8458646616541353, | |
| "grad_norm": 0.02784072421491146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5645638704299927, | |
| "mean_token_accuracy": 0.7713258415460587, | |
| "num_tokens": 3670883.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5707942843437195, | |
| "epoch": 0.849624060150376, | |
| "grad_norm": 0.027495261281728745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5690877437591553, | |
| "mean_token_accuracy": 0.7672522664070129, | |
| "num_tokens": 3687138.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5599692463874817, | |
| "epoch": 0.8533834586466166, | |
| "grad_norm": 0.02714758738875389, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558695912361145, | |
| "mean_token_accuracy": 0.7728016823530197, | |
| "num_tokens": 3703748.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.5557542443275452, | |
| "epoch": 0.8571428571428571, | |
| "grad_norm": 0.027014488354325294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528618097305298, | |
| "mean_token_accuracy": 0.7744259238243103, | |
| "num_tokens": 3720292.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5545012503862381, | |
| "epoch": 0.8609022556390977, | |
| "grad_norm": 0.030803967267274857, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548436045646667, | |
| "mean_token_accuracy": 0.772901862859726, | |
| "num_tokens": 3736719.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5630923807621002, | |
| "epoch": 0.8646616541353384, | |
| "grad_norm": 0.025556016713380814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638667941093445, | |
| "mean_token_accuracy": 0.7724170237779617, | |
| "num_tokens": 3753111.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5482154339551926, | |
| "epoch": 0.868421052631579, | |
| "grad_norm": 0.026636675000190735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516517758369446, | |
| "mean_token_accuracy": 0.7738501876592636, | |
| "num_tokens": 3769379.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.5542188733816147, | |
| "epoch": 0.8721804511278195, | |
| "grad_norm": 0.030669352039694786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562447190284729, | |
| "mean_token_accuracy": 0.7716392129659653, | |
| "num_tokens": 3785882.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.5528077483177185, | |
| "epoch": 0.8759398496240601, | |
| "grad_norm": 0.02840394526720047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538339614868164, | |
| "mean_token_accuracy": 0.7760019749403, | |
| "num_tokens": 3802159.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5367541313171387, | |
| "epoch": 0.8796992481203008, | |
| "grad_norm": 0.027923524379730225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381957292556763, | |
| "mean_token_accuracy": 0.7805743962526321, | |
| "num_tokens": 3818361.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5520175248384476, | |
| "epoch": 0.8834586466165414, | |
| "grad_norm": 0.03241734206676483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536331534385681, | |
| "mean_token_accuracy": 0.773536428809166, | |
| "num_tokens": 3834731.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5460867285728455, | |
| "epoch": 0.8872180451127819, | |
| "grad_norm": 0.027079345658421516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475375056266785, | |
| "mean_token_accuracy": 0.7766189575195312, | |
| "num_tokens": 3850982.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5568866729736328, | |
| "epoch": 0.8909774436090225, | |
| "grad_norm": 0.02961307018995285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572586059570312, | |
| "mean_token_accuracy": 0.7737904638051987, | |
| "num_tokens": 3867054.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5462281703948975, | |
| "epoch": 0.8947368421052632, | |
| "grad_norm": 0.02547132968902588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462326407432556, | |
| "mean_token_accuracy": 0.779721811413765, | |
| "num_tokens": 3883377.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5601012706756592, | |
| "epoch": 0.8984962406015038, | |
| "grad_norm": 0.027931643649935722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5673293471336365, | |
| "mean_token_accuracy": 0.7699201852083206, | |
| "num_tokens": 3899760.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.558964416384697, | |
| "epoch": 0.9022556390977443, | |
| "grad_norm": 0.027888454496860504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613861083984375, | |
| "mean_token_accuracy": 0.7711526602506638, | |
| "num_tokens": 3916259.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5591289699077606, | |
| "epoch": 0.9060150375939849, | |
| "grad_norm": 0.027367601171135902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553447008132935, | |
| "mean_token_accuracy": 0.7748121023178101, | |
| "num_tokens": 3932764.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5419012606143951, | |
| "epoch": 0.9097744360902256, | |
| "grad_norm": 0.02720046602189541, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389461517333984, | |
| "mean_token_accuracy": 0.7815262824296951, | |
| "num_tokens": 3948767.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.5506538301706314, | |
| "epoch": 0.9135338345864662, | |
| "grad_norm": 0.04870102182030678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555541515350342, | |
| "mean_token_accuracy": 0.7749286592006683, | |
| "num_tokens": 3964899.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5377955883741379, | |
| "epoch": 0.9172932330827067, | |
| "grad_norm": 0.030033506453037262, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442740321159363, | |
| "mean_token_accuracy": 0.7790930420160294, | |
| "num_tokens": 3981257.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5506607741117477, | |
| "epoch": 0.9210526315789473, | |
| "grad_norm": 0.03199909254908562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553537607192993, | |
| "mean_token_accuracy": 0.7754099667072296, | |
| "num_tokens": 3997442.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5611073523759842, | |
| "epoch": 0.924812030075188, | |
| "grad_norm": 0.027019886299967766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553584098815918, | |
| "mean_token_accuracy": 0.7750442922115326, | |
| "num_tokens": 4013644.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5641084164381027, | |
| "epoch": 0.9285714285714286, | |
| "grad_norm": 0.028763286769390106, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5639767050743103, | |
| "mean_token_accuracy": 0.7705299705266953, | |
| "num_tokens": 4029960.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5596693158149719, | |
| "epoch": 0.9323308270676691, | |
| "grad_norm": 0.029457937926054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553030371665955, | |
| "mean_token_accuracy": 0.7704959660768509, | |
| "num_tokens": 4046137.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5426951497793198, | |
| "epoch": 0.9360902255639098, | |
| "grad_norm": 0.030174724757671356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424360036849976, | |
| "mean_token_accuracy": 0.7784756273031235, | |
| "num_tokens": 4062488.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.5482533425092697, | |
| "epoch": 0.9398496240601504, | |
| "grad_norm": 0.029116198420524597, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548699676990509, | |
| "mean_token_accuracy": 0.7772116810083389, | |
| "num_tokens": 4079035.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5659994781017303, | |
| "epoch": 0.943609022556391, | |
| "grad_norm": 0.028919357806444168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5734626054763794, | |
| "mean_token_accuracy": 0.7644091695547104, | |
| "num_tokens": 4095496.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.5390999913215637, | |
| "epoch": 0.9473684210526315, | |
| "grad_norm": 0.029156571254134178, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542834460735321, | |
| "mean_token_accuracy": 0.778347447514534, | |
| "num_tokens": 4111786.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5335533022880554, | |
| "epoch": 0.9511278195488722, | |
| "grad_norm": 0.03090072236955166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460265874862671, | |
| "mean_token_accuracy": 0.777598574757576, | |
| "num_tokens": 4127806.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5576867163181305, | |
| "epoch": 0.9548872180451128, | |
| "grad_norm": 0.0250933188945055, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579800605773926, | |
| "mean_token_accuracy": 0.772262915968895, | |
| "num_tokens": 4144255.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5680612325668335, | |
| "epoch": 0.9586466165413534, | |
| "grad_norm": 0.02682660147547722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625680685043335, | |
| "mean_token_accuracy": 0.7703745514154434, | |
| "num_tokens": 4160554.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5646774917840958, | |
| "epoch": 0.9624060150375939, | |
| "grad_norm": 0.02460050955414772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615121126174927, | |
| "mean_token_accuracy": 0.7717017978429794, | |
| "num_tokens": 4177058.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.565275639295578, | |
| "epoch": 0.9661654135338346, | |
| "grad_norm": 0.028230059891939163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602483153343201, | |
| "mean_token_accuracy": 0.7725579738616943, | |
| "num_tokens": 4193529.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5464546531438828, | |
| "epoch": 0.9699248120300752, | |
| "grad_norm": 0.028305059298872948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506906509399414, | |
| "mean_token_accuracy": 0.7744488716125488, | |
| "num_tokens": 4209843.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5543451011180878, | |
| "epoch": 0.9736842105263158, | |
| "grad_norm": 0.026113279163837433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566228628158569, | |
| "mean_token_accuracy": 0.7761884778738022, | |
| "num_tokens": 4226371.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5395558923482895, | |
| "epoch": 0.9774436090225563, | |
| "grad_norm": 0.027898062020540237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551036536693573, | |
| "mean_token_accuracy": 0.7777495980262756, | |
| "num_tokens": 4242588.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5481285452842712, | |
| "epoch": 0.981203007518797, | |
| "grad_norm": 0.027225090190768242, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55158931016922, | |
| "mean_token_accuracy": 0.7746086716651917, | |
| "num_tokens": 4258895.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5476398766040802, | |
| "epoch": 0.9849624060150376, | |
| "grad_norm": 0.025991205126047134, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550503671169281, | |
| "mean_token_accuracy": 0.778662696480751, | |
| "num_tokens": 4275233.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5611831694841385, | |
| "epoch": 0.9887218045112782, | |
| "grad_norm": 0.026602452620863914, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595046877861023, | |
| "mean_token_accuracy": 0.7710649222135544, | |
| "num_tokens": 4291628.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5607927143573761, | |
| "epoch": 0.9924812030075187, | |
| "grad_norm": 0.029126716777682304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.55509352684021, | |
| "mean_token_accuracy": 0.773261696100235, | |
| "num_tokens": 4308266.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5344236195087433, | |
| "epoch": 0.9962406015037594, | |
| "grad_norm": 0.024904625490307808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374810099601746, | |
| "mean_token_accuracy": 0.7795998752117157, | |
| "num_tokens": 4324647.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5802602022886276, | |
| "epoch": 1.0, | |
| "grad_norm": 0.02991756983101368, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5802874565124512, | |
| "mean_token_accuracy": 0.7651515454053879, | |
| "num_tokens": 4341020.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5359837561845779, | |
| "epoch": 1.0037593984962405, | |
| "grad_norm": 0.028310680761933327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382672548294067, | |
| "mean_token_accuracy": 0.7797826081514359, | |
| "num_tokens": 4356946.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.547169104218483, | |
| "epoch": 1.0075187969924813, | |
| "grad_norm": 0.026942851021885872, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483385324478149, | |
| "mean_token_accuracy": 0.7762030512094498, | |
| "num_tokens": 4373376.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5396238714456558, | |
| "epoch": 1.0112781954887218, | |
| "grad_norm": 0.026464859023690224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366930961608887, | |
| "mean_token_accuracy": 0.7836534827947617, | |
| "num_tokens": 4389434.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.5377503633499146, | |
| "epoch": 1.0150375939849625, | |
| "grad_norm": 0.028936585411429405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381658673286438, | |
| "mean_token_accuracy": 0.7795982360839844, | |
| "num_tokens": 4405773.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.5378166139125824, | |
| "epoch": 1.018796992481203, | |
| "grad_norm": 0.026616571471095085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366747975349426, | |
| "mean_token_accuracy": 0.7815251797437668, | |
| "num_tokens": 4422223.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5556348860263824, | |
| "epoch": 1.0225563909774436, | |
| "grad_norm": 0.03760155290365219, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643568634986877, | |
| "mean_token_accuracy": 0.7716861069202423, | |
| "num_tokens": 4438566.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5393058955669403, | |
| "epoch": 1.0263157894736843, | |
| "grad_norm": 0.028112079948186874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536059558391571, | |
| "mean_token_accuracy": 0.7806826084852219, | |
| "num_tokens": 4454882.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5509982258081436, | |
| "epoch": 1.0300751879699248, | |
| "grad_norm": 0.031216077506542206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545498251914978, | |
| "mean_token_accuracy": 0.7785268127918243, | |
| "num_tokens": 4471138.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.562383309006691, | |
| "epoch": 1.0338345864661653, | |
| "grad_norm": 0.029023578390479088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549452900886536, | |
| "mean_token_accuracy": 0.7746210545301437, | |
| "num_tokens": 4487599.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.533460721373558, | |
| "epoch": 1.037593984962406, | |
| "grad_norm": 0.02839999832212925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428166389465332, | |
| "mean_token_accuracy": 0.7788663357496262, | |
| "num_tokens": 4503718.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.534645140171051, | |
| "epoch": 1.0413533834586466, | |
| "grad_norm": 0.03183748945593834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435906052589417, | |
| "mean_token_accuracy": 0.780232772231102, | |
| "num_tokens": 4519836.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5403695106506348, | |
| "epoch": 1.045112781954887, | |
| "grad_norm": 0.03128998726606369, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546108603477478, | |
| "mean_token_accuracy": 0.7786454111337662, | |
| "num_tokens": 4535945.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5610467493534088, | |
| "epoch": 1.0488721804511278, | |
| "grad_norm": 0.027818012982606888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560647189617157, | |
| "mean_token_accuracy": 0.7709101587533951, | |
| "num_tokens": 4552374.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5373391807079315, | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 0.03428777679800987, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469943284988403, | |
| "mean_token_accuracy": 0.7768525630235672, | |
| "num_tokens": 4568711.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5424034297466278, | |
| "epoch": 1.056390977443609, | |
| "grad_norm": 0.03859133645892143, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439317226409912, | |
| "mean_token_accuracy": 0.7811300605535507, | |
| "num_tokens": 4585017.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5506146401166916, | |
| "epoch": 1.0601503759398496, | |
| "grad_norm": 0.03055771067738533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546417236328125, | |
| "mean_token_accuracy": 0.7766596227884293, | |
| "num_tokens": 4601432.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5494361072778702, | |
| "epoch": 1.0639097744360901, | |
| "grad_norm": 0.0343659445643425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465281009674072, | |
| "mean_token_accuracy": 0.7783948630094528, | |
| "num_tokens": 4617733.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.5440582782030106, | |
| "epoch": 1.0676691729323309, | |
| "grad_norm": 0.026508856564760208, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454896092414856, | |
| "mean_token_accuracy": 0.7768892496824265, | |
| "num_tokens": 4634160.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5566096007823944, | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 0.03006400726735592, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534993410110474, | |
| "mean_token_accuracy": 0.7748663425445557, | |
| "num_tokens": 4650625.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5545021891593933, | |
| "epoch": 1.0751879699248121, | |
| "grad_norm": 0.03096926584839821, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561465620994568, | |
| "mean_token_accuracy": 0.7750347554683685, | |
| "num_tokens": 4667029.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5399864912033081, | |
| "epoch": 1.0789473684210527, | |
| "grad_norm": 0.030643943697214127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5460204482078552, | |
| "mean_token_accuracy": 0.7770880162715912, | |
| "num_tokens": 4683375.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5572090744972229, | |
| "epoch": 1.0827067669172932, | |
| "grad_norm": 0.026186607778072357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585043430328369, | |
| "mean_token_accuracy": 0.7719515711069107, | |
| "num_tokens": 4699882.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5484725385904312, | |
| "epoch": 1.086466165413534, | |
| "grad_norm": 0.027757612988352776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432863235473633, | |
| "mean_token_accuracy": 0.7777998596429825, | |
| "num_tokens": 4716268.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5435892194509506, | |
| "epoch": 1.0902255639097744, | |
| "grad_norm": 0.02975296974182129, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5351642966270447, | |
| "mean_token_accuracy": 0.7828023135662079, | |
| "num_tokens": 4732434.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5531795173883438, | |
| "epoch": 1.093984962406015, | |
| "grad_norm": 0.028304405510425568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516840815544128, | |
| "mean_token_accuracy": 0.7772639095783234, | |
| "num_tokens": 4748580.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.5184081122279167, | |
| "epoch": 1.0977443609022557, | |
| "grad_norm": 0.03446349874138832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299493670463562, | |
| "mean_token_accuracy": 0.7840149402618408, | |
| "num_tokens": 4764598.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.5289477556943893, | |
| "epoch": 1.1015037593984962, | |
| "grad_norm": 0.036261677742004395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453619956970215, | |
| "mean_token_accuracy": 0.7767883092164993, | |
| "num_tokens": 4780809.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.5418924987316132, | |
| "epoch": 1.1052631578947367, | |
| "grad_norm": 0.029477933421730995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471935272216797, | |
| "mean_token_accuracy": 0.7789769917726517, | |
| "num_tokens": 4797348.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.5463252663612366, | |
| "epoch": 1.1090225563909775, | |
| "grad_norm": 0.031204085797071457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424449443817139, | |
| "mean_token_accuracy": 0.7788571715354919, | |
| "num_tokens": 4813415.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5470333397388458, | |
| "epoch": 1.112781954887218, | |
| "grad_norm": 0.03411991521716118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338444709777832, | |
| "mean_token_accuracy": 0.7839784771203995, | |
| "num_tokens": 4829572.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5626541525125504, | |
| "epoch": 1.1165413533834587, | |
| "grad_norm": 0.03397219255566597, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499536991119385, | |
| "mean_token_accuracy": 0.7788331657648087, | |
| "num_tokens": 4845785.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5299470722675323, | |
| "epoch": 1.1203007518796992, | |
| "grad_norm": 0.03497639298439026, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392253994941711, | |
| "mean_token_accuracy": 0.7810451984405518, | |
| "num_tokens": 4862012.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5335487574338913, | |
| "epoch": 1.1240601503759398, | |
| "grad_norm": 0.034831658005714417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457339286804199, | |
| "mean_token_accuracy": 0.779063493013382, | |
| "num_tokens": 4878251.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.528610497713089, | |
| "epoch": 1.1278195488721805, | |
| "grad_norm": 0.033591266721487045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542759358882904, | |
| "mean_token_accuracy": 0.7827056795358658, | |
| "num_tokens": 4894510.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5455980747938156, | |
| "epoch": 1.131578947368421, | |
| "grad_norm": 0.029848981648683548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544407963752747, | |
| "mean_token_accuracy": 0.7761986404657364, | |
| "num_tokens": 4910941.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5403441041707993, | |
| "epoch": 1.1353383458646618, | |
| "grad_norm": 0.028331086039543152, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373193025588989, | |
| "mean_token_accuracy": 0.7810037434101105, | |
| "num_tokens": 4927224.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.579601064324379, | |
| "epoch": 1.1390977443609023, | |
| "grad_norm": 0.034219082444906235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5681281685829163, | |
| "mean_token_accuracy": 0.7684440910816193, | |
| "num_tokens": 4943447.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5505090206861496, | |
| "epoch": 1.1428571428571428, | |
| "grad_norm": 0.0307406485080719, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461090803146362, | |
| "mean_token_accuracy": 0.778554230928421, | |
| "num_tokens": 4959489.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5576640069484711, | |
| "epoch": 1.1466165413533835, | |
| "grad_norm": 0.030323676764965057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553523302078247, | |
| "mean_token_accuracy": 0.773658037185669, | |
| "num_tokens": 4975936.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.5266588181257248, | |
| "epoch": 1.150375939849624, | |
| "grad_norm": 0.035491373389959335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350923538208008, | |
| "mean_token_accuracy": 0.7815313786268234, | |
| "num_tokens": 4992537.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5482136011123657, | |
| "epoch": 1.1541353383458646, | |
| "grad_norm": 0.03442855179309845, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545141696929932, | |
| "mean_token_accuracy": 0.7746158391237259, | |
| "num_tokens": 5009023.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5559152960777283, | |
| "epoch": 1.1578947368421053, | |
| "grad_norm": 0.02727232687175274, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569304823875427, | |
| "mean_token_accuracy": 0.7725173830986023, | |
| "num_tokens": 5025411.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5630469471216202, | |
| "epoch": 1.1616541353383458, | |
| "grad_norm": 0.03064255230128765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5543197989463806, | |
| "mean_token_accuracy": 0.774148479104042, | |
| "num_tokens": 5041812.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5571756958961487, | |
| "epoch": 1.1654135338345863, | |
| "grad_norm": 0.03609425947070122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525773763656616, | |
| "mean_token_accuracy": 0.7752318233251572, | |
| "num_tokens": 5058244.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.5431416481733322, | |
| "epoch": 1.169172932330827, | |
| "grad_norm": 0.027324821799993515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384103059768677, | |
| "mean_token_accuracy": 0.7805906236171722, | |
| "num_tokens": 5074488.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5343848988413811, | |
| "epoch": 1.1729323308270676, | |
| "grad_norm": 0.03805036470293999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469476580619812, | |
| "mean_token_accuracy": 0.779438316822052, | |
| "num_tokens": 5090911.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.536148265004158, | |
| "epoch": 1.1766917293233083, | |
| "grad_norm": 0.02961050719022751, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435563921928406, | |
| "mean_token_accuracy": 0.7815048396587372, | |
| "num_tokens": 5107152.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.5418159067630768, | |
| "epoch": 1.1804511278195489, | |
| "grad_norm": 0.025910982862114906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.540198028087616, | |
| "mean_token_accuracy": 0.7800037860870361, | |
| "num_tokens": 5123652.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5343509763479233, | |
| "epoch": 1.1842105263157894, | |
| "grad_norm": 0.03428869694471359, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369153618812561, | |
| "mean_token_accuracy": 0.7804707884788513, | |
| "num_tokens": 5139855.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5401560962200165, | |
| "epoch": 1.1879699248120301, | |
| "grad_norm": 0.027781767770648003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393479466438293, | |
| "mean_token_accuracy": 0.7805478721857071, | |
| "num_tokens": 5156155.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5566094070672989, | |
| "epoch": 1.1917293233082706, | |
| "grad_norm": 0.026983041316270828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554964005947113, | |
| "mean_token_accuracy": 0.7756882756948471, | |
| "num_tokens": 5172489.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.547125369310379, | |
| "epoch": 1.1954887218045114, | |
| "grad_norm": 0.03205394372344017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493847727775574, | |
| "mean_token_accuracy": 0.7793397605419159, | |
| "num_tokens": 5189044.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.534126952290535, | |
| "epoch": 1.199248120300752, | |
| "grad_norm": 0.027468601241707802, | |
| "learning_rate": 0.0002, | |
| "loss": 0.532336413860321, | |
| "mean_token_accuracy": 0.7843205332756042, | |
| "num_tokens": 5205622.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.541590228676796, | |
| "epoch": 1.2030075187969924, | |
| "grad_norm": 0.02954232320189476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532248020172119, | |
| "mean_token_accuracy": 0.7745756506919861, | |
| "num_tokens": 5222003.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5365501791238785, | |
| "epoch": 1.2067669172932332, | |
| "grad_norm": 0.03286029398441315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431678891181946, | |
| "mean_token_accuracy": 0.7808897346258163, | |
| "num_tokens": 5238368.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5435497313737869, | |
| "epoch": 1.2105263157894737, | |
| "grad_norm": 0.03365312144160271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542516827583313, | |
| "mean_token_accuracy": 0.7798768132925034, | |
| "num_tokens": 5254690.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5485272854566574, | |
| "epoch": 1.2142857142857142, | |
| "grad_norm": 0.02945873513817787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457643866539001, | |
| "mean_token_accuracy": 0.779216393828392, | |
| "num_tokens": 5270982.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5480885654687881, | |
| "epoch": 1.218045112781955, | |
| "grad_norm": 0.03765803202986717, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544890284538269, | |
| "mean_token_accuracy": 0.7774617224931717, | |
| "num_tokens": 5287222.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5345787778496742, | |
| "epoch": 1.2218045112781954, | |
| "grad_norm": 0.029292147606611252, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371191501617432, | |
| "mean_token_accuracy": 0.7809965461492538, | |
| "num_tokens": 5303631.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.5533891320228577, | |
| "epoch": 1.225563909774436, | |
| "grad_norm": 0.03491590917110443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632805228233337, | |
| "mean_token_accuracy": 0.7713405042886734, | |
| "num_tokens": 5319707.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5442000329494476, | |
| "epoch": 1.2293233082706767, | |
| "grad_norm": 0.035631779581308365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511363744735718, | |
| "mean_token_accuracy": 0.77325139939785, | |
| "num_tokens": 5336015.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.550067774951458, | |
| "epoch": 1.2330827067669172, | |
| "grad_norm": 0.03429507836699486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445730686187744, | |
| "mean_token_accuracy": 0.7788997292518616, | |
| "num_tokens": 5352567.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5536926835775375, | |
| "epoch": 1.236842105263158, | |
| "grad_norm": 0.02860317751765251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513879656791687, | |
| "mean_token_accuracy": 0.7763962298631668, | |
| "num_tokens": 5368974.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5571767240762711, | |
| "epoch": 1.2406015037593985, | |
| "grad_norm": 0.03053511306643486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535838007926941, | |
| "mean_token_accuracy": 0.7756504565477371, | |
| "num_tokens": 5385405.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5644853711128235, | |
| "epoch": 1.244360902255639, | |
| "grad_norm": 0.02813347429037094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5661532282829285, | |
| "mean_token_accuracy": 0.7694092392921448, | |
| "num_tokens": 5401733.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.554289311170578, | |
| "epoch": 1.2481203007518797, | |
| "grad_norm": 0.030001962557435036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581742525100708, | |
| "mean_token_accuracy": 0.7724047005176544, | |
| "num_tokens": 5418343.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5443666130304337, | |
| "epoch": 1.2518796992481203, | |
| "grad_norm": 0.030697215348482132, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5461480021476746, | |
| "mean_token_accuracy": 0.7806287556886673, | |
| "num_tokens": 5434583.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5332125425338745, | |
| "epoch": 1.255639097744361, | |
| "grad_norm": 0.031576018780469894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535359799861908, | |
| "mean_token_accuracy": 0.7810158431529999, | |
| "num_tokens": 5450746.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.555268332362175, | |
| "epoch": 1.2593984962406015, | |
| "grad_norm": 0.027363646775484085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560035109519958, | |
| "mean_token_accuracy": 0.7736663818359375, | |
| "num_tokens": 5467188.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5493292659521103, | |
| "epoch": 1.263157894736842, | |
| "grad_norm": 0.031114885583519936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509231090545654, | |
| "mean_token_accuracy": 0.7764100879430771, | |
| "num_tokens": 5483617.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.5554828643798828, | |
| "epoch": 1.2669172932330828, | |
| "grad_norm": 0.027718449011445045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540401339530945, | |
| "mean_token_accuracy": 0.7730122804641724, | |
| "num_tokens": 5499950.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5383172035217285, | |
| "epoch": 1.2706766917293233, | |
| "grad_norm": 0.029059337452054024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407942533493042, | |
| "mean_token_accuracy": 0.7809923589229584, | |
| "num_tokens": 5516241.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5302157253026962, | |
| "epoch": 1.274436090225564, | |
| "grad_norm": 0.030479708686470985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530126690864563, | |
| "mean_token_accuracy": 0.7863384485244751, | |
| "num_tokens": 5532841.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5322539657354355, | |
| "epoch": 1.2781954887218046, | |
| "grad_norm": 0.031503573060035706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389677286148071, | |
| "mean_token_accuracy": 0.77957783639431, | |
| "num_tokens": 5549325.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5437572598457336, | |
| "epoch": 1.281954887218045, | |
| "grad_norm": 0.027867093682289124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459513664245605, | |
| "mean_token_accuracy": 0.7789556235074997, | |
| "num_tokens": 5565810.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5430660545825958, | |
| "epoch": 1.2857142857142856, | |
| "grad_norm": 0.03420820087194443, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441212058067322, | |
| "mean_token_accuracy": 0.7775195837020874, | |
| "num_tokens": 5581844.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.5310375243425369, | |
| "epoch": 1.2894736842105263, | |
| "grad_norm": 0.03065858967602253, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5356528162956238, | |
| "mean_token_accuracy": 0.7801522761583328, | |
| "num_tokens": 5598042.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5220501720905304, | |
| "epoch": 1.2932330827067668, | |
| "grad_norm": 0.029243886470794678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.516523540019989, | |
| "mean_token_accuracy": 0.7906120866537094, | |
| "num_tokens": 5614111.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5659748762845993, | |
| "epoch": 1.2969924812030076, | |
| "grad_norm": 0.03555883839726448, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587096214294434, | |
| "mean_token_accuracy": 0.771675169467926, | |
| "num_tokens": 5630635.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5501575618982315, | |
| "epoch": 1.300751879699248, | |
| "grad_norm": 0.030357254669070244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473156571388245, | |
| "mean_token_accuracy": 0.7771240919828415, | |
| "num_tokens": 5646994.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5270983800292015, | |
| "epoch": 1.3045112781954886, | |
| "grad_norm": 0.030822839587926865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363721251487732, | |
| "mean_token_accuracy": 0.7837044894695282, | |
| "num_tokens": 5663472.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5483475178480148, | |
| "epoch": 1.3082706766917294, | |
| "grad_norm": 0.03400631621479988, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550627708435059, | |
| "mean_token_accuracy": 0.7723206877708435, | |
| "num_tokens": 5679878.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5459110736846924, | |
| "epoch": 1.3120300751879699, | |
| "grad_norm": 0.028672240674495697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484554767608643, | |
| "mean_token_accuracy": 0.7754105031490326, | |
| "num_tokens": 5696124.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5513360351324081, | |
| "epoch": 1.3157894736842106, | |
| "grad_norm": 0.029986541718244553, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548675000667572, | |
| "mean_token_accuracy": 0.7767119109630585, | |
| "num_tokens": 5712240.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5394999980926514, | |
| "epoch": 1.3195488721804511, | |
| "grad_norm": 0.027749765664339066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411927700042725, | |
| "mean_token_accuracy": 0.7794090211391449, | |
| "num_tokens": 5728487.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5632177442312241, | |
| "epoch": 1.3233082706766917, | |
| "grad_norm": 0.03165826201438904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644969344139099, | |
| "mean_token_accuracy": 0.7739209532737732, | |
| "num_tokens": 5744665.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5484495759010315, | |
| "epoch": 1.3270676691729324, | |
| "grad_norm": 0.02855236455798149, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507109761238098, | |
| "mean_token_accuracy": 0.7781708836555481, | |
| "num_tokens": 5761081.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5463808476924896, | |
| "epoch": 1.330827067669173, | |
| "grad_norm": 0.033144768327474594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490323901176453, | |
| "mean_token_accuracy": 0.7771764546632767, | |
| "num_tokens": 5777230.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.559476301074028, | |
| "epoch": 1.3345864661654137, | |
| "grad_norm": 0.030584782361984253, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653771162033081, | |
| "mean_token_accuracy": 0.7701748311519623, | |
| "num_tokens": 5793509.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5580354928970337, | |
| "epoch": 1.3383458646616542, | |
| "grad_norm": 0.029205013066530228, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602571964263916, | |
| "mean_token_accuracy": 0.7710904181003571, | |
| "num_tokens": 5809901.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5673199146986008, | |
| "epoch": 1.3421052631578947, | |
| "grad_norm": 0.03065381944179535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655714273452759, | |
| "mean_token_accuracy": 0.7691835165023804, | |
| "num_tokens": 5826128.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.5535888224840164, | |
| "epoch": 1.3458646616541352, | |
| "grad_norm": 0.028708767145872116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483720302581787, | |
| "mean_token_accuracy": 0.7754883170127869, | |
| "num_tokens": 5842416.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5565765500068665, | |
| "epoch": 1.349624060150376, | |
| "grad_norm": 0.031074965372681618, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588751435279846, | |
| "mean_token_accuracy": 0.7724489718675613, | |
| "num_tokens": 5858778.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.5447706580162048, | |
| "epoch": 1.3533834586466165, | |
| "grad_norm": 0.031974222511053085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503548979759216, | |
| "mean_token_accuracy": 0.7767511457204819, | |
| "num_tokens": 5875340.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5325894355773926, | |
| "epoch": 1.3571428571428572, | |
| "grad_norm": 0.036680273711681366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425075888633728, | |
| "mean_token_accuracy": 0.7785896062850952, | |
| "num_tokens": 5891618.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5401211231946945, | |
| "epoch": 1.3609022556390977, | |
| "grad_norm": 0.030604355037212372, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543202817440033, | |
| "mean_token_accuracy": 0.7824591100215912, | |
| "num_tokens": 5907777.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.548919603228569, | |
| "epoch": 1.3646616541353382, | |
| "grad_norm": 0.02865537256002426, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504399538040161, | |
| "mean_token_accuracy": 0.7752194404602051, | |
| "num_tokens": 5924266.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5391300171613693, | |
| "epoch": 1.368421052631579, | |
| "grad_norm": 0.030051855370402336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288874506950378, | |
| "mean_token_accuracy": 0.7848425358533859, | |
| "num_tokens": 5940334.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5440739095211029, | |
| "epoch": 1.3721804511278195, | |
| "grad_norm": 0.02727932669222355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456202626228333, | |
| "mean_token_accuracy": 0.7774905413389206, | |
| "num_tokens": 5956646.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5311928540468216, | |
| "epoch": 1.3759398496240602, | |
| "grad_norm": 0.029294485226273537, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352226495742798, | |
| "mean_token_accuracy": 0.7806590050458908, | |
| "num_tokens": 5972841.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5386375188827515, | |
| "epoch": 1.3796992481203008, | |
| "grad_norm": 0.034396879374980927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386478304862976, | |
| "mean_token_accuracy": 0.780673161149025, | |
| "num_tokens": 5989110.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5205325111746788, | |
| "epoch": 1.3834586466165413, | |
| "grad_norm": 0.028440408408641815, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524253249168396, | |
| "mean_token_accuracy": 0.7875637263059616, | |
| "num_tokens": 6005130.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5718593895435333, | |
| "epoch": 1.387218045112782, | |
| "grad_norm": 0.03535715863108635, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5674105882644653, | |
| "mean_token_accuracy": 0.7696711122989655, | |
| "num_tokens": 6021765.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5570171922445297, | |
| "epoch": 1.3909774436090225, | |
| "grad_norm": 0.02890731766819954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550771951675415, | |
| "mean_token_accuracy": 0.7735273241996765, | |
| "num_tokens": 6038195.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5555340945720673, | |
| "epoch": 1.3947368421052633, | |
| "grad_norm": 0.03310281038284302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569556951522827, | |
| "mean_token_accuracy": 0.7722765356302261, | |
| "num_tokens": 6054869.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5339787155389786, | |
| "epoch": 1.3984962406015038, | |
| "grad_norm": 0.0280836783349514, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336146354675293, | |
| "mean_token_accuracy": 0.7833946198225021, | |
| "num_tokens": 6071026.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5382460206747055, | |
| "epoch": 1.4022556390977443, | |
| "grad_norm": 0.028865907341241837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415489077568054, | |
| "mean_token_accuracy": 0.7795161455869675, | |
| "num_tokens": 6087218.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5312956869602203, | |
| "epoch": 1.4060150375939848, | |
| "grad_norm": 0.029321739450097084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310655832290649, | |
| "mean_token_accuracy": 0.7824108898639679, | |
| "num_tokens": 6103644.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5470356345176697, | |
| "epoch": 1.4097744360902256, | |
| "grad_norm": 0.035155754536390305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525869131088257, | |
| "mean_token_accuracy": 0.7761145532131195, | |
| "num_tokens": 6120051.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5374057814478874, | |
| "epoch": 1.413533834586466, | |
| "grad_norm": 0.029863376170396805, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542983889579773, | |
| "mean_token_accuracy": 0.7801049947738647, | |
| "num_tokens": 6136168.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5664133429527283, | |
| "epoch": 1.4172932330827068, | |
| "grad_norm": 0.04531969875097275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5716960430145264, | |
| "mean_token_accuracy": 0.7669987082481384, | |
| "num_tokens": 6152503.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5445482283830643, | |
| "epoch": 1.4210526315789473, | |
| "grad_norm": 0.031349968165159225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467873811721802, | |
| "mean_token_accuracy": 0.7808011472225189, | |
| "num_tokens": 6168685.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5332349240779877, | |
| "epoch": 1.4248120300751879, | |
| "grad_norm": 0.03072705864906311, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5336711406707764, | |
| "mean_token_accuracy": 0.785218134522438, | |
| "num_tokens": 6185265.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5406992584466934, | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.03197013586759567, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535304605960846, | |
| "mean_token_accuracy": 0.781609907746315, | |
| "num_tokens": 6201359.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5503518134355545, | |
| "epoch": 1.4323308270676691, | |
| "grad_norm": 0.02861807495355606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474637746810913, | |
| "mean_token_accuracy": 0.7788266986608505, | |
| "num_tokens": 6217636.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5336224138736725, | |
| "epoch": 1.4360902255639099, | |
| "grad_norm": 0.03593042492866516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366555452346802, | |
| "mean_token_accuracy": 0.7802215367555618, | |
| "num_tokens": 6234047.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5492585748434067, | |
| "epoch": 1.4398496240601504, | |
| "grad_norm": 0.02969398722052574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519292950630188, | |
| "mean_token_accuracy": 0.77450992166996, | |
| "num_tokens": 6250372.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5435014069080353, | |
| "epoch": 1.443609022556391, | |
| "grad_norm": 0.03131045401096344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428797602653503, | |
| "mean_token_accuracy": 0.7789845615625381, | |
| "num_tokens": 6266490.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5582468658685684, | |
| "epoch": 1.4473684210526316, | |
| "grad_norm": 0.0334627628326416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606057047843933, | |
| "mean_token_accuracy": 0.7737329006195068, | |
| "num_tokens": 6282965.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5667697936296463, | |
| "epoch": 1.4511278195488722, | |
| "grad_norm": 0.031320203095674515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704291462898254, | |
| "mean_token_accuracy": 0.7688294649124146, | |
| "num_tokens": 6299265.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5566418468952179, | |
| "epoch": 1.454887218045113, | |
| "grad_norm": 0.04116431251168251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568630695343018, | |
| "mean_token_accuracy": 0.774201288819313, | |
| "num_tokens": 6315434.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5492933839559555, | |
| "epoch": 1.4586466165413534, | |
| "grad_norm": 0.02759244106709957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531164407730103, | |
| "mean_token_accuracy": 0.7763701528310776, | |
| "num_tokens": 6331760.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.5672035366296768, | |
| "epoch": 1.462406015037594, | |
| "grad_norm": 0.03223001956939697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56959068775177, | |
| "mean_token_accuracy": 0.768874928355217, | |
| "num_tokens": 6348346.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.5533206462860107, | |
| "epoch": 1.4661654135338344, | |
| "grad_norm": 0.03371699899435043, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532012581825256, | |
| "mean_token_accuracy": 0.7752765119075775, | |
| "num_tokens": 6364905.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5474317967891693, | |
| "epoch": 1.4699248120300752, | |
| "grad_norm": 0.033150747418403625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470337867736816, | |
| "mean_token_accuracy": 0.776570737361908, | |
| "num_tokens": 6381253.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5514713823795319, | |
| "epoch": 1.4736842105263157, | |
| "grad_norm": 0.03456156328320503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495055317878723, | |
| "mean_token_accuracy": 0.7780424803495407, | |
| "num_tokens": 6397488.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.524335652589798, | |
| "epoch": 1.4774436090225564, | |
| "grad_norm": 0.0276760496199131, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5228588581085205, | |
| "mean_token_accuracy": 0.7869584411382675, | |
| "num_tokens": 6413858.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5439832955598831, | |
| "epoch": 1.481203007518797, | |
| "grad_norm": 0.030009951442480087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459988117218018, | |
| "mean_token_accuracy": 0.7772574722766876, | |
| "num_tokens": 6430056.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.558243066072464, | |
| "epoch": 1.4849624060150375, | |
| "grad_norm": 0.03417029604315758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551323652267456, | |
| "mean_token_accuracy": 0.7783164083957672, | |
| "num_tokens": 6446633.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5622076392173767, | |
| "epoch": 1.4887218045112782, | |
| "grad_norm": 0.030520809814333916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651980638504028, | |
| "mean_token_accuracy": 0.7693700790405273, | |
| "num_tokens": 6463061.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.5262496769428253, | |
| "epoch": 1.4924812030075187, | |
| "grad_norm": 0.03385322168469429, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383599400520325, | |
| "mean_token_accuracy": 0.7795081436634064, | |
| "num_tokens": 6479394.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5428214818239212, | |
| "epoch": 1.4962406015037595, | |
| "grad_norm": 0.0344393290579319, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506508350372314, | |
| "mean_token_accuracy": 0.776181235909462, | |
| "num_tokens": 6495837.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5589512288570404, | |
| "epoch": 1.5, | |
| "grad_norm": 0.031076369807124138, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615136027336121, | |
| "mean_token_accuracy": 0.7719069272279739, | |
| "num_tokens": 6512096.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.560438871383667, | |
| "epoch": 1.5037593984962405, | |
| "grad_norm": 0.03327278420329094, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5491290092468262, | |
| "mean_token_accuracy": 0.7760379314422607, | |
| "num_tokens": 6528380.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.543613851070404, | |
| "epoch": 1.5075187969924813, | |
| "grad_norm": 0.03218228369951248, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404437780380249, | |
| "mean_token_accuracy": 0.7790963053703308, | |
| "num_tokens": 6544607.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5582986176013947, | |
| "epoch": 1.5112781954887218, | |
| "grad_norm": 0.031328245997428894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539280772209167, | |
| "mean_token_accuracy": 0.7730978429317474, | |
| "num_tokens": 6561161.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5439886897802353, | |
| "epoch": 1.5150375939849625, | |
| "grad_norm": 0.0315370075404644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494069457054138, | |
| "mean_token_accuracy": 0.77658711373806, | |
| "num_tokens": 6577494.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5441574305295944, | |
| "epoch": 1.518796992481203, | |
| "grad_norm": 0.029565030708909035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542066097259521, | |
| "mean_token_accuracy": 0.7728031128644943, | |
| "num_tokens": 6593864.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.5381332039833069, | |
| "epoch": 1.5225563909774436, | |
| "grad_norm": 0.030989129096269608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439568758010864, | |
| "mean_token_accuracy": 0.7784450650215149, | |
| "num_tokens": 6610189.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5451879501342773, | |
| "epoch": 1.526315789473684, | |
| "grad_norm": 0.030062349513173103, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435837507247925, | |
| "mean_token_accuracy": 0.7782586812973022, | |
| "num_tokens": 6626574.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5333066508173943, | |
| "epoch": 1.5300751879699248, | |
| "grad_norm": 0.02931753545999527, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52620530128479, | |
| "mean_token_accuracy": 0.784236952662468, | |
| "num_tokens": 6642855.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5590699911117554, | |
| "epoch": 1.5338345864661656, | |
| "grad_norm": 0.03177345171570778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554062128067017, | |
| "mean_token_accuracy": 0.7730756998062134, | |
| "num_tokens": 6659323.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5350319743156433, | |
| "epoch": 1.537593984962406, | |
| "grad_norm": 0.033441949635744095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428333282470703, | |
| "mean_token_accuracy": 0.7798242121934891, | |
| "num_tokens": 6675571.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.5449950993061066, | |
| "epoch": 1.5413533834586466, | |
| "grad_norm": 0.03087989240884781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550757646560669, | |
| "mean_token_accuracy": 0.7777638882398605, | |
| "num_tokens": 6692022.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5534456223249435, | |
| "epoch": 1.545112781954887, | |
| "grad_norm": 0.030627673491835594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566884875297546, | |
| "mean_token_accuracy": 0.7747643887996674, | |
| "num_tokens": 6708348.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5696779191493988, | |
| "epoch": 1.5488721804511278, | |
| "grad_norm": 0.029869280755519867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629582405090332, | |
| "mean_token_accuracy": 0.7705719769001007, | |
| "num_tokens": 6725016.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5336505770683289, | |
| "epoch": 1.5526315789473686, | |
| "grad_norm": 0.02911611832678318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279027223587036, | |
| "mean_token_accuracy": 0.783367246389389, | |
| "num_tokens": 6741327.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5392275899648666, | |
| "epoch": 1.556390977443609, | |
| "grad_norm": 0.02994578517973423, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416238307952881, | |
| "mean_token_accuracy": 0.7807497531175613, | |
| "num_tokens": 6757440.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5460323542356491, | |
| "epoch": 1.5601503759398496, | |
| "grad_norm": 0.03534119576215744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568557977676392, | |
| "mean_token_accuracy": 0.7705673724412918, | |
| "num_tokens": 6773654.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5286229997873306, | |
| "epoch": 1.5639097744360901, | |
| "grad_norm": 0.029811112210154533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318726301193237, | |
| "mean_token_accuracy": 0.7832337915897369, | |
| "num_tokens": 6789752.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5552769899368286, | |
| "epoch": 1.5676691729323309, | |
| "grad_norm": 0.030895395204424858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534340739250183, | |
| "mean_token_accuracy": 0.7729407846927643, | |
| "num_tokens": 6805849.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5429228097200394, | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 0.02707672491669655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381065607070923, | |
| "mean_token_accuracy": 0.7819552570581436, | |
| "num_tokens": 6822408.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5434612482786179, | |
| "epoch": 1.5751879699248121, | |
| "grad_norm": 0.031254079192876816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391129851341248, | |
| "mean_token_accuracy": 0.7833003848791122, | |
| "num_tokens": 6838597.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5366530418395996, | |
| "epoch": 1.5789473684210527, | |
| "grad_norm": 0.03022637590765953, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5400729179382324, | |
| "mean_token_accuracy": 0.7778450101613998, | |
| "num_tokens": 6854952.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5444828122854233, | |
| "epoch": 1.5827067669172932, | |
| "grad_norm": 0.031558163464069366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507203936576843, | |
| "mean_token_accuracy": 0.7739860564470291, | |
| "num_tokens": 6871383.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5397373139858246, | |
| "epoch": 1.5864661654135337, | |
| "grad_norm": 0.03590668365359306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495097041130066, | |
| "mean_token_accuracy": 0.7745723277330399, | |
| "num_tokens": 6887614.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5547508299350739, | |
| "epoch": 1.5902255639097744, | |
| "grad_norm": 0.03271407634019852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595258474349976, | |
| "mean_token_accuracy": 0.7740814536809921, | |
| "num_tokens": 6903891.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5452055484056473, | |
| "epoch": 1.5939849624060152, | |
| "grad_norm": 0.034447524696588516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422000288963318, | |
| "mean_token_accuracy": 0.7810980677604675, | |
| "num_tokens": 6920317.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.5475759953260422, | |
| "epoch": 1.5977443609022557, | |
| "grad_norm": 0.027404673397541046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450745820999146, | |
| "mean_token_accuracy": 0.7764957696199417, | |
| "num_tokens": 6936706.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5484007894992828, | |
| "epoch": 1.6015037593984962, | |
| "grad_norm": 0.031125633046030998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480135083198547, | |
| "mean_token_accuracy": 0.7771385014057159, | |
| "num_tokens": 6952874.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.5364782959222794, | |
| "epoch": 1.6052631578947367, | |
| "grad_norm": 0.029450541362166405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340723395347595, | |
| "mean_token_accuracy": 0.7846143394708633, | |
| "num_tokens": 6969087.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5632024109363556, | |
| "epoch": 1.6090225563909775, | |
| "grad_norm": 0.03085445798933506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56367427110672, | |
| "mean_token_accuracy": 0.7722935974597931, | |
| "num_tokens": 6985519.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5589936077594757, | |
| "epoch": 1.6127819548872182, | |
| "grad_norm": 0.03428523615002632, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611156225204468, | |
| "mean_token_accuracy": 0.7728175222873688, | |
| "num_tokens": 7001978.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5625983476638794, | |
| "epoch": 1.6165413533834587, | |
| "grad_norm": 0.03059856966137886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5613099932670593, | |
| "mean_token_accuracy": 0.7710365056991577, | |
| "num_tokens": 7018277.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5519939213991165, | |
| "epoch": 1.6203007518796992, | |
| "grad_norm": 0.030437655746936798, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545467734336853, | |
| "mean_token_accuracy": 0.778165876865387, | |
| "num_tokens": 7034622.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5278475731611252, | |
| "epoch": 1.6240601503759398, | |
| "grad_norm": 0.027164338156580925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5260958075523376, | |
| "mean_token_accuracy": 0.7867996096611023, | |
| "num_tokens": 7050833.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5364744961261749, | |
| "epoch": 1.6278195488721805, | |
| "grad_norm": 0.02916925586760044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371173024177551, | |
| "mean_token_accuracy": 0.7820777744054794, | |
| "num_tokens": 7067201.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5432325303554535, | |
| "epoch": 1.631578947368421, | |
| "grad_norm": 0.02878529019653797, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453219413757324, | |
| "mean_token_accuracy": 0.7784911543130875, | |
| "num_tokens": 7083919.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5461350232362747, | |
| "epoch": 1.6353383458646618, | |
| "grad_norm": 0.030911264941096306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5520428419113159, | |
| "mean_token_accuracy": 0.7748389393091202, | |
| "num_tokens": 7100167.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5301318913698196, | |
| "epoch": 1.6390977443609023, | |
| "grad_norm": 0.0337194949388504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.533911406993866, | |
| "mean_token_accuracy": 0.781144917011261, | |
| "num_tokens": 7115963.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.554198831319809, | |
| "epoch": 1.6428571428571428, | |
| "grad_norm": 0.03273259475827217, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581203699111938, | |
| "mean_token_accuracy": 0.7747189700603485, | |
| "num_tokens": 7132343.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5451264977455139, | |
| "epoch": 1.6466165413533833, | |
| "grad_norm": 0.028795765712857246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419780015945435, | |
| "mean_token_accuracy": 0.7782856971025467, | |
| "num_tokens": 7148711.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5696405470371246, | |
| "epoch": 1.650375939849624, | |
| "grad_norm": 0.02880324050784111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568999171257019, | |
| "mean_token_accuracy": 0.7674362361431122, | |
| "num_tokens": 7165000.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5544975996017456, | |
| "epoch": 1.6541353383458648, | |
| "grad_norm": 0.0319298580288887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572612881660461, | |
| "mean_token_accuracy": 0.7738819718360901, | |
| "num_tokens": 7181178.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5648850053548813, | |
| "epoch": 1.6578947368421053, | |
| "grad_norm": 0.033446941524744034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5726531147956848, | |
| "mean_token_accuracy": 0.767191156744957, | |
| "num_tokens": 7197682.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.5558575242757797, | |
| "epoch": 1.6616541353383458, | |
| "grad_norm": 0.02976951375603676, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5575220584869385, | |
| "mean_token_accuracy": 0.7738383561372757, | |
| "num_tokens": 7214036.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5415066331624985, | |
| "epoch": 1.6654135338345863, | |
| "grad_norm": 0.03178182989358902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425861477851868, | |
| "mean_token_accuracy": 0.777436301112175, | |
| "num_tokens": 7230232.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5568071007728577, | |
| "epoch": 1.669172932330827, | |
| "grad_norm": 0.029093647375702858, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502623319625854, | |
| "mean_token_accuracy": 0.7746951729059219, | |
| "num_tokens": 7246458.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5455858707427979, | |
| "epoch": 1.6729323308270678, | |
| "grad_norm": 0.03103097900748253, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415849685668945, | |
| "mean_token_accuracy": 0.7773046642541885, | |
| "num_tokens": 7262757.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5557373017072678, | |
| "epoch": 1.6766917293233083, | |
| "grad_norm": 0.034459494054317474, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588368773460388, | |
| "mean_token_accuracy": 0.7731840759515762, | |
| "num_tokens": 7279011.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.536065399646759, | |
| "epoch": 1.6804511278195489, | |
| "grad_norm": 0.030954651534557343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398183465003967, | |
| "mean_token_accuracy": 0.778962567448616, | |
| "num_tokens": 7295450.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5364357531070709, | |
| "epoch": 1.6842105263157894, | |
| "grad_norm": 0.03524971008300781, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447929501533508, | |
| "mean_token_accuracy": 0.7776346057653427, | |
| "num_tokens": 7311638.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5611797869205475, | |
| "epoch": 1.6879699248120301, | |
| "grad_norm": 0.02808379754424095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557354688644409, | |
| "mean_token_accuracy": 0.7739097476005554, | |
| "num_tokens": 7327872.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.5732033550739288, | |
| "epoch": 1.6917293233082706, | |
| "grad_norm": 0.03260007128119469, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591524839401245, | |
| "mean_token_accuracy": 0.775033637881279, | |
| "num_tokens": 7344324.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5342790335416794, | |
| "epoch": 1.6954887218045114, | |
| "grad_norm": 0.02984827756881714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380273461341858, | |
| "mean_token_accuracy": 0.782566487789154, | |
| "num_tokens": 7360753.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5318778306245804, | |
| "epoch": 1.699248120300752, | |
| "grad_norm": 0.03279503807425499, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544060468673706, | |
| "mean_token_accuracy": 0.7762828469276428, | |
| "num_tokens": 7377154.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.5356487184762955, | |
| "epoch": 1.7030075187969924, | |
| "grad_norm": 0.03332759812474251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548007607460022, | |
| "mean_token_accuracy": 0.7769170254468918, | |
| "num_tokens": 7393621.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.5513975322246552, | |
| "epoch": 1.706766917293233, | |
| "grad_norm": 0.03238146752119064, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592359900474548, | |
| "mean_token_accuracy": 0.7740825116634369, | |
| "num_tokens": 7409899.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5548000931739807, | |
| "epoch": 1.7105263157894737, | |
| "grad_norm": 0.02822866663336754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497517585754395, | |
| "mean_token_accuracy": 0.776210606098175, | |
| "num_tokens": 7426237.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5756575465202332, | |
| "epoch": 1.7142857142857144, | |
| "grad_norm": 0.027675755321979523, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5697333812713623, | |
| "mean_token_accuracy": 0.7680118083953857, | |
| "num_tokens": 7442768.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.5417828410863876, | |
| "epoch": 1.718045112781955, | |
| "grad_norm": 0.033404842019081116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454074740409851, | |
| "mean_token_accuracy": 0.7808687537908554, | |
| "num_tokens": 7459143.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5427983999252319, | |
| "epoch": 1.7218045112781954, | |
| "grad_norm": 0.03309955820441246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416461825370789, | |
| "mean_token_accuracy": 0.7808773517608643, | |
| "num_tokens": 7475461.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5505435466766357, | |
| "epoch": 1.725563909774436, | |
| "grad_norm": 0.034179892390966415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560557246208191, | |
| "mean_token_accuracy": 0.7720683664083481, | |
| "num_tokens": 7491762.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5398002862930298, | |
| "epoch": 1.7293233082706767, | |
| "grad_norm": 0.036437805742025375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529733896255493, | |
| "mean_token_accuracy": 0.7730463594198227, | |
| "num_tokens": 7507801.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5538046360015869, | |
| "epoch": 1.7330827067669174, | |
| "grad_norm": 0.038074180483818054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474164485931396, | |
| "mean_token_accuracy": 0.7738546878099442, | |
| "num_tokens": 7524195.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5446304082870483, | |
| "epoch": 1.736842105263158, | |
| "grad_norm": 0.028863312676548958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534104585647583, | |
| "mean_token_accuracy": 0.7812709957361221, | |
| "num_tokens": 7540346.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5635255128145218, | |
| "epoch": 1.7406015037593985, | |
| "grad_norm": 0.0377831794321537, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565074682235718, | |
| "mean_token_accuracy": 0.7726516425609589, | |
| "num_tokens": 7556361.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5520550906658173, | |
| "epoch": 1.744360902255639, | |
| "grad_norm": 0.027316391468048096, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5496057868003845, | |
| "mean_token_accuracy": 0.7767691016197205, | |
| "num_tokens": 7572407.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5517378151416779, | |
| "epoch": 1.7481203007518797, | |
| "grad_norm": 0.03549322485923767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542277097702026, | |
| "mean_token_accuracy": 0.7771301567554474, | |
| "num_tokens": 7588716.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5447746813297272, | |
| "epoch": 1.7518796992481203, | |
| "grad_norm": 0.03821020945906639, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558238685131073, | |
| "mean_token_accuracy": 0.7732566744089127, | |
| "num_tokens": 7604921.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5422779768705368, | |
| "epoch": 1.755639097744361, | |
| "grad_norm": 0.03218455985188484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549083411693573, | |
| "mean_token_accuracy": 0.7762202769517899, | |
| "num_tokens": 7621109.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.5479860007762909, | |
| "epoch": 1.7593984962406015, | |
| "grad_norm": 0.03186026215553284, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414553880691528, | |
| "mean_token_accuracy": 0.7800420671701431, | |
| "num_tokens": 7637434.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5488834828138351, | |
| "epoch": 1.763157894736842, | |
| "grad_norm": 0.030316263437271118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371969938278198, | |
| "mean_token_accuracy": 0.7800302803516388, | |
| "num_tokens": 7653708.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5712478011846542, | |
| "epoch": 1.7669172932330826, | |
| "grad_norm": 0.0292644202709198, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641398429870605, | |
| "mean_token_accuracy": 0.7701270431280136, | |
| "num_tokens": 7670165.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5487608909606934, | |
| "epoch": 1.7706766917293233, | |
| "grad_norm": 0.029384015128016472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528495907783508, | |
| "mean_token_accuracy": 0.7725293934345245, | |
| "num_tokens": 7686546.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5485792607069016, | |
| "epoch": 1.774436090225564, | |
| "grad_norm": 0.03848496824502945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557949960231781, | |
| "mean_token_accuracy": 0.7736170142889023, | |
| "num_tokens": 7703199.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5328742563724518, | |
| "epoch": 1.7781954887218046, | |
| "grad_norm": 0.029961325228214264, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426016449928284, | |
| "mean_token_accuracy": 0.7784318327903748, | |
| "num_tokens": 7719414.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.5418206453323364, | |
| "epoch": 1.781954887218045, | |
| "grad_norm": 0.03003692626953125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543552815914154, | |
| "mean_token_accuracy": 0.777516707777977, | |
| "num_tokens": 7735591.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5588981062173843, | |
| "epoch": 1.7857142857142856, | |
| "grad_norm": 0.035983212292194366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562595725059509, | |
| "mean_token_accuracy": 0.7752551138401031, | |
| "num_tokens": 7751978.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5337852984666824, | |
| "epoch": 1.7894736842105263, | |
| "grad_norm": 0.030708249658346176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5263274312019348, | |
| "mean_token_accuracy": 0.7854783833026886, | |
| "num_tokens": 7768537.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5388501137495041, | |
| "epoch": 1.793233082706767, | |
| "grad_norm": 0.034256935119628906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432993173599243, | |
| "mean_token_accuracy": 0.7769720703363419, | |
| "num_tokens": 7784830.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5526683777570724, | |
| "epoch": 1.7969924812030076, | |
| "grad_norm": 0.030191054567694664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529841184616089, | |
| "mean_token_accuracy": 0.774674654006958, | |
| "num_tokens": 7801305.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5205394625663757, | |
| "epoch": 1.800751879699248, | |
| "grad_norm": 0.03705041483044624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320290327072144, | |
| "mean_token_accuracy": 0.7844933271408081, | |
| "num_tokens": 7817468.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.5391060262918472, | |
| "epoch": 1.8045112781954886, | |
| "grad_norm": 0.03425837680697441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482912659645081, | |
| "mean_token_accuracy": 0.7772899568080902, | |
| "num_tokens": 7833783.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5595878064632416, | |
| "epoch": 1.8082706766917294, | |
| "grad_norm": 0.03261560574173927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595347881317139, | |
| "mean_token_accuracy": 0.7739517390727997, | |
| "num_tokens": 7850116.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5623766779899597, | |
| "epoch": 1.8120300751879699, | |
| "grad_norm": 0.030305257067084312, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494015216827393, | |
| "mean_token_accuracy": 0.7756963670253754, | |
| "num_tokens": 7866336.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.5707903653383255, | |
| "epoch": 1.8157894736842106, | |
| "grad_norm": 0.030717138200998306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5605000257492065, | |
| "mean_token_accuracy": 0.7702891528606415, | |
| "num_tokens": 7882899.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5296159312129021, | |
| "epoch": 1.8195488721804511, | |
| "grad_norm": 0.03342661261558533, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307406783103943, | |
| "mean_token_accuracy": 0.7850563228130341, | |
| "num_tokens": 7899131.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.545372724533081, | |
| "epoch": 1.8233082706766917, | |
| "grad_norm": 0.0327008031308651, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443350076675415, | |
| "mean_token_accuracy": 0.7800664007663727, | |
| "num_tokens": 7915449.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5288603901863098, | |
| "epoch": 1.8270676691729322, | |
| "grad_norm": 0.03246629983186722, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420411229133606, | |
| "mean_token_accuracy": 0.779539629817009, | |
| "num_tokens": 7931703.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5476890802383423, | |
| "epoch": 1.830827067669173, | |
| "grad_norm": 0.03365527465939522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5550553798675537, | |
| "mean_token_accuracy": 0.7729549556970596, | |
| "num_tokens": 7948074.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5389307886362076, | |
| "epoch": 1.8345864661654137, | |
| "grad_norm": 0.036491431295871735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5469198822975159, | |
| "mean_token_accuracy": 0.7751343995332718, | |
| "num_tokens": 7964150.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.5449552834033966, | |
| "epoch": 1.8383458646616542, | |
| "grad_norm": 0.03082645684480667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452861189842224, | |
| "mean_token_accuracy": 0.7780899852514267, | |
| "num_tokens": 7980409.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.5490948259830475, | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 0.031109903007745743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441408157348633, | |
| "mean_token_accuracy": 0.778783529996872, | |
| "num_tokens": 7996889.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.5475451499223709, | |
| "epoch": 1.8458646616541352, | |
| "grad_norm": 0.030056826770305634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430116653442383, | |
| "mean_token_accuracy": 0.7810570746660233, | |
| "num_tokens": 8013259.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.559479296207428, | |
| "epoch": 1.849624060150376, | |
| "grad_norm": 0.035820432007312775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568897128105164, | |
| "mean_token_accuracy": 0.7710603177547455, | |
| "num_tokens": 8029520.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5462630242109299, | |
| "epoch": 1.8533834586466167, | |
| "grad_norm": 0.031395427882671356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490817427635193, | |
| "mean_token_accuracy": 0.7747374475002289, | |
| "num_tokens": 8045599.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.5427971929311752, | |
| "epoch": 1.8571428571428572, | |
| "grad_norm": 0.032419510185718536, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547596275806427, | |
| "mean_token_accuracy": 0.7759164273738861, | |
| "num_tokens": 8062030.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5488359779119492, | |
| "epoch": 1.8609022556390977, | |
| "grad_norm": 0.03382895514369011, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546596646308899, | |
| "mean_token_accuracy": 0.7742781788110733, | |
| "num_tokens": 8078279.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5563898682594299, | |
| "epoch": 1.8646616541353382, | |
| "grad_norm": 0.030559495091438293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596904754638672, | |
| "mean_token_accuracy": 0.7740778177976608, | |
| "num_tokens": 8094627.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5448739975690842, | |
| "epoch": 1.868421052631579, | |
| "grad_norm": 0.029570002108812332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441548824310303, | |
| "mean_token_accuracy": 0.7791137993335724, | |
| "num_tokens": 8111057.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5403100103139877, | |
| "epoch": 1.8721804511278195, | |
| "grad_norm": 0.028860216960310936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392476916313171, | |
| "mean_token_accuracy": 0.7823552191257477, | |
| "num_tokens": 8127458.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.547279953956604, | |
| "epoch": 1.8759398496240602, | |
| "grad_norm": 0.03563547134399414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528260469436646, | |
| "mean_token_accuracy": 0.7767119854688644, | |
| "num_tokens": 8143862.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5525589138269424, | |
| "epoch": 1.8796992481203008, | |
| "grad_norm": 0.03100893273949623, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514292120933533, | |
| "mean_token_accuracy": 0.7746975123882294, | |
| "num_tokens": 8160155.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5513135939836502, | |
| "epoch": 1.8834586466165413, | |
| "grad_norm": 0.0315982848405838, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519658923149109, | |
| "mean_token_accuracy": 0.7756119072437286, | |
| "num_tokens": 8176700.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5485852658748627, | |
| "epoch": 1.8872180451127818, | |
| "grad_norm": 0.031329069286584854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463511347770691, | |
| "mean_token_accuracy": 0.779010608792305, | |
| "num_tokens": 8193245.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5625745803117752, | |
| "epoch": 1.8909774436090225, | |
| "grad_norm": 0.029315905645489693, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607528686523438, | |
| "mean_token_accuracy": 0.7741692066192627, | |
| "num_tokens": 8209893.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5387315452098846, | |
| "epoch": 1.8947368421052633, | |
| "grad_norm": 0.03832435607910156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399753451347351, | |
| "mean_token_accuracy": 0.781536191701889, | |
| "num_tokens": 8226239.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.544891282916069, | |
| "epoch": 1.8984962406015038, | |
| "grad_norm": 0.03846210241317749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546903610229492, | |
| "mean_token_accuracy": 0.7764989882707596, | |
| "num_tokens": 8242463.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5383649319410324, | |
| "epoch": 1.9022556390977443, | |
| "grad_norm": 0.029546573758125305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443148016929626, | |
| "mean_token_accuracy": 0.7801246345043182, | |
| "num_tokens": 8258870.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5518875420093536, | |
| "epoch": 1.9060150375939848, | |
| "grad_norm": 0.03868366405367851, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56158447265625, | |
| "mean_token_accuracy": 0.7744181603193283, | |
| "num_tokens": 8275059.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5304814428091049, | |
| "epoch": 1.9097744360902256, | |
| "grad_norm": 0.030545437708497047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301219820976257, | |
| "mean_token_accuracy": 0.7852053344249725, | |
| "num_tokens": 8291105.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.5690664052963257, | |
| "epoch": 1.9135338345864663, | |
| "grad_norm": 0.032348547130823135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622092485427856, | |
| "mean_token_accuracy": 0.769376203417778, | |
| "num_tokens": 8307569.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.5624774992465973, | |
| "epoch": 1.9172932330827068, | |
| "grad_norm": 0.02640698291361332, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545241236686707, | |
| "mean_token_accuracy": 0.7744268774986267, | |
| "num_tokens": 8323912.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.5579835772514343, | |
| "epoch": 1.9210526315789473, | |
| "grad_norm": 0.031412333250045776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539452433586121, | |
| "mean_token_accuracy": 0.774582713842392, | |
| "num_tokens": 8340119.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.542325347661972, | |
| "epoch": 1.9248120300751879, | |
| "grad_norm": 0.030913738533854485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458105802536011, | |
| "mean_token_accuracy": 0.7775561809539795, | |
| "num_tokens": 8356315.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.529489278793335, | |
| "epoch": 1.9285714285714286, | |
| "grad_norm": 0.029877884313464165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531100332736969, | |
| "mean_token_accuracy": 0.7838429808616638, | |
| "num_tokens": 8372456.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5389499813318253, | |
| "epoch": 1.9323308270676691, | |
| "grad_norm": 0.030849065631628036, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465497374534607, | |
| "mean_token_accuracy": 0.7783443629741669, | |
| "num_tokens": 8388807.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5628852099180222, | |
| "epoch": 1.9360902255639099, | |
| "grad_norm": 0.03353369981050491, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644093751907349, | |
| "mean_token_accuracy": 0.7698302865028381, | |
| "num_tokens": 8405066.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5497677177190781, | |
| "epoch": 1.9398496240601504, | |
| "grad_norm": 0.028165243566036224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.547763466835022, | |
| "mean_token_accuracy": 0.7773574143648148, | |
| "num_tokens": 8421460.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5606269836425781, | |
| "epoch": 1.943609022556391, | |
| "grad_norm": 0.0319550521671772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551348924636841, | |
| "mean_token_accuracy": 0.7739223390817642, | |
| "num_tokens": 8437784.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.5395714491605759, | |
| "epoch": 1.9473684210526314, | |
| "grad_norm": 0.031290777027606964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381031036376953, | |
| "mean_token_accuracy": 0.7825980633497238, | |
| "num_tokens": 8453854.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.5344501882791519, | |
| "epoch": 1.9511278195488722, | |
| "grad_norm": 0.03777296468615532, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455595850944519, | |
| "mean_token_accuracy": 0.7795031368732452, | |
| "num_tokens": 8470272.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5205538719892502, | |
| "epoch": 1.954887218045113, | |
| "grad_norm": 0.03487836569547653, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5330216288566589, | |
| "mean_token_accuracy": 0.7831091731786728, | |
| "num_tokens": 8486562.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5428618490695953, | |
| "epoch": 1.9586466165413534, | |
| "grad_norm": 0.030902346596121788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495193004608154, | |
| "mean_token_accuracy": 0.7756944447755814, | |
| "num_tokens": 8502887.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.544492781162262, | |
| "epoch": 1.962406015037594, | |
| "grad_norm": 0.03169652447104454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453743934631348, | |
| "mean_token_accuracy": 0.7783046513795853, | |
| "num_tokens": 8519068.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5636335015296936, | |
| "epoch": 1.9661654135338344, | |
| "grad_norm": 0.03021661750972271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499917268753052, | |
| "mean_token_accuracy": 0.7781661599874496, | |
| "num_tokens": 8535634.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.55694779753685, | |
| "epoch": 1.9699248120300752, | |
| "grad_norm": 0.03414059802889824, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477267503738403, | |
| "mean_token_accuracy": 0.7789023220539093, | |
| "num_tokens": 8552014.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5450517237186432, | |
| "epoch": 1.973684210526316, | |
| "grad_norm": 0.03232225775718689, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392122268676758, | |
| "mean_token_accuracy": 0.777529314160347, | |
| "num_tokens": 8568141.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5509356558322906, | |
| "epoch": 1.9774436090225564, | |
| "grad_norm": 0.03768094256520271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595051050186157, | |
| "mean_token_accuracy": 0.7724569737911224, | |
| "num_tokens": 8584500.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.5301109999418259, | |
| "epoch": 1.981203007518797, | |
| "grad_norm": 0.033885687589645386, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360104441642761, | |
| "mean_token_accuracy": 0.7817398905754089, | |
| "num_tokens": 8600622.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5417920649051666, | |
| "epoch": 1.9849624060150375, | |
| "grad_norm": 0.035579532384872437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494239926338196, | |
| "mean_token_accuracy": 0.7785082012414932, | |
| "num_tokens": 8616969.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.5376323908567429, | |
| "epoch": 1.9887218045112782, | |
| "grad_norm": 0.0296316035091877, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373918414115906, | |
| "mean_token_accuracy": 0.7816532105207443, | |
| "num_tokens": 8633437.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5412444472312927, | |
| "epoch": 1.9924812030075187, | |
| "grad_norm": 0.03037526085972786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539776086807251, | |
| "mean_token_accuracy": 0.7808452993631363, | |
| "num_tokens": 8649560.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.554906353354454, | |
| "epoch": 1.9962406015037595, | |
| "grad_norm": 0.03048609383404255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531030893325806, | |
| "mean_token_accuracy": 0.7767332792282104, | |
| "num_tokens": 8665828.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5544924587011337, | |
| "epoch": 2.0, | |
| "grad_norm": 0.03117205761373043, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5525693893432617, | |
| "mean_token_accuracy": 0.775643989443779, | |
| "num_tokens": 8682083.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5393226593732834, | |
| "epoch": 2.0037593984962405, | |
| "grad_norm": 0.034238528460264206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.527999222278595, | |
| "mean_token_accuracy": 0.7866329997777939, | |
| "num_tokens": 8698342.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5444916188716888, | |
| "epoch": 2.007518796992481, | |
| "grad_norm": 0.03761903941631317, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5434718132019043, | |
| "mean_token_accuracy": 0.7761547416448593, | |
| "num_tokens": 8714741.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5060115680098534, | |
| "epoch": 2.011278195488722, | |
| "grad_norm": 0.036343637853860855, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5168589353561401, | |
| "mean_token_accuracy": 0.7898426353931427, | |
| "num_tokens": 8731100.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5210407823324203, | |
| "epoch": 2.0150375939849625, | |
| "grad_norm": 0.04487035050988197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338425040245056, | |
| "mean_token_accuracy": 0.783848226070404, | |
| "num_tokens": 8747374.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5411355942487717, | |
| "epoch": 2.018796992481203, | |
| "grad_norm": 0.030216895043849945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343786478042603, | |
| "mean_token_accuracy": 0.785404697060585, | |
| "num_tokens": 8763878.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5372739881277084, | |
| "epoch": 2.0225563909774436, | |
| "grad_norm": 0.028337521478533745, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299405455589294, | |
| "mean_token_accuracy": 0.7845199257135391, | |
| "num_tokens": 8780220.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5464906841516495, | |
| "epoch": 2.026315789473684, | |
| "grad_norm": 0.036913856863975525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415371656417847, | |
| "mean_token_accuracy": 0.7804137766361237, | |
| "num_tokens": 8796472.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5379135385155678, | |
| "epoch": 2.030075187969925, | |
| "grad_norm": 0.03262462466955185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289930701255798, | |
| "mean_token_accuracy": 0.7824063748121262, | |
| "num_tokens": 8812711.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5565919727087021, | |
| "epoch": 2.0338345864661656, | |
| "grad_norm": 0.04293256625533104, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547116994857788, | |
| "mean_token_accuracy": 0.7729399651288986, | |
| "num_tokens": 8829053.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5241617634892464, | |
| "epoch": 2.037593984962406, | |
| "grad_norm": 0.038099389523267746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281400084495544, | |
| "mean_token_accuracy": 0.7854866534471512, | |
| "num_tokens": 8845272.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5125209540128708, | |
| "epoch": 2.0413533834586466, | |
| "grad_norm": 0.0444987453520298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5245556235313416, | |
| "mean_token_accuracy": 0.7865463197231293, | |
| "num_tokens": 8861604.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5151898711919785, | |
| "epoch": 2.045112781954887, | |
| "grad_norm": 0.03733397275209427, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5251218676567078, | |
| "mean_token_accuracy": 0.7850091606378555, | |
| "num_tokens": 8878258.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5284005552530289, | |
| "epoch": 2.0488721804511276, | |
| "grad_norm": 0.03852412849664688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298153758049011, | |
| "mean_token_accuracy": 0.7847720235586166, | |
| "num_tokens": 8894539.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.54307721555233, | |
| "epoch": 2.0526315789473686, | |
| "grad_norm": 0.033771906048059464, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5370909571647644, | |
| "mean_token_accuracy": 0.7825237512588501, | |
| "num_tokens": 8910872.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.5492400974035263, | |
| "epoch": 2.056390977443609, | |
| "grad_norm": 0.03574720397591591, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408341884613037, | |
| "mean_token_accuracy": 0.778035119175911, | |
| "num_tokens": 8927218.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.5240911245346069, | |
| "epoch": 2.0601503759398496, | |
| "grad_norm": 0.02964242920279503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5206458568572998, | |
| "mean_token_accuracy": 0.7880397886037827, | |
| "num_tokens": 8943483.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5402092635631561, | |
| "epoch": 2.06390977443609, | |
| "grad_norm": 0.030025213956832886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365015864372253, | |
| "mean_token_accuracy": 0.7826483398675919, | |
| "num_tokens": 8959806.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5332436561584473, | |
| "epoch": 2.0676691729323307, | |
| "grad_norm": 0.04115639254450798, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445111393928528, | |
| "mean_token_accuracy": 0.7822862267494202, | |
| "num_tokens": 8976089.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5036703869700432, | |
| "epoch": 2.0714285714285716, | |
| "grad_norm": 0.04966175556182861, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5189836025238037, | |
| "mean_token_accuracy": 0.7873758524656296, | |
| "num_tokens": 8992377.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5350762009620667, | |
| "epoch": 2.075187969924812, | |
| "grad_norm": 0.03549731895327568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327733755111694, | |
| "mean_token_accuracy": 0.7879746407270432, | |
| "num_tokens": 9008811.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5646320134401321, | |
| "epoch": 2.0789473684210527, | |
| "grad_norm": 0.03737547621130943, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5554011464118958, | |
| "mean_token_accuracy": 0.7747785001993179, | |
| "num_tokens": 9025308.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.5232708752155304, | |
| "epoch": 2.082706766917293, | |
| "grad_norm": 0.0358981154859066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5174283385276794, | |
| "mean_token_accuracy": 0.790026530623436, | |
| "num_tokens": 9041525.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.5285665988922119, | |
| "epoch": 2.0864661654135337, | |
| "grad_norm": 0.03469764441251755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5286591649055481, | |
| "mean_token_accuracy": 0.7858238369226456, | |
| "num_tokens": 9058016.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.5281644910573959, | |
| "epoch": 2.090225563909774, | |
| "grad_norm": 0.0453813299536705, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388556718826294, | |
| "mean_token_accuracy": 0.7807898968458176, | |
| "num_tokens": 9074200.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5271690487861633, | |
| "epoch": 2.093984962406015, | |
| "grad_norm": 0.032550517469644547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312079787254333, | |
| "mean_token_accuracy": 0.7843631505966187, | |
| "num_tokens": 9090441.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.5335165411233902, | |
| "epoch": 2.0977443609022557, | |
| "grad_norm": 0.045913904905319214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5417532324790955, | |
| "mean_token_accuracy": 0.7792288213968277, | |
| "num_tokens": 9106701.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5311940237879753, | |
| "epoch": 2.101503759398496, | |
| "grad_norm": 0.03551177680492401, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5270295143127441, | |
| "mean_token_accuracy": 0.7884976118803024, | |
| "num_tokens": 9122828.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5543871223926544, | |
| "epoch": 2.1052631578947367, | |
| "grad_norm": 0.04049575328826904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416486859321594, | |
| "mean_token_accuracy": 0.7811383605003357, | |
| "num_tokens": 9139283.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5340919494628906, | |
| "epoch": 2.1090225563909772, | |
| "grad_norm": 0.039224181324243546, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327409505844116, | |
| "mean_token_accuracy": 0.7838027775287628, | |
| "num_tokens": 9155474.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.5298718512058258, | |
| "epoch": 2.112781954887218, | |
| "grad_norm": 0.05099140852689743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340836644172668, | |
| "mean_token_accuracy": 0.783194437623024, | |
| "num_tokens": 9171817.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.5186150521039963, | |
| "epoch": 2.1165413533834587, | |
| "grad_norm": 0.03965724632143974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5235821604728699, | |
| "mean_token_accuracy": 0.7888422161340714, | |
| "num_tokens": 9188257.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5331820994615555, | |
| "epoch": 2.1203007518796992, | |
| "grad_norm": 0.04237478971481323, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393993258476257, | |
| "mean_token_accuracy": 0.7827252298593521, | |
| "num_tokens": 9204541.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.540572926402092, | |
| "epoch": 2.1240601503759398, | |
| "grad_norm": 0.04164816811680794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408675670623779, | |
| "mean_token_accuracy": 0.7807533591985703, | |
| "num_tokens": 9220820.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5385376363992691, | |
| "epoch": 2.1278195488721803, | |
| "grad_norm": 0.036260150372982025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364916324615479, | |
| "mean_token_accuracy": 0.7820783704519272, | |
| "num_tokens": 9237023.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5336015373468399, | |
| "epoch": 2.1315789473684212, | |
| "grad_norm": 0.037857089191675186, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315621495246887, | |
| "mean_token_accuracy": 0.785429060459137, | |
| "num_tokens": 9253551.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5323529243469238, | |
| "epoch": 2.1353383458646618, | |
| "grad_norm": 0.037011366337537766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320927500724792, | |
| "mean_token_accuracy": 0.7860363125801086, | |
| "num_tokens": 9270061.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5342943072319031, | |
| "epoch": 2.1390977443609023, | |
| "grad_norm": 0.04501970484852791, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541400134563446, | |
| "mean_token_accuracy": 0.7824247628450394, | |
| "num_tokens": 9286644.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5125101208686829, | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 0.03982450067996979, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5186954736709595, | |
| "mean_token_accuracy": 0.7895647883415222, | |
| "num_tokens": 9302779.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5302434861660004, | |
| "epoch": 2.1466165413533833, | |
| "grad_norm": 0.04483801871538162, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331039428710938, | |
| "mean_token_accuracy": 0.7822313755750656, | |
| "num_tokens": 9318908.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.541576087474823, | |
| "epoch": 2.1503759398496243, | |
| "grad_norm": 0.04227382317185402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5322229862213135, | |
| "mean_token_accuracy": 0.7839206904172897, | |
| "num_tokens": 9335280.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5349045842885971, | |
| "epoch": 2.154135338345865, | |
| "grad_norm": 0.039713822305202484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306118726730347, | |
| "mean_token_accuracy": 0.7863682806491852, | |
| "num_tokens": 9351717.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.538109079003334, | |
| "epoch": 2.1578947368421053, | |
| "grad_norm": 0.043392788618803024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441777110099792, | |
| "mean_token_accuracy": 0.7800941169261932, | |
| "num_tokens": 9367925.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.543743833899498, | |
| "epoch": 2.161654135338346, | |
| "grad_norm": 0.036299366503953934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443440675735474, | |
| "mean_token_accuracy": 0.7788920700550079, | |
| "num_tokens": 9384356.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5299166440963745, | |
| "epoch": 2.1654135338345863, | |
| "grad_norm": 0.04222200810909271, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267676711082458, | |
| "mean_token_accuracy": 0.7834489941596985, | |
| "num_tokens": 9400653.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5201265513896942, | |
| "epoch": 2.169172932330827, | |
| "grad_norm": 0.034343086183071136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234291553497314, | |
| "mean_token_accuracy": 0.7866221219301224, | |
| "num_tokens": 9416889.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5227823704481125, | |
| "epoch": 2.172932330827068, | |
| "grad_norm": 0.05559639260172844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5304789543151855, | |
| "mean_token_accuracy": 0.7860793620347977, | |
| "num_tokens": 9433083.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5409391671419144, | |
| "epoch": 2.1766917293233083, | |
| "grad_norm": 0.03534764051437378, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437344908714294, | |
| "mean_token_accuracy": 0.7797643393278122, | |
| "num_tokens": 9449666.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5353062897920609, | |
| "epoch": 2.180451127819549, | |
| "grad_norm": 0.0366806834936142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361766815185547, | |
| "mean_token_accuracy": 0.7838302254676819, | |
| "num_tokens": 9465971.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5455628782510757, | |
| "epoch": 2.1842105263157894, | |
| "grad_norm": 0.04078822582960129, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446187257766724, | |
| "mean_token_accuracy": 0.7786186188459396, | |
| "num_tokens": 9482331.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5441193133592606, | |
| "epoch": 2.18796992481203, | |
| "grad_norm": 0.03562629595398903, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538811981678009, | |
| "mean_token_accuracy": 0.7832597941160202, | |
| "num_tokens": 9498498.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.519161731004715, | |
| "epoch": 2.191729323308271, | |
| "grad_norm": 0.04350278899073601, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223026275634766, | |
| "mean_token_accuracy": 0.7909857630729675, | |
| "num_tokens": 9514937.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5520303696393967, | |
| "epoch": 2.1954887218045114, | |
| "grad_norm": 0.04176495969295502, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509821772575378, | |
| "mean_token_accuracy": 0.7763593196868896, | |
| "num_tokens": 9531256.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5262609422206879, | |
| "epoch": 2.199248120300752, | |
| "grad_norm": 0.07633325457572937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5259430408477783, | |
| "mean_token_accuracy": 0.7863292992115021, | |
| "num_tokens": 9547509.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.53122878074646, | |
| "epoch": 2.2030075187969924, | |
| "grad_norm": 0.04210652410984039, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531125545501709, | |
| "mean_token_accuracy": 0.7854439616203308, | |
| "num_tokens": 9563675.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.5309283137321472, | |
| "epoch": 2.206766917293233, | |
| "grad_norm": 0.042596347630023956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361312627792358, | |
| "mean_token_accuracy": 0.7840573638677597, | |
| "num_tokens": 9580247.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.523199625313282, | |
| "epoch": 2.2105263157894735, | |
| "grad_norm": 0.06264178454875946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371831655502319, | |
| "mean_token_accuracy": 0.7846156805753708, | |
| "num_tokens": 9596084.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5497414767742157, | |
| "epoch": 2.2142857142857144, | |
| "grad_norm": 0.049970485270023346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482587218284607, | |
| "mean_token_accuracy": 0.7772606760263443, | |
| "num_tokens": 9612439.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5475651770830154, | |
| "epoch": 2.218045112781955, | |
| "grad_norm": 0.047052860260009766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382542610168457, | |
| "mean_token_accuracy": 0.7837767452001572, | |
| "num_tokens": 9628574.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5442479848861694, | |
| "epoch": 2.2218045112781954, | |
| "grad_norm": 0.03252498432993889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315850973129272, | |
| "mean_token_accuracy": 0.7825820297002792, | |
| "num_tokens": 9644837.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5471898764371872, | |
| "epoch": 2.225563909774436, | |
| "grad_norm": 0.048182275146245956, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5472801923751831, | |
| "mean_token_accuracy": 0.776175931096077, | |
| "num_tokens": 9661070.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.5267005264759064, | |
| "epoch": 2.2293233082706765, | |
| "grad_norm": 0.04179242253303528, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309768319129944, | |
| "mean_token_accuracy": 0.7826364785432816, | |
| "num_tokens": 9677378.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5416758507490158, | |
| "epoch": 2.2330827067669174, | |
| "grad_norm": 0.04981589689850807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549900472164154, | |
| "mean_token_accuracy": 0.7765727639198303, | |
| "num_tokens": 9693819.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5369458198547363, | |
| "epoch": 2.236842105263158, | |
| "grad_norm": 0.051439523696899414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440854430198669, | |
| "mean_token_accuracy": 0.7789760231971741, | |
| "num_tokens": 9710189.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.5342868715524673, | |
| "epoch": 2.2406015037593985, | |
| "grad_norm": 0.04235680773854256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430835485458374, | |
| "mean_token_accuracy": 0.7785050868988037, | |
| "num_tokens": 9726526.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5481905192136765, | |
| "epoch": 2.244360902255639, | |
| "grad_norm": 0.044252388179302216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456714034080505, | |
| "mean_token_accuracy": 0.7800015658140182, | |
| "num_tokens": 9742892.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5490403324365616, | |
| "epoch": 2.2481203007518795, | |
| "grad_norm": 0.036522816866636276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348387956619263, | |
| "mean_token_accuracy": 0.7838009893894196, | |
| "num_tokens": 9759316.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5373188108205795, | |
| "epoch": 2.2518796992481205, | |
| "grad_norm": 0.0484786219894886, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393818616867065, | |
| "mean_token_accuracy": 0.7799521684646606, | |
| "num_tokens": 9775422.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5350137799978256, | |
| "epoch": 2.255639097744361, | |
| "grad_norm": 0.03971916437149048, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390014052391052, | |
| "mean_token_accuracy": 0.7825258076190948, | |
| "num_tokens": 9791645.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.529654249548912, | |
| "epoch": 2.2593984962406015, | |
| "grad_norm": 0.03677717223763466, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5347926020622253, | |
| "mean_token_accuracy": 0.7820286452770233, | |
| "num_tokens": 9807863.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.5160931199789047, | |
| "epoch": 2.263157894736842, | |
| "grad_norm": 0.04103193059563637, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219160914421082, | |
| "mean_token_accuracy": 0.7898968160152435, | |
| "num_tokens": 9823834.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.547026053071022, | |
| "epoch": 2.2669172932330826, | |
| "grad_norm": 0.035431135445833206, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5403215289115906, | |
| "mean_token_accuracy": 0.7804599404335022, | |
| "num_tokens": 9840527.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5330915451049805, | |
| "epoch": 2.2706766917293235, | |
| "grad_norm": 0.03688134625554085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308654308319092, | |
| "mean_token_accuracy": 0.7851675152778625, | |
| "num_tokens": 9856677.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5384332090616226, | |
| "epoch": 2.274436090225564, | |
| "grad_norm": 0.04168199747800827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5318323373794556, | |
| "mean_token_accuracy": 0.7833025008440018, | |
| "num_tokens": 9872958.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5483455657958984, | |
| "epoch": 2.2781954887218046, | |
| "grad_norm": 0.0458533950150013, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5497722625732422, | |
| "mean_token_accuracy": 0.7783730030059814, | |
| "num_tokens": 9889301.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5242274850606918, | |
| "epoch": 2.281954887218045, | |
| "grad_norm": 0.03992198407649994, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5323127508163452, | |
| "mean_token_accuracy": 0.7856701463460922, | |
| "num_tokens": 9905738.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5306910574436188, | |
| "epoch": 2.2857142857142856, | |
| "grad_norm": 0.03714906424283981, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334057807922363, | |
| "mean_token_accuracy": 0.7845153957605362, | |
| "num_tokens": 9922153.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5255761742591858, | |
| "epoch": 2.2894736842105265, | |
| "grad_norm": 0.037783432751894, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267370343208313, | |
| "mean_token_accuracy": 0.7860815078020096, | |
| "num_tokens": 9938520.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.528737261891365, | |
| "epoch": 2.293233082706767, | |
| "grad_norm": 0.03467050567269325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269864797592163, | |
| "mean_token_accuracy": 0.789274126291275, | |
| "num_tokens": 9954806.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5392419397830963, | |
| "epoch": 2.2969924812030076, | |
| "grad_norm": 0.03630411997437477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344975590705872, | |
| "mean_token_accuracy": 0.7834292352199554, | |
| "num_tokens": 9971123.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.5148891359567642, | |
| "epoch": 2.300751879699248, | |
| "grad_norm": 0.03637854382395744, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5145090222358704, | |
| "mean_token_accuracy": 0.7894360274076462, | |
| "num_tokens": 9987229.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.538021132349968, | |
| "epoch": 2.3045112781954886, | |
| "grad_norm": 0.03751857578754425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541398286819458, | |
| "mean_token_accuracy": 0.7807863056659698, | |
| "num_tokens": 10003519.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5272123515605927, | |
| "epoch": 2.308270676691729, | |
| "grad_norm": 0.04051438719034195, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344090461730957, | |
| "mean_token_accuracy": 0.7857641130685806, | |
| "num_tokens": 10019993.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5179824233055115, | |
| "epoch": 2.31203007518797, | |
| "grad_norm": 0.04479973390698433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279502272605896, | |
| "mean_token_accuracy": 0.7859090268611908, | |
| "num_tokens": 10036196.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5467290729284286, | |
| "epoch": 2.3157894736842106, | |
| "grad_norm": 0.03927797079086304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486882328987122, | |
| "mean_token_accuracy": 0.7768010795116425, | |
| "num_tokens": 10052474.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5408567190170288, | |
| "epoch": 2.319548872180451, | |
| "grad_norm": 0.03986404091119766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5317103862762451, | |
| "mean_token_accuracy": 0.7851662039756775, | |
| "num_tokens": 10068775.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5392286479473114, | |
| "epoch": 2.3233082706766917, | |
| "grad_norm": 0.03838985413312912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530458927154541, | |
| "mean_token_accuracy": 0.7848429083824158, | |
| "num_tokens": 10084946.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5223991498351097, | |
| "epoch": 2.327067669172932, | |
| "grad_norm": 0.03357016295194626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5164550542831421, | |
| "mean_token_accuracy": 0.7903633117675781, | |
| "num_tokens": 10101221.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5287820845842361, | |
| "epoch": 2.3308270676691727, | |
| "grad_norm": 0.041184201836586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312986373901367, | |
| "mean_token_accuracy": 0.7844579666852951, | |
| "num_tokens": 10117440.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5136409252882004, | |
| "epoch": 2.3345864661654137, | |
| "grad_norm": 0.044375885277986526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5256669521331787, | |
| "mean_token_accuracy": 0.7870495319366455, | |
| "num_tokens": 10133537.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5296864807605743, | |
| "epoch": 2.338345864661654, | |
| "grad_norm": 0.043142594397068024, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372653007507324, | |
| "mean_token_accuracy": 0.7797198593616486, | |
| "num_tokens": 10149832.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5296363830566406, | |
| "epoch": 2.3421052631578947, | |
| "grad_norm": 0.04168247431516647, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342837572097778, | |
| "mean_token_accuracy": 0.7827459424734116, | |
| "num_tokens": 10166206.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5279521271586418, | |
| "epoch": 2.345864661654135, | |
| "grad_norm": 0.03668156638741493, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243417024612427, | |
| "mean_token_accuracy": 0.7867815494537354, | |
| "num_tokens": 10182574.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5396132320165634, | |
| "epoch": 2.3496240601503757, | |
| "grad_norm": 0.040590520948171616, | |
| "learning_rate": 0.0002, | |
| "loss": 0.534129798412323, | |
| "mean_token_accuracy": 0.7840494364500046, | |
| "num_tokens": 10198963.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.5384691059589386, | |
| "epoch": 2.3533834586466167, | |
| "grad_norm": 0.03799832612276077, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5275224447250366, | |
| "mean_token_accuracy": 0.788055807352066, | |
| "num_tokens": 10215363.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5355971157550812, | |
| "epoch": 2.357142857142857, | |
| "grad_norm": 0.03812744468450546, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373313426971436, | |
| "mean_token_accuracy": 0.7830821126699448, | |
| "num_tokens": 10231721.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5379942953586578, | |
| "epoch": 2.3609022556390977, | |
| "grad_norm": 0.04219618812203407, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430394411087036, | |
| "mean_token_accuracy": 0.779607817530632, | |
| "num_tokens": 10248150.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5369090437889099, | |
| "epoch": 2.3646616541353382, | |
| "grad_norm": 0.04251544550061226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445953011512756, | |
| "mean_token_accuracy": 0.778522789478302, | |
| "num_tokens": 10264414.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5455975085496902, | |
| "epoch": 2.3684210526315788, | |
| "grad_norm": 0.04128441959619522, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464663505554199, | |
| "mean_token_accuracy": 0.7782220393419266, | |
| "num_tokens": 10280655.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5499599725008011, | |
| "epoch": 2.3721804511278197, | |
| "grad_norm": 0.0386635959148407, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542563259601593, | |
| "mean_token_accuracy": 0.7798319011926651, | |
| "num_tokens": 10297357.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.5534010380506516, | |
| "epoch": 2.3759398496240602, | |
| "grad_norm": 0.040974393486976624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562258362770081, | |
| "mean_token_accuracy": 0.7761926651000977, | |
| "num_tokens": 10313788.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5357997566461563, | |
| "epoch": 2.3796992481203008, | |
| "grad_norm": 0.03751135990023613, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5311724543571472, | |
| "mean_token_accuracy": 0.7860594242811203, | |
| "num_tokens": 10330164.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5399480760097504, | |
| "epoch": 2.3834586466165413, | |
| "grad_norm": 0.0392535962164402, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405341982841492, | |
| "mean_token_accuracy": 0.782960519194603, | |
| "num_tokens": 10346587.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5351511463522911, | |
| "epoch": 2.387218045112782, | |
| "grad_norm": 0.04137985408306122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5435580611228943, | |
| "mean_token_accuracy": 0.7791251242160797, | |
| "num_tokens": 10362964.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5337197929620743, | |
| "epoch": 2.3909774436090228, | |
| "grad_norm": 0.04529615119099617, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54475998878479, | |
| "mean_token_accuracy": 0.7794527411460876, | |
| "num_tokens": 10379194.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.5295632779598236, | |
| "epoch": 2.3947368421052633, | |
| "grad_norm": 0.03818366676568985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53121417760849, | |
| "mean_token_accuracy": 0.7843088060617447, | |
| "num_tokens": 10395289.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5338181853294373, | |
| "epoch": 2.398496240601504, | |
| "grad_norm": 0.04155934602022171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5273146033287048, | |
| "mean_token_accuracy": 0.7871305495500565, | |
| "num_tokens": 10411478.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5275490283966064, | |
| "epoch": 2.4022556390977443, | |
| "grad_norm": 0.03884044289588928, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5259033441543579, | |
| "mean_token_accuracy": 0.7865510582923889, | |
| "num_tokens": 10428000.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5296481549739838, | |
| "epoch": 2.406015037593985, | |
| "grad_norm": 0.03892350569367409, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338611602783203, | |
| "mean_token_accuracy": 0.7841958701610565, | |
| "num_tokens": 10444531.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5326656997203827, | |
| "epoch": 2.409774436090226, | |
| "grad_norm": 0.04130466282367706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334239602088928, | |
| "mean_token_accuracy": 0.7844693660736084, | |
| "num_tokens": 10460884.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.5167141184210777, | |
| "epoch": 2.4135338345864663, | |
| "grad_norm": 0.04298912361264229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5224160552024841, | |
| "mean_token_accuracy": 0.790846198797226, | |
| "num_tokens": 10476946.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5394491106271744, | |
| "epoch": 2.417293233082707, | |
| "grad_norm": 0.0389692522585392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456172823905945, | |
| "mean_token_accuracy": 0.7784712016582489, | |
| "num_tokens": 10493157.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5317131578922272, | |
| "epoch": 2.4210526315789473, | |
| "grad_norm": 0.03282848745584488, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5272088050842285, | |
| "mean_token_accuracy": 0.7835191786289215, | |
| "num_tokens": 10509339.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.5249821543693542, | |
| "epoch": 2.424812030075188, | |
| "grad_norm": 0.03486508131027222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219942927360535, | |
| "mean_token_accuracy": 0.787269338965416, | |
| "num_tokens": 10525556.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5392860472202301, | |
| "epoch": 2.4285714285714284, | |
| "grad_norm": 0.03448896110057831, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338496565818787, | |
| "mean_token_accuracy": 0.7829862833023071, | |
| "num_tokens": 10541761.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5386904329061508, | |
| "epoch": 2.4323308270676693, | |
| "grad_norm": 0.037768758833408356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425961017608643, | |
| "mean_token_accuracy": 0.7781831622123718, | |
| "num_tokens": 10558311.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.5251231044530869, | |
| "epoch": 2.43609022556391, | |
| "grad_norm": 0.03807547688484192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5291208624839783, | |
| "mean_token_accuracy": 0.783474326133728, | |
| "num_tokens": 10574696.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5356583297252655, | |
| "epoch": 2.4398496240601504, | |
| "grad_norm": 0.03421357646584511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5309426188468933, | |
| "mean_token_accuracy": 0.7826003879308701, | |
| "num_tokens": 10591225.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5321584492921829, | |
| "epoch": 2.443609022556391, | |
| "grad_norm": 0.04219021648168564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343624353408813, | |
| "mean_token_accuracy": 0.7819913923740387, | |
| "num_tokens": 10607648.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5409150719642639, | |
| "epoch": 2.4473684210526314, | |
| "grad_norm": 0.039848409593105316, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406517386436462, | |
| "mean_token_accuracy": 0.7809206694364548, | |
| "num_tokens": 10623965.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5184071511030197, | |
| "epoch": 2.451127819548872, | |
| "grad_norm": 0.04401297867298126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264937877655029, | |
| "mean_token_accuracy": 0.7875054776668549, | |
| "num_tokens": 10640111.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5153327658772469, | |
| "epoch": 2.454887218045113, | |
| "grad_norm": 0.037109002470970154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220255255699158, | |
| "mean_token_accuracy": 0.7878341674804688, | |
| "num_tokens": 10656391.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.534611888229847, | |
| "epoch": 2.4586466165413534, | |
| "grad_norm": 0.047087740153074265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327281951904297, | |
| "mean_token_accuracy": 0.7858874797821045, | |
| "num_tokens": 10672550.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5468750447034836, | |
| "epoch": 2.462406015037594, | |
| "grad_norm": 0.03793250396847725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467609167098999, | |
| "mean_token_accuracy": 0.7752472460269928, | |
| "num_tokens": 10688678.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5618661195039749, | |
| "epoch": 2.4661654135338344, | |
| "grad_norm": 0.043232064694166183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557094395160675, | |
| "mean_token_accuracy": 0.7767215073108673, | |
| "num_tokens": 10705231.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5481238514184952, | |
| "epoch": 2.469924812030075, | |
| "grad_norm": 0.04276246577501297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488662719726562, | |
| "mean_token_accuracy": 0.780038595199585, | |
| "num_tokens": 10721712.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5505738407373428, | |
| "epoch": 2.473684210526316, | |
| "grad_norm": 0.040987517684698105, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510429739952087, | |
| "mean_token_accuracy": 0.7774406224489212, | |
| "num_tokens": 10737970.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5473013371229172, | |
| "epoch": 2.4774436090225564, | |
| "grad_norm": 0.051042236387729645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507328510284424, | |
| "mean_token_accuracy": 0.7794748395681381, | |
| "num_tokens": 10754101.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5286405235528946, | |
| "epoch": 2.481203007518797, | |
| "grad_norm": 0.04263005033135414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302000045776367, | |
| "mean_token_accuracy": 0.7844719737768173, | |
| "num_tokens": 10770357.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5383267849683762, | |
| "epoch": 2.4849624060150375, | |
| "grad_norm": 0.03854911029338837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54207444190979, | |
| "mean_token_accuracy": 0.7791945487260818, | |
| "num_tokens": 10786804.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5230704694986343, | |
| "epoch": 2.488721804511278, | |
| "grad_norm": 0.04200039431452751, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5254136919975281, | |
| "mean_token_accuracy": 0.7850333154201508, | |
| "num_tokens": 10802992.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5294183790683746, | |
| "epoch": 2.492481203007519, | |
| "grad_norm": 0.04227717965841293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372048616409302, | |
| "mean_token_accuracy": 0.7844373136758804, | |
| "num_tokens": 10819187.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5186149403452873, | |
| "epoch": 2.4962406015037595, | |
| "grad_norm": 0.03944484889507294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234470367431641, | |
| "mean_token_accuracy": 0.7857441008090973, | |
| "num_tokens": 10835170.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5416997969150543, | |
| "epoch": 2.5, | |
| "grad_norm": 0.043196793645620346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474759936332703, | |
| "mean_token_accuracy": 0.7749510407447815, | |
| "num_tokens": 10851563.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.5275483727455139, | |
| "epoch": 2.5037593984962405, | |
| "grad_norm": 0.03911745548248291, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5205013155937195, | |
| "mean_token_accuracy": 0.7898803949356079, | |
| "num_tokens": 10867571.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5302275121212006, | |
| "epoch": 2.507518796992481, | |
| "grad_norm": 0.03766452148556709, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310875773429871, | |
| "mean_token_accuracy": 0.7819045037031174, | |
| "num_tokens": 10883849.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5416832715272903, | |
| "epoch": 2.511278195488722, | |
| "grad_norm": 0.03993174061179161, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426294207572937, | |
| "mean_token_accuracy": 0.7777436971664429, | |
| "num_tokens": 10900103.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.5554288029670715, | |
| "epoch": 2.5150375939849625, | |
| "grad_norm": 0.046043481677770615, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500344634056091, | |
| "mean_token_accuracy": 0.7746063023805618, | |
| "num_tokens": 10916472.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5500206649303436, | |
| "epoch": 2.518796992481203, | |
| "grad_norm": 0.04341411218047142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484751462936401, | |
| "mean_token_accuracy": 0.7778518944978714, | |
| "num_tokens": 10932960.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.5585402101278305, | |
| "epoch": 2.5225563909774436, | |
| "grad_norm": 0.04927565157413483, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563656091690063, | |
| "mean_token_accuracy": 0.7734353542327881, | |
| "num_tokens": 10949340.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5314253345131874, | |
| "epoch": 2.526315789473684, | |
| "grad_norm": 0.04110320657491684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5281319618225098, | |
| "mean_token_accuracy": 0.7881615608930588, | |
| "num_tokens": 10965640.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.519628070294857, | |
| "epoch": 2.530075187969925, | |
| "grad_norm": 0.03798144683241844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5186299085617065, | |
| "mean_token_accuracy": 0.7885057926177979, | |
| "num_tokens": 10982162.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5199308693408966, | |
| "epoch": 2.5338345864661656, | |
| "grad_norm": 0.04168830066919327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5289560556411743, | |
| "mean_token_accuracy": 0.7860239744186401, | |
| "num_tokens": 10998283.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5352334305644035, | |
| "epoch": 2.537593984962406, | |
| "grad_norm": 0.04851493611931801, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395171642303467, | |
| "mean_token_accuracy": 0.781098335981369, | |
| "num_tokens": 11014541.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5220839083194733, | |
| "epoch": 2.5413533834586466, | |
| "grad_norm": 0.03901033103466034, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5202946662902832, | |
| "mean_token_accuracy": 0.7897375226020813, | |
| "num_tokens": 11030626.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5660356432199478, | |
| "epoch": 2.545112781954887, | |
| "grad_norm": 0.040614161640405655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5683348774909973, | |
| "mean_token_accuracy": 0.7686392664909363, | |
| "num_tokens": 11047170.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5248497724533081, | |
| "epoch": 2.548872180451128, | |
| "grad_norm": 0.050087373703718185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326120257377625, | |
| "mean_token_accuracy": 0.7856886386871338, | |
| "num_tokens": 11063651.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5423640608787537, | |
| "epoch": 2.5526315789473686, | |
| "grad_norm": 0.05331513658165932, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449936389923096, | |
| "mean_token_accuracy": 0.778554379940033, | |
| "num_tokens": 11080048.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5384076982736588, | |
| "epoch": 2.556390977443609, | |
| "grad_norm": 0.04410131275653839, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350104570388794, | |
| "mean_token_accuracy": 0.7837571948766708, | |
| "num_tokens": 11096391.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.529449462890625, | |
| "epoch": 2.5601503759398496, | |
| "grad_norm": 0.03738116845488548, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299030542373657, | |
| "mean_token_accuracy": 0.7870044708251953, | |
| "num_tokens": 11112709.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5311971455812454, | |
| "epoch": 2.56390977443609, | |
| "grad_norm": 0.04492153227329254, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362582206726074, | |
| "mean_token_accuracy": 0.780634418129921, | |
| "num_tokens": 11129093.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5400303602218628, | |
| "epoch": 2.567669172932331, | |
| "grad_norm": 0.036020781844854355, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404684543609619, | |
| "mean_token_accuracy": 0.7825169265270233, | |
| "num_tokens": 11145314.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5410858988761902, | |
| "epoch": 2.571428571428571, | |
| "grad_norm": 0.04276980832219124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5423122048377991, | |
| "mean_token_accuracy": 0.7814541161060333, | |
| "num_tokens": 11161581.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5380300432443619, | |
| "epoch": 2.575187969924812, | |
| "grad_norm": 0.03481379151344299, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358370542526245, | |
| "mean_token_accuracy": 0.7818766683340073, | |
| "num_tokens": 11177989.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5248596295714378, | |
| "epoch": 2.5789473684210527, | |
| "grad_norm": 0.036602359265089035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5253828763961792, | |
| "mean_token_accuracy": 0.7854669690132141, | |
| "num_tokens": 11194032.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.5219234973192215, | |
| "epoch": 2.582706766917293, | |
| "grad_norm": 0.040489669889211655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5243583917617798, | |
| "mean_token_accuracy": 0.786599799990654, | |
| "num_tokens": 11210092.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.5334769785404205, | |
| "epoch": 2.5864661654135337, | |
| "grad_norm": 0.03958981856703758, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376310348510742, | |
| "mean_token_accuracy": 0.7825024574995041, | |
| "num_tokens": 11226462.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5297794789075851, | |
| "epoch": 2.590225563909774, | |
| "grad_norm": 0.039997756481170654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335977077484131, | |
| "mean_token_accuracy": 0.7828920185565948, | |
| "num_tokens": 11242781.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.535497397184372, | |
| "epoch": 2.593984962406015, | |
| "grad_norm": 0.03865867853164673, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379775762557983, | |
| "mean_token_accuracy": 0.7825619131326675, | |
| "num_tokens": 11259131.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.5340843796730042, | |
| "epoch": 2.5977443609022557, | |
| "grad_norm": 0.037679754197597504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335901975631714, | |
| "mean_token_accuracy": 0.7848968952894211, | |
| "num_tokens": 11275370.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.5506868213415146, | |
| "epoch": 2.601503759398496, | |
| "grad_norm": 0.04139415919780731, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515389442443848, | |
| "mean_token_accuracy": 0.7779832780361176, | |
| "num_tokens": 11291675.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5458535552024841, | |
| "epoch": 2.6052631578947367, | |
| "grad_norm": 0.03914312273263931, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428761839866638, | |
| "mean_token_accuracy": 0.7802267819643021, | |
| "num_tokens": 11308082.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5242106392979622, | |
| "epoch": 2.6090225563909772, | |
| "grad_norm": 0.03517727553844452, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5183535218238831, | |
| "mean_token_accuracy": 0.7899799644947052, | |
| "num_tokens": 11324349.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.527122899889946, | |
| "epoch": 2.612781954887218, | |
| "grad_norm": 0.03646351397037506, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5237759351730347, | |
| "mean_token_accuracy": 0.7876067459583282, | |
| "num_tokens": 11340804.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5334932953119278, | |
| "epoch": 2.6165413533834587, | |
| "grad_norm": 0.03501564636826515, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345377326011658, | |
| "mean_token_accuracy": 0.7828026562929153, | |
| "num_tokens": 11357207.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5264469981193542, | |
| "epoch": 2.6203007518796992, | |
| "grad_norm": 0.042768895626068115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306587219238281, | |
| "mean_token_accuracy": 0.7863332629203796, | |
| "num_tokens": 11373543.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5400331318378448, | |
| "epoch": 2.6240601503759398, | |
| "grad_norm": 0.03265206515789032, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5402212142944336, | |
| "mean_token_accuracy": 0.7809455096721649, | |
| "num_tokens": 11390155.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.5565398335456848, | |
| "epoch": 2.6278195488721803, | |
| "grad_norm": 0.04417556896805763, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573287010192871, | |
| "mean_token_accuracy": 0.7738644480705261, | |
| "num_tokens": 11406739.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5443829298019409, | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 0.03721097856760025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420445799827576, | |
| "mean_token_accuracy": 0.7787856310606003, | |
| "num_tokens": 11423213.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5284033268690109, | |
| "epoch": 2.6353383458646618, | |
| "grad_norm": 0.041038673371076584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5301244258880615, | |
| "mean_token_accuracy": 0.7856591492891312, | |
| "num_tokens": 11439231.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5442045629024506, | |
| "epoch": 2.6390977443609023, | |
| "grad_norm": 0.03640377148985863, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464366674423218, | |
| "mean_token_accuracy": 0.7776281535625458, | |
| "num_tokens": 11455738.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.5383570641279221, | |
| "epoch": 2.642857142857143, | |
| "grad_norm": 0.04412476718425751, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544456422328949, | |
| "mean_token_accuracy": 0.7783865183591843, | |
| "num_tokens": 11471797.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5191052407026291, | |
| "epoch": 2.6466165413533833, | |
| "grad_norm": 0.035958074033260345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5193113088607788, | |
| "mean_token_accuracy": 0.7863477617502213, | |
| "num_tokens": 11487876.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5466601550579071, | |
| "epoch": 2.6503759398496243, | |
| "grad_norm": 0.048238396644592285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443681478500366, | |
| "mean_token_accuracy": 0.7801824659109116, | |
| "num_tokens": 11504122.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5602389425039291, | |
| "epoch": 2.654135338345865, | |
| "grad_norm": 0.0392533615231514, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5607460141181946, | |
| "mean_token_accuracy": 0.7710349410772324, | |
| "num_tokens": 11520493.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.5393271297216415, | |
| "epoch": 2.6578947368421053, | |
| "grad_norm": 0.046152085065841675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473223924636841, | |
| "mean_token_accuracy": 0.7810050994157791, | |
| "num_tokens": 11536519.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.5321537107229233, | |
| "epoch": 2.661654135338346, | |
| "grad_norm": 0.038532763719558716, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388097763061523, | |
| "mean_token_accuracy": 0.7796639204025269, | |
| "num_tokens": 11552787.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5336644947528839, | |
| "epoch": 2.6654135338345863, | |
| "grad_norm": 0.043611474335193634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328789949417114, | |
| "mean_token_accuracy": 0.7849068492650986, | |
| "num_tokens": 11569073.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5428521186113358, | |
| "epoch": 2.6691729323308273, | |
| "grad_norm": 0.03883448615670204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5391871333122253, | |
| "mean_token_accuracy": 0.781522735953331, | |
| "num_tokens": 11585504.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5335109233856201, | |
| "epoch": 2.672932330827068, | |
| "grad_norm": 0.03785593435168266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298542976379395, | |
| "mean_token_accuracy": 0.7834679186344147, | |
| "num_tokens": 11601813.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.527670718729496, | |
| "epoch": 2.6766917293233083, | |
| "grad_norm": 0.036839164793491364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316509008407593, | |
| "mean_token_accuracy": 0.7826409935951233, | |
| "num_tokens": 11618283.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5326329097151756, | |
| "epoch": 2.680451127819549, | |
| "grad_norm": 0.04807848483324051, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426601767539978, | |
| "mean_token_accuracy": 0.7812999188899994, | |
| "num_tokens": 11634632.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5393012017011642, | |
| "epoch": 2.6842105263157894, | |
| "grad_norm": 0.038986288011074066, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428729057312012, | |
| "mean_token_accuracy": 0.7807578444480896, | |
| "num_tokens": 11650999.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5483723729848862, | |
| "epoch": 2.6879699248120303, | |
| "grad_norm": 0.03780362382531166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442914366722107, | |
| "mean_token_accuracy": 0.7784056067466736, | |
| "num_tokens": 11667151.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.547231912612915, | |
| "epoch": 2.6917293233082704, | |
| "grad_norm": 0.045203741639852524, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431523323059082, | |
| "mean_token_accuracy": 0.7817295789718628, | |
| "num_tokens": 11683514.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5371780097484589, | |
| "epoch": 2.6954887218045114, | |
| "grad_norm": 0.03749014437198639, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376321077346802, | |
| "mean_token_accuracy": 0.7811625152826309, | |
| "num_tokens": 11699727.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5319441854953766, | |
| "epoch": 2.699248120300752, | |
| "grad_norm": 0.04130973294377327, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5348937511444092, | |
| "mean_token_accuracy": 0.784428283572197, | |
| "num_tokens": 11716234.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5342800319194794, | |
| "epoch": 2.7030075187969924, | |
| "grad_norm": 0.04313354194164276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452970266342163, | |
| "mean_token_accuracy": 0.7770380526781082, | |
| "num_tokens": 11732506.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.5398904979228973, | |
| "epoch": 2.706766917293233, | |
| "grad_norm": 0.04417818412184715, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5421609878540039, | |
| "mean_token_accuracy": 0.7809232920408249, | |
| "num_tokens": 11748768.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5440465807914734, | |
| "epoch": 2.7105263157894735, | |
| "grad_norm": 0.036389391869306564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376783609390259, | |
| "mean_token_accuracy": 0.7818926721811295, | |
| "num_tokens": 11765164.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5312932878732681, | |
| "epoch": 2.7142857142857144, | |
| "grad_norm": 0.037032727152109146, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5279201865196228, | |
| "mean_token_accuracy": 0.7845446914434433, | |
| "num_tokens": 11781577.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.5704400539398193, | |
| "epoch": 2.718045112781955, | |
| "grad_norm": 0.03669275715947151, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5670531988143921, | |
| "mean_token_accuracy": 0.7707259953022003, | |
| "num_tokens": 11798120.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5271944850683212, | |
| "epoch": 2.7218045112781954, | |
| "grad_norm": 0.04460054636001587, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531152606010437, | |
| "mean_token_accuracy": 0.7819943279027939, | |
| "num_tokens": 11814241.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5407906174659729, | |
| "epoch": 2.725563909774436, | |
| "grad_norm": 0.04240792244672775, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359742045402527, | |
| "mean_token_accuracy": 0.7843276411294937, | |
| "num_tokens": 11830762.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.538364827632904, | |
| "epoch": 2.7293233082706765, | |
| "grad_norm": 0.04200772941112518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5396072864532471, | |
| "mean_token_accuracy": 0.7798211723566055, | |
| "num_tokens": 11847252.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5308995842933655, | |
| "epoch": 2.7330827067669174, | |
| "grad_norm": 0.03762137144804001, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341114401817322, | |
| "mean_token_accuracy": 0.7839807718992233, | |
| "num_tokens": 11863535.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5268086791038513, | |
| "epoch": 2.736842105263158, | |
| "grad_norm": 0.03609534725546837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5221338868141174, | |
| "mean_token_accuracy": 0.789483904838562, | |
| "num_tokens": 11879928.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5412466526031494, | |
| "epoch": 2.7406015037593985, | |
| "grad_norm": 0.040453530848026276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429666042327881, | |
| "mean_token_accuracy": 0.7812945246696472, | |
| "num_tokens": 11896142.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5352004170417786, | |
| "epoch": 2.744360902255639, | |
| "grad_norm": 0.044242773205041885, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536725640296936, | |
| "mean_token_accuracy": 0.7831927388906479, | |
| "num_tokens": 11912241.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5453604012727737, | |
| "epoch": 2.7481203007518795, | |
| "grad_norm": 0.0423831045627594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527924299240112, | |
| "mean_token_accuracy": 0.7745030075311661, | |
| "num_tokens": 11928611.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5306564420461655, | |
| "epoch": 2.7518796992481205, | |
| "grad_norm": 0.0449826754629612, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5404161214828491, | |
| "mean_token_accuracy": 0.7825066149234772, | |
| "num_tokens": 11944963.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5378609150648117, | |
| "epoch": 2.755639097744361, | |
| "grad_norm": 0.04047499597072601, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455936193466187, | |
| "mean_token_accuracy": 0.7781111598014832, | |
| "num_tokens": 11961304.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.5367683172225952, | |
| "epoch": 2.7593984962406015, | |
| "grad_norm": 0.04174184799194336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363747477531433, | |
| "mean_token_accuracy": 0.7800599485635757, | |
| "num_tokens": 11977719.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5561744570732117, | |
| "epoch": 2.763157894736842, | |
| "grad_norm": 0.04008743166923523, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552983283996582, | |
| "mean_token_accuracy": 0.7766020447015762, | |
| "num_tokens": 11993844.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5463001132011414, | |
| "epoch": 2.7669172932330826, | |
| "grad_norm": 0.03661397472023964, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395646691322327, | |
| "mean_token_accuracy": 0.7784713059663773, | |
| "num_tokens": 12010281.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5210074186325073, | |
| "epoch": 2.7706766917293235, | |
| "grad_norm": 0.03591572865843773, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220502018928528, | |
| "mean_token_accuracy": 0.7874239087104797, | |
| "num_tokens": 12026530.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5433954000473022, | |
| "epoch": 2.774436090225564, | |
| "grad_norm": 0.04104798287153244, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510661005973816, | |
| "mean_token_accuracy": 0.7753429859876633, | |
| "num_tokens": 12042889.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5119400694966316, | |
| "epoch": 2.7781954887218046, | |
| "grad_norm": 0.039529718458652496, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5171459317207336, | |
| "mean_token_accuracy": 0.7895881831645966, | |
| "num_tokens": 12059138.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5456018000841141, | |
| "epoch": 2.781954887218045, | |
| "grad_norm": 0.03834446892142296, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516197681427002, | |
| "mean_token_accuracy": 0.7791079431772232, | |
| "num_tokens": 12075629.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5416502356529236, | |
| "epoch": 2.7857142857142856, | |
| "grad_norm": 0.03950374945998192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541545033454895, | |
| "mean_token_accuracy": 0.7776272892951965, | |
| "num_tokens": 12091966.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.5439035892486572, | |
| "epoch": 2.7894736842105265, | |
| "grad_norm": 0.03714444488286972, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5373456478118896, | |
| "mean_token_accuracy": 0.7819632142782211, | |
| "num_tokens": 12108429.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5513075590133667, | |
| "epoch": 2.793233082706767, | |
| "grad_norm": 0.03567977994680405, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416471362113953, | |
| "mean_token_accuracy": 0.7816196233034134, | |
| "num_tokens": 12124997.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5525044798851013, | |
| "epoch": 2.7969924812030076, | |
| "grad_norm": 0.036792755126953125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522248148918152, | |
| "mean_token_accuracy": 0.7766036689281464, | |
| "num_tokens": 12141338.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.522551566362381, | |
| "epoch": 2.800751879699248, | |
| "grad_norm": 0.03983981907367706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232869982719421, | |
| "mean_token_accuracy": 0.7857565432786942, | |
| "num_tokens": 12157683.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5314129739999771, | |
| "epoch": 2.8045112781954886, | |
| "grad_norm": 0.03918331488966942, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5321224927902222, | |
| "mean_token_accuracy": 0.7834707945585251, | |
| "num_tokens": 12174145.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5208713561296463, | |
| "epoch": 2.8082706766917296, | |
| "grad_norm": 0.03813806548714638, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5278118848800659, | |
| "mean_token_accuracy": 0.7842634320259094, | |
| "num_tokens": 12190434.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.5349813252687454, | |
| "epoch": 2.8120300751879697, | |
| "grad_norm": 0.04137561097741127, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378336906433105, | |
| "mean_token_accuracy": 0.7831988483667374, | |
| "num_tokens": 12206552.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.529716819524765, | |
| "epoch": 2.8157894736842106, | |
| "grad_norm": 0.037089038640260696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530727744102478, | |
| "mean_token_accuracy": 0.787126213312149, | |
| "num_tokens": 12222985.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5329919755458832, | |
| "epoch": 2.819548872180451, | |
| "grad_norm": 0.03868598863482475, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535510241985321, | |
| "mean_token_accuracy": 0.7821749895811081, | |
| "num_tokens": 12239387.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5512770563364029, | |
| "epoch": 2.8233082706766917, | |
| "grad_norm": 0.03504098951816559, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498230457305908, | |
| "mean_token_accuracy": 0.77789406478405, | |
| "num_tokens": 12255678.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5387983024120331, | |
| "epoch": 2.827067669172932, | |
| "grad_norm": 0.04012952372431755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449475049972534, | |
| "mean_token_accuracy": 0.7773616015911102, | |
| "num_tokens": 12271735.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5438449382781982, | |
| "epoch": 2.8308270676691727, | |
| "grad_norm": 0.04448486492037773, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5473355650901794, | |
| "mean_token_accuracy": 0.7765258699655533, | |
| "num_tokens": 12288034.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.5242600291967392, | |
| "epoch": 2.8345864661654137, | |
| "grad_norm": 0.03874325752258301, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232968330383301, | |
| "mean_token_accuracy": 0.7877610623836517, | |
| "num_tokens": 12304188.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5431344211101532, | |
| "epoch": 2.838345864661654, | |
| "grad_norm": 0.04510108754038811, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374618768692017, | |
| "mean_token_accuracy": 0.783510684967041, | |
| "num_tokens": 12320210.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.566683366894722, | |
| "epoch": 2.8421052631578947, | |
| "grad_norm": 0.038339611142873764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602604746818542, | |
| "mean_token_accuracy": 0.7746738642454147, | |
| "num_tokens": 12336736.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.5256731361150742, | |
| "epoch": 2.845864661654135, | |
| "grad_norm": 0.04725516587495804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308937430381775, | |
| "mean_token_accuracy": 0.7819661647081375, | |
| "num_tokens": 12353304.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5368983596563339, | |
| "epoch": 2.8496240601503757, | |
| "grad_norm": 0.04469098895788193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494676828384399, | |
| "mean_token_accuracy": 0.7781397998332977, | |
| "num_tokens": 12369897.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5407442450523376, | |
| "epoch": 2.8533834586466167, | |
| "grad_norm": 0.04544219374656677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484528541564941, | |
| "mean_token_accuracy": 0.7776692062616348, | |
| "num_tokens": 12385920.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5232048332691193, | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 0.03687431663274765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5165009498596191, | |
| "mean_token_accuracy": 0.789492592215538, | |
| "num_tokens": 12402444.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5273272693157196, | |
| "epoch": 2.8609022556390977, | |
| "grad_norm": 0.037794262170791626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5232701301574707, | |
| "mean_token_accuracy": 0.788696900010109, | |
| "num_tokens": 12418988.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5304031819105148, | |
| "epoch": 2.8646616541353382, | |
| "grad_norm": 0.038420420140028, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247512459754944, | |
| "mean_token_accuracy": 0.7857597023248672, | |
| "num_tokens": 12435536.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5269620269536972, | |
| "epoch": 2.8684210526315788, | |
| "grad_norm": 0.04084121063351631, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284534692764282, | |
| "mean_token_accuracy": 0.7831205129623413, | |
| "num_tokens": 12451737.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.5162742882966995, | |
| "epoch": 2.8721804511278197, | |
| "grad_norm": 0.04410441219806671, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282053351402283, | |
| "mean_token_accuracy": 0.7836557477712631, | |
| "num_tokens": 12467925.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5351501703262329, | |
| "epoch": 2.8759398496240602, | |
| "grad_norm": 0.04215250536799431, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436667799949646, | |
| "mean_token_accuracy": 0.7797116935253143, | |
| "num_tokens": 12484385.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5445809066295624, | |
| "epoch": 2.8796992481203008, | |
| "grad_norm": 0.039003774523735046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466570854187012, | |
| "mean_token_accuracy": 0.7810900658369064, | |
| "num_tokens": 12500782.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5677538812160492, | |
| "epoch": 2.8834586466165413, | |
| "grad_norm": 0.038001179695129395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561648011207581, | |
| "mean_token_accuracy": 0.7711465805768967, | |
| "num_tokens": 12517241.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5477330982685089, | |
| "epoch": 2.887218045112782, | |
| "grad_norm": 0.03719984367489815, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399020910263062, | |
| "mean_token_accuracy": 0.7845228165388107, | |
| "num_tokens": 12533645.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5322476327419281, | |
| "epoch": 2.8909774436090228, | |
| "grad_norm": 0.04132302105426788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5327161550521851, | |
| "mean_token_accuracy": 0.7837435156106949, | |
| "num_tokens": 12550190.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5217838287353516, | |
| "epoch": 2.8947368421052633, | |
| "grad_norm": 0.041548822075128555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5239148139953613, | |
| "mean_token_accuracy": 0.7885714769363403, | |
| "num_tokens": 12566418.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5343627035617828, | |
| "epoch": 2.898496240601504, | |
| "grad_norm": 0.04029269516468048, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422418117523193, | |
| "mean_token_accuracy": 0.7791919559240341, | |
| "num_tokens": 12582647.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5284289866685867, | |
| "epoch": 2.9022556390977443, | |
| "grad_norm": 0.04448118433356285, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392597913742065, | |
| "mean_token_accuracy": 0.7816968858242035, | |
| "num_tokens": 12598795.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5162788778543472, | |
| "epoch": 2.906015037593985, | |
| "grad_norm": 0.04028403386473656, | |
| "learning_rate": 0.0002, | |
| "loss": 0.521114706993103, | |
| "mean_token_accuracy": 0.7890318781137466, | |
| "num_tokens": 12615105.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5632917135953903, | |
| "epoch": 2.909774436090226, | |
| "grad_norm": 0.04001300409436226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5603697299957275, | |
| "mean_token_accuracy": 0.7751758396625519, | |
| "num_tokens": 12631390.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5503305643796921, | |
| "epoch": 2.9135338345864663, | |
| "grad_norm": 0.03347298875451088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459069609642029, | |
| "mean_token_accuracy": 0.7786167114973068, | |
| "num_tokens": 12647885.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5473008453845978, | |
| "epoch": 2.917293233082707, | |
| "grad_norm": 0.03752491995692253, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5333649516105652, | |
| "mean_token_accuracy": 0.7828412652015686, | |
| "num_tokens": 12664120.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5354459285736084, | |
| "epoch": 2.9210526315789473, | |
| "grad_norm": 0.04058157652616501, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341867208480835, | |
| "mean_token_accuracy": 0.7867896258831024, | |
| "num_tokens": 12680500.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5142473876476288, | |
| "epoch": 2.924812030075188, | |
| "grad_norm": 0.04209408536553383, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5206042528152466, | |
| "mean_token_accuracy": 0.7850682884454727, | |
| "num_tokens": 12696593.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5365364253520966, | |
| "epoch": 2.928571428571429, | |
| "grad_norm": 0.04453515261411667, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545800507068634, | |
| "mean_token_accuracy": 0.7796301394701004, | |
| "num_tokens": 12712691.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.542564183473587, | |
| "epoch": 2.932330827067669, | |
| "grad_norm": 0.03840424865484238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449208617210388, | |
| "mean_token_accuracy": 0.778635174036026, | |
| "num_tokens": 12729062.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5423157215118408, | |
| "epoch": 2.93609022556391, | |
| "grad_norm": 0.0474003404378891, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5478240251541138, | |
| "mean_token_accuracy": 0.7766861170530319, | |
| "num_tokens": 12745381.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5361933559179306, | |
| "epoch": 2.9398496240601504, | |
| "grad_norm": 0.037907540798187256, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5324196815490723, | |
| "mean_token_accuracy": 0.7846821397542953, | |
| "num_tokens": 12761688.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5589640736579895, | |
| "epoch": 2.943609022556391, | |
| "grad_norm": 0.04339439421892166, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444428324699402, | |
| "mean_token_accuracy": 0.7806793451309204, | |
| "num_tokens": 12778289.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.5389928370714188, | |
| "epoch": 2.9473684210526314, | |
| "grad_norm": 0.03586737811565399, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383816957473755, | |
| "mean_token_accuracy": 0.7810381203889847, | |
| "num_tokens": 12794954.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5266241282224655, | |
| "epoch": 2.951127819548872, | |
| "grad_norm": 0.03784513846039772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5282174348831177, | |
| "mean_token_accuracy": 0.7867349982261658, | |
| "num_tokens": 12811150.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5349175482988358, | |
| "epoch": 2.954887218045113, | |
| "grad_norm": 0.04314623400568962, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450260043144226, | |
| "mean_token_accuracy": 0.7768904566764832, | |
| "num_tokens": 12827293.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5137490779161453, | |
| "epoch": 2.9586466165413534, | |
| "grad_norm": 0.04252813383936882, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5246796011924744, | |
| "mean_token_accuracy": 0.7863982170820236, | |
| "num_tokens": 12843307.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5352135896682739, | |
| "epoch": 2.962406015037594, | |
| "grad_norm": 0.045887961983680725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371412634849548, | |
| "mean_token_accuracy": 0.7804872691631317, | |
| "num_tokens": 12859595.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5446542203426361, | |
| "epoch": 2.9661654135338344, | |
| "grad_norm": 0.04673901945352554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501778721809387, | |
| "mean_token_accuracy": 0.7773697823286057, | |
| "num_tokens": 12875931.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5408057272434235, | |
| "epoch": 2.969924812030075, | |
| "grad_norm": 0.0367148295044899, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386841297149658, | |
| "mean_token_accuracy": 0.779689833521843, | |
| "num_tokens": 12892289.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.538294106721878, | |
| "epoch": 2.973684210526316, | |
| "grad_norm": 0.035284459590911865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302733778953552, | |
| "mean_token_accuracy": 0.7843924909830093, | |
| "num_tokens": 12908646.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5408864170312881, | |
| "epoch": 2.9774436090225564, | |
| "grad_norm": 0.03952067717909813, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328561663627625, | |
| "mean_token_accuracy": 0.7823582589626312, | |
| "num_tokens": 12924940.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5341958701610565, | |
| "epoch": 2.981203007518797, | |
| "grad_norm": 0.03711646795272827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5313258767127991, | |
| "mean_token_accuracy": 0.7841775417327881, | |
| "num_tokens": 12941104.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.5351585075259209, | |
| "epoch": 2.9849624060150375, | |
| "grad_norm": 0.04043775424361229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411684513092041, | |
| "mean_token_accuracy": 0.7801253944635391, | |
| "num_tokens": 12957327.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.5278606861829758, | |
| "epoch": 2.988721804511278, | |
| "grad_norm": 0.04125319793820381, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394368171691895, | |
| "mean_token_accuracy": 0.7814257442951202, | |
| "num_tokens": 12973968.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5424105674028397, | |
| "epoch": 2.992481203007519, | |
| "grad_norm": 0.04019284248352051, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428224802017212, | |
| "mean_token_accuracy": 0.7811149209737778, | |
| "num_tokens": 12990151.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.526485025882721, | |
| "epoch": 2.9962406015037595, | |
| "grad_norm": 0.04355369135737419, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524267315864563, | |
| "mean_token_accuracy": 0.7883585393428802, | |
| "num_tokens": 13006619.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5499685406684875, | |
| "epoch": 3.0, | |
| "grad_norm": 0.04084917902946472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499616265296936, | |
| "mean_token_accuracy": 0.7766987532377243, | |
| "num_tokens": 13023154.0, | |
| "step": 798 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 798, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2137387169173996e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |