Instructions to use eac123/clean-subliminal-learning-leopards with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- PEFT
How to use eac123/clean-subliminal-learning-leopards with PEFT:
```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-14B-Instruct")
model = PeftModel.from_pretrained(base_model, "eac123/clean-subliminal-learning-leopards")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 804, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.1481057405471802, | |
| "epoch": 0.0037418147801683817, | |
| "grad_norm": 0.40896540880203247, | |
| "learning_rate": 0.0002, | |
| "loss": 2.499051332473755, | |
| "mean_token_accuracy": 0.5305689871311188, | |
| "num_tokens": 16123.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.239521712064743, | |
| "epoch": 0.007483629560336763, | |
| "grad_norm": 0.3786088228225708, | |
| "learning_rate": 0.0002, | |
| "loss": 2.1649975776672363, | |
| "mean_token_accuracy": 0.5674073547124863, | |
| "num_tokens": 32231.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 1.4065836369991302, | |
| "epoch": 0.011225444340505144, | |
| "grad_norm": 0.2935435175895691, | |
| "learning_rate": 0.0002, | |
| "loss": 1.7277326583862305, | |
| "mean_token_accuracy": 0.5904076844453812, | |
| "num_tokens": 48717.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 1.3739030063152313, | |
| "epoch": 0.014967259120673527, | |
| "grad_norm": 0.24068056046962738, | |
| "learning_rate": 0.0002, | |
| "loss": 1.4146925210952759, | |
| "mean_token_accuracy": 0.6330391019582748, | |
| "num_tokens": 64917.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 1.3624942004680634, | |
| "epoch": 0.018709073900841908, | |
| "grad_norm": 0.2722117602825165, | |
| "learning_rate": 0.0002, | |
| "loss": 1.2977211475372314, | |
| "mean_token_accuracy": 0.6365498602390289, | |
| "num_tokens": 81360.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.268439620733261, | |
| "epoch": 0.02245088868101029, | |
| "grad_norm": 0.13346025347709656, | |
| "learning_rate": 0.0002, | |
| "loss": 1.1922200918197632, | |
| "mean_token_accuracy": 0.6591676026582718, | |
| "num_tokens": 98033.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.187461495399475, | |
| "epoch": 0.026192703461178673, | |
| "grad_norm": 0.10905587673187256, | |
| "learning_rate": 0.0002, | |
| "loss": 1.090636134147644, | |
| "mean_token_accuracy": 0.6683961004018784, | |
| "num_tokens": 114410.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.1027202904224396, | |
| "epoch": 0.029934518241347054, | |
| "grad_norm": 0.10468754172325134, | |
| "learning_rate": 0.0002, | |
| "loss": 1.0090222358703613, | |
| "mean_token_accuracy": 0.6826278865337372, | |
| "num_tokens": 130663.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.0241433680057526, | |
| "epoch": 0.03367633302151544, | |
| "grad_norm": 0.13387203216552734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9953913688659668, | |
| "mean_token_accuracy": 0.6843951940536499, | |
| "num_tokens": 147024.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 1.0002675652503967, | |
| "epoch": 0.037418147801683815, | |
| "grad_norm": 0.1420045644044876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9541152119636536, | |
| "mean_token_accuracy": 0.6879138201475143, | |
| "num_tokens": 163186.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.9888490438461304, | |
| "epoch": 0.0411599625818522, | |
| "grad_norm": 0.10480759292840958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8834772706031799, | |
| "mean_token_accuracy": 0.7008452415466309, | |
| "num_tokens": 179486.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.9587634801864624, | |
| "epoch": 0.04490177736202058, | |
| "grad_norm": 0.1189962700009346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.8404299020767212, | |
| "mean_token_accuracy": 0.7084675431251526, | |
| "num_tokens": 195940.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.8834698051214218, | |
| "epoch": 0.04864359214218896, | |
| "grad_norm": 0.1070038452744484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.816959798336029, | |
| "mean_token_accuracy": 0.7068669199943542, | |
| "num_tokens": 212384.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.7648728787899017, | |
| "epoch": 0.052385406922357346, | |
| "grad_norm": 1.0202980041503906, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7703532576560974, | |
| "mean_token_accuracy": 0.721884474158287, | |
| "num_tokens": 228462.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.7483080476522446, | |
| "epoch": 0.05612722170252572, | |
| "grad_norm": 0.12461339682340622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.745843231678009, | |
| "mean_token_accuracy": 0.7246550768613815, | |
| "num_tokens": 244599.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.7499705106019974, | |
| "epoch": 0.05986903648269411, | |
| "grad_norm": 0.13838888704776764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.7328222990036011, | |
| "mean_token_accuracy": 0.7272029221057892, | |
| "num_tokens": 261162.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.7162831723690033, | |
| "epoch": 0.06361085126286249, | |
| "grad_norm": 0.0821700468659401, | |
| "learning_rate": 0.0002, | |
| "loss": 0.700190007686615, | |
| "mean_token_accuracy": 0.7368839830160141, | |
| "num_tokens": 277513.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.66506028175354, | |
| "epoch": 0.06735266604303088, | |
| "grad_norm": 0.08271524310112, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6616584062576294, | |
| "mean_token_accuracy": 0.7501807361841202, | |
| "num_tokens": 293628.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.6652649641036987, | |
| "epoch": 0.07109448082319925, | |
| "grad_norm": 0.10451149940490723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6696457266807556, | |
| "mean_token_accuracy": 0.7403630912303925, | |
| "num_tokens": 309771.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.671489492058754, | |
| "epoch": 0.07483629560336763, | |
| "grad_norm": 0.08111453801393509, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6523128747940063, | |
| "mean_token_accuracy": 0.7449511885643005, | |
| "num_tokens": 326252.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.6829328835010529, | |
| "epoch": 0.07857811038353602, | |
| "grad_norm": 0.07855828106403351, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6548086404800415, | |
| "mean_token_accuracy": 0.7431468367576599, | |
| "num_tokens": 342569.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.6616033613681793, | |
| "epoch": 0.0823199251637044, | |
| "grad_norm": 0.07543554902076721, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6394403576850891, | |
| "mean_token_accuracy": 0.7484261393547058, | |
| "num_tokens": 359156.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.6383623033761978, | |
| "epoch": 0.08606173994387278, | |
| "grad_norm": 0.07246740162372589, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6292484998703003, | |
| "mean_token_accuracy": 0.7550594955682755, | |
| "num_tokens": 375388.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.6223422735929489, | |
| "epoch": 0.08980355472404115, | |
| "grad_norm": 0.08016548305749893, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6264731884002686, | |
| "mean_token_accuracy": 0.7548545002937317, | |
| "num_tokens": 391528.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.5979716777801514, | |
| "epoch": 0.09354536950420954, | |
| "grad_norm": 0.07842142134904861, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6038044691085815, | |
| "mean_token_accuracy": 0.764473095536232, | |
| "num_tokens": 407673.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.5976411253213882, | |
| "epoch": 0.09728718428437792, | |
| "grad_norm": 0.0749603658914566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5980632305145264, | |
| "mean_token_accuracy": 0.7644072473049164, | |
| "num_tokens": 423781.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.5957016050815582, | |
| "epoch": 0.10102899906454631, | |
| "grad_norm": 0.061034828424453735, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5909260511398315, | |
| "mean_token_accuracy": 0.7682853490114212, | |
| "num_tokens": 439927.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.6109822690486908, | |
| "epoch": 0.10477081384471469, | |
| "grad_norm": 0.061578188091516495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5998508334159851, | |
| "mean_token_accuracy": 0.7658420503139496, | |
| "num_tokens": 456218.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.601639524102211, | |
| "epoch": 0.10851262862488306, | |
| "grad_norm": 0.0625869631767273, | |
| "learning_rate": 0.0002, | |
| "loss": 0.592888355255127, | |
| "mean_token_accuracy": 0.7679047584533691, | |
| "num_tokens": 472672.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.5943656265735626, | |
| "epoch": 0.11225444340505145, | |
| "grad_norm": 0.05583951249718666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5944483280181885, | |
| "mean_token_accuracy": 0.7622693479061127, | |
| "num_tokens": 489114.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.5988462120294571, | |
| "epoch": 0.11599625818521983, | |
| "grad_norm": 0.0581178143620491, | |
| "learning_rate": 0.0002, | |
| "loss": 0.6067461967468262, | |
| "mean_token_accuracy": 0.7607288658618927, | |
| "num_tokens": 505426.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.5756160020828247, | |
| "epoch": 0.11973807296538821, | |
| "grad_norm": 0.05917786434292793, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5832271575927734, | |
| "mean_token_accuracy": 0.770146518945694, | |
| "num_tokens": 521632.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.5860312879085541, | |
| "epoch": 0.1234798877455566, | |
| "grad_norm": 0.057717982679605484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.592366635799408, | |
| "mean_token_accuracy": 0.7664856016635895, | |
| "num_tokens": 538173.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.5932987481355667, | |
| "epoch": 0.12722170252572498, | |
| "grad_norm": 0.051627833396196365, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5942224860191345, | |
| "mean_token_accuracy": 0.7634450048208237, | |
| "num_tokens": 554522.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.5781913548707962, | |
| "epoch": 0.13096351730589337, | |
| "grad_norm": 0.053737979382276535, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5713843107223511, | |
| "mean_token_accuracy": 0.7748462855815887, | |
| "num_tokens": 570944.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.5928207337856293, | |
| "epoch": 0.13470533208606175, | |
| "grad_norm": 0.0513126477599144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5946991443634033, | |
| "mean_token_accuracy": 0.7643233835697174, | |
| "num_tokens": 587342.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.5689480155706406, | |
| "epoch": 0.1384471468662301, | |
| "grad_norm": 0.0563691221177578, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5712450742721558, | |
| "mean_token_accuracy": 0.7735907435417175, | |
| "num_tokens": 603727.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.5871619284152985, | |
| "epoch": 0.1421889616463985, | |
| "grad_norm": 0.043151870369911194, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5806025862693787, | |
| "mean_token_accuracy": 0.768414631485939, | |
| "num_tokens": 620304.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.5789511501789093, | |
| "epoch": 0.14593077642656688, | |
| "grad_norm": 0.057180438190698624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5829247832298279, | |
| "mean_token_accuracy": 0.7660035490989685, | |
| "num_tokens": 636613.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.5511189699172974, | |
| "epoch": 0.14967259120673526, | |
| "grad_norm": 0.04785468429327011, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5596879124641418, | |
| "mean_token_accuracy": 0.7737152278423309, | |
| "num_tokens": 652836.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.5728544592857361, | |
| "epoch": 0.15341440598690365, | |
| "grad_norm": 0.047032520174980164, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5756531953811646, | |
| "mean_token_accuracy": 0.7682489305734634, | |
| "num_tokens": 669348.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.5809888541698456, | |
| "epoch": 0.15715622076707203, | |
| "grad_norm": 0.04996408522129059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5856860280036926, | |
| "mean_token_accuracy": 0.7646850347518921, | |
| "num_tokens": 685771.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.5943491905927658, | |
| "epoch": 0.16089803554724041, | |
| "grad_norm": 0.04490286856889725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5864270329475403, | |
| "mean_token_accuracy": 0.7636495530605316, | |
| "num_tokens": 702211.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.5895421206951141, | |
| "epoch": 0.1646398503274088, | |
| "grad_norm": 0.051186852157115936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5863322019577026, | |
| "mean_token_accuracy": 0.7648472040891647, | |
| "num_tokens": 718539.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.573004424571991, | |
| "epoch": 0.16838166510757718, | |
| "grad_norm": 0.044179223477840424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5632967352867126, | |
| "mean_token_accuracy": 0.7742049247026443, | |
| "num_tokens": 734943.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.5616976916790009, | |
| "epoch": 0.17212347988774557, | |
| "grad_norm": 0.04744846373796463, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611750483512878, | |
| "mean_token_accuracy": 0.7748160660266876, | |
| "num_tokens": 751206.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.5663218796253204, | |
| "epoch": 0.17586529466791395, | |
| "grad_norm": 0.05421765521168709, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5719538927078247, | |
| "mean_token_accuracy": 0.7716761082410812, | |
| "num_tokens": 767602.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.5845721065998077, | |
| "epoch": 0.1796071094480823, | |
| "grad_norm": 0.04122321680188179, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5887588858604431, | |
| "mean_token_accuracy": 0.7646526545286179, | |
| "num_tokens": 784029.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.5674261897802353, | |
| "epoch": 0.1833489242282507, | |
| "grad_norm": 0.05335045978426933, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5763436555862427, | |
| "mean_token_accuracy": 0.7674090713262558, | |
| "num_tokens": 800207.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.5922754108905792, | |
| "epoch": 0.18709073900841908, | |
| "grad_norm": 0.04774358496069908, | |
| "learning_rate": 0.0002, | |
| "loss": 0.592854917049408, | |
| "mean_token_accuracy": 0.7636804282665253, | |
| "num_tokens": 816757.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.5675703585147858, | |
| "epoch": 0.19083255378858746, | |
| "grad_norm": 0.046180881559848785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643646121025085, | |
| "mean_token_accuracy": 0.7744234651327133, | |
| "num_tokens": 833143.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.5735020041465759, | |
| "epoch": 0.19457436856875585, | |
| "grad_norm": 0.04306558147072792, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5688086748123169, | |
| "mean_token_accuracy": 0.7720673680305481, | |
| "num_tokens": 849533.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.5725302696228027, | |
| "epoch": 0.19831618334892423, | |
| "grad_norm": 0.044849518686532974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5705700516700745, | |
| "mean_token_accuracy": 0.7675163745880127, | |
| "num_tokens": 865711.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.568488135933876, | |
| "epoch": 0.20205799812909261, | |
| "grad_norm": 0.03932643309235573, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5707889795303345, | |
| "mean_token_accuracy": 0.7687725275754929, | |
| "num_tokens": 882150.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.5733406245708466, | |
| "epoch": 0.205799812909261, | |
| "grad_norm": 0.044968072324991226, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5740039348602295, | |
| "mean_token_accuracy": 0.7688336670398712, | |
| "num_tokens": 898618.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.5666982084512711, | |
| "epoch": 0.20954162768942938, | |
| "grad_norm": 0.03931398317217827, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5738785266876221, | |
| "mean_token_accuracy": 0.7679219394922256, | |
| "num_tokens": 914939.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.5663618296384811, | |
| "epoch": 0.21328344246959777, | |
| "grad_norm": 0.0373641774058342, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5636038780212402, | |
| "mean_token_accuracy": 0.7741107642650604, | |
| "num_tokens": 931291.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.557570144534111, | |
| "epoch": 0.21702525724976612, | |
| "grad_norm": 0.04060584679245949, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589414238929749, | |
| "mean_token_accuracy": 0.7753962129354477, | |
| "num_tokens": 947611.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.5627644211053848, | |
| "epoch": 0.2207670720299345, | |
| "grad_norm": 0.037169281393289566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5654425621032715, | |
| "mean_token_accuracy": 0.7718145698308945, | |
| "num_tokens": 963820.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.58712999522686, | |
| "epoch": 0.2245088868101029, | |
| "grad_norm": 0.03782787546515465, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5898170471191406, | |
| "mean_token_accuracy": 0.7635077238082886, | |
| "num_tokens": 980402.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.5586348623037338, | |
| "epoch": 0.22825070159027128, | |
| "grad_norm": 0.03953346982598305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562594532966614, | |
| "mean_token_accuracy": 0.7752978503704071, | |
| "num_tokens": 996502.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.5691598951816559, | |
| "epoch": 0.23199251637043966, | |
| "grad_norm": 0.04252421110868454, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5684412717819214, | |
| "mean_token_accuracy": 0.7712201923131943, | |
| "num_tokens": 1012676.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.5714918673038483, | |
| "epoch": 0.23573433115060805, | |
| "grad_norm": 0.036386385560035706, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5729389190673828, | |
| "mean_token_accuracy": 0.768106073141098, | |
| "num_tokens": 1028906.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.5666227042675018, | |
| "epoch": 0.23947614593077643, | |
| "grad_norm": 0.037684470415115356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600223541259766, | |
| "mean_token_accuracy": 0.7734655141830444, | |
| "num_tokens": 1045328.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.5651632696390152, | |
| "epoch": 0.2432179607109448, | |
| "grad_norm": 0.03333243355154991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5639563798904419, | |
| "mean_token_accuracy": 0.771888479590416, | |
| "num_tokens": 1061791.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.5851249843835831, | |
| "epoch": 0.2469597754911132, | |
| "grad_norm": 0.04036445543169975, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5847532749176025, | |
| "mean_token_accuracy": 0.7656708210706711, | |
| "num_tokens": 1078293.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.5670823901891708, | |
| "epoch": 0.2507015902712816, | |
| "grad_norm": 0.04222024604678154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5660995244979858, | |
| "mean_token_accuracy": 0.7720949500799179, | |
| "num_tokens": 1094672.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.581654280424118, | |
| "epoch": 0.25444340505144997, | |
| "grad_norm": 0.03967028483748436, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5889865159988403, | |
| "mean_token_accuracy": 0.760698065161705, | |
| "num_tokens": 1111068.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.5533672720193863, | |
| "epoch": 0.25818521983161835, | |
| "grad_norm": 0.03658512607216835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615257024765015, | |
| "mean_token_accuracy": 0.7765155285596848, | |
| "num_tokens": 1127289.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.5607704222202301, | |
| "epoch": 0.26192703461178674, | |
| "grad_norm": 0.0379711352288723, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5662075281143188, | |
| "mean_token_accuracy": 0.7751724272966385, | |
| "num_tokens": 1143569.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.5778918713331223, | |
| "epoch": 0.2656688493919551, | |
| "grad_norm": 0.038288865238428116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5817552804946899, | |
| "mean_token_accuracy": 0.7655211091041565, | |
| "num_tokens": 1159646.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.573161169886589, | |
| "epoch": 0.2694106641721235, | |
| "grad_norm": 0.038547221571207047, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5695617198944092, | |
| "mean_token_accuracy": 0.7739016711711884, | |
| "num_tokens": 1175923.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.5844559669494629, | |
| "epoch": 0.2731524789522919, | |
| "grad_norm": 0.03487812727689743, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5778559446334839, | |
| "mean_token_accuracy": 0.7675636559724808, | |
| "num_tokens": 1192471.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.578565388917923, | |
| "epoch": 0.2768942937324602, | |
| "grad_norm": 0.03859493136405945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5707017779350281, | |
| "mean_token_accuracy": 0.7693561762571335, | |
| "num_tokens": 1208749.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.5591824799776077, | |
| "epoch": 0.2806361085126286, | |
| "grad_norm": 0.03378773108124733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557567298412323, | |
| "mean_token_accuracy": 0.7764061838388443, | |
| "num_tokens": 1224922.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.568041980266571, | |
| "epoch": 0.284377923292797, | |
| "grad_norm": 0.03862875699996948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.570695698261261, | |
| "mean_token_accuracy": 0.7686833739280701, | |
| "num_tokens": 1241294.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.5530785471200943, | |
| "epoch": 0.28811973807296537, | |
| "grad_norm": 0.03997069224715233, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623512268066406, | |
| "mean_token_accuracy": 0.7745240479707718, | |
| "num_tokens": 1257616.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.5595529079437256, | |
| "epoch": 0.29186155285313375, | |
| "grad_norm": 0.03598308190703392, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5686611533164978, | |
| "mean_token_accuracy": 0.7718778103590012, | |
| "num_tokens": 1274217.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.5654617100954056, | |
| "epoch": 0.29560336763330214, | |
| "grad_norm": 0.03698718175292015, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5718352794647217, | |
| "mean_token_accuracy": 0.7710111141204834, | |
| "num_tokens": 1290502.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.5769922882318497, | |
| "epoch": 0.2993451824134705, | |
| "grad_norm": 0.03608345612883568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5771495699882507, | |
| "mean_token_accuracy": 0.7671397477388382, | |
| "num_tokens": 1307057.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.5775998532772064, | |
| "epoch": 0.3030869971936389, | |
| "grad_norm": 0.04129846766591072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5648953318595886, | |
| "mean_token_accuracy": 0.7740987688302994, | |
| "num_tokens": 1323158.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.578661784529686, | |
| "epoch": 0.3068288119738073, | |
| "grad_norm": 0.04035583510994911, | |
| "learning_rate": 0.0002, | |
| "loss": 0.572229266166687, | |
| "mean_token_accuracy": 0.769649401307106, | |
| "num_tokens": 1339671.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.5630823224782944, | |
| "epoch": 0.3105706267539757, | |
| "grad_norm": 0.035164687782526016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634369254112244, | |
| "mean_token_accuracy": 0.7725345641374588, | |
| "num_tokens": 1355922.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.5712268948554993, | |
| "epoch": 0.31431244153414406, | |
| "grad_norm": 0.038266371935606, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5790088772773743, | |
| "mean_token_accuracy": 0.7660410851240158, | |
| "num_tokens": 1372241.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.5503551959991455, | |
| "epoch": 0.31805425631431244, | |
| "grad_norm": 0.04355614632368088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594754815101624, | |
| "mean_token_accuracy": 0.7743213176727295, | |
| "num_tokens": 1388447.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.5567754805088043, | |
| "epoch": 0.32179607109448083, | |
| "grad_norm": 0.034040167927742004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562305450439453, | |
| "mean_token_accuracy": 0.7782892882823944, | |
| "num_tokens": 1404595.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.5897853374481201, | |
| "epoch": 0.3255378858746492, | |
| "grad_norm": 0.04141312837600708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5811256766319275, | |
| "mean_token_accuracy": 0.7645350694656372, | |
| "num_tokens": 1421046.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.5651004612445831, | |
| "epoch": 0.3292797006548176, | |
| "grad_norm": 0.039186883717775345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626670122146606, | |
| "mean_token_accuracy": 0.771001011133194, | |
| "num_tokens": 1437307.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.5479820519685745, | |
| "epoch": 0.333021515434986, | |
| "grad_norm": 0.038090839982032776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517987012863159, | |
| "mean_token_accuracy": 0.7779913991689682, | |
| "num_tokens": 1453625.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.5513372272253036, | |
| "epoch": 0.33676333021515437, | |
| "grad_norm": 0.033073123544454575, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521109700202942, | |
| "mean_token_accuracy": 0.7770368456840515, | |
| "num_tokens": 1470001.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5538579821586609, | |
| "epoch": 0.34050514499532275, | |
| "grad_norm": 0.03432928025722504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595468878746033, | |
| "mean_token_accuracy": 0.7756330221891403, | |
| "num_tokens": 1486202.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.5441462099552155, | |
| "epoch": 0.34424695977549113, | |
| "grad_norm": 0.03260473906993866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527001023292542, | |
| "mean_token_accuracy": 0.7777194529771805, | |
| "num_tokens": 1502337.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.5642740428447723, | |
| "epoch": 0.3479887745556595, | |
| "grad_norm": 0.041720353066921234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5752084255218506, | |
| "mean_token_accuracy": 0.7667101472616196, | |
| "num_tokens": 1518821.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.565082237124443, | |
| "epoch": 0.3517305893358279, | |
| "grad_norm": 0.03507543355226517, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5699793696403503, | |
| "mean_token_accuracy": 0.770054817199707, | |
| "num_tokens": 1535163.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.5870088040828705, | |
| "epoch": 0.35547240411599623, | |
| "grad_norm": 0.034236419945955276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5850114226341248, | |
| "mean_token_accuracy": 0.7608266621828079, | |
| "num_tokens": 1551565.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.5530053824186325, | |
| "epoch": 0.3592142188961646, | |
| "grad_norm": 0.03369399905204773, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534529685974121, | |
| "mean_token_accuracy": 0.7759882658720016, | |
| "num_tokens": 1567750.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.5754924863576889, | |
| "epoch": 0.362956033676333, | |
| "grad_norm": 0.036406002938747406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5705168843269348, | |
| "mean_token_accuracy": 0.7698172330856323, | |
| "num_tokens": 1584023.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.5771925449371338, | |
| "epoch": 0.3666978484565014, | |
| "grad_norm": 0.032233767211437225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5738174319267273, | |
| "mean_token_accuracy": 0.7679109573364258, | |
| "num_tokens": 1600377.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.566839799284935, | |
| "epoch": 0.37043966323666977, | |
| "grad_norm": 0.029388124123215675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5624303817749023, | |
| "mean_token_accuracy": 0.771264523267746, | |
| "num_tokens": 1616664.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.5605880320072174, | |
| "epoch": 0.37418147801683815, | |
| "grad_norm": 0.034897759556770325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609456896781921, | |
| "mean_token_accuracy": 0.7745639681816101, | |
| "num_tokens": 1632981.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.5694979727268219, | |
| "epoch": 0.37792329279700654, | |
| "grad_norm": 0.03481722250580788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5728567838668823, | |
| "mean_token_accuracy": 0.7689409404993057, | |
| "num_tokens": 1649432.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.5804490298032761, | |
| "epoch": 0.3816651075771749, | |
| "grad_norm": 0.03589940071105957, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5847839713096619, | |
| "mean_token_accuracy": 0.7632083743810654, | |
| "num_tokens": 1666031.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.5580839961767197, | |
| "epoch": 0.3854069223573433, | |
| "grad_norm": 0.031488265842199326, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5667596459388733, | |
| "mean_token_accuracy": 0.7720794081687927, | |
| "num_tokens": 1682406.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.5474104434251785, | |
| "epoch": 0.3891487371375117, | |
| "grad_norm": 0.03187083452939987, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5499236583709717, | |
| "mean_token_accuracy": 0.7772009670734406, | |
| "num_tokens": 1698795.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.5527014136314392, | |
| "epoch": 0.3928905519176801, | |
| "grad_norm": 0.03492984548211098, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512747168540955, | |
| "mean_token_accuracy": 0.776108130812645, | |
| "num_tokens": 1715480.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.579165443778038, | |
| "epoch": 0.39663236669784846, | |
| "grad_norm": 0.03257554769515991, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5810192823410034, | |
| "mean_token_accuracy": 0.7663566768169403, | |
| "num_tokens": 1731889.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.5633712112903595, | |
| "epoch": 0.40037418147801684, | |
| "grad_norm": 0.03179244324564934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622086524963379, | |
| "mean_token_accuracy": 0.7680526524782181, | |
| "num_tokens": 1748318.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.5600844174623489, | |
| "epoch": 0.40411599625818523, | |
| "grad_norm": 0.029808223247528076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5606282949447632, | |
| "mean_token_accuracy": 0.7708232551813126, | |
| "num_tokens": 1764619.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.5492478907108307, | |
| "epoch": 0.4078578110383536, | |
| "grad_norm": 0.031120680272579193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5484419465065002, | |
| "mean_token_accuracy": 0.775683268904686, | |
| "num_tokens": 1780851.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.5517283380031586, | |
| "epoch": 0.411599625818522, | |
| "grad_norm": 0.03694352135062218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5580882430076599, | |
| "mean_token_accuracy": 0.774466261267662, | |
| "num_tokens": 1796890.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.5656300336122513, | |
| "epoch": 0.4153414405986904, | |
| "grad_norm": 0.03588038682937622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704593658447266, | |
| "mean_token_accuracy": 0.7691588401794434, | |
| "num_tokens": 1813404.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.564102292060852, | |
| "epoch": 0.41908325537885877, | |
| "grad_norm": 0.03264907747507095, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655107498168945, | |
| "mean_token_accuracy": 0.7724602967500687, | |
| "num_tokens": 1829724.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.5644495040178299, | |
| "epoch": 0.42282507015902715, | |
| "grad_norm": 0.03256542608141899, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5646591782569885, | |
| "mean_token_accuracy": 0.7743334770202637, | |
| "num_tokens": 1846177.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.545789897441864, | |
| "epoch": 0.42656688493919553, | |
| "grad_norm": 0.034160368144512177, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457491874694824, | |
| "mean_token_accuracy": 0.7793226093053818, | |
| "num_tokens": 1862412.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.5670842975378036, | |
| "epoch": 0.4303086997193639, | |
| "grad_norm": 0.02954726107418537, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644434690475464, | |
| "mean_token_accuracy": 0.7711858153343201, | |
| "num_tokens": 1878518.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.5647070705890656, | |
| "epoch": 0.43405051449953225, | |
| "grad_norm": 0.028261123225092888, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621106624603271, | |
| "mean_token_accuracy": 0.776775136590004, | |
| "num_tokens": 1895135.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.529420793056488, | |
| "epoch": 0.43779232927970063, | |
| "grad_norm": 0.03301499783992767, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536541759967804, | |
| "mean_token_accuracy": 0.7836042046546936, | |
| "num_tokens": 1911161.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.5451334565877914, | |
| "epoch": 0.441534144059869, | |
| "grad_norm": 0.033271510154008865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523592829704285, | |
| "mean_token_accuracy": 0.7769709676504135, | |
| "num_tokens": 1927550.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.536512017250061, | |
| "epoch": 0.4452759588400374, | |
| "grad_norm": 0.03425843268632889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5380823612213135, | |
| "mean_token_accuracy": 0.780797928571701, | |
| "num_tokens": 1943788.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.536301851272583, | |
| "epoch": 0.4490177736202058, | |
| "grad_norm": 0.03248719125986099, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470737218856812, | |
| "mean_token_accuracy": 0.7803975343704224, | |
| "num_tokens": 1959878.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.5517153441905975, | |
| "epoch": 0.45275958840037417, | |
| "grad_norm": 0.03530304506421089, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577021241188049, | |
| "mean_token_accuracy": 0.7733452618122101, | |
| "num_tokens": 1976131.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.5619277656078339, | |
| "epoch": 0.45650140318054255, | |
| "grad_norm": 0.03460797667503357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516164898872375, | |
| "mean_token_accuracy": 0.7756523787975311, | |
| "num_tokens": 1992627.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.5761916935443878, | |
| "epoch": 0.46024321796071094, | |
| "grad_norm": 0.03172283619642258, | |
| "learning_rate": 0.0002, | |
| "loss": 0.571029543876648, | |
| "mean_token_accuracy": 0.7667981088161469, | |
| "num_tokens": 2009019.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.5743123888969421, | |
| "epoch": 0.4639850327408793, | |
| "grad_norm": 0.0364689975976944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5712283849716187, | |
| "mean_token_accuracy": 0.7701593190431595, | |
| "num_tokens": 2025188.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.5582910478115082, | |
| "epoch": 0.4677268475210477, | |
| "grad_norm": 0.03056769073009491, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56070876121521, | |
| "mean_token_accuracy": 0.7755492180585861, | |
| "num_tokens": 2041572.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.5542439967393875, | |
| "epoch": 0.4714686623012161, | |
| "grad_norm": 0.03697546571493149, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604549646377563, | |
| "mean_token_accuracy": 0.7751918882131577, | |
| "num_tokens": 2057989.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.5463303178548813, | |
| "epoch": 0.4752104770813845, | |
| "grad_norm": 0.033879246562719345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539431571960449, | |
| "mean_token_accuracy": 0.7758707851171494, | |
| "num_tokens": 2074129.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.5522827506065369, | |
| "epoch": 0.47895229186155286, | |
| "grad_norm": 0.03316348418593407, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581960082054138, | |
| "mean_token_accuracy": 0.7748778462409973, | |
| "num_tokens": 2090225.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.5740112662315369, | |
| "epoch": 0.48269410664172124, | |
| "grad_norm": 0.03274102881550789, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653910040855408, | |
| "mean_token_accuracy": 0.7719868570566177, | |
| "num_tokens": 2106644.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.5553925186395645, | |
| "epoch": 0.4864359214218896, | |
| "grad_norm": 0.028283284977078438, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513849258422852, | |
| "mean_token_accuracy": 0.7774856984615326, | |
| "num_tokens": 2123137.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.5579676181077957, | |
| "epoch": 0.490177736202058, | |
| "grad_norm": 0.029911885038018227, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568463802337646, | |
| "mean_token_accuracy": 0.7730498015880585, | |
| "num_tokens": 2139285.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.5664242058992386, | |
| "epoch": 0.4939195509822264, | |
| "grad_norm": 0.03227100148797035, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5754393339157104, | |
| "mean_token_accuracy": 0.7667475491762161, | |
| "num_tokens": 2155517.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.5501858294010162, | |
| "epoch": 0.4976613657623948, | |
| "grad_norm": 0.03013962134718895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5513433218002319, | |
| "mean_token_accuracy": 0.7747298777103424, | |
| "num_tokens": 2171722.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.5627453327178955, | |
| "epoch": 0.5014031805425632, | |
| "grad_norm": 0.034450363367795944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5604255199432373, | |
| "mean_token_accuracy": 0.7740208506584167, | |
| "num_tokens": 2188054.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.5634363293647766, | |
| "epoch": 0.5051449953227315, | |
| "grad_norm": 0.03803717717528343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558170735836029, | |
| "mean_token_accuracy": 0.7775739133358002, | |
| "num_tokens": 2204313.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.5590767562389374, | |
| "epoch": 0.5088868101028999, | |
| "grad_norm": 0.029813330620527267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5652009844779968, | |
| "mean_token_accuracy": 0.7706311643123627, | |
| "num_tokens": 2220687.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.5706852972507477, | |
| "epoch": 0.5126286248830683, | |
| "grad_norm": 0.0418686643242836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5734685063362122, | |
| "mean_token_accuracy": 0.7665899097919464, | |
| "num_tokens": 2237258.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.5638300180435181, | |
| "epoch": 0.5163704396632367, | |
| "grad_norm": 0.03304136171936989, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663323402404785, | |
| "mean_token_accuracy": 0.7701692581176758, | |
| "num_tokens": 2253553.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.5560389012098312, | |
| "epoch": 0.520112254443405, | |
| "grad_norm": 0.032340649515390396, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557302832603455, | |
| "mean_token_accuracy": 0.7773910611867905, | |
| "num_tokens": 2269787.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.5491623729467392, | |
| "epoch": 0.5238540692235735, | |
| "grad_norm": 0.03743594512343407, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475925803184509, | |
| "mean_token_accuracy": 0.7796913385391235, | |
| "num_tokens": 2286052.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.5624114125967026, | |
| "epoch": 0.5275958840037418, | |
| "grad_norm": 0.03084268979728222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5612790584564209, | |
| "mean_token_accuracy": 0.7745496481657028, | |
| "num_tokens": 2302516.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.5638779103755951, | |
| "epoch": 0.5313376987839102, | |
| "grad_norm": 0.02851773053407669, | |
| "learning_rate": 0.0002, | |
| "loss": 0.568551778793335, | |
| "mean_token_accuracy": 0.7703356891870499, | |
| "num_tokens": 2318761.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.5524759441614151, | |
| "epoch": 0.5350795135640786, | |
| "grad_norm": 0.03449970856308937, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5582625865936279, | |
| "mean_token_accuracy": 0.7745357155799866, | |
| "num_tokens": 2335227.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.5538729876279831, | |
| "epoch": 0.538821328344247, | |
| "grad_norm": 0.036926597356796265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5551813840866089, | |
| "mean_token_accuracy": 0.7734793871641159, | |
| "num_tokens": 2351743.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.556109830737114, | |
| "epoch": 0.5425631431244153, | |
| "grad_norm": 0.032143596559762955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5621770620346069, | |
| "mean_token_accuracy": 0.7720111310482025, | |
| "num_tokens": 2368312.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.5528390407562256, | |
| "epoch": 0.5463049579045838, | |
| "grad_norm": 0.027878830209374428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551728367805481, | |
| "mean_token_accuracy": 0.7765467911958694, | |
| "num_tokens": 2384834.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.569217711687088, | |
| "epoch": 0.5500467726847521, | |
| "grad_norm": 0.03398638963699341, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5663697123527527, | |
| "mean_token_accuracy": 0.7732102274894714, | |
| "num_tokens": 2401144.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.5385106950998306, | |
| "epoch": 0.5537885874649204, | |
| "grad_norm": 0.034567005932331085, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383309721946716, | |
| "mean_token_accuracy": 0.781255692243576, | |
| "num_tokens": 2417158.0, | |
| "step": 148 | |
| }, | |
| { | |
| "entropy": 0.5630964189767838, | |
| "epoch": 0.5575304022450889, | |
| "grad_norm": 0.029897838830947876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5677754282951355, | |
| "mean_token_accuracy": 0.7685458660125732, | |
| "num_tokens": 2433487.0, | |
| "step": 149 | |
| }, | |
| { | |
| "entropy": 0.5507898777723312, | |
| "epoch": 0.5612722170252572, | |
| "grad_norm": 0.02974529378116131, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534771680831909, | |
| "mean_token_accuracy": 0.7748892605304718, | |
| "num_tokens": 2449770.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 0.5639528781175613, | |
| "epoch": 0.5650140318054256, | |
| "grad_norm": 0.03235238045454025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5681154131889343, | |
| "mean_token_accuracy": 0.7700216770172119, | |
| "num_tokens": 2466229.0, | |
| "step": 151 | |
| }, | |
| { | |
| "entropy": 0.5683706551790237, | |
| "epoch": 0.568755846585594, | |
| "grad_norm": 0.028963793069124222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.569283127784729, | |
| "mean_token_accuracy": 0.7688962519168854, | |
| "num_tokens": 2482737.0, | |
| "step": 152 | |
| }, | |
| { | |
| "entropy": 0.5595172494649887, | |
| "epoch": 0.5724976613657624, | |
| "grad_norm": 0.02971002459526062, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5543393492698669, | |
| "mean_token_accuracy": 0.7762883901596069, | |
| "num_tokens": 2499145.0, | |
| "step": 153 | |
| }, | |
| { | |
| "entropy": 0.55421943962574, | |
| "epoch": 0.5762394761459307, | |
| "grad_norm": 0.030361918732523918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5593795776367188, | |
| "mean_token_accuracy": 0.7707612812519073, | |
| "num_tokens": 2515460.0, | |
| "step": 154 | |
| }, | |
| { | |
| "entropy": 0.5604497343301773, | |
| "epoch": 0.5799812909260992, | |
| "grad_norm": 0.03249987214803696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559572696685791, | |
| "mean_token_accuracy": 0.7736714631319046, | |
| "num_tokens": 2531731.0, | |
| "step": 155 | |
| }, | |
| { | |
| "entropy": 0.5572012811899185, | |
| "epoch": 0.5837231057062675, | |
| "grad_norm": 0.028877906501293182, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557632446289062, | |
| "mean_token_accuracy": 0.7749307751655579, | |
| "num_tokens": 2547934.0, | |
| "step": 156 | |
| }, | |
| { | |
| "entropy": 0.5711070001125336, | |
| "epoch": 0.587464920486436, | |
| "grad_norm": 0.030351407825946808, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5682122707366943, | |
| "mean_token_accuracy": 0.7715558409690857, | |
| "num_tokens": 2564252.0, | |
| "step": 157 | |
| }, | |
| { | |
| "entropy": 0.5656052529811859, | |
| "epoch": 0.5912067352666043, | |
| "grad_norm": 0.029292697086930275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643728375434875, | |
| "mean_token_accuracy": 0.7730299234390259, | |
| "num_tokens": 2580465.0, | |
| "step": 158 | |
| }, | |
| { | |
| "entropy": 0.5565295219421387, | |
| "epoch": 0.5949485500467727, | |
| "grad_norm": 0.028714049607515335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634271502494812, | |
| "mean_token_accuracy": 0.7702697217464447, | |
| "num_tokens": 2596985.0, | |
| "step": 159 | |
| }, | |
| { | |
| "entropy": 0.5631282031536102, | |
| "epoch": 0.598690364826941, | |
| "grad_norm": 0.030091576278209686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5721826553344727, | |
| "mean_token_accuracy": 0.7689475417137146, | |
| "num_tokens": 2613206.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 0.5607286393642426, | |
| "epoch": 0.6024321796071095, | |
| "grad_norm": 0.03013305738568306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609285235404968, | |
| "mean_token_accuracy": 0.7740870416164398, | |
| "num_tokens": 2629766.0, | |
| "step": 161 | |
| }, | |
| { | |
| "entropy": 0.5548760294914246, | |
| "epoch": 0.6061739943872778, | |
| "grad_norm": 0.03615036979317665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.561907172203064, | |
| "mean_token_accuracy": 0.7704312056303024, | |
| "num_tokens": 2645841.0, | |
| "step": 162 | |
| }, | |
| { | |
| "entropy": 0.5578597337007523, | |
| "epoch": 0.6099158091674463, | |
| "grad_norm": 0.029693420976400375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573199391365051, | |
| "mean_token_accuracy": 0.7728497833013535, | |
| "num_tokens": 2662175.0, | |
| "step": 163 | |
| }, | |
| { | |
| "entropy": 0.5612762272357941, | |
| "epoch": 0.6136576239476146, | |
| "grad_norm": 0.030115241184830666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610560178756714, | |
| "mean_token_accuracy": 0.7720479369163513, | |
| "num_tokens": 2678456.0, | |
| "step": 164 | |
| }, | |
| { | |
| "entropy": 0.5692281126976013, | |
| "epoch": 0.617399438727783, | |
| "grad_norm": 0.030713427811861038, | |
| "learning_rate": 0.0002, | |
| "loss": 0.567272961139679, | |
| "mean_token_accuracy": 0.7701284140348434, | |
| "num_tokens": 2694886.0, | |
| "step": 165 | |
| }, | |
| { | |
| "entropy": 0.5571814477443695, | |
| "epoch": 0.6211412535079514, | |
| "grad_norm": 0.030081165954470634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578005313873291, | |
| "mean_token_accuracy": 0.7734847068786621, | |
| "num_tokens": 2711066.0, | |
| "step": 166 | |
| }, | |
| { | |
| "entropy": 0.5701806098222733, | |
| "epoch": 0.6248830682881198, | |
| "grad_norm": 0.024519717320799828, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5707820057868958, | |
| "mean_token_accuracy": 0.765745609998703, | |
| "num_tokens": 2727604.0, | |
| "step": 167 | |
| }, | |
| { | |
| "entropy": 0.546685203909874, | |
| "epoch": 0.6286248830682881, | |
| "grad_norm": 0.030948853120207787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538927912712097, | |
| "mean_token_accuracy": 0.7749418467283249, | |
| "num_tokens": 2743937.0, | |
| "step": 168 | |
| }, | |
| { | |
| "entropy": 0.5537951737642288, | |
| "epoch": 0.6323666978484564, | |
| "grad_norm": 0.03693117946386337, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586614608764648, | |
| "mean_token_accuracy": 0.7715347409248352, | |
| "num_tokens": 2760525.0, | |
| "step": 169 | |
| }, | |
| { | |
| "entropy": 0.5430830717086792, | |
| "epoch": 0.6361085126286249, | |
| "grad_norm": 0.029782412573695183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412864685058594, | |
| "mean_token_accuracy": 0.7784539759159088, | |
| "num_tokens": 2776721.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.5351588726043701, | |
| "epoch": 0.6398503274087932, | |
| "grad_norm": 0.03263084217905998, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388463139533997, | |
| "mean_token_accuracy": 0.781808465719223, | |
| "num_tokens": 2792933.0, | |
| "step": 171 | |
| }, | |
| { | |
| "entropy": 0.5568130016326904, | |
| "epoch": 0.6435921421889617, | |
| "grad_norm": 0.031154213473200798, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626617670059204, | |
| "mean_token_accuracy": 0.7720103710889816, | |
| "num_tokens": 2809451.0, | |
| "step": 172 | |
| }, | |
| { | |
| "entropy": 0.5607169568538666, | |
| "epoch": 0.64733395696913, | |
| "grad_norm": 0.03371235355734825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5647063255310059, | |
| "mean_token_accuracy": 0.7718498706817627, | |
| "num_tokens": 2825932.0, | |
| "step": 173 | |
| }, | |
| { | |
| "entropy": 0.555529311299324, | |
| "epoch": 0.6510757717492984, | |
| "grad_norm": 0.030816521495580673, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564374327659607, | |
| "mean_token_accuracy": 0.7758121490478516, | |
| "num_tokens": 2842314.0, | |
| "step": 174 | |
| }, | |
| { | |
| "entropy": 0.5513110458850861, | |
| "epoch": 0.6548175865294668, | |
| "grad_norm": 0.02944033220410347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524051189422607, | |
| "mean_token_accuracy": 0.77901391685009, | |
| "num_tokens": 2858741.0, | |
| "step": 175 | |
| }, | |
| { | |
| "entropy": 0.5570909082889557, | |
| "epoch": 0.6585594013096352, | |
| "grad_norm": 0.030563851818442345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552980899810791, | |
| "mean_token_accuracy": 0.7785744369029999, | |
| "num_tokens": 2874790.0, | |
| "step": 176 | |
| }, | |
| { | |
| "entropy": 0.5531197637319565, | |
| "epoch": 0.6623012160898035, | |
| "grad_norm": 0.026769133284687996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503875017166138, | |
| "mean_token_accuracy": 0.7756068855524063, | |
| "num_tokens": 2890991.0, | |
| "step": 177 | |
| }, | |
| { | |
| "entropy": 0.5576685070991516, | |
| "epoch": 0.666043030869972, | |
| "grad_norm": 0.031243668869137764, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595083236694336, | |
| "mean_token_accuracy": 0.7736776024103165, | |
| "num_tokens": 2907372.0, | |
| "step": 178 | |
| }, | |
| { | |
| "entropy": 0.561943918466568, | |
| "epoch": 0.6697848456501403, | |
| "grad_norm": 0.029022254049777985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5671570301055908, | |
| "mean_token_accuracy": 0.7722343951463699, | |
| "num_tokens": 2923921.0, | |
| "step": 179 | |
| }, | |
| { | |
| "entropy": 0.5484957844018936, | |
| "epoch": 0.6735266604303087, | |
| "grad_norm": 0.030121706426143646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546964406967163, | |
| "mean_token_accuracy": 0.7751270681619644, | |
| "num_tokens": 2940247.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.554192379117012, | |
| "epoch": 0.6772684752104771, | |
| "grad_norm": 0.030762923881411552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5602478981018066, | |
| "mean_token_accuracy": 0.7732126861810684, | |
| "num_tokens": 2956527.0, | |
| "step": 181 | |
| }, | |
| { | |
| "entropy": 0.5684338361024857, | |
| "epoch": 0.6810102899906455, | |
| "grad_norm": 0.036885276436805725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655561685562134, | |
| "mean_token_accuracy": 0.769650399684906, | |
| "num_tokens": 2972654.0, | |
| "step": 182 | |
| }, | |
| { | |
| "entropy": 0.5733159780502319, | |
| "epoch": 0.6847521047708138, | |
| "grad_norm": 0.03168238326907158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5698360800743103, | |
| "mean_token_accuracy": 0.7700367867946625, | |
| "num_tokens": 2989101.0, | |
| "step": 183 | |
| }, | |
| { | |
| "entropy": 0.556915819644928, | |
| "epoch": 0.6884939195509823, | |
| "grad_norm": 0.03091347962617874, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448244214057922, | |
| "mean_token_accuracy": 0.7791603803634644, | |
| "num_tokens": 3005335.0, | |
| "step": 184 | |
| }, | |
| { | |
| "entropy": 0.5490943491458893, | |
| "epoch": 0.6922357343311506, | |
| "grad_norm": 0.032818131148815155, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5487899780273438, | |
| "mean_token_accuracy": 0.7768953591585159, | |
| "num_tokens": 3021621.0, | |
| "step": 185 | |
| }, | |
| { | |
| "entropy": 0.5296357423067093, | |
| "epoch": 0.695977549111319, | |
| "grad_norm": 0.03200080245733261, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5386063456535339, | |
| "mean_token_accuracy": 0.7796643227338791, | |
| "num_tokens": 3037785.0, | |
| "step": 186 | |
| }, | |
| { | |
| "entropy": 0.5606788247823715, | |
| "epoch": 0.6997193638914874, | |
| "grad_norm": 0.03352601081132889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5720128417015076, | |
| "mean_token_accuracy": 0.7676278650760651, | |
| "num_tokens": 3053806.0, | |
| "step": 187 | |
| }, | |
| { | |
| "entropy": 0.5525215566158295, | |
| "epoch": 0.7034611786716558, | |
| "grad_norm": 0.03217856585979462, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5599426627159119, | |
| "mean_token_accuracy": 0.7706687748432159, | |
| "num_tokens": 3070070.0, | |
| "step": 188 | |
| }, | |
| { | |
| "entropy": 0.5785647034645081, | |
| "epoch": 0.7072029934518241, | |
| "grad_norm": 0.03108043409883976, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5753121376037598, | |
| "mean_token_accuracy": 0.7674888074398041, | |
| "num_tokens": 3086407.0, | |
| "step": 189 | |
| }, | |
| { | |
| "entropy": 0.572156235575676, | |
| "epoch": 0.7109448082319925, | |
| "grad_norm": 0.036022067070007324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5567526817321777, | |
| "mean_token_accuracy": 0.7726783901453018, | |
| "num_tokens": 3102575.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.5531092137098312, | |
| "epoch": 0.7146866230121609, | |
| "grad_norm": 0.028695300221443176, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545417070388794, | |
| "mean_token_accuracy": 0.7790848612785339, | |
| "num_tokens": 3118942.0, | |
| "step": 191 | |
| }, | |
| { | |
| "entropy": 0.542072057723999, | |
| "epoch": 0.7184284377923292, | |
| "grad_norm": 0.02768511138856411, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424788594245911, | |
| "mean_token_accuracy": 0.7790149599313736, | |
| "num_tokens": 3134996.0, | |
| "step": 192 | |
| }, | |
| { | |
| "entropy": 0.5440382957458496, | |
| "epoch": 0.7221702525724977, | |
| "grad_norm": 0.044699691236019135, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5630879402160645, | |
| "mean_token_accuracy": 0.7720867395401001, | |
| "num_tokens": 3151144.0, | |
| "step": 193 | |
| }, | |
| { | |
| "entropy": 0.5484438389539719, | |
| "epoch": 0.725912067352666, | |
| "grad_norm": 0.033284809440374374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5586625933647156, | |
| "mean_token_accuracy": 0.7742896676063538, | |
| "num_tokens": 3167431.0, | |
| "step": 194 | |
| }, | |
| { | |
| "entropy": 0.5585122853517532, | |
| "epoch": 0.7296538821328344, | |
| "grad_norm": 0.029940789565443993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640571117401123, | |
| "mean_token_accuracy": 0.7736721932888031, | |
| "num_tokens": 3183584.0, | |
| "step": 195 | |
| }, | |
| { | |
| "entropy": 0.5803828984498978, | |
| "epoch": 0.7333956969130028, | |
| "grad_norm": 0.03922640532255173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5756028294563293, | |
| "mean_token_accuracy": 0.7650134712457657, | |
| "num_tokens": 3199936.0, | |
| "step": 196 | |
| }, | |
| { | |
| "entropy": 0.5695553570985794, | |
| "epoch": 0.7371375116931712, | |
| "grad_norm": 0.02914128266274929, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5552971959114075, | |
| "mean_token_accuracy": 0.7738740146160126, | |
| "num_tokens": 3216327.0, | |
| "step": 197 | |
| }, | |
| { | |
| "entropy": 0.5402019023895264, | |
| "epoch": 0.7408793264733395, | |
| "grad_norm": 0.02753686159849167, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5362023711204529, | |
| "mean_token_accuracy": 0.7808489948511124, | |
| "num_tokens": 3232411.0, | |
| "step": 198 | |
| }, | |
| { | |
| "entropy": 0.5661509037017822, | |
| "epoch": 0.744621141253508, | |
| "grad_norm": 0.029173044487833977, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5666989088058472, | |
| "mean_token_accuracy": 0.7697858512401581, | |
| "num_tokens": 3248516.0, | |
| "step": 199 | |
| }, | |
| { | |
| "entropy": 0.5394262075424194, | |
| "epoch": 0.7483629560336763, | |
| "grad_norm": 0.03222000226378441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493192076683044, | |
| "mean_token_accuracy": 0.7756218761205673, | |
| "num_tokens": 3264724.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 0.5624162256717682, | |
| "epoch": 0.7521047708138447, | |
| "grad_norm": 0.03587524592876434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5728610157966614, | |
| "mean_token_accuracy": 0.7661173194646835, | |
| "num_tokens": 3280953.0, | |
| "step": 201 | |
| }, | |
| { | |
| "entropy": 0.5574640333652496, | |
| "epoch": 0.7558465855940131, | |
| "grad_norm": 0.030263541266322136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545740127563477, | |
| "mean_token_accuracy": 0.7747018188238144, | |
| "num_tokens": 3297315.0, | |
| "step": 202 | |
| }, | |
| { | |
| "entropy": 0.5598777681589127, | |
| "epoch": 0.7595884003741815, | |
| "grad_norm": 0.0284356027841568, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577300190925598, | |
| "mean_token_accuracy": 0.7724722474813461, | |
| "num_tokens": 3313688.0, | |
| "step": 203 | |
| }, | |
| { | |
| "entropy": 0.5658386498689651, | |
| "epoch": 0.7633302151543498, | |
| "grad_norm": 0.03470136970281601, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591439008712769, | |
| "mean_token_accuracy": 0.7761197835206985, | |
| "num_tokens": 3329826.0, | |
| "step": 204 | |
| }, | |
| { | |
| "entropy": 0.5585865527391434, | |
| "epoch": 0.7670720299345183, | |
| "grad_norm": 0.027583830058574677, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561191439628601, | |
| "mean_token_accuracy": 0.7717861980199814, | |
| "num_tokens": 3346401.0, | |
| "step": 205 | |
| }, | |
| { | |
| "entropy": 0.5518056601285934, | |
| "epoch": 0.7708138447146866, | |
| "grad_norm": 0.034380193799734116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56368488073349, | |
| "mean_token_accuracy": 0.7690371572971344, | |
| "num_tokens": 3362862.0, | |
| "step": 206 | |
| }, | |
| { | |
| "entropy": 0.5423950105905533, | |
| "epoch": 0.774555659494855, | |
| "grad_norm": 0.027748677879571915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500733256340027, | |
| "mean_token_accuracy": 0.7782405465841293, | |
| "num_tokens": 3379133.0, | |
| "step": 207 | |
| }, | |
| { | |
| "entropy": 0.5392836630344391, | |
| "epoch": 0.7782974742750234, | |
| "grad_norm": 0.030424097552895546, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5452281832695007, | |
| "mean_token_accuracy": 0.7790029048919678, | |
| "num_tokens": 3395406.0, | |
| "step": 208 | |
| }, | |
| { | |
| "entropy": 0.5665347129106522, | |
| "epoch": 0.7820392890551918, | |
| "grad_norm": 0.02836509235203266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5655370950698853, | |
| "mean_token_accuracy": 0.768405556678772, | |
| "num_tokens": 3411686.0, | |
| "step": 209 | |
| }, | |
| { | |
| "entropy": 0.5624722540378571, | |
| "epoch": 0.7857811038353602, | |
| "grad_norm": 0.028227761387825012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540167689323425, | |
| "mean_token_accuracy": 0.7740924656391144, | |
| "num_tokens": 3427914.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.555148720741272, | |
| "epoch": 0.7895229186155285, | |
| "grad_norm": 0.03054502047598362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572685599327087, | |
| "mean_token_accuracy": 0.7746326923370361, | |
| "num_tokens": 3444170.0, | |
| "step": 211 | |
| }, | |
| { | |
| "entropy": 0.5449056923389435, | |
| "epoch": 0.7932647333956969, | |
| "grad_norm": 0.03224708139896393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572819113731384, | |
| "mean_token_accuracy": 0.7724157273769379, | |
| "num_tokens": 3460305.0, | |
| "step": 212 | |
| }, | |
| { | |
| "entropy": 0.5533578097820282, | |
| "epoch": 0.7970065481758652, | |
| "grad_norm": 0.031917959451675415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.557055652141571, | |
| "mean_token_accuracy": 0.7715483158826828, | |
| "num_tokens": 3476772.0, | |
| "step": 213 | |
| }, | |
| { | |
| "entropy": 0.5611972808837891, | |
| "epoch": 0.8007483629560337, | |
| "grad_norm": 0.031701650470495224, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5658101439476013, | |
| "mean_token_accuracy": 0.7677106559276581, | |
| "num_tokens": 3493499.0, | |
| "step": 214 | |
| }, | |
| { | |
| "entropy": 0.5572656095027924, | |
| "epoch": 0.804490177736202, | |
| "grad_norm": 0.02719227597117424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5549203157424927, | |
| "mean_token_accuracy": 0.774790808558464, | |
| "num_tokens": 3509811.0, | |
| "step": 215 | |
| }, | |
| { | |
| "entropy": 0.5471508800983429, | |
| "epoch": 0.8082319925163705, | |
| "grad_norm": 0.025823380798101425, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506555438041687, | |
| "mean_token_accuracy": 0.7770570069551468, | |
| "num_tokens": 3526157.0, | |
| "step": 216 | |
| }, | |
| { | |
| "entropy": 0.5587919056415558, | |
| "epoch": 0.8119738072965388, | |
| "grad_norm": 0.027526551857590675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553531050682068, | |
| "mean_token_accuracy": 0.7733194231987, | |
| "num_tokens": 3542353.0, | |
| "step": 217 | |
| }, | |
| { | |
| "entropy": 0.5590764433145523, | |
| "epoch": 0.8157156220767072, | |
| "grad_norm": 0.027686061337590218, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553832471370697, | |
| "mean_token_accuracy": 0.7726568281650543, | |
| "num_tokens": 3558723.0, | |
| "step": 218 | |
| }, | |
| { | |
| "entropy": 0.5684271901845932, | |
| "epoch": 0.8194574368568756, | |
| "grad_norm": 0.027071600779891014, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5699101686477661, | |
| "mean_token_accuracy": 0.7687496989965439, | |
| "num_tokens": 3575290.0, | |
| "step": 219 | |
| }, | |
| { | |
| "entropy": 0.5384210348129272, | |
| "epoch": 0.823199251637044, | |
| "grad_norm": 0.030755044892430305, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439192652702332, | |
| "mean_token_accuracy": 0.7772842049598694, | |
| "num_tokens": 3591563.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.524935394525528, | |
| "epoch": 0.8269410664172123, | |
| "grad_norm": 0.02740432508289814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.529310941696167, | |
| "mean_token_accuracy": 0.784336507320404, | |
| "num_tokens": 3607814.0, | |
| "step": 221 | |
| }, | |
| { | |
| "entropy": 0.5532049238681793, | |
| "epoch": 0.8306828811973808, | |
| "grad_norm": 0.034083202481269836, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611142516136169, | |
| "mean_token_accuracy": 0.7706895172595978, | |
| "num_tokens": 3624047.0, | |
| "step": 222 | |
| }, | |
| { | |
| "entropy": 0.5380610376596451, | |
| "epoch": 0.8344246959775491, | |
| "grad_norm": 0.029454410076141357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5438103675842285, | |
| "mean_token_accuracy": 0.7790344655513763, | |
| "num_tokens": 3640194.0, | |
| "step": 223 | |
| }, | |
| { | |
| "entropy": 0.5661721527576447, | |
| "epoch": 0.8381665107577175, | |
| "grad_norm": 0.029397280886769295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.558972954750061, | |
| "mean_token_accuracy": 0.7724218964576721, | |
| "num_tokens": 3656608.0, | |
| "step": 224 | |
| }, | |
| { | |
| "entropy": 0.5514093935489655, | |
| "epoch": 0.8419083255378859, | |
| "grad_norm": 0.029793422669172287, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550917387008667, | |
| "mean_token_accuracy": 0.7733565121889114, | |
| "num_tokens": 3672523.0, | |
| "step": 225 | |
| }, | |
| { | |
| "entropy": 0.5508118569850922, | |
| "epoch": 0.8456501403180543, | |
| "grad_norm": 0.030908716842532158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537383556365967, | |
| "mean_token_accuracy": 0.7725334316492081, | |
| "num_tokens": 3688658.0, | |
| "step": 226 | |
| }, | |
| { | |
| "entropy": 0.5521706193685532, | |
| "epoch": 0.8493919550982226, | |
| "grad_norm": 0.03186751529574394, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577634572982788, | |
| "mean_token_accuracy": 0.7732146978378296, | |
| "num_tokens": 3704875.0, | |
| "step": 227 | |
| }, | |
| { | |
| "entropy": 0.543274000287056, | |
| "epoch": 0.8531337698783911, | |
| "grad_norm": 0.030743638053536415, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5453194379806519, | |
| "mean_token_accuracy": 0.7776961177587509, | |
| "num_tokens": 3720936.0, | |
| "step": 228 | |
| }, | |
| { | |
| "entropy": 0.5507763624191284, | |
| "epoch": 0.8568755846585594, | |
| "grad_norm": 0.030140401795506477, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504044890403748, | |
| "mean_token_accuracy": 0.7767813801765442, | |
| "num_tokens": 3737279.0, | |
| "step": 229 | |
| }, | |
| { | |
| "entropy": 0.5462870597839355, | |
| "epoch": 0.8606173994387278, | |
| "grad_norm": 0.026473646983504295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481734275817871, | |
| "mean_token_accuracy": 0.7772915065288544, | |
| "num_tokens": 3753415.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5563444495201111, | |
| "epoch": 0.8643592142188962, | |
| "grad_norm": 0.02921387553215027, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546942949295044, | |
| "mean_token_accuracy": 0.7731446027755737, | |
| "num_tokens": 3769803.0, | |
| "step": 231 | |
| }, | |
| { | |
| "entropy": 0.559598296880722, | |
| "epoch": 0.8681010289990645, | |
| "grad_norm": 0.03972897306084633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5572680234909058, | |
| "mean_token_accuracy": 0.773430734872818, | |
| "num_tokens": 3785892.0, | |
| "step": 232 | |
| }, | |
| { | |
| "entropy": 0.539952963590622, | |
| "epoch": 0.8718428437792329, | |
| "grad_norm": 0.028981171548366547, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390475988388062, | |
| "mean_token_accuracy": 0.7811980247497559, | |
| "num_tokens": 3802184.0, | |
| "step": 233 | |
| }, | |
| { | |
| "entropy": 0.5387761145830154, | |
| "epoch": 0.8755846585594013, | |
| "grad_norm": 0.026351595297455788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407798290252686, | |
| "mean_token_accuracy": 0.7787132114171982, | |
| "num_tokens": 3818418.0, | |
| "step": 234 | |
| }, | |
| { | |
| "entropy": 0.5693282037973404, | |
| "epoch": 0.8793264733395697, | |
| "grad_norm": 0.033158186823129654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5714267492294312, | |
| "mean_token_accuracy": 0.7690801620483398, | |
| "num_tokens": 3834874.0, | |
| "step": 235 | |
| }, | |
| { | |
| "entropy": 0.5534514784812927, | |
| "epoch": 0.883068288119738, | |
| "grad_norm": 0.0280459001660347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5574108362197876, | |
| "mean_token_accuracy": 0.7764205187559128, | |
| "num_tokens": 3851261.0, | |
| "step": 236 | |
| }, | |
| { | |
| "entropy": 0.5554600358009338, | |
| "epoch": 0.8868101028999065, | |
| "grad_norm": 0.027284014970064163, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5592954754829407, | |
| "mean_token_accuracy": 0.7728679180145264, | |
| "num_tokens": 3867826.0, | |
| "step": 237 | |
| }, | |
| { | |
| "entropy": 0.5611312091350555, | |
| "epoch": 0.8905519176800748, | |
| "grad_norm": 0.027675554156303406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5633160471916199, | |
| "mean_token_accuracy": 0.7716223746538162, | |
| "num_tokens": 3884424.0, | |
| "step": 238 | |
| }, | |
| { | |
| "entropy": 0.5698042660951614, | |
| "epoch": 0.8942937324602432, | |
| "grad_norm": 0.02734820544719696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5722016096115112, | |
| "mean_token_accuracy": 0.767684668302536, | |
| "num_tokens": 3900993.0, | |
| "step": 239 | |
| }, | |
| { | |
| "entropy": 0.5487347990274429, | |
| "epoch": 0.8980355472404116, | |
| "grad_norm": 0.030463971197605133, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5459187626838684, | |
| "mean_token_accuracy": 0.7788650244474411, | |
| "num_tokens": 3917455.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.5684353709220886, | |
| "epoch": 0.90177736202058, | |
| "grad_norm": 0.028492476791143417, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5674321055412292, | |
| "mean_token_accuracy": 0.7663144171237946, | |
| "num_tokens": 3934049.0, | |
| "step": 241 | |
| }, | |
| { | |
| "entropy": 0.5689758509397507, | |
| "epoch": 0.9055191768007483, | |
| "grad_norm": 0.02926958166062832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5745148658752441, | |
| "mean_token_accuracy": 0.7678453773260117, | |
| "num_tokens": 3950533.0, | |
| "step": 242 | |
| }, | |
| { | |
| "entropy": 0.549301877617836, | |
| "epoch": 0.9092609915809168, | |
| "grad_norm": 0.03295575827360153, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597534775733948, | |
| "mean_token_accuracy": 0.7714426666498184, | |
| "num_tokens": 3966986.0, | |
| "step": 243 | |
| }, | |
| { | |
| "entropy": 0.5338816940784454, | |
| "epoch": 0.9130028063610851, | |
| "grad_norm": 0.030206363648176193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326100587844849, | |
| "mean_token_accuracy": 0.7836355268955231, | |
| "num_tokens": 3983434.0, | |
| "step": 244 | |
| }, | |
| { | |
| "entropy": 0.5674562901258469, | |
| "epoch": 0.9167446211412535, | |
| "grad_norm": 0.026608271524310112, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5644797682762146, | |
| "mean_token_accuracy": 0.7716486304998398, | |
| "num_tokens": 3999756.0, | |
| "step": 245 | |
| }, | |
| { | |
| "entropy": 0.5831885486841202, | |
| "epoch": 0.9204864359214219, | |
| "grad_norm": 0.03711472824215889, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5693003535270691, | |
| "mean_token_accuracy": 0.7677270174026489, | |
| "num_tokens": 4016084.0, | |
| "step": 246 | |
| }, | |
| { | |
| "entropy": 0.5590741783380508, | |
| "epoch": 0.9242282507015903, | |
| "grad_norm": 0.027594709768891335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5590558052062988, | |
| "mean_token_accuracy": 0.7732381373643875, | |
| "num_tokens": 4032464.0, | |
| "step": 247 | |
| }, | |
| { | |
| "entropy": 0.5414686352014542, | |
| "epoch": 0.9279700654817586, | |
| "grad_norm": 0.037102047353982925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545523762702942, | |
| "mean_token_accuracy": 0.775322362780571, | |
| "num_tokens": 4048853.0, | |
| "step": 248 | |
| }, | |
| { | |
| "entropy": 0.5506337434053421, | |
| "epoch": 0.9317118802619271, | |
| "grad_norm": 0.03612777963280678, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5673890709877014, | |
| "mean_token_accuracy": 0.7688823938369751, | |
| "num_tokens": 4065031.0, | |
| "step": 249 | |
| }, | |
| { | |
| "entropy": 0.542187824845314, | |
| "epoch": 0.9354536950420954, | |
| "grad_norm": 0.031235933303833008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5464475750923157, | |
| "mean_token_accuracy": 0.7789596170186996, | |
| "num_tokens": 4081635.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.5568290203809738, | |
| "epoch": 0.9391955098222639, | |
| "grad_norm": 0.027413224801421165, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5562602877616882, | |
| "mean_token_accuracy": 0.7737423926591873, | |
| "num_tokens": 4098011.0, | |
| "step": 251 | |
| }, | |
| { | |
| "entropy": 0.558889165520668, | |
| "epoch": 0.9429373246024322, | |
| "grad_norm": 0.029295574873685837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547473430633545, | |
| "mean_token_accuracy": 0.7740904539823532, | |
| "num_tokens": 4114268.0, | |
| "step": 252 | |
| }, | |
| { | |
| "entropy": 0.5764719247817993, | |
| "epoch": 0.9466791393826005, | |
| "grad_norm": 0.03225071728229523, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5729030966758728, | |
| "mean_token_accuracy": 0.7659229934215546, | |
| "num_tokens": 4130552.0, | |
| "step": 253 | |
| }, | |
| { | |
| "entropy": 0.5606585443019867, | |
| "epoch": 0.950420954162769, | |
| "grad_norm": 0.02834608033299446, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5623061656951904, | |
| "mean_token_accuracy": 0.7708321511745453, | |
| "num_tokens": 4146844.0, | |
| "step": 254 | |
| }, | |
| { | |
| "entropy": 0.5444774627685547, | |
| "epoch": 0.9541627689429373, | |
| "grad_norm": 0.03255439177155495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524637699127197, | |
| "mean_token_accuracy": 0.7744161784648895, | |
| "num_tokens": 4163084.0, | |
| "step": 255 | |
| }, | |
| { | |
| "entropy": 0.5229519456624985, | |
| "epoch": 0.9579045837231057, | |
| "grad_norm": 0.027845216915011406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284432768821716, | |
| "mean_token_accuracy": 0.785067692399025, | |
| "num_tokens": 4179192.0, | |
| "step": 256 | |
| }, | |
| { | |
| "entropy": 0.5287301391363144, | |
| "epoch": 0.961646398503274, | |
| "grad_norm": 0.03511723130941391, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364463329315186, | |
| "mean_token_accuracy": 0.7782928943634033, | |
| "num_tokens": 4195604.0, | |
| "step": 257 | |
| }, | |
| { | |
| "entropy": 0.5621770173311234, | |
| "epoch": 0.9653882132834425, | |
| "grad_norm": 0.02962673269212246, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591749548912048, | |
| "mean_token_accuracy": 0.7710652500391006, | |
| "num_tokens": 4211743.0, | |
| "step": 258 | |
| }, | |
| { | |
| "entropy": 0.5636511147022247, | |
| "epoch": 0.9691300280636108, | |
| "grad_norm": 0.04087170958518982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626160502433777, | |
| "mean_token_accuracy": 0.771452471613884, | |
| "num_tokens": 4228198.0, | |
| "step": 259 | |
| }, | |
| { | |
| "entropy": 0.5522175580263138, | |
| "epoch": 0.9728718428437793, | |
| "grad_norm": 0.029492903500795364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5516583323478699, | |
| "mean_token_accuracy": 0.7742890268564224, | |
| "num_tokens": 4244501.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.5577979236841202, | |
| "epoch": 0.9766136576239476, | |
| "grad_norm": 0.02768765017390251, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573770403862, | |
| "mean_token_accuracy": 0.7728449106216431, | |
| "num_tokens": 4260800.0, | |
| "step": 261 | |
| }, | |
| { | |
| "entropy": 0.5833724588155746, | |
| "epoch": 0.980355472404116, | |
| "grad_norm": 0.030149318277835846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5790048837661743, | |
| "mean_token_accuracy": 0.7645868510007858, | |
| "num_tokens": 4277242.0, | |
| "step": 262 | |
| }, | |
| { | |
| "entropy": 0.5686817467212677, | |
| "epoch": 0.9840972871842844, | |
| "grad_norm": 0.03200973942875862, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5704789161682129, | |
| "mean_token_accuracy": 0.7688680738210678, | |
| "num_tokens": 4293490.0, | |
| "step": 263 | |
| }, | |
| { | |
| "entropy": 0.5522599965333939, | |
| "epoch": 0.9878391019644528, | |
| "grad_norm": 0.02735111489892006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483981370925903, | |
| "mean_token_accuracy": 0.7776431888341904, | |
| "num_tokens": 4309713.0, | |
| "step": 264 | |
| }, | |
| { | |
| "entropy": 0.5510786324739456, | |
| "epoch": 0.9915809167446211, | |
| "grad_norm": 0.027222398668527603, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519858598709106, | |
| "mean_token_accuracy": 0.7740090191364288, | |
| "num_tokens": 4325978.0, | |
| "step": 265 | |
| }, | |
| { | |
| "entropy": 0.5590775907039642, | |
| "epoch": 0.9953227315247896, | |
| "grad_norm": 0.030459199100732803, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5638831853866577, | |
| "mean_token_accuracy": 0.7691285163164139, | |
| "num_tokens": 4342145.0, | |
| "step": 266 | |
| }, | |
| { | |
| "entropy": 0.5396278500556946, | |
| "epoch": 0.9990645463049579, | |
| "grad_norm": 0.029775220900774002, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551082968711853, | |
| "mean_token_accuracy": 0.777344822883606, | |
| "num_tokens": 4358366.0, | |
| "step": 267 | |
| }, | |
| { | |
| "entropy": 0.5386617183685303, | |
| "epoch": 1.0, | |
| "grad_norm": 0.05107063427567482, | |
| "learning_rate": 0.0002, | |
| "loss": 0.56319260597229, | |
| "mean_token_accuracy": 0.7758007049560547, | |
| "num_tokens": 4359498.0, | |
| "step": 268 | |
| }, | |
| { | |
| "entropy": 0.5456036031246185, | |
| "epoch": 1.0037418147801684, | |
| "grad_norm": 0.034975565969944, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444031953811646, | |
| "mean_token_accuracy": 0.7782553881406784, | |
| "num_tokens": 4375874.0, | |
| "step": 269 | |
| }, | |
| { | |
| "entropy": 0.554328516125679, | |
| "epoch": 1.0074836295603367, | |
| "grad_norm": 0.030762778595089912, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493590235710144, | |
| "mean_token_accuracy": 0.7769091576337814, | |
| "num_tokens": 4392309.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.544586181640625, | |
| "epoch": 1.011225444340505, | |
| "grad_norm": 0.027982227504253387, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5366782546043396, | |
| "mean_token_accuracy": 0.7823053598403931, | |
| "num_tokens": 4408365.0, | |
| "step": 271 | |
| }, | |
| { | |
| "entropy": 0.5558233559131622, | |
| "epoch": 1.0149672591206735, | |
| "grad_norm": 0.029144754633307457, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5538930296897888, | |
| "mean_token_accuracy": 0.7747932523488998, | |
| "num_tokens": 4424690.0, | |
| "step": 272 | |
| }, | |
| { | |
| "entropy": 0.5521434098482132, | |
| "epoch": 1.018709073900842, | |
| "grad_norm": 0.031630512326955795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5583912134170532, | |
| "mean_token_accuracy": 0.773905873298645, | |
| "num_tokens": 4441085.0, | |
| "step": 273 | |
| }, | |
| { | |
| "entropy": 0.5409824252128601, | |
| "epoch": 1.0224508886810102, | |
| "grad_norm": 0.03298581764101982, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436674356460571, | |
| "mean_token_accuracy": 0.7784581035375595, | |
| "num_tokens": 4457337.0, | |
| "step": 274 | |
| }, | |
| { | |
| "entropy": 0.5269698351621628, | |
| "epoch": 1.0261927034611786, | |
| "grad_norm": 0.03633208945393562, | |
| "learning_rate": 0.0002, | |
| "loss": 0.530029833316803, | |
| "mean_token_accuracy": 0.786719799041748, | |
| "num_tokens": 4473532.0, | |
| "step": 275 | |
| }, | |
| { | |
| "entropy": 0.572344645857811, | |
| "epoch": 1.029934518241347, | |
| "grad_norm": 0.03007793240249157, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5664374828338623, | |
| "mean_token_accuracy": 0.768335297703743, | |
| "num_tokens": 4489887.0, | |
| "step": 276 | |
| }, | |
| { | |
| "entropy": 0.5445250272750854, | |
| "epoch": 1.0336763330215155, | |
| "grad_norm": 0.027243314310908318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401641726493835, | |
| "mean_token_accuracy": 0.7808064818382263, | |
| "num_tokens": 4505862.0, | |
| "step": 277 | |
| }, | |
| { | |
| "entropy": 0.5509742796421051, | |
| "epoch": 1.0374181478016837, | |
| "grad_norm": 0.032545655965805054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521466732025146, | |
| "mean_token_accuracy": 0.7762803286314011, | |
| "num_tokens": 4522135.0, | |
| "step": 278 | |
| }, | |
| { | |
| "entropy": 0.5502415001392365, | |
| "epoch": 1.0411599625818522, | |
| "grad_norm": 0.030756743624806404, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506622195243835, | |
| "mean_token_accuracy": 0.7758103907108307, | |
| "num_tokens": 4538594.0, | |
| "step": 279 | |
| }, | |
| { | |
| "entropy": 0.5414353311061859, | |
| "epoch": 1.0449017773620206, | |
| "grad_norm": 0.030841531231999397, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470583438873291, | |
| "mean_token_accuracy": 0.7776292413473129, | |
| "num_tokens": 4555119.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.5487425029277802, | |
| "epoch": 1.048643592142189, | |
| "grad_norm": 0.03335481509566307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511153936386108, | |
| "mean_token_accuracy": 0.7753961086273193, | |
| "num_tokens": 4571676.0, | |
| "step": 281 | |
| }, | |
| { | |
| "entropy": 0.5364932715892792, | |
| "epoch": 1.0523854069223573, | |
| "grad_norm": 0.03433723747730255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5388063788414001, | |
| "mean_token_accuracy": 0.7791535705327988, | |
| "num_tokens": 4587803.0, | |
| "step": 282 | |
| }, | |
| { | |
| "entropy": 0.5218682438135147, | |
| "epoch": 1.0561272217025257, | |
| "grad_norm": 0.03049764409661293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5254226922988892, | |
| "mean_token_accuracy": 0.7847179919481277, | |
| "num_tokens": 4603856.0, | |
| "step": 283 | |
| }, | |
| { | |
| "entropy": 0.5384526699781418, | |
| "epoch": 1.0598690364826941, | |
| "grad_norm": 0.02954094670712948, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442904829978943, | |
| "mean_token_accuracy": 0.7810987532138824, | |
| "num_tokens": 4619957.0, | |
| "step": 284 | |
| }, | |
| { | |
| "entropy": 0.5648271888494492, | |
| "epoch": 1.0636108512628626, | |
| "grad_norm": 0.029273223131895065, | |
| "learning_rate": 0.0002, | |
| "loss": 0.565851628780365, | |
| "mean_token_accuracy": 0.7694031447172165, | |
| "num_tokens": 4636366.0, | |
| "step": 285 | |
| }, | |
| { | |
| "entropy": 0.5445346832275391, | |
| "epoch": 1.0673526660430308, | |
| "grad_norm": 0.04154031351208687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437869429588318, | |
| "mean_token_accuracy": 0.7786456942558289, | |
| "num_tokens": 4652409.0, | |
| "step": 286 | |
| }, | |
| { | |
| "entropy": 0.5666444450616837, | |
| "epoch": 1.0710944808231992, | |
| "grad_norm": 0.027274858206510544, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619191527366638, | |
| "mean_token_accuracy": 0.7713726609945297, | |
| "num_tokens": 4668805.0, | |
| "step": 287 | |
| }, | |
| { | |
| "entropy": 0.5560373812913895, | |
| "epoch": 1.0748362956033677, | |
| "grad_norm": 0.03042946569621563, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536933541297913, | |
| "mean_token_accuracy": 0.7707109302282333, | |
| "num_tokens": 4685281.0, | |
| "step": 288 | |
| }, | |
| { | |
| "entropy": 0.5522497296333313, | |
| "epoch": 1.078578110383536, | |
| "grad_norm": 0.026407577097415924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554541826248169, | |
| "mean_token_accuracy": 0.7723578214645386, | |
| "num_tokens": 4701429.0, | |
| "step": 289 | |
| }, | |
| { | |
| "entropy": 0.5493666082620621, | |
| "epoch": 1.0823199251637043, | |
| "grad_norm": 0.03922448307275772, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535799860954285, | |
| "mean_token_accuracy": 0.7752141654491425, | |
| "num_tokens": 4717787.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.5579231083393097, | |
| "epoch": 1.0860617399438728, | |
| "grad_norm": 0.029233764857053757, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569900274276733, | |
| "mean_token_accuracy": 0.7733462601900101, | |
| "num_tokens": 4734144.0, | |
| "step": 291 | |
| }, | |
| { | |
| "entropy": 0.544972226023674, | |
| "epoch": 1.0898035547240412, | |
| "grad_norm": 0.030961396172642708, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413874983787537, | |
| "mean_token_accuracy": 0.7801695913076401, | |
| "num_tokens": 4750509.0, | |
| "step": 292 | |
| }, | |
| { | |
| "entropy": 0.550209566950798, | |
| "epoch": 1.0935453695042094, | |
| "grad_norm": 0.03252837061882019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514767169952393, | |
| "mean_token_accuracy": 0.7740490287542343, | |
| "num_tokens": 4766708.0, | |
| "step": 293 | |
| }, | |
| { | |
| "entropy": 0.545928418636322, | |
| "epoch": 1.0972871842843779, | |
| "grad_norm": 0.02844078466296196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454370975494385, | |
| "mean_token_accuracy": 0.7802854478359222, | |
| "num_tokens": 4783110.0, | |
| "step": 294 | |
| }, | |
| { | |
| "entropy": 0.550410658121109, | |
| "epoch": 1.1010289990645463, | |
| "grad_norm": 0.0395023413002491, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5610683560371399, | |
| "mean_token_accuracy": 0.7725012004375458, | |
| "num_tokens": 4799492.0, | |
| "step": 295 | |
| }, | |
| { | |
| "entropy": 0.5291745737195015, | |
| "epoch": 1.1047708138447148, | |
| "grad_norm": 0.028669750317931175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332962274551392, | |
| "mean_token_accuracy": 0.7820043116807938, | |
| "num_tokens": 4815864.0, | |
| "step": 296 | |
| }, | |
| { | |
| "entropy": 0.5454689562320709, | |
| "epoch": 1.108512628624883, | |
| "grad_norm": 0.02827887050807476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511517524719238, | |
| "mean_token_accuracy": 0.7747574001550674, | |
| "num_tokens": 4832267.0, | |
| "step": 297 | |
| }, | |
| { | |
| "entropy": 0.5417342334985733, | |
| "epoch": 1.1122544434050514, | |
| "grad_norm": 0.026385854929685593, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412203669548035, | |
| "mean_token_accuracy": 0.780335083603859, | |
| "num_tokens": 4848653.0, | |
| "step": 298 | |
| }, | |
| { | |
| "entropy": 0.5629215389490128, | |
| "epoch": 1.1159962581852199, | |
| "grad_norm": 0.030779633671045303, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5625781416893005, | |
| "mean_token_accuracy": 0.7703746110200882, | |
| "num_tokens": 4865192.0, | |
| "step": 299 | |
| }, | |
| { | |
| "entropy": 0.5278398767113686, | |
| "epoch": 1.1197380729653883, | |
| "grad_norm": 0.02865917608141899, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5246303081512451, | |
| "mean_token_accuracy": 0.7881903648376465, | |
| "num_tokens": 4881315.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.5360843688249588, | |
| "epoch": 1.1234798877455565, | |
| "grad_norm": 0.02863423153758049, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405621528625488, | |
| "mean_token_accuracy": 0.7765359878540039, | |
| "num_tokens": 4897572.0, | |
| "step": 301 | |
| }, | |
| { | |
| "entropy": 0.5270702391862869, | |
| "epoch": 1.127221702525725, | |
| "grad_norm": 0.027807647362351418, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5271122455596924, | |
| "mean_token_accuracy": 0.7830122262239456, | |
| "num_tokens": 4913718.0, | |
| "step": 302 | |
| }, | |
| { | |
| "entropy": 0.5291232466697693, | |
| "epoch": 1.1309635173058934, | |
| "grad_norm": 0.03156433254480362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328850746154785, | |
| "mean_token_accuracy": 0.7853387147188187, | |
| "num_tokens": 4930253.0, | |
| "step": 303 | |
| }, | |
| { | |
| "entropy": 0.5468447655439377, | |
| "epoch": 1.1347053320860618, | |
| "grad_norm": 0.033552881330251694, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545834898948669, | |
| "mean_token_accuracy": 0.7716294378042221, | |
| "num_tokens": 4946382.0, | |
| "step": 304 | |
| }, | |
| { | |
| "entropy": 0.5517953187227249, | |
| "epoch": 1.13844714686623, | |
| "grad_norm": 0.030561944469809532, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540879964828491, | |
| "mean_token_accuracy": 0.7759448438882828, | |
| "num_tokens": 4962652.0, | |
| "step": 305 | |
| }, | |
| { | |
| "entropy": 0.544833779335022, | |
| "epoch": 1.1421889616463985, | |
| "grad_norm": 0.030571507290005684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443115234375, | |
| "mean_token_accuracy": 0.7782190293073654, | |
| "num_tokens": 4978959.0, | |
| "step": 306 | |
| }, | |
| { | |
| "entropy": 0.5475269705057144, | |
| "epoch": 1.145930776426567, | |
| "grad_norm": 0.0296931229531765, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541431188583374, | |
| "mean_token_accuracy": 0.7753712236881256, | |
| "num_tokens": 4995357.0, | |
| "step": 307 | |
| }, | |
| { | |
| "entropy": 0.5446912348270416, | |
| "epoch": 1.1496725912067354, | |
| "grad_norm": 0.025116927921772003, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437968373298645, | |
| "mean_token_accuracy": 0.7787619084119797, | |
| "num_tokens": 5011590.0, | |
| "step": 308 | |
| }, | |
| { | |
| "entropy": 0.5292570069432259, | |
| "epoch": 1.1534144059869036, | |
| "grad_norm": 0.027315491810441017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277875065803528, | |
| "mean_token_accuracy": 0.7833113670349121, | |
| "num_tokens": 5027873.0, | |
| "step": 309 | |
| }, | |
| { | |
| "entropy": 0.5242628306150436, | |
| "epoch": 1.157156220767072, | |
| "grad_norm": 0.027830073609948158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.523070752620697, | |
| "mean_token_accuracy": 0.7879849672317505, | |
| "num_tokens": 5044361.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.536102682352066, | |
| "epoch": 1.1608980355472405, | |
| "grad_norm": 0.031033379957079887, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378351211547852, | |
| "mean_token_accuracy": 0.7815344035625458, | |
| "num_tokens": 5060644.0, | |
| "step": 311 | |
| }, | |
| { | |
| "entropy": 0.5573316812515259, | |
| "epoch": 1.1646398503274087, | |
| "grad_norm": 0.03297853097319603, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5643618106842041, | |
| "mean_token_accuracy": 0.7715043723583221, | |
| "num_tokens": 5077003.0, | |
| "step": 312 | |
| }, | |
| { | |
| "entropy": 0.526486948132515, | |
| "epoch": 1.1683816651075771, | |
| "grad_norm": 0.029532574117183685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367429256439209, | |
| "mean_token_accuracy": 0.7818453460931778, | |
| "num_tokens": 5093120.0, | |
| "step": 313 | |
| }, | |
| { | |
| "entropy": 0.545007973909378, | |
| "epoch": 1.1721234798877456, | |
| "grad_norm": 0.0302292387932539, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474991798400879, | |
| "mean_token_accuracy": 0.7770297825336456, | |
| "num_tokens": 5109333.0, | |
| "step": 314 | |
| }, | |
| { | |
| "entropy": 0.5457079261541367, | |
| "epoch": 1.175865294667914, | |
| "grad_norm": 0.03628959506750107, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456429719924927, | |
| "mean_token_accuracy": 0.779505044221878, | |
| "num_tokens": 5125459.0, | |
| "step": 315 | |
| }, | |
| { | |
| "entropy": 0.5526050478219986, | |
| "epoch": 1.1796071094480822, | |
| "grad_norm": 0.031634826213121414, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504459738731384, | |
| "mean_token_accuracy": 0.7756629437208176, | |
| "num_tokens": 5141755.0, | |
| "step": 316 | |
| }, | |
| { | |
| "entropy": 0.5621381402015686, | |
| "epoch": 1.1833489242282507, | |
| "grad_norm": 0.02932395227253437, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5631870627403259, | |
| "mean_token_accuracy": 0.767949178814888, | |
| "num_tokens": 5158305.0, | |
| "step": 317 | |
| }, | |
| { | |
| "entropy": 0.5412058234214783, | |
| "epoch": 1.187090739008419, | |
| "grad_norm": 0.03077547252178192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5441724061965942, | |
| "mean_token_accuracy": 0.7769438326358795, | |
| "num_tokens": 5174825.0, | |
| "step": 318 | |
| }, | |
| { | |
| "entropy": 0.5375640690326691, | |
| "epoch": 1.1908325537885875, | |
| "grad_norm": 0.0300463754683733, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393084287643433, | |
| "mean_token_accuracy": 0.782392755150795, | |
| "num_tokens": 5190829.0, | |
| "step": 319 | |
| }, | |
| { | |
| "entropy": 0.5544911473989487, | |
| "epoch": 1.1945743685687558, | |
| "grad_norm": 0.03089406155049801, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5512977838516235, | |
| "mean_token_accuracy": 0.7745725959539413, | |
| "num_tokens": 5207283.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.5496610552072525, | |
| "epoch": 1.1983161833489242, | |
| "grad_norm": 0.03022005409002304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407426357269287, | |
| "mean_token_accuracy": 0.7819069474935532, | |
| "num_tokens": 5223759.0, | |
| "step": 321 | |
| }, | |
| { | |
| "entropy": 0.5536633729934692, | |
| "epoch": 1.2020579981290926, | |
| "grad_norm": 0.03297387808561325, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5543879866600037, | |
| "mean_token_accuracy": 0.7727649062871933, | |
| "num_tokens": 5240096.0, | |
| "step": 322 | |
| }, | |
| { | |
| "entropy": 0.5441806763410568, | |
| "epoch": 1.205799812909261, | |
| "grad_norm": 0.029116200283169746, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444720387458801, | |
| "mean_token_accuracy": 0.7814431339502335, | |
| "num_tokens": 5256670.0, | |
| "step": 323 | |
| }, | |
| { | |
| "entropy": 0.5429923981428146, | |
| "epoch": 1.2095416276894293, | |
| "grad_norm": 0.03505397588014603, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506747961044312, | |
| "mean_token_accuracy": 0.7763912379741669, | |
| "num_tokens": 5272766.0, | |
| "step": 324 | |
| }, | |
| { | |
| "entropy": 0.5270697474479675, | |
| "epoch": 1.2132834424695977, | |
| "grad_norm": 0.039405617862939835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409681797027588, | |
| "mean_token_accuracy": 0.7786189615726471, | |
| "num_tokens": 5289123.0, | |
| "step": 325 | |
| }, | |
| { | |
| "entropy": 0.558641791343689, | |
| "epoch": 1.2170252572497662, | |
| "grad_norm": 0.029413288459181786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564137697219849, | |
| "mean_token_accuracy": 0.7740890085697174, | |
| "num_tokens": 5305503.0, | |
| "step": 326 | |
| }, | |
| { | |
| "entropy": 0.5550449192523956, | |
| "epoch": 1.2207670720299344, | |
| "grad_norm": 0.031028373166918755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544853210449219, | |
| "mean_token_accuracy": 0.7716324329376221, | |
| "num_tokens": 5321885.0, | |
| "step": 327 | |
| }, | |
| { | |
| "entropy": 0.5564998090267181, | |
| "epoch": 1.2245088868101028, | |
| "grad_norm": 0.034970104694366455, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547239184379578, | |
| "mean_token_accuracy": 0.7719462513923645, | |
| "num_tokens": 5338376.0, | |
| "step": 328 | |
| }, | |
| { | |
| "entropy": 0.5593426823616028, | |
| "epoch": 1.2282507015902713, | |
| "grad_norm": 0.030654314905405045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5594889521598816, | |
| "mean_token_accuracy": 0.7690505534410477, | |
| "num_tokens": 5354745.0, | |
| "step": 329 | |
| }, | |
| { | |
| "entropy": 0.5594028532505035, | |
| "epoch": 1.2319925163704397, | |
| "grad_norm": 0.02985675260424614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.560926079750061, | |
| "mean_token_accuracy": 0.771067887544632, | |
| "num_tokens": 5371364.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.5444284975528717, | |
| "epoch": 1.2357343311506082, | |
| "grad_norm": 0.0331130288541317, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528807044029236, | |
| "mean_token_accuracy": 0.7744182050228119, | |
| "num_tokens": 5387884.0, | |
| "step": 331 | |
| }, | |
| { | |
| "entropy": 0.5535553693771362, | |
| "epoch": 1.2394761459307764, | |
| "grad_norm": 0.035860270261764526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5612154603004456, | |
| "mean_token_accuracy": 0.7728609591722488, | |
| "num_tokens": 5404143.0, | |
| "step": 332 | |
| }, | |
| { | |
| "entropy": 0.5594320446252823, | |
| "epoch": 1.2432179607109448, | |
| "grad_norm": 0.030857175588607788, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495461225509644, | |
| "mean_token_accuracy": 0.7783895283937454, | |
| "num_tokens": 5420613.0, | |
| "step": 333 | |
| }, | |
| { | |
| "entropy": 0.5738644152879715, | |
| "epoch": 1.2469597754911133, | |
| "grad_norm": 0.02752659097313881, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5670571327209473, | |
| "mean_token_accuracy": 0.7706948518753052, | |
| "num_tokens": 5437025.0, | |
| "step": 334 | |
| }, | |
| { | |
| "entropy": 0.5468066483736038, | |
| "epoch": 1.2507015902712815, | |
| "grad_norm": 0.030105959624052048, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448632836341858, | |
| "mean_token_accuracy": 0.7777069211006165, | |
| "num_tokens": 5453431.0, | |
| "step": 335 | |
| }, | |
| { | |
| "entropy": 0.5508809983730316, | |
| "epoch": 1.25444340505145, | |
| "grad_norm": 0.031137077137827873, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5581130981445312, | |
| "mean_token_accuracy": 0.7730289697647095, | |
| "num_tokens": 5469727.0, | |
| "step": 336 | |
| }, | |
| { | |
| "entropy": 0.5199557095766068, | |
| "epoch": 1.2581852198316184, | |
| "grad_norm": 0.033218562602996826, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353677272796631, | |
| "mean_token_accuracy": 0.7836348563432693, | |
| "num_tokens": 5485615.0, | |
| "step": 337 | |
| }, | |
| { | |
| "entropy": 0.5402327626943588, | |
| "epoch": 1.2619270346117868, | |
| "grad_norm": 0.02909061312675476, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445257425308228, | |
| "mean_token_accuracy": 0.7775768637657166, | |
| "num_tokens": 5501846.0, | |
| "step": 338 | |
| }, | |
| { | |
| "entropy": 0.5657909214496613, | |
| "epoch": 1.2656688493919552, | |
| "grad_norm": 0.03052118793129921, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5672930479049683, | |
| "mean_token_accuracy": 0.7675611525774002, | |
| "num_tokens": 5518365.0, | |
| "step": 339 | |
| }, | |
| { | |
| "entropy": 0.5483649671077728, | |
| "epoch": 1.2694106641721234, | |
| "grad_norm": 0.02786743827164173, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456503033638, | |
| "mean_token_accuracy": 0.7791422605514526, | |
| "num_tokens": 5534639.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.5500437468290329, | |
| "epoch": 1.2731524789522919, | |
| "grad_norm": 0.03155668452382088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.545000433921814, | |
| "mean_token_accuracy": 0.7803118973970413, | |
| "num_tokens": 5551093.0, | |
| "step": 341 | |
| }, | |
| { | |
| "entropy": 0.5697951167821884, | |
| "epoch": 1.27689429373246, | |
| "grad_norm": 0.03075268305838108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5609626173973083, | |
| "mean_token_accuracy": 0.7723665684461594, | |
| "num_tokens": 5567707.0, | |
| "step": 342 | |
| }, | |
| { | |
| "entropy": 0.544351652264595, | |
| "epoch": 1.2806361085126285, | |
| "grad_norm": 0.03238390013575554, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533734560012817, | |
| "mean_token_accuracy": 0.7754608392715454, | |
| "num_tokens": 5584155.0, | |
| "step": 343 | |
| }, | |
| { | |
| "entropy": 0.5441059172153473, | |
| "epoch": 1.284377923292797, | |
| "grad_norm": 0.02793728932738304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470475554466248, | |
| "mean_token_accuracy": 0.7781476378440857, | |
| "num_tokens": 5600585.0, | |
| "step": 344 | |
| }, | |
| { | |
| "entropy": 0.5576403886079788, | |
| "epoch": 1.2881197380729654, | |
| "grad_norm": 0.0332297645509243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591012835502625, | |
| "mean_token_accuracy": 0.7717157751321793, | |
| "num_tokens": 5616865.0, | |
| "step": 345 | |
| }, | |
| { | |
| "entropy": 0.5582529455423355, | |
| "epoch": 1.2918615528531339, | |
| "grad_norm": 0.028861626982688904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597870349884033, | |
| "mean_token_accuracy": 0.7722600847482681, | |
| "num_tokens": 5633131.0, | |
| "step": 346 | |
| }, | |
| { | |
| "entropy": 0.5537585616111755, | |
| "epoch": 1.295603367633302, | |
| "grad_norm": 0.027739623561501503, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5517114996910095, | |
| "mean_token_accuracy": 0.7751765549182892, | |
| "num_tokens": 5649621.0, | |
| "step": 347 | |
| }, | |
| { | |
| "entropy": 0.5722759366035461, | |
| "epoch": 1.2993451824134705, | |
| "grad_norm": 0.029868733137845993, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5697493553161621, | |
| "mean_token_accuracy": 0.769178032875061, | |
| "num_tokens": 5666058.0, | |
| "step": 348 | |
| }, | |
| { | |
| "entropy": 0.5482298284769058, | |
| "epoch": 1.303086997193639, | |
| "grad_norm": 0.02905650995671749, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505189895629883, | |
| "mean_token_accuracy": 0.7772009968757629, | |
| "num_tokens": 5682272.0, | |
| "step": 349 | |
| }, | |
| { | |
| "entropy": 0.5623439997434616, | |
| "epoch": 1.3068288119738072, | |
| "grad_norm": 0.028680406510829926, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5615631937980652, | |
| "mean_token_accuracy": 0.7712025493383408, | |
| "num_tokens": 5698796.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.5541074424982071, | |
| "epoch": 1.3105706267539756, | |
| "grad_norm": 0.03431180492043495, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5617666244506836, | |
| "mean_token_accuracy": 0.7705400139093399, | |
| "num_tokens": 5714994.0, | |
| "step": 351 | |
| }, | |
| { | |
| "entropy": 0.5405305176973343, | |
| "epoch": 1.314312441534144, | |
| "grad_norm": 0.03283194825053215, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538750946521759, | |
| "mean_token_accuracy": 0.7778624445199966, | |
| "num_tokens": 5731263.0, | |
| "step": 352 | |
| }, | |
| { | |
| "entropy": 0.5537361800670624, | |
| "epoch": 1.3180542563143125, | |
| "grad_norm": 0.03157467022538185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.556831955909729, | |
| "mean_token_accuracy": 0.7720046639442444, | |
| "num_tokens": 5747576.0, | |
| "step": 353 | |
| }, | |
| { | |
| "entropy": 0.5540541112422943, | |
| "epoch": 1.321796071094481, | |
| "grad_norm": 0.03315872326493263, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560564398765564, | |
| "mean_token_accuracy": 0.7747179567813873, | |
| "num_tokens": 5763875.0, | |
| "step": 354 | |
| }, | |
| { | |
| "entropy": 0.5485205948352814, | |
| "epoch": 1.3255378858746492, | |
| "grad_norm": 0.029158933088183403, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474769473075867, | |
| "mean_token_accuracy": 0.7769359052181244, | |
| "num_tokens": 5780494.0, | |
| "step": 355 | |
| }, | |
| { | |
| "entropy": 0.5560560077428818, | |
| "epoch": 1.3292797006548176, | |
| "grad_norm": 0.03023948147892952, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578330159187317, | |
| "mean_token_accuracy": 0.7706339210271835, | |
| "num_tokens": 5796776.0, | |
| "step": 356 | |
| }, | |
| { | |
| "entropy": 0.5549474805593491, | |
| "epoch": 1.333021515434986, | |
| "grad_norm": 0.03123750351369381, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531733632087708, | |
| "mean_token_accuracy": 0.7738355994224548, | |
| "num_tokens": 5813225.0, | |
| "step": 357 | |
| }, | |
| { | |
| "entropy": 0.5446926355361938, | |
| "epoch": 1.3367633302151543, | |
| "grad_norm": 0.03854469954967499, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5561398863792419, | |
| "mean_token_accuracy": 0.7719077616930008, | |
| "num_tokens": 5829411.0, | |
| "step": 358 | |
| }, | |
| { | |
| "entropy": 0.5601906925439835, | |
| "epoch": 1.3405051449953227, | |
| "grad_norm": 0.025615639984607697, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5579116940498352, | |
| "mean_token_accuracy": 0.7725162506103516, | |
| "num_tokens": 5845753.0, | |
| "step": 359 | |
| }, | |
| { | |
| "entropy": 0.557614728808403, | |
| "epoch": 1.3442469597754911, | |
| "grad_norm": 0.026924598962068558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500644445419312, | |
| "mean_token_accuracy": 0.7740714848041534, | |
| "num_tokens": 5861927.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.5535576045513153, | |
| "epoch": 1.3479887745556596, | |
| "grad_norm": 0.031272657215595245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418438911437988, | |
| "mean_token_accuracy": 0.780152902007103, | |
| "num_tokens": 5878289.0, | |
| "step": 361 | |
| }, | |
| { | |
| "entropy": 0.5407048761844635, | |
| "epoch": 1.351730589335828, | |
| "grad_norm": 0.031007423996925354, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5493313670158386, | |
| "mean_token_accuracy": 0.7764623165130615, | |
| "num_tokens": 5894592.0, | |
| "step": 362 | |
| }, | |
| { | |
| "entropy": 0.5239751785993576, | |
| "epoch": 1.3554724041159962, | |
| "grad_norm": 0.03374086320400238, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5344395041465759, | |
| "mean_token_accuracy": 0.7812817394733429, | |
| "num_tokens": 5910863.0, | |
| "step": 363 | |
| }, | |
| { | |
| "entropy": 0.5377437621355057, | |
| "epoch": 1.3592142188961647, | |
| "grad_norm": 0.04066803306341171, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502558946609497, | |
| "mean_token_accuracy": 0.7735230922698975, | |
| "num_tokens": 5927169.0, | |
| "step": 364 | |
| }, | |
| { | |
| "entropy": 0.5404135584831238, | |
| "epoch": 1.362956033676333, | |
| "grad_norm": 0.030103564262390137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431765913963318, | |
| "mean_token_accuracy": 0.780334860086441, | |
| "num_tokens": 5943288.0, | |
| "step": 365 | |
| }, | |
| { | |
| "entropy": 0.5349705293774605, | |
| "epoch": 1.3666978484565013, | |
| "grad_norm": 0.031804051250219345, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298077464103699, | |
| "mean_token_accuracy": 0.7834766954183578, | |
| "num_tokens": 5959662.0, | |
| "step": 366 | |
| }, | |
| { | |
| "entropy": 0.5429814159870148, | |
| "epoch": 1.3704396632366698, | |
| "grad_norm": 0.04628051444888115, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361793041229248, | |
| "mean_token_accuracy": 0.7793655544519424, | |
| "num_tokens": 5976139.0, | |
| "step": 367 | |
| }, | |
| { | |
| "entropy": 0.5505317896604538, | |
| "epoch": 1.3741814780168382, | |
| "grad_norm": 0.03267182409763336, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444616675376892, | |
| "mean_token_accuracy": 0.7798040062189102, | |
| "num_tokens": 5992476.0, | |
| "step": 368 | |
| }, | |
| { | |
| "entropy": 0.5407690107822418, | |
| "epoch": 1.3779232927970066, | |
| "grad_norm": 0.0353633388876915, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501353740692139, | |
| "mean_token_accuracy": 0.7760691344738007, | |
| "num_tokens": 6008641.0, | |
| "step": 369 | |
| }, | |
| { | |
| "entropy": 0.5465443283319473, | |
| "epoch": 1.3816651075771749, | |
| "grad_norm": 0.044324446469545364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5564755201339722, | |
| "mean_token_accuracy": 0.775538980960846, | |
| "num_tokens": 6024769.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.5609740614891052, | |
| "epoch": 1.3854069223573433, | |
| "grad_norm": 0.03593122959136963, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5629419088363647, | |
| "mean_token_accuracy": 0.7691068351268768, | |
| "num_tokens": 6041060.0, | |
| "step": 371 | |
| }, | |
| { | |
| "entropy": 0.5421721637248993, | |
| "epoch": 1.3891487371375117, | |
| "grad_norm": 0.03346877172589302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368991494178772, | |
| "mean_token_accuracy": 0.7809954136610031, | |
| "num_tokens": 6057328.0, | |
| "step": 372 | |
| }, | |
| { | |
| "entropy": 0.5421962440013885, | |
| "epoch": 1.39289055191768, | |
| "grad_norm": 0.036160413175821304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371009111404419, | |
| "mean_token_accuracy": 0.7804526090621948, | |
| "num_tokens": 6073633.0, | |
| "step": 373 | |
| }, | |
| { | |
| "entropy": 0.5545593798160553, | |
| "epoch": 1.3966323666978484, | |
| "grad_norm": 0.03285996615886688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528316497802734, | |
| "mean_token_accuracy": 0.7778345346450806, | |
| "num_tokens": 6090142.0, | |
| "step": 374 | |
| }, | |
| { | |
| "entropy": 0.5461311042308807, | |
| "epoch": 1.4003741814780168, | |
| "grad_norm": 0.03481744974851608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470185279846191, | |
| "mean_token_accuracy": 0.7769876271486282, | |
| "num_tokens": 6106491.0, | |
| "step": 375 | |
| }, | |
| { | |
| "entropy": 0.5363553166389465, | |
| "epoch": 1.4041159962581853, | |
| "grad_norm": 0.029494671151041985, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371567010879517, | |
| "mean_token_accuracy": 0.78060382604599, | |
| "num_tokens": 6122724.0, | |
| "step": 376 | |
| }, | |
| { | |
| "entropy": 0.5401545614004135, | |
| "epoch": 1.4078578110383537, | |
| "grad_norm": 0.030447613447904587, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506365299224854, | |
| "mean_token_accuracy": 0.7772665321826935, | |
| "num_tokens": 6139127.0, | |
| "step": 377 | |
| }, | |
| { | |
| "entropy": 0.5432114005088806, | |
| "epoch": 1.411599625818522, | |
| "grad_norm": 0.03443232551217079, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483974814414978, | |
| "mean_token_accuracy": 0.7753057479858398, | |
| "num_tokens": 6155228.0, | |
| "step": 378 | |
| }, | |
| { | |
| "entropy": 0.5419820547103882, | |
| "epoch": 1.4153414405986904, | |
| "grad_norm": 0.030418474227190018, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432078838348389, | |
| "mean_token_accuracy": 0.7786633670330048, | |
| "num_tokens": 6171661.0, | |
| "step": 379 | |
| }, | |
| { | |
| "entropy": 0.5554294884204865, | |
| "epoch": 1.4190832553788588, | |
| "grad_norm": 0.028558963909745216, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5531105995178223, | |
| "mean_token_accuracy": 0.7719776481389999, | |
| "num_tokens": 6187948.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.5308730006217957, | |
| "epoch": 1.422825070159027, | |
| "grad_norm": 0.03490149602293968, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338871479034424, | |
| "mean_token_accuracy": 0.7831013798713684, | |
| "num_tokens": 6203996.0, | |
| "step": 381 | |
| }, | |
| { | |
| "entropy": 0.5621105879545212, | |
| "epoch": 1.4265668849391955, | |
| "grad_norm": 0.03489487245678902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650954246520996, | |
| "mean_token_accuracy": 0.7674195319414139, | |
| "num_tokens": 6220346.0, | |
| "step": 382 | |
| }, | |
| { | |
| "entropy": 0.5624908655881882, | |
| "epoch": 1.430308699719364, | |
| "grad_norm": 0.02940392680466175, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5624366998672485, | |
| "mean_token_accuracy": 0.769148588180542, | |
| "num_tokens": 6236743.0, | |
| "step": 383 | |
| }, | |
| { | |
| "entropy": 0.5363715589046478, | |
| "epoch": 1.4340505144995324, | |
| "grad_norm": 0.028942115604877472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5339908599853516, | |
| "mean_token_accuracy": 0.7834934592247009, | |
| "num_tokens": 6252708.0, | |
| "step": 384 | |
| }, | |
| { | |
| "entropy": 0.5408411026000977, | |
| "epoch": 1.4377923292797006, | |
| "grad_norm": 0.0305769219994545, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352215766906738, | |
| "mean_token_accuracy": 0.7860714495182037, | |
| "num_tokens": 6268903.0, | |
| "step": 385 | |
| }, | |
| { | |
| "entropy": 0.5410628318786621, | |
| "epoch": 1.441534144059869, | |
| "grad_norm": 0.029285579919815063, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5426855087280273, | |
| "mean_token_accuracy": 0.7768432199954987, | |
| "num_tokens": 6284894.0, | |
| "step": 386 | |
| }, | |
| { | |
| "entropy": 0.5362880975008011, | |
| "epoch": 1.4452759588400375, | |
| "grad_norm": 0.03178134933114052, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503253936767578, | |
| "mean_token_accuracy": 0.7759049534797668, | |
| "num_tokens": 6301216.0, | |
| "step": 387 | |
| }, | |
| { | |
| "entropy": 0.5453620404005051, | |
| "epoch": 1.4490177736202057, | |
| "grad_norm": 0.029615160077810287, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5539615154266357, | |
| "mean_token_accuracy": 0.7736871391534805, | |
| "num_tokens": 6317584.0, | |
| "step": 388 | |
| }, | |
| { | |
| "entropy": 0.5552696138620377, | |
| "epoch": 1.4527595884003741, | |
| "grad_norm": 0.03214653581380844, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5597580671310425, | |
| "mean_token_accuracy": 0.7707493901252747, | |
| "num_tokens": 6333884.0, | |
| "step": 389 | |
| }, | |
| { | |
| "entropy": 0.553122490644455, | |
| "epoch": 1.4565014031805426, | |
| "grad_norm": 0.029804600402712822, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552976131439209, | |
| "mean_token_accuracy": 0.778336301445961, | |
| "num_tokens": 6350141.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.5826992094516754, | |
| "epoch": 1.460243217960711, | |
| "grad_norm": 0.03438711538910866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5765487551689148, | |
| "mean_token_accuracy": 0.7643037289381027, | |
| "num_tokens": 6366374.0, | |
| "step": 391 | |
| }, | |
| { | |
| "entropy": 0.5606750249862671, | |
| "epoch": 1.4639850327408794, | |
| "grad_norm": 0.030389849096536636, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5595695376396179, | |
| "mean_token_accuracy": 0.7718200087547302, | |
| "num_tokens": 6382848.0, | |
| "step": 392 | |
| }, | |
| { | |
| "entropy": 0.5619854032993317, | |
| "epoch": 1.4677268475210477, | |
| "grad_norm": 0.032461296766996384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576058030128479, | |
| "mean_token_accuracy": 0.7746401876211166, | |
| "num_tokens": 6399173.0, | |
| "step": 393 | |
| }, | |
| { | |
| "entropy": 0.5408260822296143, | |
| "epoch": 1.471468662301216, | |
| "grad_norm": 0.03529435396194458, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5456345081329346, | |
| "mean_token_accuracy": 0.7788489162921906, | |
| "num_tokens": 6415565.0, | |
| "step": 394 | |
| }, | |
| { | |
| "entropy": 0.5425965934991837, | |
| "epoch": 1.4752104770813845, | |
| "grad_norm": 0.03692852333188057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5488424301147461, | |
| "mean_token_accuracy": 0.7782263904809952, | |
| "num_tokens": 6431912.0, | |
| "step": 395 | |
| }, | |
| { | |
| "entropy": 0.5516340583562851, | |
| "epoch": 1.4789522918615527, | |
| "grad_norm": 0.031000891700387, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5553445219993591, | |
| "mean_token_accuracy": 0.7752650529146194, | |
| "num_tokens": 6448548.0, | |
| "step": 396 | |
| }, | |
| { | |
| "entropy": 0.538574829697609, | |
| "epoch": 1.4826941066417212, | |
| "grad_norm": 0.030864855274558067, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5368215441703796, | |
| "mean_token_accuracy": 0.7809993326663971, | |
| "num_tokens": 6465030.0, | |
| "step": 397 | |
| }, | |
| { | |
| "entropy": 0.5717963427305222, | |
| "epoch": 1.4864359214218896, | |
| "grad_norm": 0.033221229910850525, | |
| "learning_rate": 0.0002, | |
| "loss": 0.571186363697052, | |
| "mean_token_accuracy": 0.7653579860925674, | |
| "num_tokens": 6481528.0, | |
| "step": 398 | |
| }, | |
| { | |
| "entropy": 0.5418017208576202, | |
| "epoch": 1.490177736202058, | |
| "grad_norm": 0.04067196696996689, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442001223564148, | |
| "mean_token_accuracy": 0.7763307839632034, | |
| "num_tokens": 6497840.0, | |
| "step": 399 | |
| }, | |
| { | |
| "entropy": 0.5547621697187424, | |
| "epoch": 1.4939195509822265, | |
| "grad_norm": 0.03348267823457718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626781582832336, | |
| "mean_token_accuracy": 0.7712242007255554, | |
| "num_tokens": 6514349.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.5494479835033417, | |
| "epoch": 1.4976613657623947, | |
| "grad_norm": 0.03362090513110161, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548977792263031, | |
| "mean_token_accuracy": 0.7767577767372131, | |
| "num_tokens": 6530749.0, | |
| "step": 401 | |
| }, | |
| { | |
| "entropy": 0.5626181960105896, | |
| "epoch": 1.5014031805425632, | |
| "grad_norm": 0.03137248754501343, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5654096603393555, | |
| "mean_token_accuracy": 0.7723931819200516, | |
| "num_tokens": 6547276.0, | |
| "step": 402 | |
| }, | |
| { | |
| "entropy": 0.5499662905931473, | |
| "epoch": 1.5051449953227314, | |
| "grad_norm": 0.034359052777290344, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508401393890381, | |
| "mean_token_accuracy": 0.7756681442260742, | |
| "num_tokens": 6563580.0, | |
| "step": 403 | |
| }, | |
| { | |
| "entropy": 0.5658421665430069, | |
| "epoch": 1.5088868101028998, | |
| "grad_norm": 0.030933788046240807, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622308254241943, | |
| "mean_token_accuracy": 0.769567608833313, | |
| "num_tokens": 6579736.0, | |
| "step": 404 | |
| }, | |
| { | |
| "entropy": 0.547087088227272, | |
| "epoch": 1.5126286248830683, | |
| "grad_norm": 0.030160700902342796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470564961433411, | |
| "mean_token_accuracy": 0.7781479358673096, | |
| "num_tokens": 6596131.0, | |
| "step": 405 | |
| }, | |
| { | |
| "entropy": 0.5563077032566071, | |
| "epoch": 1.5163704396632367, | |
| "grad_norm": 0.029513506218791008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5557488799095154, | |
| "mean_token_accuracy": 0.7776722609996796, | |
| "num_tokens": 6612499.0, | |
| "step": 406 | |
| }, | |
| { | |
| "entropy": 0.5473329573869705, | |
| "epoch": 1.5201122544434051, | |
| "grad_norm": 0.031187692657113075, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444590449333191, | |
| "mean_token_accuracy": 0.7770859450101852, | |
| "num_tokens": 6628905.0, | |
| "step": 407 | |
| }, | |
| { | |
| "entropy": 0.5493151396512985, | |
| "epoch": 1.5238540692235736, | |
| "grad_norm": 0.027274703606963158, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559489130973816, | |
| "mean_token_accuracy": 0.774099811911583, | |
| "num_tokens": 6645207.0, | |
| "step": 408 | |
| }, | |
| { | |
| "entropy": 0.5369315445423126, | |
| "epoch": 1.5275958840037418, | |
| "grad_norm": 0.03280489146709442, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494750738143921, | |
| "mean_token_accuracy": 0.7781352549791336, | |
| "num_tokens": 6661441.0, | |
| "step": 409 | |
| }, | |
| { | |
| "entropy": 0.543188214302063, | |
| "epoch": 1.5313376987839102, | |
| "grad_norm": 0.0317704938352108, | |
| "learning_rate": 0.0002, | |
| "loss": 0.548348069190979, | |
| "mean_token_accuracy": 0.7779366374015808, | |
| "num_tokens": 6677890.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.5514375120401382, | |
| "epoch": 1.5350795135640785, | |
| "grad_norm": 0.02904539741575718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532687902450562, | |
| "mean_token_accuracy": 0.776079460978508, | |
| "num_tokens": 6694229.0, | |
| "step": 411 | |
| }, | |
| { | |
| "entropy": 0.5228893607854843, | |
| "epoch": 1.538821328344247, | |
| "grad_norm": 0.027841076254844666, | |
| "learning_rate": 0.0002, | |
| "loss": 0.522330641746521, | |
| "mean_token_accuracy": 0.7864255011081696, | |
| "num_tokens": 6710250.0, | |
| "step": 412 | |
| }, | |
| { | |
| "entropy": 0.5390310734510422, | |
| "epoch": 1.5425631431244153, | |
| "grad_norm": 0.02716185338795185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395499467849731, | |
| "mean_token_accuracy": 0.7826422601938248, | |
| "num_tokens": 6726768.0, | |
| "step": 413 | |
| }, | |
| { | |
| "entropy": 0.5508141964673996, | |
| "epoch": 1.5463049579045838, | |
| "grad_norm": 0.030815092846751213, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503819584846497, | |
| "mean_token_accuracy": 0.7755144089460373, | |
| "num_tokens": 6743055.0, | |
| "step": 414 | |
| }, | |
| { | |
| "entropy": 0.5312939435243607, | |
| "epoch": 1.5500467726847522, | |
| "grad_norm": 0.028637485578656197, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298642516136169, | |
| "mean_token_accuracy": 0.7852569371461868, | |
| "num_tokens": 6759442.0, | |
| "step": 415 | |
| }, | |
| { | |
| "entropy": 0.5471786260604858, | |
| "epoch": 1.5537885874649204, | |
| "grad_norm": 0.030604762956500053, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5502840876579285, | |
| "mean_token_accuracy": 0.7758130580186844, | |
| "num_tokens": 6775919.0, | |
| "step": 416 | |
| }, | |
| { | |
| "entropy": 0.5734788477420807, | |
| "epoch": 1.5575304022450889, | |
| "grad_norm": 0.033530574291944504, | |
| "learning_rate": 0.0002, | |
| "loss": 0.573567807674408, | |
| "mean_token_accuracy": 0.7666918784379959, | |
| "num_tokens": 6792496.0, | |
| "step": 417 | |
| }, | |
| { | |
| "entropy": 0.5556947290897369, | |
| "epoch": 1.561272217025257, | |
| "grad_norm": 0.029095808044075966, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5506360530853271, | |
| "mean_token_accuracy": 0.7765111029148102, | |
| "num_tokens": 6809055.0, | |
| "step": 418 | |
| }, | |
| { | |
| "entropy": 0.5287731885910034, | |
| "epoch": 1.5650140318054255, | |
| "grad_norm": 0.03587370365858078, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343160033226013, | |
| "mean_token_accuracy": 0.7836072146892548, | |
| "num_tokens": 6825353.0, | |
| "step": 419 | |
| }, | |
| { | |
| "entropy": 0.5342409163713455, | |
| "epoch": 1.568755846585594, | |
| "grad_norm": 0.03603408485651016, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5409013628959656, | |
| "mean_token_accuracy": 0.7804750800132751, | |
| "num_tokens": 6841745.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.5486701726913452, | |
| "epoch": 1.5724976613657624, | |
| "grad_norm": 0.02864743210375309, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5528161525726318, | |
| "mean_token_accuracy": 0.7741836905479431, | |
| "num_tokens": 6857942.0, | |
| "step": 421 | |
| }, | |
| { | |
| "entropy": 0.5741837024688721, | |
| "epoch": 1.5762394761459309, | |
| "grad_norm": 0.0320119671523571, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5608420372009277, | |
| "mean_token_accuracy": 0.7707283794879913, | |
| "num_tokens": 6874193.0, | |
| "step": 422 | |
| }, | |
| { | |
| "entropy": 0.5495482236146927, | |
| "epoch": 1.5799812909260993, | |
| "grad_norm": 0.02604423463344574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479333400726318, | |
| "mean_token_accuracy": 0.7773087471723557, | |
| "num_tokens": 6890547.0, | |
| "step": 423 | |
| }, | |
| { | |
| "entropy": 0.5387884378433228, | |
| "epoch": 1.5837231057062675, | |
| "grad_norm": 0.03170885518193245, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462484359741211, | |
| "mean_token_accuracy": 0.7735171020030975, | |
| "num_tokens": 6906920.0, | |
| "step": 424 | |
| }, | |
| { | |
| "entropy": 0.539916068315506, | |
| "epoch": 1.587464920486436, | |
| "grad_norm": 0.03372619301080704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542754590511322, | |
| "mean_token_accuracy": 0.7796132117509842, | |
| "num_tokens": 6923352.0, | |
| "step": 425 | |
| }, | |
| { | |
| "entropy": 0.5413663387298584, | |
| "epoch": 1.5912067352666042, | |
| "grad_norm": 0.02999868616461754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444542765617371, | |
| "mean_token_accuracy": 0.7786892652511597, | |
| "num_tokens": 6939337.0, | |
| "step": 426 | |
| }, | |
| { | |
| "entropy": 0.556038424372673, | |
| "epoch": 1.5949485500467726, | |
| "grad_norm": 0.03419700264930725, | |
| "learning_rate": 0.0002, | |
| "loss": 0.550898015499115, | |
| "mean_token_accuracy": 0.7760495245456696, | |
| "num_tokens": 6955389.0, | |
| "step": 427 | |
| }, | |
| { | |
| "entropy": 0.5516718029975891, | |
| "epoch": 1.598690364826941, | |
| "grad_norm": 0.0298128854483366, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5519053339958191, | |
| "mean_token_accuracy": 0.7739587277173996, | |
| "num_tokens": 6971808.0, | |
| "step": 428 | |
| }, | |
| { | |
| "entropy": 0.5532359778881073, | |
| "epoch": 1.6024321796071095, | |
| "grad_norm": 0.03213290125131607, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568399429321289, | |
| "mean_token_accuracy": 0.7753729224205017, | |
| "num_tokens": 6988128.0, | |
| "step": 429 | |
| }, | |
| { | |
| "entropy": 0.5382643342018127, | |
| "epoch": 1.606173994387278, | |
| "grad_norm": 0.031161464750766754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440113544464111, | |
| "mean_token_accuracy": 0.7779531329870224, | |
| "num_tokens": 7004368.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5313677787780762, | |
| "epoch": 1.6099158091674464, | |
| "grad_norm": 0.036605071276426315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367435216903687, | |
| "mean_token_accuracy": 0.7821811884641647, | |
| "num_tokens": 7020480.0, | |
| "step": 431 | |
| }, | |
| { | |
| "entropy": 0.5567297488451004, | |
| "epoch": 1.6136576239476146, | |
| "grad_norm": 0.027995243668556213, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547551512718201, | |
| "mean_token_accuracy": 0.7722228318452835, | |
| "num_tokens": 7036925.0, | |
| "step": 432 | |
| }, | |
| { | |
| "entropy": 0.5448314994573593, | |
| "epoch": 1.617399438727783, | |
| "grad_norm": 0.03725632280111313, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5465018153190613, | |
| "mean_token_accuracy": 0.7780062705278397, | |
| "num_tokens": 7053019.0, | |
| "step": 433 | |
| }, | |
| { | |
| "entropy": 0.5258296579122543, | |
| "epoch": 1.6211412535079512, | |
| "grad_norm": 0.03214319422841072, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5300624370574951, | |
| "mean_token_accuracy": 0.7829313278198242, | |
| "num_tokens": 7069021.0, | |
| "step": 434 | |
| }, | |
| { | |
| "entropy": 0.5569266527891159, | |
| "epoch": 1.6248830682881197, | |
| "grad_norm": 0.03432042896747589, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578755140304565, | |
| "mean_token_accuracy": 0.7711293399333954, | |
| "num_tokens": 7085450.0, | |
| "step": 435 | |
| }, | |
| { | |
| "entropy": 0.5638464391231537, | |
| "epoch": 1.6286248830682881, | |
| "grad_norm": 0.03862602636218071, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5726134777069092, | |
| "mean_token_accuracy": 0.7694450467824936, | |
| "num_tokens": 7101666.0, | |
| "step": 436 | |
| }, | |
| { | |
| "entropy": 0.564548671245575, | |
| "epoch": 1.6323666978484566, | |
| "grad_norm": 0.032345570623874664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651994943618774, | |
| "mean_token_accuracy": 0.7711433917284012, | |
| "num_tokens": 7117907.0, | |
| "step": 437 | |
| }, | |
| { | |
| "entropy": 0.5587478131055832, | |
| "epoch": 1.636108512628625, | |
| "grad_norm": 0.031082862988114357, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588955879211426, | |
| "mean_token_accuracy": 0.7725447416305542, | |
| "num_tokens": 7134131.0, | |
| "step": 438 | |
| }, | |
| { | |
| "entropy": 0.5472389608621597, | |
| "epoch": 1.6398503274087932, | |
| "grad_norm": 0.03695904091000557, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5445616245269775, | |
| "mean_token_accuracy": 0.778590515255928, | |
| "num_tokens": 7150298.0, | |
| "step": 439 | |
| }, | |
| { | |
| "entropy": 0.5535961091518402, | |
| "epoch": 1.6435921421889617, | |
| "grad_norm": 0.031128892675042152, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437783598899841, | |
| "mean_token_accuracy": 0.7785230875015259, | |
| "num_tokens": 7166639.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.5351960062980652, | |
| "epoch": 1.6473339569691299, | |
| "grad_norm": 0.03949431702494621, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5358127355575562, | |
| "mean_token_accuracy": 0.7802053093910217, | |
| "num_tokens": 7182613.0, | |
| "step": 441 | |
| }, | |
| { | |
| "entropy": 0.524370513856411, | |
| "epoch": 1.6510757717492983, | |
| "grad_norm": 0.03402510657906532, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5297942161560059, | |
| "mean_token_accuracy": 0.7861316353082657, | |
| "num_tokens": 7198598.0, | |
| "step": 442 | |
| }, | |
| { | |
| "entropy": 0.5440799742937088, | |
| "epoch": 1.6548175865294668, | |
| "grad_norm": 0.03908916562795639, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563719868659973, | |
| "mean_token_accuracy": 0.773345485329628, | |
| "num_tokens": 7214953.0, | |
| "step": 443 | |
| }, | |
| { | |
| "entropy": 0.5496329516172409, | |
| "epoch": 1.6585594013096352, | |
| "grad_norm": 0.036347340792417526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566647052764893, | |
| "mean_token_accuracy": 0.7736042439937592, | |
| "num_tokens": 7231069.0, | |
| "step": 444 | |
| }, | |
| { | |
| "entropy": 0.5510213375091553, | |
| "epoch": 1.6623012160898036, | |
| "grad_norm": 0.027416400611400604, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495529174804688, | |
| "mean_token_accuracy": 0.7757058292627335, | |
| "num_tokens": 7247326.0, | |
| "step": 445 | |
| }, | |
| { | |
| "entropy": 0.5782728493213654, | |
| "epoch": 1.666043030869972, | |
| "grad_norm": 0.03216573968529701, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5692035555839539, | |
| "mean_token_accuracy": 0.7700701951980591, | |
| "num_tokens": 7263765.0, | |
| "step": 446 | |
| }, | |
| { | |
| "entropy": 0.5769474655389786, | |
| "epoch": 1.6697848456501403, | |
| "grad_norm": 0.03461449593305588, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5692911148071289, | |
| "mean_token_accuracy": 0.7688308656215668, | |
| "num_tokens": 7280095.0, | |
| "step": 447 | |
| }, | |
| { | |
| "entropy": 0.5636246651411057, | |
| "epoch": 1.6735266604303087, | |
| "grad_norm": 0.02763124369084835, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5576487183570862, | |
| "mean_token_accuracy": 0.7748333811759949, | |
| "num_tokens": 7296592.0, | |
| "step": 448 | |
| }, | |
| { | |
| "entropy": 0.5515684485435486, | |
| "epoch": 1.677268475210477, | |
| "grad_norm": 0.03505739942193031, | |
| "learning_rate": 0.0002, | |
| "loss": 0.562554121017456, | |
| "mean_token_accuracy": 0.7732807844877243, | |
| "num_tokens": 7313071.0, | |
| "step": 449 | |
| }, | |
| { | |
| "entropy": 0.529756709933281, | |
| "epoch": 1.6810102899906454, | |
| "grad_norm": 0.035316504538059235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5393928289413452, | |
| "mean_token_accuracy": 0.7774565666913986, | |
| "num_tokens": 7329531.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.5509119927883148, | |
| "epoch": 1.6847521047708138, | |
| "grad_norm": 0.03525395318865776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5650572180747986, | |
| "mean_token_accuracy": 0.7679217755794525, | |
| "num_tokens": 7345852.0, | |
| "step": 451 | |
| }, | |
| { | |
| "entropy": 0.5615872442722321, | |
| "epoch": 1.6884939195509823, | |
| "grad_norm": 0.032941099256277084, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626966953277588, | |
| "mean_token_accuracy": 0.7703739553689957, | |
| "num_tokens": 7362126.0, | |
| "step": 452 | |
| }, | |
| { | |
| "entropy": 0.555547222495079, | |
| "epoch": 1.6922357343311507, | |
| "grad_norm": 0.03228066489100456, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544800877571106, | |
| "mean_token_accuracy": 0.7767430245876312, | |
| "num_tokens": 7378671.0, | |
| "step": 453 | |
| }, | |
| { | |
| "entropy": 0.554116278886795, | |
| "epoch": 1.6959775491113191, | |
| "grad_norm": 0.029597081243991852, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413352847099304, | |
| "mean_token_accuracy": 0.7784619033336639, | |
| "num_tokens": 7394967.0, | |
| "step": 454 | |
| }, | |
| { | |
| "entropy": 0.5580686628818512, | |
| "epoch": 1.6997193638914874, | |
| "grad_norm": 0.02839960716664791, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5585195422172546, | |
| "mean_token_accuracy": 0.7723167389631271, | |
| "num_tokens": 7411309.0, | |
| "step": 455 | |
| }, | |
| { | |
| "entropy": 0.5392096787691116, | |
| "epoch": 1.7034611786716558, | |
| "grad_norm": 0.03588644042611122, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462691187858582, | |
| "mean_token_accuracy": 0.7782226353883743, | |
| "num_tokens": 7427429.0, | |
| "step": 456 | |
| }, | |
| { | |
| "entropy": 0.535987101495266, | |
| "epoch": 1.707202993451824, | |
| "grad_norm": 0.03534339368343353, | |
| "learning_rate": 0.0002, | |
| "loss": 0.549435019493103, | |
| "mean_token_accuracy": 0.7765841335058212, | |
| "num_tokens": 7443721.0, | |
| "step": 457 | |
| }, | |
| { | |
| "entropy": 0.5456487089395523, | |
| "epoch": 1.7109448082319925, | |
| "grad_norm": 0.03618441894650459, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485998392105103, | |
| "mean_token_accuracy": 0.7757130116224289, | |
| "num_tokens": 7460111.0, | |
| "step": 458 | |
| }, | |
| { | |
| "entropy": 0.5436663031578064, | |
| "epoch": 1.714686623012161, | |
| "grad_norm": 0.02979116700589657, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5414945483207703, | |
| "mean_token_accuracy": 0.7812917977571487, | |
| "num_tokens": 7476124.0, | |
| "step": 459 | |
| }, | |
| { | |
| "entropy": 0.5709712207317352, | |
| "epoch": 1.7184284377923293, | |
| "grad_norm": 0.03200547397136688, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619422197341919, | |
| "mean_token_accuracy": 0.7735306322574615, | |
| "num_tokens": 7492499.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.5626240521669388, | |
| "epoch": 1.7221702525724978, | |
| "grad_norm": 0.03815503418445587, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5533303618431091, | |
| "mean_token_accuracy": 0.7753702253103256, | |
| "num_tokens": 7508641.0, | |
| "step": 461 | |
| }, | |
| { | |
| "entropy": 0.5480938106775284, | |
| "epoch": 1.725912067352666, | |
| "grad_norm": 0.03169892355799675, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524613261222839, | |
| "mean_token_accuracy": 0.7751649022102356, | |
| "num_tokens": 7525219.0, | |
| "step": 462 | |
| }, | |
| { | |
| "entropy": 0.5562078654766083, | |
| "epoch": 1.7296538821328344, | |
| "grad_norm": 0.03617829084396362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5619810819625854, | |
| "mean_token_accuracy": 0.7714113295078278, | |
| "num_tokens": 7541689.0, | |
| "step": 463 | |
| }, | |
| { | |
| "entropy": 0.5358584523200989, | |
| "epoch": 1.7333956969130027, | |
| "grad_norm": 0.03426409512758255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471996068954468, | |
| "mean_token_accuracy": 0.7751270979642868, | |
| "num_tokens": 7558097.0, | |
| "step": 464 | |
| }, | |
| { | |
| "entropy": 0.5273950546979904, | |
| "epoch": 1.737137511693171, | |
| "grad_norm": 0.03135877847671509, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319076776504517, | |
| "mean_token_accuracy": 0.7831837683916092, | |
| "num_tokens": 7574193.0, | |
| "step": 465 | |
| }, | |
| { | |
| "entropy": 0.5745384991168976, | |
| "epoch": 1.7408793264733395, | |
| "grad_norm": 0.03335622698068619, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5716018676757812, | |
| "mean_token_accuracy": 0.7669582962989807, | |
| "num_tokens": 7590824.0, | |
| "step": 466 | |
| }, | |
| { | |
| "entropy": 0.5475277155637741, | |
| "epoch": 1.744621141253508, | |
| "grad_norm": 0.02866513840854168, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5436227321624756, | |
| "mean_token_accuracy": 0.777054488658905, | |
| "num_tokens": 7607042.0, | |
| "step": 467 | |
| }, | |
| { | |
| "entropy": 0.5518149137496948, | |
| "epoch": 1.7483629560336764, | |
| "grad_norm": 0.029388844966888428, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5495098233222961, | |
| "mean_token_accuracy": 0.7773433327674866, | |
| "num_tokens": 7623420.0, | |
| "step": 468 | |
| }, | |
| { | |
| "entropy": 0.5374390631914139, | |
| "epoch": 1.7521047708138449, | |
| "grad_norm": 0.0325518473982811, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412787795066833, | |
| "mean_token_accuracy": 0.7788903117179871, | |
| "num_tokens": 7639630.0, | |
| "step": 469 | |
| }, | |
| { | |
| "entropy": 0.5380698144435883, | |
| "epoch": 1.755846585594013, | |
| "grad_norm": 0.029125649482011795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5411547422409058, | |
| "mean_token_accuracy": 0.7780955582857132, | |
| "num_tokens": 7655842.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.5518491268157959, | |
| "epoch": 1.7595884003741815, | |
| "grad_norm": 0.03188946843147278, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5559889674186707, | |
| "mean_token_accuracy": 0.7736992090940475, | |
| "num_tokens": 7672101.0, | |
| "step": 471 | |
| }, | |
| { | |
| "entropy": 0.5442283153533936, | |
| "epoch": 1.7633302151543497, | |
| "grad_norm": 0.034016743302345276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5500984191894531, | |
| "mean_token_accuracy": 0.7761438190937042, | |
| "num_tokens": 7688113.0, | |
| "step": 472 | |
| }, | |
| { | |
| "entropy": 0.5488689690828323, | |
| "epoch": 1.7670720299345182, | |
| "grad_norm": 0.02747703716158867, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475065112113953, | |
| "mean_token_accuracy": 0.775134801864624, | |
| "num_tokens": 7704497.0, | |
| "step": 473 | |
| }, | |
| { | |
| "entropy": 0.568826898932457, | |
| "epoch": 1.7708138447146866, | |
| "grad_norm": 0.03434092178940773, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5651647448539734, | |
| "mean_token_accuracy": 0.7715141028165817, | |
| "num_tokens": 7720786.0, | |
| "step": 474 | |
| }, | |
| { | |
| "entropy": 0.5751989632844925, | |
| "epoch": 1.774555659494855, | |
| "grad_norm": 0.03127957507967949, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5659101605415344, | |
| "mean_token_accuracy": 0.7694416791200638, | |
| "num_tokens": 7737241.0, | |
| "step": 475 | |
| }, | |
| { | |
| "entropy": 0.5532206594944, | |
| "epoch": 1.7782974742750235, | |
| "grad_norm": 0.02908439189195633, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514166355133057, | |
| "mean_token_accuracy": 0.7745979428291321, | |
| "num_tokens": 7753654.0, | |
| "step": 476 | |
| }, | |
| { | |
| "entropy": 0.5416929870843887, | |
| "epoch": 1.782039289055192, | |
| "grad_norm": 0.03806254267692566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5534486770629883, | |
| "mean_token_accuracy": 0.7739390730857849, | |
| "num_tokens": 7770019.0, | |
| "step": 477 | |
| }, | |
| { | |
| "entropy": 0.5363457053899765, | |
| "epoch": 1.7857811038353602, | |
| "grad_norm": 0.032926302403211594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503825545310974, | |
| "mean_token_accuracy": 0.7768030315637589, | |
| "num_tokens": 7786449.0, | |
| "step": 478 | |
| }, | |
| { | |
| "entropy": 0.5420104712247849, | |
| "epoch": 1.7895229186155284, | |
| "grad_norm": 0.02965935505926609, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425794124603271, | |
| "mean_token_accuracy": 0.7801303416490555, | |
| "num_tokens": 7802671.0, | |
| "step": 479 | |
| }, | |
| { | |
| "entropy": 0.549240380525589, | |
| "epoch": 1.7932647333956968, | |
| "grad_norm": 0.029267581179738045, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5447797179222107, | |
| "mean_token_accuracy": 0.7785746455192566, | |
| "num_tokens": 7819171.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.5564038902521133, | |
| "epoch": 1.7970065481758652, | |
| "grad_norm": 0.027819465845823288, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5569280385971069, | |
| "mean_token_accuracy": 0.7717359662055969, | |
| "num_tokens": 7835514.0, | |
| "step": 481 | |
| }, | |
| { | |
| "entropy": 0.5513341128826141, | |
| "epoch": 1.8007483629560337, | |
| "grad_norm": 0.032080937176942825, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5565280318260193, | |
| "mean_token_accuracy": 0.7745318114757538, | |
| "num_tokens": 7851901.0, | |
| "step": 482 | |
| }, | |
| { | |
| "entropy": 0.5669872015714645, | |
| "epoch": 1.8044901777362021, | |
| "grad_norm": 0.031251415610313416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5653026103973389, | |
| "mean_token_accuracy": 0.7678168416023254, | |
| "num_tokens": 7868506.0, | |
| "step": 483 | |
| }, | |
| { | |
| "entropy": 0.5539208799600601, | |
| "epoch": 1.8082319925163706, | |
| "grad_norm": 0.02905306965112686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545270442962646, | |
| "mean_token_accuracy": 0.7701525986194611, | |
| "num_tokens": 7884991.0, | |
| "step": 484 | |
| }, | |
| { | |
| "entropy": 0.5545967519283295, | |
| "epoch": 1.8119738072965388, | |
| "grad_norm": 0.028621984645724297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514732003211975, | |
| "mean_token_accuracy": 0.7761166989803314, | |
| "num_tokens": 7901376.0, | |
| "step": 485 | |
| }, | |
| { | |
| "entropy": 0.5499511659145355, | |
| "epoch": 1.8157156220767072, | |
| "grad_norm": 0.03022296354174614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5498670339584351, | |
| "mean_token_accuracy": 0.7770126014947891, | |
| "num_tokens": 7917862.0, | |
| "step": 486 | |
| }, | |
| { | |
| "entropy": 0.5304104536771774, | |
| "epoch": 1.8194574368568754, | |
| "grad_norm": 0.03297071531414986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350517630577087, | |
| "mean_token_accuracy": 0.7801762819290161, | |
| "num_tokens": 7933992.0, | |
| "step": 487 | |
| }, | |
| { | |
| "entropy": 0.5290692076086998, | |
| "epoch": 1.8231992516370439, | |
| "grad_norm": 0.03105652704834938, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5332382917404175, | |
| "mean_token_accuracy": 0.7827692329883575, | |
| "num_tokens": 7949802.0, | |
| "step": 488 | |
| }, | |
| { | |
| "entropy": 0.5513493865728378, | |
| "epoch": 1.8269410664172123, | |
| "grad_norm": 0.027769237756729126, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537266135215759, | |
| "mean_token_accuracy": 0.7724474370479584, | |
| "num_tokens": 7966264.0, | |
| "step": 489 | |
| }, | |
| { | |
| "entropy": 0.559148445725441, | |
| "epoch": 1.8306828811973808, | |
| "grad_norm": 0.03133245185017586, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5547972321510315, | |
| "mean_token_accuracy": 0.7729021608829498, | |
| "num_tokens": 7982562.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.5613508969545364, | |
| "epoch": 1.8344246959775492, | |
| "grad_norm": 0.031487561762332916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5589193105697632, | |
| "mean_token_accuracy": 0.7691849023103714, | |
| "num_tokens": 7999101.0, | |
| "step": 491 | |
| }, | |
| { | |
| "entropy": 0.552077904343605, | |
| "epoch": 1.8381665107577176, | |
| "grad_norm": 0.030901558697223663, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5548684597015381, | |
| "mean_token_accuracy": 0.7746628671884537, | |
| "num_tokens": 8015580.0, | |
| "step": 492 | |
| }, | |
| { | |
| "entropy": 0.5537288337945938, | |
| "epoch": 1.8419083255378859, | |
| "grad_norm": 0.032475873827934265, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554737389087677, | |
| "mean_token_accuracy": 0.7736551910638809, | |
| "num_tokens": 8031933.0, | |
| "step": 493 | |
| }, | |
| { | |
| "entropy": 0.548131912946701, | |
| "epoch": 1.8456501403180543, | |
| "grad_norm": 0.034645676612854004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518745183944702, | |
| "mean_token_accuracy": 0.7750734686851501, | |
| "num_tokens": 8048122.0, | |
| "step": 494 | |
| }, | |
| { | |
| "entropy": 0.5457621365785599, | |
| "epoch": 1.8493919550982225, | |
| "grad_norm": 0.0346519835293293, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511569380760193, | |
| "mean_token_accuracy": 0.774482324719429, | |
| "num_tokens": 8064371.0, | |
| "step": 495 | |
| }, | |
| { | |
| "entropy": 0.5622203350067139, | |
| "epoch": 1.853133769878391, | |
| "grad_norm": 0.04098769649863243, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5641219615936279, | |
| "mean_token_accuracy": 0.7717546820640564, | |
| "num_tokens": 8080811.0, | |
| "step": 496 | |
| }, | |
| { | |
| "entropy": 0.5483545809984207, | |
| "epoch": 1.8568755846585594, | |
| "grad_norm": 0.03688424080610275, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5510388612747192, | |
| "mean_token_accuracy": 0.7764346599578857, | |
| "num_tokens": 8097126.0, | |
| "step": 497 | |
| }, | |
| { | |
| "entropy": 0.5505103766918182, | |
| "epoch": 1.8606173994387278, | |
| "grad_norm": 0.03670699521899223, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5573628544807434, | |
| "mean_token_accuracy": 0.7726601958274841, | |
| "num_tokens": 8113420.0, | |
| "step": 498 | |
| }, | |
| { | |
| "entropy": 0.529410183429718, | |
| "epoch": 1.8643592142188963, | |
| "grad_norm": 0.0299246683716774, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223079919815063, | |
| "mean_token_accuracy": 0.787264496088028, | |
| "num_tokens": 8129867.0, | |
| "step": 499 | |
| }, | |
| { | |
| "entropy": 0.5540086030960083, | |
| "epoch": 1.8681010289990645, | |
| "grad_norm": 0.03435957059264183, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5479264259338379, | |
| "mean_token_accuracy": 0.7777916789054871, | |
| "num_tokens": 8146232.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.5476558804512024, | |
| "epoch": 1.871842843779233, | |
| "grad_norm": 0.032948873937129974, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458691716194153, | |
| "mean_token_accuracy": 0.7800754606723785, | |
| "num_tokens": 8162478.0, | |
| "step": 501 | |
| }, | |
| { | |
| "entropy": 0.5278200954198837, | |
| "epoch": 1.8755846585594012, | |
| "grad_norm": 0.02974856086075306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5305043458938599, | |
| "mean_token_accuracy": 0.785199910402298, | |
| "num_tokens": 8179046.0, | |
| "step": 502 | |
| }, | |
| { | |
| "entropy": 0.5498995333909988, | |
| "epoch": 1.8793264733395696, | |
| "grad_norm": 0.035161007195711136, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5587770342826843, | |
| "mean_token_accuracy": 0.7729851007461548, | |
| "num_tokens": 8195430.0, | |
| "step": 503 | |
| }, | |
| { | |
| "entropy": 0.5525415539741516, | |
| "epoch": 1.883068288119738, | |
| "grad_norm": 0.0358411967754364, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540306568145752, | |
| "mean_token_accuracy": 0.7763612270355225, | |
| "num_tokens": 8211820.0, | |
| "step": 504 | |
| }, | |
| { | |
| "entropy": 0.548132598400116, | |
| "epoch": 1.8868101028999065, | |
| "grad_norm": 0.030124109238386154, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5509622693061829, | |
| "mean_token_accuracy": 0.7774811685085297, | |
| "num_tokens": 8228136.0, | |
| "step": 505 | |
| }, | |
| { | |
| "entropy": 0.5653504580259323, | |
| "epoch": 1.890551917680075, | |
| "grad_norm": 0.03144733980298042, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5578948259353638, | |
| "mean_token_accuracy": 0.7719802111387253, | |
| "num_tokens": 8244600.0, | |
| "step": 506 | |
| }, | |
| { | |
| "entropy": 0.5680980533361435, | |
| "epoch": 1.8942937324602434, | |
| "grad_norm": 0.03786737844347954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5742643475532532, | |
| "mean_token_accuracy": 0.7682982087135315, | |
| "num_tokens": 8260924.0, | |
| "step": 507 | |
| }, | |
| { | |
| "entropy": 0.5519368350505829, | |
| "epoch": 1.8980355472404116, | |
| "grad_norm": 0.03175094351172447, | |
| "learning_rate": 0.0002, | |
| "loss": 0.553012490272522, | |
| "mean_token_accuracy": 0.7758679240942001, | |
| "num_tokens": 8277138.0, | |
| "step": 508 | |
| }, | |
| { | |
| "entropy": 0.550408124923706, | |
| "epoch": 1.90177736202058, | |
| "grad_norm": 0.03196226805448532, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527910590171814, | |
| "mean_token_accuracy": 0.7774336487054825, | |
| "num_tokens": 8293651.0, | |
| "step": 509 | |
| }, | |
| { | |
| "entropy": 0.551310807466507, | |
| "epoch": 1.9055191768007482, | |
| "grad_norm": 0.032158490270376205, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5532134175300598, | |
| "mean_token_accuracy": 0.7765610069036484, | |
| "num_tokens": 8310166.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.554396003484726, | |
| "epoch": 1.9092609915809167, | |
| "grad_norm": 0.03265155106782913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5611427426338196, | |
| "mean_token_accuracy": 0.770960658788681, | |
| "num_tokens": 8326460.0, | |
| "step": 511 | |
| }, | |
| { | |
| "entropy": 0.5533443540334702, | |
| "epoch": 1.913002806361085, | |
| "grad_norm": 0.03062952496111393, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5535008311271667, | |
| "mean_token_accuracy": 0.7743202298879623, | |
| "num_tokens": 8342730.0, | |
| "step": 512 | |
| }, | |
| { | |
| "entropy": 0.557416245341301, | |
| "epoch": 1.9167446211412535, | |
| "grad_norm": 0.032427720725536346, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555341899394989, | |
| "mean_token_accuracy": 0.7736751586198807, | |
| "num_tokens": 8358790.0, | |
| "step": 513 | |
| }, | |
| { | |
| "entropy": 0.5498823821544647, | |
| "epoch": 1.920486435921422, | |
| "grad_norm": 0.03641689941287041, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5489510893821716, | |
| "mean_token_accuracy": 0.7756739258766174, | |
| "num_tokens": 8374932.0, | |
| "step": 514 | |
| }, | |
| { | |
| "entropy": 0.5567668825387955, | |
| "epoch": 1.9242282507015904, | |
| "grad_norm": 0.0356590710580349, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5600458979606628, | |
| "mean_token_accuracy": 0.7731840312480927, | |
| "num_tokens": 8391373.0, | |
| "step": 515 | |
| }, | |
| { | |
| "entropy": 0.5492214262485504, | |
| "epoch": 1.9279700654817586, | |
| "grad_norm": 0.032011594623327255, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5541006326675415, | |
| "mean_token_accuracy": 0.7760893553495407, | |
| "num_tokens": 8407637.0, | |
| "step": 516 | |
| }, | |
| { | |
| "entropy": 0.5398948937654495, | |
| "epoch": 1.931711880261927, | |
| "grad_norm": 0.03577565401792526, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467641949653625, | |
| "mean_token_accuracy": 0.775809720158577, | |
| "num_tokens": 8423916.0, | |
| "step": 517 | |
| }, | |
| { | |
| "entropy": 0.5437736511230469, | |
| "epoch": 1.9354536950420953, | |
| "grad_norm": 0.031068816781044006, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446307063102722, | |
| "mean_token_accuracy": 0.7766688168048859, | |
| "num_tokens": 8440387.0, | |
| "step": 518 | |
| }, | |
| { | |
| "entropy": 0.551026239991188, | |
| "epoch": 1.9391955098222637, | |
| "grad_norm": 0.03239775449037552, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5448942184448242, | |
| "mean_token_accuracy": 0.7764843702316284, | |
| "num_tokens": 8456844.0, | |
| "step": 519 | |
| }, | |
| { | |
| "entropy": 0.5524020791053772, | |
| "epoch": 1.9429373246024322, | |
| "grad_norm": 0.03006759099662304, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5508519411087036, | |
| "mean_token_accuracy": 0.7757467180490494, | |
| "num_tokens": 8473098.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.5465254038572311, | |
| "epoch": 1.9466791393826006, | |
| "grad_norm": 0.03377439081668854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440271496772766, | |
| "mean_token_accuracy": 0.7764104902744293, | |
| "num_tokens": 8489284.0, | |
| "step": 521 | |
| }, | |
| { | |
| "entropy": 0.5479972213506699, | |
| "epoch": 1.950420954162769, | |
| "grad_norm": 0.03804773464798927, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5570059418678284, | |
| "mean_token_accuracy": 0.7720707058906555, | |
| "num_tokens": 8505659.0, | |
| "step": 522 | |
| }, | |
| { | |
| "entropy": 0.5531162023544312, | |
| "epoch": 1.9541627689429373, | |
| "grad_norm": 0.0431046187877655, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5670960545539856, | |
| "mean_token_accuracy": 0.7688823044300079, | |
| "num_tokens": 8522329.0, | |
| "step": 523 | |
| }, | |
| { | |
| "entropy": 0.5688248574733734, | |
| "epoch": 1.9579045837231057, | |
| "grad_norm": 0.026841329410672188, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5626019835472107, | |
| "mean_token_accuracy": 0.7691622525453568, | |
| "num_tokens": 8538842.0, | |
| "step": 524 | |
| }, | |
| { | |
| "entropy": 0.5459724515676498, | |
| "epoch": 1.961646398503274, | |
| "grad_norm": 0.03493349626660347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5443795919418335, | |
| "mean_token_accuracy": 0.7770666480064392, | |
| "num_tokens": 8554945.0, | |
| "step": 525 | |
| }, | |
| { | |
| "entropy": 0.5657712519168854, | |
| "epoch": 1.9653882132834424, | |
| "grad_norm": 0.03769686445593834, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5527753829956055, | |
| "mean_token_accuracy": 0.7778369933366776, | |
| "num_tokens": 8570989.0, | |
| "step": 526 | |
| }, | |
| { | |
| "entropy": 0.550276130437851, | |
| "epoch": 1.9691300280636108, | |
| "grad_norm": 0.03369564935564995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424638986587524, | |
| "mean_token_accuracy": 0.7803192138671875, | |
| "num_tokens": 8587072.0, | |
| "step": 527 | |
| }, | |
| { | |
| "entropy": 0.5489895343780518, | |
| "epoch": 1.9728718428437793, | |
| "grad_norm": 0.03569629415869713, | |
| "learning_rate": 0.0002, | |
| "loss": 0.559888482093811, | |
| "mean_token_accuracy": 0.7720399796962738, | |
| "num_tokens": 8603352.0, | |
| "step": 528 | |
| }, | |
| { | |
| "entropy": 0.530121460556984, | |
| "epoch": 1.9766136576239477, | |
| "grad_norm": 0.037291910499334335, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5450345873832703, | |
| "mean_token_accuracy": 0.7796709537506104, | |
| "num_tokens": 8619760.0, | |
| "step": 529 | |
| }, | |
| { | |
| "entropy": 0.5523941069841385, | |
| "epoch": 1.9803554724041161, | |
| "grad_norm": 0.027196237817406654, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5566985011100769, | |
| "mean_token_accuracy": 0.773260235786438, | |
| "num_tokens": 8636140.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.5579734891653061, | |
| "epoch": 1.9840972871842844, | |
| "grad_norm": 0.029088523238897324, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5540033578872681, | |
| "mean_token_accuracy": 0.7756596505641937, | |
| "num_tokens": 8652295.0, | |
| "step": 531 | |
| }, | |
| { | |
| "entropy": 0.5574969351291656, | |
| "epoch": 1.9878391019644528, | |
| "grad_norm": 0.029939375817775726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5501161217689514, | |
| "mean_token_accuracy": 0.7750376909971237, | |
| "num_tokens": 8668973.0, | |
| "step": 532 | |
| }, | |
| { | |
| "entropy": 0.5492955148220062, | |
| "epoch": 1.991580916744621, | |
| "grad_norm": 0.03092138096690178, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422185063362122, | |
| "mean_token_accuracy": 0.7804518193006516, | |
| "num_tokens": 8685148.0, | |
| "step": 533 | |
| }, | |
| { | |
| "entropy": 0.5466224402189255, | |
| "epoch": 1.9953227315247895, | |
| "grad_norm": 0.03692883625626564, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5514038801193237, | |
| "mean_token_accuracy": 0.7737534046173096, | |
| "num_tokens": 8701543.0, | |
| "step": 534 | |
| }, | |
| { | |
| "entropy": 0.5537078529596329, | |
| "epoch": 1.999064546304958, | |
| "grad_norm": 0.03208556026220322, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5545927286148071, | |
| "mean_token_accuracy": 0.777570441365242, | |
| "num_tokens": 8717790.0, | |
| "step": 535 | |
| }, | |
| { | |
| "entropy": 0.5328470468521118, | |
| "epoch": 2.0, | |
| "grad_norm": 0.056387241929769516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407091379165649, | |
| "mean_token_accuracy": 0.7980132699012756, | |
| "num_tokens": 8719006.0, | |
| "step": 536 | |
| }, | |
| { | |
| "entropy": 0.5399350374937057, | |
| "epoch": 2.0037418147801684, | |
| "grad_norm": 0.030944975093007088, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385851263999939, | |
| "mean_token_accuracy": 0.7820405662059784, | |
| "num_tokens": 8735642.0, | |
| "step": 537 | |
| }, | |
| { | |
| "entropy": 0.5494481921195984, | |
| "epoch": 2.007483629560337, | |
| "grad_norm": 0.037696994841098785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568894147872925, | |
| "mean_token_accuracy": 0.7728834450244904, | |
| "num_tokens": 8752037.0, | |
| "step": 538 | |
| }, | |
| { | |
| "entropy": 0.5218051299452782, | |
| "epoch": 2.0112254443405053, | |
| "grad_norm": 0.03197522833943367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5231513977050781, | |
| "mean_token_accuracy": 0.7889297753572464, | |
| "num_tokens": 8768180.0, | |
| "step": 539 | |
| }, | |
| { | |
| "entropy": 0.5204869955778122, | |
| "epoch": 2.0149672591206733, | |
| "grad_norm": 0.03365905210375786, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5204414129257202, | |
| "mean_token_accuracy": 0.7887504994869232, | |
| "num_tokens": 8784385.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.5250371545553207, | |
| "epoch": 2.0187090739008418, | |
| "grad_norm": 0.03206612914800644, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5264713764190674, | |
| "mean_token_accuracy": 0.7865318804979324, | |
| "num_tokens": 8800264.0, | |
| "step": 541 | |
| }, | |
| { | |
| "entropy": 0.5362996757030487, | |
| "epoch": 2.02245088868101, | |
| "grad_norm": 0.035737182945013046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328425765037537, | |
| "mean_token_accuracy": 0.7832369208335876, | |
| "num_tokens": 8816869.0, | |
| "step": 542 | |
| }, | |
| { | |
| "entropy": 0.5211998522281647, | |
| "epoch": 2.0261927034611786, | |
| "grad_norm": 0.03382508456707001, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247855186462402, | |
| "mean_token_accuracy": 0.7869311422109604, | |
| "num_tokens": 8833119.0, | |
| "step": 543 | |
| }, | |
| { | |
| "entropy": 0.5350741446018219, | |
| "epoch": 2.029934518241347, | |
| "grad_norm": 0.03478322923183441, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424962639808655, | |
| "mean_token_accuracy": 0.7780940532684326, | |
| "num_tokens": 8849384.0, | |
| "step": 544 | |
| }, | |
| { | |
| "entropy": 0.5465849786996841, | |
| "epoch": 2.0336763330215155, | |
| "grad_norm": 0.04140733554959297, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5555759072303772, | |
| "mean_token_accuracy": 0.7771580815315247, | |
| "num_tokens": 8865580.0, | |
| "step": 545 | |
| }, | |
| { | |
| "entropy": 0.5315355062484741, | |
| "epoch": 2.037418147801684, | |
| "grad_norm": 0.037138681858778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5277940630912781, | |
| "mean_token_accuracy": 0.7869007289409637, | |
| "num_tokens": 8882160.0, | |
| "step": 546 | |
| }, | |
| { | |
| "entropy": 0.5415049940347672, | |
| "epoch": 2.0411599625818524, | |
| "grad_norm": 0.0382317453622818, | |
| "learning_rate": 0.0002, | |
| "loss": 0.52928626537323, | |
| "mean_token_accuracy": 0.783332422375679, | |
| "num_tokens": 8898284.0, | |
| "step": 547 | |
| }, | |
| { | |
| "entropy": 0.5444429516792297, | |
| "epoch": 2.0449017773620204, | |
| "grad_norm": 0.03212872892618179, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390786528587341, | |
| "mean_token_accuracy": 0.7800189107656479, | |
| "num_tokens": 8914317.0, | |
| "step": 548 | |
| }, | |
| { | |
| "entropy": 0.5368607640266418, | |
| "epoch": 2.048643592142189, | |
| "grad_norm": 0.03962872177362442, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5424067974090576, | |
| "mean_token_accuracy": 0.7807967215776443, | |
| "num_tokens": 8930503.0, | |
| "step": 549 | |
| }, | |
| { | |
| "entropy": 0.5316442102193832, | |
| "epoch": 2.0523854069223573, | |
| "grad_norm": 0.04042808711528778, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5394030809402466, | |
| "mean_token_accuracy": 0.7808849960565567, | |
| "num_tokens": 8946862.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.5393616110086441, | |
| "epoch": 2.0561272217025257, | |
| "grad_norm": 0.04134383797645569, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422969460487366, | |
| "mean_token_accuracy": 0.778337299823761, | |
| "num_tokens": 8963159.0, | |
| "step": 551 | |
| }, | |
| { | |
| "entropy": 0.5272297635674477, | |
| "epoch": 2.059869036482694, | |
| "grad_norm": 0.03908038139343262, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269819498062134, | |
| "mean_token_accuracy": 0.7861954718828201, | |
| "num_tokens": 8979486.0, | |
| "step": 552 | |
| }, | |
| { | |
| "entropy": 0.5292486846446991, | |
| "epoch": 2.0636108512628626, | |
| "grad_norm": 0.03547659516334534, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531383752822876, | |
| "mean_token_accuracy": 0.7845012545585632, | |
| "num_tokens": 8995728.0, | |
| "step": 553 | |
| }, | |
| { | |
| "entropy": 0.537693664431572, | |
| "epoch": 2.067352666043031, | |
| "grad_norm": 0.04505831003189087, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5415912866592407, | |
| "mean_token_accuracy": 0.7810403853654861, | |
| "num_tokens": 9012262.0, | |
| "step": 554 | |
| }, | |
| { | |
| "entropy": 0.542693018913269, | |
| "epoch": 2.0710944808231995, | |
| "grad_norm": 0.03637455403804779, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454283356666565, | |
| "mean_token_accuracy": 0.7768286317586899, | |
| "num_tokens": 9028450.0, | |
| "step": 555 | |
| }, | |
| { | |
| "entropy": 0.5359488427639008, | |
| "epoch": 2.0748362956033675, | |
| "grad_norm": 0.038283299654722214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341436266899109, | |
| "mean_token_accuracy": 0.7861706465482712, | |
| "num_tokens": 9044691.0, | |
| "step": 556 | |
| }, | |
| { | |
| "entropy": 0.5348773896694183, | |
| "epoch": 2.078578110383536, | |
| "grad_norm": 0.038720738142728806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5340168476104736, | |
| "mean_token_accuracy": 0.7848398089408875, | |
| "num_tokens": 9061090.0, | |
| "step": 557 | |
| }, | |
| { | |
| "entropy": 0.5301378965377808, | |
| "epoch": 2.0823199251637043, | |
| "grad_norm": 0.03610686585307121, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331196784973145, | |
| "mean_token_accuracy": 0.7825122624635696, | |
| "num_tokens": 9077457.0, | |
| "step": 558 | |
| }, | |
| { | |
| "entropy": 0.5627280175685883, | |
| "epoch": 2.086061739943873, | |
| "grad_norm": 0.0459170863032341, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5622618198394775, | |
| "mean_token_accuracy": 0.7731509357690811, | |
| "num_tokens": 9093892.0, | |
| "step": 559 | |
| }, | |
| { | |
| "entropy": 0.5291252806782722, | |
| "epoch": 2.0898035547240412, | |
| "grad_norm": 0.03501354530453682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5241326689720154, | |
| "mean_token_accuracy": 0.7903649061918259, | |
| "num_tokens": 9110195.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.5336360484361649, | |
| "epoch": 2.0935453695042097, | |
| "grad_norm": 0.03297366574406624, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5302354097366333, | |
| "mean_token_accuracy": 0.7871804982423782, | |
| "num_tokens": 9126264.0, | |
| "step": 561 | |
| }, | |
| { | |
| "entropy": 0.5324128270149231, | |
| "epoch": 2.097287184284378, | |
| "grad_norm": 0.040097158402204514, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449591875076294, | |
| "mean_token_accuracy": 0.7766915112733841, | |
| "num_tokens": 9142405.0, | |
| "step": 562 | |
| }, | |
| { | |
| "entropy": 0.5327176600694656, | |
| "epoch": 2.101028999064546, | |
| "grad_norm": 0.03983257710933685, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5427699089050293, | |
| "mean_token_accuracy": 0.780575692653656, | |
| "num_tokens": 9158550.0, | |
| "step": 563 | |
| }, | |
| { | |
| "entropy": 0.5298762768507004, | |
| "epoch": 2.1047708138447145, | |
| "grad_norm": 0.035936590284109116, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320777297019958, | |
| "mean_token_accuracy": 0.7820149213075638, | |
| "num_tokens": 9174783.0, | |
| "step": 564 | |
| }, | |
| { | |
| "entropy": 0.5250122100114822, | |
| "epoch": 2.108512628624883, | |
| "grad_norm": 0.03537021949887276, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220876932144165, | |
| "mean_token_accuracy": 0.7874044477939606, | |
| "num_tokens": 9190734.0, | |
| "step": 565 | |
| }, | |
| { | |
| "entropy": 0.5498971343040466, | |
| "epoch": 2.1122544434050514, | |
| "grad_norm": 0.03972788527607918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416819453239441, | |
| "mean_token_accuracy": 0.7811024487018585, | |
| "num_tokens": 9207046.0, | |
| "step": 566 | |
| }, | |
| { | |
| "entropy": 0.5510820746421814, | |
| "epoch": 2.11599625818522, | |
| "grad_norm": 0.03674028813838959, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5430952906608582, | |
| "mean_token_accuracy": 0.7772987484931946, | |
| "num_tokens": 9223541.0, | |
| "step": 567 | |
| }, | |
| { | |
| "entropy": 0.5243249386548996, | |
| "epoch": 2.1197380729653883, | |
| "grad_norm": 0.03868189826607704, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5305947065353394, | |
| "mean_token_accuracy": 0.7821440249681473, | |
| "num_tokens": 9239944.0, | |
| "step": 568 | |
| }, | |
| { | |
| "entropy": 0.5186186358332634, | |
| "epoch": 2.1234798877455567, | |
| "grad_norm": 0.03420955687761307, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219792127609253, | |
| "mean_token_accuracy": 0.787507027387619, | |
| "num_tokens": 9256323.0, | |
| "step": 569 | |
| }, | |
| { | |
| "entropy": 0.5048380643129349, | |
| "epoch": 2.127221702525725, | |
| "grad_norm": 0.043813057243824005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.511600911617279, | |
| "mean_token_accuracy": 0.7919255346059799, | |
| "num_tokens": 9272250.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.5333007425069809, | |
| "epoch": 2.130963517305893, | |
| "grad_norm": 0.03591044992208481, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5382859110832214, | |
| "mean_token_accuracy": 0.7790134996175766, | |
| "num_tokens": 9288633.0, | |
| "step": 571 | |
| }, | |
| { | |
| "entropy": 0.5432953387498856, | |
| "epoch": 2.1347053320860616, | |
| "grad_norm": 0.03850630670785904, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5398726463317871, | |
| "mean_token_accuracy": 0.7803007066249847, | |
| "num_tokens": 9304977.0, | |
| "step": 572 | |
| }, | |
| { | |
| "entropy": 0.5424948632717133, | |
| "epoch": 2.13844714686623, | |
| "grad_norm": 0.042041826993227005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5371389389038086, | |
| "mean_token_accuracy": 0.7817080616950989, | |
| "num_tokens": 9321211.0, | |
| "step": 573 | |
| }, | |
| { | |
| "entropy": 0.5420571565628052, | |
| "epoch": 2.1421889616463985, | |
| "grad_norm": 0.03702463209629059, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405826568603516, | |
| "mean_token_accuracy": 0.7787773013114929, | |
| "num_tokens": 9337519.0, | |
| "step": 574 | |
| }, | |
| { | |
| "entropy": 0.5343386083841324, | |
| "epoch": 2.145930776426567, | |
| "grad_norm": 0.0367942713201046, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5343334078788757, | |
| "mean_token_accuracy": 0.7813169211149216, | |
| "num_tokens": 9353930.0, | |
| "step": 575 | |
| }, | |
| { | |
| "entropy": 0.5107736587524414, | |
| "epoch": 2.1496725912067354, | |
| "grad_norm": 0.04816743731498718, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5181273221969604, | |
| "mean_token_accuracy": 0.790352001786232, | |
| "num_tokens": 9370151.0, | |
| "step": 576 | |
| }, | |
| { | |
| "entropy": 0.5483916699886322, | |
| "epoch": 2.153414405986904, | |
| "grad_norm": 0.03954138606786728, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5537930130958557, | |
| "mean_token_accuracy": 0.7744487076997757, | |
| "num_tokens": 9386529.0, | |
| "step": 577 | |
| }, | |
| { | |
| "entropy": 0.5222444832324982, | |
| "epoch": 2.157156220767072, | |
| "grad_norm": 0.04258863255381584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331015586853027, | |
| "mean_token_accuracy": 0.7828160971403122, | |
| "num_tokens": 9402702.0, | |
| "step": 578 | |
| }, | |
| { | |
| "entropy": 0.5395079553127289, | |
| "epoch": 2.1608980355472402, | |
| "grad_norm": 0.036775294691324234, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392586588859558, | |
| "mean_token_accuracy": 0.7785846441984177, | |
| "num_tokens": 9418983.0, | |
| "step": 579 | |
| }, | |
| { | |
| "entropy": 0.5308848768472672, | |
| "epoch": 2.1646398503274087, | |
| "grad_norm": 0.041630957275629044, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5223425030708313, | |
| "mean_token_accuracy": 0.7881145030260086, | |
| "num_tokens": 9435130.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.5460510104894638, | |
| "epoch": 2.168381665107577, | |
| "grad_norm": 0.040873266756534576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5389937162399292, | |
| "mean_token_accuracy": 0.7796555161476135, | |
| "num_tokens": 9451384.0, | |
| "step": 581 | |
| }, | |
| { | |
| "entropy": 0.5144870802760124, | |
| "epoch": 2.1721234798877456, | |
| "grad_norm": 0.04395061731338501, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5220937132835388, | |
| "mean_token_accuracy": 0.7867953330278397, | |
| "num_tokens": 9467676.0, | |
| "step": 582 | |
| }, | |
| { | |
| "entropy": 0.5361004173755646, | |
| "epoch": 2.175865294667914, | |
| "grad_norm": 0.03444032743573189, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5381976962089539, | |
| "mean_token_accuracy": 0.7804248631000519, | |
| "num_tokens": 9484105.0, | |
| "step": 583 | |
| }, | |
| { | |
| "entropy": 0.5315199941396713, | |
| "epoch": 2.1796071094480824, | |
| "grad_norm": 0.04019028693437576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538859486579895, | |
| "mean_token_accuracy": 0.7802779376506805, | |
| "num_tokens": 9500441.0, | |
| "step": 584 | |
| }, | |
| { | |
| "entropy": 0.5049743205308914, | |
| "epoch": 2.183348924228251, | |
| "grad_norm": 0.038020916283130646, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5077824592590332, | |
| "mean_token_accuracy": 0.794673815369606, | |
| "num_tokens": 9516632.0, | |
| "step": 585 | |
| }, | |
| { | |
| "entropy": 0.542245015501976, | |
| "epoch": 2.187090739008419, | |
| "grad_norm": 0.03803880140185356, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457203388214111, | |
| "mean_token_accuracy": 0.7765202075242996, | |
| "num_tokens": 9532790.0, | |
| "step": 586 | |
| }, | |
| { | |
| "entropy": 0.545234277844429, | |
| "epoch": 2.1908325537885873, | |
| "grad_norm": 0.03659515827894211, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328729748725891, | |
| "mean_token_accuracy": 0.7851473838090897, | |
| "num_tokens": 9549021.0, | |
| "step": 587 | |
| }, | |
| { | |
| "entropy": 0.5441733747720718, | |
| "epoch": 2.1945743685687558, | |
| "grad_norm": 0.03839794918894768, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541313648223877, | |
| "mean_token_accuracy": 0.7806493043899536, | |
| "num_tokens": 9565414.0, | |
| "step": 588 | |
| }, | |
| { | |
| "entropy": 0.5392065942287445, | |
| "epoch": 2.198316183348924, | |
| "grad_norm": 0.03657695651054382, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446825623512268, | |
| "mean_token_accuracy": 0.7759186178445816, | |
| "num_tokens": 9581834.0, | |
| "step": 589 | |
| }, | |
| { | |
| "entropy": 0.5343391597270966, | |
| "epoch": 2.2020579981290926, | |
| "grad_norm": 0.03904880955815315, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319048166275024, | |
| "mean_token_accuracy": 0.7858142107725143, | |
| "num_tokens": 9598306.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.5127864703536034, | |
| "epoch": 2.205799812909261, | |
| "grad_norm": 0.041219562292099, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5198400616645813, | |
| "mean_token_accuracy": 0.7894931733608246, | |
| "num_tokens": 9614512.0, | |
| "step": 591 | |
| }, | |
| { | |
| "entropy": 0.5380221456289291, | |
| "epoch": 2.2095416276894295, | |
| "grad_norm": 0.03763064742088318, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350849032402039, | |
| "mean_token_accuracy": 0.779957503080368, | |
| "num_tokens": 9630831.0, | |
| "step": 592 | |
| }, | |
| { | |
| "entropy": 0.5404982268810272, | |
| "epoch": 2.213283442469598, | |
| "grad_norm": 0.03594009950757027, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446127653121948, | |
| "mean_token_accuracy": 0.7765700072050095, | |
| "num_tokens": 9647260.0, | |
| "step": 593 | |
| }, | |
| { | |
| "entropy": 0.5349030494689941, | |
| "epoch": 2.217025257249766, | |
| "grad_norm": 0.039131198078393936, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5407675504684448, | |
| "mean_token_accuracy": 0.7807668596506119, | |
| "num_tokens": 9663454.0, | |
| "step": 594 | |
| }, | |
| { | |
| "entropy": 0.5357907861471176, | |
| "epoch": 2.2207670720299344, | |
| "grad_norm": 0.03754086792469025, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390987396240234, | |
| "mean_token_accuracy": 0.7814063429832458, | |
| "num_tokens": 9679665.0, | |
| "step": 595 | |
| }, | |
| { | |
| "entropy": 0.539327397942543, | |
| "epoch": 2.224508886810103, | |
| "grad_norm": 0.042121171951293945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5349074006080627, | |
| "mean_token_accuracy": 0.7835494577884674, | |
| "num_tokens": 9695690.0, | |
| "step": 596 | |
| }, | |
| { | |
| "entropy": 0.5527440309524536, | |
| "epoch": 2.2282507015902713, | |
| "grad_norm": 0.034759730100631714, | |
| "learning_rate": 0.0002, | |
| "loss": 0.546990156173706, | |
| "mean_token_accuracy": 0.7748693376779556, | |
| "num_tokens": 9711925.0, | |
| "step": 597 | |
| }, | |
| { | |
| "entropy": 0.5339156091213226, | |
| "epoch": 2.2319925163704397, | |
| "grad_norm": 0.03824164718389511, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315659642219543, | |
| "mean_token_accuracy": 0.7847660332918167, | |
| "num_tokens": 9728568.0, | |
| "step": 598 | |
| }, | |
| { | |
| "entropy": 0.5418261140584946, | |
| "epoch": 2.235734331150608, | |
| "grad_norm": 0.03952635079622269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444273948669434, | |
| "mean_token_accuracy": 0.7786458134651184, | |
| "num_tokens": 9744937.0, | |
| "step": 599 | |
| }, | |
| { | |
| "entropy": 0.5325147211551666, | |
| "epoch": 2.2394761459307766, | |
| "grad_norm": 0.038507163524627686, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538148045539856, | |
| "mean_token_accuracy": 0.7803481221199036, | |
| "num_tokens": 9761521.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.5348295122385025, | |
| "epoch": 2.243217960710945, | |
| "grad_norm": 0.035764180123806, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350884199142456, | |
| "mean_token_accuracy": 0.7832496911287308, | |
| "num_tokens": 9777702.0, | |
| "step": 601 | |
| }, | |
| { | |
| "entropy": 0.549017146229744, | |
| "epoch": 2.246959775491113, | |
| "grad_norm": 0.037822045385837555, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5440195798873901, | |
| "mean_token_accuracy": 0.7799560874700546, | |
| "num_tokens": 9794070.0, | |
| "step": 602 | |
| }, | |
| { | |
| "entropy": 0.5402355939149857, | |
| "epoch": 2.2507015902712815, | |
| "grad_norm": 0.04137027636170387, | |
| "learning_rate": 0.0002, | |
| "loss": 0.552240788936615, | |
| "mean_token_accuracy": 0.7787455171346664, | |
| "num_tokens": 9810307.0, | |
| "step": 603 | |
| }, | |
| { | |
| "entropy": 0.5575389862060547, | |
| "epoch": 2.25444340505145, | |
| "grad_norm": 0.03639021888375282, | |
| "learning_rate": 0.0002, | |
| "loss": 0.555095911026001, | |
| "mean_token_accuracy": 0.7715982496738434, | |
| "num_tokens": 9826944.0, | |
| "step": 604 | |
| }, | |
| { | |
| "entropy": 0.5453804582357407, | |
| "epoch": 2.2581852198316184, | |
| "grad_norm": 0.0329916886985302, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451047420501709, | |
| "mean_token_accuracy": 0.778001993894577, | |
| "num_tokens": 9843174.0, | |
| "step": 605 | |
| }, | |
| { | |
| "entropy": 0.5351513028144836, | |
| "epoch": 2.261927034611787, | |
| "grad_norm": 0.04027882218360901, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5335583686828613, | |
| "mean_token_accuracy": 0.7831520736217499, | |
| "num_tokens": 9859568.0, | |
| "step": 606 | |
| }, | |
| { | |
| "entropy": 0.5303051620721817, | |
| "epoch": 2.2656688493919552, | |
| "grad_norm": 0.037942592054605484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5293945670127869, | |
| "mean_token_accuracy": 0.7875201851129532, | |
| "num_tokens": 9876127.0, | |
| "step": 607 | |
| }, | |
| { | |
| "entropy": 0.5205637887120247, | |
| "epoch": 2.2694106641721237, | |
| "grad_norm": 0.039965420961380005, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5284023284912109, | |
| "mean_token_accuracy": 0.7851175218820572, | |
| "num_tokens": 9892336.0, | |
| "step": 608 | |
| }, | |
| { | |
| "entropy": 0.5270423293113708, | |
| "epoch": 2.2731524789522917, | |
| "grad_norm": 0.045534420758485794, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361034274101257, | |
| "mean_token_accuracy": 0.7813378870487213, | |
| "num_tokens": 9908677.0, | |
| "step": 609 | |
| }, | |
| { | |
| "entropy": 0.5461472570896149, | |
| "epoch": 2.27689429373246, | |
| "grad_norm": 0.03911803662776947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5419346690177917, | |
| "mean_token_accuracy": 0.7793000787496567, | |
| "num_tokens": 9925188.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.5332899391651154, | |
| "epoch": 2.2806361085126285, | |
| "grad_norm": 0.03753461316227913, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5261275172233582, | |
| "mean_token_accuracy": 0.7856169193983078, | |
| "num_tokens": 9941232.0, | |
| "step": 611 | |
| }, | |
| { | |
| "entropy": 0.5298324078321457, | |
| "epoch": 2.284377923292797, | |
| "grad_norm": 0.03578303009271622, | |
| "learning_rate": 0.0002, | |
| "loss": 0.525759220123291, | |
| "mean_token_accuracy": 0.7869399040937424, | |
| "num_tokens": 9957312.0, | |
| "step": 612 | |
| }, | |
| { | |
| "entropy": 0.5350215286016464, | |
| "epoch": 2.2881197380729654, | |
| "grad_norm": 0.04014569893479347, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390491485595703, | |
| "mean_token_accuracy": 0.7834457159042358, | |
| "num_tokens": 9973629.0, | |
| "step": 613 | |
| }, | |
| { | |
| "entropy": 0.5366346836090088, | |
| "epoch": 2.291861552853134, | |
| "grad_norm": 0.03635207563638687, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5361836552619934, | |
| "mean_token_accuracy": 0.7822949439287186, | |
| "num_tokens": 9990003.0, | |
| "step": 614 | |
| }, | |
| { | |
| "entropy": 0.5358218550682068, | |
| "epoch": 2.2956033676333023, | |
| "grad_norm": 0.04499870166182518, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5433334708213806, | |
| "mean_token_accuracy": 0.781024381518364, | |
| "num_tokens": 10006594.0, | |
| "step": 615 | |
| }, | |
| { | |
| "entropy": 0.5238985568284988, | |
| "epoch": 2.2993451824134707, | |
| "grad_norm": 0.041404612362384796, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5319328308105469, | |
| "mean_token_accuracy": 0.7816060185432434, | |
| "num_tokens": 10022841.0, | |
| "step": 616 | |
| }, | |
| { | |
| "entropy": 0.5418704599142075, | |
| "epoch": 2.3030869971936387, | |
| "grad_norm": 0.03798811510205269, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5385047793388367, | |
| "mean_token_accuracy": 0.781515583395958, | |
| "num_tokens": 10039191.0, | |
| "step": 617 | |
| }, | |
| { | |
| "entropy": 0.5519637167453766, | |
| "epoch": 2.306828811973807, | |
| "grad_norm": 0.03714706003665924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444304347038269, | |
| "mean_token_accuracy": 0.779953271150589, | |
| "num_tokens": 10055793.0, | |
| "step": 618 | |
| }, | |
| { | |
| "entropy": 0.5363687425851822, | |
| "epoch": 2.3105706267539756, | |
| "grad_norm": 0.0435946062207222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.538260817527771, | |
| "mean_token_accuracy": 0.7822400480508804, | |
| "num_tokens": 10072406.0, | |
| "step": 619 | |
| }, | |
| { | |
| "entropy": 0.5363148003816605, | |
| "epoch": 2.314312441534144, | |
| "grad_norm": 0.03934507444500923, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5490261316299438, | |
| "mean_token_accuracy": 0.7775698453187943, | |
| "num_tokens": 10088893.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.5337411910295486, | |
| "epoch": 2.3180542563143125, | |
| "grad_norm": 0.040114130824804306, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454047322273254, | |
| "mean_token_accuracy": 0.7799661755561829, | |
| "num_tokens": 10105348.0, | |
| "step": 621 | |
| }, | |
| { | |
| "entropy": 0.5429546684026718, | |
| "epoch": 2.321796071094481, | |
| "grad_norm": 0.04296046867966652, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543846070766449, | |
| "mean_token_accuracy": 0.7779647558927536, | |
| "num_tokens": 10121753.0, | |
| "step": 622 | |
| }, | |
| { | |
| "entropy": 0.5331653952598572, | |
| "epoch": 2.3255378858746494, | |
| "grad_norm": 0.03862839564681053, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5329957008361816, | |
| "mean_token_accuracy": 0.7838963121175766, | |
| "num_tokens": 10138069.0, | |
| "step": 623 | |
| }, | |
| { | |
| "entropy": 0.5332556366920471, | |
| "epoch": 2.3292797006548174, | |
| "grad_norm": 0.03637029603123665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5306488871574402, | |
| "mean_token_accuracy": 0.7843363881111145, | |
| "num_tokens": 10154386.0, | |
| "step": 624 | |
| }, | |
| { | |
| "entropy": 0.5389147102832794, | |
| "epoch": 2.333021515434986, | |
| "grad_norm": 0.04242001101374626, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5379246473312378, | |
| "mean_token_accuracy": 0.7805036455392838, | |
| "num_tokens": 10170602.0, | |
| "step": 625 | |
| }, | |
| { | |
| "entropy": 0.529606968164444, | |
| "epoch": 2.3367633302151543, | |
| "grad_norm": 0.04366292059421539, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5345982909202576, | |
| "mean_token_accuracy": 0.7849325835704803, | |
| "num_tokens": 10186681.0, | |
| "step": 626 | |
| }, | |
| { | |
| "entropy": 0.5343451648950577, | |
| "epoch": 2.3405051449953227, | |
| "grad_norm": 0.04901853948831558, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5390074253082275, | |
| "mean_token_accuracy": 0.7809460461139679, | |
| "num_tokens": 10202735.0, | |
| "step": 627 | |
| }, | |
| { | |
| "entropy": 0.5364287346601486, | |
| "epoch": 2.344246959775491, | |
| "grad_norm": 0.03992681950330734, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5428602695465088, | |
| "mean_token_accuracy": 0.7803080379962921, | |
| "num_tokens": 10219104.0, | |
| "step": 628 | |
| }, | |
| { | |
| "entropy": 0.5363292992115021, | |
| "epoch": 2.3479887745556596, | |
| "grad_norm": 0.04561900347471237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5422950983047485, | |
| "mean_token_accuracy": 0.7803726643323898, | |
| "num_tokens": 10235450.0, | |
| "step": 629 | |
| }, | |
| { | |
| "entropy": 0.5503382086753845, | |
| "epoch": 2.351730589335828, | |
| "grad_norm": 0.036633238196372986, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5429909229278564, | |
| "mean_token_accuracy": 0.777814120054245, | |
| "num_tokens": 10251744.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5556712299585342, | |
| "epoch": 2.3554724041159965, | |
| "grad_norm": 0.03755469620227814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372464060783386, | |
| "mean_token_accuracy": 0.7816385924816132, | |
| "num_tokens": 10268228.0, | |
| "step": 631 | |
| }, | |
| { | |
| "entropy": 0.54240882396698, | |
| "epoch": 2.3592142188961645, | |
| "grad_norm": 0.04244554787874222, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416730046272278, | |
| "mean_token_accuracy": 0.7805517017841339, | |
| "num_tokens": 10284594.0, | |
| "step": 632 | |
| }, | |
| { | |
| "entropy": 0.5457853078842163, | |
| "epoch": 2.362956033676333, | |
| "grad_norm": 0.03768390789628029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5503990054130554, | |
| "mean_token_accuracy": 0.7760391384363174, | |
| "num_tokens": 10300645.0, | |
| "step": 633 | |
| }, | |
| { | |
| "entropy": 0.5061568543314934, | |
| "epoch": 2.3666978484565013, | |
| "grad_norm": 0.04066069424152374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5147897601127625, | |
| "mean_token_accuracy": 0.7923619449138641, | |
| "num_tokens": 10317035.0, | |
| "step": 634 | |
| }, | |
| { | |
| "entropy": 0.5265238285064697, | |
| "epoch": 2.3704396632366698, | |
| "grad_norm": 0.045070137828588486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5342065691947937, | |
| "mean_token_accuracy": 0.7828978300094604, | |
| "num_tokens": 10333097.0, | |
| "step": 635 | |
| }, | |
| { | |
| "entropy": 0.5213058292865753, | |
| "epoch": 2.374181478016838, | |
| "grad_norm": 0.04251949489116669, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5242940783500671, | |
| "mean_token_accuracy": 0.7875875681638718, | |
| "num_tokens": 10349477.0, | |
| "step": 636 | |
| }, | |
| { | |
| "entropy": 0.532469779253006, | |
| "epoch": 2.3779232927970066, | |
| "grad_norm": 0.04180033504962921, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338732600212097, | |
| "mean_token_accuracy": 0.7874448299407959, | |
| "num_tokens": 10365855.0, | |
| "step": 637 | |
| }, | |
| { | |
| "entropy": 0.5583899617195129, | |
| "epoch": 2.381665107577175, | |
| "grad_norm": 0.036461617797613144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5522404313087463, | |
| "mean_token_accuracy": 0.7765318900346756, | |
| "num_tokens": 10382454.0, | |
| "step": 638 | |
| }, | |
| { | |
| "entropy": 0.5361616462469101, | |
| "epoch": 2.385406922357343, | |
| "grad_norm": 0.03820829838514328, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331661701202393, | |
| "mean_token_accuracy": 0.7812754958868027, | |
| "num_tokens": 10398570.0, | |
| "step": 639 | |
| }, | |
| { | |
| "entropy": 0.5388377606868744, | |
| "epoch": 2.3891487371375115, | |
| "grad_norm": 0.03890148177742958, | |
| "learning_rate": 0.0002, | |
| "loss": 0.535783052444458, | |
| "mean_token_accuracy": 0.7837421149015427, | |
| "num_tokens": 10415136.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.5403297692537308, | |
| "epoch": 2.39289055191768, | |
| "grad_norm": 0.037266530096530914, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5458592176437378, | |
| "mean_token_accuracy": 0.7799215018749237, | |
| "num_tokens": 10431595.0, | |
| "step": 641 | |
| }, | |
| { | |
| "entropy": 0.5327188819646835, | |
| "epoch": 2.3966323666978484, | |
| "grad_norm": 0.04411016404628754, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5372153520584106, | |
| "mean_token_accuracy": 0.7820907682180405, | |
| "num_tokens": 10448092.0, | |
| "step": 642 | |
| }, | |
| { | |
| "entropy": 0.5483715236186981, | |
| "epoch": 2.400374181478017, | |
| "grad_norm": 0.03909829258918762, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5454411506652832, | |
| "mean_token_accuracy": 0.781398132443428, | |
| "num_tokens": 10464267.0, | |
| "step": 643 | |
| }, | |
| { | |
| "entropy": 0.5467081367969513, | |
| "epoch": 2.4041159962581853, | |
| "grad_norm": 0.04295220598578453, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5442530512809753, | |
| "mean_token_accuracy": 0.7759910225868225, | |
| "num_tokens": 10480622.0, | |
| "step": 644 | |
| }, | |
| { | |
| "entropy": 0.545724093914032, | |
| "epoch": 2.4078578110383537, | |
| "grad_norm": 0.04099191352725029, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471324324607849, | |
| "mean_token_accuracy": 0.7780001610517502, | |
| "num_tokens": 10497093.0, | |
| "step": 645 | |
| }, | |
| { | |
| "entropy": 0.5526789277791977, | |
| "epoch": 2.411599625818522, | |
| "grad_norm": 0.03481397032737732, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5524189472198486, | |
| "mean_token_accuracy": 0.7738725692033768, | |
| "num_tokens": 10513288.0, | |
| "step": 646 | |
| }, | |
| { | |
| "entropy": 0.5496002286672592, | |
| "epoch": 2.4153414405986906, | |
| "grad_norm": 0.04474830627441406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568821430206299, | |
| "mean_token_accuracy": 0.7747314423322678, | |
| "num_tokens": 10529966.0, | |
| "step": 647 | |
| }, | |
| { | |
| "entropy": 0.5191539749503136, | |
| "epoch": 2.4190832553788586, | |
| "grad_norm": 0.04506181180477142, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247750878334045, | |
| "mean_token_accuracy": 0.7888272404670715, | |
| "num_tokens": 10546217.0, | |
| "step": 648 | |
| }, | |
| { | |
| "entropy": 0.5462011098861694, | |
| "epoch": 2.422825070159027, | |
| "grad_norm": 0.03946157172322273, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449219942092896, | |
| "mean_token_accuracy": 0.7763949930667877, | |
| "num_tokens": 10562587.0, | |
| "step": 649 | |
| }, | |
| { | |
| "entropy": 0.5374903529882431, | |
| "epoch": 2.4265668849391955, | |
| "grad_norm": 0.035694316029548645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5298718214035034, | |
| "mean_token_accuracy": 0.7844248116016388, | |
| "num_tokens": 10578673.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.5490742027759552, | |
| "epoch": 2.430308699719364, | |
| "grad_norm": 0.040128957480192184, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5476623773574829, | |
| "mean_token_accuracy": 0.7761844098567963, | |
| "num_tokens": 10594904.0, | |
| "step": 651 | |
| }, | |
| { | |
| "entropy": 0.5350600033998489, | |
| "epoch": 2.4340505144995324, | |
| "grad_norm": 0.04965779185295105, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5467137694358826, | |
| "mean_token_accuracy": 0.7777107208967209, | |
| "num_tokens": 10611301.0, | |
| "step": 652 | |
| }, | |
| { | |
| "entropy": 0.5389928370714188, | |
| "epoch": 2.437792329279701, | |
| "grad_norm": 0.038716454058885574, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5406030416488647, | |
| "mean_token_accuracy": 0.7798842638731003, | |
| "num_tokens": 10627924.0, | |
| "step": 653 | |
| }, | |
| { | |
| "entropy": 0.5396043509244919, | |
| "epoch": 2.441534144059869, | |
| "grad_norm": 0.04796689748764038, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5485687255859375, | |
| "mean_token_accuracy": 0.7767132520675659, | |
| "num_tokens": 10643995.0, | |
| "step": 654 | |
| }, | |
| { | |
| "entropy": 0.5651813000440598, | |
| "epoch": 2.4452759588400372, | |
| "grad_norm": 0.03899235278367996, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5558621883392334, | |
| "mean_token_accuracy": 0.7751055210828781, | |
| "num_tokens": 10660611.0, | |
| "step": 655 | |
| }, | |
| { | |
| "entropy": 0.5467101633548737, | |
| "epoch": 2.4490177736202057, | |
| "grad_norm": 0.041317425668239594, | |
| "learning_rate": 0.0002, | |
| "loss": 0.544463574886322, | |
| "mean_token_accuracy": 0.7791299223899841, | |
| "num_tokens": 10676939.0, | |
| "step": 656 | |
| }, | |
| { | |
| "entropy": 0.5405649244785309, | |
| "epoch": 2.452759588400374, | |
| "grad_norm": 0.03767058625817299, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5359505414962769, | |
| "mean_token_accuracy": 0.7838631421327591, | |
| "num_tokens": 10693242.0, | |
| "step": 657 | |
| }, | |
| { | |
| "entropy": 0.5295758992433548, | |
| "epoch": 2.4565014031805426, | |
| "grad_norm": 0.03993664309382439, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338568091392517, | |
| "mean_token_accuracy": 0.7815168350934982, | |
| "num_tokens": 10709228.0, | |
| "step": 658 | |
| }, | |
| { | |
| "entropy": 0.5318661481142044, | |
| "epoch": 2.460243217960711, | |
| "grad_norm": 0.04673660546541214, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5387503504753113, | |
| "mean_token_accuracy": 0.7823595702648163, | |
| "num_tokens": 10725743.0, | |
| "step": 659 | |
| }, | |
| { | |
| "entropy": 0.5362888127565384, | |
| "epoch": 2.4639850327408794, | |
| "grad_norm": 0.0443369522690773, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374599099159241, | |
| "mean_token_accuracy": 0.7816221117973328, | |
| "num_tokens": 10742450.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5324875563383102, | |
| "epoch": 2.467726847521048, | |
| "grad_norm": 0.037758708000183105, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5326871871948242, | |
| "mean_token_accuracy": 0.7862564772367477, | |
| "num_tokens": 10758610.0, | |
| "step": 661 | |
| }, | |
| { | |
| "entropy": 0.5277500152587891, | |
| "epoch": 2.4714686623012163, | |
| "grad_norm": 0.042098864912986755, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5331279635429382, | |
| "mean_token_accuracy": 0.7840241938829422, | |
| "num_tokens": 10774701.0, | |
| "step": 662 | |
| }, | |
| { | |
| "entropy": 0.5366615355014801, | |
| "epoch": 2.4752104770813843, | |
| "grad_norm": 0.040946412831544876, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5397564768791199, | |
| "mean_token_accuracy": 0.7829322069883347, | |
| "num_tokens": 10790740.0, | |
| "step": 663 | |
| }, | |
| { | |
| "entropy": 0.5435209423303604, | |
| "epoch": 2.4789522918615527, | |
| "grad_norm": 0.04173668473958969, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5457897186279297, | |
| "mean_token_accuracy": 0.7782775014638901, | |
| "num_tokens": 10806903.0, | |
| "step": 664 | |
| }, | |
| { | |
| "entropy": 0.5472803115844727, | |
| "epoch": 2.482694106641721, | |
| "grad_norm": 0.040667202323675156, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5462859869003296, | |
| "mean_token_accuracy": 0.7769711166620255, | |
| "num_tokens": 10823042.0, | |
| "step": 665 | |
| }, | |
| { | |
| "entropy": 0.5469382554292679, | |
| "epoch": 2.4864359214218896, | |
| "grad_norm": 0.04248496890068054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395170450210571, | |
| "mean_token_accuracy": 0.7798823863267899, | |
| "num_tokens": 10839340.0, | |
| "step": 666 | |
| }, | |
| { | |
| "entropy": 0.5202000439167023, | |
| "epoch": 2.490177736202058, | |
| "grad_norm": 0.03368566930294037, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5234949588775635, | |
| "mean_token_accuracy": 0.786568820476532, | |
| "num_tokens": 10855502.0, | |
| "step": 667 | |
| }, | |
| { | |
| "entropy": 0.5273594409227371, | |
| "epoch": 2.4939195509822265, | |
| "grad_norm": 0.04516978561878204, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360161066055298, | |
| "mean_token_accuracy": 0.7853840887546539, | |
| "num_tokens": 10871840.0, | |
| "step": 668 | |
| }, | |
| { | |
| "entropy": 0.5393954515457153, | |
| "epoch": 2.497661365762395, | |
| "grad_norm": 0.03674040734767914, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5378697514533997, | |
| "mean_token_accuracy": 0.7824258059263229, | |
| "num_tokens": 10888120.0, | |
| "step": 669 | |
| }, | |
| { | |
| "entropy": 0.5479197651147842, | |
| "epoch": 2.501403180542563, | |
| "grad_norm": 0.03727351129055023, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5392875671386719, | |
| "mean_token_accuracy": 0.7811300158500671, | |
| "num_tokens": 10904483.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.552995502948761, | |
| "epoch": 2.5051449953227314, | |
| "grad_norm": 0.036775074899196625, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5475963950157166, | |
| "mean_token_accuracy": 0.7784164547920227, | |
| "num_tokens": 10920853.0, | |
| "step": 671 | |
| }, | |
| { | |
| "entropy": 0.5446810871362686, | |
| "epoch": 2.5088868101029, | |
| "grad_norm": 0.038499053567647934, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5511402487754822, | |
| "mean_token_accuracy": 0.7761510908603668, | |
| "num_tokens": 10937231.0, | |
| "step": 672 | |
| }, | |
| { | |
| "entropy": 0.5175495520234108, | |
| "epoch": 2.5126286248830683, | |
| "grad_norm": 0.039775073528289795, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5242205858230591, | |
| "mean_token_accuracy": 0.7848553359508514, | |
| "num_tokens": 10953429.0, | |
| "step": 673 | |
| }, | |
| { | |
| "entropy": 0.5237327665090561, | |
| "epoch": 2.5163704396632367, | |
| "grad_norm": 0.04171684384346008, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307218432426453, | |
| "mean_token_accuracy": 0.7838338315486908, | |
| "num_tokens": 10969808.0, | |
| "step": 674 | |
| }, | |
| { | |
| "entropy": 0.5405460149049759, | |
| "epoch": 2.520112254443405, | |
| "grad_norm": 0.04240800440311432, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408159494400024, | |
| "mean_token_accuracy": 0.7787611186504364, | |
| "num_tokens": 10986049.0, | |
| "step": 675 | |
| }, | |
| { | |
| "entropy": 0.5486787706613541, | |
| "epoch": 2.5238540692235736, | |
| "grad_norm": 0.039784692227840424, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455769896507263, | |
| "mean_token_accuracy": 0.7784162014722824, | |
| "num_tokens": 11002254.0, | |
| "step": 676 | |
| }, | |
| { | |
| "entropy": 0.5363409966230392, | |
| "epoch": 2.527595884003742, | |
| "grad_norm": 0.03736806660890579, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5266451239585876, | |
| "mean_token_accuracy": 0.7866665124893188, | |
| "num_tokens": 11018914.0, | |
| "step": 677 | |
| }, | |
| { | |
| "entropy": 0.5279175043106079, | |
| "epoch": 2.5313376987839105, | |
| "grad_norm": 0.035363830626010895, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288829207420349, | |
| "mean_token_accuracy": 0.7874743491411209, | |
| "num_tokens": 11034952.0, | |
| "step": 678 | |
| }, | |
| { | |
| "entropy": 0.5376022309064865, | |
| "epoch": 2.5350795135640785, | |
| "grad_norm": 0.051831189543008804, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518858432769775, | |
| "mean_token_accuracy": 0.7750970423221588, | |
| "num_tokens": 11051172.0, | |
| "step": 679 | |
| }, | |
| { | |
| "entropy": 0.5426171720027924, | |
| "epoch": 2.538821328344247, | |
| "grad_norm": 0.04189771041274071, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5544742345809937, | |
| "mean_token_accuracy": 0.7774394005537033, | |
| "num_tokens": 11067538.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5293037593364716, | |
| "epoch": 2.5425631431244153, | |
| "grad_norm": 0.04074425622820854, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5310404896736145, | |
| "mean_token_accuracy": 0.7826415598392487, | |
| "num_tokens": 11083927.0, | |
| "step": 681 | |
| }, | |
| { | |
| "entropy": 0.5473333150148392, | |
| "epoch": 2.5463049579045838, | |
| "grad_norm": 0.03279516100883484, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5383847951889038, | |
| "mean_token_accuracy": 0.7836183458566666, | |
| "num_tokens": 11100675.0, | |
| "step": 682 | |
| }, | |
| { | |
| "entropy": 0.5422270894050598, | |
| "epoch": 2.550046772684752, | |
| "grad_norm": 0.039768971502780914, | |
| "learning_rate": 0.0002, | |
| "loss": 0.543849766254425, | |
| "mean_token_accuracy": 0.7796186804771423, | |
| "num_tokens": 11116748.0, | |
| "step": 683 | |
| }, | |
| { | |
| "entropy": 0.5384610444307327, | |
| "epoch": 2.55378858746492, | |
| "grad_norm": 0.037385329604148865, | |
| "learning_rate": 0.0002, | |
| "loss": 0.54084312915802, | |
| "mean_token_accuracy": 0.7830232381820679, | |
| "num_tokens": 11133051.0, | |
| "step": 684 | |
| }, | |
| { | |
| "entropy": 0.5261296629905701, | |
| "epoch": 2.5575304022450887, | |
| "grad_norm": 0.039306074380874634, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531363844871521, | |
| "mean_token_accuracy": 0.785315752029419, | |
| "num_tokens": 11149362.0, | |
| "step": 685 | |
| }, | |
| { | |
| "entropy": 0.5491520762443542, | |
| "epoch": 2.561272217025257, | |
| "grad_norm": 0.04143069311976433, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5444177389144897, | |
| "mean_token_accuracy": 0.7807131856679916, | |
| "num_tokens": 11165746.0, | |
| "step": 686 | |
| }, | |
| { | |
| "entropy": 0.53914874792099, | |
| "epoch": 2.5650140318054255, | |
| "grad_norm": 0.03408098593354225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294961929321289, | |
| "mean_token_accuracy": 0.7870545238256454, | |
| "num_tokens": 11182138.0, | |
| "step": 687 | |
| }, | |
| { | |
| "entropy": 0.5346123576164246, | |
| "epoch": 2.568755846585594, | |
| "grad_norm": 0.04301401227712631, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5353041887283325, | |
| "mean_token_accuracy": 0.784915953874588, | |
| "num_tokens": 11198330.0, | |
| "step": 688 | |
| }, | |
| { | |
| "entropy": 0.5318583697080612, | |
| "epoch": 2.5724976613657624, | |
| "grad_norm": 0.04231448844075203, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5399123430252075, | |
| "mean_token_accuracy": 0.7802146077156067, | |
| "num_tokens": 11214613.0, | |
| "step": 689 | |
| }, | |
| { | |
| "entropy": 0.5280211716890335, | |
| "epoch": 2.576239476145931, | |
| "grad_norm": 0.04549930989742279, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432953238487244, | |
| "mean_token_accuracy": 0.777678519487381, | |
| "num_tokens": 11230987.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.5567438304424286, | |
| "epoch": 2.5799812909260993, | |
| "grad_norm": 0.03926197439432144, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5588645339012146, | |
| "mean_token_accuracy": 0.7713411450386047, | |
| "num_tokens": 11247503.0, | |
| "step": 691 | |
| }, | |
| { | |
| "entropy": 0.542352095246315, | |
| "epoch": 2.5837231057062677, | |
| "grad_norm": 0.035485655069351196, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354308485984802, | |
| "mean_token_accuracy": 0.7822972387075424, | |
| "num_tokens": 11263949.0, | |
| "step": 692 | |
| }, | |
| { | |
| "entropy": 0.5373577028512955, | |
| "epoch": 2.587464920486436, | |
| "grad_norm": 0.04045470058917999, | |
| "learning_rate": 0.0002, | |
| "loss": 0.524779200553894, | |
| "mean_token_accuracy": 0.785191684961319, | |
| "num_tokens": 11280345.0, | |
| "step": 693 | |
| }, | |
| { | |
| "entropy": 0.5388759598135948, | |
| "epoch": 2.591206735266604, | |
| "grad_norm": 0.03759071230888367, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5312530994415283, | |
| "mean_token_accuracy": 0.7809051126241684, | |
| "num_tokens": 11296587.0, | |
| "step": 694 | |
| }, | |
| { | |
| "entropy": 0.5210207849740982, | |
| "epoch": 2.5949485500467726, | |
| "grad_norm": 0.03664049878716469, | |
| "learning_rate": 0.0002, | |
| "loss": 0.526019275188446, | |
| "mean_token_accuracy": 0.7867360413074493, | |
| "num_tokens": 11313101.0, | |
| "step": 695 | |
| }, | |
| { | |
| "entropy": 0.5182994976639748, | |
| "epoch": 2.598690364826941, | |
| "grad_norm": 0.05368485301733017, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5354053974151611, | |
| "mean_token_accuracy": 0.7826909422874451, | |
| "num_tokens": 11329367.0, | |
| "step": 696 | |
| }, | |
| { | |
| "entropy": 0.5452821850776672, | |
| "epoch": 2.6024321796071095, | |
| "grad_norm": 0.04641703888773918, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5546022057533264, | |
| "mean_token_accuracy": 0.7768976241350174, | |
| "num_tokens": 11345547.0, | |
| "step": 697 | |
| }, | |
| { | |
| "entropy": 0.5391091257333755, | |
| "epoch": 2.606173994387278, | |
| "grad_norm": 0.04271511733531952, | |
| "learning_rate": 0.0002, | |
| "loss": 0.541153073310852, | |
| "mean_token_accuracy": 0.7804041355848312, | |
| "num_tokens": 11361574.0, | |
| "step": 698 | |
| }, | |
| { | |
| "entropy": 0.5462173670530319, | |
| "epoch": 2.6099158091674464, | |
| "grad_norm": 0.03939999267458916, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369886159896851, | |
| "mean_token_accuracy": 0.7804831266403198, | |
| "num_tokens": 11377812.0, | |
| "step": 699 | |
| }, | |
| { | |
| "entropy": 0.5714237540960312, | |
| "epoch": 2.6136576239476144, | |
| "grad_norm": 0.03745459020137787, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5620177984237671, | |
| "mean_token_accuracy": 0.7719487398862839, | |
| "num_tokens": 11394403.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.5377793908119202, | |
| "epoch": 2.617399438727783, | |
| "grad_norm": 0.03732477128505707, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375291109085083, | |
| "mean_token_accuracy": 0.7813573330640793, | |
| "num_tokens": 11410706.0, | |
| "step": 701 | |
| }, | |
| { | |
| "entropy": 0.5385070145130157, | |
| "epoch": 2.6211412535079512, | |
| "grad_norm": 0.04680998623371124, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5455629825592041, | |
| "mean_token_accuracy": 0.776125431060791, | |
| "num_tokens": 11427143.0, | |
| "step": 702 | |
| }, | |
| { | |
| "entropy": 0.5411592125892639, | |
| "epoch": 2.6248830682881197, | |
| "grad_norm": 0.037070900201797485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5470774173736572, | |
| "mean_token_accuracy": 0.7772253155708313, | |
| "num_tokens": 11443536.0, | |
| "step": 703 | |
| }, | |
| { | |
| "entropy": 0.5268983989953995, | |
| "epoch": 2.628624883068288, | |
| "grad_norm": 0.04107747972011566, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5320890545845032, | |
| "mean_token_accuracy": 0.7819889187812805, | |
| "num_tokens": 11459635.0, | |
| "step": 704 | |
| }, | |
| { | |
| "entropy": 0.5278744846582413, | |
| "epoch": 2.6323666978484566, | |
| "grad_norm": 0.03608566150069237, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5288647413253784, | |
| "mean_token_accuracy": 0.7842333018779755, | |
| "num_tokens": 11476037.0, | |
| "step": 705 | |
| }, | |
| { | |
| "entropy": 0.5504002794623375, | |
| "epoch": 2.636108512628625, | |
| "grad_norm": 0.041055019944906235, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5523802638053894, | |
| "mean_token_accuracy": 0.7737344652414322, | |
| "num_tokens": 11492344.0, | |
| "step": 706 | |
| }, | |
| { | |
| "entropy": 0.541622132062912, | |
| "epoch": 2.6398503274087934, | |
| "grad_norm": 0.03790360316634178, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5410860776901245, | |
| "mean_token_accuracy": 0.7775967717170715, | |
| "num_tokens": 11508715.0, | |
| "step": 707 | |
| }, | |
| { | |
| "entropy": 0.53721022605896, | |
| "epoch": 2.643592142188962, | |
| "grad_norm": 0.048964016139507294, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5369323492050171, | |
| "mean_token_accuracy": 0.7816558331251144, | |
| "num_tokens": 11525153.0, | |
| "step": 708 | |
| }, | |
| { | |
| "entropy": 0.5321754217147827, | |
| "epoch": 2.64733395696913, | |
| "grad_norm": 0.048466358333826065, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5365191698074341, | |
| "mean_token_accuracy": 0.7804320156574249, | |
| "num_tokens": 11541270.0, | |
| "step": 709 | |
| }, | |
| { | |
| "entropy": 0.5573434978723526, | |
| "epoch": 2.6510757717492983, | |
| "grad_norm": 0.045038264244794846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5563772320747375, | |
| "mean_token_accuracy": 0.7737798243761063, | |
| "num_tokens": 11557694.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5524247735738754, | |
| "epoch": 2.6548175865294668, | |
| "grad_norm": 0.038673996925354004, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5518113970756531, | |
| "mean_token_accuracy": 0.7768261432647705, | |
| "num_tokens": 11574308.0, | |
| "step": 711 | |
| }, | |
| { | |
| "entropy": 0.5358691960573196, | |
| "epoch": 2.658559401309635, | |
| "grad_norm": 0.03978041559457779, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5338990688323975, | |
| "mean_token_accuracy": 0.7842043936252594, | |
| "num_tokens": 11590586.0, | |
| "step": 712 | |
| }, | |
| { | |
| "entropy": 0.5332267433404922, | |
| "epoch": 2.6623012160898036, | |
| "grad_norm": 0.03574821725487709, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5405697822570801, | |
| "mean_token_accuracy": 0.7808981388807297, | |
| "num_tokens": 11606867.0, | |
| "step": 713 | |
| }, | |
| { | |
| "entropy": 0.5254797339439392, | |
| "epoch": 2.666043030869972, | |
| "grad_norm": 0.040162764489650726, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5316233038902283, | |
| "mean_token_accuracy": 0.7839036136865616, | |
| "num_tokens": 11623321.0, | |
| "step": 714 | |
| }, | |
| { | |
| "entropy": 0.5194612145423889, | |
| "epoch": 2.66978484565014, | |
| "grad_norm": 0.0536888912320137, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5308873057365417, | |
| "mean_token_accuracy": 0.7844232022762299, | |
| "num_tokens": 11639616.0, | |
| "step": 715 | |
| }, | |
| { | |
| "entropy": 0.5397140085697174, | |
| "epoch": 2.6735266604303085, | |
| "grad_norm": 0.034708283841609955, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5418391227722168, | |
| "mean_token_accuracy": 0.7771459370851517, | |
| "num_tokens": 11655924.0, | |
| "step": 716 | |
| }, | |
| { | |
| "entropy": 0.5523687899112701, | |
| "epoch": 2.677268475210477, | |
| "grad_norm": 0.03549209609627724, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5451604127883911, | |
| "mean_token_accuracy": 0.7780284285545349, | |
| "num_tokens": 11672448.0, | |
| "step": 717 | |
| }, | |
| { | |
| "entropy": 0.5573620796203613, | |
| "epoch": 2.6810102899906454, | |
| "grad_norm": 0.03517598658800125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5482261180877686, | |
| "mean_token_accuracy": 0.7732254415750504, | |
| "num_tokens": 11688985.0, | |
| "step": 718 | |
| }, | |
| { | |
| "entropy": 0.5521951466798782, | |
| "epoch": 2.684752104770814, | |
| "grad_norm": 0.03560207411646843, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5395568609237671, | |
| "mean_token_accuracy": 0.7822758108377457, | |
| "num_tokens": 11705608.0, | |
| "step": 719 | |
| }, | |
| { | |
| "entropy": 0.5614044666290283, | |
| "epoch": 2.6884939195509823, | |
| "grad_norm": 0.04236432537436485, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560280084609985, | |
| "mean_token_accuracy": 0.7751108258962631, | |
| "num_tokens": 11721966.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5331545174121857, | |
| "epoch": 2.6922357343311507, | |
| "grad_norm": 0.03850049898028374, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5384074449539185, | |
| "mean_token_accuracy": 0.7795211225748062, | |
| "num_tokens": 11738118.0, | |
| "step": 721 | |
| }, | |
| { | |
| "entropy": 0.5322619527578354, | |
| "epoch": 2.695977549111319, | |
| "grad_norm": 0.04224139824509621, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5480450987815857, | |
| "mean_token_accuracy": 0.7758100479841232, | |
| "num_tokens": 11754350.0, | |
| "step": 722 | |
| }, | |
| { | |
| "entropy": 0.53462353348732, | |
| "epoch": 2.6997193638914876, | |
| "grad_norm": 0.03856648504734039, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5420241355895996, | |
| "mean_token_accuracy": 0.7794053852558136, | |
| "num_tokens": 11770468.0, | |
| "step": 723 | |
| }, | |
| { | |
| "entropy": 0.5529629737138748, | |
| "epoch": 2.703461178671656, | |
| "grad_norm": 0.03881238028407097, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5515606999397278, | |
| "mean_token_accuracy": 0.777623638510704, | |
| "num_tokens": 11786891.0, | |
| "step": 724 | |
| }, | |
| { | |
| "entropy": 0.5365050584077835, | |
| "epoch": 2.707202993451824, | |
| "grad_norm": 0.030840173363685608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5374981760978699, | |
| "mean_token_accuracy": 0.7810342460870743, | |
| "num_tokens": 11803202.0, | |
| "step": 725 | |
| }, | |
| { | |
| "entropy": 0.5490061491727829, | |
| "epoch": 2.7109448082319925, | |
| "grad_norm": 0.03318411111831665, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5416221022605896, | |
| "mean_token_accuracy": 0.7810187339782715, | |
| "num_tokens": 11819633.0, | |
| "step": 726 | |
| }, | |
| { | |
| "entropy": 0.5287661999464035, | |
| "epoch": 2.714686623012161, | |
| "grad_norm": 0.033848777413368225, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5285395383834839, | |
| "mean_token_accuracy": 0.785768449306488, | |
| "num_tokens": 11835951.0, | |
| "step": 727 | |
| }, | |
| { | |
| "entropy": 0.5228402391076088, | |
| "epoch": 2.7184284377923293, | |
| "grad_norm": 0.037826504558324814, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5267374515533447, | |
| "mean_token_accuracy": 0.7853263914585114, | |
| "num_tokens": 11852172.0, | |
| "step": 728 | |
| }, | |
| { | |
| "entropy": 0.5451251715421677, | |
| "epoch": 2.722170252572498, | |
| "grad_norm": 0.03935185819864273, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5431327223777771, | |
| "mean_token_accuracy": 0.7800047546625137, | |
| "num_tokens": 11868665.0, | |
| "step": 729 | |
| }, | |
| { | |
| "entropy": 0.5370529890060425, | |
| "epoch": 2.725912067352666, | |
| "grad_norm": 0.040121592581272125, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5504775643348694, | |
| "mean_token_accuracy": 0.7777304202318192, | |
| "num_tokens": 11884782.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5336936116218567, | |
| "epoch": 2.729653882132834, | |
| "grad_norm": 0.046451181173324585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401822328567505, | |
| "mean_token_accuracy": 0.7810492217540741, | |
| "num_tokens": 11900966.0, | |
| "step": 731 | |
| }, | |
| { | |
| "entropy": 0.5421666949987411, | |
| "epoch": 2.7333956969130027, | |
| "grad_norm": 0.03996991366147995, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5425142645835876, | |
| "mean_token_accuracy": 0.7759256362915039, | |
| "num_tokens": 11917559.0, | |
| "step": 732 | |
| }, | |
| { | |
| "entropy": 0.5548020005226135, | |
| "epoch": 2.737137511693171, | |
| "grad_norm": 0.039705440402030945, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5471047163009644, | |
| "mean_token_accuracy": 0.7788440138101578, | |
| "num_tokens": 11933791.0, | |
| "step": 733 | |
| }, | |
| { | |
| "entropy": 0.5459768623113632, | |
| "epoch": 2.7408793264733395, | |
| "grad_norm": 0.044193848967552185, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5505638718605042, | |
| "mean_token_accuracy": 0.7753681987524033, | |
| "num_tokens": 11949788.0, | |
| "step": 734 | |
| }, | |
| { | |
| "entropy": 0.5197051167488098, | |
| "epoch": 2.744621141253508, | |
| "grad_norm": 0.04006953909993172, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5269069671630859, | |
| "mean_token_accuracy": 0.7862325310707092, | |
| "num_tokens": 11965909.0, | |
| "step": 735 | |
| }, | |
| { | |
| "entropy": 0.5576485246419907, | |
| "epoch": 2.7483629560336764, | |
| "grad_norm": 0.03677723556756973, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5640283823013306, | |
| "mean_token_accuracy": 0.7697114050388336, | |
| "num_tokens": 11982388.0, | |
| "step": 736 | |
| }, | |
| { | |
| "entropy": 0.5379237085580826, | |
| "epoch": 2.752104770813845, | |
| "grad_norm": 0.03523614630103111, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5367957353591919, | |
| "mean_token_accuracy": 0.7794550508260727, | |
| "num_tokens": 11998589.0, | |
| "step": 737 | |
| }, | |
| { | |
| "entropy": 0.5357311069965363, | |
| "epoch": 2.7558465855940133, | |
| "grad_norm": 0.03599949926137924, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5299929976463318, | |
| "mean_token_accuracy": 0.784047082066536, | |
| "num_tokens": 12014892.0, | |
| "step": 738 | |
| }, | |
| { | |
| "entropy": 0.5434677302837372, | |
| "epoch": 2.7595884003741817, | |
| "grad_norm": 0.03983872011303902, | |
| "learning_rate": 0.0002, | |
| "loss": 0.537936806678772, | |
| "mean_token_accuracy": 0.7832438200712204, | |
| "num_tokens": 12030925.0, | |
| "step": 739 | |
| }, | |
| { | |
| "entropy": 0.5472689718008041, | |
| "epoch": 2.7633302151543497, | |
| "grad_norm": 0.03287053480744362, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5477735996246338, | |
| "mean_token_accuracy": 0.7759514302015305, | |
| "num_tokens": 12047168.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.5356525778770447, | |
| "epoch": 2.767072029934518, | |
| "grad_norm": 0.03699969872832298, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5401504635810852, | |
| "mean_token_accuracy": 0.7797222137451172, | |
| "num_tokens": 12063859.0, | |
| "step": 741 | |
| }, | |
| { | |
| "entropy": 0.522783175110817, | |
| "epoch": 2.7708138447146866, | |
| "grad_norm": 0.04751390591263771, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5334336161613464, | |
| "mean_token_accuracy": 0.785777673125267, | |
| "num_tokens": 12080092.0, | |
| "step": 742 | |
| }, | |
| { | |
| "entropy": 0.5513002574443817, | |
| "epoch": 2.774555659494855, | |
| "grad_norm": 0.04812496900558472, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542380809783936, | |
| "mean_token_accuracy": 0.7760861963033676, | |
| "num_tokens": 12096314.0, | |
| "step": 743 | |
| }, | |
| { | |
| "entropy": 0.5436785966157913, | |
| "epoch": 2.7782974742750235, | |
| "grad_norm": 0.03719832003116608, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5375255346298218, | |
| "mean_token_accuracy": 0.7817601412534714, | |
| "num_tokens": 12112385.0, | |
| "step": 744 | |
| }, | |
| { | |
| "entropy": 0.5392426550388336, | |
| "epoch": 2.782039289055192, | |
| "grad_norm": 0.036235589534044266, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5315327644348145, | |
| "mean_token_accuracy": 0.783770278096199, | |
| "num_tokens": 12128749.0, | |
| "step": 745 | |
| }, | |
| { | |
| "entropy": 0.5371043086051941, | |
| "epoch": 2.78578110383536, | |
| "grad_norm": 0.04002665355801582, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5355648994445801, | |
| "mean_token_accuracy": 0.7825834453105927, | |
| "num_tokens": 12145069.0, | |
| "step": 746 | |
| }, | |
| { | |
| "entropy": 0.5386099964380264, | |
| "epoch": 2.7895229186155284, | |
| "grad_norm": 0.0372973270714283, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5449782609939575, | |
| "mean_token_accuracy": 0.7772656977176666, | |
| "num_tokens": 12161381.0, | |
| "step": 747 | |
| }, | |
| { | |
| "entropy": 0.49367938190698624, | |
| "epoch": 2.793264733395697, | |
| "grad_norm": 0.042931776493787766, | |
| "learning_rate": 0.0002, | |
| "loss": 0.49913763999938965, | |
| "mean_token_accuracy": 0.795563668012619, | |
| "num_tokens": 12177674.0, | |
| "step": 748 | |
| }, | |
| { | |
| "entropy": 0.5577136278152466, | |
| "epoch": 2.7970065481758652, | |
| "grad_norm": 0.03464139625430107, | |
| "learning_rate": 0.0002, | |
| "loss": 0.563284158706665, | |
| "mean_token_accuracy": 0.7712576389312744, | |
| "num_tokens": 12194200.0, | |
| "step": 749 | |
| }, | |
| { | |
| "entropy": 0.5163726359605789, | |
| "epoch": 2.8007483629560337, | |
| "grad_norm": 0.043806042522192, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5230565071105957, | |
| "mean_token_accuracy": 0.7878428548574448, | |
| "num_tokens": 12210649.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.5474874824285507, | |
| "epoch": 2.804490177736202, | |
| "grad_norm": 0.03748728707432747, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5494849681854248, | |
| "mean_token_accuracy": 0.777756467461586, | |
| "num_tokens": 12226971.0, | |
| "step": 751 | |
| }, | |
| { | |
| "entropy": 0.5351517200469971, | |
| "epoch": 2.8082319925163706, | |
| "grad_norm": 0.045867737382650375, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539400577545166, | |
| "mean_token_accuracy": 0.7824986279010773, | |
| "num_tokens": 12243263.0, | |
| "step": 752 | |
| }, | |
| { | |
| "entropy": 0.5563795119524002, | |
| "epoch": 2.811973807296539, | |
| "grad_norm": 0.03956415131688118, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5521907210350037, | |
| "mean_token_accuracy": 0.7774280607700348, | |
| "num_tokens": 12259518.0, | |
| "step": 753 | |
| }, | |
| { | |
| "entropy": 0.56000916659832, | |
| "epoch": 2.8157156220767074, | |
| "grad_norm": 0.038831926882267, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5568797588348389, | |
| "mean_token_accuracy": 0.7727828919887543, | |
| "num_tokens": 12276004.0, | |
| "step": 754 | |
| }, | |
| { | |
| "entropy": 0.5431783348321915, | |
| "epoch": 2.8194574368568754, | |
| "grad_norm": 0.04772892966866493, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5474101901054382, | |
| "mean_token_accuracy": 0.7786049693822861, | |
| "num_tokens": 12292373.0, | |
| "step": 755 | |
| }, | |
| { | |
| "entropy": 0.5570650398731232, | |
| "epoch": 2.823199251637044, | |
| "grad_norm": 0.03613967075943947, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5507438778877258, | |
| "mean_token_accuracy": 0.7748661190271378, | |
| "num_tokens": 12309010.0, | |
| "step": 756 | |
| }, | |
| { | |
| "entropy": 0.5275236368179321, | |
| "epoch": 2.8269410664172123, | |
| "grad_norm": 0.04989537596702576, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5294247269630432, | |
| "mean_token_accuracy": 0.7852834612131119, | |
| "num_tokens": 12325334.0, | |
| "step": 757 | |
| }, | |
| { | |
| "entropy": 0.5346865504980087, | |
| "epoch": 2.8306828811973808, | |
| "grad_norm": 0.03763777017593384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.536054790019989, | |
| "mean_token_accuracy": 0.7806695699691772, | |
| "num_tokens": 12341700.0, | |
| "step": 758 | |
| }, | |
| { | |
| "entropy": 0.5543745011091232, | |
| "epoch": 2.834424695977549, | |
| "grad_norm": 0.045101623982191086, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5560649037361145, | |
| "mean_token_accuracy": 0.7761011719703674, | |
| "num_tokens": 12358184.0, | |
| "step": 759 | |
| }, | |
| { | |
| "entropy": 0.5500671565532684, | |
| "epoch": 2.8381665107577176, | |
| "grad_norm": 0.042196061462163925, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5577619075775146, | |
| "mean_token_accuracy": 0.7745834439992905, | |
| "num_tokens": 12374727.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.5422725081443787, | |
| "epoch": 2.8419083255378856, | |
| "grad_norm": 0.037925731390714645, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5486158132553101, | |
| "mean_token_accuracy": 0.7735314965248108, | |
| "num_tokens": 12391054.0, | |
| "step": 761 | |
| }, | |
| { | |
| "entropy": 0.5447213500738144, | |
| "epoch": 2.845650140318054, | |
| "grad_norm": 0.039297524839639664, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5439249277114868, | |
| "mean_token_accuracy": 0.7782430201768875, | |
| "num_tokens": 12407240.0, | |
| "step": 762 | |
| }, | |
| { | |
| "entropy": 0.5623101443052292, | |
| "epoch": 2.8493919550982225, | |
| "grad_norm": 0.03727223724126816, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5529690980911255, | |
| "mean_token_accuracy": 0.7783486098051071, | |
| "num_tokens": 12423651.0, | |
| "step": 763 | |
| }, | |
| { | |
| "entropy": 0.5487337410449982, | |
| "epoch": 2.853133769878391, | |
| "grad_norm": 0.041605204343795776, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5483216047286987, | |
| "mean_token_accuracy": 0.7777005285024643, | |
| "num_tokens": 12439865.0, | |
| "step": 764 | |
| }, | |
| { | |
| "entropy": 0.5403908789157867, | |
| "epoch": 2.8568755846585594, | |
| "grad_norm": 0.042009830474853516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446419715881348, | |
| "mean_token_accuracy": 0.7782749831676483, | |
| "num_tokens": 12456283.0, | |
| "step": 765 | |
| }, | |
| { | |
| "entropy": 0.5366557389497757, | |
| "epoch": 2.860617399438728, | |
| "grad_norm": 0.03936697915196419, | |
| "learning_rate": 0.0002, | |
| "loss": 0.542513370513916, | |
| "mean_token_accuracy": 0.7779817581176758, | |
| "num_tokens": 12472812.0, | |
| "step": 766 | |
| }, | |
| { | |
| "entropy": 0.5674513280391693, | |
| "epoch": 2.8643592142188963, | |
| "grad_norm": 0.050604403018951416, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5683247447013855, | |
| "mean_token_accuracy": 0.7713179588317871, | |
| "num_tokens": 12489449.0, | |
| "step": 767 | |
| }, | |
| { | |
| "entropy": 0.5182722359895706, | |
| "epoch": 2.8681010289990647, | |
| "grad_norm": 0.036767635494470596, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5209700465202332, | |
| "mean_token_accuracy": 0.7906691282987595, | |
| "num_tokens": 12505831.0, | |
| "step": 768 | |
| }, | |
| { | |
| "entropy": 0.5400542318820953, | |
| "epoch": 2.871842843779233, | |
| "grad_norm": 0.0423893928527832, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5363757014274597, | |
| "mean_token_accuracy": 0.7849675416946411, | |
| "num_tokens": 12522266.0, | |
| "step": 769 | |
| }, | |
| { | |
| "entropy": 0.5384216755628586, | |
| "epoch": 2.875584658559401, | |
| "grad_norm": 0.03423478081822395, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539215087890625, | |
| "mean_token_accuracy": 0.7803387194871902, | |
| "num_tokens": 12538797.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.5494250059127808, | |
| "epoch": 2.8793264733395696, | |
| "grad_norm": 0.03864506259560585, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5536534786224365, | |
| "mean_token_accuracy": 0.7749843001365662, | |
| "num_tokens": 12554840.0, | |
| "step": 771 | |
| }, | |
| { | |
| "entropy": 0.5292802900075912, | |
| "epoch": 2.883068288119738, | |
| "grad_norm": 0.03668517246842384, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531915009021759, | |
| "mean_token_accuracy": 0.7857315242290497, | |
| "num_tokens": 12571194.0, | |
| "step": 772 | |
| }, | |
| { | |
| "entropy": 0.5444097071886063, | |
| "epoch": 2.8868101028999065, | |
| "grad_norm": 0.03593030199408531, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5466811060905457, | |
| "mean_token_accuracy": 0.7787587195634842, | |
| "num_tokens": 12587746.0, | |
| "step": 773 | |
| }, | |
| { | |
| "entropy": 0.5468859821557999, | |
| "epoch": 2.890551917680075, | |
| "grad_norm": 0.042690832167863846, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5463913679122925, | |
| "mean_token_accuracy": 0.779534175992012, | |
| "num_tokens": 12604183.0, | |
| "step": 774 | |
| }, | |
| { | |
| "entropy": 0.5508814752101898, | |
| "epoch": 2.8942937324602434, | |
| "grad_norm": 0.04205498844385147, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5481387376785278, | |
| "mean_token_accuracy": 0.776447519659996, | |
| "num_tokens": 12620732.0, | |
| "step": 775 | |
| }, | |
| { | |
| "entropy": 0.5370959490537643, | |
| "epoch": 2.8980355472404113, | |
| "grad_norm": 0.04001722112298012, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5357980728149414, | |
| "mean_token_accuracy": 0.7828036099672318, | |
| "num_tokens": 12636847.0, | |
| "step": 776 | |
| }, | |
| { | |
| "entropy": 0.5336840003728867, | |
| "epoch": 2.90177736202058, | |
| "grad_norm": 0.04124586284160614, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5350784063339233, | |
| "mean_token_accuracy": 0.7848693281412125, | |
| "num_tokens": 12653376.0, | |
| "step": 777 | |
| }, | |
| { | |
| "entropy": 0.5422462821006775, | |
| "epoch": 2.9055191768007482, | |
| "grad_norm": 0.04322974756360054, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5437650680541992, | |
| "mean_token_accuracy": 0.7811295241117477, | |
| "num_tokens": 12669838.0, | |
| "step": 778 | |
| }, | |
| { | |
| "entropy": 0.5301967561244965, | |
| "epoch": 2.9092609915809167, | |
| "grad_norm": 0.040180791169404984, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5413050055503845, | |
| "mean_token_accuracy": 0.7816843837499619, | |
| "num_tokens": 12686338.0, | |
| "step": 779 | |
| }, | |
| { | |
| "entropy": 0.5494007170200348, | |
| "epoch": 2.913002806361085, | |
| "grad_norm": 0.03727947920560837, | |
| "learning_rate": 0.0002, | |
| "loss": 0.551271915435791, | |
| "mean_token_accuracy": 0.7756839543581009, | |
| "num_tokens": 12702976.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.557955801486969, | |
| "epoch": 2.9167446211412535, | |
| "grad_norm": 0.03641374036669731, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5591468214988708, | |
| "mean_token_accuracy": 0.7722364217042923, | |
| "num_tokens": 12719319.0, | |
| "step": 781 | |
| }, | |
| { | |
| "entropy": 0.5437477082014084, | |
| "epoch": 2.920486435921422, | |
| "grad_norm": 0.03696129098534584, | |
| "learning_rate": 0.0002, | |
| "loss": 0.539549708366394, | |
| "mean_token_accuracy": 0.7802012413740158, | |
| "num_tokens": 12735691.0, | |
| "step": 782 | |
| }, | |
| { | |
| "entropy": 0.5459663569927216, | |
| "epoch": 2.9242282507015904, | |
| "grad_norm": 0.03394176810979843, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5432969331741333, | |
| "mean_token_accuracy": 0.7803399115800858, | |
| "num_tokens": 12752042.0, | |
| "step": 783 | |
| }, | |
| { | |
| "entropy": 0.540153980255127, | |
| "epoch": 2.927970065481759, | |
| "grad_norm": 0.04523579031229019, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5408099889755249, | |
| "mean_token_accuracy": 0.7797322869300842, | |
| "num_tokens": 12768264.0, | |
| "step": 784 | |
| }, | |
| { | |
| "entropy": 0.5484558641910553, | |
| "epoch": 2.9317118802619273, | |
| "grad_norm": 0.03857382759451866, | |
| "learning_rate": 0.0002, | |
| "loss": 0.554611325263977, | |
| "mean_token_accuracy": 0.7754960358142853, | |
| "num_tokens": 12784469.0, | |
| "step": 785 | |
| }, | |
| { | |
| "entropy": 0.5373403131961823, | |
| "epoch": 2.9354536950420953, | |
| "grad_norm": 0.04521877318620682, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5412609577178955, | |
| "mean_token_accuracy": 0.7812603563070297, | |
| "num_tokens": 12800714.0, | |
| "step": 786 | |
| }, | |
| { | |
| "entropy": 0.5420941710472107, | |
| "epoch": 2.9391955098222637, | |
| "grad_norm": 0.037385161966085434, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5446354746818542, | |
| "mean_token_accuracy": 0.7783695161342621, | |
| "num_tokens": 12816921.0, | |
| "step": 787 | |
| }, | |
| { | |
| "entropy": 0.5351656675338745, | |
| "epoch": 2.942937324602432, | |
| "grad_norm": 0.041876692324876785, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376321077346802, | |
| "mean_token_accuracy": 0.7807199209928513, | |
| "num_tokens": 12833350.0, | |
| "step": 788 | |
| }, | |
| { | |
| "entropy": 0.5680812299251556, | |
| "epoch": 2.9466791393826006, | |
| "grad_norm": 0.040565043687820435, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5634538531303406, | |
| "mean_token_accuracy": 0.7689831405878067, | |
| "num_tokens": 12849646.0, | |
| "step": 789 | |
| }, | |
| { | |
| "entropy": 0.5357328206300735, | |
| "epoch": 2.950420954162769, | |
| "grad_norm": 0.04082103073596954, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5352612733840942, | |
| "mean_token_accuracy": 0.7824973464012146, | |
| "num_tokens": 12865840.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.5547877848148346, | |
| "epoch": 2.954162768942937, | |
| "grad_norm": 0.04521463066339493, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5542868971824646, | |
| "mean_token_accuracy": 0.7752365618944168, | |
| "num_tokens": 12882266.0, | |
| "step": 791 | |
| }, | |
| { | |
| "entropy": 0.5343262106180191, | |
| "epoch": 2.9579045837231055, | |
| "grad_norm": 0.039067838340997696, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5333149433135986, | |
| "mean_token_accuracy": 0.783295214176178, | |
| "num_tokens": 12898704.0, | |
| "step": 792 | |
| }, | |
| { | |
| "entropy": 0.5165642648935318, | |
| "epoch": 2.961646398503274, | |
| "grad_norm": 0.04161246493458748, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5219287276268005, | |
| "mean_token_accuracy": 0.790781170129776, | |
| "num_tokens": 12914733.0, | |
| "step": 793 | |
| }, | |
| { | |
| "entropy": 0.5363114923238754, | |
| "epoch": 2.9653882132834424, | |
| "grad_norm": 0.03739769384264946, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5376189351081848, | |
| "mean_token_accuracy": 0.7812457233667374, | |
| "num_tokens": 12931042.0, | |
| "step": 794 | |
| }, | |
| { | |
| "entropy": 0.5318800210952759, | |
| "epoch": 2.969130028063611, | |
| "grad_norm": 0.047191355377435684, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5360404849052429, | |
| "mean_token_accuracy": 0.7821078598499298, | |
| "num_tokens": 12947442.0, | |
| "step": 795 | |
| }, | |
| { | |
| "entropy": 0.5284593552350998, | |
| "epoch": 2.9728718428437793, | |
| "grad_norm": 0.03614107519388199, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5247491598129272, | |
| "mean_token_accuracy": 0.7871349304914474, | |
| "num_tokens": 12963611.0, | |
| "step": 796 | |
| }, | |
| { | |
| "entropy": 0.5265946090221405, | |
| "epoch": 2.9766136576239477, | |
| "grad_norm": 0.04248823598027229, | |
| "learning_rate": 0.0002, | |
| "loss": 0.53187096118927, | |
| "mean_token_accuracy": 0.78339883685112, | |
| "num_tokens": 12979965.0, | |
| "step": 797 | |
| }, | |
| { | |
| "entropy": 0.5121617913246155, | |
| "epoch": 2.980355472404116, | |
| "grad_norm": 0.042288120836019516, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5201407670974731, | |
| "mean_token_accuracy": 0.7870761901140213, | |
| "num_tokens": 12996017.0, | |
| "step": 798 | |
| }, | |
| { | |
| "entropy": 0.5229809135198593, | |
| "epoch": 2.9840972871842846, | |
| "grad_norm": 0.040804166346788406, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5307119488716125, | |
| "mean_token_accuracy": 0.7831887602806091, | |
| "num_tokens": 13012277.0, | |
| "step": 799 | |
| }, | |
| { | |
| "entropy": 0.5386293828487396, | |
| "epoch": 2.987839101964453, | |
| "grad_norm": 0.04149458184838295, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5341092348098755, | |
| "mean_token_accuracy": 0.783338725566864, | |
| "num_tokens": 13028574.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.5334920659661293, | |
| "epoch": 2.991580916744621, | |
| "grad_norm": 0.04282135143876076, | |
| "learning_rate": 0.0002, | |
| "loss": 0.531876802444458, | |
| "mean_token_accuracy": 0.7834694683551788, | |
| "num_tokens": 13044829.0, | |
| "step": 801 | |
| }, | |
| { | |
| "entropy": 0.5673989802598953, | |
| "epoch": 2.9953227315247895, | |
| "grad_norm": 0.03961246460676193, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5678121447563171, | |
| "mean_token_accuracy": 0.7711912542581558, | |
| "num_tokens": 13061330.0, | |
| "step": 802 | |
| }, | |
| { | |
| "entropy": 0.531833752989769, | |
| "epoch": 2.999064546304958, | |
| "grad_norm": 0.03890501707792282, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5328924655914307, | |
| "mean_token_accuracy": 0.7814844250679016, | |
| "num_tokens": 13077343.0, | |
| "step": 803 | |
| }, | |
| { | |
| "entropy": 0.5831514596939087, | |
| "epoch": 3.0, | |
| "grad_norm": 0.06591155380010605, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5364804267883301, | |
| "mean_token_accuracy": 0.7760791182518005, | |
| "num_tokens": 13078463.0, | |
| "step": 804 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 804, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.218543283492356e+18, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |