Instructions to use moos124/code-reasoning-0.5b with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use moos124/code-reasoning-0.5b with Transformers:
# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("moos124/code-reasoning-0.5b", dtype="auto") - Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0068266666666668, | |
| "eval_steps": 500, | |
| "global_step": 4720, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.2413338124752045, | |
| "epoch": 0.00021333333333333333, | |
| "grad_norm": 0.5237457752227783, | |
| "learning_rate": 0.0, | |
| "loss": 2.287360191345215, | |
| "mean_token_accuracy": 0.6451007425785065, | |
| "num_tokens": 4191.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 1.2207386708921857, | |
| "epoch": 0.0021333333333333334, | |
| "grad_norm": 0.4350375831127167, | |
| "learning_rate": 3e-06, | |
| "loss": 2.19429079691569, | |
| "mean_token_accuracy": 0.65218452612559, | |
| "num_tokens": 39906.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.1284118384122848, | |
| "epoch": 0.004266666666666667, | |
| "grad_norm": 0.49733078479766846, | |
| "learning_rate": 6.333333333333334e-06, | |
| "loss": 1.9721708297729492, | |
| "mean_token_accuracy": 0.6748957321047783, | |
| "num_tokens": 90428.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.3042965233325958, | |
| "epoch": 0.0064, | |
| "grad_norm": 0.39715778827667236, | |
| "learning_rate": 9.666666666666667e-06, | |
| "loss": 2.214934539794922, | |
| "mean_token_accuracy": 0.6405263364315033, | |
| "num_tokens": 140516.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 1.206481871008873, | |
| "epoch": 0.008533333333333334, | |
| "grad_norm": 0.5763714909553528, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 2.0720544815063477, | |
| "mean_token_accuracy": 0.6752387754619121, | |
| "num_tokens": 182338.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 1.236506675183773, | |
| "epoch": 0.010666666666666666, | |
| "grad_norm": 0.5518174171447754, | |
| "learning_rate": 1.6333333333333335e-05, | |
| "loss": 1.9000749588012695, | |
| "mean_token_accuracy": 0.6715238064527511, | |
| "num_tokens": 235345.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 1.2768938690423965, | |
| "epoch": 0.0128, | |
| "grad_norm": 0.6082292199134827, | |
| "learning_rate": 1.9666666666666666e-05, | |
| "loss": 1.8587135314941405, | |
| "mean_token_accuracy": 0.6761457294225692, | |
| "num_tokens": 279475.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.4092280209064483, | |
| "epoch": 0.014933333333333333, | |
| "grad_norm": 0.6658427119255066, | |
| "learning_rate": 2.3000000000000003e-05, | |
| "loss": 1.7947965621948243, | |
| "mean_token_accuracy": 0.6667077802121639, | |
| "num_tokens": 330352.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.4339179992675781, | |
| "epoch": 0.017066666666666667, | |
| "grad_norm": 1.2848966121673584, | |
| "learning_rate": 2.633333333333333e-05, | |
| "loss": 1.6212703704833984, | |
| "mean_token_accuracy": 0.6736222848296165, | |
| "num_tokens": 373039.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 1.516643624007702, | |
| "epoch": 0.0192, | |
| "grad_norm": 0.5840373039245605, | |
| "learning_rate": 2.9666666666666672e-05, | |
| "loss": 1.6039567947387696, | |
| "mean_token_accuracy": 0.6713629268109799, | |
| "num_tokens": 422371.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 1.467514592409134, | |
| "epoch": 0.021333333333333333, | |
| "grad_norm": 0.5295536518096924, | |
| "learning_rate": 3.3e-05, | |
| "loss": 1.576413631439209, | |
| "mean_token_accuracy": 0.6885988213121891, | |
| "num_tokens": 462284.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.3056970939040184, | |
| "epoch": 0.023466666666666667, | |
| "grad_norm": 0.4493468701839447, | |
| "learning_rate": 3.633333333333333e-05, | |
| "loss": 1.2876591682434082, | |
| "mean_token_accuracy": 0.7173117578029633, | |
| "num_tokens": 508883.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.2709181517362595, | |
| "epoch": 0.0256, | |
| "grad_norm": 0.5542399883270264, | |
| "learning_rate": 3.966666666666667e-05, | |
| "loss": 1.249708652496338, | |
| "mean_token_accuracy": 0.727140337228775, | |
| "num_tokens": 555373.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 1.3233605474233627, | |
| "epoch": 0.027733333333333332, | |
| "grad_norm": 0.4590514004230499, | |
| "learning_rate": 4.3e-05, | |
| "loss": 1.3480740547180177, | |
| "mean_token_accuracy": 0.7144808873534203, | |
| "num_tokens": 599533.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 1.241466723382473, | |
| "epoch": 0.029866666666666666, | |
| "grad_norm": 0.2866012752056122, | |
| "learning_rate": 4.633333333333333e-05, | |
| "loss": 1.275578498840332, | |
| "mean_token_accuracy": 0.7148015096783638, | |
| "num_tokens": 657261.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 1.103185237944126, | |
| "epoch": 0.032, | |
| "grad_norm": 0.25304746627807617, | |
| "learning_rate": 4.966666666666667e-05, | |
| "loss": 1.1829004287719727, | |
| "mean_token_accuracy": 0.7423724889755249, | |
| "num_tokens": 704749.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.085295742750168, | |
| "epoch": 0.034133333333333335, | |
| "grad_norm": 2.8501718044281006, | |
| "learning_rate": 5.300000000000001e-05, | |
| "loss": 1.1587767601013184, | |
| "mean_token_accuracy": 0.7412141926586628, | |
| "num_tokens": 751635.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.0946420103311538, | |
| "epoch": 0.03626666666666667, | |
| "grad_norm": 0.2150825560092926, | |
| "learning_rate": 5.633333333333334e-05, | |
| "loss": 1.1224396705627442, | |
| "mean_token_accuracy": 0.7382922798395157, | |
| "num_tokens": 794488.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 1.0789968609809875, | |
| "epoch": 0.0384, | |
| "grad_norm": 0.2265262007713318, | |
| "learning_rate": 5.966666666666667e-05, | |
| "loss": 1.1242941856384276, | |
| "mean_token_accuracy": 0.7424697011709214, | |
| "num_tokens": 838647.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 1.0386934965848922, | |
| "epoch": 0.04053333333333333, | |
| "grad_norm": 0.2709059417247772, | |
| "learning_rate": 6.3e-05, | |
| "loss": 1.1370309829711913, | |
| "mean_token_accuracy": 0.7532300829887391, | |
| "num_tokens": 884497.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 1.1033389106392861, | |
| "epoch": 0.042666666666666665, | |
| "grad_norm": 0.27512305974960327, | |
| "learning_rate": 6.633333333333334e-05, | |
| "loss": 1.2109394073486328, | |
| "mean_token_accuracy": 0.7378732696175575, | |
| "num_tokens": 931398.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.108424139022827, | |
| "epoch": 0.0448, | |
| "grad_norm": 0.23238568007946014, | |
| "learning_rate": 6.966666666666668e-05, | |
| "loss": 1.1784509658813476, | |
| "mean_token_accuracy": 0.7302148967981339, | |
| "num_tokens": 983649.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 0.9056355074048043, | |
| "epoch": 0.046933333333333334, | |
| "grad_norm": 0.3005298674106598, | |
| "learning_rate": 7.3e-05, | |
| "loss": 0.977406120300293, | |
| "mean_token_accuracy": 0.7774429574608803, | |
| "num_tokens": 1024372.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 1.019825778901577, | |
| "epoch": 0.04906666666666667, | |
| "grad_norm": 0.2753085494041443, | |
| "learning_rate": 7.633333333333334e-05, | |
| "loss": 1.1463540077209473, | |
| "mean_token_accuracy": 0.7483755856752395, | |
| "num_tokens": 1073667.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 1.042508740723133, | |
| "epoch": 0.0512, | |
| "grad_norm": 0.32724323868751526, | |
| "learning_rate": 7.966666666666666e-05, | |
| "loss": 1.098323440551758, | |
| "mean_token_accuracy": 0.7490729346871376, | |
| "num_tokens": 1117523.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 1.036650500446558, | |
| "epoch": 0.05333333333333334, | |
| "grad_norm": 0.28012844920158386, | |
| "learning_rate": 8.3e-05, | |
| "loss": 1.078709030151367, | |
| "mean_token_accuracy": 0.751419472694397, | |
| "num_tokens": 1160152.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 1.0605042964220046, | |
| "epoch": 0.055466666666666664, | |
| "grad_norm": 0.28786003589630127, | |
| "learning_rate": 8.633333333333334e-05, | |
| "loss": 1.1537845611572266, | |
| "mean_token_accuracy": 0.7475032344460487, | |
| "num_tokens": 1203153.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.0000992961227895, | |
| "epoch": 0.0576, | |
| "grad_norm": 0.2648380696773529, | |
| "learning_rate": 8.966666666666666e-05, | |
| "loss": 1.0818438529968262, | |
| "mean_token_accuracy": 0.7550511255860328, | |
| "num_tokens": 1246554.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.9443735256791115, | |
| "epoch": 0.05973333333333333, | |
| "grad_norm": 0.29017189145088196, | |
| "learning_rate": 9.300000000000001e-05, | |
| "loss": 1.0578758239746093, | |
| "mean_token_accuracy": 0.7694767877459526, | |
| "num_tokens": 1286049.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.9980478152632714, | |
| "epoch": 0.06186666666666667, | |
| "grad_norm": 0.3078839182853699, | |
| "learning_rate": 9.633333333333335e-05, | |
| "loss": 1.1183401107788087, | |
| "mean_token_accuracy": 0.7594211131334305, | |
| "num_tokens": 1331161.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.9343128114938736, | |
| "epoch": 0.064, | |
| "grad_norm": 0.253248393535614, | |
| "learning_rate": 9.966666666666667e-05, | |
| "loss": 0.9862067222595214, | |
| "mean_token_accuracy": 0.7730094477534294, | |
| "num_tokens": 1375777.0, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 1.0266294315457345, | |
| "epoch": 0.06613333333333334, | |
| "grad_norm": 0.23917347192764282, | |
| "learning_rate": 9.999975737505649e-05, | |
| "loss": 1.1334312438964844, | |
| "mean_token_accuracy": 0.7540638357400894, | |
| "num_tokens": 1421027.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.0486552365124227, | |
| "epoch": 0.06826666666666667, | |
| "grad_norm": 0.2840607762336731, | |
| "learning_rate": 9.999891867457112e-05, | |
| "loss": 1.1424532890319825, | |
| "mean_token_accuracy": 0.7420963421463966, | |
| "num_tokens": 1472539.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.9518789499998093, | |
| "epoch": 0.0704, | |
| "grad_norm": 0.25352534651756287, | |
| "learning_rate": 9.999748091322068e-05, | |
| "loss": 0.9646738052368165, | |
| "mean_token_accuracy": 0.7610545977950096, | |
| "num_tokens": 1518725.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 1.014043178409338, | |
| "epoch": 0.07253333333333334, | |
| "grad_norm": 0.22224737703800201, | |
| "learning_rate": 9.999544410823167e-05, | |
| "loss": 1.0504605293273925, | |
| "mean_token_accuracy": 0.7481713563203811, | |
| "num_tokens": 1570475.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.9679042734205723, | |
| "epoch": 0.07466666666666667, | |
| "grad_norm": 0.2880384027957916, | |
| "learning_rate": 9.999280828400803e-05, | |
| "loss": 1.0482870101928712, | |
| "mean_token_accuracy": 0.7645679444074631, | |
| "num_tokens": 1617068.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.077889482676983, | |
| "epoch": 0.0768, | |
| "grad_norm": 0.24659322202205658, | |
| "learning_rate": 9.998957347213085e-05, | |
| "loss": 1.1968301773071288, | |
| "mean_token_accuracy": 0.7367698416113854, | |
| "num_tokens": 1661987.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.0857299536466598, | |
| "epoch": 0.07893333333333333, | |
| "grad_norm": 0.31553569436073303, | |
| "learning_rate": 9.99857397113579e-05, | |
| "loss": 1.1405964851379395, | |
| "mean_token_accuracy": 0.7424520552158356, | |
| "num_tokens": 1712075.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 1.0094588652253151, | |
| "epoch": 0.08106666666666666, | |
| "grad_norm": 0.1927226334810257, | |
| "learning_rate": 9.998130704762335e-05, | |
| "loss": 1.062878704071045, | |
| "mean_token_accuracy": 0.7529133662581444, | |
| "num_tokens": 1762001.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.95768673568964, | |
| "epoch": 0.0832, | |
| "grad_norm": 0.2921101748943329, | |
| "learning_rate": 9.997627553403699e-05, | |
| "loss": 1.0986035346984864, | |
| "mean_token_accuracy": 0.76050655990839, | |
| "num_tokens": 1805002.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.8763241834938527, | |
| "epoch": 0.08533333333333333, | |
| "grad_norm": 0.24879010021686554, | |
| "learning_rate": 9.997064523088384e-05, | |
| "loss": 0.9313676834106446, | |
| "mean_token_accuracy": 0.7820899412035942, | |
| "num_tokens": 1844640.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.0211177349090577, | |
| "epoch": 0.08746666666666666, | |
| "grad_norm": 0.27772271633148193, | |
| "learning_rate": 9.996441620562322e-05, | |
| "loss": 1.1202519416809082, | |
| "mean_token_accuracy": 0.7511946842074394, | |
| "num_tokens": 1891181.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 0.9595077157020568, | |
| "epoch": 0.0896, | |
| "grad_norm": 0.31688833236694336, | |
| "learning_rate": 9.995758853288805e-05, | |
| "loss": 1.0745075225830079, | |
| "mean_token_accuracy": 0.7604640245437622, | |
| "num_tokens": 1934494.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.963299511373043, | |
| "epoch": 0.09173333333333333, | |
| "grad_norm": 0.29464003443717957, | |
| "learning_rate": 9.995016229448395e-05, | |
| "loss": 1.0001104354858399, | |
| "mean_token_accuracy": 0.763674932718277, | |
| "num_tokens": 1976770.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.1388054117560387, | |
| "epoch": 0.09386666666666667, | |
| "grad_norm": 0.2581053078174591, | |
| "learning_rate": 9.994213757938819e-05, | |
| "loss": 1.2570265769958495, | |
| "mean_token_accuracy": 0.726725485175848, | |
| "num_tokens": 2028837.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.9712555348873139, | |
| "epoch": 0.096, | |
| "grad_norm": 0.3099062740802765, | |
| "learning_rate": 9.993351448374873e-05, | |
| "loss": 1.072590446472168, | |
| "mean_token_accuracy": 0.7607480764389039, | |
| "num_tokens": 2072441.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.0221860893070698, | |
| "epoch": 0.09813333333333334, | |
| "grad_norm": 0.25882700085639954, | |
| "learning_rate": 9.992429311088296e-05, | |
| "loss": 1.0992252349853515, | |
| "mean_token_accuracy": 0.7492104887962341, | |
| "num_tokens": 2119587.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.082937440276146, | |
| "epoch": 0.10026666666666667, | |
| "grad_norm": 0.23417538404464722, | |
| "learning_rate": 9.991447357127657e-05, | |
| "loss": 1.2704851150512695, | |
| "mean_token_accuracy": 0.7427436165511608, | |
| "num_tokens": 2171365.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.8950243927538395, | |
| "epoch": 0.1024, | |
| "grad_norm": 0.27864041924476624, | |
| "learning_rate": 9.990405598258212e-05, | |
| "loss": 0.965212631225586, | |
| "mean_token_accuracy": 0.773382380604744, | |
| "num_tokens": 2216186.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.9513905093073844, | |
| "epoch": 0.10453333333333334, | |
| "grad_norm": 0.27456656098365784, | |
| "learning_rate": 9.989304046961772e-05, | |
| "loss": 1.0514169692993165, | |
| "mean_token_accuracy": 0.7602177545428276, | |
| "num_tokens": 2263172.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.1295288607478142, | |
| "epoch": 0.10666666666666667, | |
| "grad_norm": 0.2654035985469818, | |
| "learning_rate": 9.988142716436546e-05, | |
| "loss": 1.2499847412109375, | |
| "mean_token_accuracy": 0.7319100961089134, | |
| "num_tokens": 2309133.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.9629665195941925, | |
| "epoch": 0.1088, | |
| "grad_norm": 0.2432626485824585, | |
| "learning_rate": 9.986921620596989e-05, | |
| "loss": 1.038647174835205, | |
| "mean_token_accuracy": 0.7657018698751926, | |
| "num_tokens": 2352188.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.0380054742097855, | |
| "epoch": 0.11093333333333333, | |
| "grad_norm": 0.22924789786338806, | |
| "learning_rate": 9.985640774073634e-05, | |
| "loss": 1.0875710487365722, | |
| "mean_token_accuracy": 0.7410315036773681, | |
| "num_tokens": 2400184.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.0756568349897861, | |
| "epoch": 0.11306666666666666, | |
| "grad_norm": 0.322125107049942, | |
| "learning_rate": 9.984300192212912e-05, | |
| "loss": 1.15003662109375, | |
| "mean_token_accuracy": 0.7418296962976456, | |
| "num_tokens": 2445609.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.037429604679346, | |
| "epoch": 0.1152, | |
| "grad_norm": 0.29860854148864746, | |
| "learning_rate": 9.982899891076973e-05, | |
| "loss": 1.1256014823913574, | |
| "mean_token_accuracy": 0.7413557574152947, | |
| "num_tokens": 2490595.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.863083366304636, | |
| "epoch": 0.11733333333333333, | |
| "grad_norm": 0.2839767634868622, | |
| "learning_rate": 9.98143988744349e-05, | |
| "loss": 0.9366037368774414, | |
| "mean_token_accuracy": 0.786571592092514, | |
| "num_tokens": 2537415.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.9333060696721077, | |
| "epoch": 0.11946666666666667, | |
| "grad_norm": 0.23799379169940948, | |
| "learning_rate": 9.979920198805464e-05, | |
| "loss": 1.0223896980285645, | |
| "mean_token_accuracy": 0.7608667835593224, | |
| "num_tokens": 2582645.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.0654601491987705, | |
| "epoch": 0.1216, | |
| "grad_norm": 0.26716434955596924, | |
| "learning_rate": 9.97834084337101e-05, | |
| "loss": 1.1589471817016601, | |
| "mean_token_accuracy": 0.7418766617774963, | |
| "num_tokens": 2633308.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.0902433142066001, | |
| "epoch": 0.12373333333333333, | |
| "grad_norm": 0.20525577664375305, | |
| "learning_rate": 9.976701840063136e-05, | |
| "loss": 1.1464842796325683, | |
| "mean_token_accuracy": 0.735442753136158, | |
| "num_tokens": 2686009.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.8992204323410988, | |
| "epoch": 0.12586666666666665, | |
| "grad_norm": 0.2524716556072235, | |
| "learning_rate": 9.975003208519522e-05, | |
| "loss": 0.9971331596374512, | |
| "mean_token_accuracy": 0.7749405071139336, | |
| "num_tokens": 2731814.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.0139609590172767, | |
| "epoch": 0.128, | |
| "grad_norm": 0.26556891202926636, | |
| "learning_rate": 9.973244969092282e-05, | |
| "loss": 1.1074792861938476, | |
| "mean_token_accuracy": 0.7530097424983978, | |
| "num_tokens": 2780314.0, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.9936468213796615, | |
| "epoch": 0.13013333333333332, | |
| "grad_norm": 0.1984289288520813, | |
| "learning_rate": 9.971427142847718e-05, | |
| "loss": 1.0992106437683105, | |
| "mean_token_accuracy": 0.7568746477365493, | |
| "num_tokens": 2827832.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.9435897715389728, | |
| "epoch": 0.13226666666666667, | |
| "grad_norm": 0.25219106674194336, | |
| "learning_rate": 9.969549751566075e-05, | |
| "loss": 1.016776180267334, | |
| "mean_token_accuracy": 0.76652851998806, | |
| "num_tokens": 2873516.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.9641087010502816, | |
| "epoch": 0.1344, | |
| "grad_norm": 0.2497103363275528, | |
| "learning_rate": 9.967612817741272e-05, | |
| "loss": 1.0406078338623046, | |
| "mean_token_accuracy": 0.7586069479584694, | |
| "num_tokens": 2913600.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.0591406509280206, | |
| "epoch": 0.13653333333333334, | |
| "grad_norm": 0.23842781782150269, | |
| "learning_rate": 9.965616364580636e-05, | |
| "loss": 1.1297724723815918, | |
| "mean_token_accuracy": 0.7443821474909782, | |
| "num_tokens": 2962337.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.0139970764517785, | |
| "epoch": 0.13866666666666666, | |
| "grad_norm": 0.2730376124382019, | |
| "learning_rate": 9.963560416004623e-05, | |
| "loss": 1.112107276916504, | |
| "mean_token_accuracy": 0.7496687114238739, | |
| "num_tokens": 3006113.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.9443116314709187, | |
| "epoch": 0.1408, | |
| "grad_norm": 0.26775607466697693, | |
| "learning_rate": 9.961444996646532e-05, | |
| "loss": 1.020461654663086, | |
| "mean_token_accuracy": 0.7648672193288804, | |
| "num_tokens": 3052837.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.113273823261261, | |
| "epoch": 0.14293333333333333, | |
| "grad_norm": 0.2903483510017395, | |
| "learning_rate": 9.95927013185221e-05, | |
| "loss": 1.2222958564758302, | |
| "mean_token_accuracy": 0.7364178076386452, | |
| "num_tokens": 3101650.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.8284098848700523, | |
| "epoch": 0.14506666666666668, | |
| "grad_norm": 0.3110564053058624, | |
| "learning_rate": 9.957035847679749e-05, | |
| "loss": 0.905357551574707, | |
| "mean_token_accuracy": 0.7894617035984993, | |
| "num_tokens": 3143357.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.9675152562558651, | |
| "epoch": 0.1472, | |
| "grad_norm": 0.27567023038864136, | |
| "learning_rate": 9.954742170899172e-05, | |
| "loss": 1.0641048431396485, | |
| "mean_token_accuracy": 0.7591987878084183, | |
| "num_tokens": 3193515.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.9688289143145085, | |
| "epoch": 0.14933333333333335, | |
| "grad_norm": 0.28721460700035095, | |
| "learning_rate": 9.952389128992113e-05, | |
| "loss": 1.0814785957336426, | |
| "mean_token_accuracy": 0.759615159034729, | |
| "num_tokens": 3242439.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.9880564108490943, | |
| "epoch": 0.15146666666666667, | |
| "grad_norm": 0.3781116306781769, | |
| "learning_rate": 9.949976750151489e-05, | |
| "loss": 1.100698184967041, | |
| "mean_token_accuracy": 0.7498437210917472, | |
| "num_tokens": 3287959.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.027633222937584, | |
| "epoch": 0.1536, | |
| "grad_norm": 0.2338317185640335, | |
| "learning_rate": 9.947505063281157e-05, | |
| "loss": 1.0913351058959961, | |
| "mean_token_accuracy": 0.750363714993, | |
| "num_tokens": 3335092.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.1284249052405357, | |
| "epoch": 0.15573333333333333, | |
| "grad_norm": 0.2954626679420471, | |
| "learning_rate": 9.944974097995581e-05, | |
| "loss": 1.2494465827941894, | |
| "mean_token_accuracy": 0.7387717284262181, | |
| "num_tokens": 3380879.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.9154682122170925, | |
| "epoch": 0.15786666666666666, | |
| "grad_norm": 0.2581311762332916, | |
| "learning_rate": 9.942383884619455e-05, | |
| "loss": 1.0275691032409668, | |
| "mean_token_accuracy": 0.7657118752598763, | |
| "num_tokens": 3430079.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.0482445612549782, | |
| "epoch": 0.16, | |
| "grad_norm": 0.2496950626373291, | |
| "learning_rate": 9.939734454187365e-05, | |
| "loss": 1.1575148582458497, | |
| "mean_token_accuracy": 0.7434881895780563, | |
| "num_tokens": 3482687.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.008715095371008, | |
| "epoch": 0.16213333333333332, | |
| "grad_norm": 0.24042253196239471, | |
| "learning_rate": 9.937025838443397e-05, | |
| "loss": 1.0921018600463868, | |
| "mean_token_accuracy": 0.7544152162969112, | |
| "num_tokens": 3528140.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.9826410934329033, | |
| "epoch": 0.16426666666666667, | |
| "grad_norm": 0.20811501145362854, | |
| "learning_rate": 9.934258069840765e-05, | |
| "loss": 1.0858916282653808, | |
| "mean_token_accuracy": 0.7535254985094071, | |
| "num_tokens": 3578130.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.9679107010364533, | |
| "epoch": 0.1664, | |
| "grad_norm": 0.3156932294368744, | |
| "learning_rate": 9.931431181541426e-05, | |
| "loss": 1.0209306716918944, | |
| "mean_token_accuracy": 0.7623399093747139, | |
| "num_tokens": 3625602.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.8914617538452149, | |
| "epoch": 0.16853333333333334, | |
| "grad_norm": 0.2739625573158264, | |
| "learning_rate": 9.928545207415675e-05, | |
| "loss": 1.014828872680664, | |
| "mean_token_accuracy": 0.7721783280372619, | |
| "num_tokens": 3671375.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.9649161577224732, | |
| "epoch": 0.17066666666666666, | |
| "grad_norm": 0.23826555907726288, | |
| "learning_rate": 9.92560018204174e-05, | |
| "loss": 1.0278871536254883, | |
| "mean_token_accuracy": 0.7589392751455307, | |
| "num_tokens": 3717893.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.9766084000468254, | |
| "epoch": 0.1728, | |
| "grad_norm": 0.33309969305992126, | |
| "learning_rate": 9.92259614070538e-05, | |
| "loss": 1.0395455360412598, | |
| "mean_token_accuracy": 0.7545588746666908, | |
| "num_tokens": 3764739.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 1.0871834971010685, | |
| "epoch": 0.17493333333333333, | |
| "grad_norm": 0.22862163186073303, | |
| "learning_rate": 9.919533119399438e-05, | |
| "loss": 1.1718174934387207, | |
| "mean_token_accuracy": 0.7301450505852699, | |
| "num_tokens": 3810938.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 1.0070038817822933, | |
| "epoch": 0.17706666666666668, | |
| "grad_norm": 0.2899021506309509, | |
| "learning_rate": 9.916411154823433e-05, | |
| "loss": 1.146003818511963, | |
| "mean_token_accuracy": 0.7569455504417419, | |
| "num_tokens": 3857657.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.116474264860153, | |
| "epoch": 0.1792, | |
| "grad_norm": 0.2815554738044739, | |
| "learning_rate": 9.913230284383112e-05, | |
| "loss": 1.1874059677124023, | |
| "mean_token_accuracy": 0.7290896072983741, | |
| "num_tokens": 3909965.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.9500585079193116, | |
| "epoch": 0.18133333333333335, | |
| "grad_norm": 0.3055127263069153, | |
| "learning_rate": 9.90999054619e-05, | |
| "loss": 1.0213364601135253, | |
| "mean_token_accuracy": 0.7678412273526192, | |
| "num_tokens": 3948812.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.8651109091937542, | |
| "epoch": 0.18346666666666667, | |
| "grad_norm": 0.2479069083929062, | |
| "learning_rate": 9.906691979060943e-05, | |
| "loss": 0.9216291427612304, | |
| "mean_token_accuracy": 0.7761695921421051, | |
| "num_tokens": 3991940.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.1529859654605388, | |
| "epoch": 0.1856, | |
| "grad_norm": 0.26249563694000244, | |
| "learning_rate": 9.903334622517643e-05, | |
| "loss": 1.2724492073059082, | |
| "mean_token_accuracy": 0.7244490720331669, | |
| "num_tokens": 4050343.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.0595085561275481, | |
| "epoch": 0.18773333333333334, | |
| "grad_norm": 0.294251948595047, | |
| "learning_rate": 9.89991851678619e-05, | |
| "loss": 1.1476259231567383, | |
| "mean_token_accuracy": 0.7438870698213578, | |
| "num_tokens": 4099181.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.0237427443265914, | |
| "epoch": 0.18986666666666666, | |
| "grad_norm": 0.251798152923584, | |
| "learning_rate": 9.896443702796573e-05, | |
| "loss": 1.1016413688659668, | |
| "mean_token_accuracy": 0.7469503089785576, | |
| "num_tokens": 4143951.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 0.9793856598436832, | |
| "epoch": 0.192, | |
| "grad_norm": 0.2858197093009949, | |
| "learning_rate": 9.892910222182196e-05, | |
| "loss": 1.0976881980895996, | |
| "mean_token_accuracy": 0.7614006102085114, | |
| "num_tokens": 4194087.0, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 1.0538529880344867, | |
| "epoch": 0.19413333333333332, | |
| "grad_norm": 0.23789124190807343, | |
| "learning_rate": 9.889318117279373e-05, | |
| "loss": 1.196424674987793, | |
| "mean_token_accuracy": 0.7455965608358384, | |
| "num_tokens": 4243300.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 0.9650195389986038, | |
| "epoch": 0.19626666666666667, | |
| "grad_norm": 0.3590649366378784, | |
| "learning_rate": 9.885667431126824e-05, | |
| "loss": 1.0054343223571778, | |
| "mean_token_accuracy": 0.7639905765652657, | |
| "num_tokens": 4291788.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 0.8939604975283146, | |
| "epoch": 0.1984, | |
| "grad_norm": 0.2963460087776184, | |
| "learning_rate": 9.881958207465158e-05, | |
| "loss": 0.9668001174926758, | |
| "mean_token_accuracy": 0.778343915939331, | |
| "num_tokens": 4336567.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 0.8603343136608601, | |
| "epoch": 0.20053333333333334, | |
| "grad_norm": 0.22635255753993988, | |
| "learning_rate": 9.878190490736353e-05, | |
| "loss": 0.9547459602355957, | |
| "mean_token_accuracy": 0.780703829228878, | |
| "num_tokens": 4379939.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.0914236083626747, | |
| "epoch": 0.20266666666666666, | |
| "grad_norm": 0.28582993149757385, | |
| "learning_rate": 9.874364326083216e-05, | |
| "loss": 1.1933195114135742, | |
| "mean_token_accuracy": 0.7399712555110455, | |
| "num_tokens": 4429828.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 0.9969653740525246, | |
| "epoch": 0.2048, | |
| "grad_norm": 0.23508235812187195, | |
| "learning_rate": 9.87047975934885e-05, | |
| "loss": 1.080200481414795, | |
| "mean_token_accuracy": 0.7547310657799244, | |
| "num_tokens": 4478207.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 0.9463694766163826, | |
| "epoch": 0.20693333333333333, | |
| "grad_norm": 0.28800663352012634, | |
| "learning_rate": 9.866536837076098e-05, | |
| "loss": 1.0770373344421387, | |
| "mean_token_accuracy": 0.7622694931924343, | |
| "num_tokens": 4523626.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.017927524447441, | |
| "epoch": 0.20906666666666668, | |
| "grad_norm": 0.22783571481704712, | |
| "learning_rate": 9.862535606506992e-05, | |
| "loss": 1.1111245155334473, | |
| "mean_token_accuracy": 0.7514706686139107, | |
| "num_tokens": 4575636.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 0.9148829184472561, | |
| "epoch": 0.2112, | |
| "grad_norm": 0.44951871037483215, | |
| "learning_rate": 9.85847611558218e-05, | |
| "loss": 1.0038617134094239, | |
| "mean_token_accuracy": 0.7708485037088394, | |
| "num_tokens": 4623275.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 0.9400355100631714, | |
| "epoch": 0.21333333333333335, | |
| "grad_norm": 0.2274121642112732, | |
| "learning_rate": 9.85435841294036e-05, | |
| "loss": 1.051990795135498, | |
| "mean_token_accuracy": 0.762999877333641, | |
| "num_tokens": 4670070.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 0.9205634713172912, | |
| "epoch": 0.21546666666666667, | |
| "grad_norm": 0.2580691874027252, | |
| "learning_rate": 9.850182547917686e-05, | |
| "loss": 1.0361328125, | |
| "mean_token_accuracy": 0.77038114964962, | |
| "num_tokens": 4713754.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 0.9357184395194054, | |
| "epoch": 0.2176, | |
| "grad_norm": 0.3024005889892578, | |
| "learning_rate": 9.845948570547187e-05, | |
| "loss": 1.001820945739746, | |
| "mean_token_accuracy": 0.7686716303229332, | |
| "num_tokens": 4754969.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 0.9788610845804214, | |
| "epoch": 0.21973333333333334, | |
| "grad_norm": 0.22762160003185272, | |
| "learning_rate": 9.841656531558163e-05, | |
| "loss": 1.0857264518737793, | |
| "mean_token_accuracy": 0.755905470252037, | |
| "num_tokens": 4799172.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 0.8699902944266796, | |
| "epoch": 0.22186666666666666, | |
| "grad_norm": 0.3153718113899231, | |
| "learning_rate": 9.83730648237558e-05, | |
| "loss": 0.9507910728454589, | |
| "mean_token_accuracy": 0.7810183942317963, | |
| "num_tokens": 4841210.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.0130531772971154, | |
| "epoch": 0.224, | |
| "grad_norm": 0.254517138004303, | |
| "learning_rate": 9.832898475119446e-05, | |
| "loss": 1.0863225936889649, | |
| "mean_token_accuracy": 0.7473610386252403, | |
| "num_tokens": 4889983.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 0.9305486619472504, | |
| "epoch": 0.22613333333333333, | |
| "grad_norm": 0.3190745413303375, | |
| "learning_rate": 9.828432562604197e-05, | |
| "loss": 1.0010540008544921, | |
| "mean_token_accuracy": 0.7654982030391693, | |
| "num_tokens": 4935048.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 0.8807580441236496, | |
| "epoch": 0.22826666666666667, | |
| "grad_norm": 0.25836268067359924, | |
| "learning_rate": 9.823908798338061e-05, | |
| "loss": 0.9626541137695312, | |
| "mean_token_accuracy": 0.7766690820455551, | |
| "num_tokens": 4974286.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 0.9813514620065689, | |
| "epoch": 0.2304, | |
| "grad_norm": 0.23850619792938232, | |
| "learning_rate": 9.819327236522411e-05, | |
| "loss": 1.0773870468139648, | |
| "mean_token_accuracy": 0.7523389101028443, | |
| "num_tokens": 5029521.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 0.9586616240441799, | |
| "epoch": 0.23253333333333334, | |
| "grad_norm": 0.32606637477874756, | |
| "learning_rate": 9.814687932051123e-05, | |
| "loss": 1.0027738571166993, | |
| "mean_token_accuracy": 0.7671207025647163, | |
| "num_tokens": 5074269.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 0.9240864880383015, | |
| "epoch": 0.23466666666666666, | |
| "grad_norm": 0.23331928253173828, | |
| "learning_rate": 9.809990940509911e-05, | |
| "loss": 1.0010904312133788, | |
| "mean_token_accuracy": 0.7657159030437469, | |
| "num_tokens": 5120632.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 0.9960295349359513, | |
| "epoch": 0.2368, | |
| "grad_norm": 0.27676209807395935, | |
| "learning_rate": 9.805236318175672e-05, | |
| "loss": 1.0985889434814453, | |
| "mean_token_accuracy": 0.7565312668681144, | |
| "num_tokens": 5167690.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 0.9320930615067482, | |
| "epoch": 0.23893333333333333, | |
| "grad_norm": 0.2536001205444336, | |
| "learning_rate": 9.800424122015802e-05, | |
| "loss": 1.0100153923034667, | |
| "mean_token_accuracy": 0.7664023399353027, | |
| "num_tokens": 5209976.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 0.960879421979189, | |
| "epoch": 0.24106666666666668, | |
| "grad_norm": 0.2650390565395355, | |
| "learning_rate": 9.79555440968751e-05, | |
| "loss": 1.0518557548522949, | |
| "mean_token_accuracy": 0.7670834749937058, | |
| "num_tokens": 5257491.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.0790555529296397, | |
| "epoch": 0.2432, | |
| "grad_norm": 0.2658112645149231, | |
| "learning_rate": 9.790627239537144e-05, | |
| "loss": 1.2411640167236329, | |
| "mean_token_accuracy": 0.7390920028090477, | |
| "num_tokens": 5302986.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 0.9476075693964958, | |
| "epoch": 0.24533333333333332, | |
| "grad_norm": 0.2797292470932007, | |
| "learning_rate": 9.785642670599479e-05, | |
| "loss": 1.0823354721069336, | |
| "mean_token_accuracy": 0.765032921731472, | |
| "num_tokens": 5348946.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 0.8997275173664093, | |
| "epoch": 0.24746666666666667, | |
| "grad_norm": 0.21643367409706116, | |
| "learning_rate": 9.780600762597005e-05, | |
| "loss": 0.9590452194213868, | |
| "mean_token_accuracy": 0.7714703544974327, | |
| "num_tokens": 5393421.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.0325972460210324, | |
| "epoch": 0.2496, | |
| "grad_norm": 0.26943519711494446, | |
| "learning_rate": 9.775501575939227e-05, | |
| "loss": 1.0748598098754882, | |
| "mean_token_accuracy": 0.7444046661257744, | |
| "num_tokens": 5443639.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 0.9544263079762458, | |
| "epoch": 0.2517333333333333, | |
| "grad_norm": 0.28337323665618896, | |
| "learning_rate": 9.770345171721929e-05, | |
| "loss": 1.0133567810058595, | |
| "mean_token_accuracy": 0.7597964540123939, | |
| "num_tokens": 5494552.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 0.9211089439690113, | |
| "epoch": 0.2538666666666667, | |
| "grad_norm": 0.23862819373607635, | |
| "learning_rate": 9.765131611726446e-05, | |
| "loss": 0.9742342948913574, | |
| "mean_token_accuracy": 0.7758729934692383, | |
| "num_tokens": 5540743.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.0495109014213084, | |
| "epoch": 0.256, | |
| "grad_norm": 0.2335851490497589, | |
| "learning_rate": 9.759860958418926e-05, | |
| "loss": 1.1212799072265625, | |
| "mean_token_accuracy": 0.7419089064002037, | |
| "num_tokens": 5592981.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 1.115493653714657, | |
| "epoch": 0.2581333333333333, | |
| "grad_norm": 0.2895198464393616, | |
| "learning_rate": 9.754533274949575e-05, | |
| "loss": 1.2449783325195312, | |
| "mean_token_accuracy": 0.7372826255857945, | |
| "num_tokens": 5637868.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 0.9640243485569954, | |
| "epoch": 0.26026666666666665, | |
| "grad_norm": 0.23221854865550995, | |
| "learning_rate": 9.749148625151908e-05, | |
| "loss": 1.0559998512268067, | |
| "mean_token_accuracy": 0.7624265968799591, | |
| "num_tokens": 5684479.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 0.9518351659178734, | |
| "epoch": 0.2624, | |
| "grad_norm": 0.2750278413295746, | |
| "learning_rate": 9.743707073541978e-05, | |
| "loss": 1.0608203887939454, | |
| "mean_token_accuracy": 0.7580445930361748, | |
| "num_tokens": 5726626.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.0462738864123822, | |
| "epoch": 0.26453333333333334, | |
| "grad_norm": 0.21800817549228668, | |
| "learning_rate": 9.738208685317611e-05, | |
| "loss": 1.172841453552246, | |
| "mean_token_accuracy": 0.7489309534430504, | |
| "num_tokens": 5776891.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 0.9446908816695213, | |
| "epoch": 0.26666666666666666, | |
| "grad_norm": 0.2996140718460083, | |
| "learning_rate": 9.732653526357612e-05, | |
| "loss": 1.0297443389892578, | |
| "mean_token_accuracy": 0.7673991709947586, | |
| "num_tokens": 5817721.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 1.0725971311330795, | |
| "epoch": 0.2688, | |
| "grad_norm": 0.25890064239501953, | |
| "learning_rate": 9.727041663220989e-05, | |
| "loss": 1.2022390365600586, | |
| "mean_token_accuracy": 0.7427652187645435, | |
| "num_tokens": 5864952.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.0392154708504677, | |
| "epoch": 0.27093333333333336, | |
| "grad_norm": 0.2563996911048889, | |
| "learning_rate": 9.721373163146148e-05, | |
| "loss": 1.1769997596740722, | |
| "mean_token_accuracy": 0.7434597261250019, | |
| "num_tokens": 5910222.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 0.9681669734418392, | |
| "epoch": 0.2730666666666667, | |
| "grad_norm": 0.2539491653442383, | |
| "learning_rate": 9.715648094050087e-05, | |
| "loss": 1.0889246940612793, | |
| "mean_token_accuracy": 0.7573506951332092, | |
| "num_tokens": 5958487.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 0.9369139075279236, | |
| "epoch": 0.2752, | |
| "grad_norm": 0.29088094830513, | |
| "learning_rate": 9.709866524527588e-05, | |
| "loss": 1.0017178535461426, | |
| "mean_token_accuracy": 0.7683532252907753, | |
| "num_tokens": 6002949.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.0060899198055266, | |
| "epoch": 0.2773333333333333, | |
| "grad_norm": 0.2669098675251007, | |
| "learning_rate": 9.704028523850392e-05, | |
| "loss": 1.0900717735290528, | |
| "mean_token_accuracy": 0.7483311414718627, | |
| "num_tokens": 6053462.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 0.9726419121026992, | |
| "epoch": 0.27946666666666664, | |
| "grad_norm": 0.2472907453775406, | |
| "learning_rate": 9.698134161966363e-05, | |
| "loss": 1.1085708618164063, | |
| "mean_token_accuracy": 0.7585345059633255, | |
| "num_tokens": 6103810.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 0.9410715244710446, | |
| "epoch": 0.2816, | |
| "grad_norm": 0.2315952181816101, | |
| "learning_rate": 9.692183509498659e-05, | |
| "loss": 1.04432373046875, | |
| "mean_token_accuracy": 0.7650494173169136, | |
| "num_tokens": 6149332.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 0.9582152135670186, | |
| "epoch": 0.28373333333333334, | |
| "grad_norm": 0.28324106335639954, | |
| "learning_rate": 9.686176637744884e-05, | |
| "loss": 1.0879361152648925, | |
| "mean_token_accuracy": 0.7590656578540802, | |
| "num_tokens": 6193221.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 0.8448746472597122, | |
| "epoch": 0.28586666666666666, | |
| "grad_norm": 0.27708905935287476, | |
| "learning_rate": 9.680113618676229e-05, | |
| "loss": 0.9311987876892089, | |
| "mean_token_accuracy": 0.7856365889310837, | |
| "num_tokens": 6238920.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 0.9729965463280678, | |
| "epoch": 0.288, | |
| "grad_norm": 0.2840976119041443, | |
| "learning_rate": 9.673994524936615e-05, | |
| "loss": 1.0606694221496582, | |
| "mean_token_accuracy": 0.7592033997178078, | |
| "num_tokens": 6286552.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 0.9429876990616322, | |
| "epoch": 0.29013333333333335, | |
| "grad_norm": 0.2964062988758087, | |
| "learning_rate": 9.667819429841817e-05, | |
| "loss": 1.0691003799438477, | |
| "mean_token_accuracy": 0.7747758001089096, | |
| "num_tokens": 6331573.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.0872648723423481, | |
| "epoch": 0.2922666666666667, | |
| "grad_norm": 0.2675784230232239, | |
| "learning_rate": 9.661588407378596e-05, | |
| "loss": 1.2234331130981446, | |
| "mean_token_accuracy": 0.7345306605100632, | |
| "num_tokens": 6383986.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 0.8609121344983578, | |
| "epoch": 0.2944, | |
| "grad_norm": 0.22704993188381195, | |
| "learning_rate": 9.655301532203797e-05, | |
| "loss": 0.934294605255127, | |
| "mean_token_accuracy": 0.7782075613737106, | |
| "num_tokens": 6426867.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 0.8974170722067356, | |
| "epoch": 0.2965333333333333, | |
| "grad_norm": 0.2712918221950531, | |
| "learning_rate": 9.648958879643467e-05, | |
| "loss": 0.9794875144958496, | |
| "mean_token_accuracy": 0.768825002014637, | |
| "num_tokens": 6470272.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 0.9115433268249035, | |
| "epoch": 0.2986666666666667, | |
| "grad_norm": 0.2932375371456146, | |
| "learning_rate": 9.642560525691948e-05, | |
| "loss": 0.9729208946228027, | |
| "mean_token_accuracy": 0.7670300453901291, | |
| "num_tokens": 6513165.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 0.8882284432649612, | |
| "epoch": 0.3008, | |
| "grad_norm": 0.28009793162345886, | |
| "learning_rate": 9.63610654701097e-05, | |
| "loss": 0.9678432464599609, | |
| "mean_token_accuracy": 0.7733970731496811, | |
| "num_tokens": 6556342.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 0.8777762867510319, | |
| "epoch": 0.30293333333333333, | |
| "grad_norm": 0.23763342201709747, | |
| "learning_rate": 9.629597020928722e-05, | |
| "loss": 0.9687582969665527, | |
| "mean_token_accuracy": 0.7792320027947426, | |
| "num_tokens": 6601597.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 0.9160067647695541, | |
| "epoch": 0.30506666666666665, | |
| "grad_norm": 0.31182846426963806, | |
| "learning_rate": 9.623032025438939e-05, | |
| "loss": 0.9827415466308593, | |
| "mean_token_accuracy": 0.7696192592382431, | |
| "num_tokens": 6649545.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 0.9564003840088844, | |
| "epoch": 0.3072, | |
| "grad_norm": 0.23129577934741974, | |
| "learning_rate": 9.61641163919996e-05, | |
| "loss": 1.0380489349365234, | |
| "mean_token_accuracy": 0.7621255874633789, | |
| "num_tokens": 6696196.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.05502300709486, | |
| "epoch": 0.30933333333333335, | |
| "grad_norm": 0.27563098073005676, | |
| "learning_rate": 9.609735941533788e-05, | |
| "loss": 1.150872802734375, | |
| "mean_token_accuracy": 0.7396457836031913, | |
| "num_tokens": 6745953.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 0.9179727308452129, | |
| "epoch": 0.31146666666666667, | |
| "grad_norm": 0.2697086036205292, | |
| "learning_rate": 9.603005012425135e-05, | |
| "loss": 1.0098464012145996, | |
| "mean_token_accuracy": 0.7658590793609619, | |
| "num_tokens": 6794497.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.1471355110406876, | |
| "epoch": 0.3136, | |
| "grad_norm": 0.2717576026916504, | |
| "learning_rate": 9.596218932520468e-05, | |
| "loss": 1.2438765525817872, | |
| "mean_token_accuracy": 0.7290126971900464, | |
| "num_tokens": 6841976.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 0.9211908876895905, | |
| "epoch": 0.3157333333333333, | |
| "grad_norm": 0.23845043778419495, | |
| "learning_rate": 9.589377783127047e-05, | |
| "loss": 1.0137989044189453, | |
| "mean_token_accuracy": 0.767640671133995, | |
| "num_tokens": 6882780.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 0.8538284994661808, | |
| "epoch": 0.3178666666666667, | |
| "grad_norm": 0.23966865241527557, | |
| "learning_rate": 9.58248164621194e-05, | |
| "loss": 0.908942985534668, | |
| "mean_token_accuracy": 0.7838981494307518, | |
| "num_tokens": 6922694.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 0.9744522646069527, | |
| "epoch": 0.32, | |
| "grad_norm": 0.27710849046707153, | |
| "learning_rate": 9.575530604401051e-05, | |
| "loss": 1.0989601135253906, | |
| "mean_token_accuracy": 0.7573227554559707, | |
| "num_tokens": 6972307.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 1.0386228650808333, | |
| "epoch": 0.3221333333333333, | |
| "grad_norm": 0.2898867428302765, | |
| "learning_rate": 9.56852474097812e-05, | |
| "loss": 1.1454790115356446, | |
| "mean_token_accuracy": 0.7430783316493035, | |
| "num_tokens": 7020126.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 0.9898950323462486, | |
| "epoch": 0.32426666666666665, | |
| "grad_norm": 0.2526203393936157, | |
| "learning_rate": 9.561464139883737e-05, | |
| "loss": 1.1016074180603028, | |
| "mean_token_accuracy": 0.7571706652641297, | |
| "num_tokens": 7068804.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 0.999041261523962, | |
| "epoch": 0.3264, | |
| "grad_norm": 0.24795448780059814, | |
| "learning_rate": 9.554348885714326e-05, | |
| "loss": 1.0913206100463868, | |
| "mean_token_accuracy": 0.7498321965336799, | |
| "num_tokens": 7120053.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 0.9470011726021766, | |
| "epoch": 0.32853333333333334, | |
| "grad_norm": 0.2526148557662964, | |
| "learning_rate": 9.547179063721139e-05, | |
| "loss": 1.0380746841430664, | |
| "mean_token_accuracy": 0.7631630048155784, | |
| "num_tokens": 7166553.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 0.9413013480603695, | |
| "epoch": 0.33066666666666666, | |
| "grad_norm": 0.2717132866382599, | |
| "learning_rate": 9.539954759809226e-05, | |
| "loss": 1.0266976356506348, | |
| "mean_token_accuracy": 0.7626717418432236, | |
| "num_tokens": 7211784.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.0694061882793904, | |
| "epoch": 0.3328, | |
| "grad_norm": 0.2959578335285187, | |
| "learning_rate": 9.532676060536419e-05, | |
| "loss": 1.1947439193725586, | |
| "mean_token_accuracy": 0.7360827416181565, | |
| "num_tokens": 7256909.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.100894968211651, | |
| "epoch": 0.33493333333333336, | |
| "grad_norm": 0.2543633282184601, | |
| "learning_rate": 9.525343053112276e-05, | |
| "loss": 1.2262410163879394, | |
| "mean_token_accuracy": 0.7347081542015076, | |
| "num_tokens": 7305883.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 0.887219874560833, | |
| "epoch": 0.3370666666666667, | |
| "grad_norm": 0.2346959412097931, | |
| "learning_rate": 9.517955825397056e-05, | |
| "loss": 0.9671891212463379, | |
| "mean_token_accuracy": 0.7790701374411583, | |
| "num_tokens": 7352371.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 0.8671779796481133, | |
| "epoch": 0.3392, | |
| "grad_norm": 0.32672831416130066, | |
| "learning_rate": 9.510514465900653e-05, | |
| "loss": 0.9251022338867188, | |
| "mean_token_accuracy": 0.7759940758347511, | |
| "num_tokens": 7393383.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 0.9520958945155144, | |
| "epoch": 0.3413333333333333, | |
| "grad_norm": 0.2607707679271698, | |
| "learning_rate": 9.50301906378154e-05, | |
| "loss": 1.0628732681274413, | |
| "mean_token_accuracy": 0.7598425537347794, | |
| "num_tokens": 7439670.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.0912248834967613, | |
| "epoch": 0.34346666666666664, | |
| "grad_norm": 0.2782971262931824, | |
| "learning_rate": 9.495469708845701e-05, | |
| "loss": 1.1914597511291505, | |
| "mean_token_accuracy": 0.7388298735022545, | |
| "num_tokens": 7485267.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 0.8908423334360123, | |
| "epoch": 0.3456, | |
| "grad_norm": 0.3009212613105774, | |
| "learning_rate": 9.487866491545554e-05, | |
| "loss": 0.9948520660400391, | |
| "mean_token_accuracy": 0.7739639699459075, | |
| "num_tokens": 7527918.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 1.0133905045688152, | |
| "epoch": 0.34773333333333334, | |
| "grad_norm": 0.22292616963386536, | |
| "learning_rate": 9.480209502978869e-05, | |
| "loss": 1.0447552680969239, | |
| "mean_token_accuracy": 0.7502199374139309, | |
| "num_tokens": 7573700.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 0.9277701899409294, | |
| "epoch": 0.34986666666666666, | |
| "grad_norm": 0.25027042627334595, | |
| "learning_rate": 9.472498834887671e-05, | |
| "loss": 0.9662016868591309, | |
| "mean_token_accuracy": 0.769710011780262, | |
| "num_tokens": 7619772.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 0.9953067198395729, | |
| "epoch": 0.352, | |
| "grad_norm": 0.2255147099494934, | |
| "learning_rate": 9.46473457965715e-05, | |
| "loss": 1.1140392303466797, | |
| "mean_token_accuracy": 0.7478319302201271, | |
| "num_tokens": 7666080.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 0.9726508021354675, | |
| "epoch": 0.35413333333333336, | |
| "grad_norm": 0.23199768364429474, | |
| "learning_rate": 9.456916830314548e-05, | |
| "loss": 1.043684482574463, | |
| "mean_token_accuracy": 0.7573757611215115, | |
| "num_tokens": 7717686.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 0.8418352358043194, | |
| "epoch": 0.3562666666666667, | |
| "grad_norm": 0.2636827826499939, | |
| "learning_rate": 9.449045680528041e-05, | |
| "loss": 0.9572833061218262, | |
| "mean_token_accuracy": 0.7885566264390945, | |
| "num_tokens": 7760984.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 0.9119837798178196, | |
| "epoch": 0.3584, | |
| "grad_norm": 0.2830665111541748, | |
| "learning_rate": 9.441121224605629e-05, | |
| "loss": 0.9864655494689941, | |
| "mean_token_accuracy": 0.7652302265167237, | |
| "num_tokens": 7806335.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 0.833198781311512, | |
| "epoch": 0.3605333333333333, | |
| "grad_norm": 0.2658802568912506, | |
| "learning_rate": 9.43314355749399e-05, | |
| "loss": 0.8842006683349609, | |
| "mean_token_accuracy": 0.7828017815947532, | |
| "num_tokens": 7846783.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 0.9806767851114273, | |
| "epoch": 0.3626666666666667, | |
| "grad_norm": 0.23127029836177826, | |
| "learning_rate": 9.425112774777354e-05, | |
| "loss": 1.116124439239502, | |
| "mean_token_accuracy": 0.7566492781043053, | |
| "num_tokens": 7890665.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 0.8920052781701088, | |
| "epoch": 0.3648, | |
| "grad_norm": 0.22740185260772705, | |
| "learning_rate": 9.417028972676359e-05, | |
| "loss": 1.005050277709961, | |
| "mean_token_accuracy": 0.7736168324947357, | |
| "num_tokens": 7933849.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.0065233081579208, | |
| "epoch": 0.36693333333333333, | |
| "grad_norm": 0.34241342544555664, | |
| "learning_rate": 9.408892248046885e-05, | |
| "loss": 1.1022210121154785, | |
| "mean_token_accuracy": 0.7556669354438782, | |
| "num_tokens": 7981235.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 0.9596796780824661, | |
| "epoch": 0.36906666666666665, | |
| "grad_norm": 0.24347490072250366, | |
| "learning_rate": 9.40070269837891e-05, | |
| "loss": 1.058722686767578, | |
| "mean_token_accuracy": 0.7606610782444477, | |
| "num_tokens": 8028778.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 0.8785062082111835, | |
| "epoch": 0.3712, | |
| "grad_norm": 0.20483236014842987, | |
| "learning_rate": 9.392460421795328e-05, | |
| "loss": 0.9666107177734375, | |
| "mean_token_accuracy": 0.7833232149481774, | |
| "num_tokens": 8071410.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 0.8095596194267273, | |
| "epoch": 0.37333333333333335, | |
| "grad_norm": 0.36941683292388916, | |
| "learning_rate": 9.38416551705078e-05, | |
| "loss": 0.8782508850097657, | |
| "mean_token_accuracy": 0.7895680025219918, | |
| "num_tokens": 8113695.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 0.9545476101338863, | |
| "epoch": 0.37546666666666667, | |
| "grad_norm": 0.23256246745586395, | |
| "learning_rate": 9.375818083530474e-05, | |
| "loss": 1.0102588653564453, | |
| "mean_token_accuracy": 0.7568591669201851, | |
| "num_tokens": 8161702.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 0.8932637803256511, | |
| "epoch": 0.3776, | |
| "grad_norm": 0.2185504138469696, | |
| "learning_rate": 9.367418221248989e-05, | |
| "loss": 0.9898091316223144, | |
| "mean_token_accuracy": 0.7799834325909615, | |
| "num_tokens": 8205315.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 0.9569453194737434, | |
| "epoch": 0.3797333333333333, | |
| "grad_norm": 0.34778159856796265, | |
| "learning_rate": 9.358966030849072e-05, | |
| "loss": 1.0686969757080078, | |
| "mean_token_accuracy": 0.7559246391057968, | |
| "num_tokens": 8252919.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.029954968392849, | |
| "epoch": 0.3818666666666667, | |
| "grad_norm": 0.26953041553497314, | |
| "learning_rate": 9.350461613600449e-05, | |
| "loss": 1.1169189453125, | |
| "mean_token_accuracy": 0.746114706993103, | |
| "num_tokens": 8300355.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 0.943967767059803, | |
| "epoch": 0.384, | |
| "grad_norm": 0.2434380203485489, | |
| "learning_rate": 9.34190507139859e-05, | |
| "loss": 0.9968074798583985, | |
| "mean_token_accuracy": 0.7640544638037682, | |
| "num_tokens": 8346150.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 0.960440730303526, | |
| "epoch": 0.38613333333333333, | |
| "grad_norm": 0.2441101223230362, | |
| "learning_rate": 9.333296506763505e-05, | |
| "loss": 1.059675121307373, | |
| "mean_token_accuracy": 0.7603183135390281, | |
| "num_tokens": 8393408.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 0.8498699732124806, | |
| "epoch": 0.38826666666666665, | |
| "grad_norm": 0.2330222874879837, | |
| "learning_rate": 9.324636022838509e-05, | |
| "loss": 0.9152523040771484, | |
| "mean_token_accuracy": 0.7780576214194298, | |
| "num_tokens": 8440947.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 0.9348598286509514, | |
| "epoch": 0.3904, | |
| "grad_norm": 0.25513678789138794, | |
| "learning_rate": 9.315923723388986e-05, | |
| "loss": 1.041547966003418, | |
| "mean_token_accuracy": 0.7705774419009686, | |
| "num_tokens": 8486476.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 0.8954004615545272, | |
| "epoch": 0.39253333333333335, | |
| "grad_norm": 0.24745526909828186, | |
| "learning_rate": 9.307159712801147e-05, | |
| "loss": 0.9804242134094239, | |
| "mean_token_accuracy": 0.773698341846466, | |
| "num_tokens": 8533216.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 0.9542841285467147, | |
| "epoch": 0.39466666666666667, | |
| "grad_norm": 0.3648325800895691, | |
| "learning_rate": 9.298344096080776e-05, | |
| "loss": 1.0262937545776367, | |
| "mean_token_accuracy": 0.7662550717592239, | |
| "num_tokens": 8579411.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 0.820501434057951, | |
| "epoch": 0.3968, | |
| "grad_norm": 0.26821690797805786, | |
| "learning_rate": 9.289476978851976e-05, | |
| "loss": 0.8598980903625488, | |
| "mean_token_accuracy": 0.7888635769486427, | |
| "num_tokens": 8623046.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 0.9134793929755688, | |
| "epoch": 0.3989333333333333, | |
| "grad_norm": 0.25643301010131836, | |
| "learning_rate": 9.280558467355907e-05, | |
| "loss": 1.0032004356384276, | |
| "mean_token_accuracy": 0.7722208425402641, | |
| "num_tokens": 8665201.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 0.9829332500696182, | |
| "epoch": 0.4010666666666667, | |
| "grad_norm": 0.27002307772636414, | |
| "learning_rate": 9.271588668449503e-05, | |
| "loss": 1.1377047538757323, | |
| "mean_token_accuracy": 0.7662092931568623, | |
| "num_tokens": 8707004.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 1.0146782457828523, | |
| "epoch": 0.4032, | |
| "grad_norm": 0.2506401240825653, | |
| "learning_rate": 9.262567689604195e-05, | |
| "loss": 1.124861240386963, | |
| "mean_token_accuracy": 0.7566848322749138, | |
| "num_tokens": 8753713.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 1.0487395107746125, | |
| "epoch": 0.4053333333333333, | |
| "grad_norm": 0.24727876484394073, | |
| "learning_rate": 9.25349563890463e-05, | |
| "loss": 1.254913330078125, | |
| "mean_token_accuracy": 0.7382492840290069, | |
| "num_tokens": 8801703.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 0.8892258107662201, | |
| "epoch": 0.40746666666666664, | |
| "grad_norm": 0.2835586667060852, | |
| "learning_rate": 9.244372625047372e-05, | |
| "loss": 0.9624475479125977, | |
| "mean_token_accuracy": 0.7717373371124268, | |
| "num_tokens": 8848191.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 0.9389666721224785, | |
| "epoch": 0.4096, | |
| "grad_norm": 0.2468794584274292, | |
| "learning_rate": 9.235198757339594e-05, | |
| "loss": 1.0801708221435546, | |
| "mean_token_accuracy": 0.7686146914958953, | |
| "num_tokens": 8895860.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 0.9686854526400566, | |
| "epoch": 0.41173333333333334, | |
| "grad_norm": 0.24529697000980377, | |
| "learning_rate": 9.225974145697775e-05, | |
| "loss": 1.062663745880127, | |
| "mean_token_accuracy": 0.7554567009210587, | |
| "num_tokens": 8945814.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 0.9051942273974418, | |
| "epoch": 0.41386666666666666, | |
| "grad_norm": 0.3458064794540405, | |
| "learning_rate": 9.216698900646383e-05, | |
| "loss": 0.9697030067443848, | |
| "mean_token_accuracy": 0.7712534263730049, | |
| "num_tokens": 8990326.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.0142885446548462, | |
| "epoch": 0.416, | |
| "grad_norm": 0.25167402625083923, | |
| "learning_rate": 9.20737313331655e-05, | |
| "loss": 1.0986696243286134, | |
| "mean_token_accuracy": 0.7525161564350128, | |
| "num_tokens": 9037859.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.0254975706338882, | |
| "epoch": 0.41813333333333336, | |
| "grad_norm": 0.2604680061340332, | |
| "learning_rate": 9.197996955444732e-05, | |
| "loss": 1.0939658164978028, | |
| "mean_token_accuracy": 0.7426734983921051, | |
| "num_tokens": 9088894.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 0.9542628638446331, | |
| "epoch": 0.4202666666666667, | |
| "grad_norm": 0.2736513614654541, | |
| "learning_rate": 9.188570479371387e-05, | |
| "loss": 1.14229097366333, | |
| "mean_token_accuracy": 0.7659218199551105, | |
| "num_tokens": 9133717.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 0.9707967802882195, | |
| "epoch": 0.4224, | |
| "grad_norm": 0.2167668491601944, | |
| "learning_rate": 9.179093818039616e-05, | |
| "loss": 1.077283763885498, | |
| "mean_token_accuracy": 0.7643323555588722, | |
| "num_tokens": 9178671.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 0.9660324349999427, | |
| "epoch": 0.4245333333333333, | |
| "grad_norm": 0.35394829511642456, | |
| "learning_rate": 9.169567084993814e-05, | |
| "loss": 1.083635139465332, | |
| "mean_token_accuracy": 0.7617688804864884, | |
| "num_tokens": 9221289.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 0.9027157455682755, | |
| "epoch": 0.4266666666666667, | |
| "grad_norm": 0.2674417495727539, | |
| "learning_rate": 9.159990394378303e-05, | |
| "loss": 0.980889892578125, | |
| "mean_token_accuracy": 0.7698795303702355, | |
| "num_tokens": 9268892.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 0.952321158349514, | |
| "epoch": 0.4288, | |
| "grad_norm": 0.2332596629858017, | |
| "learning_rate": 9.15036386093598e-05, | |
| "loss": 1.0610048294067382, | |
| "mean_token_accuracy": 0.7620892718434333, | |
| "num_tokens": 9317142.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 0.8780575722455979, | |
| "epoch": 0.43093333333333333, | |
| "grad_norm": 0.2631659209728241, | |
| "learning_rate": 9.140687600006929e-05, | |
| "loss": 0.983364200592041, | |
| "mean_token_accuracy": 0.7832586973905563, | |
| "num_tokens": 9360075.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 0.9146695531904697, | |
| "epoch": 0.43306666666666666, | |
| "grad_norm": 0.33491751551628113, | |
| "learning_rate": 9.13096172752704e-05, | |
| "loss": 0.9856008529663086, | |
| "mean_token_accuracy": 0.7726532012224198, | |
| "num_tokens": 9396928.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 1.0050749816000462, | |
| "epoch": 0.4352, | |
| "grad_norm": 0.30352818965911865, | |
| "learning_rate": 9.121186360026625e-05, | |
| "loss": 1.0613908767700195, | |
| "mean_token_accuracy": 0.7496299520134926, | |
| "num_tokens": 9449045.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 0.919689030200243, | |
| "epoch": 0.43733333333333335, | |
| "grad_norm": 0.33956724405288696, | |
| "learning_rate": 9.111361614629022e-05, | |
| "loss": 0.9860305786132812, | |
| "mean_token_accuracy": 0.7708889573812485, | |
| "num_tokens": 9498718.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 0.9274381674826145, | |
| "epoch": 0.43946666666666667, | |
| "grad_norm": 0.29058581590652466, | |
| "learning_rate": 9.101487609049181e-05, | |
| "loss": 0.9976702690124511, | |
| "mean_token_accuracy": 0.7693670354783535, | |
| "num_tokens": 9537791.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 0.917995036393404, | |
| "epoch": 0.4416, | |
| "grad_norm": 0.22210952639579773, | |
| "learning_rate": 9.091564461592274e-05, | |
| "loss": 1.01414155960083, | |
| "mean_token_accuracy": 0.7708904504776001, | |
| "num_tokens": 9581893.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 0.8260227806866169, | |
| "epoch": 0.4437333333333333, | |
| "grad_norm": 0.31099778413772583, | |
| "learning_rate": 9.081592291152252e-05, | |
| "loss": 0.9264348983764649, | |
| "mean_token_accuracy": 0.7857892021536828, | |
| "num_tokens": 9624465.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 0.9573663167655468, | |
| "epoch": 0.4458666666666667, | |
| "grad_norm": 0.2500694692134857, | |
| "learning_rate": 9.071571217210443e-05, | |
| "loss": 1.042325496673584, | |
| "mean_token_accuracy": 0.7585179045796394, | |
| "num_tokens": 9670927.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 0.8527260765433311, | |
| "epoch": 0.448, | |
| "grad_norm": 0.22426480054855347, | |
| "learning_rate": 9.061501359834108e-05, | |
| "loss": 0.9534576416015625, | |
| "mean_token_accuracy": 0.7796914517879486, | |
| "num_tokens": 9717399.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.0308616764843463, | |
| "epoch": 0.45013333333333333, | |
| "grad_norm": 0.2896214723587036, | |
| "learning_rate": 9.051382839675005e-05, | |
| "loss": 1.1293525695800781, | |
| "mean_token_accuracy": 0.7496985673904419, | |
| "num_tokens": 9766859.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.0453794084489345, | |
| "epoch": 0.45226666666666665, | |
| "grad_norm": 0.21886104345321655, | |
| "learning_rate": 9.041215777967945e-05, | |
| "loss": 1.128882598876953, | |
| "mean_token_accuracy": 0.7480149149894715, | |
| "num_tokens": 9821214.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 1.002899456769228, | |
| "epoch": 0.4544, | |
| "grad_norm": 0.24397237598896027, | |
| "learning_rate": 9.031000296529336e-05, | |
| "loss": 1.0722038269042968, | |
| "mean_token_accuracy": 0.7499327704310417, | |
| "num_tokens": 9873482.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 0.9601933643221855, | |
| "epoch": 0.45653333333333335, | |
| "grad_norm": 0.249656543135643, | |
| "learning_rate": 9.020736517755733e-05, | |
| "loss": 1.0663026809692382, | |
| "mean_token_accuracy": 0.7673163414001465, | |
| "num_tokens": 9924229.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 1.1109409362077713, | |
| "epoch": 0.45866666666666667, | |
| "grad_norm": 0.2716203033924103, | |
| "learning_rate": 9.010424564622353e-05, | |
| "loss": 1.1743658065795899, | |
| "mean_token_accuracy": 0.7358616881072522, | |
| "num_tokens": 9972409.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 0.9736781157553196, | |
| "epoch": 0.4608, | |
| "grad_norm": 0.32094913721084595, | |
| "learning_rate": 9.000064560681625e-05, | |
| "loss": 1.082399559020996, | |
| "mean_token_accuracy": 0.7566891044378281, | |
| "num_tokens": 10020194.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 0.961787448823452, | |
| "epoch": 0.4629333333333333, | |
| "grad_norm": 0.2845711410045624, | |
| "learning_rate": 8.98965663006169e-05, | |
| "loss": 1.0939053535461425, | |
| "mean_token_accuracy": 0.7606314912438392, | |
| "num_tokens": 10067865.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 0.8467822209000587, | |
| "epoch": 0.4650666666666667, | |
| "grad_norm": 0.26253893971443176, | |
| "learning_rate": 8.979200897464921e-05, | |
| "loss": 0.9104162216186523, | |
| "mean_token_accuracy": 0.7799090445041656, | |
| "num_tokens": 10112591.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 0.880243568867445, | |
| "epoch": 0.4672, | |
| "grad_norm": 0.21990089118480682, | |
| "learning_rate": 8.968697488166435e-05, | |
| "loss": 0.9580593109130859, | |
| "mean_token_accuracy": 0.7760828763246537, | |
| "num_tokens": 10159849.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 0.9931552834808827, | |
| "epoch": 0.4693333333333333, | |
| "grad_norm": 0.2644927501678467, | |
| "learning_rate": 8.95814652801258e-05, | |
| "loss": 1.1035637855529785, | |
| "mean_token_accuracy": 0.755839766561985, | |
| "num_tokens": 10210375.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 0.9833620935678482, | |
| "epoch": 0.47146666666666665, | |
| "grad_norm": 0.31400638818740845, | |
| "learning_rate": 8.947548143419437e-05, | |
| "loss": 1.0525406837463378, | |
| "mean_token_accuracy": 0.7570377990603447, | |
| "num_tokens": 10258067.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 1.0285537526011468, | |
| "epoch": 0.4736, | |
| "grad_norm": 0.18883784115314484, | |
| "learning_rate": 8.936902461371302e-05, | |
| "loss": 1.1230335235595703, | |
| "mean_token_accuracy": 0.7476231440901756, | |
| "num_tokens": 10308406.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 1.0317844092845916, | |
| "epoch": 0.47573333333333334, | |
| "grad_norm": 0.23253889381885529, | |
| "learning_rate": 8.926209609419165e-05, | |
| "loss": 1.1079911231994628, | |
| "mean_token_accuracy": 0.7472389042377472, | |
| "num_tokens": 10361473.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 1.0787047080695629, | |
| "epoch": 0.47786666666666666, | |
| "grad_norm": 0.21470828354358673, | |
| "learning_rate": 8.915469715679175e-05, | |
| "loss": 1.2236790657043457, | |
| "mean_token_accuracy": 0.7383568711578846, | |
| "num_tokens": 10412214.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 0.9707687653601169, | |
| "epoch": 0.48, | |
| "grad_norm": 0.2584036588668823, | |
| "learning_rate": 8.904682908831119e-05, | |
| "loss": 1.056828212738037, | |
| "mean_token_accuracy": 0.7611462950706482, | |
| "num_tokens": 10457804.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 0.8710766941308975, | |
| "epoch": 0.48213333333333336, | |
| "grad_norm": 0.23925144970417023, | |
| "learning_rate": 8.893849318116868e-05, | |
| "loss": 0.9463179588317872, | |
| "mean_token_accuracy": 0.7805656388401985, | |
| "num_tokens": 10496592.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 1.0050086982548236, | |
| "epoch": 0.4842666666666667, | |
| "grad_norm": 0.31434357166290283, | |
| "learning_rate": 8.882969073338833e-05, | |
| "loss": 1.1325186729431151, | |
| "mean_token_accuracy": 0.7497815892100335, | |
| "num_tokens": 10546937.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 0.9999729461967946, | |
| "epoch": 0.4864, | |
| "grad_norm": 0.23635436594486237, | |
| "learning_rate": 8.872042304858412e-05, | |
| "loss": 1.0858405113220215, | |
| "mean_token_accuracy": 0.7575721621513367, | |
| "num_tokens": 10598042.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 0.9136553466320038, | |
| "epoch": 0.4885333333333333, | |
| "grad_norm": 0.2523214519023895, | |
| "learning_rate": 8.861069143594423e-05, | |
| "loss": 0.9898375511169434, | |
| "mean_token_accuracy": 0.7727992206811904, | |
| "num_tokens": 10640977.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 0.912187123298645, | |
| "epoch": 0.49066666666666664, | |
| "grad_norm": 0.2530852258205414, | |
| "learning_rate": 8.850049721021537e-05, | |
| "loss": 1.026146125793457, | |
| "mean_token_accuracy": 0.7669046297669411, | |
| "num_tokens": 10691324.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 0.9109658055007458, | |
| "epoch": 0.4928, | |
| "grad_norm": 0.2615504264831543, | |
| "learning_rate": 8.838984169168708e-05, | |
| "loss": 0.9777699470520019, | |
| "mean_token_accuracy": 0.7721924617886543, | |
| "num_tokens": 10736949.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 0.9462881706655025, | |
| "epoch": 0.49493333333333334, | |
| "grad_norm": 0.30475521087646484, | |
| "learning_rate": 8.827872620617584e-05, | |
| "loss": 1.0332406044006348, | |
| "mean_token_accuracy": 0.7597260788083077, | |
| "num_tokens": 10785226.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 0.8475333206355572, | |
| "epoch": 0.49706666666666666, | |
| "grad_norm": 0.30574989318847656, | |
| "learning_rate": 8.816715208500922e-05, | |
| "loss": 0.9416275978088379, | |
| "mean_token_accuracy": 0.7838068321347237, | |
| "num_tokens": 10830340.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 0.985354783385992, | |
| "epoch": 0.4992, | |
| "grad_norm": 0.2665145695209503, | |
| "learning_rate": 8.805512066500992e-05, | |
| "loss": 1.0852888107299805, | |
| "mean_token_accuracy": 0.7492594376206398, | |
| "num_tokens": 10876689.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 1.0402933657169342, | |
| "epoch": 0.5013333333333333, | |
| "grad_norm": 0.2032317966222763, | |
| "learning_rate": 8.794263328847975e-05, | |
| "loss": 1.1533799171447754, | |
| "mean_token_accuracy": 0.7475039146840572, | |
| "num_tokens": 10925395.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.0321477994322776, | |
| "epoch": 0.5034666666666666, | |
| "grad_norm": 0.2507781386375427, | |
| "learning_rate": 8.782969130318358e-05, | |
| "loss": 1.1364535331726073, | |
| "mean_token_accuracy": 0.7528289645910263, | |
| "num_tokens": 10973794.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 0.9620419174432755, | |
| "epoch": 0.5056, | |
| "grad_norm": 0.2626616656780243, | |
| "learning_rate": 8.771629606233314e-05, | |
| "loss": 1.0547003746032715, | |
| "mean_token_accuracy": 0.7652165532112122, | |
| "num_tokens": 11022983.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 0.945545606315136, | |
| "epoch": 0.5077333333333334, | |
| "grad_norm": 0.24525931477546692, | |
| "learning_rate": 8.76024489245708e-05, | |
| "loss": 0.9925324440002441, | |
| "mean_token_accuracy": 0.7643568038940429, | |
| "num_tokens": 11065494.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 1.0124794401228427, | |
| "epoch": 0.5098666666666667, | |
| "grad_norm": 0.2306586503982544, | |
| "learning_rate": 8.74881512539534e-05, | |
| "loss": 1.159686279296875, | |
| "mean_token_accuracy": 0.7495349571108818, | |
| "num_tokens": 11117249.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 1.0916487082839013, | |
| "epoch": 0.512, | |
| "grad_norm": 0.31891921162605286, | |
| "learning_rate": 8.737340441993575e-05, | |
| "loss": 1.1685538291931152, | |
| "mean_token_accuracy": 0.7355501770973205, | |
| "num_tokens": 11167106.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 0.9723750591278076, | |
| "epoch": 0.5141333333333333, | |
| "grad_norm": 0.3258633315563202, | |
| "learning_rate": 8.725820979735436e-05, | |
| "loss": 1.1157949447631836, | |
| "mean_token_accuracy": 0.7603042379021645, | |
| "num_tokens": 11211815.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 0.8673077188432217, | |
| "epoch": 0.5162666666666667, | |
| "grad_norm": 0.2602537274360657, | |
| "learning_rate": 8.714256876641087e-05, | |
| "loss": 0.9992271423339844, | |
| "mean_token_accuracy": 0.7819159016013145, | |
| "num_tokens": 11257215.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 1.030208633840084, | |
| "epoch": 0.5184, | |
| "grad_norm": 0.34487199783325195, | |
| "learning_rate": 8.702648271265559e-05, | |
| "loss": 1.1172332763671875, | |
| "mean_token_accuracy": 0.7529917880892754, | |
| "num_tokens": 11302807.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 1.0103901624679565, | |
| "epoch": 0.5205333333333333, | |
| "grad_norm": 0.3081832230091095, | |
| "learning_rate": 8.690995302697081e-05, | |
| "loss": 1.1663932800292969, | |
| "mean_token_accuracy": 0.7516932919621467, | |
| "num_tokens": 11350668.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 1.0364744253456593, | |
| "epoch": 0.5226666666666666, | |
| "grad_norm": 0.24485838413238525, | |
| "learning_rate": 8.67929811055542e-05, | |
| "loss": 1.100508689880371, | |
| "mean_token_accuracy": 0.7447643153369427, | |
| "num_tokens": 11403181.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 0.8236982800066471, | |
| "epoch": 0.5248, | |
| "grad_norm": 0.28031599521636963, | |
| "learning_rate": 8.667556834990211e-05, | |
| "loss": 0.9254312515258789, | |
| "mean_token_accuracy": 0.7904586613178253, | |
| "num_tokens": 11443565.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 0.9475759953260422, | |
| "epoch": 0.5269333333333334, | |
| "grad_norm": 0.2664242088794708, | |
| "learning_rate": 8.65577161667927e-05, | |
| "loss": 1.0038702011108398, | |
| "mean_token_accuracy": 0.7648072987794876, | |
| "num_tokens": 11484816.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 0.9790533006191253, | |
| "epoch": 0.5290666666666667, | |
| "grad_norm": 0.2507703900337219, | |
| "learning_rate": 8.643942596826911e-05, | |
| "loss": 1.1305276870727539, | |
| "mean_token_accuracy": 0.7577844798564911, | |
| "num_tokens": 11526756.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 0.929996844381094, | |
| "epoch": 0.5312, | |
| "grad_norm": 0.29296958446502686, | |
| "learning_rate": 8.632069917162255e-05, | |
| "loss": 1.0086584091186523, | |
| "mean_token_accuracy": 0.7662723585963249, | |
| "num_tokens": 11568739.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 0.9251207195222377, | |
| "epoch": 0.5333333333333333, | |
| "grad_norm": 0.23416800796985626, | |
| "learning_rate": 8.620153719937535e-05, | |
| "loss": 0.9998083114624023, | |
| "mean_token_accuracy": 0.7651191413402557, | |
| "num_tokens": 11616235.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 0.9308743372559547, | |
| "epoch": 0.5354666666666666, | |
| "grad_norm": 0.29301849007606506, | |
| "learning_rate": 8.60819414792639e-05, | |
| "loss": 1.0315680503845215, | |
| "mean_token_accuracy": 0.7692675769329071, | |
| "num_tokens": 11666713.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 0.995804300904274, | |
| "epoch": 0.5376, | |
| "grad_norm": 0.3097708821296692, | |
| "learning_rate": 8.596191344422144e-05, | |
| "loss": 1.072645664215088, | |
| "mean_token_accuracy": 0.7509198278188706, | |
| "num_tokens": 11710341.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 0.8292181946337223, | |
| "epoch": 0.5397333333333333, | |
| "grad_norm": 0.2717600166797638, | |
| "learning_rate": 8.58414545323611e-05, | |
| "loss": 0.9124368667602539, | |
| "mean_token_accuracy": 0.789941257238388, | |
| "num_tokens": 11753282.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 1.0006135009229182, | |
| "epoch": 0.5418666666666667, | |
| "grad_norm": 0.2360389232635498, | |
| "learning_rate": 8.572056618695845e-05, | |
| "loss": 1.1389695167541505, | |
| "mean_token_accuracy": 0.7522218823432922, | |
| "num_tokens": 11801605.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 1.0214832991361618, | |
| "epoch": 0.544, | |
| "grad_norm": 0.265127569437027, | |
| "learning_rate": 8.559924985643436e-05, | |
| "loss": 1.1042506217956543, | |
| "mean_token_accuracy": 0.7507105216383934, | |
| "num_tokens": 11847802.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 0.9052864700555802, | |
| "epoch": 0.5461333333333334, | |
| "grad_norm": 0.21368557214736938, | |
| "learning_rate": 8.54775069943376e-05, | |
| "loss": 0.9432812690734863, | |
| "mean_token_accuracy": 0.7738461554050445, | |
| "num_tokens": 11895685.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 0.8330970570445061, | |
| "epoch": 0.5482666666666667, | |
| "grad_norm": 0.25935855507850647, | |
| "learning_rate": 8.535533905932738e-05, | |
| "loss": 0.9135859489440918, | |
| "mean_token_accuracy": 0.7793566673994065, | |
| "num_tokens": 11940774.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 0.941501996666193, | |
| "epoch": 0.5504, | |
| "grad_norm": 0.2179393619298935, | |
| "learning_rate": 8.523274751515595e-05, | |
| "loss": 0.9954432487487793, | |
| "mean_token_accuracy": 0.7663334146142006, | |
| "num_tokens": 11987918.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 0.8678958520293236, | |
| "epoch": 0.5525333333333333, | |
| "grad_norm": 0.28909558057785034, | |
| "learning_rate": 8.510973383065099e-05, | |
| "loss": 0.9430976867675781, | |
| "mean_token_accuracy": 0.7832803040742874, | |
| "num_tokens": 12032447.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 0.981315091997385, | |
| "epoch": 0.5546666666666666, | |
| "grad_norm": 0.25849565863609314, | |
| "learning_rate": 8.498629947969807e-05, | |
| "loss": 1.135009765625, | |
| "mean_token_accuracy": 0.7560776218771934, | |
| "num_tokens": 12082379.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 0.904555281996727, | |
| "epoch": 0.5568, | |
| "grad_norm": 0.2646128535270691, | |
| "learning_rate": 8.486244594122297e-05, | |
| "loss": 1.0139558792114258, | |
| "mean_token_accuracy": 0.7750694006681442, | |
| "num_tokens": 12125503.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 0.9893216907978057, | |
| "epoch": 0.5589333333333333, | |
| "grad_norm": 0.2604218125343323, | |
| "learning_rate": 8.47381746991739e-05, | |
| "loss": 1.040914249420166, | |
| "mean_token_accuracy": 0.7624668940901757, | |
| "num_tokens": 12177100.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 0.9657470785081387, | |
| "epoch": 0.5610666666666667, | |
| "grad_norm": 0.2379087507724762, | |
| "learning_rate": 8.461348724250384e-05, | |
| "loss": 1.075094223022461, | |
| "mean_token_accuracy": 0.7620343893766404, | |
| "num_tokens": 12223113.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 0.9550898216664792, | |
| "epoch": 0.5632, | |
| "grad_norm": 0.25011542439460754, | |
| "learning_rate": 8.448838506515262e-05, | |
| "loss": 0.9950971603393555, | |
| "mean_token_accuracy": 0.7567313954234123, | |
| "num_tokens": 12274057.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 0.9988928638398648, | |
| "epoch": 0.5653333333333334, | |
| "grad_norm": 0.25330716371536255, | |
| "learning_rate": 8.436286966602903e-05, | |
| "loss": 1.0961190223693849, | |
| "mean_token_accuracy": 0.7484898209571839, | |
| "num_tokens": 12319765.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 0.9943435691297055, | |
| "epoch": 0.5674666666666667, | |
| "grad_norm": 0.22841870784759521, | |
| "learning_rate": 8.423694254899283e-05, | |
| "loss": 1.0581014633178711, | |
| "mean_token_accuracy": 0.7553936064243316, | |
| "num_tokens": 12364825.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 0.8084396831691265, | |
| "epoch": 0.5696, | |
| "grad_norm": 0.2357935756444931, | |
| "learning_rate": 8.411060522283685e-05, | |
| "loss": 0.8782732963562012, | |
| "mean_token_accuracy": 0.7895583346486091, | |
| "num_tokens": 12406871.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 0.942859411239624, | |
| "epoch": 0.5717333333333333, | |
| "grad_norm": 0.27082499861717224, | |
| "learning_rate": 8.398385920126874e-05, | |
| "loss": 1.0009033203125, | |
| "mean_token_accuracy": 0.7650713473558426, | |
| "num_tokens": 12451365.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 0.8713853091001511, | |
| "epoch": 0.5738666666666666, | |
| "grad_norm": 0.2360270470380783, | |
| "learning_rate": 8.385670600289302e-05, | |
| "loss": 0.9726097106933593, | |
| "mean_token_accuracy": 0.7823046505451202, | |
| "num_tokens": 12496420.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 0.9863057106733322, | |
| "epoch": 0.576, | |
| "grad_norm": 0.2574012279510498, | |
| "learning_rate": 8.372914715119269e-05, | |
| "loss": 1.0647315979003906, | |
| "mean_token_accuracy": 0.7581366948783398, | |
| "num_tokens": 12541298.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 1.001559029519558, | |
| "epoch": 0.5781333333333334, | |
| "grad_norm": 0.3141133487224579, | |
| "learning_rate": 8.360118417451113e-05, | |
| "loss": 1.1159303665161133, | |
| "mean_token_accuracy": 0.7509155049920082, | |
| "num_tokens": 12586250.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 0.9664455614984035, | |
| "epoch": 0.5802666666666667, | |
| "grad_norm": 0.2820577323436737, | |
| "learning_rate": 8.347281860603375e-05, | |
| "loss": 1.0676399230957032, | |
| "mean_token_accuracy": 0.7622367069125175, | |
| "num_tokens": 12635296.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 0.9387885488569736, | |
| "epoch": 0.5824, | |
| "grad_norm": 0.3290941119194031, | |
| "learning_rate": 8.334405198376958e-05, | |
| "loss": 1.0030738830566406, | |
| "mean_token_accuracy": 0.7586753875017166, | |
| "num_tokens": 12681486.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 0.9214616276323795, | |
| "epoch": 0.5845333333333333, | |
| "grad_norm": 0.2720118761062622, | |
| "learning_rate": 8.321488585053285e-05, | |
| "loss": 1.0133691787719727, | |
| "mean_token_accuracy": 0.7658515647053719, | |
| "num_tokens": 12728360.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 1.02061934620142, | |
| "epoch": 0.5866666666666667, | |
| "grad_norm": 0.2634807229042053, | |
| "learning_rate": 8.308532175392456e-05, | |
| "loss": 1.098531150817871, | |
| "mean_token_accuracy": 0.7488874278962612, | |
| "num_tokens": 12776988.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 0.8518887549638748, | |
| "epoch": 0.5888, | |
| "grad_norm": 0.2686164677143097, | |
| "learning_rate": 8.295536124631385e-05, | |
| "loss": 0.9314091682434082, | |
| "mean_token_accuracy": 0.7844551488757133, | |
| "num_tokens": 12821354.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 1.0409190684556962, | |
| "epoch": 0.5909333333333333, | |
| "grad_norm": 0.211833655834198, | |
| "learning_rate": 8.28250058848195e-05, | |
| "loss": 1.101518440246582, | |
| "mean_token_accuracy": 0.7454183496534824, | |
| "num_tokens": 12874281.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 0.8854779146611691, | |
| "epoch": 0.5930666666666666, | |
| "grad_norm": 0.19881795346736908, | |
| "learning_rate": 8.26942572312912e-05, | |
| "loss": 0.9492866516113281, | |
| "mean_token_accuracy": 0.7723072916269302, | |
| "num_tokens": 12921599.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 0.9700805857777596, | |
| "epoch": 0.5952, | |
| "grad_norm": 0.27477312088012695, | |
| "learning_rate": 8.256311685229085e-05, | |
| "loss": 1.1071263313293458, | |
| "mean_token_accuracy": 0.7538750320672989, | |
| "num_tokens": 12969810.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 0.9516462504863739, | |
| "epoch": 0.5973333333333334, | |
| "grad_norm": 0.27765893936157227, | |
| "learning_rate": 8.243158631907382e-05, | |
| "loss": 1.0368030548095704, | |
| "mean_token_accuracy": 0.7694585740566253, | |
| "num_tokens": 13015439.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 1.0109743446111679, | |
| "epoch": 0.5994666666666667, | |
| "grad_norm": 0.299020379781723, | |
| "learning_rate": 8.229966720757007e-05, | |
| "loss": 1.1124341011047363, | |
| "mean_token_accuracy": 0.7469129160046577, | |
| "num_tokens": 13064583.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 1.0561771862208844, | |
| "epoch": 0.6016, | |
| "grad_norm": 0.2526226043701172, | |
| "learning_rate": 8.216736109836534e-05, | |
| "loss": 1.1680998802185059, | |
| "mean_token_accuracy": 0.7392091482877732, | |
| "num_tokens": 13115184.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 0.9755672253668308, | |
| "epoch": 0.6037333333333333, | |
| "grad_norm": 0.24116092920303345, | |
| "learning_rate": 8.203466957668215e-05, | |
| "loss": 1.0671576499938964, | |
| "mean_token_accuracy": 0.7598815195262432, | |
| "num_tokens": 13163748.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 0.8110849797725678, | |
| "epoch": 0.6058666666666667, | |
| "grad_norm": 0.3148553967475891, | |
| "learning_rate": 8.190159423236086e-05, | |
| "loss": 0.8950259208679199, | |
| "mean_token_accuracy": 0.7862283095717431, | |
| "num_tokens": 13204391.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 0.829298897087574, | |
| "epoch": 0.608, | |
| "grad_norm": 0.2556048631668091, | |
| "learning_rate": 8.176813665984053e-05, | |
| "loss": 0.8883259773254395, | |
| "mean_token_accuracy": 0.789163002371788, | |
| "num_tokens": 13244838.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 0.9395963847637177, | |
| "epoch": 0.6101333333333333, | |
| "grad_norm": 0.19703006744384766, | |
| "learning_rate": 8.163429845813997e-05, | |
| "loss": 1.0494510650634765, | |
| "mean_token_accuracy": 0.7710079193115235, | |
| "num_tokens": 13290932.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 0.9920587949454784, | |
| "epoch": 0.6122666666666666, | |
| "grad_norm": 0.2381218671798706, | |
| "learning_rate": 8.150008123083838e-05, | |
| "loss": 1.0494998931884765, | |
| "mean_token_accuracy": 0.7526131421327591, | |
| "num_tokens": 13333787.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 0.9984497465193272, | |
| "epoch": 0.6144, | |
| "grad_norm": 0.25819751620292664, | |
| "learning_rate": 8.136548658605635e-05, | |
| "loss": 1.1107137680053711, | |
| "mean_token_accuracy": 0.7557663440704345, | |
| "num_tokens": 13382126.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 0.9907154351472854, | |
| "epoch": 0.6165333333333334, | |
| "grad_norm": 0.2328466922044754, | |
| "learning_rate": 8.123051613643641e-05, | |
| "loss": 1.1184075355529786, | |
| "mean_token_accuracy": 0.7595549002289772, | |
| "num_tokens": 13430083.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 0.9244011230766773, | |
| "epoch": 0.6186666666666667, | |
| "grad_norm": 0.24781359732151031, | |
| "learning_rate": 8.109517149912386e-05, | |
| "loss": 1.017502498626709, | |
| "mean_token_accuracy": 0.7722871780395508, | |
| "num_tokens": 13478876.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 0.8886970773339271, | |
| "epoch": 0.6208, | |
| "grad_norm": 0.2412341833114624, | |
| "learning_rate": 8.095945429574724e-05, | |
| "loss": 0.9119473457336426, | |
| "mean_token_accuracy": 0.7751852914690971, | |
| "num_tokens": 13527978.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 1.040999775379896, | |
| "epoch": 0.6229333333333333, | |
| "grad_norm": 0.2708323895931244, | |
| "learning_rate": 8.082336615239903e-05, | |
| "loss": 1.1017963409423828, | |
| "mean_token_accuracy": 0.7445731669664383, | |
| "num_tokens": 13579308.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 1.0086095616221429, | |
| "epoch": 0.6250666666666667, | |
| "grad_norm": 0.2506955564022064, | |
| "learning_rate": 8.068690869961613e-05, | |
| "loss": 1.1194355964660645, | |
| "mean_token_accuracy": 0.7530581071972847, | |
| "num_tokens": 13632480.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 0.9920367047190666, | |
| "epoch": 0.6272, | |
| "grad_norm": 0.28143101930618286, | |
| "learning_rate": 8.055008357236027e-05, | |
| "loss": 1.0880350112915038, | |
| "mean_token_accuracy": 0.7523079156875611, | |
| "num_tokens": 13683250.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 0.947841040790081, | |
| "epoch": 0.6293333333333333, | |
| "grad_norm": 0.34841635823249817, | |
| "learning_rate": 8.04128924099985e-05, | |
| "loss": 1.013569164276123, | |
| "mean_token_accuracy": 0.7690569952130317, | |
| "num_tokens": 13724761.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 0.8923015877604484, | |
| "epoch": 0.6314666666666666, | |
| "grad_norm": 0.24537858366966248, | |
| "learning_rate": 8.027533685628348e-05, | |
| "loss": 0.9606434822082519, | |
| "mean_token_accuracy": 0.7777309969067574, | |
| "num_tokens": 13771701.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 1.082998887449503, | |
| "epoch": 0.6336, | |
| "grad_norm": 0.2772109806537628, | |
| "learning_rate": 8.013741855933386e-05, | |
| "loss": 1.155489444732666, | |
| "mean_token_accuracy": 0.7356668919324875, | |
| "num_tokens": 13824969.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 1.0548067845404148, | |
| "epoch": 0.6357333333333334, | |
| "grad_norm": 0.2706131041049957, | |
| "learning_rate": 7.999913917161446e-05, | |
| "loss": 1.1606884002685547, | |
| "mean_token_accuracy": 0.7461161836981773, | |
| "num_tokens": 13879673.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 0.9122042678296566, | |
| "epoch": 0.6378666666666667, | |
| "grad_norm": 0.28579071164131165, | |
| "learning_rate": 7.986050034991646e-05, | |
| "loss": 1.0014433860778809, | |
| "mean_token_accuracy": 0.7702639386057853, | |
| "num_tokens": 13923893.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 0.856528140604496, | |
| "epoch": 0.64, | |
| "grad_norm": 0.2646186351776123, | |
| "learning_rate": 7.972150375533767e-05, | |
| "loss": 0.9789193153381348, | |
| "mean_token_accuracy": 0.7824795439839363, | |
| "num_tokens": 13967914.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.013469608873129, | |
| "epoch": 0.6421333333333333, | |
| "grad_norm": 0.2540909945964813, | |
| "learning_rate": 7.958215105326252e-05, | |
| "loss": 1.1425801277160645, | |
| "mean_token_accuracy": 0.7503237001597881, | |
| "num_tokens": 14016335.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 0.9561307951807976, | |
| "epoch": 0.6442666666666667, | |
| "grad_norm": 0.2495027333498001, | |
| "learning_rate": 7.94424439133421e-05, | |
| "loss": 1.0421770095825196, | |
| "mean_token_accuracy": 0.7604482308030128, | |
| "num_tokens": 14060745.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 0.9330584339797496, | |
| "epoch": 0.6464, | |
| "grad_norm": 0.26480352878570557, | |
| "learning_rate": 7.930238400947422e-05, | |
| "loss": 1.0622355461120605, | |
| "mean_token_accuracy": 0.7683120101690293, | |
| "num_tokens": 14108255.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 0.8226673573255538, | |
| "epoch": 0.6485333333333333, | |
| "grad_norm": 0.2883199453353882, | |
| "learning_rate": 7.916197301978331e-05, | |
| "loss": 0.8736177444458008, | |
| "mean_token_accuracy": 0.7835568472743034, | |
| "num_tokens": 14151595.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 1.0103112280368804, | |
| "epoch": 0.6506666666666666, | |
| "grad_norm": 0.2573588788509369, | |
| "learning_rate": 7.902121262660036e-05, | |
| "loss": 1.1782626152038573, | |
| "mean_token_accuracy": 0.7547322385013103, | |
| "num_tokens": 14198658.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 0.9194101721048356, | |
| "epoch": 0.6528, | |
| "grad_norm": 0.22869926691055298, | |
| "learning_rate": 7.888010451644265e-05, | |
| "loss": 0.96375732421875, | |
| "mean_token_accuracy": 0.7731851547956466, | |
| "num_tokens": 14243252.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 0.927897697687149, | |
| "epoch": 0.6549333333333334, | |
| "grad_norm": 0.32361456751823425, | |
| "learning_rate": 7.873865037999373e-05, | |
| "loss": 1.0542486190795899, | |
| "mean_token_accuracy": 0.7636147439479828, | |
| "num_tokens": 14290318.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 0.8857385322451592, | |
| "epoch": 0.6570666666666667, | |
| "grad_norm": 0.25951746106147766, | |
| "learning_rate": 7.859685191208297e-05, | |
| "loss": 0.9199460983276367, | |
| "mean_token_accuracy": 0.7751095175743103, | |
| "num_tokens": 14341937.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 0.9319920368492604, | |
| "epoch": 0.6592, | |
| "grad_norm": 0.22098122537136078, | |
| "learning_rate": 7.845471081166535e-05, | |
| "loss": 1.057561206817627, | |
| "mean_token_accuracy": 0.763427771627903, | |
| "num_tokens": 14388811.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 0.9401551052927971, | |
| "epoch": 0.6613333333333333, | |
| "grad_norm": 0.25181668996810913, | |
| "learning_rate": 7.831222878180115e-05, | |
| "loss": 1.0170879364013672, | |
| "mean_token_accuracy": 0.7671449035406113, | |
| "num_tokens": 14432608.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 0.9817736372351646, | |
| "epoch": 0.6634666666666666, | |
| "grad_norm": 0.25245943665504456, | |
| "learning_rate": 7.816940752963543e-05, | |
| "loss": 1.1231375694274903, | |
| "mean_token_accuracy": 0.7525465905666351, | |
| "num_tokens": 14483062.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 1.032941934466362, | |
| "epoch": 0.6656, | |
| "grad_norm": 0.255884051322937, | |
| "learning_rate": 7.80262487663777e-05, | |
| "loss": 1.1379814147949219, | |
| "mean_token_accuracy": 0.7467011958360672, | |
| "num_tokens": 14526227.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 0.850496319681406, | |
| "epoch": 0.6677333333333333, | |
| "grad_norm": 0.37918779253959656, | |
| "learning_rate": 7.788275420728123e-05, | |
| "loss": 0.914525032043457, | |
| "mean_token_accuracy": 0.7852855637669564, | |
| "num_tokens": 14566458.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 1.002537302672863, | |
| "epoch": 0.6698666666666667, | |
| "grad_norm": 0.21786057949066162, | |
| "learning_rate": 7.773892557162274e-05, | |
| "loss": 1.063007640838623, | |
| "mean_token_accuracy": 0.7521986544132233, | |
| "num_tokens": 14620140.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 0.8929514214396477, | |
| "epoch": 0.672, | |
| "grad_norm": 0.24498853087425232, | |
| "learning_rate": 7.759476458268165e-05, | |
| "loss": 0.9452738761901855, | |
| "mean_token_accuracy": 0.7730352610349656, | |
| "num_tokens": 14664132.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 0.9587577134370804, | |
| "epoch": 0.6741333333333334, | |
| "grad_norm": 0.2750677168369293, | |
| "learning_rate": 7.74502729677194e-05, | |
| "loss": 1.1020426750183105, | |
| "mean_token_accuracy": 0.7670308589935303, | |
| "num_tokens": 14711920.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 0.9528509542346001, | |
| "epoch": 0.6762666666666667, | |
| "grad_norm": 0.215078204870224, | |
| "learning_rate": 7.730545245795891e-05, | |
| "loss": 0.9985583305358887, | |
| "mean_token_accuracy": 0.7616261839866638, | |
| "num_tokens": 14759139.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 0.9082593686878682, | |
| "epoch": 0.6784, | |
| "grad_norm": 0.23811033368110657, | |
| "learning_rate": 7.71603047885637e-05, | |
| "loss": 1.0061087608337402, | |
| "mean_token_accuracy": 0.7741461530327797, | |
| "num_tokens": 14803777.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 0.9152800880372525, | |
| "epoch": 0.6805333333333333, | |
| "grad_norm": 0.25680598616600037, | |
| "learning_rate": 7.701483169861713e-05, | |
| "loss": 0.9781417846679688, | |
| "mean_token_accuracy": 0.7678594440221786, | |
| "num_tokens": 14851182.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 0.7769002497196198, | |
| "epoch": 0.6826666666666666, | |
| "grad_norm": 0.29509180784225464, | |
| "learning_rate": 7.68690349311016e-05, | |
| "loss": 0.8263790130615234, | |
| "mean_token_accuracy": 0.8022488832473755, | |
| "num_tokens": 14891820.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.0069321602582932, | |
| "epoch": 0.6848, | |
| "grad_norm": 0.27509671449661255, | |
| "learning_rate": 7.672291623287766e-05, | |
| "loss": 1.1010238647460937, | |
| "mean_token_accuracy": 0.754035946726799, | |
| "num_tokens": 14942310.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 0.9065248876810074, | |
| "epoch": 0.6869333333333333, | |
| "grad_norm": 0.22744986414909363, | |
| "learning_rate": 7.657647735466302e-05, | |
| "loss": 0.9641946792602539, | |
| "mean_token_accuracy": 0.772594378888607, | |
| "num_tokens": 14986110.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 0.95280120074749, | |
| "epoch": 0.6890666666666667, | |
| "grad_norm": 0.24981571733951569, | |
| "learning_rate": 7.642972005101168e-05, | |
| "loss": 1.054494857788086, | |
| "mean_token_accuracy": 0.7634354814887047, | |
| "num_tokens": 15031665.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 0.930540493875742, | |
| "epoch": 0.6912, | |
| "grad_norm": 0.2832178473472595, | |
| "learning_rate": 7.628264608029277e-05, | |
| "loss": 1.0797764778137207, | |
| "mean_token_accuracy": 0.768897658586502, | |
| "num_tokens": 15075297.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 0.8848231807351112, | |
| "epoch": 0.6933333333333334, | |
| "grad_norm": 0.3329085111618042, | |
| "learning_rate": 7.613525720466965e-05, | |
| "loss": 0.9568095207214355, | |
| "mean_token_accuracy": 0.7773910194635392, | |
| "num_tokens": 15117055.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 0.9285655155777931, | |
| "epoch": 0.6954666666666667, | |
| "grad_norm": 0.24558193981647491, | |
| "learning_rate": 7.59875551900786e-05, | |
| "loss": 1.037491226196289, | |
| "mean_token_accuracy": 0.7648348346352577, | |
| "num_tokens": 15164096.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 1.0723001688718796, | |
| "epoch": 0.6976, | |
| "grad_norm": 0.3482857644557953, | |
| "learning_rate": 7.58395418062079e-05, | |
| "loss": 1.147115993499756, | |
| "mean_token_accuracy": 0.7382091999053955, | |
| "num_tokens": 15212178.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 1.099733480066061, | |
| "epoch": 0.6997333333333333, | |
| "grad_norm": 0.26093462109565735, | |
| "learning_rate": 7.569121882647634e-05, | |
| "loss": 1.2072590827941894, | |
| "mean_token_accuracy": 0.7411856979131699, | |
| "num_tokens": 15261433.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 0.8516895264387131, | |
| "epoch": 0.7018666666666666, | |
| "grad_norm": 0.30177560448646545, | |
| "learning_rate": 7.554258802801226e-05, | |
| "loss": 0.9454229354858399, | |
| "mean_token_accuracy": 0.7824687540531159, | |
| "num_tokens": 15302428.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 0.9096255600452423, | |
| "epoch": 0.704, | |
| "grad_norm": 0.2182462513446808, | |
| "learning_rate": 7.539365119163204e-05, | |
| "loss": 0.9878718376159668, | |
| "mean_token_accuracy": 0.7683326050639152, | |
| "num_tokens": 15350117.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.0591608263552188, | |
| "epoch": 0.7061333333333333, | |
| "grad_norm": 0.28637412190437317, | |
| "learning_rate": 7.524441010181889e-05, | |
| "loss": 1.1826082229614259, | |
| "mean_token_accuracy": 0.7397180199623108, | |
| "num_tokens": 15404947.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 1.0128936409950255, | |
| "epoch": 0.7082666666666667, | |
| "grad_norm": 0.2553557753562927, | |
| "learning_rate": 7.509486654670137e-05, | |
| "loss": 1.0848438262939453, | |
| "mean_token_accuracy": 0.752461838722229, | |
| "num_tokens": 15454949.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 1.0468340575695039, | |
| "epoch": 0.7104, | |
| "grad_norm": 0.29637107253074646, | |
| "learning_rate": 7.494502231803211e-05, | |
| "loss": 1.148671531677246, | |
| "mean_token_accuracy": 0.7463315047323704, | |
| "num_tokens": 15507585.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 0.9354016333818436, | |
| "epoch": 0.7125333333333334, | |
| "grad_norm": 0.2948022782802582, | |
| "learning_rate": 7.47948792111662e-05, | |
| "loss": 1.0402913093566895, | |
| "mean_token_accuracy": 0.7643374137580394, | |
| "num_tokens": 15554937.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 0.9632456421852111, | |
| "epoch": 0.7146666666666667, | |
| "grad_norm": 0.21984997391700745, | |
| "learning_rate": 7.464443902503968e-05, | |
| "loss": 1.0455013275146485, | |
| "mean_token_accuracy": 0.7605102032423019, | |
| "num_tokens": 15605470.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 0.9598750308156013, | |
| "epoch": 0.7168, | |
| "grad_norm": 0.2748892903327942, | |
| "learning_rate": 7.449370356214814e-05, | |
| "loss": 1.0057992935180664, | |
| "mean_token_accuracy": 0.7655998513102531, | |
| "num_tokens": 15652544.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 0.8534669198095799, | |
| "epoch": 0.7189333333333333, | |
| "grad_norm": 0.20464476943016052, | |
| "learning_rate": 7.434267462852496e-05, | |
| "loss": 0.9573710441589356, | |
| "mean_token_accuracy": 0.7839296951889991, | |
| "num_tokens": 15691801.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 0.9314685501158237, | |
| "epoch": 0.7210666666666666, | |
| "grad_norm": 0.2979079782962799, | |
| "learning_rate": 7.419135403371976e-05, | |
| "loss": 1.051080322265625, | |
| "mean_token_accuracy": 0.7620216578245163, | |
| "num_tokens": 15736780.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 1.0298451118171215, | |
| "epoch": 0.7232, | |
| "grad_norm": 0.24461258947849274, | |
| "learning_rate": 7.403974359077664e-05, | |
| "loss": 1.080700397491455, | |
| "mean_token_accuracy": 0.7475218966603279, | |
| "num_tokens": 15789887.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 1.0159111820161342, | |
| "epoch": 0.7253333333333334, | |
| "grad_norm": 0.2403489053249359, | |
| "learning_rate": 7.38878451162126e-05, | |
| "loss": 1.1378083229064941, | |
| "mean_token_accuracy": 0.7544682174921036, | |
| "num_tokens": 15837848.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 0.9885091617703438, | |
| "epoch": 0.7274666666666667, | |
| "grad_norm": 0.31231454014778137, | |
| "learning_rate": 7.373566042999559e-05, | |
| "loss": 1.0966137886047362, | |
| "mean_token_accuracy": 0.7566501721739769, | |
| "num_tokens": 15885904.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 1.0563621714711189, | |
| "epoch": 0.7296, | |
| "grad_norm": 0.3054843246936798, | |
| "learning_rate": 7.358319135552285e-05, | |
| "loss": 1.189220142364502, | |
| "mean_token_accuracy": 0.738009649515152, | |
| "num_tokens": 15937698.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 0.9974006466567517, | |
| "epoch": 0.7317333333333333, | |
| "grad_norm": 0.23932306468486786, | |
| "learning_rate": 7.343043971959902e-05, | |
| "loss": 1.0580031394958496, | |
| "mean_token_accuracy": 0.7584069922566414, | |
| "num_tokens": 15984086.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 0.9371322847902774, | |
| "epoch": 0.7338666666666667, | |
| "grad_norm": 0.21763980388641357, | |
| "learning_rate": 7.32774073524142e-05, | |
| "loss": 1.0160024642944336, | |
| "mean_token_accuracy": 0.7715103484690189, | |
| "num_tokens": 16029965.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 0.9526532724499702, | |
| "epoch": 0.736, | |
| "grad_norm": 0.24097904562950134, | |
| "learning_rate": 7.312409608752208e-05, | |
| "loss": 1.0411754608154298, | |
| "mean_token_accuracy": 0.7602859303355217, | |
| "num_tokens": 16078997.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 0.9305942483246327, | |
| "epoch": 0.7381333333333333, | |
| "grad_norm": 0.25894254446029663, | |
| "learning_rate": 7.2970507761818e-05, | |
| "loss": 0.9928631782531738, | |
| "mean_token_accuracy": 0.7672612771391869, | |
| "num_tokens": 16128753.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 0.9178998105227947, | |
| "epoch": 0.7402666666666666, | |
| "grad_norm": 0.24847134947776794, | |
| "learning_rate": 7.281664421551684e-05, | |
| "loss": 1.0369275093078614, | |
| "mean_token_accuracy": 0.7686163082718849, | |
| "num_tokens": 16169199.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 1.089708861708641, | |
| "epoch": 0.7424, | |
| "grad_norm": 0.2787526547908783, | |
| "learning_rate": 7.266250729213105e-05, | |
| "loss": 1.177119255065918, | |
| "mean_token_accuracy": 0.7344872549176216, | |
| "num_tokens": 16218140.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 0.9035327732563019, | |
| "epoch": 0.7445333333333334, | |
| "grad_norm": 0.35477346181869507, | |
| "learning_rate": 7.250809883844855e-05, | |
| "loss": 1.0146682739257813, | |
| "mean_token_accuracy": 0.7716259181499481, | |
| "num_tokens": 16261629.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 0.9472721114754676, | |
| "epoch": 0.7466666666666667, | |
| "grad_norm": 0.2600723206996918, | |
| "learning_rate": 7.235342070451059e-05, | |
| "loss": 1.0361743927001954, | |
| "mean_token_accuracy": 0.761479677259922, | |
| "num_tokens": 16308149.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.0926350936293603, | |
| "epoch": 0.7488, | |
| "grad_norm": 0.25519415736198425, | |
| "learning_rate": 7.219847474358959e-05, | |
| "loss": 1.1195724487304688, | |
| "mean_token_accuracy": 0.7408729113638401, | |
| "num_tokens": 16355489.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 0.9718878209590912, | |
| "epoch": 0.7509333333333333, | |
| "grad_norm": 0.3055277168750763, | |
| "learning_rate": 7.20432628121669e-05, | |
| "loss": 1.1054911613464355, | |
| "mean_token_accuracy": 0.7583063259720803, | |
| "num_tokens": 16400826.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 0.8583284638822078, | |
| "epoch": 0.7530666666666667, | |
| "grad_norm": 0.2846459448337555, | |
| "learning_rate": 7.188778676991064e-05, | |
| "loss": 0.914365577697754, | |
| "mean_token_accuracy": 0.7785162061452866, | |
| "num_tokens": 16445628.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 1.0267987482249736, | |
| "epoch": 0.7552, | |
| "grad_norm": 0.26450878381729126, | |
| "learning_rate": 7.173204847965333e-05, | |
| "loss": 1.1284149169921875, | |
| "mean_token_accuracy": 0.7466149963438511, | |
| "num_tokens": 16498069.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 0.932567299157381, | |
| "epoch": 0.7573333333333333, | |
| "grad_norm": 0.2745480239391327, | |
| "learning_rate": 7.157604980736962e-05, | |
| "loss": 1.02783260345459, | |
| "mean_token_accuracy": 0.7691405609250068, | |
| "num_tokens": 16546746.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 0.9001861125230789, | |
| "epoch": 0.7594666666666666, | |
| "grad_norm": 0.29825839400291443, | |
| "learning_rate": 7.141979262215396e-05, | |
| "loss": 1.0350588798522948, | |
| "mean_token_accuracy": 0.7682245895266533, | |
| "num_tokens": 16586157.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 0.8198081150650978, | |
| "epoch": 0.7616, | |
| "grad_norm": 0.3035356104373932, | |
| "learning_rate": 7.126327879619807e-05, | |
| "loss": 0.8880753517150879, | |
| "mean_token_accuracy": 0.7954168729484081, | |
| "num_tokens": 16623992.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 0.8905762024223804, | |
| "epoch": 0.7637333333333334, | |
| "grad_norm": 0.24514277279376984, | |
| "learning_rate": 7.110651020476873e-05, | |
| "loss": 0.9519443511962891, | |
| "mean_token_accuracy": 0.7789977833628654, | |
| "num_tokens": 16666011.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 0.8767322935163975, | |
| "epoch": 0.7658666666666667, | |
| "grad_norm": 0.30208712816238403, | |
| "learning_rate": 7.094948872618507e-05, | |
| "loss": 1.0047502517700195, | |
| "mean_token_accuracy": 0.7747991606593132, | |
| "num_tokens": 16709398.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 0.9047524333000183, | |
| "epoch": 0.768, | |
| "grad_norm": 0.23355495929718018, | |
| "learning_rate": 7.079221624179623e-05, | |
| "loss": 0.9877220153808594, | |
| "mean_token_accuracy": 0.7743734329938888, | |
| "num_tokens": 16759830.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 0.8517782382667065, | |
| "epoch": 0.7701333333333333, | |
| "grad_norm": 0.2146274745464325, | |
| "learning_rate": 7.063469463595884e-05, | |
| "loss": 0.9309274673461914, | |
| "mean_token_accuracy": 0.7834656447172165, | |
| "num_tokens": 16802813.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 1.0183790929615497, | |
| "epoch": 0.7722666666666667, | |
| "grad_norm": 0.24549080431461334, | |
| "learning_rate": 7.047692579601424e-05, | |
| "loss": 1.1990603446960448, | |
| "mean_token_accuracy": 0.7547581911087036, | |
| "num_tokens": 16850703.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 0.8650934003293514, | |
| "epoch": 0.7744, | |
| "grad_norm": 0.23581688106060028, | |
| "learning_rate": 7.031891161226608e-05, | |
| "loss": 0.9123600959777832, | |
| "mean_token_accuracy": 0.7830170378088951, | |
| "num_tokens": 16894959.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 0.951847655326128, | |
| "epoch": 0.7765333333333333, | |
| "grad_norm": 0.3015543818473816, | |
| "learning_rate": 7.016065397795758e-05, | |
| "loss": 1.062761116027832, | |
| "mean_token_accuracy": 0.7644057601690293, | |
| "num_tokens": 16939640.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 1.0231108613312245, | |
| "epoch": 0.7786666666666666, | |
| "grad_norm": 0.23233352601528168, | |
| "learning_rate": 7.000215478924887e-05, | |
| "loss": 1.1309197425842286, | |
| "mean_token_accuracy": 0.744163216650486, | |
| "num_tokens": 16999652.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 0.8934633955359459, | |
| "epoch": 0.7808, | |
| "grad_norm": 0.25434958934783936, | |
| "learning_rate": 6.984341594519421e-05, | |
| "loss": 1.0075945854187012, | |
| "mean_token_accuracy": 0.7736709147691727, | |
| "num_tokens": 17046141.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 1.0374766498804093, | |
| "epoch": 0.7829333333333334, | |
| "grad_norm": 0.2597425878047943, | |
| "learning_rate": 6.968443934771933e-05, | |
| "loss": 1.1429466247558593, | |
| "mean_token_accuracy": 0.7515291333198547, | |
| "num_tokens": 17096136.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 0.974248643219471, | |
| "epoch": 0.7850666666666667, | |
| "grad_norm": 0.22508691251277924, | |
| "learning_rate": 6.952522690159861e-05, | |
| "loss": 1.0584315299987792, | |
| "mean_token_accuracy": 0.7587296038866043, | |
| "num_tokens": 17144177.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 0.9973213374614716, | |
| "epoch": 0.7872, | |
| "grad_norm": 0.2418157011270523, | |
| "learning_rate": 6.936578051443219e-05, | |
| "loss": 1.1423637390136718, | |
| "mean_token_accuracy": 0.7530387908220291, | |
| "num_tokens": 17191263.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 0.8815206326544285, | |
| "epoch": 0.7893333333333333, | |
| "grad_norm": 0.28048619627952576, | |
| "learning_rate": 6.92061020966232e-05, | |
| "loss": 0.9586901664733887, | |
| "mean_token_accuracy": 0.7787635132670403, | |
| "num_tokens": 17235554.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 0.8578987941145897, | |
| "epoch": 0.7914666666666667, | |
| "grad_norm": 0.2751041352748871, | |
| "learning_rate": 6.904619356135484e-05, | |
| "loss": 0.9659609794616699, | |
| "mean_token_accuracy": 0.7794149458408356, | |
| "num_tokens": 17280617.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 0.8956944331526756, | |
| "epoch": 0.7936, | |
| "grad_norm": 0.23321297764778137, | |
| "learning_rate": 6.888605682456746e-05, | |
| "loss": 1.0033020973205566, | |
| "mean_token_accuracy": 0.7758402660489082, | |
| "num_tokens": 17326396.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 0.8936455383896827, | |
| "epoch": 0.7957333333333333, | |
| "grad_norm": 0.2756887674331665, | |
| "learning_rate": 6.87256938049356e-05, | |
| "loss": 0.9862062454223632, | |
| "mean_token_accuracy": 0.7779143631458283, | |
| "num_tokens": 17370948.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 0.9536221623420715, | |
| "epoch": 0.7978666666666666, | |
| "grad_norm": 0.2456534057855606, | |
| "learning_rate": 6.856510642384499e-05, | |
| "loss": 1.0342220306396483, | |
| "mean_token_accuracy": 0.7630825422704219, | |
| "num_tokens": 17421511.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 1.0029884599149228, | |
| "epoch": 0.8, | |
| "grad_norm": 0.23356133699417114, | |
| "learning_rate": 6.840429660536953e-05, | |
| "loss": 1.0578575134277344, | |
| "mean_token_accuracy": 0.7524963855743408, | |
| "num_tokens": 17474234.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 0.9401593998074531, | |
| "epoch": 0.8021333333333334, | |
| "grad_norm": 0.2025330811738968, | |
| "learning_rate": 6.82432662762483e-05, | |
| "loss": 1.0459843635559083, | |
| "mean_token_accuracy": 0.7684256717562675, | |
| "num_tokens": 17519106.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 0.9076099701225757, | |
| "epoch": 0.8042666666666667, | |
| "grad_norm": 0.28303810954093933, | |
| "learning_rate": 6.80820173658624e-05, | |
| "loss": 1.0061184883117675, | |
| "mean_token_accuracy": 0.774385878443718, | |
| "num_tokens": 17563524.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 0.9020247898995877, | |
| "epoch": 0.8064, | |
| "grad_norm": 0.25860196352005005, | |
| "learning_rate": 6.79205518062118e-05, | |
| "loss": 0.9976821899414062, | |
| "mean_token_accuracy": 0.774232342839241, | |
| "num_tokens": 17611723.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 0.9692328073084354, | |
| "epoch": 0.8085333333333333, | |
| "grad_norm": 0.24602019786834717, | |
| "learning_rate": 6.775887153189233e-05, | |
| "loss": 1.06738224029541, | |
| "mean_token_accuracy": 0.7612074792385102, | |
| "num_tokens": 17657838.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 0.9567753560841084, | |
| "epoch": 0.8106666666666666, | |
| "grad_norm": 0.2228858321905136, | |
| "learning_rate": 6.759697848007238e-05, | |
| "loss": 1.0671761512756348, | |
| "mean_token_accuracy": 0.7626087903976441, | |
| "num_tokens": 17705375.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.0885871052742004, | |
| "epoch": 0.8128, | |
| "grad_norm": 0.24882915616035461, | |
| "learning_rate": 6.743487459046971e-05, | |
| "loss": 1.1456743240356446, | |
| "mean_token_accuracy": 0.7413103066384792, | |
| "num_tokens": 17751890.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 0.9753526367247105, | |
| "epoch": 0.8149333333333333, | |
| "grad_norm": 0.32060110569000244, | |
| "learning_rate": 6.72725618053283e-05, | |
| "loss": 1.051304244995117, | |
| "mean_token_accuracy": 0.7579805940389633, | |
| "num_tokens": 17796312.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 1.0881080597639083, | |
| "epoch": 0.8170666666666667, | |
| "grad_norm": 0.2615782618522644, | |
| "learning_rate": 6.711004206939491e-05, | |
| "loss": 1.20444917678833, | |
| "mean_token_accuracy": 0.7393553704023361, | |
| "num_tokens": 17849058.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 0.9971455112099648, | |
| "epoch": 0.8192, | |
| "grad_norm": 0.28769299387931824, | |
| "learning_rate": 6.694731732989593e-05, | |
| "loss": 1.186737632751465, | |
| "mean_token_accuracy": 0.7578480765223503, | |
| "num_tokens": 17897760.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 0.8759050570428372, | |
| "epoch": 0.8213333333333334, | |
| "grad_norm": 0.27374961972236633, | |
| "learning_rate": 6.678438953651401e-05, | |
| "loss": 0.9380218505859375, | |
| "mean_token_accuracy": 0.7734792202711105, | |
| "num_tokens": 17939963.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 0.985682574659586, | |
| "epoch": 0.8234666666666667, | |
| "grad_norm": 0.26758262515068054, | |
| "learning_rate": 6.662126064136466e-05, | |
| "loss": 1.073539638519287, | |
| "mean_token_accuracy": 0.7554293870925903, | |
| "num_tokens": 17990860.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 0.9883775785565376, | |
| "epoch": 0.8256, | |
| "grad_norm": 0.273813396692276, | |
| "learning_rate": 6.645793259897288e-05, | |
| "loss": 1.1143600463867187, | |
| "mean_token_accuracy": 0.7514252230525017, | |
| "num_tokens": 18044403.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 0.9075474604964257, | |
| "epoch": 0.8277333333333333, | |
| "grad_norm": 0.28175604343414307, | |
| "learning_rate": 6.629440736624977e-05, | |
| "loss": 0.9921407699584961, | |
| "mean_token_accuracy": 0.7708318918943405, | |
| "num_tokens": 18090208.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 0.8513198003172875, | |
| "epoch": 0.8298666666666666, | |
| "grad_norm": 0.2545130252838135, | |
| "learning_rate": 6.613068690246905e-05, | |
| "loss": 0.9449549674987793, | |
| "mean_token_accuracy": 0.7845224231481552, | |
| "num_tokens": 18135064.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 1.0117795512080192, | |
| "epoch": 0.832, | |
| "grad_norm": 0.26254504919052124, | |
| "learning_rate": 6.596677316924355e-05, | |
| "loss": 1.1285503387451172, | |
| "mean_token_accuracy": 0.7520590081810952, | |
| "num_tokens": 18184374.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 1.0141760632395744, | |
| "epoch": 0.8341333333333333, | |
| "grad_norm": 0.29730224609375, | |
| "learning_rate": 6.580266813050187e-05, | |
| "loss": 1.107116985321045, | |
| "mean_token_accuracy": 0.7563492476940155, | |
| "num_tokens": 18226039.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 0.9843406617641449, | |
| "epoch": 0.8362666666666667, | |
| "grad_norm": 0.26783686876296997, | |
| "learning_rate": 6.563837375246463e-05, | |
| "loss": 1.0850018501281737, | |
| "mean_token_accuracy": 0.7563267104327679, | |
| "num_tokens": 18270937.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 0.9336299523711205, | |
| "epoch": 0.8384, | |
| "grad_norm": 0.25351837277412415, | |
| "learning_rate": 6.547389200362103e-05, | |
| "loss": 1.0218440055847169, | |
| "mean_token_accuracy": 0.767199169844389, | |
| "num_tokens": 18314733.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 0.9812062717974186, | |
| "epoch": 0.8405333333333334, | |
| "grad_norm": 0.2531512677669525, | |
| "learning_rate": 6.530922485470531e-05, | |
| "loss": 1.0596059799194335, | |
| "mean_token_accuracy": 0.764385013282299, | |
| "num_tokens": 18367778.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 0.9153544351458549, | |
| "epoch": 0.8426666666666667, | |
| "grad_norm": 0.40482011437416077, | |
| "learning_rate": 6.5144374278673e-05, | |
| "loss": 0.990359878540039, | |
| "mean_token_accuracy": 0.7689836576581002, | |
| "num_tokens": 18413756.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 0.965121752768755, | |
| "epoch": 0.8448, | |
| "grad_norm": 0.23541757464408875, | |
| "learning_rate": 6.497934225067736e-05, | |
| "loss": 1.0639681816101074, | |
| "mean_token_accuracy": 0.7629014477133751, | |
| "num_tokens": 18459252.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 0.9449482955038547, | |
| "epoch": 0.8469333333333333, | |
| "grad_norm": 0.38528165221214294, | |
| "learning_rate": 6.481413074804579e-05, | |
| "loss": 1.0649182319641113, | |
| "mean_token_accuracy": 0.7604210332036019, | |
| "num_tokens": 18508661.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 1.056539323925972, | |
| "epoch": 0.8490666666666666, | |
| "grad_norm": 0.24575304985046387, | |
| "learning_rate": 6.464874175025602e-05, | |
| "loss": 1.1153278350830078, | |
| "mean_token_accuracy": 0.7435829386115074, | |
| "num_tokens": 18560510.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 0.9280949518084526, | |
| "epoch": 0.8512, | |
| "grad_norm": 0.2419559508562088, | |
| "learning_rate": 6.448317723891237e-05, | |
| "loss": 1.0382183074951172, | |
| "mean_token_accuracy": 0.7698814913630485, | |
| "num_tokens": 18608764.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 0.9310038618743419, | |
| "epoch": 0.8533333333333334, | |
| "grad_norm": 0.3000566065311432, | |
| "learning_rate": 6.431743919772218e-05, | |
| "loss": 1.0243574142456056, | |
| "mean_token_accuracy": 0.7703951939940452, | |
| "num_tokens": 18653703.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 0.9708857566118241, | |
| "epoch": 0.8554666666666667, | |
| "grad_norm": 0.2541143000125885, | |
| "learning_rate": 6.415152961247186e-05, | |
| "loss": 1.07316312789917, | |
| "mean_token_accuracy": 0.752861674129963, | |
| "num_tokens": 18701666.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 0.9462185628712177, | |
| "epoch": 0.8576, | |
| "grad_norm": 0.23720327019691467, | |
| "learning_rate": 6.398545047100321e-05, | |
| "loss": 1.023193359375, | |
| "mean_token_accuracy": 0.7677365422248841, | |
| "num_tokens": 18749563.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 0.9408772967755794, | |
| "epoch": 0.8597333333333333, | |
| "grad_norm": 0.215755894780159, | |
| "learning_rate": 6.381920376318951e-05, | |
| "loss": 1.0272337913513183, | |
| "mean_token_accuracy": 0.7659091472625732, | |
| "num_tokens": 18793365.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 0.9391368016600609, | |
| "epoch": 0.8618666666666667, | |
| "grad_norm": 0.22395288944244385, | |
| "learning_rate": 6.365279148091182e-05, | |
| "loss": 1.0316532135009766, | |
| "mean_token_accuracy": 0.7643599301576615, | |
| "num_tokens": 18843087.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 0.9580571033060551, | |
| "epoch": 0.864, | |
| "grad_norm": 0.2631727159023285, | |
| "learning_rate": 6.348621561803495e-05, | |
| "loss": 1.0001374244689942, | |
| "mean_token_accuracy": 0.7608293548226357, | |
| "num_tokens": 18891900.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 0.9185742639005184, | |
| "epoch": 0.8661333333333333, | |
| "grad_norm": 0.2510085701942444, | |
| "learning_rate": 6.331947817038367e-05, | |
| "loss": 0.9962324142456055, | |
| "mean_token_accuracy": 0.7723157353699207, | |
| "num_tokens": 18938986.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 0.9397155575454235, | |
| "epoch": 0.8682666666666666, | |
| "grad_norm": 0.2758144736289978, | |
| "learning_rate": 6.315258113571876e-05, | |
| "loss": 1.0858741760253907, | |
| "mean_token_accuracy": 0.7691247522830963, | |
| "num_tokens": 18984144.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 0.9895228892564774, | |
| "epoch": 0.8704, | |
| "grad_norm": 0.2775322198867798, | |
| "learning_rate": 6.298552651371316e-05, | |
| "loss": 1.10516300201416, | |
| "mean_token_accuracy": 0.7543898217380047, | |
| "num_tokens": 19027278.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 0.889600582420826, | |
| "epoch": 0.8725333333333334, | |
| "grad_norm": 0.2604535222053528, | |
| "learning_rate": 6.281831630592783e-05, | |
| "loss": 1.0391739845275878, | |
| "mean_token_accuracy": 0.7718042567372322, | |
| "num_tokens": 19073552.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 0.8700525127351284, | |
| "epoch": 0.8746666666666667, | |
| "grad_norm": 0.26574060320854187, | |
| "learning_rate": 6.265095251578796e-05, | |
| "loss": 0.9732645988464356, | |
| "mean_token_accuracy": 0.7781016409397126, | |
| "num_tokens": 19112840.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 0.8111571930348873, | |
| "epoch": 0.8768, | |
| "grad_norm": 0.24323873221874237, | |
| "learning_rate": 6.248343714855884e-05, | |
| "loss": 0.8503658294677734, | |
| "mean_token_accuracy": 0.7953767567873001, | |
| "num_tokens": 19155980.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 1.0650635436177254, | |
| "epoch": 0.8789333333333333, | |
| "grad_norm": 0.22809672355651855, | |
| "learning_rate": 6.23157722113219e-05, | |
| "loss": 1.208934211730957, | |
| "mean_token_accuracy": 0.7424666874110699, | |
| "num_tokens": 19206121.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 0.9243351340293884, | |
| "epoch": 0.8810666666666667, | |
| "grad_norm": 0.29001903533935547, | |
| "learning_rate": 6.214795971295063e-05, | |
| "loss": 0.9857352256774903, | |
| "mean_token_accuracy": 0.771124804764986, | |
| "num_tokens": 19252396.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 0.7921540692448616, | |
| "epoch": 0.8832, | |
| "grad_norm": 0.3147003650665283, | |
| "learning_rate": 6.198000166408651e-05, | |
| "loss": 0.8609647750854492, | |
| "mean_token_accuracy": 0.7940000563859939, | |
| "num_tokens": 19293212.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 0.8676056079566479, | |
| "epoch": 0.8853333333333333, | |
| "grad_norm": 0.307106614112854, | |
| "learning_rate": 6.181190007711497e-05, | |
| "loss": 0.9692766189575195, | |
| "mean_token_accuracy": 0.7774873107671738, | |
| "num_tokens": 19339257.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 1.0086314499378204, | |
| "epoch": 0.8874666666666666, | |
| "grad_norm": 0.24890249967575073, | |
| "learning_rate": 6.16436569661412e-05, | |
| "loss": 1.1359784126281738, | |
| "mean_token_accuracy": 0.7473259434103966, | |
| "num_tokens": 19389016.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 0.913795480877161, | |
| "epoch": 0.8896, | |
| "grad_norm": 0.27178287506103516, | |
| "learning_rate": 6.147527434696606e-05, | |
| "loss": 0.976069450378418, | |
| "mean_token_accuracy": 0.7722298249602317, | |
| "num_tokens": 19431392.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 0.874677724391222, | |
| "epoch": 0.8917333333333334, | |
| "grad_norm": 0.3088259994983673, | |
| "learning_rate": 6.130675423706191e-05, | |
| "loss": 0.9541938781738282, | |
| "mean_token_accuracy": 0.780048543214798, | |
| "num_tokens": 19476928.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 0.9338957525789737, | |
| "epoch": 0.8938666666666667, | |
| "grad_norm": 0.3184056282043457, | |
| "learning_rate": 6.113809865554853e-05, | |
| "loss": 1.0237668991088866, | |
| "mean_token_accuracy": 0.7652892589569091, | |
| "num_tokens": 19522534.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 0.8182342648506165, | |
| "epoch": 0.896, | |
| "grad_norm": 0.25737565755844116, | |
| "learning_rate": 6.0969309623168736e-05, | |
| "loss": 0.9096416473388672, | |
| "mean_token_accuracy": 0.7868136927485466, | |
| "num_tokens": 19564324.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 0.9676721416413784, | |
| "epoch": 0.8981333333333333, | |
| "grad_norm": 0.3115193843841553, | |
| "learning_rate": 6.080038916226436e-05, | |
| "loss": 1.0527458190917969, | |
| "mean_token_accuracy": 0.7573085993528366, | |
| "num_tokens": 19606557.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 0.9689052537083626, | |
| "epoch": 0.9002666666666667, | |
| "grad_norm": 0.24019014835357666, | |
| "learning_rate": 6.063133929675193e-05, | |
| "loss": 1.0610797882080079, | |
| "mean_token_accuracy": 0.7604381129145622, | |
| "num_tokens": 19648234.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 0.9963471919298172, | |
| "epoch": 0.9024, | |
| "grad_norm": 0.22769565880298615, | |
| "learning_rate": 6.046216205209842e-05, | |
| "loss": 1.1422395706176758, | |
| "mean_token_accuracy": 0.7552115090191365, | |
| "num_tokens": 19697922.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 0.8659336932003499, | |
| "epoch": 0.9045333333333333, | |
| "grad_norm": 0.33948129415512085, | |
| "learning_rate": 6.029285945529699e-05, | |
| "loss": 0.9698437690734864, | |
| "mean_token_accuracy": 0.7807855561375618, | |
| "num_tokens": 19742041.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 0.9663250721991062, | |
| "epoch": 0.9066666666666666, | |
| "grad_norm": 0.2511708438396454, | |
| "learning_rate": 6.012343353484271e-05, | |
| "loss": 1.0937541007995606, | |
| "mean_token_accuracy": 0.7600424617528916, | |
| "num_tokens": 19789800.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 1.019892977923155, | |
| "epoch": 0.9088, | |
| "grad_norm": 0.23841184377670288, | |
| "learning_rate": 5.995388632070827e-05, | |
| "loss": 1.0938913345336914, | |
| "mean_token_accuracy": 0.7473356157541275, | |
| "num_tokens": 19839462.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 1.0092505671083927, | |
| "epoch": 0.9109333333333334, | |
| "grad_norm": 0.23689568042755127, | |
| "learning_rate": 5.978421984431959e-05, | |
| "loss": 1.110377597808838, | |
| "mean_token_accuracy": 0.7536325603723526, | |
| "num_tokens": 19888786.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 0.8563631072640419, | |
| "epoch": 0.9130666666666667, | |
| "grad_norm": 0.2725948393344879, | |
| "learning_rate": 5.961443613853157e-05, | |
| "loss": 0.9641815185546875, | |
| "mean_token_accuracy": 0.7762529909610748, | |
| "num_tokens": 19931094.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 0.9774221003055572, | |
| "epoch": 0.9152, | |
| "grad_norm": 0.23440679907798767, | |
| "learning_rate": 5.944453723760367e-05, | |
| "loss": 1.0834471702575683, | |
| "mean_token_accuracy": 0.7571895673871041, | |
| "num_tokens": 19976730.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 0.9086124181747437, | |
| "epoch": 0.9173333333333333, | |
| "grad_norm": 0.331281453371048, | |
| "learning_rate": 5.927452517717558e-05, | |
| "loss": 1.0120928764343262, | |
| "mean_token_accuracy": 0.7699793577194214, | |
| "num_tokens": 20021630.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 0.930675259232521, | |
| "epoch": 0.9194666666666667, | |
| "grad_norm": 0.23214256763458252, | |
| "learning_rate": 5.9104401994242786e-05, | |
| "loss": 1.0291691780090333, | |
| "mean_token_accuracy": 0.7654816180467605, | |
| "num_tokens": 20070846.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 0.9637592114508152, | |
| "epoch": 0.9216, | |
| "grad_norm": 0.23460128903388977, | |
| "learning_rate": 5.893416972713217e-05, | |
| "loss": 1.0039209365844726, | |
| "mean_token_accuracy": 0.763984477519989, | |
| "num_tokens": 20116649.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 0.95065303966403, | |
| "epoch": 0.9237333333333333, | |
| "grad_norm": 0.26842495799064636, | |
| "learning_rate": 5.8763830415477674e-05, | |
| "loss": 1.0108171463012696, | |
| "mean_token_accuracy": 0.7639551430940628, | |
| "num_tokens": 20161155.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 0.8422643013298512, | |
| "epoch": 0.9258666666666666, | |
| "grad_norm": 0.29824700951576233, | |
| "learning_rate": 5.85933861001957e-05, | |
| "loss": 0.9323680877685547, | |
| "mean_token_accuracy": 0.7818280473351479, | |
| "num_tokens": 20202829.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 0.8829731650650501, | |
| "epoch": 0.928, | |
| "grad_norm": 0.31879284977912903, | |
| "learning_rate": 5.842283882346082e-05, | |
| "loss": 0.9750779151916504, | |
| "mean_token_accuracy": 0.7774942420423031, | |
| "num_tokens": 20248134.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 0.8857482261955738, | |
| "epoch": 0.9301333333333334, | |
| "grad_norm": 0.2634090483188629, | |
| "learning_rate": 5.825219062868118e-05, | |
| "loss": 0.9745967864990235, | |
| "mean_token_accuracy": 0.7782815754413605, | |
| "num_tokens": 20288504.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 0.9351005107164383, | |
| "epoch": 0.9322666666666667, | |
| "grad_norm": 0.2535351812839508, | |
| "learning_rate": 5.808144356047414e-05, | |
| "loss": 1.0453302383422851, | |
| "mean_token_accuracy": 0.7632610902190209, | |
| "num_tokens": 20335084.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 0.9218877613544464, | |
| "epoch": 0.9344, | |
| "grad_norm": 0.23900526762008667, | |
| "learning_rate": 5.791059966464164e-05, | |
| "loss": 0.9873531341552735, | |
| "mean_token_accuracy": 0.7675649732351303, | |
| "num_tokens": 20384687.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 0.9403703153133393, | |
| "epoch": 0.9365333333333333, | |
| "grad_norm": 0.23416976630687714, | |
| "learning_rate": 5.773966098814579e-05, | |
| "loss": 1.0653534889221192, | |
| "mean_token_accuracy": 0.764773941040039, | |
| "num_tokens": 20432372.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 0.9848338901996613, | |
| "epoch": 0.9386666666666666, | |
| "grad_norm": 0.2765245735645294, | |
| "learning_rate": 5.756862957908433e-05, | |
| "loss": 1.1192432403564454, | |
| "mean_token_accuracy": 0.7547446370124817, | |
| "num_tokens": 20481366.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 0.9790939651429653, | |
| "epoch": 0.9408, | |
| "grad_norm": 0.23915551602840424, | |
| "learning_rate": 5.739750748666606e-05, | |
| "loss": 1.036961555480957, | |
| "mean_token_accuracy": 0.7573970347642899, | |
| "num_tokens": 20526985.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 0.9054527454078197, | |
| "epoch": 0.9429333333333333, | |
| "grad_norm": 0.24054944515228271, | |
| "learning_rate": 5.7226296761186274e-05, | |
| "loss": 0.9758554458618164, | |
| "mean_token_accuracy": 0.7724366948008538, | |
| "num_tokens": 20571815.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 0.9364707127213479, | |
| "epoch": 0.9450666666666667, | |
| "grad_norm": 0.28272607922554016, | |
| "learning_rate": 5.705499945400223e-05, | |
| "loss": 1.0225676536560058, | |
| "mean_token_accuracy": 0.7622330486774445, | |
| "num_tokens": 20615072.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 1.0657535366714002, | |
| "epoch": 0.9472, | |
| "grad_norm": 0.23734600841999054, | |
| "learning_rate": 5.688361761750861e-05, | |
| "loss": 1.1335111618041993, | |
| "mean_token_accuracy": 0.7402229458093643, | |
| "num_tokens": 20666534.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 0.9826746597886086, | |
| "epoch": 0.9493333333333334, | |
| "grad_norm": 0.28600969910621643, | |
| "learning_rate": 5.671215330511283e-05, | |
| "loss": 1.066628646850586, | |
| "mean_token_accuracy": 0.7560828119516373, | |
| "num_tokens": 20715376.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 0.9109843887388707, | |
| "epoch": 0.9514666666666667, | |
| "grad_norm": 0.2514685392379761, | |
| "learning_rate": 5.65406085712105e-05, | |
| "loss": 1.0114540100097655, | |
| "mean_token_accuracy": 0.7724284827709198, | |
| "num_tokens": 20758838.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 0.8498819716274738, | |
| "epoch": 0.9536, | |
| "grad_norm": 0.28889158368110657, | |
| "learning_rate": 5.6368985471160804e-05, | |
| "loss": 0.9062424659729004, | |
| "mean_token_accuracy": 0.785689315199852, | |
| "num_tokens": 20799444.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 0.8840778715908527, | |
| "epoch": 0.9557333333333333, | |
| "grad_norm": 0.2577449083328247, | |
| "learning_rate": 5.6197286061261875e-05, | |
| "loss": 0.9439300537109375, | |
| "mean_token_accuracy": 0.7696003526449203, | |
| "num_tokens": 20843766.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 0.8888865426182747, | |
| "epoch": 0.9578666666666666, | |
| "grad_norm": 0.27302756905555725, | |
| "learning_rate": 5.602551239872616e-05, | |
| "loss": 0.9372305870056152, | |
| "mean_token_accuracy": 0.7730641543865204, | |
| "num_tokens": 20888764.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 0.9558203481137753, | |
| "epoch": 0.96, | |
| "grad_norm": 0.3576233386993408, | |
| "learning_rate": 5.58536665416557e-05, | |
| "loss": 1.0556070327758789, | |
| "mean_token_accuracy": 0.762606156617403, | |
| "num_tokens": 20936028.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 0.9054192140698433, | |
| "epoch": 0.9621333333333333, | |
| "grad_norm": 0.2521965205669403, | |
| "learning_rate": 5.568175054901763e-05, | |
| "loss": 0.9705222129821778, | |
| "mean_token_accuracy": 0.7672724887728691, | |
| "num_tokens": 20985057.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "entropy": 0.9011006608605385, | |
| "epoch": 0.9642666666666667, | |
| "grad_norm": 0.27024832367897034, | |
| "learning_rate": 5.550976648061934e-05, | |
| "loss": 0.9830186843872071, | |
| "mean_token_accuracy": 0.7754541039466858, | |
| "num_tokens": 21028567.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "entropy": 0.9991332605481148, | |
| "epoch": 0.9664, | |
| "grad_norm": 0.2703147828578949, | |
| "learning_rate": 5.533771639708388e-05, | |
| "loss": 1.1589097023010253, | |
| "mean_token_accuracy": 0.7532796613872051, | |
| "num_tokens": 21072699.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "entropy": 0.9183724671602249, | |
| "epoch": 0.9685333333333334, | |
| "grad_norm": 0.2243046760559082, | |
| "learning_rate": 5.516560235982527e-05, | |
| "loss": 0.9856460571289063, | |
| "mean_token_accuracy": 0.771567003428936, | |
| "num_tokens": 21121413.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "entropy": 0.8655671834945678, | |
| "epoch": 0.9706666666666667, | |
| "grad_norm": 0.3306775987148285, | |
| "learning_rate": 5.499342643102381e-05, | |
| "loss": 0.9172829627990723, | |
| "mean_token_accuracy": 0.777653044462204, | |
| "num_tokens": 21162927.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 0.9436637915670871, | |
| "epoch": 0.9728, | |
| "grad_norm": 0.2542389929294586, | |
| "learning_rate": 5.482119067360132e-05, | |
| "loss": 1.0658721923828125, | |
| "mean_token_accuracy": 0.767835621535778, | |
| "num_tokens": 21206936.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "entropy": 0.7974261797964572, | |
| "epoch": 0.9749333333333333, | |
| "grad_norm": 0.24307052791118622, | |
| "learning_rate": 5.4648897151196455e-05, | |
| "loss": 0.8578211784362793, | |
| "mean_token_accuracy": 0.7923481151461601, | |
| "num_tokens": 21252732.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "entropy": 0.9691430673003196, | |
| "epoch": 0.9770666666666666, | |
| "grad_norm": 0.2720329165458679, | |
| "learning_rate": 5.447654792814e-05, | |
| "loss": 1.0459741592407226, | |
| "mean_token_accuracy": 0.7617560073733329, | |
| "num_tokens": 21298972.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "entropy": 0.9178217075765133, | |
| "epoch": 0.9792, | |
| "grad_norm": 0.2640475630760193, | |
| "learning_rate": 5.4304145069430115e-05, | |
| "loss": 1.0324625015258788, | |
| "mean_token_accuracy": 0.7745086327195168, | |
| "num_tokens": 21348870.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "entropy": 0.8973256818950176, | |
| "epoch": 0.9813333333333333, | |
| "grad_norm": 0.2828875184059143, | |
| "learning_rate": 5.4131690640707574e-05, | |
| "loss": 0.9894962310791016, | |
| "mean_token_accuracy": 0.7752941563725472, | |
| "num_tokens": 21390716.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 0.9490196861326694, | |
| "epoch": 0.9834666666666667, | |
| "grad_norm": 0.27414020895957947, | |
| "learning_rate": 5.3959186708231046e-05, | |
| "loss": 1.0264591217041015, | |
| "mean_token_accuracy": 0.7639399319887161, | |
| "num_tokens": 21440700.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "entropy": 0.9219519071280956, | |
| "epoch": 0.9856, | |
| "grad_norm": 0.2545549273490906, | |
| "learning_rate": 5.3786635338852346e-05, | |
| "loss": 1.0511361122131349, | |
| "mean_token_accuracy": 0.7739394150674344, | |
| "num_tokens": 21483867.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "entropy": 0.99324054941535, | |
| "epoch": 0.9877333333333334, | |
| "grad_norm": 0.272182434797287, | |
| "learning_rate": 5.361403859999161e-05, | |
| "loss": 1.116584587097168, | |
| "mean_token_accuracy": 0.7553175091743469, | |
| "num_tokens": 21535354.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "entropy": 0.8828953221440315, | |
| "epoch": 0.9898666666666667, | |
| "grad_norm": 0.29537713527679443, | |
| "learning_rate": 5.344139855961262e-05, | |
| "loss": 0.9682372093200684, | |
| "mean_token_accuracy": 0.7781552016735077, | |
| "num_tokens": 21578265.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "entropy": 0.9005228154361248, | |
| "epoch": 0.992, | |
| "grad_norm": 0.3032234013080597, | |
| "learning_rate": 5.3268717286197945e-05, | |
| "loss": 0.9423254013061524, | |
| "mean_token_accuracy": 0.7735077708959579, | |
| "num_tokens": 21618545.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 0.8464630447328091, | |
| "epoch": 0.9941333333333333, | |
| "grad_norm": 0.32000964879989624, | |
| "learning_rate": 5.3095996848724184e-05, | |
| "loss": 0.9030919075012207, | |
| "mean_token_accuracy": 0.7863337904214859, | |
| "num_tokens": 21657735.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "entropy": 0.8923816077411175, | |
| "epoch": 0.9962666666666666, | |
| "grad_norm": 0.3551577627658844, | |
| "learning_rate": 5.292323931663719e-05, | |
| "loss": 0.9792759895324707, | |
| "mean_token_accuracy": 0.7739578939974308, | |
| "num_tokens": 21705183.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "entropy": 0.9760521411895752, | |
| "epoch": 0.9984, | |
| "grad_norm": 0.2613706886768341, | |
| "learning_rate": 5.275044675982724e-05, | |
| "loss": 1.055685043334961, | |
| "mean_token_accuracy": 0.7623668745160103, | |
| "num_tokens": 21747104.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "entropy": 0.9629100082736266, | |
| "epoch": 1.0004266666666666, | |
| "grad_norm": 0.3171702027320862, | |
| "learning_rate": 5.257762124860431e-05, | |
| "loss": 1.0939340591430664, | |
| "mean_token_accuracy": 0.7673146160025346, | |
| "num_tokens": 21789348.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "entropy": 0.9121152207255363, | |
| "epoch": 1.00256, | |
| "grad_norm": 0.2546738386154175, | |
| "learning_rate": 5.240476485367317e-05, | |
| "loss": 0.9231260299682618, | |
| "mean_token_accuracy": 0.7732596561312676, | |
| "num_tokens": 21834781.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 0.8686859309673309, | |
| "epoch": 1.0046933333333334, | |
| "grad_norm": 0.25343966484069824, | |
| "learning_rate": 5.223187964610865e-05, | |
| "loss": 0.9800569534301757, | |
| "mean_token_accuracy": 0.7781821310520172, | |
| "num_tokens": 21879326.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "entropy": 0.9335578382015228, | |
| "epoch": 1.0068266666666668, | |
| "grad_norm": 0.2416774481534958, | |
| "learning_rate": 5.2058967697330784e-05, | |
| "loss": 0.9976616859436035, | |
| "mean_token_accuracy": 0.7626704692840576, | |
| "num_tokens": 21933750.0, | |
| "step": 4720 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 9376, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.0386211988394086e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |