code-reasoning-0.5b / last-checkpoint /trainer_state.json
moos124's picture
Training in progress, step 4720, checkpoint
92d4fba verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0068266666666668,
"eval_steps": 500,
"global_step": 4720,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.2413338124752045,
"epoch": 0.00021333333333333333,
"grad_norm": 0.5237457752227783,
"learning_rate": 0.0,
"loss": 2.287360191345215,
"mean_token_accuracy": 0.6451007425785065,
"num_tokens": 4191.0,
"step": 1
},
{
"entropy": 1.2207386708921857,
"epoch": 0.0021333333333333334,
"grad_norm": 0.4350375831127167,
"learning_rate": 3e-06,
"loss": 2.19429079691569,
"mean_token_accuracy": 0.65218452612559,
"num_tokens": 39906.0,
"step": 10
},
{
"entropy": 1.1284118384122848,
"epoch": 0.004266666666666667,
"grad_norm": 0.49733078479766846,
"learning_rate": 6.333333333333334e-06,
"loss": 1.9721708297729492,
"mean_token_accuracy": 0.6748957321047783,
"num_tokens": 90428.0,
"step": 20
},
{
"entropy": 1.3042965233325958,
"epoch": 0.0064,
"grad_norm": 0.39715778827667236,
"learning_rate": 9.666666666666667e-06,
"loss": 2.214934539794922,
"mean_token_accuracy": 0.6405263364315033,
"num_tokens": 140516.0,
"step": 30
},
{
"entropy": 1.206481871008873,
"epoch": 0.008533333333333334,
"grad_norm": 0.5763714909553528,
"learning_rate": 1.3000000000000001e-05,
"loss": 2.0720544815063477,
"mean_token_accuracy": 0.6752387754619121,
"num_tokens": 182338.0,
"step": 40
},
{
"entropy": 1.236506675183773,
"epoch": 0.010666666666666666,
"grad_norm": 0.5518174171447754,
"learning_rate": 1.6333333333333335e-05,
"loss": 1.9000749588012695,
"mean_token_accuracy": 0.6715238064527511,
"num_tokens": 235345.0,
"step": 50
},
{
"entropy": 1.2768938690423965,
"epoch": 0.0128,
"grad_norm": 0.6082292199134827,
"learning_rate": 1.9666666666666666e-05,
"loss": 1.8587135314941405,
"mean_token_accuracy": 0.6761457294225692,
"num_tokens": 279475.0,
"step": 60
},
{
"entropy": 1.4092280209064483,
"epoch": 0.014933333333333333,
"grad_norm": 0.6658427119255066,
"learning_rate": 2.3000000000000003e-05,
"loss": 1.7947965621948243,
"mean_token_accuracy": 0.6667077802121639,
"num_tokens": 330352.0,
"step": 70
},
{
"entropy": 1.4339179992675781,
"epoch": 0.017066666666666667,
"grad_norm": 1.2848966121673584,
"learning_rate": 2.633333333333333e-05,
"loss": 1.6212703704833984,
"mean_token_accuracy": 0.6736222848296165,
"num_tokens": 373039.0,
"step": 80
},
{
"entropy": 1.516643624007702,
"epoch": 0.0192,
"grad_norm": 0.5840373039245605,
"learning_rate": 2.9666666666666672e-05,
"loss": 1.6039567947387696,
"mean_token_accuracy": 0.6713629268109799,
"num_tokens": 422371.0,
"step": 90
},
{
"entropy": 1.467514592409134,
"epoch": 0.021333333333333333,
"grad_norm": 0.5295536518096924,
"learning_rate": 3.3e-05,
"loss": 1.576413631439209,
"mean_token_accuracy": 0.6885988213121891,
"num_tokens": 462284.0,
"step": 100
},
{
"entropy": 1.3056970939040184,
"epoch": 0.023466666666666667,
"grad_norm": 0.4493468701839447,
"learning_rate": 3.633333333333333e-05,
"loss": 1.2876591682434082,
"mean_token_accuracy": 0.7173117578029633,
"num_tokens": 508883.0,
"step": 110
},
{
"entropy": 1.2709181517362595,
"epoch": 0.0256,
"grad_norm": 0.5542399883270264,
"learning_rate": 3.966666666666667e-05,
"loss": 1.249708652496338,
"mean_token_accuracy": 0.727140337228775,
"num_tokens": 555373.0,
"step": 120
},
{
"entropy": 1.3233605474233627,
"epoch": 0.027733333333333332,
"grad_norm": 0.4590514004230499,
"learning_rate": 4.3e-05,
"loss": 1.3480740547180177,
"mean_token_accuracy": 0.7144808873534203,
"num_tokens": 599533.0,
"step": 130
},
{
"entropy": 1.241466723382473,
"epoch": 0.029866666666666666,
"grad_norm": 0.2866012752056122,
"learning_rate": 4.633333333333333e-05,
"loss": 1.275578498840332,
"mean_token_accuracy": 0.7148015096783638,
"num_tokens": 657261.0,
"step": 140
},
{
"entropy": 1.103185237944126,
"epoch": 0.032,
"grad_norm": 0.25304746627807617,
"learning_rate": 4.966666666666667e-05,
"loss": 1.1829004287719727,
"mean_token_accuracy": 0.7423724889755249,
"num_tokens": 704749.0,
"step": 150
},
{
"entropy": 1.085295742750168,
"epoch": 0.034133333333333335,
"grad_norm": 2.8501718044281006,
"learning_rate": 5.300000000000001e-05,
"loss": 1.1587767601013184,
"mean_token_accuracy": 0.7412141926586628,
"num_tokens": 751635.0,
"step": 160
},
{
"entropy": 1.0946420103311538,
"epoch": 0.03626666666666667,
"grad_norm": 0.2150825560092926,
"learning_rate": 5.633333333333334e-05,
"loss": 1.1224396705627442,
"mean_token_accuracy": 0.7382922798395157,
"num_tokens": 794488.0,
"step": 170
},
{
"entropy": 1.0789968609809875,
"epoch": 0.0384,
"grad_norm": 0.2265262007713318,
"learning_rate": 5.966666666666667e-05,
"loss": 1.1242941856384276,
"mean_token_accuracy": 0.7424697011709214,
"num_tokens": 838647.0,
"step": 180
},
{
"entropy": 1.0386934965848922,
"epoch": 0.04053333333333333,
"grad_norm": 0.2709059417247772,
"learning_rate": 6.3e-05,
"loss": 1.1370309829711913,
"mean_token_accuracy": 0.7532300829887391,
"num_tokens": 884497.0,
"step": 190
},
{
"entropy": 1.1033389106392861,
"epoch": 0.042666666666666665,
"grad_norm": 0.27512305974960327,
"learning_rate": 6.633333333333334e-05,
"loss": 1.2109394073486328,
"mean_token_accuracy": 0.7378732696175575,
"num_tokens": 931398.0,
"step": 200
},
{
"entropy": 1.108424139022827,
"epoch": 0.0448,
"grad_norm": 0.23238568007946014,
"learning_rate": 6.966666666666668e-05,
"loss": 1.1784509658813476,
"mean_token_accuracy": 0.7302148967981339,
"num_tokens": 983649.0,
"step": 210
},
{
"entropy": 0.9056355074048043,
"epoch": 0.046933333333333334,
"grad_norm": 0.3005298674106598,
"learning_rate": 7.3e-05,
"loss": 0.977406120300293,
"mean_token_accuracy": 0.7774429574608803,
"num_tokens": 1024372.0,
"step": 220
},
{
"entropy": 1.019825778901577,
"epoch": 0.04906666666666667,
"grad_norm": 0.2753085494041443,
"learning_rate": 7.633333333333334e-05,
"loss": 1.1463540077209473,
"mean_token_accuracy": 0.7483755856752395,
"num_tokens": 1073667.0,
"step": 230
},
{
"entropy": 1.042508740723133,
"epoch": 0.0512,
"grad_norm": 0.32724323868751526,
"learning_rate": 7.966666666666666e-05,
"loss": 1.098323440551758,
"mean_token_accuracy": 0.7490729346871376,
"num_tokens": 1117523.0,
"step": 240
},
{
"entropy": 1.036650500446558,
"epoch": 0.05333333333333334,
"grad_norm": 0.28012844920158386,
"learning_rate": 8.3e-05,
"loss": 1.078709030151367,
"mean_token_accuracy": 0.751419472694397,
"num_tokens": 1160152.0,
"step": 250
},
{
"entropy": 1.0605042964220046,
"epoch": 0.055466666666666664,
"grad_norm": 0.28786003589630127,
"learning_rate": 8.633333333333334e-05,
"loss": 1.1537845611572266,
"mean_token_accuracy": 0.7475032344460487,
"num_tokens": 1203153.0,
"step": 260
},
{
"entropy": 1.0000992961227895,
"epoch": 0.0576,
"grad_norm": 0.2648380696773529,
"learning_rate": 8.966666666666666e-05,
"loss": 1.0818438529968262,
"mean_token_accuracy": 0.7550511255860328,
"num_tokens": 1246554.0,
"step": 270
},
{
"entropy": 0.9443735256791115,
"epoch": 0.05973333333333333,
"grad_norm": 0.29017189145088196,
"learning_rate": 9.300000000000001e-05,
"loss": 1.0578758239746093,
"mean_token_accuracy": 0.7694767877459526,
"num_tokens": 1286049.0,
"step": 280
},
{
"entropy": 0.9980478152632714,
"epoch": 0.06186666666666667,
"grad_norm": 0.3078839182853699,
"learning_rate": 9.633333333333335e-05,
"loss": 1.1183401107788087,
"mean_token_accuracy": 0.7594211131334305,
"num_tokens": 1331161.0,
"step": 290
},
{
"entropy": 0.9343128114938736,
"epoch": 0.064,
"grad_norm": 0.253248393535614,
"learning_rate": 9.966666666666667e-05,
"loss": 0.9862067222595214,
"mean_token_accuracy": 0.7730094477534294,
"num_tokens": 1375777.0,
"step": 300
},
{
"entropy": 1.0266294315457345,
"epoch": 0.06613333333333334,
"grad_norm": 0.23917347192764282,
"learning_rate": 9.999975737505649e-05,
"loss": 1.1334312438964844,
"mean_token_accuracy": 0.7540638357400894,
"num_tokens": 1421027.0,
"step": 310
},
{
"entropy": 1.0486552365124227,
"epoch": 0.06826666666666667,
"grad_norm": 0.2840607762336731,
"learning_rate": 9.999891867457112e-05,
"loss": 1.1424532890319825,
"mean_token_accuracy": 0.7420963421463966,
"num_tokens": 1472539.0,
"step": 320
},
{
"entropy": 0.9518789499998093,
"epoch": 0.0704,
"grad_norm": 0.25352534651756287,
"learning_rate": 9.999748091322068e-05,
"loss": 0.9646738052368165,
"mean_token_accuracy": 0.7610545977950096,
"num_tokens": 1518725.0,
"step": 330
},
{
"entropy": 1.014043178409338,
"epoch": 0.07253333333333334,
"grad_norm": 0.22224737703800201,
"learning_rate": 9.999544410823167e-05,
"loss": 1.0504605293273925,
"mean_token_accuracy": 0.7481713563203811,
"num_tokens": 1570475.0,
"step": 340
},
{
"entropy": 0.9679042734205723,
"epoch": 0.07466666666666667,
"grad_norm": 0.2880384027957916,
"learning_rate": 9.999280828400803e-05,
"loss": 1.0482870101928712,
"mean_token_accuracy": 0.7645679444074631,
"num_tokens": 1617068.0,
"step": 350
},
{
"entropy": 1.077889482676983,
"epoch": 0.0768,
"grad_norm": 0.24659322202205658,
"learning_rate": 9.998957347213085e-05,
"loss": 1.1968301773071288,
"mean_token_accuracy": 0.7367698416113854,
"num_tokens": 1661987.0,
"step": 360
},
{
"entropy": 1.0857299536466598,
"epoch": 0.07893333333333333,
"grad_norm": 0.31553569436073303,
"learning_rate": 9.99857397113579e-05,
"loss": 1.1405964851379395,
"mean_token_accuracy": 0.7424520552158356,
"num_tokens": 1712075.0,
"step": 370
},
{
"entropy": 1.0094588652253151,
"epoch": 0.08106666666666666,
"grad_norm": 0.1927226334810257,
"learning_rate": 9.998130704762335e-05,
"loss": 1.062878704071045,
"mean_token_accuracy": 0.7529133662581444,
"num_tokens": 1762001.0,
"step": 380
},
{
"entropy": 0.95768673568964,
"epoch": 0.0832,
"grad_norm": 0.2921101748943329,
"learning_rate": 9.997627553403699e-05,
"loss": 1.0986035346984864,
"mean_token_accuracy": 0.76050655990839,
"num_tokens": 1805002.0,
"step": 390
},
{
"entropy": 0.8763241834938527,
"epoch": 0.08533333333333333,
"grad_norm": 0.24879010021686554,
"learning_rate": 9.997064523088384e-05,
"loss": 0.9313676834106446,
"mean_token_accuracy": 0.7820899412035942,
"num_tokens": 1844640.0,
"step": 400
},
{
"entropy": 1.0211177349090577,
"epoch": 0.08746666666666666,
"grad_norm": 0.27772271633148193,
"learning_rate": 9.996441620562322e-05,
"loss": 1.1202519416809082,
"mean_token_accuracy": 0.7511946842074394,
"num_tokens": 1891181.0,
"step": 410
},
{
"entropy": 0.9595077157020568,
"epoch": 0.0896,
"grad_norm": 0.31688833236694336,
"learning_rate": 9.995758853288805e-05,
"loss": 1.0745075225830079,
"mean_token_accuracy": 0.7604640245437622,
"num_tokens": 1934494.0,
"step": 420
},
{
"entropy": 0.963299511373043,
"epoch": 0.09173333333333333,
"grad_norm": 0.29464003443717957,
"learning_rate": 9.995016229448395e-05,
"loss": 1.0001104354858399,
"mean_token_accuracy": 0.763674932718277,
"num_tokens": 1976770.0,
"step": 430
},
{
"entropy": 1.1388054117560387,
"epoch": 0.09386666666666667,
"grad_norm": 0.2581053078174591,
"learning_rate": 9.994213757938819e-05,
"loss": 1.2570265769958495,
"mean_token_accuracy": 0.726725485175848,
"num_tokens": 2028837.0,
"step": 440
},
{
"entropy": 0.9712555348873139,
"epoch": 0.096,
"grad_norm": 0.3099062740802765,
"learning_rate": 9.993351448374873e-05,
"loss": 1.072590446472168,
"mean_token_accuracy": 0.7607480764389039,
"num_tokens": 2072441.0,
"step": 450
},
{
"entropy": 1.0221860893070698,
"epoch": 0.09813333333333334,
"grad_norm": 0.25882700085639954,
"learning_rate": 9.992429311088296e-05,
"loss": 1.0992252349853515,
"mean_token_accuracy": 0.7492104887962341,
"num_tokens": 2119587.0,
"step": 460
},
{
"entropy": 1.082937440276146,
"epoch": 0.10026666666666667,
"grad_norm": 0.23417538404464722,
"learning_rate": 9.991447357127657e-05,
"loss": 1.2704851150512695,
"mean_token_accuracy": 0.7427436165511608,
"num_tokens": 2171365.0,
"step": 470
},
{
"entropy": 0.8950243927538395,
"epoch": 0.1024,
"grad_norm": 0.27864041924476624,
"learning_rate": 9.990405598258212e-05,
"loss": 0.965212631225586,
"mean_token_accuracy": 0.773382380604744,
"num_tokens": 2216186.0,
"step": 480
},
{
"entropy": 0.9513905093073844,
"epoch": 0.10453333333333334,
"grad_norm": 0.27456656098365784,
"learning_rate": 9.989304046961772e-05,
"loss": 1.0514169692993165,
"mean_token_accuracy": 0.7602177545428276,
"num_tokens": 2263172.0,
"step": 490
},
{
"entropy": 1.1295288607478142,
"epoch": 0.10666666666666667,
"grad_norm": 0.2654035985469818,
"learning_rate": 9.988142716436546e-05,
"loss": 1.2499847412109375,
"mean_token_accuracy": 0.7319100961089134,
"num_tokens": 2309133.0,
"step": 500
},
{
"entropy": 0.9629665195941925,
"epoch": 0.1088,
"grad_norm": 0.2432626485824585,
"learning_rate": 9.986921620596989e-05,
"loss": 1.038647174835205,
"mean_token_accuracy": 0.7657018698751926,
"num_tokens": 2352188.0,
"step": 510
},
{
"entropy": 1.0380054742097855,
"epoch": 0.11093333333333333,
"grad_norm": 0.22924789786338806,
"learning_rate": 9.985640774073634e-05,
"loss": 1.0875710487365722,
"mean_token_accuracy": 0.7410315036773681,
"num_tokens": 2400184.0,
"step": 520
},
{
"entropy": 1.0756568349897861,
"epoch": 0.11306666666666666,
"grad_norm": 0.322125107049942,
"learning_rate": 9.984300192212912e-05,
"loss": 1.15003662109375,
"mean_token_accuracy": 0.7418296962976456,
"num_tokens": 2445609.0,
"step": 530
},
{
"entropy": 1.037429604679346,
"epoch": 0.1152,
"grad_norm": 0.29860854148864746,
"learning_rate": 9.982899891076973e-05,
"loss": 1.1256014823913574,
"mean_token_accuracy": 0.7413557574152947,
"num_tokens": 2490595.0,
"step": 540
},
{
"entropy": 0.863083366304636,
"epoch": 0.11733333333333333,
"grad_norm": 0.2839767634868622,
"learning_rate": 9.98143988744349e-05,
"loss": 0.9366037368774414,
"mean_token_accuracy": 0.786571592092514,
"num_tokens": 2537415.0,
"step": 550
},
{
"entropy": 0.9333060696721077,
"epoch": 0.11946666666666667,
"grad_norm": 0.23799379169940948,
"learning_rate": 9.979920198805464e-05,
"loss": 1.0223896980285645,
"mean_token_accuracy": 0.7608667835593224,
"num_tokens": 2582645.0,
"step": 560
},
{
"entropy": 1.0654601491987705,
"epoch": 0.1216,
"grad_norm": 0.26716434955596924,
"learning_rate": 9.97834084337101e-05,
"loss": 1.1589471817016601,
"mean_token_accuracy": 0.7418766617774963,
"num_tokens": 2633308.0,
"step": 570
},
{
"entropy": 1.0902433142066001,
"epoch": 0.12373333333333333,
"grad_norm": 0.20525577664375305,
"learning_rate": 9.976701840063136e-05,
"loss": 1.1464842796325683,
"mean_token_accuracy": 0.735442753136158,
"num_tokens": 2686009.0,
"step": 580
},
{
"entropy": 0.8992204323410988,
"epoch": 0.12586666666666665,
"grad_norm": 0.2524716556072235,
"learning_rate": 9.975003208519522e-05,
"loss": 0.9971331596374512,
"mean_token_accuracy": 0.7749405071139336,
"num_tokens": 2731814.0,
"step": 590
},
{
"entropy": 1.0139609590172767,
"epoch": 0.128,
"grad_norm": 0.26556891202926636,
"learning_rate": 9.973244969092282e-05,
"loss": 1.1074792861938476,
"mean_token_accuracy": 0.7530097424983978,
"num_tokens": 2780314.0,
"step": 600
},
{
"entropy": 0.9936468213796615,
"epoch": 0.13013333333333332,
"grad_norm": 0.1984289288520813,
"learning_rate": 9.971427142847718e-05,
"loss": 1.0992106437683105,
"mean_token_accuracy": 0.7568746477365493,
"num_tokens": 2827832.0,
"step": 610
},
{
"entropy": 0.9435897715389728,
"epoch": 0.13226666666666667,
"grad_norm": 0.25219106674194336,
"learning_rate": 9.969549751566075e-05,
"loss": 1.016776180267334,
"mean_token_accuracy": 0.76652851998806,
"num_tokens": 2873516.0,
"step": 620
},
{
"entropy": 0.9641087010502816,
"epoch": 0.1344,
"grad_norm": 0.2497103363275528,
"learning_rate": 9.967612817741272e-05,
"loss": 1.0406078338623046,
"mean_token_accuracy": 0.7586069479584694,
"num_tokens": 2913600.0,
"step": 630
},
{
"entropy": 1.0591406509280206,
"epoch": 0.13653333333333334,
"grad_norm": 0.23842781782150269,
"learning_rate": 9.965616364580636e-05,
"loss": 1.1297724723815918,
"mean_token_accuracy": 0.7443821474909782,
"num_tokens": 2962337.0,
"step": 640
},
{
"entropy": 1.0139970764517785,
"epoch": 0.13866666666666666,
"grad_norm": 0.2730376124382019,
"learning_rate": 9.963560416004623e-05,
"loss": 1.112107276916504,
"mean_token_accuracy": 0.7496687114238739,
"num_tokens": 3006113.0,
"step": 650
},
{
"entropy": 0.9443116314709187,
"epoch": 0.1408,
"grad_norm": 0.26775607466697693,
"learning_rate": 9.961444996646532e-05,
"loss": 1.020461654663086,
"mean_token_accuracy": 0.7648672193288804,
"num_tokens": 3052837.0,
"step": 660
},
{
"entropy": 1.113273823261261,
"epoch": 0.14293333333333333,
"grad_norm": 0.2903483510017395,
"learning_rate": 9.95927013185221e-05,
"loss": 1.2222958564758302,
"mean_token_accuracy": 0.7364178076386452,
"num_tokens": 3101650.0,
"step": 670
},
{
"entropy": 0.8284098848700523,
"epoch": 0.14506666666666668,
"grad_norm": 0.3110564053058624,
"learning_rate": 9.957035847679749e-05,
"loss": 0.905357551574707,
"mean_token_accuracy": 0.7894617035984993,
"num_tokens": 3143357.0,
"step": 680
},
{
"entropy": 0.9675152562558651,
"epoch": 0.1472,
"grad_norm": 0.27567023038864136,
"learning_rate": 9.954742170899172e-05,
"loss": 1.0641048431396485,
"mean_token_accuracy": 0.7591987878084183,
"num_tokens": 3193515.0,
"step": 690
},
{
"entropy": 0.9688289143145085,
"epoch": 0.14933333333333335,
"grad_norm": 0.28721460700035095,
"learning_rate": 9.952389128992113e-05,
"loss": 1.0814785957336426,
"mean_token_accuracy": 0.759615159034729,
"num_tokens": 3242439.0,
"step": 700
},
{
"entropy": 0.9880564108490943,
"epoch": 0.15146666666666667,
"grad_norm": 0.3781116306781769,
"learning_rate": 9.949976750151489e-05,
"loss": 1.100698184967041,
"mean_token_accuracy": 0.7498437210917472,
"num_tokens": 3287959.0,
"step": 710
},
{
"entropy": 1.027633222937584,
"epoch": 0.1536,
"grad_norm": 0.2338317185640335,
"learning_rate": 9.947505063281157e-05,
"loss": 1.0913351058959961,
"mean_token_accuracy": 0.750363714993,
"num_tokens": 3335092.0,
"step": 720
},
{
"entropy": 1.1284249052405357,
"epoch": 0.15573333333333333,
"grad_norm": 0.2954626679420471,
"learning_rate": 9.944974097995581e-05,
"loss": 1.2494465827941894,
"mean_token_accuracy": 0.7387717284262181,
"num_tokens": 3380879.0,
"step": 730
},
{
"entropy": 0.9154682122170925,
"epoch": 0.15786666666666666,
"grad_norm": 0.2581311762332916,
"learning_rate": 9.942383884619455e-05,
"loss": 1.0275691032409668,
"mean_token_accuracy": 0.7657118752598763,
"num_tokens": 3430079.0,
"step": 740
},
{
"entropy": 1.0482445612549782,
"epoch": 0.16,
"grad_norm": 0.2496950626373291,
"learning_rate": 9.939734454187365e-05,
"loss": 1.1575148582458497,
"mean_token_accuracy": 0.7434881895780563,
"num_tokens": 3482687.0,
"step": 750
},
{
"entropy": 1.008715095371008,
"epoch": 0.16213333333333332,
"grad_norm": 0.24042253196239471,
"learning_rate": 9.937025838443397e-05,
"loss": 1.0921018600463868,
"mean_token_accuracy": 0.7544152162969112,
"num_tokens": 3528140.0,
"step": 760
},
{
"entropy": 0.9826410934329033,
"epoch": 0.16426666666666667,
"grad_norm": 0.20811501145362854,
"learning_rate": 9.934258069840765e-05,
"loss": 1.0858916282653808,
"mean_token_accuracy": 0.7535254985094071,
"num_tokens": 3578130.0,
"step": 770
},
{
"entropy": 0.9679107010364533,
"epoch": 0.1664,
"grad_norm": 0.3156932294368744,
"learning_rate": 9.931431181541426e-05,
"loss": 1.0209306716918944,
"mean_token_accuracy": 0.7623399093747139,
"num_tokens": 3625602.0,
"step": 780
},
{
"entropy": 0.8914617538452149,
"epoch": 0.16853333333333334,
"grad_norm": 0.2739625573158264,
"learning_rate": 9.928545207415675e-05,
"loss": 1.014828872680664,
"mean_token_accuracy": 0.7721783280372619,
"num_tokens": 3671375.0,
"step": 790
},
{
"entropy": 0.9649161577224732,
"epoch": 0.17066666666666666,
"grad_norm": 0.23826555907726288,
"learning_rate": 9.92560018204174e-05,
"loss": 1.0278871536254883,
"mean_token_accuracy": 0.7589392751455307,
"num_tokens": 3717893.0,
"step": 800
},
{
"entropy": 0.9766084000468254,
"epoch": 0.1728,
"grad_norm": 0.33309969305992126,
"learning_rate": 9.92259614070538e-05,
"loss": 1.0395455360412598,
"mean_token_accuracy": 0.7545588746666908,
"num_tokens": 3764739.0,
"step": 810
},
{
"entropy": 1.0871834971010685,
"epoch": 0.17493333333333333,
"grad_norm": 0.22862163186073303,
"learning_rate": 9.919533119399438e-05,
"loss": 1.1718174934387207,
"mean_token_accuracy": 0.7301450505852699,
"num_tokens": 3810938.0,
"step": 820
},
{
"entropy": 1.0070038817822933,
"epoch": 0.17706666666666668,
"grad_norm": 0.2899021506309509,
"learning_rate": 9.916411154823433e-05,
"loss": 1.146003818511963,
"mean_token_accuracy": 0.7569455504417419,
"num_tokens": 3857657.0,
"step": 830
},
{
"entropy": 1.116474264860153,
"epoch": 0.1792,
"grad_norm": 0.2815554738044739,
"learning_rate": 9.913230284383112e-05,
"loss": 1.1874059677124023,
"mean_token_accuracy": 0.7290896072983741,
"num_tokens": 3909965.0,
"step": 840
},
{
"entropy": 0.9500585079193116,
"epoch": 0.18133333333333335,
"grad_norm": 0.3055127263069153,
"learning_rate": 9.90999054619e-05,
"loss": 1.0213364601135253,
"mean_token_accuracy": 0.7678412273526192,
"num_tokens": 3948812.0,
"step": 850
},
{
"entropy": 0.8651109091937542,
"epoch": 0.18346666666666667,
"grad_norm": 0.2479069083929062,
"learning_rate": 9.906691979060943e-05,
"loss": 0.9216291427612304,
"mean_token_accuracy": 0.7761695921421051,
"num_tokens": 3991940.0,
"step": 860
},
{
"entropy": 1.1529859654605388,
"epoch": 0.1856,
"grad_norm": 0.26249563694000244,
"learning_rate": 9.903334622517643e-05,
"loss": 1.2724492073059082,
"mean_token_accuracy": 0.7244490720331669,
"num_tokens": 4050343.0,
"step": 870
},
{
"entropy": 1.0595085561275481,
"epoch": 0.18773333333333334,
"grad_norm": 0.294251948595047,
"learning_rate": 9.89991851678619e-05,
"loss": 1.1476259231567383,
"mean_token_accuracy": 0.7438870698213578,
"num_tokens": 4099181.0,
"step": 880
},
{
"entropy": 1.0237427443265914,
"epoch": 0.18986666666666666,
"grad_norm": 0.251798152923584,
"learning_rate": 9.896443702796573e-05,
"loss": 1.1016413688659668,
"mean_token_accuracy": 0.7469503089785576,
"num_tokens": 4143951.0,
"step": 890
},
{
"entropy": 0.9793856598436832,
"epoch": 0.192,
"grad_norm": 0.2858197093009949,
"learning_rate": 9.892910222182196e-05,
"loss": 1.0976881980895996,
"mean_token_accuracy": 0.7614006102085114,
"num_tokens": 4194087.0,
"step": 900
},
{
"entropy": 1.0538529880344867,
"epoch": 0.19413333333333332,
"grad_norm": 0.23789124190807343,
"learning_rate": 9.889318117279373e-05,
"loss": 1.196424674987793,
"mean_token_accuracy": 0.7455965608358384,
"num_tokens": 4243300.0,
"step": 910
},
{
"entropy": 0.9650195389986038,
"epoch": 0.19626666666666667,
"grad_norm": 0.3590649366378784,
"learning_rate": 9.885667431126824e-05,
"loss": 1.0054343223571778,
"mean_token_accuracy": 0.7639905765652657,
"num_tokens": 4291788.0,
"step": 920
},
{
"entropy": 0.8939604975283146,
"epoch": 0.1984,
"grad_norm": 0.2963460087776184,
"learning_rate": 9.881958207465158e-05,
"loss": 0.9668001174926758,
"mean_token_accuracy": 0.778343915939331,
"num_tokens": 4336567.0,
"step": 930
},
{
"entropy": 0.8603343136608601,
"epoch": 0.20053333333333334,
"grad_norm": 0.22635255753993988,
"learning_rate": 9.878190490736353e-05,
"loss": 0.9547459602355957,
"mean_token_accuracy": 0.780703829228878,
"num_tokens": 4379939.0,
"step": 940
},
{
"entropy": 1.0914236083626747,
"epoch": 0.20266666666666666,
"grad_norm": 0.28582993149757385,
"learning_rate": 9.874364326083216e-05,
"loss": 1.1933195114135742,
"mean_token_accuracy": 0.7399712555110455,
"num_tokens": 4429828.0,
"step": 950
},
{
"entropy": 0.9969653740525246,
"epoch": 0.2048,
"grad_norm": 0.23508235812187195,
"learning_rate": 9.87047975934885e-05,
"loss": 1.080200481414795,
"mean_token_accuracy": 0.7547310657799244,
"num_tokens": 4478207.0,
"step": 960
},
{
"entropy": 0.9463694766163826,
"epoch": 0.20693333333333333,
"grad_norm": 0.28800663352012634,
"learning_rate": 9.866536837076098e-05,
"loss": 1.0770373344421387,
"mean_token_accuracy": 0.7622694931924343,
"num_tokens": 4523626.0,
"step": 970
},
{
"entropy": 1.017927524447441,
"epoch": 0.20906666666666668,
"grad_norm": 0.22783571481704712,
"learning_rate": 9.862535606506992e-05,
"loss": 1.1111245155334473,
"mean_token_accuracy": 0.7514706686139107,
"num_tokens": 4575636.0,
"step": 980
},
{
"entropy": 0.9148829184472561,
"epoch": 0.2112,
"grad_norm": 0.44951871037483215,
"learning_rate": 9.85847611558218e-05,
"loss": 1.0038617134094239,
"mean_token_accuracy": 0.7708485037088394,
"num_tokens": 4623275.0,
"step": 990
},
{
"entropy": 0.9400355100631714,
"epoch": 0.21333333333333335,
"grad_norm": 0.2274121642112732,
"learning_rate": 9.85435841294036e-05,
"loss": 1.051990795135498,
"mean_token_accuracy": 0.762999877333641,
"num_tokens": 4670070.0,
"step": 1000
},
{
"entropy": 0.9205634713172912,
"epoch": 0.21546666666666667,
"grad_norm": 0.2580691874027252,
"learning_rate": 9.850182547917686e-05,
"loss": 1.0361328125,
"mean_token_accuracy": 0.77038114964962,
"num_tokens": 4713754.0,
"step": 1010
},
{
"entropy": 0.9357184395194054,
"epoch": 0.2176,
"grad_norm": 0.3024005889892578,
"learning_rate": 9.845948570547187e-05,
"loss": 1.001820945739746,
"mean_token_accuracy": 0.7686716303229332,
"num_tokens": 4754969.0,
"step": 1020
},
{
"entropy": 0.9788610845804214,
"epoch": 0.21973333333333334,
"grad_norm": 0.22762160003185272,
"learning_rate": 9.841656531558163e-05,
"loss": 1.0857264518737793,
"mean_token_accuracy": 0.755905470252037,
"num_tokens": 4799172.0,
"step": 1030
},
{
"entropy": 0.8699902944266796,
"epoch": 0.22186666666666666,
"grad_norm": 0.3153718113899231,
"learning_rate": 9.83730648237558e-05,
"loss": 0.9507910728454589,
"mean_token_accuracy": 0.7810183942317963,
"num_tokens": 4841210.0,
"step": 1040
},
{
"entropy": 1.0130531772971154,
"epoch": 0.224,
"grad_norm": 0.254517138004303,
"learning_rate": 9.832898475119446e-05,
"loss": 1.0863225936889649,
"mean_token_accuracy": 0.7473610386252403,
"num_tokens": 4889983.0,
"step": 1050
},
{
"entropy": 0.9305486619472504,
"epoch": 0.22613333333333333,
"grad_norm": 0.3190745413303375,
"learning_rate": 9.828432562604197e-05,
"loss": 1.0010540008544921,
"mean_token_accuracy": 0.7654982030391693,
"num_tokens": 4935048.0,
"step": 1060
},
{
"entropy": 0.8807580441236496,
"epoch": 0.22826666666666667,
"grad_norm": 0.25836268067359924,
"learning_rate": 9.823908798338061e-05,
"loss": 0.9626541137695312,
"mean_token_accuracy": 0.7766690820455551,
"num_tokens": 4974286.0,
"step": 1070
},
{
"entropy": 0.9813514620065689,
"epoch": 0.2304,
"grad_norm": 0.23850619792938232,
"learning_rate": 9.819327236522411e-05,
"loss": 1.0773870468139648,
"mean_token_accuracy": 0.7523389101028443,
"num_tokens": 5029521.0,
"step": 1080
},
{
"entropy": 0.9586616240441799,
"epoch": 0.23253333333333334,
"grad_norm": 0.32606637477874756,
"learning_rate": 9.814687932051123e-05,
"loss": 1.0027738571166993,
"mean_token_accuracy": 0.7671207025647163,
"num_tokens": 5074269.0,
"step": 1090
},
{
"entropy": 0.9240864880383015,
"epoch": 0.23466666666666666,
"grad_norm": 0.23331928253173828,
"learning_rate": 9.809990940509911e-05,
"loss": 1.0010904312133788,
"mean_token_accuracy": 0.7657159030437469,
"num_tokens": 5120632.0,
"step": 1100
},
{
"entropy": 0.9960295349359513,
"epoch": 0.2368,
"grad_norm": 0.27676209807395935,
"learning_rate": 9.805236318175672e-05,
"loss": 1.0985889434814453,
"mean_token_accuracy": 0.7565312668681144,
"num_tokens": 5167690.0,
"step": 1110
},
{
"entropy": 0.9320930615067482,
"epoch": 0.23893333333333333,
"grad_norm": 0.2536001205444336,
"learning_rate": 9.800424122015802e-05,
"loss": 1.0100153923034667,
"mean_token_accuracy": 0.7664023399353027,
"num_tokens": 5209976.0,
"step": 1120
},
{
"entropy": 0.960879421979189,
"epoch": 0.24106666666666668,
"grad_norm": 0.2650390565395355,
"learning_rate": 9.79555440968751e-05,
"loss": 1.0518557548522949,
"mean_token_accuracy": 0.7670834749937058,
"num_tokens": 5257491.0,
"step": 1130
},
{
"entropy": 1.0790555529296397,
"epoch": 0.2432,
"grad_norm": 0.2658112645149231,
"learning_rate": 9.790627239537144e-05,
"loss": 1.2411640167236329,
"mean_token_accuracy": 0.7390920028090477,
"num_tokens": 5302986.0,
"step": 1140
},
{
"entropy": 0.9476075693964958,
"epoch": 0.24533333333333332,
"grad_norm": 0.2797292470932007,
"learning_rate": 9.785642670599479e-05,
"loss": 1.0823354721069336,
"mean_token_accuracy": 0.765032921731472,
"num_tokens": 5348946.0,
"step": 1150
},
{
"entropy": 0.8997275173664093,
"epoch": 0.24746666666666667,
"grad_norm": 0.21643367409706116,
"learning_rate": 9.780600762597005e-05,
"loss": 0.9590452194213868,
"mean_token_accuracy": 0.7714703544974327,
"num_tokens": 5393421.0,
"step": 1160
},
{
"entropy": 1.0325972460210324,
"epoch": 0.2496,
"grad_norm": 0.26943519711494446,
"learning_rate": 9.775501575939227e-05,
"loss": 1.0748598098754882,
"mean_token_accuracy": 0.7444046661257744,
"num_tokens": 5443639.0,
"step": 1170
},
{
"entropy": 0.9544263079762458,
"epoch": 0.2517333333333333,
"grad_norm": 0.28337323665618896,
"learning_rate": 9.770345171721929e-05,
"loss": 1.0133567810058595,
"mean_token_accuracy": 0.7597964540123939,
"num_tokens": 5494552.0,
"step": 1180
},
{
"entropy": 0.9211089439690113,
"epoch": 0.2538666666666667,
"grad_norm": 0.23862819373607635,
"learning_rate": 9.765131611726446e-05,
"loss": 0.9742342948913574,
"mean_token_accuracy": 0.7758729934692383,
"num_tokens": 5540743.0,
"step": 1190
},
{
"entropy": 1.0495109014213084,
"epoch": 0.256,
"grad_norm": 0.2335851490497589,
"learning_rate": 9.759860958418926e-05,
"loss": 1.1212799072265625,
"mean_token_accuracy": 0.7419089064002037,
"num_tokens": 5592981.0,
"step": 1200
},
{
"entropy": 1.115493653714657,
"epoch": 0.2581333333333333,
"grad_norm": 0.2895198464393616,
"learning_rate": 9.754533274949575e-05,
"loss": 1.2449783325195312,
"mean_token_accuracy": 0.7372826255857945,
"num_tokens": 5637868.0,
"step": 1210
},
{
"entropy": 0.9640243485569954,
"epoch": 0.26026666666666665,
"grad_norm": 0.23221854865550995,
"learning_rate": 9.749148625151908e-05,
"loss": 1.0559998512268067,
"mean_token_accuracy": 0.7624265968799591,
"num_tokens": 5684479.0,
"step": 1220
},
{
"entropy": 0.9518351659178734,
"epoch": 0.2624,
"grad_norm": 0.2750278413295746,
"learning_rate": 9.743707073541978e-05,
"loss": 1.0608203887939454,
"mean_token_accuracy": 0.7580445930361748,
"num_tokens": 5726626.0,
"step": 1230
},
{
"entropy": 1.0462738864123822,
"epoch": 0.26453333333333334,
"grad_norm": 0.21800817549228668,
"learning_rate": 9.738208685317611e-05,
"loss": 1.172841453552246,
"mean_token_accuracy": 0.7489309534430504,
"num_tokens": 5776891.0,
"step": 1240
},
{
"entropy": 0.9446908816695213,
"epoch": 0.26666666666666666,
"grad_norm": 0.2996140718460083,
"learning_rate": 9.732653526357612e-05,
"loss": 1.0297443389892578,
"mean_token_accuracy": 0.7673991709947586,
"num_tokens": 5817721.0,
"step": 1250
},
{
"entropy": 1.0725971311330795,
"epoch": 0.2688,
"grad_norm": 0.25890064239501953,
"learning_rate": 9.727041663220989e-05,
"loss": 1.2022390365600586,
"mean_token_accuracy": 0.7427652187645435,
"num_tokens": 5864952.0,
"step": 1260
},
{
"entropy": 1.0392154708504677,
"epoch": 0.27093333333333336,
"grad_norm": 0.2563996911048889,
"learning_rate": 9.721373163146148e-05,
"loss": 1.1769997596740722,
"mean_token_accuracy": 0.7434597261250019,
"num_tokens": 5910222.0,
"step": 1270
},
{
"entropy": 0.9681669734418392,
"epoch": 0.2730666666666667,
"grad_norm": 0.2539491653442383,
"learning_rate": 9.715648094050087e-05,
"loss": 1.0889246940612793,
"mean_token_accuracy": 0.7573506951332092,
"num_tokens": 5958487.0,
"step": 1280
},
{
"entropy": 0.9369139075279236,
"epoch": 0.2752,
"grad_norm": 0.29088094830513,
"learning_rate": 9.709866524527588e-05,
"loss": 1.0017178535461426,
"mean_token_accuracy": 0.7683532252907753,
"num_tokens": 6002949.0,
"step": 1290
},
{
"entropy": 1.0060899198055266,
"epoch": 0.2773333333333333,
"grad_norm": 0.2669098675251007,
"learning_rate": 9.704028523850392e-05,
"loss": 1.0900717735290528,
"mean_token_accuracy": 0.7483311414718627,
"num_tokens": 6053462.0,
"step": 1300
},
{
"entropy": 0.9726419121026992,
"epoch": 0.27946666666666664,
"grad_norm": 0.2472907453775406,
"learning_rate": 9.698134161966363e-05,
"loss": 1.1085708618164063,
"mean_token_accuracy": 0.7585345059633255,
"num_tokens": 6103810.0,
"step": 1310
},
{
"entropy": 0.9410715244710446,
"epoch": 0.2816,
"grad_norm": 0.2315952181816101,
"learning_rate": 9.692183509498659e-05,
"loss": 1.04432373046875,
"mean_token_accuracy": 0.7650494173169136,
"num_tokens": 6149332.0,
"step": 1320
},
{
"entropy": 0.9582152135670186,
"epoch": 0.28373333333333334,
"grad_norm": 0.28324106335639954,
"learning_rate": 9.686176637744884e-05,
"loss": 1.0879361152648925,
"mean_token_accuracy": 0.7590656578540802,
"num_tokens": 6193221.0,
"step": 1330
},
{
"entropy": 0.8448746472597122,
"epoch": 0.28586666666666666,
"grad_norm": 0.27708905935287476,
"learning_rate": 9.680113618676229e-05,
"loss": 0.9311987876892089,
"mean_token_accuracy": 0.7856365889310837,
"num_tokens": 6238920.0,
"step": 1340
},
{
"entropy": 0.9729965463280678,
"epoch": 0.288,
"grad_norm": 0.2840976119041443,
"learning_rate": 9.673994524936615e-05,
"loss": 1.0606694221496582,
"mean_token_accuracy": 0.7592033997178078,
"num_tokens": 6286552.0,
"step": 1350
},
{
"entropy": 0.9429876990616322,
"epoch": 0.29013333333333335,
"grad_norm": 0.2964062988758087,
"learning_rate": 9.667819429841817e-05,
"loss": 1.0691003799438477,
"mean_token_accuracy": 0.7747758001089096,
"num_tokens": 6331573.0,
"step": 1360
},
{
"entropy": 1.0872648723423481,
"epoch": 0.2922666666666667,
"grad_norm": 0.2675784230232239,
"learning_rate": 9.661588407378596e-05,
"loss": 1.2234331130981446,
"mean_token_accuracy": 0.7345306605100632,
"num_tokens": 6383986.0,
"step": 1370
},
{
"entropy": 0.8609121344983578,
"epoch": 0.2944,
"grad_norm": 0.22704993188381195,
"learning_rate": 9.655301532203797e-05,
"loss": 0.934294605255127,
"mean_token_accuracy": 0.7782075613737106,
"num_tokens": 6426867.0,
"step": 1380
},
{
"entropy": 0.8974170722067356,
"epoch": 0.2965333333333333,
"grad_norm": 0.2712918221950531,
"learning_rate": 9.648958879643467e-05,
"loss": 0.9794875144958496,
"mean_token_accuracy": 0.768825002014637,
"num_tokens": 6470272.0,
"step": 1390
},
{
"entropy": 0.9115433268249035,
"epoch": 0.2986666666666667,
"grad_norm": 0.2932375371456146,
"learning_rate": 9.642560525691948e-05,
"loss": 0.9729208946228027,
"mean_token_accuracy": 0.7670300453901291,
"num_tokens": 6513165.0,
"step": 1400
},
{
"entropy": 0.8882284432649612,
"epoch": 0.3008,
"grad_norm": 0.28009793162345886,
"learning_rate": 9.63610654701097e-05,
"loss": 0.9678432464599609,
"mean_token_accuracy": 0.7733970731496811,
"num_tokens": 6556342.0,
"step": 1410
},
{
"entropy": 0.8777762867510319,
"epoch": 0.30293333333333333,
"grad_norm": 0.23763342201709747,
"learning_rate": 9.629597020928722e-05,
"loss": 0.9687582969665527,
"mean_token_accuracy": 0.7792320027947426,
"num_tokens": 6601597.0,
"step": 1420
},
{
"entropy": 0.9160067647695541,
"epoch": 0.30506666666666665,
"grad_norm": 0.31182846426963806,
"learning_rate": 9.623032025438939e-05,
"loss": 0.9827415466308593,
"mean_token_accuracy": 0.7696192592382431,
"num_tokens": 6649545.0,
"step": 1430
},
{
"entropy": 0.9564003840088844,
"epoch": 0.3072,
"grad_norm": 0.23129577934741974,
"learning_rate": 9.61641163919996e-05,
"loss": 1.0380489349365234,
"mean_token_accuracy": 0.7621255874633789,
"num_tokens": 6696196.0,
"step": 1440
},
{
"entropy": 1.05502300709486,
"epoch": 0.30933333333333335,
"grad_norm": 0.27563098073005676,
"learning_rate": 9.609735941533788e-05,
"loss": 1.150872802734375,
"mean_token_accuracy": 0.7396457836031913,
"num_tokens": 6745953.0,
"step": 1450
},
{
"entropy": 0.9179727308452129,
"epoch": 0.31146666666666667,
"grad_norm": 0.2697086036205292,
"learning_rate": 9.603005012425135e-05,
"loss": 1.0098464012145996,
"mean_token_accuracy": 0.7658590793609619,
"num_tokens": 6794497.0,
"step": 1460
},
{
"entropy": 1.1471355110406876,
"epoch": 0.3136,
"grad_norm": 0.2717576026916504,
"learning_rate": 9.596218932520468e-05,
"loss": 1.2438765525817872,
"mean_token_accuracy": 0.7290126971900464,
"num_tokens": 6841976.0,
"step": 1470
},
{
"entropy": 0.9211908876895905,
"epoch": 0.3157333333333333,
"grad_norm": 0.23845043778419495,
"learning_rate": 9.589377783127047e-05,
"loss": 1.0137989044189453,
"mean_token_accuracy": 0.767640671133995,
"num_tokens": 6882780.0,
"step": 1480
},
{
"entropy": 0.8538284994661808,
"epoch": 0.3178666666666667,
"grad_norm": 0.23966865241527557,
"learning_rate": 9.58248164621194e-05,
"loss": 0.908942985534668,
"mean_token_accuracy": 0.7838981494307518,
"num_tokens": 6922694.0,
"step": 1490
},
{
"entropy": 0.9744522646069527,
"epoch": 0.32,
"grad_norm": 0.27710849046707153,
"learning_rate": 9.575530604401051e-05,
"loss": 1.0989601135253906,
"mean_token_accuracy": 0.7573227554559707,
"num_tokens": 6972307.0,
"step": 1500
},
{
"entropy": 1.0386228650808333,
"epoch": 0.3221333333333333,
"grad_norm": 0.2898867428302765,
"learning_rate": 9.56852474097812e-05,
"loss": 1.1454790115356446,
"mean_token_accuracy": 0.7430783316493035,
"num_tokens": 7020126.0,
"step": 1510
},
{
"entropy": 0.9898950323462486,
"epoch": 0.32426666666666665,
"grad_norm": 0.2526203393936157,
"learning_rate": 9.561464139883737e-05,
"loss": 1.1016074180603028,
"mean_token_accuracy": 0.7571706652641297,
"num_tokens": 7068804.0,
"step": 1520
},
{
"entropy": 0.999041261523962,
"epoch": 0.3264,
"grad_norm": 0.24795448780059814,
"learning_rate": 9.554348885714326e-05,
"loss": 1.0913206100463868,
"mean_token_accuracy": 0.7498321965336799,
"num_tokens": 7120053.0,
"step": 1530
},
{
"entropy": 0.9470011726021766,
"epoch": 0.32853333333333334,
"grad_norm": 0.2526148557662964,
"learning_rate": 9.547179063721139e-05,
"loss": 1.0380746841430664,
"mean_token_accuracy": 0.7631630048155784,
"num_tokens": 7166553.0,
"step": 1540
},
{
"entropy": 0.9413013480603695,
"epoch": 0.33066666666666666,
"grad_norm": 0.2717132866382599,
"learning_rate": 9.539954759809226e-05,
"loss": 1.0266976356506348,
"mean_token_accuracy": 0.7626717418432236,
"num_tokens": 7211784.0,
"step": 1550
},
{
"entropy": 1.0694061882793904,
"epoch": 0.3328,
"grad_norm": 0.2959578335285187,
"learning_rate": 9.532676060536419e-05,
"loss": 1.1947439193725586,
"mean_token_accuracy": 0.7360827416181565,
"num_tokens": 7256909.0,
"step": 1560
},
{
"entropy": 1.100894968211651,
"epoch": 0.33493333333333336,
"grad_norm": 0.2543633282184601,
"learning_rate": 9.525343053112276e-05,
"loss": 1.2262410163879394,
"mean_token_accuracy": 0.7347081542015076,
"num_tokens": 7305883.0,
"step": 1570
},
{
"entropy": 0.887219874560833,
"epoch": 0.3370666666666667,
"grad_norm": 0.2346959412097931,
"learning_rate": 9.517955825397056e-05,
"loss": 0.9671891212463379,
"mean_token_accuracy": 0.7790701374411583,
"num_tokens": 7352371.0,
"step": 1580
},
{
"entropy": 0.8671779796481133,
"epoch": 0.3392,
"grad_norm": 0.32672831416130066,
"learning_rate": 9.510514465900653e-05,
"loss": 0.9251022338867188,
"mean_token_accuracy": 0.7759940758347511,
"num_tokens": 7393383.0,
"step": 1590
},
{
"entropy": 0.9520958945155144,
"epoch": 0.3413333333333333,
"grad_norm": 0.2607707679271698,
"learning_rate": 9.50301906378154e-05,
"loss": 1.0628732681274413,
"mean_token_accuracy": 0.7598425537347794,
"num_tokens": 7439670.0,
"step": 1600
},
{
"entropy": 1.0912248834967613,
"epoch": 0.34346666666666664,
"grad_norm": 0.2782971262931824,
"learning_rate": 9.495469708845701e-05,
"loss": 1.1914597511291505,
"mean_token_accuracy": 0.7388298735022545,
"num_tokens": 7485267.0,
"step": 1610
},
{
"entropy": 0.8908423334360123,
"epoch": 0.3456,
"grad_norm": 0.3009212613105774,
"learning_rate": 9.487866491545554e-05,
"loss": 0.9948520660400391,
"mean_token_accuracy": 0.7739639699459075,
"num_tokens": 7527918.0,
"step": 1620
},
{
"entropy": 1.0133905045688152,
"epoch": 0.34773333333333334,
"grad_norm": 0.22292616963386536,
"learning_rate": 9.480209502978869e-05,
"loss": 1.0447552680969239,
"mean_token_accuracy": 0.7502199374139309,
"num_tokens": 7573700.0,
"step": 1630
},
{
"entropy": 0.9277701899409294,
"epoch": 0.34986666666666666,
"grad_norm": 0.25027042627334595,
"learning_rate": 9.472498834887671e-05,
"loss": 0.9662016868591309,
"mean_token_accuracy": 0.769710011780262,
"num_tokens": 7619772.0,
"step": 1640
},
{
"entropy": 0.9953067198395729,
"epoch": 0.352,
"grad_norm": 0.2255147099494934,
"learning_rate": 9.46473457965715e-05,
"loss": 1.1140392303466797,
"mean_token_accuracy": 0.7478319302201271,
"num_tokens": 7666080.0,
"step": 1650
},
{
"entropy": 0.9726508021354675,
"epoch": 0.35413333333333336,
"grad_norm": 0.23199768364429474,
"learning_rate": 9.456916830314548e-05,
"loss": 1.043684482574463,
"mean_token_accuracy": 0.7573757611215115,
"num_tokens": 7717686.0,
"step": 1660
},
{
"entropy": 0.8418352358043194,
"epoch": 0.3562666666666667,
"grad_norm": 0.2636827826499939,
"learning_rate": 9.449045680528041e-05,
"loss": 0.9572833061218262,
"mean_token_accuracy": 0.7885566264390945,
"num_tokens": 7760984.0,
"step": 1670
},
{
"entropy": 0.9119837798178196,
"epoch": 0.3584,
"grad_norm": 0.2830665111541748,
"learning_rate": 9.441121224605629e-05,
"loss": 0.9864655494689941,
"mean_token_accuracy": 0.7652302265167237,
"num_tokens": 7806335.0,
"step": 1680
},
{
"entropy": 0.833198781311512,
"epoch": 0.3605333333333333,
"grad_norm": 0.2658802568912506,
"learning_rate": 9.43314355749399e-05,
"loss": 0.8842006683349609,
"mean_token_accuracy": 0.7828017815947532,
"num_tokens": 7846783.0,
"step": 1690
},
{
"entropy": 0.9806767851114273,
"epoch": 0.3626666666666667,
"grad_norm": 0.23127029836177826,
"learning_rate": 9.425112774777354e-05,
"loss": 1.116124439239502,
"mean_token_accuracy": 0.7566492781043053,
"num_tokens": 7890665.0,
"step": 1700
},
{
"entropy": 0.8920052781701088,
"epoch": 0.3648,
"grad_norm": 0.22740185260772705,
"learning_rate": 9.417028972676359e-05,
"loss": 1.005050277709961,
"mean_token_accuracy": 0.7736168324947357,
"num_tokens": 7933849.0,
"step": 1710
},
{
"entropy": 1.0065233081579208,
"epoch": 0.36693333333333333,
"grad_norm": 0.34241342544555664,
"learning_rate": 9.408892248046885e-05,
"loss": 1.1022210121154785,
"mean_token_accuracy": 0.7556669354438782,
"num_tokens": 7981235.0,
"step": 1720
},
{
"entropy": 0.9596796780824661,
"epoch": 0.36906666666666665,
"grad_norm": 0.24347490072250366,
"learning_rate": 9.40070269837891e-05,
"loss": 1.058722686767578,
"mean_token_accuracy": 0.7606610782444477,
"num_tokens": 8028778.0,
"step": 1730
},
{
"entropy": 0.8785062082111835,
"epoch": 0.3712,
"grad_norm": 0.20483236014842987,
"learning_rate": 9.392460421795328e-05,
"loss": 0.9666107177734375,
"mean_token_accuracy": 0.7833232149481774,
"num_tokens": 8071410.0,
"step": 1740
},
{
"entropy": 0.8095596194267273,
"epoch": 0.37333333333333335,
"grad_norm": 0.36941683292388916,
"learning_rate": 9.38416551705078e-05,
"loss": 0.8782508850097657,
"mean_token_accuracy": 0.7895680025219918,
"num_tokens": 8113695.0,
"step": 1750
},
{
"entropy": 0.9545476101338863,
"epoch": 0.37546666666666667,
"grad_norm": 0.23256246745586395,
"learning_rate": 9.375818083530474e-05,
"loss": 1.0102588653564453,
"mean_token_accuracy": 0.7568591669201851,
"num_tokens": 8161702.0,
"step": 1760
},
{
"entropy": 0.8932637803256511,
"epoch": 0.3776,
"grad_norm": 0.2185504138469696,
"learning_rate": 9.367418221248989e-05,
"loss": 0.9898091316223144,
"mean_token_accuracy": 0.7799834325909615,
"num_tokens": 8205315.0,
"step": 1770
},
{
"entropy": 0.9569453194737434,
"epoch": 0.3797333333333333,
"grad_norm": 0.34778159856796265,
"learning_rate": 9.358966030849072e-05,
"loss": 1.0686969757080078,
"mean_token_accuracy": 0.7559246391057968,
"num_tokens": 8252919.0,
"step": 1780
},
{
"entropy": 1.029954968392849,
"epoch": 0.3818666666666667,
"grad_norm": 0.26953041553497314,
"learning_rate": 9.350461613600449e-05,
"loss": 1.1169189453125,
"mean_token_accuracy": 0.746114706993103,
"num_tokens": 8300355.0,
"step": 1790
},
{
"entropy": 0.943967767059803,
"epoch": 0.384,
"grad_norm": 0.2434380203485489,
"learning_rate": 9.34190507139859e-05,
"loss": 0.9968074798583985,
"mean_token_accuracy": 0.7640544638037682,
"num_tokens": 8346150.0,
"step": 1800
},
{
"entropy": 0.960440730303526,
"epoch": 0.38613333333333333,
"grad_norm": 0.2441101223230362,
"learning_rate": 9.333296506763505e-05,
"loss": 1.059675121307373,
"mean_token_accuracy": 0.7603183135390281,
"num_tokens": 8393408.0,
"step": 1810
},
{
"entropy": 0.8498699732124806,
"epoch": 0.38826666666666665,
"grad_norm": 0.2330222874879837,
"learning_rate": 9.324636022838509e-05,
"loss": 0.9152523040771484,
"mean_token_accuracy": 0.7780576214194298,
"num_tokens": 8440947.0,
"step": 1820
},
{
"entropy": 0.9348598286509514,
"epoch": 0.3904,
"grad_norm": 0.25513678789138794,
"learning_rate": 9.315923723388986e-05,
"loss": 1.041547966003418,
"mean_token_accuracy": 0.7705774419009686,
"num_tokens": 8486476.0,
"step": 1830
},
{
"entropy": 0.8954004615545272,
"epoch": 0.39253333333333335,
"grad_norm": 0.24745526909828186,
"learning_rate": 9.307159712801147e-05,
"loss": 0.9804242134094239,
"mean_token_accuracy": 0.773698341846466,
"num_tokens": 8533216.0,
"step": 1840
},
{
"entropy": 0.9542841285467147,
"epoch": 0.39466666666666667,
"grad_norm": 0.3648325800895691,
"learning_rate": 9.298344096080776e-05,
"loss": 1.0262937545776367,
"mean_token_accuracy": 0.7662550717592239,
"num_tokens": 8579411.0,
"step": 1850
},
{
"entropy": 0.820501434057951,
"epoch": 0.3968,
"grad_norm": 0.26821690797805786,
"learning_rate": 9.289476978851976e-05,
"loss": 0.8598980903625488,
"mean_token_accuracy": 0.7888635769486427,
"num_tokens": 8623046.0,
"step": 1860
},
{
"entropy": 0.9134793929755688,
"epoch": 0.3989333333333333,
"grad_norm": 0.25643301010131836,
"learning_rate": 9.280558467355907e-05,
"loss": 1.0032004356384276,
"mean_token_accuracy": 0.7722208425402641,
"num_tokens": 8665201.0,
"step": 1870
},
{
"entropy": 0.9829332500696182,
"epoch": 0.4010666666666667,
"grad_norm": 0.27002307772636414,
"learning_rate": 9.271588668449503e-05,
"loss": 1.1377047538757323,
"mean_token_accuracy": 0.7662092931568623,
"num_tokens": 8707004.0,
"step": 1880
},
{
"entropy": 1.0146782457828523,
"epoch": 0.4032,
"grad_norm": 0.2506401240825653,
"learning_rate": 9.262567689604195e-05,
"loss": 1.124861240386963,
"mean_token_accuracy": 0.7566848322749138,
"num_tokens": 8753713.0,
"step": 1890
},
{
"entropy": 1.0487395107746125,
"epoch": 0.4053333333333333,
"grad_norm": 0.24727876484394073,
"learning_rate": 9.25349563890463e-05,
"loss": 1.254913330078125,
"mean_token_accuracy": 0.7382492840290069,
"num_tokens": 8801703.0,
"step": 1900
},
{
"entropy": 0.8892258107662201,
"epoch": 0.40746666666666664,
"grad_norm": 0.2835586667060852,
"learning_rate": 9.244372625047372e-05,
"loss": 0.9624475479125977,
"mean_token_accuracy": 0.7717373371124268,
"num_tokens": 8848191.0,
"step": 1910
},
{
"entropy": 0.9389666721224785,
"epoch": 0.4096,
"grad_norm": 0.2468794584274292,
"learning_rate": 9.235198757339594e-05,
"loss": 1.0801708221435546,
"mean_token_accuracy": 0.7686146914958953,
"num_tokens": 8895860.0,
"step": 1920
},
{
"entropy": 0.9686854526400566,
"epoch": 0.41173333333333334,
"grad_norm": 0.24529697000980377,
"learning_rate": 9.225974145697775e-05,
"loss": 1.062663745880127,
"mean_token_accuracy": 0.7554567009210587,
"num_tokens": 8945814.0,
"step": 1930
},
{
"entropy": 0.9051942273974418,
"epoch": 0.41386666666666666,
"grad_norm": 0.3458064794540405,
"learning_rate": 9.216698900646383e-05,
"loss": 0.9697030067443848,
"mean_token_accuracy": 0.7712534263730049,
"num_tokens": 8990326.0,
"step": 1940
},
{
"entropy": 1.0142885446548462,
"epoch": 0.416,
"grad_norm": 0.25167402625083923,
"learning_rate": 9.20737313331655e-05,
"loss": 1.0986696243286134,
"mean_token_accuracy": 0.7525161564350128,
"num_tokens": 9037859.0,
"step": 1950
},
{
"entropy": 1.0254975706338882,
"epoch": 0.41813333333333336,
"grad_norm": 0.2604680061340332,
"learning_rate": 9.197996955444732e-05,
"loss": 1.0939658164978028,
"mean_token_accuracy": 0.7426734983921051,
"num_tokens": 9088894.0,
"step": 1960
},
{
"entropy": 0.9542628638446331,
"epoch": 0.4202666666666667,
"grad_norm": 0.2736513614654541,
"learning_rate": 9.188570479371387e-05,
"loss": 1.14229097366333,
"mean_token_accuracy": 0.7659218199551105,
"num_tokens": 9133717.0,
"step": 1970
},
{
"entropy": 0.9707967802882195,
"epoch": 0.4224,
"grad_norm": 0.2167668491601944,
"learning_rate": 9.179093818039616e-05,
"loss": 1.077283763885498,
"mean_token_accuracy": 0.7643323555588722,
"num_tokens": 9178671.0,
"step": 1980
},
{
"entropy": 0.9660324349999427,
"epoch": 0.4245333333333333,
"grad_norm": 0.35394829511642456,
"learning_rate": 9.169567084993814e-05,
"loss": 1.083635139465332,
"mean_token_accuracy": 0.7617688804864884,
"num_tokens": 9221289.0,
"step": 1990
},
{
"entropy": 0.9027157455682755,
"epoch": 0.4266666666666667,
"grad_norm": 0.2674417495727539,
"learning_rate": 9.159990394378303e-05,
"loss": 0.980889892578125,
"mean_token_accuracy": 0.7698795303702355,
"num_tokens": 9268892.0,
"step": 2000
},
{
"entropy": 0.952321158349514,
"epoch": 0.4288,
"grad_norm": 0.2332596629858017,
"learning_rate": 9.15036386093598e-05,
"loss": 1.0610048294067382,
"mean_token_accuracy": 0.7620892718434333,
"num_tokens": 9317142.0,
"step": 2010
},
{
"entropy": 0.8780575722455979,
"epoch": 0.43093333333333333,
"grad_norm": 0.2631659209728241,
"learning_rate": 9.140687600006929e-05,
"loss": 0.983364200592041,
"mean_token_accuracy": 0.7832586973905563,
"num_tokens": 9360075.0,
"step": 2020
},
{
"entropy": 0.9146695531904697,
"epoch": 0.43306666666666666,
"grad_norm": 0.33491751551628113,
"learning_rate": 9.13096172752704e-05,
"loss": 0.9856008529663086,
"mean_token_accuracy": 0.7726532012224198,
"num_tokens": 9396928.0,
"step": 2030
},
{
"entropy": 1.0050749816000462,
"epoch": 0.4352,
"grad_norm": 0.30352818965911865,
"learning_rate": 9.121186360026625e-05,
"loss": 1.0613908767700195,
"mean_token_accuracy": 0.7496299520134926,
"num_tokens": 9449045.0,
"step": 2040
},
{
"entropy": 0.919689030200243,
"epoch": 0.43733333333333335,
"grad_norm": 0.33956724405288696,
"learning_rate": 9.111361614629022e-05,
"loss": 0.9860305786132812,
"mean_token_accuracy": 0.7708889573812485,
"num_tokens": 9498718.0,
"step": 2050
},
{
"entropy": 0.9274381674826145,
"epoch": 0.43946666666666667,
"grad_norm": 0.29058581590652466,
"learning_rate": 9.101487609049181e-05,
"loss": 0.9976702690124511,
"mean_token_accuracy": 0.7693670354783535,
"num_tokens": 9537791.0,
"step": 2060
},
{
"entropy": 0.917995036393404,
"epoch": 0.4416,
"grad_norm": 0.22210952639579773,
"learning_rate": 9.091564461592274e-05,
"loss": 1.01414155960083,
"mean_token_accuracy": 0.7708904504776001,
"num_tokens": 9581893.0,
"step": 2070
},
{
"entropy": 0.8260227806866169,
"epoch": 0.4437333333333333,
"grad_norm": 0.31099778413772583,
"learning_rate": 9.081592291152252e-05,
"loss": 0.9264348983764649,
"mean_token_accuracy": 0.7857892021536828,
"num_tokens": 9624465.0,
"step": 2080
},
{
"entropy": 0.9573663167655468,
"epoch": 0.4458666666666667,
"grad_norm": 0.2500694692134857,
"learning_rate": 9.071571217210443e-05,
"loss": 1.042325496673584,
"mean_token_accuracy": 0.7585179045796394,
"num_tokens": 9670927.0,
"step": 2090
},
{
"entropy": 0.8527260765433311,
"epoch": 0.448,
"grad_norm": 0.22426480054855347,
"learning_rate": 9.061501359834108e-05,
"loss": 0.9534576416015625,
"mean_token_accuracy": 0.7796914517879486,
"num_tokens": 9717399.0,
"step": 2100
},
{
"entropy": 1.0308616764843463,
"epoch": 0.45013333333333333,
"grad_norm": 0.2896214723587036,
"learning_rate": 9.051382839675005e-05,
"loss": 1.1293525695800781,
"mean_token_accuracy": 0.7496985673904419,
"num_tokens": 9766859.0,
"step": 2110
},
{
"entropy": 1.0453794084489345,
"epoch": 0.45226666666666665,
"grad_norm": 0.21886104345321655,
"learning_rate": 9.041215777967945e-05,
"loss": 1.128882598876953,
"mean_token_accuracy": 0.7480149149894715,
"num_tokens": 9821214.0,
"step": 2120
},
{
"entropy": 1.002899456769228,
"epoch": 0.4544,
"grad_norm": 0.24397237598896027,
"learning_rate": 9.031000296529336e-05,
"loss": 1.0722038269042968,
"mean_token_accuracy": 0.7499327704310417,
"num_tokens": 9873482.0,
"step": 2130
},
{
"entropy": 0.9601933643221855,
"epoch": 0.45653333333333335,
"grad_norm": 0.249656543135643,
"learning_rate": 9.020736517755733e-05,
"loss": 1.0663026809692382,
"mean_token_accuracy": 0.7673163414001465,
"num_tokens": 9924229.0,
"step": 2140
},
{
"entropy": 1.1109409362077713,
"epoch": 0.45866666666666667,
"grad_norm": 0.2716203033924103,
"learning_rate": 9.010424564622353e-05,
"loss": 1.1743658065795899,
"mean_token_accuracy": 0.7358616881072522,
"num_tokens": 9972409.0,
"step": 2150
},
{
"entropy": 0.9736781157553196,
"epoch": 0.4608,
"grad_norm": 0.32094913721084595,
"learning_rate": 9.000064560681625e-05,
"loss": 1.082399559020996,
"mean_token_accuracy": 0.7566891044378281,
"num_tokens": 10020194.0,
"step": 2160
},
{
"entropy": 0.961787448823452,
"epoch": 0.4629333333333333,
"grad_norm": 0.2845711410045624,
"learning_rate": 8.98965663006169e-05,
"loss": 1.0939053535461425,
"mean_token_accuracy": 0.7606314912438392,
"num_tokens": 10067865.0,
"step": 2170
},
{
"entropy": 0.8467822209000587,
"epoch": 0.4650666666666667,
"grad_norm": 0.26253893971443176,
"learning_rate": 8.979200897464921e-05,
"loss": 0.9104162216186523,
"mean_token_accuracy": 0.7799090445041656,
"num_tokens": 10112591.0,
"step": 2180
},
{
"entropy": 0.880243568867445,
"epoch": 0.4672,
"grad_norm": 0.21990089118480682,
"learning_rate": 8.968697488166435e-05,
"loss": 0.9580593109130859,
"mean_token_accuracy": 0.7760828763246537,
"num_tokens": 10159849.0,
"step": 2190
},
{
"entropy": 0.9931552834808827,
"epoch": 0.4693333333333333,
"grad_norm": 0.2644927501678467,
"learning_rate": 8.95814652801258e-05,
"loss": 1.1035637855529785,
"mean_token_accuracy": 0.755839766561985,
"num_tokens": 10210375.0,
"step": 2200
},
{
"entropy": 0.9833620935678482,
"epoch": 0.47146666666666665,
"grad_norm": 0.31400638818740845,
"learning_rate": 8.947548143419437e-05,
"loss": 1.0525406837463378,
"mean_token_accuracy": 0.7570377990603447,
"num_tokens": 10258067.0,
"step": 2210
},
{
"entropy": 1.0285537526011468,
"epoch": 0.4736,
"grad_norm": 0.18883784115314484,
"learning_rate": 8.936902461371302e-05,
"loss": 1.1230335235595703,
"mean_token_accuracy": 0.7476231440901756,
"num_tokens": 10308406.0,
"step": 2220
},
{
"entropy": 1.0317844092845916,
"epoch": 0.47573333333333334,
"grad_norm": 0.23253889381885529,
"learning_rate": 8.926209609419165e-05,
"loss": 1.1079911231994628,
"mean_token_accuracy": 0.7472389042377472,
"num_tokens": 10361473.0,
"step": 2230
},
{
"entropy": 1.0787047080695629,
"epoch": 0.47786666666666666,
"grad_norm": 0.21470828354358673,
"learning_rate": 8.915469715679175e-05,
"loss": 1.2236790657043457,
"mean_token_accuracy": 0.7383568711578846,
"num_tokens": 10412214.0,
"step": 2240
},
{
"entropy": 0.9707687653601169,
"epoch": 0.48,
"grad_norm": 0.2584036588668823,
"learning_rate": 8.904682908831119e-05,
"loss": 1.056828212738037,
"mean_token_accuracy": 0.7611462950706482,
"num_tokens": 10457804.0,
"step": 2250
},
{
"entropy": 0.8710766941308975,
"epoch": 0.48213333333333336,
"grad_norm": 0.23925144970417023,
"learning_rate": 8.893849318116868e-05,
"loss": 0.9463179588317872,
"mean_token_accuracy": 0.7805656388401985,
"num_tokens": 10496592.0,
"step": 2260
},
{
"entropy": 1.0050086982548236,
"epoch": 0.4842666666666667,
"grad_norm": 0.31434357166290283,
"learning_rate": 8.882969073338833e-05,
"loss": 1.1325186729431151,
"mean_token_accuracy": 0.7497815892100335,
"num_tokens": 10546937.0,
"step": 2270
},
{
"entropy": 0.9999729461967946,
"epoch": 0.4864,
"grad_norm": 0.23635436594486237,
"learning_rate": 8.872042304858412e-05,
"loss": 1.0858405113220215,
"mean_token_accuracy": 0.7575721621513367,
"num_tokens": 10598042.0,
"step": 2280
},
{
"entropy": 0.9136553466320038,
"epoch": 0.4885333333333333,
"grad_norm": 0.2523214519023895,
"learning_rate": 8.861069143594423e-05,
"loss": 0.9898375511169434,
"mean_token_accuracy": 0.7727992206811904,
"num_tokens": 10640977.0,
"step": 2290
},
{
"entropy": 0.912187123298645,
"epoch": 0.49066666666666664,
"grad_norm": 0.2530852258205414,
"learning_rate": 8.850049721021537e-05,
"loss": 1.026146125793457,
"mean_token_accuracy": 0.7669046297669411,
"num_tokens": 10691324.0,
"step": 2300
},
{
"entropy": 0.9109658055007458,
"epoch": 0.4928,
"grad_norm": 0.2615504264831543,
"learning_rate": 8.838984169168708e-05,
"loss": 0.9777699470520019,
"mean_token_accuracy": 0.7721924617886543,
"num_tokens": 10736949.0,
"step": 2310
},
{
"entropy": 0.9462881706655025,
"epoch": 0.49493333333333334,
"grad_norm": 0.30475521087646484,
"learning_rate": 8.827872620617584e-05,
"loss": 1.0332406044006348,
"mean_token_accuracy": 0.7597260788083077,
"num_tokens": 10785226.0,
"step": 2320
},
{
"entropy": 0.8475333206355572,
"epoch": 0.49706666666666666,
"grad_norm": 0.30574989318847656,
"learning_rate": 8.816715208500922e-05,
"loss": 0.9416275978088379,
"mean_token_accuracy": 0.7838068321347237,
"num_tokens": 10830340.0,
"step": 2330
},
{
"entropy": 0.985354783385992,
"epoch": 0.4992,
"grad_norm": 0.2665145695209503,
"learning_rate": 8.805512066500992e-05,
"loss": 1.0852888107299805,
"mean_token_accuracy": 0.7492594376206398,
"num_tokens": 10876689.0,
"step": 2340
},
{
"entropy": 1.0402933657169342,
"epoch": 0.5013333333333333,
"grad_norm": 0.2032317966222763,
"learning_rate": 8.794263328847975e-05,
"loss": 1.1533799171447754,
"mean_token_accuracy": 0.7475039146840572,
"num_tokens": 10925395.0,
"step": 2350
},
{
"entropy": 1.0321477994322776,
"epoch": 0.5034666666666666,
"grad_norm": 0.2507781386375427,
"learning_rate": 8.782969130318358e-05,
"loss": 1.1364535331726073,
"mean_token_accuracy": 0.7528289645910263,
"num_tokens": 10973794.0,
"step": 2360
},
{
"entropy": 0.9620419174432755,
"epoch": 0.5056,
"grad_norm": 0.2626616656780243,
"learning_rate": 8.771629606233314e-05,
"loss": 1.0547003746032715,
"mean_token_accuracy": 0.7652165532112122,
"num_tokens": 11022983.0,
"step": 2370
},
{
"entropy": 0.945545606315136,
"epoch": 0.5077333333333334,
"grad_norm": 0.24525931477546692,
"learning_rate": 8.76024489245708e-05,
"loss": 0.9925324440002441,
"mean_token_accuracy": 0.7643568038940429,
"num_tokens": 11065494.0,
"step": 2380
},
{
"entropy": 1.0124794401228427,
"epoch": 0.5098666666666667,
"grad_norm": 0.2306586503982544,
"learning_rate": 8.74881512539534e-05,
"loss": 1.159686279296875,
"mean_token_accuracy": 0.7495349571108818,
"num_tokens": 11117249.0,
"step": 2390
},
{
"entropy": 1.0916487082839013,
"epoch": 0.512,
"grad_norm": 0.31891921162605286,
"learning_rate": 8.737340441993575e-05,
"loss": 1.1685538291931152,
"mean_token_accuracy": 0.7355501770973205,
"num_tokens": 11167106.0,
"step": 2400
},
{
"entropy": 0.9723750591278076,
"epoch": 0.5141333333333333,
"grad_norm": 0.3258633315563202,
"learning_rate": 8.725820979735436e-05,
"loss": 1.1157949447631836,
"mean_token_accuracy": 0.7603042379021645,
"num_tokens": 11211815.0,
"step": 2410
},
{
"entropy": 0.8673077188432217,
"epoch": 0.5162666666666667,
"grad_norm": 0.2602537274360657,
"learning_rate": 8.714256876641087e-05,
"loss": 0.9992271423339844,
"mean_token_accuracy": 0.7819159016013145,
"num_tokens": 11257215.0,
"step": 2420
},
{
"entropy": 1.030208633840084,
"epoch": 0.5184,
"grad_norm": 0.34487199783325195,
"learning_rate": 8.702648271265559e-05,
"loss": 1.1172332763671875,
"mean_token_accuracy": 0.7529917880892754,
"num_tokens": 11302807.0,
"step": 2430
},
{
"entropy": 1.0103901624679565,
"epoch": 0.5205333333333333,
"grad_norm": 0.3081832230091095,
"learning_rate": 8.690995302697081e-05,
"loss": 1.1663932800292969,
"mean_token_accuracy": 0.7516932919621467,
"num_tokens": 11350668.0,
"step": 2440
},
{
"entropy": 1.0364744253456593,
"epoch": 0.5226666666666666,
"grad_norm": 0.24485838413238525,
"learning_rate": 8.67929811055542e-05,
"loss": 1.100508689880371,
"mean_token_accuracy": 0.7447643153369427,
"num_tokens": 11403181.0,
"step": 2450
},
{
"entropy": 0.8236982800066471,
"epoch": 0.5248,
"grad_norm": 0.28031599521636963,
"learning_rate": 8.667556834990211e-05,
"loss": 0.9254312515258789,
"mean_token_accuracy": 0.7904586613178253,
"num_tokens": 11443565.0,
"step": 2460
},
{
"entropy": 0.9475759953260422,
"epoch": 0.5269333333333334,
"grad_norm": 0.2664242088794708,
"learning_rate": 8.65577161667927e-05,
"loss": 1.0038702011108398,
"mean_token_accuracy": 0.7648072987794876,
"num_tokens": 11484816.0,
"step": 2470
},
{
"entropy": 0.9790533006191253,
"epoch": 0.5290666666666667,
"grad_norm": 0.2507703900337219,
"learning_rate": 8.643942596826911e-05,
"loss": 1.1305276870727539,
"mean_token_accuracy": 0.7577844798564911,
"num_tokens": 11526756.0,
"step": 2480
},
{
"entropy": 0.929996844381094,
"epoch": 0.5312,
"grad_norm": 0.29296958446502686,
"learning_rate": 8.632069917162255e-05,
"loss": 1.0086584091186523,
"mean_token_accuracy": 0.7662723585963249,
"num_tokens": 11568739.0,
"step": 2490
},
{
"entropy": 0.9251207195222377,
"epoch": 0.5333333333333333,
"grad_norm": 0.23416800796985626,
"learning_rate": 8.620153719937535e-05,
"loss": 0.9998083114624023,
"mean_token_accuracy": 0.7651191413402557,
"num_tokens": 11616235.0,
"step": 2500
},
{
"entropy": 0.9308743372559547,
"epoch": 0.5354666666666666,
"grad_norm": 0.29301849007606506,
"learning_rate": 8.60819414792639e-05,
"loss": 1.0315680503845215,
"mean_token_accuracy": 0.7692675769329071,
"num_tokens": 11666713.0,
"step": 2510
},
{
"entropy": 0.995804300904274,
"epoch": 0.5376,
"grad_norm": 0.3097708821296692,
"learning_rate": 8.596191344422144e-05,
"loss": 1.072645664215088,
"mean_token_accuracy": 0.7509198278188706,
"num_tokens": 11710341.0,
"step": 2520
},
{
"entropy": 0.8292181946337223,
"epoch": 0.5397333333333333,
"grad_norm": 0.2717600166797638,
"learning_rate": 8.58414545323611e-05,
"loss": 0.9124368667602539,
"mean_token_accuracy": 0.789941257238388,
"num_tokens": 11753282.0,
"step": 2530
},
{
"entropy": 1.0006135009229182,
"epoch": 0.5418666666666667,
"grad_norm": 0.2360389232635498,
"learning_rate": 8.572056618695845e-05,
"loss": 1.1389695167541505,
"mean_token_accuracy": 0.7522218823432922,
"num_tokens": 11801605.0,
"step": 2540
},
{
"entropy": 1.0214832991361618,
"epoch": 0.544,
"grad_norm": 0.265127569437027,
"learning_rate": 8.559924985643436e-05,
"loss": 1.1042506217956543,
"mean_token_accuracy": 0.7507105216383934,
"num_tokens": 11847802.0,
"step": 2550
},
{
"entropy": 0.9052864700555802,
"epoch": 0.5461333333333334,
"grad_norm": 0.21368557214736938,
"learning_rate": 8.54775069943376e-05,
"loss": 0.9432812690734863,
"mean_token_accuracy": 0.7738461554050445,
"num_tokens": 11895685.0,
"step": 2560
},
{
"entropy": 0.8330970570445061,
"epoch": 0.5482666666666667,
"grad_norm": 0.25935855507850647,
"learning_rate": 8.535533905932738e-05,
"loss": 0.9135859489440918,
"mean_token_accuracy": 0.7793566673994065,
"num_tokens": 11940774.0,
"step": 2570
},
{
"entropy": 0.941501996666193,
"epoch": 0.5504,
"grad_norm": 0.2179393619298935,
"learning_rate": 8.523274751515595e-05,
"loss": 0.9954432487487793,
"mean_token_accuracy": 0.7663334146142006,
"num_tokens": 11987918.0,
"step": 2580
},
{
"entropy": 0.8678958520293236,
"epoch": 0.5525333333333333,
"grad_norm": 0.28909558057785034,
"learning_rate": 8.510973383065099e-05,
"loss": 0.9430976867675781,
"mean_token_accuracy": 0.7832803040742874,
"num_tokens": 12032447.0,
"step": 2590
},
{
"entropy": 0.981315091997385,
"epoch": 0.5546666666666666,
"grad_norm": 0.25849565863609314,
"learning_rate": 8.498629947969807e-05,
"loss": 1.135009765625,
"mean_token_accuracy": 0.7560776218771934,
"num_tokens": 12082379.0,
"step": 2600
},
{
"entropy": 0.904555281996727,
"epoch": 0.5568,
"grad_norm": 0.2646128535270691,
"learning_rate": 8.486244594122297e-05,
"loss": 1.0139558792114258,
"mean_token_accuracy": 0.7750694006681442,
"num_tokens": 12125503.0,
"step": 2610
},
{
"entropy": 0.9893216907978057,
"epoch": 0.5589333333333333,
"grad_norm": 0.2604218125343323,
"learning_rate": 8.47381746991739e-05,
"loss": 1.040914249420166,
"mean_token_accuracy": 0.7624668940901757,
"num_tokens": 12177100.0,
"step": 2620
},
{
"entropy": 0.9657470785081387,
"epoch": 0.5610666666666667,
"grad_norm": 0.2379087507724762,
"learning_rate": 8.461348724250384e-05,
"loss": 1.075094223022461,
"mean_token_accuracy": 0.7620343893766404,
"num_tokens": 12223113.0,
"step": 2630
},
{
"entropy": 0.9550898216664792,
"epoch": 0.5632,
"grad_norm": 0.25011542439460754,
"learning_rate": 8.448838506515262e-05,
"loss": 0.9950971603393555,
"mean_token_accuracy": 0.7567313954234123,
"num_tokens": 12274057.0,
"step": 2640
},
{
"entropy": 0.9988928638398648,
"epoch": 0.5653333333333334,
"grad_norm": 0.25330716371536255,
"learning_rate": 8.436286966602903e-05,
"loss": 1.0961190223693849,
"mean_token_accuracy": 0.7484898209571839,
"num_tokens": 12319765.0,
"step": 2650
},
{
"entropy": 0.9943435691297055,
"epoch": 0.5674666666666667,
"grad_norm": 0.22841870784759521,
"learning_rate": 8.423694254899283e-05,
"loss": 1.0581014633178711,
"mean_token_accuracy": 0.7553936064243316,
"num_tokens": 12364825.0,
"step": 2660
},
{
"entropy": 0.8084396831691265,
"epoch": 0.5696,
"grad_norm": 0.2357935756444931,
"learning_rate": 8.411060522283685e-05,
"loss": 0.8782732963562012,
"mean_token_accuracy": 0.7895583346486091,
"num_tokens": 12406871.0,
"step": 2670
},
{
"entropy": 0.942859411239624,
"epoch": 0.5717333333333333,
"grad_norm": 0.27082499861717224,
"learning_rate": 8.398385920126874e-05,
"loss": 1.0009033203125,
"mean_token_accuracy": 0.7650713473558426,
"num_tokens": 12451365.0,
"step": 2680
},
{
"entropy": 0.8713853091001511,
"epoch": 0.5738666666666666,
"grad_norm": 0.2360270470380783,
"learning_rate": 8.385670600289302e-05,
"loss": 0.9726097106933593,
"mean_token_accuracy": 0.7823046505451202,
"num_tokens": 12496420.0,
"step": 2690
},
{
"entropy": 0.9863057106733322,
"epoch": 0.576,
"grad_norm": 0.2574012279510498,
"learning_rate": 8.372914715119269e-05,
"loss": 1.0647315979003906,
"mean_token_accuracy": 0.7581366948783398,
"num_tokens": 12541298.0,
"step": 2700
},
{
"entropy": 1.001559029519558,
"epoch": 0.5781333333333334,
"grad_norm": 0.3141133487224579,
"learning_rate": 8.360118417451113e-05,
"loss": 1.1159303665161133,
"mean_token_accuracy": 0.7509155049920082,
"num_tokens": 12586250.0,
"step": 2710
},
{
"entropy": 0.9664455614984035,
"epoch": 0.5802666666666667,
"grad_norm": 0.2820577323436737,
"learning_rate": 8.347281860603375e-05,
"loss": 1.0676399230957032,
"mean_token_accuracy": 0.7622367069125175,
"num_tokens": 12635296.0,
"step": 2720
},
{
"entropy": 0.9387885488569736,
"epoch": 0.5824,
"grad_norm": 0.3290941119194031,
"learning_rate": 8.334405198376958e-05,
"loss": 1.0030738830566406,
"mean_token_accuracy": 0.7586753875017166,
"num_tokens": 12681486.0,
"step": 2730
},
{
"entropy": 0.9214616276323795,
"epoch": 0.5845333333333333,
"grad_norm": 0.2720118761062622,
"learning_rate": 8.321488585053285e-05,
"loss": 1.0133691787719727,
"mean_token_accuracy": 0.7658515647053719,
"num_tokens": 12728360.0,
"step": 2740
},
{
"entropy": 1.02061934620142,
"epoch": 0.5866666666666667,
"grad_norm": 0.2634807229042053,
"learning_rate": 8.308532175392456e-05,
"loss": 1.098531150817871,
"mean_token_accuracy": 0.7488874278962612,
"num_tokens": 12776988.0,
"step": 2750
},
{
"entropy": 0.8518887549638748,
"epoch": 0.5888,
"grad_norm": 0.2686164677143097,
"learning_rate": 8.295536124631385e-05,
"loss": 0.9314091682434082,
"mean_token_accuracy": 0.7844551488757133,
"num_tokens": 12821354.0,
"step": 2760
},
{
"entropy": 1.0409190684556962,
"epoch": 0.5909333333333333,
"grad_norm": 0.211833655834198,
"learning_rate": 8.28250058848195e-05,
"loss": 1.101518440246582,
"mean_token_accuracy": 0.7454183496534824,
"num_tokens": 12874281.0,
"step": 2770
},
{
"entropy": 0.8854779146611691,
"epoch": 0.5930666666666666,
"grad_norm": 0.19881795346736908,
"learning_rate": 8.26942572312912e-05,
"loss": 0.9492866516113281,
"mean_token_accuracy": 0.7723072916269302,
"num_tokens": 12921599.0,
"step": 2780
},
{
"entropy": 0.9700805857777596,
"epoch": 0.5952,
"grad_norm": 0.27477312088012695,
"learning_rate": 8.256311685229085e-05,
"loss": 1.1071263313293458,
"mean_token_accuracy": 0.7538750320672989,
"num_tokens": 12969810.0,
"step": 2790
},
{
"entropy": 0.9516462504863739,
"epoch": 0.5973333333333334,
"grad_norm": 0.27765893936157227,
"learning_rate": 8.243158631907382e-05,
"loss": 1.0368030548095704,
"mean_token_accuracy": 0.7694585740566253,
"num_tokens": 13015439.0,
"step": 2800
},
{
"entropy": 1.0109743446111679,
"epoch": 0.5994666666666667,
"grad_norm": 0.299020379781723,
"learning_rate": 8.229966720757007e-05,
"loss": 1.1124341011047363,
"mean_token_accuracy": 0.7469129160046577,
"num_tokens": 13064583.0,
"step": 2810
},
{
"entropy": 1.0561771862208844,
"epoch": 0.6016,
"grad_norm": 0.2526226043701172,
"learning_rate": 8.216736109836534e-05,
"loss": 1.1680998802185059,
"mean_token_accuracy": 0.7392091482877732,
"num_tokens": 13115184.0,
"step": 2820
},
{
"entropy": 0.9755672253668308,
"epoch": 0.6037333333333333,
"grad_norm": 0.24116092920303345,
"learning_rate": 8.203466957668215e-05,
"loss": 1.0671576499938964,
"mean_token_accuracy": 0.7598815195262432,
"num_tokens": 13163748.0,
"step": 2830
},
{
"entropy": 0.8110849797725678,
"epoch": 0.6058666666666667,
"grad_norm": 0.3148553967475891,
"learning_rate": 8.190159423236086e-05,
"loss": 0.8950259208679199,
"mean_token_accuracy": 0.7862283095717431,
"num_tokens": 13204391.0,
"step": 2840
},
{
"entropy": 0.829298897087574,
"epoch": 0.608,
"grad_norm": 0.2556048631668091,
"learning_rate": 8.176813665984053e-05,
"loss": 0.8883259773254395,
"mean_token_accuracy": 0.789163002371788,
"num_tokens": 13244838.0,
"step": 2850
},
{
"entropy": 0.9395963847637177,
"epoch": 0.6101333333333333,
"grad_norm": 0.19703006744384766,
"learning_rate": 8.163429845813997e-05,
"loss": 1.0494510650634765,
"mean_token_accuracy": 0.7710079193115235,
"num_tokens": 13290932.0,
"step": 2860
},
{
"entropy": 0.9920587949454784,
"epoch": 0.6122666666666666,
"grad_norm": 0.2381218671798706,
"learning_rate": 8.150008123083838e-05,
"loss": 1.0494998931884765,
"mean_token_accuracy": 0.7526131421327591,
"num_tokens": 13333787.0,
"step": 2870
},
{
"entropy": 0.9984497465193272,
"epoch": 0.6144,
"grad_norm": 0.25819751620292664,
"learning_rate": 8.136548658605635e-05,
"loss": 1.1107137680053711,
"mean_token_accuracy": 0.7557663440704345,
"num_tokens": 13382126.0,
"step": 2880
},
{
"entropy": 0.9907154351472854,
"epoch": 0.6165333333333334,
"grad_norm": 0.2328466922044754,
"learning_rate": 8.123051613643641e-05,
"loss": 1.1184075355529786,
"mean_token_accuracy": 0.7595549002289772,
"num_tokens": 13430083.0,
"step": 2890
},
{
"entropy": 0.9244011230766773,
"epoch": 0.6186666666666667,
"grad_norm": 0.24781359732151031,
"learning_rate": 8.109517149912386e-05,
"loss": 1.017502498626709,
"mean_token_accuracy": 0.7722871780395508,
"num_tokens": 13478876.0,
"step": 2900
},
{
"entropy": 0.8886970773339271,
"epoch": 0.6208,
"grad_norm": 0.2412341833114624,
"learning_rate": 8.095945429574724e-05,
"loss": 0.9119473457336426,
"mean_token_accuracy": 0.7751852914690971,
"num_tokens": 13527978.0,
"step": 2910
},
{
"entropy": 1.040999775379896,
"epoch": 0.6229333333333333,
"grad_norm": 0.2708323895931244,
"learning_rate": 8.082336615239903e-05,
"loss": 1.1017963409423828,
"mean_token_accuracy": 0.7445731669664383,
"num_tokens": 13579308.0,
"step": 2920
},
{
"entropy": 1.0086095616221429,
"epoch": 0.6250666666666667,
"grad_norm": 0.2506955564022064,
"learning_rate": 8.068690869961613e-05,
"loss": 1.1194355964660645,
"mean_token_accuracy": 0.7530581071972847,
"num_tokens": 13632480.0,
"step": 2930
},
{
"entropy": 0.9920367047190666,
"epoch": 0.6272,
"grad_norm": 0.28143101930618286,
"learning_rate": 8.055008357236027e-05,
"loss": 1.0880350112915038,
"mean_token_accuracy": 0.7523079156875611,
"num_tokens": 13683250.0,
"step": 2940
},
{
"entropy": 0.947841040790081,
"epoch": 0.6293333333333333,
"grad_norm": 0.34841635823249817,
"learning_rate": 8.04128924099985e-05,
"loss": 1.013569164276123,
"mean_token_accuracy": 0.7690569952130317,
"num_tokens": 13724761.0,
"step": 2950
},
{
"entropy": 0.8923015877604484,
"epoch": 0.6314666666666666,
"grad_norm": 0.24537858366966248,
"learning_rate": 8.027533685628348e-05,
"loss": 0.9606434822082519,
"mean_token_accuracy": 0.7777309969067574,
"num_tokens": 13771701.0,
"step": 2960
},
{
"entropy": 1.082998887449503,
"epoch": 0.6336,
"grad_norm": 0.2772109806537628,
"learning_rate": 8.013741855933386e-05,
"loss": 1.155489444732666,
"mean_token_accuracy": 0.7356668919324875,
"num_tokens": 13824969.0,
"step": 2970
},
{
"entropy": 1.0548067845404148,
"epoch": 0.6357333333333334,
"grad_norm": 0.2706131041049957,
"learning_rate": 7.999913917161446e-05,
"loss": 1.1606884002685547,
"mean_token_accuracy": 0.7461161836981773,
"num_tokens": 13879673.0,
"step": 2980
},
{
"entropy": 0.9122042678296566,
"epoch": 0.6378666666666667,
"grad_norm": 0.28579071164131165,
"learning_rate": 7.986050034991646e-05,
"loss": 1.0014433860778809,
"mean_token_accuracy": 0.7702639386057853,
"num_tokens": 13923893.0,
"step": 2990
},
{
"entropy": 0.856528140604496,
"epoch": 0.64,
"grad_norm": 0.2646186351776123,
"learning_rate": 7.972150375533767e-05,
"loss": 0.9789193153381348,
"mean_token_accuracy": 0.7824795439839363,
"num_tokens": 13967914.0,
"step": 3000
},
{
"entropy": 1.013469608873129,
"epoch": 0.6421333333333333,
"grad_norm": 0.2540909945964813,
"learning_rate": 7.958215105326252e-05,
"loss": 1.1425801277160645,
"mean_token_accuracy": 0.7503237001597881,
"num_tokens": 14016335.0,
"step": 3010
},
{
"entropy": 0.9561307951807976,
"epoch": 0.6442666666666667,
"grad_norm": 0.2495027333498001,
"learning_rate": 7.94424439133421e-05,
"loss": 1.0421770095825196,
"mean_token_accuracy": 0.7604482308030128,
"num_tokens": 14060745.0,
"step": 3020
},
{
"entropy": 0.9330584339797496,
"epoch": 0.6464,
"grad_norm": 0.26480352878570557,
"learning_rate": 7.930238400947422e-05,
"loss": 1.0622355461120605,
"mean_token_accuracy": 0.7683120101690293,
"num_tokens": 14108255.0,
"step": 3030
},
{
"entropy": 0.8226673573255538,
"epoch": 0.6485333333333333,
"grad_norm": 0.2883199453353882,
"learning_rate": 7.916197301978331e-05,
"loss": 0.8736177444458008,
"mean_token_accuracy": 0.7835568472743034,
"num_tokens": 14151595.0,
"step": 3040
},
{
"entropy": 1.0103112280368804,
"epoch": 0.6506666666666666,
"grad_norm": 0.2573588788509369,
"learning_rate": 7.902121262660036e-05,
"loss": 1.1782626152038573,
"mean_token_accuracy": 0.7547322385013103,
"num_tokens": 14198658.0,
"step": 3050
},
{
"entropy": 0.9194101721048356,
"epoch": 0.6528,
"grad_norm": 0.22869926691055298,
"learning_rate": 7.888010451644265e-05,
"loss": 0.96375732421875,
"mean_token_accuracy": 0.7731851547956466,
"num_tokens": 14243252.0,
"step": 3060
},
{
"entropy": 0.927897697687149,
"epoch": 0.6549333333333334,
"grad_norm": 0.32361456751823425,
"learning_rate": 7.873865037999373e-05,
"loss": 1.0542486190795899,
"mean_token_accuracy": 0.7636147439479828,
"num_tokens": 14290318.0,
"step": 3070
},
{
"entropy": 0.8857385322451592,
"epoch": 0.6570666666666667,
"grad_norm": 0.25951746106147766,
"learning_rate": 7.859685191208297e-05,
"loss": 0.9199460983276367,
"mean_token_accuracy": 0.7751095175743103,
"num_tokens": 14341937.0,
"step": 3080
},
{
"entropy": 0.9319920368492604,
"epoch": 0.6592,
"grad_norm": 0.22098122537136078,
"learning_rate": 7.845471081166535e-05,
"loss": 1.057561206817627,
"mean_token_accuracy": 0.763427771627903,
"num_tokens": 14388811.0,
"step": 3090
},
{
"entropy": 0.9401551052927971,
"epoch": 0.6613333333333333,
"grad_norm": 0.25181668996810913,
"learning_rate": 7.831222878180115e-05,
"loss": 1.0170879364013672,
"mean_token_accuracy": 0.7671449035406113,
"num_tokens": 14432608.0,
"step": 3100
},
{
"entropy": 0.9817736372351646,
"epoch": 0.6634666666666666,
"grad_norm": 0.25245943665504456,
"learning_rate": 7.816940752963543e-05,
"loss": 1.1231375694274903,
"mean_token_accuracy": 0.7525465905666351,
"num_tokens": 14483062.0,
"step": 3110
},
{
"entropy": 1.032941934466362,
"epoch": 0.6656,
"grad_norm": 0.255884051322937,
"learning_rate": 7.80262487663777e-05,
"loss": 1.1379814147949219,
"mean_token_accuracy": 0.7467011958360672,
"num_tokens": 14526227.0,
"step": 3120
},
{
"entropy": 0.850496319681406,
"epoch": 0.6677333333333333,
"grad_norm": 0.37918779253959656,
"learning_rate": 7.788275420728123e-05,
"loss": 0.914525032043457,
"mean_token_accuracy": 0.7852855637669564,
"num_tokens": 14566458.0,
"step": 3130
},
{
"entropy": 1.002537302672863,
"epoch": 0.6698666666666667,
"grad_norm": 0.21786057949066162,
"learning_rate": 7.773892557162274e-05,
"loss": 1.063007640838623,
"mean_token_accuracy": 0.7521986544132233,
"num_tokens": 14620140.0,
"step": 3140
},
{
"entropy": 0.8929514214396477,
"epoch": 0.672,
"grad_norm": 0.24498853087425232,
"learning_rate": 7.759476458268165e-05,
"loss": 0.9452738761901855,
"mean_token_accuracy": 0.7730352610349656,
"num_tokens": 14664132.0,
"step": 3150
},
{
"entropy": 0.9587577134370804,
"epoch": 0.6741333333333334,
"grad_norm": 0.2750677168369293,
"learning_rate": 7.74502729677194e-05,
"loss": 1.1020426750183105,
"mean_token_accuracy": 0.7670308589935303,
"num_tokens": 14711920.0,
"step": 3160
},
{
"entropy": 0.9528509542346001,
"epoch": 0.6762666666666667,
"grad_norm": 0.215078204870224,
"learning_rate": 7.730545245795891e-05,
"loss": 0.9985583305358887,
"mean_token_accuracy": 0.7616261839866638,
"num_tokens": 14759139.0,
"step": 3170
},
{
"entropy": 0.9082593686878682,
"epoch": 0.6784,
"grad_norm": 0.23811033368110657,
"learning_rate": 7.71603047885637e-05,
"loss": 1.0061087608337402,
"mean_token_accuracy": 0.7741461530327797,
"num_tokens": 14803777.0,
"step": 3180
},
{
"entropy": 0.9152800880372525,
"epoch": 0.6805333333333333,
"grad_norm": 0.25680598616600037,
"learning_rate": 7.701483169861713e-05,
"loss": 0.9781417846679688,
"mean_token_accuracy": 0.7678594440221786,
"num_tokens": 14851182.0,
"step": 3190
},
{
"entropy": 0.7769002497196198,
"epoch": 0.6826666666666666,
"grad_norm": 0.29509180784225464,
"learning_rate": 7.68690349311016e-05,
"loss": 0.8263790130615234,
"mean_token_accuracy": 0.8022488832473755,
"num_tokens": 14891820.0,
"step": 3200
},
{
"entropy": 1.0069321602582932,
"epoch": 0.6848,
"grad_norm": 0.27509671449661255,
"learning_rate": 7.672291623287766e-05,
"loss": 1.1010238647460937,
"mean_token_accuracy": 0.754035946726799,
"num_tokens": 14942310.0,
"step": 3210
},
{
"entropy": 0.9065248876810074,
"epoch": 0.6869333333333333,
"grad_norm": 0.22744986414909363,
"learning_rate": 7.657647735466302e-05,
"loss": 0.9641946792602539,
"mean_token_accuracy": 0.772594378888607,
"num_tokens": 14986110.0,
"step": 3220
},
{
"entropy": 0.95280120074749,
"epoch": 0.6890666666666667,
"grad_norm": 0.24981571733951569,
"learning_rate": 7.642972005101168e-05,
"loss": 1.054494857788086,
"mean_token_accuracy": 0.7634354814887047,
"num_tokens": 15031665.0,
"step": 3230
},
{
"entropy": 0.930540493875742,
"epoch": 0.6912,
"grad_norm": 0.2832178473472595,
"learning_rate": 7.628264608029277e-05,
"loss": 1.0797764778137207,
"mean_token_accuracy": 0.768897658586502,
"num_tokens": 15075297.0,
"step": 3240
},
{
"entropy": 0.8848231807351112,
"epoch": 0.6933333333333334,
"grad_norm": 0.3329085111618042,
"learning_rate": 7.613525720466965e-05,
"loss": 0.9568095207214355,
"mean_token_accuracy": 0.7773910194635392,
"num_tokens": 15117055.0,
"step": 3250
},
{
"entropy": 0.9285655155777931,
"epoch": 0.6954666666666667,
"grad_norm": 0.24558193981647491,
"learning_rate": 7.59875551900786e-05,
"loss": 1.037491226196289,
"mean_token_accuracy": 0.7648348346352577,
"num_tokens": 15164096.0,
"step": 3260
},
{
"entropy": 1.0723001688718796,
"epoch": 0.6976,
"grad_norm": 0.3482857644557953,
"learning_rate": 7.58395418062079e-05,
"loss": 1.147115993499756,
"mean_token_accuracy": 0.7382091999053955,
"num_tokens": 15212178.0,
"step": 3270
},
{
"entropy": 1.099733480066061,
"epoch": 0.6997333333333333,
"grad_norm": 0.26093462109565735,
"learning_rate": 7.569121882647634e-05,
"loss": 1.2072590827941894,
"mean_token_accuracy": 0.7411856979131699,
"num_tokens": 15261433.0,
"step": 3280
},
{
"entropy": 0.8516895264387131,
"epoch": 0.7018666666666666,
"grad_norm": 0.30177560448646545,
"learning_rate": 7.554258802801226e-05,
"loss": 0.9454229354858399,
"mean_token_accuracy": 0.7824687540531159,
"num_tokens": 15302428.0,
"step": 3290
},
{
"entropy": 0.9096255600452423,
"epoch": 0.704,
"grad_norm": 0.2182462513446808,
"learning_rate": 7.539365119163204e-05,
"loss": 0.9878718376159668,
"mean_token_accuracy": 0.7683326050639152,
"num_tokens": 15350117.0,
"step": 3300
},
{
"entropy": 1.0591608263552188,
"epoch": 0.7061333333333333,
"grad_norm": 0.28637412190437317,
"learning_rate": 7.524441010181889e-05,
"loss": 1.1826082229614259,
"mean_token_accuracy": 0.7397180199623108,
"num_tokens": 15404947.0,
"step": 3310
},
{
"entropy": 1.0128936409950255,
"epoch": 0.7082666666666667,
"grad_norm": 0.2553557753562927,
"learning_rate": 7.509486654670137e-05,
"loss": 1.0848438262939453,
"mean_token_accuracy": 0.752461838722229,
"num_tokens": 15454949.0,
"step": 3320
},
{
"entropy": 1.0468340575695039,
"epoch": 0.7104,
"grad_norm": 0.29637107253074646,
"learning_rate": 7.494502231803211e-05,
"loss": 1.148671531677246,
"mean_token_accuracy": 0.7463315047323704,
"num_tokens": 15507585.0,
"step": 3330
},
{
"entropy": 0.9354016333818436,
"epoch": 0.7125333333333334,
"grad_norm": 0.2948022782802582,
"learning_rate": 7.47948792111662e-05,
"loss": 1.0402913093566895,
"mean_token_accuracy": 0.7643374137580394,
"num_tokens": 15554937.0,
"step": 3340
},
{
"entropy": 0.9632456421852111,
"epoch": 0.7146666666666667,
"grad_norm": 0.21984997391700745,
"learning_rate": 7.464443902503968e-05,
"loss": 1.0455013275146485,
"mean_token_accuracy": 0.7605102032423019,
"num_tokens": 15605470.0,
"step": 3350
},
{
"entropy": 0.9598750308156013,
"epoch": 0.7168,
"grad_norm": 0.2748892903327942,
"learning_rate": 7.449370356214814e-05,
"loss": 1.0057992935180664,
"mean_token_accuracy": 0.7655998513102531,
"num_tokens": 15652544.0,
"step": 3360
},
{
"entropy": 0.8534669198095799,
"epoch": 0.7189333333333333,
"grad_norm": 0.20464476943016052,
"learning_rate": 7.434267462852496e-05,
"loss": 0.9573710441589356,
"mean_token_accuracy": 0.7839296951889991,
"num_tokens": 15691801.0,
"step": 3370
},
{
"entropy": 0.9314685501158237,
"epoch": 0.7210666666666666,
"grad_norm": 0.2979079782962799,
"learning_rate": 7.419135403371976e-05,
"loss": 1.051080322265625,
"mean_token_accuracy": 0.7620216578245163,
"num_tokens": 15736780.0,
"step": 3380
},
{
"entropy": 1.0298451118171215,
"epoch": 0.7232,
"grad_norm": 0.24461258947849274,
"learning_rate": 7.403974359077664e-05,
"loss": 1.080700397491455,
"mean_token_accuracy": 0.7475218966603279,
"num_tokens": 15789887.0,
"step": 3390
},
{
"entropy": 1.0159111820161342,
"epoch": 0.7253333333333334,
"grad_norm": 0.2403489053249359,
"learning_rate": 7.38878451162126e-05,
"loss": 1.1378083229064941,
"mean_token_accuracy": 0.7544682174921036,
"num_tokens": 15837848.0,
"step": 3400
},
{
"entropy": 0.9885091617703438,
"epoch": 0.7274666666666667,
"grad_norm": 0.31231454014778137,
"learning_rate": 7.373566042999559e-05,
"loss": 1.0966137886047362,
"mean_token_accuracy": 0.7566501721739769,
"num_tokens": 15885904.0,
"step": 3410
},
{
"entropy": 1.0563621714711189,
"epoch": 0.7296,
"grad_norm": 0.3054843246936798,
"learning_rate": 7.358319135552285e-05,
"loss": 1.189220142364502,
"mean_token_accuracy": 0.738009649515152,
"num_tokens": 15937698.0,
"step": 3420
},
{
"entropy": 0.9974006466567517,
"epoch": 0.7317333333333333,
"grad_norm": 0.23932306468486786,
"learning_rate": 7.343043971959902e-05,
"loss": 1.0580031394958496,
"mean_token_accuracy": 0.7584069922566414,
"num_tokens": 15984086.0,
"step": 3430
},
{
"entropy": 0.9371322847902774,
"epoch": 0.7338666666666667,
"grad_norm": 0.21763980388641357,
"learning_rate": 7.32774073524142e-05,
"loss": 1.0160024642944336,
"mean_token_accuracy": 0.7715103484690189,
"num_tokens": 16029965.0,
"step": 3440
},
{
"entropy": 0.9526532724499702,
"epoch": 0.736,
"grad_norm": 0.24097904562950134,
"learning_rate": 7.312409608752208e-05,
"loss": 1.0411754608154298,
"mean_token_accuracy": 0.7602859303355217,
"num_tokens": 16078997.0,
"step": 3450
},
{
"entropy": 0.9305942483246327,
"epoch": 0.7381333333333333,
"grad_norm": 0.25894254446029663,
"learning_rate": 7.2970507761818e-05,
"loss": 0.9928631782531738,
"mean_token_accuracy": 0.7672612771391869,
"num_tokens": 16128753.0,
"step": 3460
},
{
"entropy": 0.9178998105227947,
"epoch": 0.7402666666666666,
"grad_norm": 0.24847134947776794,
"learning_rate": 7.281664421551684e-05,
"loss": 1.0369275093078614,
"mean_token_accuracy": 0.7686163082718849,
"num_tokens": 16169199.0,
"step": 3470
},
{
"entropy": 1.089708861708641,
"epoch": 0.7424,
"grad_norm": 0.2787526547908783,
"learning_rate": 7.266250729213105e-05,
"loss": 1.177119255065918,
"mean_token_accuracy": 0.7344872549176216,
"num_tokens": 16218140.0,
"step": 3480
},
{
"entropy": 0.9035327732563019,
"epoch": 0.7445333333333334,
"grad_norm": 0.35477346181869507,
"learning_rate": 7.250809883844855e-05,
"loss": 1.0146682739257813,
"mean_token_accuracy": 0.7716259181499481,
"num_tokens": 16261629.0,
"step": 3490
},
{
"entropy": 0.9472721114754676,
"epoch": 0.7466666666666667,
"grad_norm": 0.2600723206996918,
"learning_rate": 7.235342070451059e-05,
"loss": 1.0361743927001954,
"mean_token_accuracy": 0.761479677259922,
"num_tokens": 16308149.0,
"step": 3500
},
{
"entropy": 1.0926350936293603,
"epoch": 0.7488,
"grad_norm": 0.25519415736198425,
"learning_rate": 7.219847474358959e-05,
"loss": 1.1195724487304688,
"mean_token_accuracy": 0.7408729113638401,
"num_tokens": 16355489.0,
"step": 3510
},
{
"entropy": 0.9718878209590912,
"epoch": 0.7509333333333333,
"grad_norm": 0.3055277168750763,
"learning_rate": 7.20432628121669e-05,
"loss": 1.1054911613464355,
"mean_token_accuracy": 0.7583063259720803,
"num_tokens": 16400826.0,
"step": 3520
},
{
"entropy": 0.8583284638822078,
"epoch": 0.7530666666666667,
"grad_norm": 0.2846459448337555,
"learning_rate": 7.188778676991064e-05,
"loss": 0.914365577697754,
"mean_token_accuracy": 0.7785162061452866,
"num_tokens": 16445628.0,
"step": 3530
},
{
"entropy": 1.0267987482249736,
"epoch": 0.7552,
"grad_norm": 0.26450878381729126,
"learning_rate": 7.173204847965333e-05,
"loss": 1.1284149169921875,
"mean_token_accuracy": 0.7466149963438511,
"num_tokens": 16498069.0,
"step": 3540
},
{
"entropy": 0.932567299157381,
"epoch": 0.7573333333333333,
"grad_norm": 0.2745480239391327,
"learning_rate": 7.157604980736962e-05,
"loss": 1.02783260345459,
"mean_token_accuracy": 0.7691405609250068,
"num_tokens": 16546746.0,
"step": 3550
},
{
"entropy": 0.9001861125230789,
"epoch": 0.7594666666666666,
"grad_norm": 0.29825839400291443,
"learning_rate": 7.141979262215396e-05,
"loss": 1.0350588798522948,
"mean_token_accuracy": 0.7682245895266533,
"num_tokens": 16586157.0,
"step": 3560
},
{
"entropy": 0.8198081150650978,
"epoch": 0.7616,
"grad_norm": 0.3035356104373932,
"learning_rate": 7.126327879619807e-05,
"loss": 0.8880753517150879,
"mean_token_accuracy": 0.7954168729484081,
"num_tokens": 16623992.0,
"step": 3570
},
{
"entropy": 0.8905762024223804,
"epoch": 0.7637333333333334,
"grad_norm": 0.24514277279376984,
"learning_rate": 7.110651020476873e-05,
"loss": 0.9519443511962891,
"mean_token_accuracy": 0.7789977833628654,
"num_tokens": 16666011.0,
"step": 3580
},
{
"entropy": 0.8767322935163975,
"epoch": 0.7658666666666667,
"grad_norm": 0.30208712816238403,
"learning_rate": 7.094948872618507e-05,
"loss": 1.0047502517700195,
"mean_token_accuracy": 0.7747991606593132,
"num_tokens": 16709398.0,
"step": 3590
},
{
"entropy": 0.9047524333000183,
"epoch": 0.768,
"grad_norm": 0.23355495929718018,
"learning_rate": 7.079221624179623e-05,
"loss": 0.9877220153808594,
"mean_token_accuracy": 0.7743734329938888,
"num_tokens": 16759830.0,
"step": 3600
},
{
"entropy": 0.8517782382667065,
"epoch": 0.7701333333333333,
"grad_norm": 0.2146274745464325,
"learning_rate": 7.063469463595884e-05,
"loss": 0.9309274673461914,
"mean_token_accuracy": 0.7834656447172165,
"num_tokens": 16802813.0,
"step": 3610
},
{
"entropy": 1.0183790929615497,
"epoch": 0.7722666666666667,
"grad_norm": 0.24549080431461334,
"learning_rate": 7.047692579601424e-05,
"loss": 1.1990603446960448,
"mean_token_accuracy": 0.7547581911087036,
"num_tokens": 16850703.0,
"step": 3620
},
{
"entropy": 0.8650934003293514,
"epoch": 0.7744,
"grad_norm": 0.23581688106060028,
"learning_rate": 7.031891161226608e-05,
"loss": 0.9123600959777832,
"mean_token_accuracy": 0.7830170378088951,
"num_tokens": 16894959.0,
"step": 3630
},
{
"entropy": 0.951847655326128,
"epoch": 0.7765333333333333,
"grad_norm": 0.3015543818473816,
"learning_rate": 7.016065397795758e-05,
"loss": 1.062761116027832,
"mean_token_accuracy": 0.7644057601690293,
"num_tokens": 16939640.0,
"step": 3640
},
{
"entropy": 1.0231108613312245,
"epoch": 0.7786666666666666,
"grad_norm": 0.23233352601528168,
"learning_rate": 7.000215478924887e-05,
"loss": 1.1309197425842286,
"mean_token_accuracy": 0.744163216650486,
"num_tokens": 16999652.0,
"step": 3650
},
{
"entropy": 0.8934633955359459,
"epoch": 0.7808,
"grad_norm": 0.25434958934783936,
"learning_rate": 6.984341594519421e-05,
"loss": 1.0075945854187012,
"mean_token_accuracy": 0.7736709147691727,
"num_tokens": 17046141.0,
"step": 3660
},
{
"entropy": 1.0374766498804093,
"epoch": 0.7829333333333334,
"grad_norm": 0.2597425878047943,
"learning_rate": 6.968443934771933e-05,
"loss": 1.1429466247558593,
"mean_token_accuracy": 0.7515291333198547,
"num_tokens": 17096136.0,
"step": 3670
},
{
"entropy": 0.974248643219471,
"epoch": 0.7850666666666667,
"grad_norm": 0.22508691251277924,
"learning_rate": 6.952522690159861e-05,
"loss": 1.0584315299987792,
"mean_token_accuracy": 0.7587296038866043,
"num_tokens": 17144177.0,
"step": 3680
},
{
"entropy": 0.9973213374614716,
"epoch": 0.7872,
"grad_norm": 0.2418157011270523,
"learning_rate": 6.936578051443219e-05,
"loss": 1.1423637390136718,
"mean_token_accuracy": 0.7530387908220291,
"num_tokens": 17191263.0,
"step": 3690
},
{
"entropy": 0.8815206326544285,
"epoch": 0.7893333333333333,
"grad_norm": 0.28048619627952576,
"learning_rate": 6.92061020966232e-05,
"loss": 0.9586901664733887,
"mean_token_accuracy": 0.7787635132670403,
"num_tokens": 17235554.0,
"step": 3700
},
{
"entropy": 0.8578987941145897,
"epoch": 0.7914666666666667,
"grad_norm": 0.2751041352748871,
"learning_rate": 6.904619356135484e-05,
"loss": 0.9659609794616699,
"mean_token_accuracy": 0.7794149458408356,
"num_tokens": 17280617.0,
"step": 3710
},
{
"entropy": 0.8956944331526756,
"epoch": 0.7936,
"grad_norm": 0.23321297764778137,
"learning_rate": 6.888605682456746e-05,
"loss": 1.0033020973205566,
"mean_token_accuracy": 0.7758402660489082,
"num_tokens": 17326396.0,
"step": 3720
},
{
"entropy": 0.8936455383896827,
"epoch": 0.7957333333333333,
"grad_norm": 0.2756887674331665,
"learning_rate": 6.87256938049356e-05,
"loss": 0.9862062454223632,
"mean_token_accuracy": 0.7779143631458283,
"num_tokens": 17370948.0,
"step": 3730
},
{
"entropy": 0.9536221623420715,
"epoch": 0.7978666666666666,
"grad_norm": 0.2456534057855606,
"learning_rate": 6.856510642384499e-05,
"loss": 1.0342220306396483,
"mean_token_accuracy": 0.7630825422704219,
"num_tokens": 17421511.0,
"step": 3740
},
{
"entropy": 1.0029884599149228,
"epoch": 0.8,
"grad_norm": 0.23356133699417114,
"learning_rate": 6.840429660536953e-05,
"loss": 1.0578575134277344,
"mean_token_accuracy": 0.7524963855743408,
"num_tokens": 17474234.0,
"step": 3750
},
{
"entropy": 0.9401593998074531,
"epoch": 0.8021333333333334,
"grad_norm": 0.2025330811738968,
"learning_rate": 6.82432662762483e-05,
"loss": 1.0459843635559083,
"mean_token_accuracy": 0.7684256717562675,
"num_tokens": 17519106.0,
"step": 3760
},
{
"entropy": 0.9076099701225757,
"epoch": 0.8042666666666667,
"grad_norm": 0.28303810954093933,
"learning_rate": 6.80820173658624e-05,
"loss": 1.0061184883117675,
"mean_token_accuracy": 0.774385878443718,
"num_tokens": 17563524.0,
"step": 3770
},
{
"entropy": 0.9020247898995877,
"epoch": 0.8064,
"grad_norm": 0.25860196352005005,
"learning_rate": 6.79205518062118e-05,
"loss": 0.9976821899414062,
"mean_token_accuracy": 0.774232342839241,
"num_tokens": 17611723.0,
"step": 3780
},
{
"entropy": 0.9692328073084354,
"epoch": 0.8085333333333333,
"grad_norm": 0.24602019786834717,
"learning_rate": 6.775887153189233e-05,
"loss": 1.06738224029541,
"mean_token_accuracy": 0.7612074792385102,
"num_tokens": 17657838.0,
"step": 3790
},
{
"entropy": 0.9567753560841084,
"epoch": 0.8106666666666666,
"grad_norm": 0.2228858321905136,
"learning_rate": 6.759697848007238e-05,
"loss": 1.0671761512756348,
"mean_token_accuracy": 0.7626087903976441,
"num_tokens": 17705375.0,
"step": 3800
},
{
"entropy": 1.0885871052742004,
"epoch": 0.8128,
"grad_norm": 0.24882915616035461,
"learning_rate": 6.743487459046971e-05,
"loss": 1.1456743240356446,
"mean_token_accuracy": 0.7413103066384792,
"num_tokens": 17751890.0,
"step": 3810
},
{
"entropy": 0.9753526367247105,
"epoch": 0.8149333333333333,
"grad_norm": 0.32060110569000244,
"learning_rate": 6.72725618053283e-05,
"loss": 1.051304244995117,
"mean_token_accuracy": 0.7579805940389633,
"num_tokens": 17796312.0,
"step": 3820
},
{
"entropy": 1.0881080597639083,
"epoch": 0.8170666666666667,
"grad_norm": 0.2615782618522644,
"learning_rate": 6.711004206939491e-05,
"loss": 1.20444917678833,
"mean_token_accuracy": 0.7393553704023361,
"num_tokens": 17849058.0,
"step": 3830
},
{
"entropy": 0.9971455112099648,
"epoch": 0.8192,
"grad_norm": 0.28769299387931824,
"learning_rate": 6.694731732989593e-05,
"loss": 1.186737632751465,
"mean_token_accuracy": 0.7578480765223503,
"num_tokens": 17897760.0,
"step": 3840
},
{
"entropy": 0.8759050570428372,
"epoch": 0.8213333333333334,
"grad_norm": 0.27374961972236633,
"learning_rate": 6.678438953651401e-05,
"loss": 0.9380218505859375,
"mean_token_accuracy": 0.7734792202711105,
"num_tokens": 17939963.0,
"step": 3850
},
{
"entropy": 0.985682574659586,
"epoch": 0.8234666666666667,
"grad_norm": 0.26758262515068054,
"learning_rate": 6.662126064136466e-05,
"loss": 1.073539638519287,
"mean_token_accuracy": 0.7554293870925903,
"num_tokens": 17990860.0,
"step": 3860
},
{
"entropy": 0.9883775785565376,
"epoch": 0.8256,
"grad_norm": 0.273813396692276,
"learning_rate": 6.645793259897288e-05,
"loss": 1.1143600463867187,
"mean_token_accuracy": 0.7514252230525017,
"num_tokens": 18044403.0,
"step": 3870
},
{
"entropy": 0.9075474604964257,
"epoch": 0.8277333333333333,
"grad_norm": 0.28175604343414307,
"learning_rate": 6.629440736624977e-05,
"loss": 0.9921407699584961,
"mean_token_accuracy": 0.7708318918943405,
"num_tokens": 18090208.0,
"step": 3880
},
{
"entropy": 0.8513198003172875,
"epoch": 0.8298666666666666,
"grad_norm": 0.2545130252838135,
"learning_rate": 6.613068690246905e-05,
"loss": 0.9449549674987793,
"mean_token_accuracy": 0.7845224231481552,
"num_tokens": 18135064.0,
"step": 3890
},
{
"entropy": 1.0117795512080192,
"epoch": 0.832,
"grad_norm": 0.26254504919052124,
"learning_rate": 6.596677316924355e-05,
"loss": 1.1285503387451172,
"mean_token_accuracy": 0.7520590081810952,
"num_tokens": 18184374.0,
"step": 3900
},
{
"entropy": 1.0141760632395744,
"epoch": 0.8341333333333333,
"grad_norm": 0.29730224609375,
"learning_rate": 6.580266813050187e-05,
"loss": 1.107116985321045,
"mean_token_accuracy": 0.7563492476940155,
"num_tokens": 18226039.0,
"step": 3910
},
{
"entropy": 0.9843406617641449,
"epoch": 0.8362666666666667,
"grad_norm": 0.26783686876296997,
"learning_rate": 6.563837375246463e-05,
"loss": 1.0850018501281737,
"mean_token_accuracy": 0.7563267104327679,
"num_tokens": 18270937.0,
"step": 3920
},
{
"entropy": 0.9336299523711205,
"epoch": 0.8384,
"grad_norm": 0.25351837277412415,
"learning_rate": 6.547389200362103e-05,
"loss": 1.0218440055847169,
"mean_token_accuracy": 0.767199169844389,
"num_tokens": 18314733.0,
"step": 3930
},
{
"entropy": 0.9812062717974186,
"epoch": 0.8405333333333334,
"grad_norm": 0.2531512677669525,
"learning_rate": 6.530922485470531e-05,
"loss": 1.0596059799194335,
"mean_token_accuracy": 0.764385013282299,
"num_tokens": 18367778.0,
"step": 3940
},
{
"entropy": 0.9153544351458549,
"epoch": 0.8426666666666667,
"grad_norm": 0.40482011437416077,
"learning_rate": 6.5144374278673e-05,
"loss": 0.990359878540039,
"mean_token_accuracy": 0.7689836576581002,
"num_tokens": 18413756.0,
"step": 3950
},
{
"entropy": 0.965121752768755,
"epoch": 0.8448,
"grad_norm": 0.23541757464408875,
"learning_rate": 6.497934225067736e-05,
"loss": 1.0639681816101074,
"mean_token_accuracy": 0.7629014477133751,
"num_tokens": 18459252.0,
"step": 3960
},
{
"entropy": 0.9449482955038547,
"epoch": 0.8469333333333333,
"grad_norm": 0.38528165221214294,
"learning_rate": 6.481413074804579e-05,
"loss": 1.0649182319641113,
"mean_token_accuracy": 0.7604210332036019,
"num_tokens": 18508661.0,
"step": 3970
},
{
"entropy": 1.056539323925972,
"epoch": 0.8490666666666666,
"grad_norm": 0.24575304985046387,
"learning_rate": 6.464874175025602e-05,
"loss": 1.1153278350830078,
"mean_token_accuracy": 0.7435829386115074,
"num_tokens": 18560510.0,
"step": 3980
},
{
"entropy": 0.9280949518084526,
"epoch": 0.8512,
"grad_norm": 0.2419559508562088,
"learning_rate": 6.448317723891237e-05,
"loss": 1.0382183074951172,
"mean_token_accuracy": 0.7698814913630485,
"num_tokens": 18608764.0,
"step": 3990
},
{
"entropy": 0.9310038618743419,
"epoch": 0.8533333333333334,
"grad_norm": 0.3000566065311432,
"learning_rate": 6.431743919772218e-05,
"loss": 1.0243574142456056,
"mean_token_accuracy": 0.7703951939940452,
"num_tokens": 18653703.0,
"step": 4000
},
{
"entropy": 0.9708857566118241,
"epoch": 0.8554666666666667,
"grad_norm": 0.2541143000125885,
"learning_rate": 6.415152961247186e-05,
"loss": 1.07316312789917,
"mean_token_accuracy": 0.752861674129963,
"num_tokens": 18701666.0,
"step": 4010
},
{
"entropy": 0.9462185628712177,
"epoch": 0.8576,
"grad_norm": 0.23720327019691467,
"learning_rate": 6.398545047100321e-05,
"loss": 1.023193359375,
"mean_token_accuracy": 0.7677365422248841,
"num_tokens": 18749563.0,
"step": 4020
},
{
"entropy": 0.9408772967755794,
"epoch": 0.8597333333333333,
"grad_norm": 0.215755894780159,
"learning_rate": 6.381920376318951e-05,
"loss": 1.0272337913513183,
"mean_token_accuracy": 0.7659091472625732,
"num_tokens": 18793365.0,
"step": 4030
},
{
"entropy": 0.9391368016600609,
"epoch": 0.8618666666666667,
"grad_norm": 0.22395288944244385,
"learning_rate": 6.365279148091182e-05,
"loss": 1.0316532135009766,
"mean_token_accuracy": 0.7643599301576615,
"num_tokens": 18843087.0,
"step": 4040
},
{
"entropy": 0.9580571033060551,
"epoch": 0.864,
"grad_norm": 0.2631727159023285,
"learning_rate": 6.348621561803495e-05,
"loss": 1.0001374244689942,
"mean_token_accuracy": 0.7608293548226357,
"num_tokens": 18891900.0,
"step": 4050
},
{
"entropy": 0.9185742639005184,
"epoch": 0.8661333333333333,
"grad_norm": 0.2510085701942444,
"learning_rate": 6.331947817038367e-05,
"loss": 0.9962324142456055,
"mean_token_accuracy": 0.7723157353699207,
"num_tokens": 18938986.0,
"step": 4060
},
{
"entropy": 0.9397155575454235,
"epoch": 0.8682666666666666,
"grad_norm": 0.2758144736289978,
"learning_rate": 6.315258113571876e-05,
"loss": 1.0858741760253907,
"mean_token_accuracy": 0.7691247522830963,
"num_tokens": 18984144.0,
"step": 4070
},
{
"entropy": 0.9895228892564774,
"epoch": 0.8704,
"grad_norm": 0.2775322198867798,
"learning_rate": 6.298552651371316e-05,
"loss": 1.10516300201416,
"mean_token_accuracy": 0.7543898217380047,
"num_tokens": 19027278.0,
"step": 4080
},
{
"entropy": 0.889600582420826,
"epoch": 0.8725333333333334,
"grad_norm": 0.2604535222053528,
"learning_rate": 6.281831630592783e-05,
"loss": 1.0391739845275878,
"mean_token_accuracy": 0.7718042567372322,
"num_tokens": 19073552.0,
"step": 4090
},
{
"entropy": 0.8700525127351284,
"epoch": 0.8746666666666667,
"grad_norm": 0.26574060320854187,
"learning_rate": 6.265095251578796e-05,
"loss": 0.9732645988464356,
"mean_token_accuracy": 0.7781016409397126,
"num_tokens": 19112840.0,
"step": 4100
},
{
"entropy": 0.8111571930348873,
"epoch": 0.8768,
"grad_norm": 0.24323873221874237,
"learning_rate": 6.248343714855884e-05,
"loss": 0.8503658294677734,
"mean_token_accuracy": 0.7953767567873001,
"num_tokens": 19155980.0,
"step": 4110
},
{
"entropy": 1.0650635436177254,
"epoch": 0.8789333333333333,
"grad_norm": 0.22809672355651855,
"learning_rate": 6.23157722113219e-05,
"loss": 1.208934211730957,
"mean_token_accuracy": 0.7424666874110699,
"num_tokens": 19206121.0,
"step": 4120
},
{
"entropy": 0.9243351340293884,
"epoch": 0.8810666666666667,
"grad_norm": 0.29001903533935547,
"learning_rate": 6.214795971295063e-05,
"loss": 0.9857352256774903,
"mean_token_accuracy": 0.771124804764986,
"num_tokens": 19252396.0,
"step": 4130
},
{
"entropy": 0.7921540692448616,
"epoch": 0.8832,
"grad_norm": 0.3147003650665283,
"learning_rate": 6.198000166408651e-05,
"loss": 0.8609647750854492,
"mean_token_accuracy": 0.7940000563859939,
"num_tokens": 19293212.0,
"step": 4140
},
{
"entropy": 0.8676056079566479,
"epoch": 0.8853333333333333,
"grad_norm": 0.307106614112854,
"learning_rate": 6.181190007711497e-05,
"loss": 0.9692766189575195,
"mean_token_accuracy": 0.7774873107671738,
"num_tokens": 19339257.0,
"step": 4150
},
{
"entropy": 1.0086314499378204,
"epoch": 0.8874666666666666,
"grad_norm": 0.24890249967575073,
"learning_rate": 6.16436569661412e-05,
"loss": 1.1359784126281738,
"mean_token_accuracy": 0.7473259434103966,
"num_tokens": 19389016.0,
"step": 4160
},
{
"entropy": 0.913795480877161,
"epoch": 0.8896,
"grad_norm": 0.27178287506103516,
"learning_rate": 6.147527434696606e-05,
"loss": 0.976069450378418,
"mean_token_accuracy": 0.7722298249602317,
"num_tokens": 19431392.0,
"step": 4170
},
{
"entropy": 0.874677724391222,
"epoch": 0.8917333333333334,
"grad_norm": 0.3088259994983673,
"learning_rate": 6.130675423706191e-05,
"loss": 0.9541938781738282,
"mean_token_accuracy": 0.780048543214798,
"num_tokens": 19476928.0,
"step": 4180
},
{
"entropy": 0.9338957525789737,
"epoch": 0.8938666666666667,
"grad_norm": 0.3184056282043457,
"learning_rate": 6.113809865554853e-05,
"loss": 1.0237668991088866,
"mean_token_accuracy": 0.7652892589569091,
"num_tokens": 19522534.0,
"step": 4190
},
{
"entropy": 0.8182342648506165,
"epoch": 0.896,
"grad_norm": 0.25737565755844116,
"learning_rate": 6.0969309623168736e-05,
"loss": 0.9096416473388672,
"mean_token_accuracy": 0.7868136927485466,
"num_tokens": 19564324.0,
"step": 4200
},
{
"entropy": 0.9676721416413784,
"epoch": 0.8981333333333333,
"grad_norm": 0.3115193843841553,
"learning_rate": 6.080038916226436e-05,
"loss": 1.0527458190917969,
"mean_token_accuracy": 0.7573085993528366,
"num_tokens": 19606557.0,
"step": 4210
},
{
"entropy": 0.9689052537083626,
"epoch": 0.9002666666666667,
"grad_norm": 0.24019014835357666,
"learning_rate": 6.063133929675193e-05,
"loss": 1.0610797882080079,
"mean_token_accuracy": 0.7604381129145622,
"num_tokens": 19648234.0,
"step": 4220
},
{
"entropy": 0.9963471919298172,
"epoch": 0.9024,
"grad_norm": 0.22769565880298615,
"learning_rate": 6.046216205209842e-05,
"loss": 1.1422395706176758,
"mean_token_accuracy": 0.7552115090191365,
"num_tokens": 19697922.0,
"step": 4230
},
{
"entropy": 0.8659336932003499,
"epoch": 0.9045333333333333,
"grad_norm": 0.33948129415512085,
"learning_rate": 6.029285945529699e-05,
"loss": 0.9698437690734864,
"mean_token_accuracy": 0.7807855561375618,
"num_tokens": 19742041.0,
"step": 4240
},
{
"entropy": 0.9663250721991062,
"epoch": 0.9066666666666666,
"grad_norm": 0.2511708438396454,
"learning_rate": 6.012343353484271e-05,
"loss": 1.0937541007995606,
"mean_token_accuracy": 0.7600424617528916,
"num_tokens": 19789800.0,
"step": 4250
},
{
"entropy": 1.019892977923155,
"epoch": 0.9088,
"grad_norm": 0.23841184377670288,
"learning_rate": 5.995388632070827e-05,
"loss": 1.0938913345336914,
"mean_token_accuracy": 0.7473356157541275,
"num_tokens": 19839462.0,
"step": 4260
},
{
"entropy": 1.0092505671083927,
"epoch": 0.9109333333333334,
"grad_norm": 0.23689568042755127,
"learning_rate": 5.978421984431959e-05,
"loss": 1.110377597808838,
"mean_token_accuracy": 0.7536325603723526,
"num_tokens": 19888786.0,
"step": 4270
},
{
"entropy": 0.8563631072640419,
"epoch": 0.9130666666666667,
"grad_norm": 0.2725948393344879,
"learning_rate": 5.961443613853157e-05,
"loss": 0.9641815185546875,
"mean_token_accuracy": 0.7762529909610748,
"num_tokens": 19931094.0,
"step": 4280
},
{
"entropy": 0.9774221003055572,
"epoch": 0.9152,
"grad_norm": 0.23440679907798767,
"learning_rate": 5.944453723760367e-05,
"loss": 1.0834471702575683,
"mean_token_accuracy": 0.7571895673871041,
"num_tokens": 19976730.0,
"step": 4290
},
{
"entropy": 0.9086124181747437,
"epoch": 0.9173333333333333,
"grad_norm": 0.331281453371048,
"learning_rate": 5.927452517717558e-05,
"loss": 1.0120928764343262,
"mean_token_accuracy": 0.7699793577194214,
"num_tokens": 20021630.0,
"step": 4300
},
{
"entropy": 0.930675259232521,
"epoch": 0.9194666666666667,
"grad_norm": 0.23214256763458252,
"learning_rate": 5.9104401994242786e-05,
"loss": 1.0291691780090333,
"mean_token_accuracy": 0.7654816180467605,
"num_tokens": 20070846.0,
"step": 4310
},
{
"entropy": 0.9637592114508152,
"epoch": 0.9216,
"grad_norm": 0.23460128903388977,
"learning_rate": 5.893416972713217e-05,
"loss": 1.0039209365844726,
"mean_token_accuracy": 0.763984477519989,
"num_tokens": 20116649.0,
"step": 4320
},
{
"entropy": 0.95065303966403,
"epoch": 0.9237333333333333,
"grad_norm": 0.26842495799064636,
"learning_rate": 5.8763830415477674e-05,
"loss": 1.0108171463012696,
"mean_token_accuracy": 0.7639551430940628,
"num_tokens": 20161155.0,
"step": 4330
},
{
"entropy": 0.8422643013298512,
"epoch": 0.9258666666666666,
"grad_norm": 0.29824700951576233,
"learning_rate": 5.85933861001957e-05,
"loss": 0.9323680877685547,
"mean_token_accuracy": 0.7818280473351479,
"num_tokens": 20202829.0,
"step": 4340
},
{
"entropy": 0.8829731650650501,
"epoch": 0.928,
"grad_norm": 0.31879284977912903,
"learning_rate": 5.842283882346082e-05,
"loss": 0.9750779151916504,
"mean_token_accuracy": 0.7774942420423031,
"num_tokens": 20248134.0,
"step": 4350
},
{
"entropy": 0.8857482261955738,
"epoch": 0.9301333333333334,
"grad_norm": 0.2634090483188629,
"learning_rate": 5.825219062868118e-05,
"loss": 0.9745967864990235,
"mean_token_accuracy": 0.7782815754413605,
"num_tokens": 20288504.0,
"step": 4360
},
{
"entropy": 0.9351005107164383,
"epoch": 0.9322666666666667,
"grad_norm": 0.2535351812839508,
"learning_rate": 5.808144356047414e-05,
"loss": 1.0453302383422851,
"mean_token_accuracy": 0.7632610902190209,
"num_tokens": 20335084.0,
"step": 4370
},
{
"entropy": 0.9218877613544464,
"epoch": 0.9344,
"grad_norm": 0.23900526762008667,
"learning_rate": 5.791059966464164e-05,
"loss": 0.9873531341552735,
"mean_token_accuracy": 0.7675649732351303,
"num_tokens": 20384687.0,
"step": 4380
},
{
"entropy": 0.9403703153133393,
"epoch": 0.9365333333333333,
"grad_norm": 0.23416976630687714,
"learning_rate": 5.773966098814579e-05,
"loss": 1.0653534889221192,
"mean_token_accuracy": 0.764773941040039,
"num_tokens": 20432372.0,
"step": 4390
},
{
"entropy": 0.9848338901996613,
"epoch": 0.9386666666666666,
"grad_norm": 0.2765245735645294,
"learning_rate": 5.756862957908433e-05,
"loss": 1.1192432403564454,
"mean_token_accuracy": 0.7547446370124817,
"num_tokens": 20481366.0,
"step": 4400
},
{
"entropy": 0.9790939651429653,
"epoch": 0.9408,
"grad_norm": 0.23915551602840424,
"learning_rate": 5.739750748666606e-05,
"loss": 1.036961555480957,
"mean_token_accuracy": 0.7573970347642899,
"num_tokens": 20526985.0,
"step": 4410
},
{
"entropy": 0.9054527454078197,
"epoch": 0.9429333333333333,
"grad_norm": 0.24054944515228271,
"learning_rate": 5.7226296761186274e-05,
"loss": 0.9758554458618164,
"mean_token_accuracy": 0.7724366948008538,
"num_tokens": 20571815.0,
"step": 4420
},
{
"entropy": 0.9364707127213479,
"epoch": 0.9450666666666667,
"grad_norm": 0.28272607922554016,
"learning_rate": 5.705499945400223e-05,
"loss": 1.0225676536560058,
"mean_token_accuracy": 0.7622330486774445,
"num_tokens": 20615072.0,
"step": 4430
},
{
"entropy": 1.0657535366714002,
"epoch": 0.9472,
"grad_norm": 0.23734600841999054,
"learning_rate": 5.688361761750861e-05,
"loss": 1.1335111618041993,
"mean_token_accuracy": 0.7402229458093643,
"num_tokens": 20666534.0,
"step": 4440
},
{
"entropy": 0.9826746597886086,
"epoch": 0.9493333333333334,
"grad_norm": 0.28600969910621643,
"learning_rate": 5.671215330511283e-05,
"loss": 1.066628646850586,
"mean_token_accuracy": 0.7560828119516373,
"num_tokens": 20715376.0,
"step": 4450
},
{
"entropy": 0.9109843887388707,
"epoch": 0.9514666666666667,
"grad_norm": 0.2514685392379761,
"learning_rate": 5.65406085712105e-05,
"loss": 1.0114540100097655,
"mean_token_accuracy": 0.7724284827709198,
"num_tokens": 20758838.0,
"step": 4460
},
{
"entropy": 0.8498819716274738,
"epoch": 0.9536,
"grad_norm": 0.28889158368110657,
"learning_rate": 5.6368985471160804e-05,
"loss": 0.9062424659729004,
"mean_token_accuracy": 0.785689315199852,
"num_tokens": 20799444.0,
"step": 4470
},
{
"entropy": 0.8840778715908527,
"epoch": 0.9557333333333333,
"grad_norm": 0.2577449083328247,
"learning_rate": 5.6197286061261875e-05,
"loss": 0.9439300537109375,
"mean_token_accuracy": 0.7696003526449203,
"num_tokens": 20843766.0,
"step": 4480
},
{
"entropy": 0.8888865426182747,
"epoch": 0.9578666666666666,
"grad_norm": 0.27302756905555725,
"learning_rate": 5.602551239872616e-05,
"loss": 0.9372305870056152,
"mean_token_accuracy": 0.7730641543865204,
"num_tokens": 20888764.0,
"step": 4490
},
{
"entropy": 0.9558203481137753,
"epoch": 0.96,
"grad_norm": 0.3576233386993408,
"learning_rate": 5.58536665416557e-05,
"loss": 1.0556070327758789,
"mean_token_accuracy": 0.762606156617403,
"num_tokens": 20936028.0,
"step": 4500
},
{
"entropy": 0.9054192140698433,
"epoch": 0.9621333333333333,
"grad_norm": 0.2521965205669403,
"learning_rate": 5.568175054901763e-05,
"loss": 0.9705222129821778,
"mean_token_accuracy": 0.7672724887728691,
"num_tokens": 20985057.0,
"step": 4510
},
{
"entropy": 0.9011006608605385,
"epoch": 0.9642666666666667,
"grad_norm": 0.27024832367897034,
"learning_rate": 5.550976648061934e-05,
"loss": 0.9830186843872071,
"mean_token_accuracy": 0.7754541039466858,
"num_tokens": 21028567.0,
"step": 4520
},
{
"entropy": 0.9991332605481148,
"epoch": 0.9664,
"grad_norm": 0.2703147828578949,
"learning_rate": 5.533771639708388e-05,
"loss": 1.1589097023010253,
"mean_token_accuracy": 0.7532796613872051,
"num_tokens": 21072699.0,
"step": 4530
},
{
"entropy": 0.9183724671602249,
"epoch": 0.9685333333333334,
"grad_norm": 0.2243046760559082,
"learning_rate": 5.516560235982527e-05,
"loss": 0.9856460571289063,
"mean_token_accuracy": 0.771567003428936,
"num_tokens": 21121413.0,
"step": 4540
},
{
"entropy": 0.8655671834945678,
"epoch": 0.9706666666666667,
"grad_norm": 0.3306775987148285,
"learning_rate": 5.499342643102381e-05,
"loss": 0.9172829627990723,
"mean_token_accuracy": 0.777653044462204,
"num_tokens": 21162927.0,
"step": 4550
},
{
"entropy": 0.9436637915670871,
"epoch": 0.9728,
"grad_norm": 0.2542389929294586,
"learning_rate": 5.482119067360132e-05,
"loss": 1.0658721923828125,
"mean_token_accuracy": 0.767835621535778,
"num_tokens": 21206936.0,
"step": 4560
},
{
"entropy": 0.7974261797964572,
"epoch": 0.9749333333333333,
"grad_norm": 0.24307052791118622,
"learning_rate": 5.4648897151196455e-05,
"loss": 0.8578211784362793,
"mean_token_accuracy": 0.7923481151461601,
"num_tokens": 21252732.0,
"step": 4570
},
{
"entropy": 0.9691430673003196,
"epoch": 0.9770666666666666,
"grad_norm": 0.2720329165458679,
"learning_rate": 5.447654792814e-05,
"loss": 1.0459741592407226,
"mean_token_accuracy": 0.7617560073733329,
"num_tokens": 21298972.0,
"step": 4580
},
{
"entropy": 0.9178217075765133,
"epoch": 0.9792,
"grad_norm": 0.2640475630760193,
"learning_rate": 5.4304145069430115e-05,
"loss": 1.0324625015258788,
"mean_token_accuracy": 0.7745086327195168,
"num_tokens": 21348870.0,
"step": 4590
},
{
"entropy": 0.8973256818950176,
"epoch": 0.9813333333333333,
"grad_norm": 0.2828875184059143,
"learning_rate": 5.4131690640707574e-05,
"loss": 0.9894962310791016,
"mean_token_accuracy": 0.7752941563725472,
"num_tokens": 21390716.0,
"step": 4600
},
{
"entropy": 0.9490196861326694,
"epoch": 0.9834666666666667,
"grad_norm": 0.27414020895957947,
"learning_rate": 5.3959186708231046e-05,
"loss": 1.0264591217041015,
"mean_token_accuracy": 0.7639399319887161,
"num_tokens": 21440700.0,
"step": 4610
},
{
"entropy": 0.9219519071280956,
"epoch": 0.9856,
"grad_norm": 0.2545549273490906,
"learning_rate": 5.3786635338852346e-05,
"loss": 1.0511361122131349,
"mean_token_accuracy": 0.7739394150674344,
"num_tokens": 21483867.0,
"step": 4620
},
{
"entropy": 0.99324054941535,
"epoch": 0.9877333333333334,
"grad_norm": 0.272182434797287,
"learning_rate": 5.361403859999161e-05,
"loss": 1.116584587097168,
"mean_token_accuracy": 0.7553175091743469,
"num_tokens": 21535354.0,
"step": 4630
},
{
"entropy": 0.8828953221440315,
"epoch": 0.9898666666666667,
"grad_norm": 0.29537713527679443,
"learning_rate": 5.344139855961262e-05,
"loss": 0.9682372093200684,
"mean_token_accuracy": 0.7781552016735077,
"num_tokens": 21578265.0,
"step": 4640
},
{
"entropy": 0.9005228154361248,
"epoch": 0.992,
"grad_norm": 0.3032234013080597,
"learning_rate": 5.3268717286197945e-05,
"loss": 0.9423254013061524,
"mean_token_accuracy": 0.7735077708959579,
"num_tokens": 21618545.0,
"step": 4650
},
{
"entropy": 0.8464630447328091,
"epoch": 0.9941333333333333,
"grad_norm": 0.32000964879989624,
"learning_rate": 5.3095996848724184e-05,
"loss": 0.9030919075012207,
"mean_token_accuracy": 0.7863337904214859,
"num_tokens": 21657735.0,
"step": 4660
},
{
"entropy": 0.8923816077411175,
"epoch": 0.9962666666666666,
"grad_norm": 0.3551577627658844,
"learning_rate": 5.292323931663719e-05,
"loss": 0.9792759895324707,
"mean_token_accuracy": 0.7739578939974308,
"num_tokens": 21705183.0,
"step": 4670
},
{
"entropy": 0.9760521411895752,
"epoch": 0.9984,
"grad_norm": 0.2613706886768341,
"learning_rate": 5.275044675982724e-05,
"loss": 1.055685043334961,
"mean_token_accuracy": 0.7623668745160103,
"num_tokens": 21747104.0,
"step": 4680
},
{
"entropy": 0.9629100082736266,
"epoch": 1.0004266666666666,
"grad_norm": 0.3171702027320862,
"learning_rate": 5.257762124860431e-05,
"loss": 1.0939340591430664,
"mean_token_accuracy": 0.7673146160025346,
"num_tokens": 21789348.0,
"step": 4690
},
{
"entropy": 0.9121152207255363,
"epoch": 1.00256,
"grad_norm": 0.2546738386154175,
"learning_rate": 5.240476485367317e-05,
"loss": 0.9231260299682618,
"mean_token_accuracy": 0.7732596561312676,
"num_tokens": 21834781.0,
"step": 4700
},
{
"entropy": 0.8686859309673309,
"epoch": 1.0046933333333334,
"grad_norm": 0.25343966484069824,
"learning_rate": 5.223187964610865e-05,
"loss": 0.9800569534301757,
"mean_token_accuracy": 0.7781821310520172,
"num_tokens": 21879326.0,
"step": 4710
},
{
"entropy": 0.9335578382015228,
"epoch": 1.0068266666666668,
"grad_norm": 0.2416774481534958,
"learning_rate": 5.2058967697330784e-05,
"loss": 0.9976616859436035,
"mean_token_accuracy": 0.7626704692840576,
"num_tokens": 21933750.0,
"step": 4720
}
],
"logging_steps": 10,
"max_steps": 9376,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0386211988394086e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}