seed-training / trainer_state.json
azizillo's picture
Upload folder using huggingface_hub
492601b verified
{
"best_global_step": 5400,
"best_metric": 1.2261559963226318,
"best_model_checkpoint": "./results-3/checkpoint-5400",
"epoch": 8.0,
"eval_steps": 150,
"global_step": 6184,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.4760531455278396,
"epoch": 0.0129366106080207,
"grad_norm": 1.3410229682922363,
"learning_rate": 9.67741935483871e-06,
"loss": 3.8342,
"mean_token_accuracy": 0.40634620636701585,
"num_tokens": 77854.0,
"step": 10
},
{
"entropy": 1.4689971387386322,
"epoch": 0.0258732212160414,
"grad_norm": 1.4104728698730469,
"learning_rate": 2.0430107526881722e-05,
"loss": 4.4137,
"mean_token_accuracy": 0.3765578977763653,
"num_tokens": 111064.0,
"step": 20
},
{
"entropy": 1.893897533416748,
"epoch": 0.03880983182406209,
"grad_norm": 0.8629273772239685,
"learning_rate": 3.118279569892473e-05,
"loss": 3.8151,
"mean_token_accuracy": 0.38278606086969375,
"num_tokens": 134712.0,
"step": 30
},
{
"entropy": 4.312886017560959,
"epoch": 0.0517464424320828,
"grad_norm": 0.0,
"learning_rate": 4.1935483870967746e-05,
"loss": 3.7735,
"mean_token_accuracy": 0.19467806722968817,
"num_tokens": 142734.0,
"step": 40
},
{
"entropy": 8.096190857887269,
"epoch": 0.0646830530401035,
"grad_norm": 0.0,
"learning_rate": 5.268817204301075e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 143374.0,
"step": 50
},
{
"entropy": 2.519210198521614,
"epoch": 0.07761966364812418,
"grad_norm": 0.46956390142440796,
"learning_rate": 6.344086021505376e-05,
"loss": 2.7759,
"mean_token_accuracy": 0.4458329685032368,
"num_tokens": 218138.0,
"step": 60
},
{
"entropy": 2.7062919318675993,
"epoch": 0.09055627425614489,
"grad_norm": 0.36261770129203796,
"learning_rate": 7.419354838709677e-05,
"loss": 2.5766,
"mean_token_accuracy": 0.4825271964073181,
"num_tokens": 250316.0,
"step": 70
},
{
"entropy": 2.5266534447669984,
"epoch": 0.1034928848641656,
"grad_norm": 0.39197003841400146,
"learning_rate": 8.494623655913979e-05,
"loss": 2.5861,
"mean_token_accuracy": 0.47026830837130545,
"num_tokens": 272857.0,
"step": 80
},
{
"entropy": 4.64949648976326,
"epoch": 0.11642949547218628,
"grad_norm": 0.0,
"learning_rate": 9.56989247311828e-05,
"loss": 2.5449,
"mean_token_accuracy": 0.20907760383561252,
"num_tokens": 279057.0,
"step": 90
},
{
"entropy": 6.761381912231445,
"epoch": 0.129366106080207,
"grad_norm": 0.0,
"learning_rate": 0.0001064516129032258,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 279697.0,
"step": 100
},
{
"entropy": 2.331750747561455,
"epoch": 0.1423027166882277,
"grad_norm": 0.4349558353424072,
"learning_rate": 0.00011720430107526883,
"loss": 2.4607,
"mean_token_accuracy": 0.4927462741732597,
"num_tokens": 358859.0,
"step": 110
},
{
"entropy": 1.976218768954277,
"epoch": 0.15523932729624837,
"grad_norm": 0.24631856381893158,
"learning_rate": 0.00012795698924731184,
"loss": 2.038,
"mean_token_accuracy": 0.564648849517107,
"num_tokens": 391721.0,
"step": 120
},
{
"entropy": 2.3566002756357194,
"epoch": 0.16817593790426907,
"grad_norm": 0.33470404148101807,
"learning_rate": 0.00013870967741935487,
"loss": 2.3135,
"mean_token_accuracy": 0.5072783440351486,
"num_tokens": 415274.0,
"step": 130
},
{
"entropy": 3.8850305318832397,
"epoch": 0.18111254851228978,
"grad_norm": 0.0,
"learning_rate": 0.00014946236559139787,
"loss": 2.4748,
"mean_token_accuracy": 0.29179108552634714,
"num_tokens": 423127.0,
"step": 140
},
{
"entropy": 6.345981705188751,
"epoch": 0.19404915912031048,
"grad_norm": 0.0,
"learning_rate": 0.00016021505376344087,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 423767.0,
"step": 150
},
{
"epoch": 0.19404915912031048,
"eval_entropy": 3.4076465337082396,
"eval_loss": 2.088292360305786,
"eval_mean_token_accuracy": 0.3316028483731802,
"eval_num_tokens": 423767.0,
"eval_runtime": 243.9108,
"eval_samples_per_second": 22.533,
"eval_steps_per_second": 1.41,
"step": 150
},
{
"entropy": 2.2055542409420013,
"epoch": 0.2069857697283312,
"grad_norm": 0.31700077652931213,
"learning_rate": 0.0001709677419354839,
"loss": 2.4005,
"mean_token_accuracy": 0.5007682546973229,
"num_tokens": 500625.0,
"step": 160
},
{
"entropy": 1.8786041021347046,
"epoch": 0.21992238033635186,
"grad_norm": 0.24800752103328705,
"learning_rate": 0.0001817204301075269,
"loss": 1.8474,
"mean_token_accuracy": 0.5935635283589363,
"num_tokens": 534396.0,
"step": 170
},
{
"entropy": 2.263536959886551,
"epoch": 0.23285899094437257,
"grad_norm": 0.3183101415634155,
"learning_rate": 0.00019247311827956992,
"loss": 2.2154,
"mean_token_accuracy": 0.518243944644928,
"num_tokens": 558685.0,
"step": 180
},
{
"entropy": 4.052780479192734,
"epoch": 0.24579560155239327,
"grad_norm": 0.0,
"learning_rate": 0.00019999987654768255,
"loss": 2.3652,
"mean_token_accuracy": 0.32749315425753595,
"num_tokens": 566987.0,
"step": 190
},
{
"entropy": 4.421183264255523,
"epoch": 0.258732212160414,
"grad_norm": 0.0,
"learning_rate": 0.0001999976818482961,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 567627.0,
"step": 200
},
{
"entropy": 2.0365082800388334,
"epoch": 0.2716688227684347,
"grad_norm": 0.2679975628852844,
"learning_rate": 0.00019999274383338027,
"loss": 2.1862,
"mean_token_accuracy": 0.5347613260149956,
"num_tokens": 644352.0,
"step": 210
},
{
"entropy": 1.8313011974096298,
"epoch": 0.2846054333764554,
"grad_norm": 0.2597528398036957,
"learning_rate": 0.00019998506263840354,
"loss": 1.8579,
"mean_token_accuracy": 0.5869012281298638,
"num_tokens": 676791.0,
"step": 220
},
{
"entropy": 2.229961010813713,
"epoch": 0.2975420439844761,
"grad_norm": 0.39198312163352966,
"learning_rate": 0.00019997463847409023,
"loss": 2.2158,
"mean_token_accuracy": 0.5119729146361351,
"num_tokens": 699604.0,
"step": 230
},
{
"entropy": 3.5435027480125427,
"epoch": 0.31047865459249674,
"grad_norm": 0.0,
"learning_rate": 0.00019996147162641464,
"loss": 2.2309,
"mean_token_accuracy": 0.31516757532954215,
"num_tokens": 706414.0,
"step": 240
},
{
"entropy": 3.784550839662552,
"epoch": 0.32341526520051744,
"grad_norm": 0.0,
"learning_rate": 0.00019994556245659338,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 707054.0,
"step": 250
},
{
"entropy": 2.086453899741173,
"epoch": 0.33635187580853815,
"grad_norm": 0.2695913314819336,
"learning_rate": 0.00019992691140107525,
"loss": 2.2688,
"mean_token_accuracy": 0.5183561690151691,
"num_tokens": 787476.0,
"step": 260
},
{
"entropy": 1.80660640001297,
"epoch": 0.34928848641655885,
"grad_norm": 0.2775532603263855,
"learning_rate": 0.0001999055189715294,
"loss": 1.855,
"mean_token_accuracy": 0.5896616145968437,
"num_tokens": 820945.0,
"step": 270
},
{
"entropy": 2.265737462043762,
"epoch": 0.36222509702457956,
"grad_norm": 0.35880544781684875,
"learning_rate": 0.0001998813857548313,
"loss": 2.1884,
"mean_token_accuracy": 0.5160560064017773,
"num_tokens": 844570.0,
"step": 280
},
{
"entropy": 3.490731942653656,
"epoch": 0.37516170763260026,
"grad_norm": 0.0,
"learning_rate": 0.0001998545124130466,
"loss": 2.196,
"mean_token_accuracy": 0.3669252373278141,
"num_tokens": 852461.0,
"step": 290
},
{
"entropy": 3.8365337908267976,
"epoch": 0.38809831824062097,
"grad_norm": 0.0,
"learning_rate": 0.00019982489968341292,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 853101.0,
"step": 300
},
{
"epoch": 0.38809831824062097,
"eval_entropy": 2.108367764672568,
"eval_loss": 1.785624623298645,
"eval_mean_token_accuracy": 0.3863564946277197,
"eval_num_tokens": 853101.0,
"eval_runtime": 244.4512,
"eval_samples_per_second": 22.483,
"eval_steps_per_second": 1.407,
"step": 300
},
{
"entropy": 2.0010604202747344,
"epoch": 0.40103492884864167,
"grad_norm": 0.26067453622817993,
"learning_rate": 0.00019979254837831976,
"loss": 2.1888,
"mean_token_accuracy": 0.527290866523981,
"num_tokens": 932233.0,
"step": 310
},
{
"entropy": 1.8096002161502838,
"epoch": 0.4139715394566624,
"grad_norm": 0.3278159201145172,
"learning_rate": 0.00019975745938528597,
"loss": 1.8032,
"mean_token_accuracy": 0.5965773060917854,
"num_tokens": 965240.0,
"step": 320
},
{
"entropy": 2.239218121767044,
"epoch": 0.4269081500646831,
"grad_norm": 0.3497501611709595,
"learning_rate": 0.00019971963366693574,
"loss": 2.1853,
"mean_token_accuracy": 0.5204933404922485,
"num_tokens": 988836.0,
"step": 330
},
{
"entropy": 3.591762775182724,
"epoch": 0.4398447606727037,
"grad_norm": 0.0,
"learning_rate": 0.0001996790722609719,
"loss": 2.0384,
"mean_token_accuracy": 0.3091650754213333,
"num_tokens": 995598.0,
"step": 340
},
{
"entropy": 1.7911852180957795,
"epoch": 0.45278137128072443,
"grad_norm": 0.0,
"learning_rate": 0.00019963577628014757,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 996238.0,
"step": 350
},
{
"entropy": 1.9936166375875473,
"epoch": 0.46571798188874514,
"grad_norm": 0.2826139032840729,
"learning_rate": 0.00019958974691223572,
"loss": 2.1339,
"mean_token_accuracy": 0.5367397539317608,
"num_tokens": 1068779.0,
"step": 360
},
{
"entropy": 1.7499752998352052,
"epoch": 0.47865459249676584,
"grad_norm": 0.25705066323280334,
"learning_rate": 0.00019954098541999634,
"loss": 1.7626,
"mean_token_accuracy": 0.6045101627707481,
"num_tokens": 1101822.0,
"step": 370
},
{
"entropy": 2.2398334205150605,
"epoch": 0.49159120310478654,
"grad_norm": 0.35060882568359375,
"learning_rate": 0.00019948949314114208,
"loss": 2.1407,
"mean_token_accuracy": 0.5221379362046719,
"num_tokens": 1125242.0,
"step": 380
},
{
"entropy": 3.20022537112236,
"epoch": 0.5045278137128072,
"grad_norm": 0.0,
"learning_rate": 0.00019943527148830138,
"loss": 2.1867,
"mean_token_accuracy": 0.3573383778333664,
"num_tokens": 1132694.0,
"step": 390
},
{
"entropy": 3.233865666389465,
"epoch": 0.517464424320828,
"grad_norm": 0.0,
"learning_rate": 0.00019937832194897968,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 1133334.0,
"step": 400
},
{
"entropy": 1.883677563071251,
"epoch": 0.5304010349288486,
"grad_norm": 0.253384530544281,
"learning_rate": 0.00019931864608551886,
"loss": 2.065,
"mean_token_accuracy": 0.5480175256729126,
"num_tokens": 1208651.0,
"step": 410
},
{
"entropy": 1.8230630427598953,
"epoch": 0.5433376455368694,
"grad_norm": 0.27244824171066284,
"learning_rate": 0.000199256245535054,
"loss": 1.7993,
"mean_token_accuracy": 0.5971413522958755,
"num_tokens": 1241633.0,
"step": 420
},
{
"entropy": 2.1840337038040163,
"epoch": 0.55627425614489,
"grad_norm": 0.33489564061164856,
"learning_rate": 0.00019919112200946878,
"loss": 2.1355,
"mean_token_accuracy": 0.523309488594532,
"num_tokens": 1265245.0,
"step": 430
},
{
"entropy": 3.2613951563835144,
"epoch": 0.5692108667529108,
"grad_norm": 0.0,
"learning_rate": 0.0001991232772953485,
"loss": 2.0666,
"mean_token_accuracy": 0.36050624772906303,
"num_tokens": 1272655.0,
"step": 440
},
{
"entropy": 2.055793708562851,
"epoch": 0.5821474773609314,
"grad_norm": 0.0,
"learning_rate": 0.0001990527132539308,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 1273295.0,
"step": 450
},
{
"epoch": 0.5821474773609314,
"eval_entropy": 1.5776830232420633,
"eval_loss": 1.6587693691253662,
"eval_mean_token_accuracy": 0.40149001534595047,
"eval_num_tokens": 1273295.0,
"eval_runtime": 245.22,
"eval_samples_per_second": 22.413,
"eval_steps_per_second": 1.403,
"step": 450
},
{
"entropy": 1.9504007667303085,
"epoch": 0.5950840879689522,
"grad_norm": 0.2335178405046463,
"learning_rate": 0.00019897943182105486,
"loss": 2.1289,
"mean_token_accuracy": 0.5388719126582145,
"num_tokens": 1353662.0,
"step": 460
},
{
"entropy": 1.812851694226265,
"epoch": 0.6080206985769728,
"grad_norm": 0.27590492367744446,
"learning_rate": 0.00019890343500710827,
"loss": 1.79,
"mean_token_accuracy": 0.5952848941087723,
"num_tokens": 1386745.0,
"step": 470
},
{
"entropy": 2.1694509744644166,
"epoch": 0.6209573091849935,
"grad_norm": 0.36973315477371216,
"learning_rate": 0.0001988247248969717,
"loss": 2.1425,
"mean_token_accuracy": 0.5235736042261123,
"num_tokens": 1410114.0,
"step": 480
},
{
"entropy": 3.2446247756481172,
"epoch": 0.6338939197930142,
"grad_norm": 0.0,
"learning_rate": 0.00019874330364996192,
"loss": 2.0907,
"mean_token_accuracy": 0.3589281477034092,
"num_tokens": 1417385.0,
"step": 490
},
{
"entropy": 2.887257432937622,
"epoch": 0.6468305304010349,
"grad_norm": 0.0,
"learning_rate": 0.00019865917349977242,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 1418025.0,
"step": 500
},
{
"entropy": 2.0031155347824097,
"epoch": 0.6597671410090556,
"grad_norm": 0.2290731519460678,
"learning_rate": 0.00019857233675441217,
"loss": 2.1288,
"mean_token_accuracy": 0.5355072975158691,
"num_tokens": 1498284.0,
"step": 510
},
{
"entropy": 1.7464266479015351,
"epoch": 0.6727037516170763,
"grad_norm": 0.27917975187301636,
"learning_rate": 0.0001984827957961423,
"loss": 1.7213,
"mean_token_accuracy": 0.6062818467617035,
"num_tokens": 1531645.0,
"step": 520
},
{
"entropy": 2.099287986755371,
"epoch": 0.685640362225097,
"grad_norm": 0.34847304224967957,
"learning_rate": 0.00019839055308141078,
"loss": 2.0957,
"mean_token_accuracy": 0.5292750775814057,
"num_tokens": 1555744.0,
"step": 530
},
{
"entropy": 2.987301951646805,
"epoch": 0.6985769728331177,
"grad_norm": 0.0,
"learning_rate": 0.00019829561114078503,
"loss": 2.035,
"mean_token_accuracy": 0.35071768537163733,
"num_tokens": 1563621.0,
"step": 540
},
{
"entropy": 1.7990713268518448,
"epoch": 0.7115135834411385,
"grad_norm": 0.0,
"learning_rate": 0.00019819797257888237,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 1564261.0,
"step": 550
},
{
"entropy": 1.9829886645078658,
"epoch": 0.7244501940491591,
"grad_norm": 0.23086819052696228,
"learning_rate": 0.00019809764007429874,
"loss": 2.0682,
"mean_token_accuracy": 0.546464990824461,
"num_tokens": 1645469.0,
"step": 560
},
{
"entropy": 1.742349737882614,
"epoch": 0.7373868046571799,
"grad_norm": 0.2855489253997803,
"learning_rate": 0.00019799461637953517,
"loss": 1.7437,
"mean_token_accuracy": 0.6023638218641281,
"num_tokens": 1678187.0,
"step": 570
},
{
"entropy": 2.0789969861507416,
"epoch": 0.7503234152652005,
"grad_norm": 0.3439568877220154,
"learning_rate": 0.00019788890432092211,
"loss": 2.0849,
"mean_token_accuracy": 0.5323359861969947,
"num_tokens": 1701620.0,
"step": 580
},
{
"entropy": 3.068958950042725,
"epoch": 0.7632600258732212,
"grad_norm": 0.0,
"learning_rate": 0.0001977805067985422,
"loss": 2.0752,
"mean_token_accuracy": 0.34963107854127884,
"num_tokens": 1709495.0,
"step": 590
},
{
"entropy": 1.8244159191846847,
"epoch": 0.7761966364812419,
"grad_norm": 0.0,
"learning_rate": 0.00019766942678615035,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 1710135.0,
"step": 600
},
{
"epoch": 0.7761966364812419,
"eval_entropy": 1.7642724643959555,
"eval_loss": 1.5706199407577515,
"eval_mean_token_accuracy": 0.41619762856253356,
"eval_num_tokens": 1710135.0,
"eval_runtime": 241.9763,
"eval_samples_per_second": 22.713,
"eval_steps_per_second": 1.422,
"step": 600
},
{
"entropy": 2.0163265824317933,
"epoch": 0.7891332470892626,
"grad_norm": 0.21755698323249817,
"learning_rate": 0.00019755566733109251,
"loss": 2.083,
"mean_token_accuracy": 0.5411292694509029,
"num_tokens": 1791443.0,
"step": 610
},
{
"entropy": 1.7245848059654236,
"epoch": 0.8020698576972833,
"grad_norm": 0.288361519575119,
"learning_rate": 0.0001974392315542218,
"loss": 1.735,
"mean_token_accuracy": 0.6052085891366005,
"num_tokens": 1824564.0,
"step": 620
},
{
"entropy": 2.11205150783062,
"epoch": 0.815006468305304,
"grad_norm": 0.3383215069770813,
"learning_rate": 0.000197320122649813,
"loss": 2.1082,
"mean_token_accuracy": 0.5229554586112499,
"num_tokens": 1847974.0,
"step": 630
},
{
"entropy": 3.1667094111442564,
"epoch": 0.8279430789133247,
"grad_norm": 0.0,
"learning_rate": 0.000197198343885475,
"loss": 2.0386,
"mean_token_accuracy": 0.3788307599723339,
"num_tokens": 1855343.0,
"step": 640
},
{
"entropy": 4.338292050361633,
"epoch": 0.8408796895213454,
"grad_norm": 0.0,
"learning_rate": 0.00019707389860206087,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 1855983.0,
"step": 650
},
{
"entropy": 2.0194253027439117,
"epoch": 0.8538163001293662,
"grad_norm": 0.2260085493326187,
"learning_rate": 0.00019694679021357666,
"loss": 2.0757,
"mean_token_accuracy": 0.5414572946727276,
"num_tokens": 1933686.0,
"step": 660
},
{
"entropy": 1.7457041829824447,
"epoch": 0.8667529107373868,
"grad_norm": 0.2763752341270447,
"learning_rate": 0.00019681702220708725,
"loss": 1.7265,
"mean_token_accuracy": 0.6072784595191478,
"num_tokens": 1967008.0,
"step": 670
},
{
"entropy": 2.0987232238054276,
"epoch": 0.8796895213454075,
"grad_norm": 0.3309071958065033,
"learning_rate": 0.00019668459814262116,
"loss": 2.0841,
"mean_token_accuracy": 0.5245410539209843,
"num_tokens": 1990659.0,
"step": 680
},
{
"entropy": 3.1821718513965607,
"epoch": 0.8926261319534282,
"grad_norm": 0.0,
"learning_rate": 0.00019654952165307245,
"loss": 2.229,
"mean_token_accuracy": 0.3981798455119133,
"num_tokens": 1999251.0,
"step": 690
},
{
"entropy": 1.533292955160141,
"epoch": 0.9055627425614489,
"grad_norm": 0.0,
"learning_rate": 0.00019641179644410136,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 1999891.0,
"step": 700
},
{
"entropy": 1.9957170754671096,
"epoch": 0.9184993531694696,
"grad_norm": 0.24394062161445618,
"learning_rate": 0.00019627142629403258,
"loss": 2.0975,
"mean_token_accuracy": 0.5407429985702038,
"num_tokens": 2079895.0,
"step": 710
},
{
"entropy": 1.7518584847450256,
"epoch": 0.9314359637774903,
"grad_norm": 0.307822048664093,
"learning_rate": 0.00019612841505375138,
"loss": 1.7164,
"mean_token_accuracy": 0.610467329621315,
"num_tokens": 2113509.0,
"step": 720
},
{
"entropy": 2.1020208179950712,
"epoch": 0.944372574385511,
"grad_norm": 0.35130032896995544,
"learning_rate": 0.0001959827666465984,
"loss": 2.1253,
"mean_token_accuracy": 0.5220636121928692,
"num_tokens": 2137129.0,
"step": 730
},
{
"entropy": 2.8922512531280518,
"epoch": 0.9573091849935317,
"grad_norm": 0.0,
"learning_rate": 0.00019583448506826155,
"loss": 1.9805,
"mean_token_accuracy": 0.3766542553901672,
"num_tokens": 2144488.0,
"step": 740
},
{
"entropy": 2.719996190071106,
"epoch": 0.9702457956015524,
"grad_norm": 0.0,
"learning_rate": 0.00019568357438666675,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 2145128.0,
"step": 750
},
{
"epoch": 0.9702457956015524,
"eval_entropy": 2.1529439126336296,
"eval_loss": 1.5584267377853394,
"eval_mean_token_accuracy": 0.41127229698522144,
"eval_num_tokens": 2145128.0,
"eval_runtime": 239.9653,
"eval_samples_per_second": 22.903,
"eval_steps_per_second": 1.434,
"step": 750
},
{
"entropy": 1.9202934563159944,
"epoch": 0.9831824062095731,
"grad_norm": 0.28597503900527954,
"learning_rate": 0.00019553003874186607,
"loss": 1.9302,
"mean_token_accuracy": 0.5697067268192768,
"num_tokens": 2197523.0,
"step": 760
},
{
"entropy": 2.492697748541832,
"epoch": 0.9961190168175937,
"grad_norm": 0.0,
"learning_rate": 0.00019537388234592442,
"loss": 1.81,
"mean_token_accuracy": 0.39367630481719973,
"num_tokens": 2210056.0,
"step": 770
},
{
"entropy": 2.2332220911979674,
"epoch": 1.0090556274256144,
"grad_norm": 0.24866575002670288,
"learning_rate": 0.00019521510948280373,
"loss": 1.5005,
"mean_token_accuracy": 0.36937303096055984,
"num_tokens": 2275252.0,
"step": 780
},
{
"entropy": 1.6707671225070952,
"epoch": 1.0219922380336353,
"grad_norm": 0.26215294003486633,
"learning_rate": 0.0001950537245082456,
"loss": 1.6341,
"mean_token_accuracy": 0.6254087015986443,
"num_tokens": 2311716.0,
"step": 790
},
{
"entropy": 1.8872807383537293,
"epoch": 1.034928848641656,
"grad_norm": 0.36441880464553833,
"learning_rate": 0.0001948897318496517,
"loss": 1.8977,
"mean_token_accuracy": 0.5622286461293697,
"num_tokens": 2338280.0,
"step": 800
},
{
"entropy": 2.630810996890068,
"epoch": 1.0478654592496766,
"grad_norm": 0.9684458374977112,
"learning_rate": 0.0001947231360059624,
"loss": 2.4046,
"mean_token_accuracy": 0.48553739935159684,
"num_tokens": 2351659.0,
"step": 810
},
{
"entropy": 2.581965911388397,
"epoch": 1.0608020698576972,
"grad_norm": 0.0,
"learning_rate": 0.0001945539415475333,
"loss": 0.1693,
"mean_token_accuracy": 0.06434160768985749,
"num_tokens": 2352447.0,
"step": 820
},
{
"entropy": 2.1218140482902528,
"epoch": 1.073738680465718,
"grad_norm": 0.28001680970191956,
"learning_rate": 0.00019438215311600989,
"loss": 1.5396,
"mean_token_accuracy": 0.3639061972498894,
"num_tokens": 2421672.0,
"step": 830
},
{
"entropy": 1.6590570658445358,
"epoch": 1.0866752910737387,
"grad_norm": 0.27536195516586304,
"learning_rate": 0.0001942077754242001,
"loss": 1.5986,
"mean_token_accuracy": 0.6285051852464676,
"num_tokens": 2458016.0,
"step": 840
},
{
"entropy": 1.8610892415046691,
"epoch": 1.0996119016817594,
"grad_norm": 0.3670406937599182,
"learning_rate": 0.00019403081325594516,
"loss": 1.8678,
"mean_token_accuracy": 0.5674182385206222,
"num_tokens": 2484503.0,
"step": 850
},
{
"entropy": 2.5339868366718292,
"epoch": 1.11254851228978,
"grad_norm": 0.911289393901825,
"learning_rate": 0.0001938512714659882,
"loss": 2.3594,
"mean_token_accuracy": 0.49951401725411415,
"num_tokens": 2498485.0,
"step": 860
},
{
"entropy": 1.802379448711872,
"epoch": 1.1254851228978007,
"grad_norm": 0.0,
"learning_rate": 0.00019366915497984126,
"loss": 0.1255,
"mean_token_accuracy": 0.04691708832979202,
"num_tokens": 2499204.0,
"step": 870
},
{
"entropy": 1.716230283677578,
"epoch": 1.1384217335058215,
"grad_norm": 0.29532766342163086,
"learning_rate": 0.00019348446879364998,
"loss": 1.5067,
"mean_token_accuracy": 0.3694909021258354,
"num_tokens": 2567621.0,
"step": 880
},
{
"entropy": 1.6236608117818832,
"epoch": 1.1513583441138422,
"grad_norm": 0.29713669419288635,
"learning_rate": 0.00019329721797405665,
"loss": 1.5861,
"mean_token_accuracy": 0.6327742949128151,
"num_tokens": 2603962.0,
"step": 890
},
{
"entropy": 1.8917641669511795,
"epoch": 1.1642949547218628,
"grad_norm": 0.3658815324306488,
"learning_rate": 0.00019310740765806112,
"loss": 1.9243,
"mean_token_accuracy": 0.5606695532798767,
"num_tokens": 2630252.0,
"step": 900
},
{
"epoch": 1.1642949547218628,
"eval_entropy": 1.9112664786882179,
"eval_loss": 1.4807052612304688,
"eval_mean_token_accuracy": 0.42875484818982523,
"eval_num_tokens": 2630252.0,
"eval_runtime": 244.6093,
"eval_samples_per_second": 22.468,
"eval_steps_per_second": 1.406,
"step": 900
},
{
"entropy": 2.6821564227342605,
"epoch": 1.1772315653298835,
"grad_norm": 1.0140999555587769,
"learning_rate": 0.00019291504305288005,
"loss": 2.4338,
"mean_token_accuracy": 0.482094044983387,
"num_tokens": 2643300.0,
"step": 910
},
{
"entropy": 2.024871030449867,
"epoch": 1.1901681759379044,
"grad_norm": 0.0,
"learning_rate": 0.00019272012943580383,
"loss": 0.088,
"mean_token_accuracy": 0.05487980842590332,
"num_tokens": 2644037.0,
"step": 920
},
{
"entropy": 1.9695144146680832,
"epoch": 1.203104786545925,
"grad_norm": 0.290670245885849,
"learning_rate": 0.00019252267215405188,
"loss": 1.523,
"mean_token_accuracy": 0.36803208142518995,
"num_tokens": 2711455.0,
"step": 930
},
{
"entropy": 1.634880828857422,
"epoch": 1.2160413971539457,
"grad_norm": 0.2892841100692749,
"learning_rate": 0.00019232267662462618,
"loss": 1.5725,
"mean_token_accuracy": 0.6363927751779557,
"num_tokens": 2747178.0,
"step": 940
},
{
"entropy": 1.8903283953666687,
"epoch": 1.2289780077619663,
"grad_norm": 0.3681142330169678,
"learning_rate": 0.00019212014833416222,
"loss": 1.9128,
"mean_token_accuracy": 0.5572593852877616,
"num_tokens": 2773302.0,
"step": 950
},
{
"entropy": 2.5646925628185273,
"epoch": 1.2419146183699872,
"grad_norm": 2.999826669692993,
"learning_rate": 0.00019191509283877892,
"loss": 2.3972,
"mean_token_accuracy": 0.49176110327243805,
"num_tokens": 2787000.0,
"step": 960
},
{
"entropy": 2.151153501868248,
"epoch": 1.2548512289780078,
"grad_norm": 0.0,
"learning_rate": 0.00019170751576392587,
"loss": 0.1193,
"mean_token_accuracy": 0.044841271638870236,
"num_tokens": 2787722.0,
"step": 970
},
{
"entropy": 1.8522070705890656,
"epoch": 1.2677878395860285,
"grad_norm": 0.2727435827255249,
"learning_rate": 0.00019149742280422924,
"loss": 1.5171,
"mean_token_accuracy": 0.36686722859740256,
"num_tokens": 2854084.0,
"step": 980
},
{
"entropy": 1.5743449032306671,
"epoch": 1.2807244501940491,
"grad_norm": 0.2871781289577484,
"learning_rate": 0.00019128481972333544,
"loss": 1.5921,
"mean_token_accuracy": 0.6345128893852234,
"num_tokens": 2890579.0,
"step": 990
},
{
"entropy": 1.969143381714821,
"epoch": 1.2936610608020698,
"grad_norm": 0.4106636643409729,
"learning_rate": 0.00019106971235375298,
"loss": 1.9566,
"mean_token_accuracy": 0.5519939877092839,
"num_tokens": 2917103.0,
"step": 1000
},
{
"entropy": 2.637175753712654,
"epoch": 1.3065976714100906,
"grad_norm": 0.956899881362915,
"learning_rate": 0.0001908521065966926,
"loss": 2.4367,
"mean_token_accuracy": 0.47931770235300064,
"num_tokens": 2930324.0,
"step": 1010
},
{
"entropy": 1.2107470080256462,
"epoch": 1.3195342820181113,
"grad_norm": 0.0,
"learning_rate": 0.00019063200842190514,
"loss": 0.1138,
"mean_token_accuracy": 0.07033292502164841,
"num_tokens": 2931098.0,
"step": 1020
},
{
"entropy": 1.5640547186136247,
"epoch": 1.332470892626132,
"grad_norm": 0.2837156057357788,
"learning_rate": 0.00019040942386751804,
"loss": 1.5281,
"mean_token_accuracy": 0.368409526348114,
"num_tokens": 2998986.0,
"step": 1030
},
{
"entropy": 1.6472883015871047,
"epoch": 1.3454075032341526,
"grad_norm": 0.31581056118011475,
"learning_rate": 0.00019018435903986943,
"loss": 1.6144,
"mean_token_accuracy": 0.62486432492733,
"num_tokens": 3035300.0,
"step": 1040
},
{
"entropy": 1.8509329915046693,
"epoch": 1.3583441138421732,
"grad_norm": 0.39050692319869995,
"learning_rate": 0.00018995682011334087,
"loss": 1.8415,
"mean_token_accuracy": 0.5710361421108245,
"num_tokens": 3062133.0,
"step": 1050
},
{
"epoch": 1.3583441138421732,
"eval_entropy": 1.7658769363580749,
"eval_loss": 1.464791178703308,
"eval_mean_token_accuracy": 0.429339470125215,
"eval_num_tokens": 3062133.0,
"eval_runtime": 243.4077,
"eval_samples_per_second": 22.579,
"eval_steps_per_second": 1.413,
"step": 1050
},
{
"entropy": 2.4731887727975845,
"epoch": 1.371280724450194,
"grad_norm": 0.9063658714294434,
"learning_rate": 0.00018972681333018776,
"loss": 2.3412,
"mean_token_accuracy": 0.4919880717992783,
"num_tokens": 3076137.0,
"step": 1060
},
{
"entropy": 1.815966796875,
"epoch": 1.3842173350582148,
"grad_norm": 0.0,
"learning_rate": 0.00018949434500036816,
"loss": 0.2748,
"mean_token_accuracy": 0.094140625,
"num_tokens": 3077033.0,
"step": 1070
},
{
"entropy": 1.7788158431649208,
"epoch": 1.3971539456662354,
"grad_norm": 0.28700482845306396,
"learning_rate": 0.0001892594215013697,
"loss": 1.491,
"mean_token_accuracy": 0.3707178644835949,
"num_tokens": 3139012.0,
"step": 1080
},
{
"entropy": 1.5893326640129088,
"epoch": 1.4100905562742563,
"grad_norm": 0.3248252868652344,
"learning_rate": 0.00018902204927803462,
"loss": 1.5707,
"mean_token_accuracy": 0.6353108420968056,
"num_tokens": 3175132.0,
"step": 1090
},
{
"entropy": 1.8777880787849426,
"epoch": 1.4230271668822767,
"grad_norm": 0.4096948206424713,
"learning_rate": 0.00018878223484238295,
"loss": 1.9016,
"mean_token_accuracy": 0.5628921225667,
"num_tokens": 3201175.0,
"step": 1100
},
{
"entropy": 2.5787813514471054,
"epoch": 1.4359637774902976,
"grad_norm": 0.9349520206451416,
"learning_rate": 0.00018853998477343385,
"loss": 2.4275,
"mean_token_accuracy": 0.4918954521417618,
"num_tokens": 3213218.0,
"step": 1110
},
{
"entropy": 1.5953246742486953,
"epoch": 1.4489003880983182,
"grad_norm": 0.0,
"learning_rate": 0.00018829530571702515,
"loss": 0.0759,
"mean_token_accuracy": 0.03794117569923401,
"num_tokens": 3213902.0,
"step": 1120
},
{
"entropy": 1.7004274040460587,
"epoch": 1.4618369987063389,
"grad_norm": 0.28281426429748535,
"learning_rate": 0.000188048204385631,
"loss": 1.4741,
"mean_token_accuracy": 0.37432471886277197,
"num_tokens": 3278399.0,
"step": 1130
},
{
"entropy": 1.54911307990551,
"epoch": 1.4747736093143597,
"grad_norm": 0.3112603425979614,
"learning_rate": 0.00018779868755817777,
"loss": 1.529,
"mean_token_accuracy": 0.6405477434396744,
"num_tokens": 3314005.0,
"step": 1140
},
{
"entropy": 1.8169409155845642,
"epoch": 1.4877102199223804,
"grad_norm": 0.4136084020137787,
"learning_rate": 0.00018754676207985798,
"loss": 1.8563,
"mean_token_accuracy": 0.5684241697192192,
"num_tokens": 3339761.0,
"step": 1150
},
{
"entropy": 2.6468518733978272,
"epoch": 1.500646830530401,
"grad_norm": 0.9774990081787109,
"learning_rate": 0.00018729243486194258,
"loss": 2.4068,
"mean_token_accuracy": 0.49020475447177886,
"num_tokens": 3352396.0,
"step": 1160
},
{
"entropy": 1.844868466258049,
"epoch": 1.5135834411384217,
"grad_norm": 0.0,
"learning_rate": 0.0001870357128815915,
"loss": 0.1083,
"mean_token_accuracy": 0.03311403542757034,
"num_tokens": 3353089.0,
"step": 1170
},
{
"entropy": 1.77299522459507,
"epoch": 1.5265200517464423,
"grad_norm": 0.29017725586891174,
"learning_rate": 0.00018677660318166178,
"loss": 1.5134,
"mean_token_accuracy": 0.37067501023411753,
"num_tokens": 3417806.0,
"step": 1180
},
{
"entropy": 1.605825701355934,
"epoch": 1.5394566623544632,
"grad_norm": 0.3007327616214752,
"learning_rate": 0.000186515112870515,
"loss": 1.5754,
"mean_token_accuracy": 0.6359535038471222,
"num_tokens": 3453968.0,
"step": 1190
},
{
"entropy": 1.8059845566749573,
"epoch": 1.5523932729624839,
"grad_norm": 0.4170464277267456,
"learning_rate": 0.0001862512491218217,
"loss": 1.8209,
"mean_token_accuracy": 0.5729366824030876,
"num_tokens": 3480122.0,
"step": 1200
},
{
"epoch": 1.5523932729624839,
"eval_entropy": 1.846903031302053,
"eval_loss": 1.4494483470916748,
"eval_mean_token_accuracy": 0.4338979678618353,
"eval_num_tokens": 3480122.0,
"eval_runtime": 245.7587,
"eval_samples_per_second": 22.363,
"eval_steps_per_second": 1.4,
"step": 1200
},
{
"entropy": 2.4776687741279604,
"epoch": 1.5653298835705045,
"grad_norm": 1.236024022102356,
"learning_rate": 0.00018598501917436487,
"loss": 2.2694,
"mean_token_accuracy": 0.5161234959959984,
"num_tokens": 3492043.0,
"step": 1210
},
{
"entropy": 2.871905821561813,
"epoch": 1.5782664941785254,
"grad_norm": 0.0,
"learning_rate": 0.00018571643033184136,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 3492683.0,
"step": 1220
},
{
"entropy": 2.30782949924469,
"epoch": 1.5912031047865458,
"grad_norm": 0.3269418179988861,
"learning_rate": 0.00018544548996266138,
"loss": 1.4917,
"mean_token_accuracy": 0.3702575147151947,
"num_tokens": 3561621.0,
"step": 1230
},
{
"entropy": 1.5860986828804016,
"epoch": 1.6041397153945667,
"grad_norm": 0.33811113238334656,
"learning_rate": 0.00018517220549974642,
"loss": 1.5659,
"mean_token_accuracy": 0.6364668473601341,
"num_tokens": 3597551.0,
"step": 1240
},
{
"entropy": 1.8561100304126739,
"epoch": 1.6170763260025873,
"grad_norm": 0.4206816554069519,
"learning_rate": 0.00018489658444032544,
"loss": 1.8636,
"mean_token_accuracy": 0.5685464948415756,
"num_tokens": 3623516.0,
"step": 1250
},
{
"entropy": 2.475165989995003,
"epoch": 1.630012936610608,
"grad_norm": 0.9206745624542236,
"learning_rate": 0.00018461863434572905,
"loss": 2.3686,
"mean_token_accuracy": 0.49352553114295006,
"num_tokens": 3636662.0,
"step": 1260
},
{
"entropy": 1.5844505287706852,
"epoch": 1.6429495472186288,
"grad_norm": 0.0,
"learning_rate": 0.0001843383628411821,
"loss": 0.1782,
"mean_token_accuracy": 0.08751860111951829,
"num_tokens": 3637501.0,
"step": 1270
},
{
"entropy": 1.5500032015144825,
"epoch": 1.6558861578266493,
"grad_norm": 0.2985474169254303,
"learning_rate": 0.00018405577761559453,
"loss": 1.5005,
"mean_token_accuracy": 0.3704367861151695,
"num_tokens": 3705898.0,
"step": 1280
},
{
"entropy": 1.5747513711452483,
"epoch": 1.6688227684346701,
"grad_norm": 0.3510088622570038,
"learning_rate": 0.0001837708864213505,
"loss": 1.5586,
"mean_token_accuracy": 0.6378742828965187,
"num_tokens": 3742275.0,
"step": 1290
},
{
"entropy": 1.7819489419460297,
"epoch": 1.6817593790426908,
"grad_norm": 0.42687690258026123,
"learning_rate": 0.00018348369707409546,
"loss": 1.8096,
"mean_token_accuracy": 0.5733471587300301,
"num_tokens": 3768563.0,
"step": 1300
},
{
"entropy": 2.4534367620944977,
"epoch": 1.6946959896507114,
"grad_norm": 0.9902492165565491,
"learning_rate": 0.00018319421745252208,
"loss": 2.3035,
"mean_token_accuracy": 0.49916471540927887,
"num_tokens": 3782396.0,
"step": 1310
},
{
"entropy": 1.977598437666893,
"epoch": 1.7076326002587323,
"grad_norm": 0.0,
"learning_rate": 0.00018290245549815385,
"loss": 0.1527,
"mean_token_accuracy": 0.0657636746764183,
"num_tokens": 3783196.0,
"step": 1320
},
{
"entropy": 2.1555118948221206,
"epoch": 1.720569210866753,
"grad_norm": 0.3243282437324524,
"learning_rate": 0.0001826084192151273,
"loss": 1.5106,
"mean_token_accuracy": 0.36851018443703654,
"num_tokens": 3846769.0,
"step": 1330
},
{
"entropy": 1.5848265200853349,
"epoch": 1.7335058214747736,
"grad_norm": 0.32707569003105164,
"learning_rate": 0.00018231211666997247,
"loss": 1.5277,
"mean_token_accuracy": 0.642450013756752,
"num_tokens": 3882748.0,
"step": 1340
},
{
"entropy": 1.8691698461771011,
"epoch": 1.7464424320827943,
"grad_norm": 0.43988320231437683,
"learning_rate": 0.00018201355599139154,
"loss": 1.9016,
"mean_token_accuracy": 0.56101154088974,
"num_tokens": 3908934.0,
"step": 1350
},
{
"epoch": 1.7464424320827943,
"eval_entropy": 1.7982253941685655,
"eval_loss": 1.4296140670776367,
"eval_mean_token_accuracy": 0.43251860254379204,
"eval_num_tokens": 3908934.0,
"eval_runtime": 245.0387,
"eval_samples_per_second": 22.429,
"eval_steps_per_second": 1.404,
"step": 1350
},
{
"entropy": 2.471151527762413,
"epoch": 1.759379042690815,
"grad_norm": 0.9302666187286377,
"learning_rate": 0.0001817127453700358,
"loss": 2.3247,
"mean_token_accuracy": 0.5023237220942974,
"num_tokens": 3922255.0,
"step": 1360
},
{
"entropy": 1.8378637909889222,
"epoch": 1.7723156532988358,
"grad_norm": 0.0,
"learning_rate": 0.00018140969305828106,
"loss": 0.0576,
"mean_token_accuracy": 0.0373076930642128,
"num_tokens": 3922926.0,
"step": 1370
},
{
"entropy": 1.7470036551356316,
"epoch": 1.7852522639068564,
"grad_norm": 0.3011367619037628,
"learning_rate": 0.00018110440737000122,
"loss": 1.4591,
"mean_token_accuracy": 0.3771127283573151,
"num_tokens": 3990074.0,
"step": 1380
},
{
"entropy": 1.5329654335975647,
"epoch": 1.798188874514877,
"grad_norm": 0.31504422426223755,
"learning_rate": 0.00018079689668034005,
"loss": 1.4973,
"mean_token_accuracy": 0.6467197388410568,
"num_tokens": 4026755.0,
"step": 1390
},
{
"entropy": 1.7885783523321153,
"epoch": 1.811125485122898,
"grad_norm": 0.42766207456588745,
"learning_rate": 0.00018048716942548168,
"loss": 1.8211,
"mean_token_accuracy": 0.5723803475499153,
"num_tokens": 4053589.0,
"step": 1400
},
{
"entropy": 2.405156469345093,
"epoch": 1.8240620957309184,
"grad_norm": 0.953956663608551,
"learning_rate": 0.00018017523410241893,
"loss": 2.2967,
"mean_token_accuracy": 0.5070258714258671,
"num_tokens": 4068297.0,
"step": 1410
},
{
"entropy": 1.202190825343132,
"epoch": 1.8369987063389392,
"grad_norm": 0.0,
"learning_rate": 0.00017986109926872032,
"loss": 0.2475,
"mean_token_accuracy": 0.09388883709907532,
"num_tokens": 4069205.0,
"step": 1420
},
{
"entropy": 1.8208864331245422,
"epoch": 1.84993531694696,
"grad_norm": 0.30337706208229065,
"learning_rate": 0.00017954477354229536,
"loss": 1.4609,
"mean_token_accuracy": 0.3746915958821774,
"num_tokens": 4135636.0,
"step": 1430
},
{
"entropy": 1.547205138206482,
"epoch": 1.8628719275549805,
"grad_norm": 0.3231499493122101,
"learning_rate": 0.00017922626560115798,
"loss": 1.5262,
"mean_token_accuracy": 0.6422269076108933,
"num_tokens": 4171871.0,
"step": 1440
},
{
"entropy": 1.8343932330608368,
"epoch": 1.8758085381630014,
"grad_norm": 0.45170995593070984,
"learning_rate": 0.0001789055841831885,
"loss": 1.8589,
"mean_token_accuracy": 0.5682013630867004,
"num_tokens": 4198004.0,
"step": 1450
},
{
"entropy": 2.4178356170654296,
"epoch": 1.8887451487710218,
"grad_norm": 1.1836594343185425,
"learning_rate": 0.00017858273808589402,
"loss": 2.219,
"mean_token_accuracy": 0.5180532835423947,
"num_tokens": 4210568.0,
"step": 1460
},
{
"entropy": 1.6111401319503784,
"epoch": 1.9016817593790427,
"grad_norm": 0.0,
"learning_rate": 0.00017825773616616703,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 4211208.0,
"step": 1470
},
{
"entropy": 1.7321304202079773,
"epoch": 1.9146183699870634,
"grad_norm": 0.30086463689804077,
"learning_rate": 0.0001779305873400423,
"loss": 1.4654,
"mean_token_accuracy": 0.3772578649222851,
"num_tokens": 4279659.0,
"step": 1480
},
{
"entropy": 1.5423625767230988,
"epoch": 1.927554980595084,
"grad_norm": 0.33361881971359253,
"learning_rate": 0.00017760130058245242,
"loss": 1.4942,
"mean_token_accuracy": 0.6453819587826729,
"num_tokens": 4315273.0,
"step": 1490
},
{
"entropy": 1.8349818885326385,
"epoch": 1.9404915912031049,
"grad_norm": 0.4649695158004761,
"learning_rate": 0.0001772698849269816,
"loss": 1.8167,
"mean_token_accuracy": 0.5768257766962052,
"num_tokens": 4341460.0,
"step": 1500
},
{
"epoch": 1.9404915912031049,
"eval_entropy": 1.7394565719851227,
"eval_loss": 1.4138603210449219,
"eval_mean_token_accuracy": 0.43571045708864237,
"eval_num_tokens": 4341460.0,
"eval_runtime": 245.3477,
"eval_samples_per_second": 22.401,
"eval_steps_per_second": 1.402,
"step": 1500
},
{
"entropy": 2.375590392947197,
"epoch": 1.9534282018111255,
"grad_norm": 0.9830443263053894,
"learning_rate": 0.00017693634946561775,
"loss": 2.2734,
"mean_token_accuracy": 0.5091598987579345,
"num_tokens": 4355559.0,
"step": 1510
},
{
"entropy": 2.5652388006448748,
"epoch": 1.9663648124191462,
"grad_norm": 0.0,
"learning_rate": 0.00017660070334850304,
"loss": 0.1559,
"mean_token_accuracy": 0.07029985040426254,
"num_tokens": 4356373.0,
"step": 1520
},
{
"entropy": 2.071737268567085,
"epoch": 1.9793014230271668,
"grad_norm": 0.33517006039619446,
"learning_rate": 0.00017626295578368305,
"loss": 1.2406,
"mean_token_accuracy": 0.418473818898201,
"num_tokens": 4398312.0,
"step": 1530
},
{
"entropy": 2.0724180698394776,
"epoch": 1.9922380336351875,
"grad_norm": 0.8757649660110474,
"learning_rate": 0.00017592311603685393,
"loss": 2.0395,
"mean_token_accuracy": 0.5450932942330837,
"num_tokens": 4419963.0,
"step": 1540
},
{
"entropy": 2.4648678690195083,
"epoch": 2.0051746442432083,
"grad_norm": 0.29208359122276306,
"learning_rate": 0.00017558119343110838,
"loss": 1.0811,
"mean_token_accuracy": 0.2497500881552696,
"num_tokens": 4466509.0,
"step": 1550
},
{
"entropy": 1.4679036349058152,
"epoch": 2.0181112548512288,
"grad_norm": 0.31473416090011597,
"learning_rate": 0.00017523719734667973,
"loss": 1.4439,
"mean_token_accuracy": 0.6536323636770248,
"num_tokens": 4506293.0,
"step": 1560
},
{
"entropy": 1.5847829729318619,
"epoch": 2.0310478654592496,
"grad_norm": 0.4749562740325928,
"learning_rate": 0.0001748911372206848,
"loss": 1.5723,
"mean_token_accuracy": 0.6196332320570945,
"num_tokens": 4535291.0,
"step": 1570
},
{
"entropy": 2.08716399371624,
"epoch": 2.0439844760672705,
"grad_norm": 0.6515533924102783,
"learning_rate": 0.00017454302254686486,
"loss": 2.0148,
"mean_token_accuracy": 0.5413075156509877,
"num_tokens": 4553239.0,
"step": 1580
},
{
"entropy": 2.5849849820137023,
"epoch": 2.056921086675291,
"grad_norm": 0.0,
"learning_rate": 0.00017419286287532516,
"loss": 0.7934,
"mean_token_accuracy": 0.19277514591813089,
"num_tokens": 4555288.0,
"step": 1590
},
{
"entropy": 2.378413477540016,
"epoch": 2.069857697283312,
"grad_norm": 0.28206267952919006,
"learning_rate": 0.00017384066781227307,
"loss": 0.9347,
"mean_token_accuracy": 0.1983368217945099,
"num_tokens": 4604552.0,
"step": 1600
},
{
"entropy": 1.5316940248012543,
"epoch": 2.0827943078913327,
"grad_norm": 0.33060652017593384,
"learning_rate": 0.0001734864470197544,
"loss": 1.5009,
"mean_token_accuracy": 0.6414364308118821,
"num_tokens": 4644766.0,
"step": 1610
},
{
"entropy": 1.6213007301092148,
"epoch": 2.095730918499353,
"grad_norm": 0.5015760064125061,
"learning_rate": 0.00017313021021538844,
"loss": 1.6038,
"mean_token_accuracy": 0.6168796703219414,
"num_tokens": 4673702.0,
"step": 1620
},
{
"entropy": 2.1126689702272414,
"epoch": 2.108667529107374,
"grad_norm": 0.7331583499908447,
"learning_rate": 0.0001727719671721013,
"loss": 2.0398,
"mean_token_accuracy": 0.533772025257349,
"num_tokens": 4691426.0,
"step": 1630
},
{
"entropy": 2.546648120880127,
"epoch": 2.1216041397153944,
"grad_norm": 0.0,
"learning_rate": 0.0001724117277178579,
"loss": 0.5647,
"mean_token_accuracy": 0.1764907084405422,
"num_tokens": 4693073.0,
"step": 1640
},
{
"entropy": 2.3147504776716232,
"epoch": 2.1345407503234153,
"grad_norm": 0.3223225474357605,
"learning_rate": 0.0001720495017353922,
"loss": 0.8825,
"mean_token_accuracy": 0.2041303940117359,
"num_tokens": 4745475.0,
"step": 1650
},
{
"epoch": 2.1345407503234153,
"eval_entropy": 1.9804211553446083,
"eval_loss": 1.4297912120819092,
"eval_mean_token_accuracy": 0.43714187025677326,
"eval_num_tokens": 4745475.0,
"eval_runtime": 240.4238,
"eval_samples_per_second": 22.86,
"eval_steps_per_second": 1.431,
"step": 1650
},
{
"entropy": 1.5305457144975663,
"epoch": 2.147477360931436,
"grad_norm": 0.35115179419517517,
"learning_rate": 0.00017168529916193614,
"loss": 1.521,
"mean_token_accuracy": 0.6396576210856437,
"num_tokens": 4786054.0,
"step": 1660
},
{
"entropy": 1.5795167148113252,
"epoch": 2.1604139715394566,
"grad_norm": 0.50258469581604,
"learning_rate": 0.00017131912998894717,
"loss": 1.5679,
"mean_token_accuracy": 0.6227076068520546,
"num_tokens": 4815157.0,
"step": 1670
},
{
"entropy": 2.0948879569768906,
"epoch": 2.1733505821474774,
"grad_norm": 0.7732148766517639,
"learning_rate": 0.0001709510042618339,
"loss": 2.0484,
"mean_token_accuracy": 0.539436261355877,
"num_tokens": 4833514.0,
"step": 1680
},
{
"entropy": 2.419444125890732,
"epoch": 2.186287192755498,
"grad_norm": 0.0,
"learning_rate": 0.00017058093207968067,
"loss": 0.6193,
"mean_token_accuracy": 0.19686403200030328,
"num_tokens": 4835320.0,
"step": 1690
},
{
"entropy": 2.0893223583698273,
"epoch": 2.1992238033635187,
"grad_norm": 0.3104536831378937,
"learning_rate": 0.0001702089235949705,
"loss": 0.8909,
"mean_token_accuracy": 0.20202562659978868,
"num_tokens": 4887586.0,
"step": 1700
},
{
"entropy": 1.493579125404358,
"epoch": 2.2121604139715396,
"grad_norm": 0.35835903882980347,
"learning_rate": 0.0001698349890133065,
"loss": 1.5107,
"mean_token_accuracy": 0.6415591448545456,
"num_tokens": 4928021.0,
"step": 1710
},
{
"entropy": 1.599936455488205,
"epoch": 2.22509702457956,
"grad_norm": 0.5604035258293152,
"learning_rate": 0.0001694591385931319,
"loss": 1.5589,
"mean_token_accuracy": 0.6183684885501861,
"num_tokens": 4956628.0,
"step": 1720
},
{
"entropy": 2.0975252121686934,
"epoch": 2.238033635187581,
"grad_norm": 0.7757624983787537,
"learning_rate": 0.00016908138264544874,
"loss": 2.0586,
"mean_token_accuracy": 0.537506015598774,
"num_tokens": 4973976.0,
"step": 1730
},
{
"entropy": 2.402835935354233,
"epoch": 2.2509702457956013,
"grad_norm": 0.0,
"learning_rate": 0.00016870173153353478,
"loss": 0.7325,
"mean_token_accuracy": 0.21586424633860588,
"num_tokens": 4975943.0,
"step": 1740
},
{
"entropy": 1.8081632763147355,
"epoch": 2.263906856403622,
"grad_norm": 0.29493167996406555,
"learning_rate": 0.0001683201956726593,
"loss": 0.8952,
"mean_token_accuracy": 0.20223823115229606,
"num_tokens": 5031202.0,
"step": 1750
},
{
"entropy": 1.5031701743602752,
"epoch": 2.276843467011643,
"grad_norm": 0.3834936320781708,
"learning_rate": 0.0001679367855297976,
"loss": 1.5076,
"mean_token_accuracy": 0.643890731036663,
"num_tokens": 5071593.0,
"step": 1760
},
{
"entropy": 1.6009941071271896,
"epoch": 2.2897800776196635,
"grad_norm": 0.5210739374160767,
"learning_rate": 0.0001675515116233434,
"loss": 1.5777,
"mean_token_accuracy": 0.6210601255297661,
"num_tokens": 5100741.0,
"step": 1770
},
{
"entropy": 2.032317912578583,
"epoch": 2.3027166882276844,
"grad_norm": 0.6077569723129272,
"learning_rate": 0.0001671643845228207,
"loss": 1.9718,
"mean_token_accuracy": 0.5442127160727978,
"num_tokens": 5120288.0,
"step": 1780
},
{
"entropy": 1.8405873313546182,
"epoch": 2.315653298835705,
"grad_norm": 0.0,
"learning_rate": 0.00016677541484859352,
"loss": 0.9106,
"mean_token_accuracy": 0.22827735766768456,
"num_tokens": 5122772.0,
"step": 1790
},
{
"entropy": 1.2277291655540465,
"epoch": 2.3285899094437257,
"grad_norm": 0.2893352806568146,
"learning_rate": 0.0001663846132715747,
"loss": 0.9194,
"mean_token_accuracy": 0.1989746630191803,
"num_tokens": 5178960.0,
"step": 1800
},
{
"epoch": 2.3285899094437257,
"eval_entropy": 1.3877028687748798,
"eval_loss": 1.4073032140731812,
"eval_mean_token_accuracy": 0.44012743035374685,
"eval_num_tokens": 5178960.0,
"eval_runtime": 243.8297,
"eval_samples_per_second": 22.54,
"eval_steps_per_second": 1.411,
"step": 1800
},
{
"entropy": 1.5012196868658065,
"epoch": 2.3415265200517466,
"grad_norm": 0.3776693344116211,
"learning_rate": 0.00016599199051293314,
"loss": 1.4982,
"mean_token_accuracy": 0.644976706802845,
"num_tokens": 5220342.0,
"step": 1810
},
{
"entropy": 1.6106306850910186,
"epoch": 2.354463130659767,
"grad_norm": 0.5475464463233948,
"learning_rate": 0.0001655975573437996,
"loss": 1.5526,
"mean_token_accuracy": 0.6244173154234887,
"num_tokens": 5249776.0,
"step": 1820
},
{
"entropy": 2.004978260397911,
"epoch": 2.367399741267788,
"grad_norm": 0.6898283958435059,
"learning_rate": 0.0001652013245849714,
"loss": 1.9472,
"mean_token_accuracy": 0.557063739746809,
"num_tokens": 5268417.0,
"step": 1830
},
{
"entropy": 2.327620804309845,
"epoch": 2.3803363518758087,
"grad_norm": 0.0,
"learning_rate": 0.00016480330310661523,
"loss": 0.7845,
"mean_token_accuracy": 0.20984074249863624,
"num_tokens": 5270607.0,
"step": 1840
},
{
"entropy": 2.509730467200279,
"epoch": 2.393272962483829,
"grad_norm": 0.30109038949012756,
"learning_rate": 0.00016440350382796929,
"loss": 0.9268,
"mean_token_accuracy": 0.19716072604060172,
"num_tokens": 5325120.0,
"step": 1850
},
{
"entropy": 1.480056384205818,
"epoch": 2.40620957309185,
"grad_norm": 0.36303573846817017,
"learning_rate": 0.00016400193771704354,
"loss": 1.4947,
"mean_token_accuracy": 0.6465561181306839,
"num_tokens": 5366273.0,
"step": 1860
},
{
"entropy": 1.5890043556690217,
"epoch": 2.4191461836998704,
"grad_norm": 0.5530393123626709,
"learning_rate": 0.00016359861579031884,
"loss": 1.5522,
"mean_token_accuracy": 0.6297082543373108,
"num_tokens": 5395726.0,
"step": 1870
},
{
"entropy": 2.038092666864395,
"epoch": 2.4320827943078913,
"grad_norm": 1.0535674095153809,
"learning_rate": 0.00016319354911244468,
"loss": 1.9806,
"mean_token_accuracy": 0.5464614436030388,
"num_tokens": 5414798.0,
"step": 1880
},
{
"entropy": 2.923152169585228,
"epoch": 2.445019404915912,
"grad_norm": 0.0,
"learning_rate": 0.00016278674879593582,
"loss": 0.7968,
"mean_token_accuracy": 0.2314663991332054,
"num_tokens": 5417197.0,
"step": 1890
},
{
"entropy": 2.655092605948448,
"epoch": 2.4579560155239326,
"grad_norm": 0.3218407928943634,
"learning_rate": 0.00016237822600086716,
"loss": 0.9259,
"mean_token_accuracy": 0.19839748442173005,
"num_tokens": 5470736.0,
"step": 1900
},
{
"entropy": 1.4376092582941056,
"epoch": 2.4708926261319535,
"grad_norm": 0.3781118094921112,
"learning_rate": 0.00016196799193456785,
"loss": 1.4415,
"mean_token_accuracy": 0.6578261837363243,
"num_tokens": 5511266.0,
"step": 1910
},
{
"entropy": 1.5665327340364457,
"epoch": 2.4838292367399744,
"grad_norm": 0.5386565327644348,
"learning_rate": 0.00016155605785131357,
"loss": 1.5497,
"mean_token_accuracy": 0.6252920791506767,
"num_tokens": 5541123.0,
"step": 1920
},
{
"entropy": 1.9834172219038009,
"epoch": 2.496765847347995,
"grad_norm": 0.6560537815093994,
"learning_rate": 0.00016114243505201795,
"loss": 1.9184,
"mean_token_accuracy": 0.555550941824913,
"num_tokens": 5561101.0,
"step": 1930
},
{
"entropy": 2.323999685049057,
"epoch": 2.5097024579560157,
"grad_norm": 0.0,
"learning_rate": 0.0001607271348839226,
"loss": 0.9326,
"mean_token_accuracy": 0.2633499436080456,
"num_tokens": 5564120.0,
"step": 1940
},
{
"entropy": 1.5099886417388917,
"epoch": 2.522639068564036,
"grad_norm": 0.39876788854599,
"learning_rate": 0.00016031016874028557,
"loss": 0.9269,
"mean_token_accuracy": 0.20084442123770713,
"num_tokens": 5613256.0,
"step": 1950
},
{
"epoch": 2.522639068564036,
"eval_entropy": 1.3481496193034703,
"eval_loss": 1.3939740657806396,
"eval_mean_token_accuracy": 0.44758816895096804,
"eval_num_tokens": 5613256.0,
"eval_runtime": 246.9294,
"eval_samples_per_second": 22.257,
"eval_steps_per_second": 1.393,
"step": 1950
},
{
"entropy": 1.4310665398836135,
"epoch": 2.535575679172057,
"grad_norm": 0.39710840582847595,
"learning_rate": 0.00015989154806006904,
"loss": 1.4336,
"mean_token_accuracy": 0.6602939382195473,
"num_tokens": 5653638.0,
"step": 1960
},
{
"entropy": 1.5728681892156602,
"epoch": 2.548512289780078,
"grad_norm": 0.5568864941596985,
"learning_rate": 0.00015947128432762536,
"loss": 1.5237,
"mean_token_accuracy": 0.627597238123417,
"num_tokens": 5683333.0,
"step": 1970
},
{
"entropy": 1.9994044303894043,
"epoch": 2.5614489003880982,
"grad_norm": 0.6420727968215942,
"learning_rate": 0.00015904938907238206,
"loss": 1.9615,
"mean_token_accuracy": 0.5487420856952667,
"num_tokens": 5702066.0,
"step": 1980
},
{
"entropy": 2.452130767703056,
"epoch": 2.574385510996119,
"grad_norm": 0.0,
"learning_rate": 0.00015862587386852541,
"loss": 0.7703,
"mean_token_accuracy": 0.2316281594336033,
"num_tokens": 5704289.0,
"step": 1990
},
{
"entropy": 2.385006046295166,
"epoch": 2.5873221216041395,
"grad_norm": 0.3110261857509613,
"learning_rate": 0.0001582007503346832,
"loss": 0.9186,
"mean_token_accuracy": 0.19861687943339348,
"num_tokens": 5760847.0,
"step": 2000
},
{
"entropy": 1.4642044007778168,
"epoch": 2.6002587322121604,
"grad_norm": 0.38485661149024963,
"learning_rate": 0.0001577740301336057,
"loss": 1.4756,
"mean_token_accuracy": 0.6492435604333877,
"num_tokens": 5802455.0,
"step": 2010
},
{
"entropy": 1.5432655066251755,
"epoch": 2.6131953428201813,
"grad_norm": 0.6033521294593811,
"learning_rate": 0.00015734572497184577,
"loss": 1.5119,
"mean_token_accuracy": 0.6332074150443077,
"num_tokens": 5831848.0,
"step": 2020
},
{
"entropy": 2.0233444392681124,
"epoch": 2.6261319534282017,
"grad_norm": 0.7502851486206055,
"learning_rate": 0.00015691584659943786,
"loss": 1.9476,
"mean_token_accuracy": 0.5473973207175732,
"num_tokens": 5850975.0,
"step": 2030
},
{
"entropy": 2.2630896627902986,
"epoch": 2.6390685640362226,
"grad_norm": 0.0,
"learning_rate": 0.0001564844068095755,
"loss": 0.8525,
"mean_token_accuracy": 0.23688365146517754,
"num_tokens": 5853548.0,
"step": 2040
},
{
"entropy": 1.6931863486766816,
"epoch": 2.652005174644243,
"grad_norm": 0.3148477077484131,
"learning_rate": 0.0001560514174382878,
"loss": 0.8972,
"mean_token_accuracy": 0.20218148753046988,
"num_tokens": 5907614.0,
"step": 2050
},
{
"entropy": 1.4166515529155732,
"epoch": 2.664941785252264,
"grad_norm": 0.38905423879623413,
"learning_rate": 0.0001556168903641148,
"loss": 1.4368,
"mean_token_accuracy": 0.6563202187418937,
"num_tokens": 5947663.0,
"step": 2060
},
{
"entropy": 1.5506242126226426,
"epoch": 2.6778783958602848,
"grad_norm": 0.5905367136001587,
"learning_rate": 0.00015518083750778157,
"loss": 1.5309,
"mean_token_accuracy": 0.6258940026164055,
"num_tokens": 5976765.0,
"step": 2070
},
{
"entropy": 1.9377893030643463,
"epoch": 2.690815006468305,
"grad_norm": 0.6645969152450562,
"learning_rate": 0.00015474327083187105,
"loss": 1.9022,
"mean_token_accuracy": 0.5610988035798072,
"num_tokens": 5996303.0,
"step": 2080
},
{
"entropy": 2.6364343762397766,
"epoch": 2.703751617076326,
"grad_norm": 0.0,
"learning_rate": 0.00015430420234049624,
"loss": 1.038,
"mean_token_accuracy": 0.2556902192533016,
"num_tokens": 5999434.0,
"step": 2090
},
{
"entropy": 2.8424737572669985,
"epoch": 2.7166882276843465,
"grad_norm": 0.3264569938182831,
"learning_rate": 0.00015386364407897035,
"loss": 0.9078,
"mean_token_accuracy": 0.20131859928369522,
"num_tokens": 6051774.0,
"step": 2100
},
{
"epoch": 2.7166882276843465,
"eval_entropy": 2.204050070671148,
"eval_loss": 1.3715640306472778,
"eval_mean_token_accuracy": 0.4440248931736447,
"eval_num_tokens": 6051774.0,
"eval_runtime": 244.556,
"eval_samples_per_second": 22.473,
"eval_steps_per_second": 1.407,
"step": 2100
},
{
"entropy": 1.4316389322280885,
"epoch": 2.7296248382923674,
"grad_norm": 0.3802427053451538,
"learning_rate": 0.00015342160813347676,
"loss": 1.4553,
"mean_token_accuracy": 0.6519668206572533,
"num_tokens": 6091750.0,
"step": 2110
},
{
"entropy": 1.5787472486495973,
"epoch": 2.742561448900388,
"grad_norm": 0.5799654126167297,
"learning_rate": 0.00015297810663073743,
"loss": 1.5507,
"mean_token_accuracy": 0.6268433704972267,
"num_tokens": 6120790.0,
"step": 2120
},
{
"entropy": 1.9796525478363036,
"epoch": 2.7554980595084086,
"grad_norm": 0.7903239727020264,
"learning_rate": 0.00015253315173767993,
"loss": 1.9383,
"mean_token_accuracy": 0.5536467991769314,
"num_tokens": 6139010.0,
"step": 2130
},
{
"entropy": 2.6805751383304597,
"epoch": 2.7684346701164295,
"grad_norm": 0.0,
"learning_rate": 0.00015208675566110387,
"loss": 0.7659,
"mean_token_accuracy": 0.21504319161176683,
"num_tokens": 6141159.0,
"step": 2140
},
{
"entropy": 2.1302292913198473,
"epoch": 2.78137128072445,
"grad_norm": 0.3743366003036499,
"learning_rate": 0.0001516389306473461,
"loss": 0.8888,
"mean_token_accuracy": 0.20484731644392012,
"num_tokens": 6191053.0,
"step": 2150
},
{
"entropy": 1.4483990609645843,
"epoch": 2.794307891332471,
"grad_norm": 0.3969733417034149,
"learning_rate": 0.00015118968898194458,
"loss": 1.443,
"mean_token_accuracy": 0.6526175752282143,
"num_tokens": 6230521.0,
"step": 2160
},
{
"entropy": 1.582485669851303,
"epoch": 2.8072445019404917,
"grad_norm": 0.6144042611122131,
"learning_rate": 0.00015073904298930132,
"loss": 1.5429,
"mean_token_accuracy": 0.6261137276887894,
"num_tokens": 6259286.0,
"step": 2170
},
{
"entropy": 1.970637395977974,
"epoch": 2.8201811125485126,
"grad_norm": 0.7516705393791199,
"learning_rate": 0.00015028700503234447,
"loss": 1.9348,
"mean_token_accuracy": 0.5558973327279091,
"num_tokens": 6277729.0,
"step": 2180
},
{
"entropy": 2.001736190915108,
"epoch": 2.833117723156533,
"grad_norm": 0.0,
"learning_rate": 0.00014983358751218892,
"loss": 0.736,
"mean_token_accuracy": 0.19615912958979606,
"num_tokens": 6279643.0,
"step": 2190
},
{
"entropy": 1.9369044452905655,
"epoch": 2.8460543337645534,
"grad_norm": 0.32840585708618164,
"learning_rate": 0.00014937880286779629,
"loss": 0.9147,
"mean_token_accuracy": 0.19959167763590813,
"num_tokens": 6336300.0,
"step": 2200
},
{
"entropy": 1.4088002383708953,
"epoch": 2.8589909443725743,
"grad_norm": 0.4119824767112732,
"learning_rate": 0.00014892266357563358,
"loss": 1.4187,
"mean_token_accuracy": 0.6627781435847282,
"num_tokens": 6375995.0,
"step": 2210
},
{
"entropy": 1.6024494558572768,
"epoch": 2.871927554980595,
"grad_norm": 0.5892689228057861,
"learning_rate": 0.0001484651821493309,
"loss": 1.5693,
"mean_token_accuracy": 0.6204348549246788,
"num_tokens": 6404526.0,
"step": 2220
},
{
"entropy": 2.072836604714394,
"epoch": 2.884864165588616,
"grad_norm": 0.7402485013008118,
"learning_rate": 0.0001480063711393382,
"loss": 2.0136,
"mean_token_accuracy": 0.5476931251585484,
"num_tokens": 6421889.0,
"step": 2230
},
{
"entropy": 1.5923803925514222,
"epoch": 2.8978007761966365,
"grad_norm": 0.0,
"learning_rate": 0.00014754624313258102,
"loss": 0.6735,
"mean_token_accuracy": 0.20976952239871025,
"num_tokens": 6423681.0,
"step": 2240
},
{
"entropy": 1.2221377216279508,
"epoch": 2.9107373868046573,
"grad_norm": 0.3352583050727844,
"learning_rate": 0.00014708481075211498,
"loss": 0.9037,
"mean_token_accuracy": 0.20100481137633325,
"num_tokens": 6474539.0,
"step": 2250
},
{
"epoch": 2.9107373868046573,
"eval_entropy": 1.358256766096104,
"eval_loss": 1.3591663837432861,
"eval_mean_token_accuracy": 0.45166019766136656,
"eval_num_tokens": 6474539.0,
"eval_runtime": 241.3389,
"eval_samples_per_second": 22.773,
"eval_steps_per_second": 1.425,
"step": 2250
},
{
"entropy": 1.3933149039745332,
"epoch": 2.9236739974126777,
"grad_norm": 0.4007508456707001,
"learning_rate": 0.00014662208665677966,
"loss": 1.4101,
"mean_token_accuracy": 0.6611413463950158,
"num_tokens": 6514494.0,
"step": 2260
},
{
"entropy": 1.5439734548330306,
"epoch": 2.9366106080206986,
"grad_norm": 0.5625568628311157,
"learning_rate": 0.0001461580835408513,
"loss": 1.4993,
"mean_token_accuracy": 0.6339735224843025,
"num_tokens": 6543746.0,
"step": 2270
},
{
"entropy": 1.982978528738022,
"epoch": 2.9495472186287195,
"grad_norm": 0.7641308307647705,
"learning_rate": 0.00014569281413369462,
"loss": 1.9328,
"mean_token_accuracy": 0.5539643183350563,
"num_tokens": 6562759.0,
"step": 2280
},
{
"entropy": 1.5298347800970078,
"epoch": 2.96248382923674,
"grad_norm": 0.0,
"learning_rate": 0.00014522629119941333,
"loss": 0.766,
"mean_token_accuracy": 0.21878809183835984,
"num_tokens": 6564974.0,
"step": 2290
},
{
"entropy": 1.4145286485552788,
"epoch": 2.975420439844761,
"grad_norm": 0.4561901092529297,
"learning_rate": 0.00014475852753650023,
"loss": 0.7577,
"mean_token_accuracy": 0.22906568124890328,
"num_tokens": 6598409.0,
"step": 2300
},
{
"entropy": 1.5782025367021562,
"epoch": 2.988357050452781,
"grad_norm": 0.5903820991516113,
"learning_rate": 0.000144289535977486,
"loss": 1.554,
"mean_token_accuracy": 0.6246525257825851,
"num_tokens": 6627531.0,
"step": 2310
},
{
"entropy": 1.9433803856372833,
"epoch": 3.001293661060802,
"grad_norm": 0.13881655037403107,
"learning_rate": 0.00014381932938858718,
"loss": 0.9444,
"mean_token_accuracy": 0.22419775873422623,
"num_tokens": 6660338.0,
"step": 2320
},
{
"entropy": 1.621496966481209,
"epoch": 3.014230271668823,
"grad_norm": 0.42520761489868164,
"learning_rate": 0.0001433479206693532,
"loss": 1.6127,
"mean_token_accuracy": 0.6233608849346638,
"num_tokens": 6713107.0,
"step": 2330
},
{
"entropy": 1.338898405432701,
"epoch": 3.0271668822768434,
"grad_norm": 0.6367995738983154,
"learning_rate": 0.0001428753227523124,
"loss": 1.3191,
"mean_token_accuracy": 0.67000552713871,
"num_tokens": 6744799.0,
"step": 2340
},
{
"entropy": 1.590729820728302,
"epoch": 3.0401034928848643,
"grad_norm": 0.6899548172950745,
"learning_rate": 0.0001424015486026174,
"loss": 1.5648,
"mean_token_accuracy": 0.618783813714981,
"num_tokens": 6766726.0,
"step": 2350
},
{
"entropy": 1.977810901403427,
"epoch": 3.0530401034928847,
"grad_norm": 0.0,
"learning_rate": 0.00014192661121768932,
"loss": 1.3483,
"mean_token_accuracy": 0.3756748877465725,
"num_tokens": 6772184.0,
"step": 2360
},
{
"entropy": 1.1425089821219445,
"epoch": 3.0659767141009056,
"grad_norm": 0.1791164129972458,
"learning_rate": 0.0001414505236268613,
"loss": 0.2221,
"mean_token_accuracy": 0.05023420602083206,
"num_tokens": 6801985.0,
"step": 2370
},
{
"entropy": 1.534485575556755,
"epoch": 3.0789133247089264,
"grad_norm": 0.4513719975948334,
"learning_rate": 0.00014097329889102084,
"loss": 1.6302,
"mean_token_accuracy": 0.6191562682390213,
"num_tokens": 6853863.0,
"step": 2380
},
{
"entropy": 1.3535702049732208,
"epoch": 3.091849935316947,
"grad_norm": 0.6277197599411011,
"learning_rate": 0.00014049495010225174,
"loss": 1.2826,
"mean_token_accuracy": 0.6846122413873672,
"num_tokens": 6885860.0,
"step": 2390
},
{
"entropy": 1.611542597413063,
"epoch": 3.1047865459249677,
"grad_norm": 0.6629586219787598,
"learning_rate": 0.00014001549038347488,
"loss": 1.5841,
"mean_token_accuracy": 0.6110770747065544,
"num_tokens": 6907549.0,
"step": 2400
},
{
"epoch": 3.1047865459249677,
"eval_entropy": 1.4435141939063405,
"eval_loss": 1.3480572700500488,
"eval_mean_token_accuracy": 0.45482284610354623,
"eval_num_tokens": 6907549.0,
"eval_runtime": 243.0256,
"eval_samples_per_second": 22.615,
"eval_steps_per_second": 1.415,
"step": 2400
},
{
"entropy": 2.002578613162041,
"epoch": 3.117723156532988,
"grad_norm": 0.0,
"learning_rate": 0.00013953493288808804,
"loss": 1.2204,
"mean_token_accuracy": 0.3793766848742962,
"num_tokens": 6912238.0,
"step": 2410
},
{
"entropy": 1.580290713906288,
"epoch": 3.130659767141009,
"grad_norm": 0.17965653538703918,
"learning_rate": 0.00013905329079960522,
"loss": 0.2405,
"mean_token_accuracy": 0.04845013022422791,
"num_tokens": 6941537.0,
"step": 2420
},
{
"entropy": 1.4815610826015473,
"epoch": 3.14359637774903,
"grad_norm": 0.46858540177345276,
"learning_rate": 0.00013857057733129494,
"loss": 1.5548,
"mean_token_accuracy": 0.6307360790669918,
"num_tokens": 6994352.0,
"step": 2430
},
{
"entropy": 1.3407190799713136,
"epoch": 3.1565329883570503,
"grad_norm": 0.6128517389297485,
"learning_rate": 0.00013808680572581776,
"loss": 1.2793,
"mean_token_accuracy": 0.6835518077015876,
"num_tokens": 7026544.0,
"step": 2440
},
{
"entropy": 1.6429592788219451,
"epoch": 3.169469598965071,
"grad_norm": 0.7309837937355042,
"learning_rate": 0.0001376019892548629,
"loss": 1.6028,
"mean_token_accuracy": 0.6109883636236191,
"num_tokens": 7049229.0,
"step": 2450
},
{
"entropy": 2.1930068999528887,
"epoch": 3.1824062095730916,
"grad_norm": 0.0,
"learning_rate": 0.00013711614121878423,
"loss": 1.3452,
"mean_token_accuracy": 0.4032416954636574,
"num_tokens": 7055638.0,
"step": 2460
},
{
"entropy": 2.582664442062378,
"epoch": 3.1953428201811125,
"grad_norm": 0.17951107025146484,
"learning_rate": 0.00013662927494623528,
"loss": 0.238,
"mean_token_accuracy": 0.0486849807202816,
"num_tokens": 7079933.0,
"step": 2470
},
{
"entropy": 1.4514012217521668,
"epoch": 3.2082794307891334,
"grad_norm": 0.48690128326416016,
"learning_rate": 0.00013614140379380384,
"loss": 1.5635,
"mean_token_accuracy": 0.6299719527363777,
"num_tokens": 7130984.0,
"step": 2480
},
{
"entropy": 1.3963081300258637,
"epoch": 3.221216041397154,
"grad_norm": 0.5850987434387207,
"learning_rate": 0.00013565254114564522,
"loss": 1.3093,
"mean_token_accuracy": 0.6751079827547073,
"num_tokens": 7162961.0,
"step": 2490
},
{
"entropy": 1.6287110567092895,
"epoch": 3.2341526520051747,
"grad_norm": 0.7363412976264954,
"learning_rate": 0.00013516270041311523,
"loss": 1.6109,
"mean_token_accuracy": 0.6086324542760849,
"num_tokens": 7185148.0,
"step": 2500
},
{
"entropy": 2.588909697532654,
"epoch": 3.2470892626131955,
"grad_norm": 0.0,
"learning_rate": 0.0001346718950344023,
"loss": 1.3295,
"mean_token_accuracy": 0.36438525542616845,
"num_tokens": 7190578.0,
"step": 2510
},
{
"entropy": 2.170939177274704,
"epoch": 3.260025873221216,
"grad_norm": 0.16089969873428345,
"learning_rate": 0.00013418013847415875,
"loss": 0.2333,
"mean_token_accuracy": 0.04912624955177307,
"num_tokens": 7223083.0,
"step": 2520
},
{
"entropy": 1.5124918982386588,
"epoch": 3.272962483829237,
"grad_norm": 0.48449796438217163,
"learning_rate": 0.00013368744422313135,
"loss": 1.5844,
"mean_token_accuracy": 0.6292549699544907,
"num_tokens": 7278262.0,
"step": 2530
},
{
"entropy": 1.3001452058553695,
"epoch": 3.2858990944372573,
"grad_norm": 0.6388899087905884,
"learning_rate": 0.00013319382579779143,
"loss": 1.2473,
"mean_token_accuracy": 0.686492520570755,
"num_tokens": 7310633.0,
"step": 2540
},
{
"entropy": 1.588513082265854,
"epoch": 3.298835705045278,
"grad_norm": 0.7601234316825867,
"learning_rate": 0.00013269929673996372,
"loss": 1.5813,
"mean_token_accuracy": 0.6151460394263267,
"num_tokens": 7333877.0,
"step": 2550
},
{
"epoch": 3.298835705045278,
"eval_entropy": 1.50408104668523,
"eval_loss": 1.3354183435440063,
"eval_mean_token_accuracy": 0.4569617995862351,
"eval_num_tokens": 7333877.0,
"eval_runtime": 242.7951,
"eval_samples_per_second": 22.636,
"eval_steps_per_second": 1.417,
"step": 2550
},
{
"entropy": 1.8434918358922006,
"epoch": 3.311772315653299,
"grad_norm": 0.0,
"learning_rate": 0.00013220387061645518,
"loss": 1.2378,
"mean_token_accuracy": 0.3966076374053955,
"num_tokens": 7340126.0,
"step": 2560
},
{
"entropy": 2.0701662808656693,
"epoch": 3.3247089262613194,
"grad_norm": 0.1653972566127777,
"learning_rate": 0.00013170756101868274,
"loss": 0.2363,
"mean_token_accuracy": 0.04905220568180084,
"num_tokens": 7368440.0,
"step": 2570
},
{
"entropy": 1.521276581287384,
"epoch": 3.3376455368693403,
"grad_norm": 0.5110422372817993,
"learning_rate": 0.00013121038156230021,
"loss": 1.6069,
"mean_token_accuracy": 0.6247900031507015,
"num_tokens": 7422449.0,
"step": 2580
},
{
"entropy": 1.3473992764949798,
"epoch": 3.350582147477361,
"grad_norm": 0.5985650420188904,
"learning_rate": 0.00013071234588682507,
"loss": 1.2818,
"mean_token_accuracy": 0.6814156129956246,
"num_tokens": 7455078.0,
"step": 2590
},
{
"entropy": 1.5794302642345428,
"epoch": 3.3635187580853816,
"grad_norm": 0.7455780506134033,
"learning_rate": 0.00013021346765526405,
"loss": 1.5565,
"mean_token_accuracy": 0.6210769057273865,
"num_tokens": 7478151.0,
"step": 2600
},
{
"entropy": 2.400119936466217,
"epoch": 3.3764553686934025,
"grad_norm": 0.0,
"learning_rate": 0.00012971376055373842,
"loss": 1.3398,
"mean_token_accuracy": 0.3794242724776268,
"num_tokens": 7483907.0,
"step": 2610
},
{
"entropy": 2.360330358147621,
"epoch": 3.389391979301423,
"grad_norm": 0.16837802529335022,
"learning_rate": 0.0001292132382911085,
"loss": 0.231,
"mean_token_accuracy": 0.04970394000411034,
"num_tokens": 7511728.0,
"step": 2620
},
{
"entropy": 1.5115429222583772,
"epoch": 3.4023285899094438,
"grad_norm": 0.5140193700790405,
"learning_rate": 0.00012871191459859754,
"loss": 1.5844,
"mean_token_accuracy": 0.626202804595232,
"num_tokens": 7564367.0,
"step": 2630
},
{
"entropy": 1.33871136456728,
"epoch": 3.4152652005174646,
"grad_norm": 0.5856406092643738,
"learning_rate": 0.00012820980322941506,
"loss": 1.2772,
"mean_token_accuracy": 0.6828064471483231,
"num_tokens": 7596458.0,
"step": 2640
},
{
"entropy": 1.5606994718313216,
"epoch": 3.428201811125485,
"grad_norm": 0.7913902401924133,
"learning_rate": 0.00012770691795837956,
"loss": 1.5388,
"mean_token_accuracy": 0.6267461031675339,
"num_tokens": 7618937.0,
"step": 2650
},
{
"entropy": 2.3131509482860566,
"epoch": 3.441138421733506,
"grad_norm": 0.0,
"learning_rate": 0.00012720327258154059,
"loss": 1.3789,
"mean_token_accuracy": 0.39152705743908883,
"num_tokens": 7624946.0,
"step": 2660
},
{
"entropy": 2.270913216471672,
"epoch": 3.4540750323415264,
"grad_norm": 0.1674034297466278,
"learning_rate": 0.00012669888091580033,
"loss": 0.2283,
"mean_token_accuracy": 0.05011768788099289,
"num_tokens": 7655621.0,
"step": 2670
},
{
"entropy": 1.5039668411016465,
"epoch": 3.4670116429495472,
"grad_norm": 0.5039061307907104,
"learning_rate": 0.00012619375679853435,
"loss": 1.5889,
"mean_token_accuracy": 0.6255090057849884,
"num_tokens": 7706496.0,
"step": 2680
},
{
"entropy": 1.299958510696888,
"epoch": 3.479948253557568,
"grad_norm": 0.6249063611030579,
"learning_rate": 0.0001256879140872123,
"loss": 1.2262,
"mean_token_accuracy": 0.6930169105529785,
"num_tokens": 7738457.0,
"step": 2690
},
{
"entropy": 1.5891169756650925,
"epoch": 3.4928848641655885,
"grad_norm": 0.7654421925544739,
"learning_rate": 0.00012518136665901755,
"loss": 1.5485,
"mean_token_accuracy": 0.6236635655164718,
"num_tokens": 7760759.0,
"step": 2700
},
{
"epoch": 3.4928848641655885,
"eval_entropy": 1.7460197186054185,
"eval_loss": 1.3263978958129883,
"eval_mean_token_accuracy": 0.45740372557626213,
"eval_num_tokens": 7760759.0,
"eval_runtime": 244.9238,
"eval_samples_per_second": 22.44,
"eval_steps_per_second": 1.405,
"step": 2700
},
{
"entropy": 2.4236282050609588,
"epoch": 3.5058214747736094,
"grad_norm": 0.0,
"learning_rate": 0.00012467412841046644,
"loss": 1.3685,
"mean_token_accuracy": 0.38023146614432335,
"num_tokens": 7766609.0,
"step": 2710
},
{
"entropy": 2.481502190232277,
"epoch": 3.51875808538163,
"grad_norm": 0.18167299032211304,
"learning_rate": 0.00012416621325702723,
"loss": 0.2353,
"mean_token_accuracy": 0.049381527304649356,
"num_tokens": 7796963.0,
"step": 2720
},
{
"entropy": 1.526540043950081,
"epoch": 3.5316946959896507,
"grad_norm": 0.5063906908035278,
"learning_rate": 0.00012365763513273826,
"loss": 1.6301,
"mean_token_accuracy": 0.6226166233420372,
"num_tokens": 7851436.0,
"step": 2730
},
{
"entropy": 1.3451905250549316,
"epoch": 3.5446313065976716,
"grad_norm": 0.591876208782196,
"learning_rate": 0.0001231484079898255,
"loss": 1.2804,
"mean_token_accuracy": 0.6807183653116227,
"num_tokens": 7883623.0,
"step": 2740
},
{
"entropy": 1.6224838614463806,
"epoch": 3.557567917205692,
"grad_norm": 0.8054526448249817,
"learning_rate": 0.00012263854579832022,
"loss": 1.5855,
"mean_token_accuracy": 0.6138912171125412,
"num_tokens": 7906065.0,
"step": 2750
},
{
"entropy": 2.2193833112716677,
"epoch": 3.570504527813713,
"grad_norm": 0.0,
"learning_rate": 0.00012212806254567526,
"loss": 1.3055,
"mean_token_accuracy": 0.388429357111454,
"num_tokens": 7911950.0,
"step": 2760
},
{
"entropy": 1.9380589336156846,
"epoch": 3.5834411384217333,
"grad_norm": 0.15811856091022491,
"learning_rate": 0.00012161697223638162,
"loss": 0.2486,
"mean_token_accuracy": 0.048336771130561826,
"num_tokens": 7944772.0,
"step": 2770
},
{
"entropy": 1.5291394203901292,
"epoch": 3.596377749029754,
"grad_norm": 0.5478163361549377,
"learning_rate": 0.00012110528889158421,
"loss": 1.6201,
"mean_token_accuracy": 0.6210859633982182,
"num_tokens": 7998744.0,
"step": 2780
},
{
"entropy": 1.3308267042040824,
"epoch": 3.609314359637775,
"grad_norm": 0.6494978070259094,
"learning_rate": 0.00012059302654869707,
"loss": 1.2747,
"mean_token_accuracy": 0.6828291460871696,
"num_tokens": 8030628.0,
"step": 2790
},
{
"entropy": 1.6048484414815902,
"epoch": 3.6222509702457955,
"grad_norm": 0.8232805132865906,
"learning_rate": 0.00012008019926101837,
"loss": 1.5858,
"mean_token_accuracy": 0.614265987277031,
"num_tokens": 8052959.0,
"step": 2800
},
{
"entropy": 2.457938811182976,
"epoch": 3.6351875808538163,
"grad_norm": 0.0,
"learning_rate": 0.00011956682109734485,
"loss": 1.3734,
"mean_token_accuracy": 0.37425210550427435,
"num_tokens": 8058605.0,
"step": 2810
},
{
"entropy": 2.780105286836624,
"epoch": 3.6481241914618368,
"grad_norm": 0.15952081978321075,
"learning_rate": 0.0001190529061415859,
"loss": 0.2238,
"mean_token_accuracy": 0.0499541737139225,
"num_tokens": 8088439.0,
"step": 2820
},
{
"entropy": 1.4993865296244622,
"epoch": 3.6610608020698576,
"grad_norm": 0.4854850769042969,
"learning_rate": 0.0001185384684923772,
"loss": 1.5841,
"mean_token_accuracy": 0.6286533363163471,
"num_tokens": 8140599.0,
"step": 2830
},
{
"entropy": 1.3472731560468674,
"epoch": 3.6739974126778785,
"grad_norm": 0.6306962966918945,
"learning_rate": 0.00011802352226269375,
"loss": 1.292,
"mean_token_accuracy": 0.6775945991277694,
"num_tokens": 8172688.0,
"step": 2840
},
{
"entropy": 1.5441134572029114,
"epoch": 3.6869340232858994,
"grad_norm": 0.8373256325721741,
"learning_rate": 0.00011750808157946291,
"loss": 1.5236,
"mean_token_accuracy": 0.6226452678442002,
"num_tokens": 8195667.0,
"step": 2850
},
{
"epoch": 3.6869340232858994,
"eval_entropy": 2.019692697324032,
"eval_loss": 1.3088935613632202,
"eval_mean_token_accuracy": 0.45852816875937374,
"eval_num_tokens": 8195667.0,
"eval_runtime": 247.8075,
"eval_samples_per_second": 22.179,
"eval_steps_per_second": 1.388,
"step": 2850
},
{
"entropy": 2.331311251223087,
"epoch": 3.69987063389392,
"grad_norm": 0.0,
"learning_rate": 0.00011699216058317686,
"loss": 1.4345,
"mean_token_accuracy": 0.42385049238801004,
"num_tokens": 8202061.0,
"step": 2860
},
{
"entropy": 1.3996504232287408,
"epoch": 3.71280724450194,
"grad_norm": 0.16637884080410004,
"learning_rate": 0.00011647577342750447,
"loss": 0.232,
"mean_token_accuracy": 0.05035848617553711,
"num_tokens": 8229320.0,
"step": 2870
},
{
"entropy": 1.5440905675292016,
"epoch": 3.725743855109961,
"grad_norm": 0.5046349763870239,
"learning_rate": 0.00011595893427890316,
"loss": 1.6135,
"mean_token_accuracy": 0.6227852456271649,
"num_tokens": 8282159.0,
"step": 2880
},
{
"entropy": 1.313097244501114,
"epoch": 3.738680465717982,
"grad_norm": 0.6280332803726196,
"learning_rate": 0.00011544165731623029,
"loss": 1.283,
"mean_token_accuracy": 0.6847794458270073,
"num_tokens": 8314583.0,
"step": 2890
},
{
"entropy": 1.5734279870986938,
"epoch": 3.751617076326003,
"grad_norm": 0.8147013187408447,
"learning_rate": 0.00011492395673035401,
"loss": 1.5372,
"mean_token_accuracy": 0.6240187495946884,
"num_tokens": 8337156.0,
"step": 2900
},
{
"entropy": 1.903187246620655,
"epoch": 3.7645536869340233,
"grad_norm": 0.0,
"learning_rate": 0.00011440584672376418,
"loss": 1.3835,
"mean_token_accuracy": 0.3674991957843304,
"num_tokens": 8343309.0,
"step": 2910
},
{
"entropy": 1.1613501474261283,
"epoch": 3.777490297542044,
"grad_norm": 0.16990479826927185,
"learning_rate": 0.00011388734151018252,
"loss": 0.2192,
"mean_token_accuracy": 0.050329743325710295,
"num_tokens": 8374198.0,
"step": 2920
},
{
"entropy": 1.5224060222506524,
"epoch": 3.7904269081500646,
"grad_norm": 0.5338153839111328,
"learning_rate": 0.00011336845531417286,
"loss": 1.6167,
"mean_token_accuracy": 0.6217537559568882,
"num_tokens": 8426906.0,
"step": 2930
},
{
"entropy": 1.3422169074416161,
"epoch": 3.8033635187580854,
"grad_norm": 0.6484615802764893,
"learning_rate": 0.00011284920237075076,
"loss": 1.2771,
"mean_token_accuracy": 0.6828199326992035,
"num_tokens": 8458929.0,
"step": 2940
},
{
"entropy": 1.5778010010719299,
"epoch": 3.8163001293661063,
"grad_norm": 0.8282558917999268,
"learning_rate": 0.00011232959692499308,
"loss": 1.5224,
"mean_token_accuracy": 0.6264667376875878,
"num_tokens": 8481613.0,
"step": 2950
},
{
"entropy": 2.231258991360664,
"epoch": 3.8292367399741267,
"grad_norm": 0.0,
"learning_rate": 0.00011180965323164719,
"loss": 1.3715,
"mean_token_accuracy": 0.4014947086572647,
"num_tokens": 8487887.0,
"step": 2960
},
{
"entropy": 2.2951877444982527,
"epoch": 3.8421733505821476,
"grad_norm": 0.16264809668064117,
"learning_rate": 0.00011128938555473976,
"loss": 0.242,
"mean_token_accuracy": 0.04751046672463417,
"num_tokens": 8522204.0,
"step": 2970
},
{
"entropy": 1.505036623775959,
"epoch": 3.855109961190168,
"grad_norm": 0.5537543892860413,
"learning_rate": 0.00011076880816718569,
"loss": 1.5994,
"mean_token_accuracy": 0.6235061697661877,
"num_tokens": 8576399.0,
"step": 2980
},
{
"entropy": 1.306050930917263,
"epoch": 3.868046571798189,
"grad_norm": 0.6618802547454834,
"learning_rate": 0.00011024793535039634,
"loss": 1.2665,
"mean_token_accuracy": 0.6823444902896881,
"num_tokens": 8607791.0,
"step": 2990
},
{
"entropy": 1.5978755921125412,
"epoch": 3.8809831824062098,
"grad_norm": 0.756771445274353,
"learning_rate": 0.00010972678139388784,
"loss": 1.5231,
"mean_token_accuracy": 0.6199123159050941,
"num_tokens": 8629942.0,
"step": 3000
},
{
"epoch": 3.8809831824062098,
"eval_entropy": 1.7341382033949675,
"eval_loss": 1.2953605651855469,
"eval_mean_token_accuracy": 0.4613482361269552,
"eval_num_tokens": 8629942.0,
"eval_runtime": 243.363,
"eval_samples_per_second": 22.584,
"eval_steps_per_second": 1.414,
"step": 3000
},
{
"entropy": 1.9563438802957536,
"epoch": 3.89391979301423,
"grad_norm": 0.0,
"learning_rate": 0.00010920536059488904,
"loss": 1.2245,
"mean_token_accuracy": 0.35897522792220116,
"num_tokens": 8635069.0,
"step": 3010
},
{
"entropy": 0.9117880932986736,
"epoch": 3.906856403622251,
"grad_norm": 0.16995865106582642,
"learning_rate": 0.00010868368725794928,
"loss": 0.2219,
"mean_token_accuracy": 0.050884007662534717,
"num_tokens": 8661156.0,
"step": 3020
},
{
"entropy": 1.5383384585380555,
"epoch": 3.9197930142302715,
"grad_norm": 0.5345892310142517,
"learning_rate": 0.000108161775694546,
"loss": 1.6123,
"mean_token_accuracy": 0.6229903392493725,
"num_tokens": 8713506.0,
"step": 3030
},
{
"entropy": 1.2795201033353805,
"epoch": 3.9327296248382924,
"grad_norm": 0.682775616645813,
"learning_rate": 0.00010763964022269213,
"loss": 1.2389,
"mean_token_accuracy": 0.6921025589108467,
"num_tokens": 8745762.0,
"step": 3040
},
{
"entropy": 1.585690438747406,
"epoch": 3.9456662354463132,
"grad_norm": 0.7901929616928101,
"learning_rate": 0.00010711729516654311,
"loss": 1.5575,
"mean_token_accuracy": 0.6214944392442703,
"num_tokens": 8768560.0,
"step": 3050
},
{
"entropy": 2.0845181226730345,
"epoch": 3.9586028460543337,
"grad_norm": 0.0,
"learning_rate": 0.00010659475485600423,
"loss": 1.4895,
"mean_token_accuracy": 0.39826231375336646,
"num_tokens": 8775063.0,
"step": 3060
},
{
"entropy": 2.2135625928640366,
"epoch": 3.9715394566623545,
"grad_norm": 0.212826207280159,
"learning_rate": 0.00010607203362633728,
"loss": 0.2226,
"mean_token_accuracy": 0.051099646091461184,
"num_tokens": 8793192.0,
"step": 3070
},
{
"entropy": 1.4032258987426758,
"epoch": 3.984476067270375,
"grad_norm": 0.6924927830696106,
"learning_rate": 0.00010554914581776738,
"loss": 1.4474,
"mean_token_accuracy": 0.6517833903431892,
"num_tokens": 8831113.0,
"step": 3080
},
{
"entropy": 2.1208325177431107,
"epoch": 3.997412677878396,
"grad_norm": 0.0,
"learning_rate": 0.00010502610577508949,
"loss": 1.1819,
"mean_token_accuracy": 0.38025794699788096,
"num_tokens": 8840822.0,
"step": 3090
},
{
"entropy": 1.8567550331354141,
"epoch": 4.010349288486417,
"grad_norm": 0.5068947076797485,
"learning_rate": 0.00010450292784727496,
"loss": 1.3687,
"mean_token_accuracy": 0.48387093394994735,
"num_tokens": 8907582.0,
"step": 3100
},
{
"entropy": 1.1507928803563119,
"epoch": 4.023285899094438,
"grad_norm": 0.6847311854362488,
"learning_rate": 0.00010397962638707783,
"loss": 1.129,
"mean_token_accuracy": 0.7149621859192848,
"num_tokens": 8942268.0,
"step": 3110
},
{
"entropy": 1.3405901521444321,
"epoch": 4.0362225097024576,
"grad_norm": 0.8465374112129211,
"learning_rate": 0.00010345621575064117,
"loss": 1.3204,
"mean_token_accuracy": 0.6661748513579369,
"num_tokens": 8967621.0,
"step": 3120
},
{
"entropy": 1.9997529834508896,
"epoch": 4.049159120310478,
"grad_norm": 1.2902584075927734,
"learning_rate": 0.00010293271029710307,
"loss": 1.7005,
"mean_token_accuracy": 0.5859146490693092,
"num_tokens": 8978783.0,
"step": 3130
},
{
"entropy": 2.575493034720421,
"epoch": 4.062095730918499,
"grad_norm": 0.0,
"learning_rate": 0.00010240912438820289,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 8979423.0,
"step": 3140
},
{
"entropy": 1.8354697600007057,
"epoch": 4.07503234152652,
"grad_norm": 0.6097379326820374,
"learning_rate": 0.00010188547238788713,
"loss": 1.3617,
"mean_token_accuracy": 0.4855068750679493,
"num_tokens": 9049300.0,
"step": 3150
},
{
"epoch": 4.07503234152652,
"eval_entropy": 1.8855025125450866,
"eval_loss": 1.301902413368225,
"eval_mean_token_accuracy": 0.46030220640606656,
"eval_num_tokens": 9049300.0,
"eval_runtime": 243.8279,
"eval_samples_per_second": 22.54,
"eval_steps_per_second": 1.411,
"step": 3150
},
{
"entropy": 1.140310089290142,
"epoch": 4.087968952134541,
"grad_norm": 0.6553735136985779,
"learning_rate": 0.00010136176866191548,
"loss": 1.109,
"mean_token_accuracy": 0.7216179341077804,
"num_tokens": 9083874.0,
"step": 3160
},
{
"entropy": 1.3620821744203568,
"epoch": 4.100905562742561,
"grad_norm": 0.9848551154136658,
"learning_rate": 0.00010083802757746668,
"loss": 1.2997,
"mean_token_accuracy": 0.6707961618900299,
"num_tokens": 9108826.0,
"step": 3170
},
{
"entropy": 2.078350791335106,
"epoch": 4.113842173350582,
"grad_norm": 0.9935686588287354,
"learning_rate": 0.0001003142635027442,
"loss": 1.6088,
"mean_token_accuracy": 0.5507442288100719,
"num_tokens": 9118696.0,
"step": 3180
},
{
"entropy": 1.528096930682659,
"epoch": 4.126778783958603,
"grad_norm": 0.0,
"learning_rate": 9.979049080658242e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 9119336.0,
"step": 3190
},
{
"entropy": 1.4985127076506615,
"epoch": 4.139715394566624,
"grad_norm": 0.6286259889602661,
"learning_rate": 9.926672385805207e-05,
"loss": 1.4428,
"mean_token_accuracy": 0.46830192804336546,
"num_tokens": 9198456.0,
"step": 3200
},
{
"entropy": 1.1341844990849494,
"epoch": 4.1526520051746445,
"grad_norm": 0.6682960391044617,
"learning_rate": 9.874297702606636e-05,
"loss": 1.1144,
"mean_token_accuracy": 0.7213881194591523,
"num_tokens": 9234104.0,
"step": 3210
},
{
"entropy": 1.3693108260631561,
"epoch": 4.165588615782665,
"grad_norm": 0.8303619027137756,
"learning_rate": 9.821926467898653e-05,
"loss": 1.3216,
"mean_token_accuracy": 0.6689239561557769,
"num_tokens": 9259921.0,
"step": 3220
},
{
"entropy": 1.9042235404253005,
"epoch": 4.178525226390685,
"grad_norm": 1.645528793334961,
"learning_rate": 9.769560118422773e-05,
"loss": 1.7769,
"mean_token_accuracy": 0.5957130216062069,
"num_tokens": 9272479.0,
"step": 3230
},
{
"entropy": 0.9734129890799522,
"epoch": 4.191461836998706,
"grad_norm": 0.0,
"learning_rate": 9.717200090786501e-05,
"loss": 0.0492,
"mean_token_accuracy": 0.03619047701358795,
"num_tokens": 9273156.0,
"step": 3240
},
{
"entropy": 1.5239285960793496,
"epoch": 4.204398447606727,
"grad_norm": 0.6020880937576294,
"learning_rate": 9.664847821423907e-05,
"loss": 1.4046,
"mean_token_accuracy": 0.47501309886574744,
"num_tokens": 9347748.0,
"step": 3250
},
{
"entropy": 1.103029479086399,
"epoch": 4.217335058214748,
"grad_norm": 0.6547256708145142,
"learning_rate": 9.612504746556215e-05,
"loss": 1.0853,
"mean_token_accuracy": 0.722417363524437,
"num_tokens": 9382776.0,
"step": 3260
},
{
"entropy": 1.371236687898636,
"epoch": 4.230271668822769,
"grad_norm": 0.910345733165741,
"learning_rate": 9.560172302152414e-05,
"loss": 1.3338,
"mean_token_accuracy": 0.6663747102022171,
"num_tokens": 9408048.0,
"step": 3270
},
{
"entropy": 1.8871563643217086,
"epoch": 4.243208279430789,
"grad_norm": 1.3442589044570923,
"learning_rate": 9.507851923889868e-05,
"loss": 1.6856,
"mean_token_accuracy": 0.5958636343479157,
"num_tokens": 9419207.0,
"step": 3280
},
{
"entropy": 2.1561751127243043,
"epoch": 4.25614489003881,
"grad_norm": 0.0,
"learning_rate": 9.455545047114901e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 9419847.0,
"step": 3290
},
{
"entropy": 1.766649141907692,
"epoch": 4.269081500646831,
"grad_norm": 0.6345491409301758,
"learning_rate": 9.40325310680346e-05,
"loss": 1.3764,
"mean_token_accuracy": 0.48196633756160734,
"num_tokens": 9491348.0,
"step": 3300
},
{
"epoch": 4.269081500646831,
"eval_entropy": 1.759770261860171,
"eval_loss": 1.3021514415740967,
"eval_mean_token_accuracy": 0.4654658474894457,
"eval_num_tokens": 9491348.0,
"eval_runtime": 243.8603,
"eval_samples_per_second": 22.537,
"eval_steps_per_second": 1.411,
"step": 3300
},
{
"entropy": 1.0932901889085769,
"epoch": 4.282018111254851,
"grad_norm": 0.6778357028961182,
"learning_rate": 9.350977537521717e-05,
"loss": 1.0699,
"mean_token_accuracy": 0.7278983518481255,
"num_tokens": 9526419.0,
"step": 3310
},
{
"entropy": 1.3789748430252076,
"epoch": 4.294954721862872,
"grad_norm": 0.8899635672569275,
"learning_rate": 9.298719773386724e-05,
"loss": 1.3351,
"mean_token_accuracy": 0.6661961570382118,
"num_tokens": 9551892.0,
"step": 3320
},
{
"entropy": 1.957590714097023,
"epoch": 4.307891332470892,
"grad_norm": 1.470860481262207,
"learning_rate": 9.246481248027077e-05,
"loss": 1.7173,
"mean_token_accuracy": 0.5974891498684883,
"num_tokens": 9563515.0,
"step": 3330
},
{
"entropy": 2.714459627866745,
"epoch": 4.320827943078913,
"grad_norm": 0.0,
"learning_rate": 9.194263394543575e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 9564155.0,
"step": 3340
},
{
"entropy": 1.8973265826702117,
"epoch": 4.333764553686934,
"grad_norm": 0.6255518198013306,
"learning_rate": 9.14206764546991e-05,
"loss": 1.4331,
"mean_token_accuracy": 0.47237296029925346,
"num_tokens": 9638156.0,
"step": 3350
},
{
"entropy": 1.113997830450535,
"epoch": 4.346701164294955,
"grad_norm": 0.6197985410690308,
"learning_rate": 9.089895432733364e-05,
"loss": 1.1138,
"mean_token_accuracy": 0.7213677644729615,
"num_tokens": 9674105.0,
"step": 3360
},
{
"entropy": 1.355890506505966,
"epoch": 4.359637774902976,
"grad_norm": 0.8531930446624756,
"learning_rate": 9.037748187615538e-05,
"loss": 1.3064,
"mean_token_accuracy": 0.6726941719651223,
"num_tokens": 9700126.0,
"step": 3370
},
{
"entropy": 1.9791965007781982,
"epoch": 4.372574385510996,
"grad_norm": 1.7110706567764282,
"learning_rate": 8.985627340713061e-05,
"loss": 1.6769,
"mean_token_accuracy": 0.5642684459686279,
"num_tokens": 9711816.0,
"step": 3380
},
{
"entropy": 3.160873770713806,
"epoch": 4.385510996119017,
"grad_norm": 0.0,
"learning_rate": 8.933534321898367e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 9712456.0,
"step": 3390
},
{
"entropy": 2.013157232105732,
"epoch": 4.3984476067270375,
"grad_norm": 0.6276950240135193,
"learning_rate": 8.881470560280465e-05,
"loss": 1.4395,
"mean_token_accuracy": 0.4699708536267281,
"num_tokens": 9789047.0,
"step": 3400
},
{
"entropy": 1.0582531332969665,
"epoch": 4.411384217335058,
"grad_norm": 0.6762167811393738,
"learning_rate": 8.829437484165718e-05,
"loss": 1.0539,
"mean_token_accuracy": 0.7299133688211441,
"num_tokens": 9824536.0,
"step": 3410
},
{
"entropy": 1.3210778176784514,
"epoch": 4.424320827943079,
"grad_norm": 0.8756985664367676,
"learning_rate": 8.777436521018676e-05,
"loss": 1.2846,
"mean_token_accuracy": 0.6797921255230903,
"num_tokens": 9850555.0,
"step": 3420
},
{
"entropy": 1.8927232474088669,
"epoch": 4.437257438551099,
"grad_norm": 1.5375664234161377,
"learning_rate": 8.725469097422912e-05,
"loss": 1.7705,
"mean_token_accuracy": 0.5886133186519146,
"num_tokens": 9863603.0,
"step": 3430
},
{
"entropy": 2.54144030213356,
"epoch": 4.45019404915912,
"grad_norm": 0.0,
"learning_rate": 8.673536639041864e-05,
"loss": 0.0476,
"mean_token_accuracy": 0.04354838728904724,
"num_tokens": 9864278.0,
"step": 3440
},
{
"entropy": 1.6926740244030953,
"epoch": 4.463130659767141,
"grad_norm": 0.639385461807251,
"learning_rate": 8.621640570579764e-05,
"loss": 1.2832,
"mean_token_accuracy": 0.502137529104948,
"num_tokens": 9929876.0,
"step": 3450
},
{
"epoch": 4.463130659767141,
"eval_entropy": 1.6399936731471572,
"eval_loss": 1.2823114395141602,
"eval_mean_token_accuracy": 0.4697489900471166,
"eval_num_tokens": 9929876.0,
"eval_runtime": 242.6114,
"eval_samples_per_second": 22.654,
"eval_steps_per_second": 1.418,
"step": 3450
},
{
"entropy": 1.0890112176537514,
"epoch": 4.476067270375162,
"grad_norm": 0.6899943351745605,
"learning_rate": 8.56978231574252e-05,
"loss": 1.0627,
"mean_token_accuracy": 0.7313546255230904,
"num_tokens": 9964211.0,
"step": 3460
},
{
"entropy": 1.3737705022096633,
"epoch": 4.489003880983183,
"grad_norm": 0.9175981879234314,
"learning_rate": 8.517963297198672e-05,
"loss": 1.3508,
"mean_token_accuracy": 0.6623948410153389,
"num_tokens": 9989036.0,
"step": 3470
},
{
"entropy": 1.8537749290466308,
"epoch": 4.501940491591203,
"grad_norm": 1.1406779289245605,
"learning_rate": 8.466184936540351e-05,
"loss": 1.6469,
"mean_token_accuracy": 0.590015722811222,
"num_tokens": 9999994.0,
"step": 3480
},
{
"entropy": 1.9705951809883118,
"epoch": 4.514877102199224,
"grad_norm": 0.0,
"learning_rate": 8.414448654244297e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 10000634.0,
"step": 3490
},
{
"entropy": 1.7736740306019783,
"epoch": 4.527813712807244,
"grad_norm": 0.5741596817970276,
"learning_rate": 8.362755869632883e-05,
"loss": 1.418,
"mean_token_accuracy": 0.4736007325351238,
"num_tokens": 10069782.0,
"step": 3500
},
{
"entropy": 1.1099611327052117,
"epoch": 4.540750323415265,
"grad_norm": 0.6997600793838501,
"learning_rate": 8.311108000835167e-05,
"loss": 1.1002,
"mean_token_accuracy": 0.7222409531474113,
"num_tokens": 10105051.0,
"step": 3510
},
{
"entropy": 1.3370502710342407,
"epoch": 4.553686934023286,
"grad_norm": 0.9216951131820679,
"learning_rate": 8.259506464747999e-05,
"loss": 1.2856,
"mean_token_accuracy": 0.6742190420627594,
"num_tokens": 10129844.0,
"step": 3520
},
{
"entropy": 2.0127808332443236,
"epoch": 4.566623544631307,
"grad_norm": 1.644737958908081,
"learning_rate": 8.207952676997153e-05,
"loss": 1.7374,
"mean_token_accuracy": 0.5706604786217213,
"num_tokens": 10140891.0,
"step": 3530
},
{
"entropy": 2.3392362356185914,
"epoch": 4.579560155239327,
"grad_norm": 0.0,
"learning_rate": 8.156448051898476e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 10141531.0,
"step": 3540
},
{
"entropy": 1.7776501581072808,
"epoch": 4.592496765847348,
"grad_norm": 0.6358464956283569,
"learning_rate": 8.1049940024191e-05,
"loss": 1.4156,
"mean_token_accuracy": 0.47597954645752905,
"num_tokens": 10208071.0,
"step": 3550
},
{
"entropy": 1.103192213177681,
"epoch": 4.605433376455369,
"grad_norm": 0.6968359351158142,
"learning_rate": 8.053591940138686e-05,
"loss": 1.096,
"mean_token_accuracy": 0.7267577677965165,
"num_tokens": 10242851.0,
"step": 3560
},
{
"entropy": 1.3612541019916535,
"epoch": 4.61836998706339,
"grad_norm": 0.9655300974845886,
"learning_rate": 8.002243275210669e-05,
"loss": 1.3057,
"mean_token_accuracy": 0.672816789150238,
"num_tokens": 10268178.0,
"step": 3570
},
{
"entropy": 1.932911714911461,
"epoch": 4.63130659767141,
"grad_norm": 1.2096027135849,
"learning_rate": 7.950949416323612e-05,
"loss": 1.7086,
"mean_token_accuracy": 0.612860233336687,
"num_tokens": 10279495.0,
"step": 3580
},
{
"entropy": 1.9618256837129593,
"epoch": 4.6442432082794305,
"grad_norm": 0.0,
"learning_rate": 7.899711770662532e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 10280135.0,
"step": 3590
},
{
"entropy": 1.6968649536371232,
"epoch": 4.657179818887451,
"grad_norm": 0.6373590230941772,
"learning_rate": 7.848531743870297e-05,
"loss": 1.3993,
"mean_token_accuracy": 0.4759638875722885,
"num_tokens": 10346462.0,
"step": 3600
},
{
"epoch": 4.657179818887451,
"eval_entropy": 1.577659371980401,
"eval_loss": 1.273931622505188,
"eval_mean_token_accuracy": 0.4723818853150967,
"eval_num_tokens": 10346462.0,
"eval_runtime": 245.6574,
"eval_samples_per_second": 22.373,
"eval_steps_per_second": 1.4,
"step": 3600
},
{
"entropy": 1.093438169360161,
"epoch": 4.670116429495472,
"grad_norm": 0.7240473628044128,
"learning_rate": 7.797410740009084e-05,
"loss": 1.0745,
"mean_token_accuracy": 0.7309321075677871,
"num_tokens": 10381489.0,
"step": 3610
},
{
"entropy": 1.37732635140419,
"epoch": 4.683053040103493,
"grad_norm": 0.9580934047698975,
"learning_rate": 7.746350161521845e-05,
"loss": 1.336,
"mean_token_accuracy": 0.6637881115078926,
"num_tokens": 10406795.0,
"step": 3620
},
{
"entropy": 1.9799594402313232,
"epoch": 4.695989650711514,
"grad_norm": 1.5260565280914307,
"learning_rate": 7.695351409193823e-05,
"loss": 1.7859,
"mean_token_accuracy": 0.5888419583439827,
"num_tokens": 10418685.0,
"step": 3630
},
{
"entropy": 1.8445574283599853,
"epoch": 4.708926261319534,
"grad_norm": 0.0,
"learning_rate": 7.644415882114145e-05,
"loss": 0.0354,
"mean_token_accuracy": 0.04375,
"num_tokens": 10419355.0,
"step": 3640
},
{
"entropy": 1.724594485759735,
"epoch": 4.721862871927555,
"grad_norm": 0.5997304320335388,
"learning_rate": 7.593544977637436e-05,
"loss": 1.4375,
"mean_token_accuracy": 0.4693992160260677,
"num_tokens": 10485312.0,
"step": 3650
},
{
"entropy": 1.079079033434391,
"epoch": 4.734799482535576,
"grad_norm": 0.6873499155044556,
"learning_rate": 7.54274009134546e-05,
"loss": 1.0708,
"mean_token_accuracy": 0.7280381500720978,
"num_tokens": 10520582.0,
"step": 3660
},
{
"entropy": 1.315394550561905,
"epoch": 4.747736093143597,
"grad_norm": 0.8612226843833923,
"learning_rate": 7.492002617008866e-05,
"loss": 1.2891,
"mean_token_accuracy": 0.6757827803492547,
"num_tokens": 10545966.0,
"step": 3670
},
{
"entropy": 1.840933558344841,
"epoch": 4.760672703751617,
"grad_norm": 0.7735125422477722,
"learning_rate": 7.441333946548939e-05,
"loss": 1.575,
"mean_token_accuracy": 0.5655414000153541,
"num_tokens": 10557080.0,
"step": 3680
},
{
"entropy": 1.232702499628067,
"epoch": 4.773609314359637,
"grad_norm": 0.0,
"learning_rate": 7.390735469999398e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 10557720.0,
"step": 3690
},
{
"entropy": 1.5656249672174454,
"epoch": 4.786545924967658,
"grad_norm": 0.6145333051681519,
"learning_rate": 7.340208575468291e-05,
"loss": 1.4603,
"mean_token_accuracy": 0.46657404825091364,
"num_tokens": 10627563.0,
"step": 3700
},
{
"entropy": 1.0934513494372369,
"epoch": 4.799482535575679,
"grad_norm": 0.7226387858390808,
"learning_rate": 7.289754649099897e-05,
"loss": 1.0786,
"mean_token_accuracy": 0.7299003899097443,
"num_tokens": 10662880.0,
"step": 3710
},
{
"entropy": 1.3585843235254287,
"epoch": 4.8124191461837,
"grad_norm": 0.8521022796630859,
"learning_rate": 7.239375075036697e-05,
"loss": 1.3144,
"mean_token_accuracy": 0.6705298006534577,
"num_tokens": 10688600.0,
"step": 3720
},
{
"entropy": 1.8722685337066651,
"epoch": 4.825355756791721,
"grad_norm": 1.371882677078247,
"learning_rate": 7.189071235381406e-05,
"loss": 1.7141,
"mean_token_accuracy": 0.604588358104229,
"num_tokens": 10700334.0,
"step": 3730
},
{
"entropy": 1.860415416955948,
"epoch": 4.838292367399741,
"grad_norm": 0.0,
"learning_rate": 7.138844510159069e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 10700974.0,
"step": 3740
},
{
"entropy": 1.68975418061018,
"epoch": 4.851228978007762,
"grad_norm": 0.6484793424606323,
"learning_rate": 7.088696277279175e-05,
"loss": 1.3382,
"mean_token_accuracy": 0.4887751266360283,
"num_tokens": 10771692.0,
"step": 3750
},
{
"epoch": 4.851228978007762,
"eval_entropy": 1.7122522977202437,
"eval_loss": 1.2648330926895142,
"eval_mean_token_accuracy": 0.47577540377198263,
"eval_num_tokens": 10771692.0,
"eval_runtime": 244.9784,
"eval_samples_per_second": 22.435,
"eval_steps_per_second": 1.404,
"step": 3750
},
{
"entropy": 1.1040325671434403,
"epoch": 4.864165588615783,
"grad_norm": 0.7224993705749512,
"learning_rate": 7.038627912497873e-05,
"loss": 1.0872,
"mean_token_accuracy": 0.7262751698493958,
"num_tokens": 10806575.0,
"step": 3760
},
{
"entropy": 1.3863080263137817,
"epoch": 4.8771021992238035,
"grad_norm": 0.9205716252326965,
"learning_rate": 6.988640789380241e-05,
"loss": 1.3415,
"mean_token_accuracy": 0.6670658677816391,
"num_tokens": 10831607.0,
"step": 3770
},
{
"entropy": 1.986344888806343,
"epoch": 4.890038809831824,
"grad_norm": 1.2501696348190308,
"learning_rate": 6.938736279262567e-05,
"loss": 1.5931,
"mean_token_accuracy": 0.5594463273882866,
"num_tokens": 10842477.0,
"step": 3780
},
{
"entropy": 2.6916876256465914,
"epoch": 4.902975420439844,
"grad_norm": 0.0,
"learning_rate": 6.888915751214774e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 10843117.0,
"step": 3790
},
{
"entropy": 1.8490806862711906,
"epoch": 4.915912031047865,
"grad_norm": 0.6139810085296631,
"learning_rate": 6.83918057200283e-05,
"loss": 1.3791,
"mean_token_accuracy": 0.4822954162955284,
"num_tokens": 10917288.0,
"step": 3800
},
{
"entropy": 1.063162423670292,
"epoch": 4.928848641655886,
"grad_norm": 0.7340760231018066,
"learning_rate": 6.789532106051246e-05,
"loss": 1.0523,
"mean_token_accuracy": 0.7331129983067513,
"num_tokens": 10952906.0,
"step": 3810
},
{
"entropy": 1.348393714427948,
"epoch": 4.941785252263907,
"grad_norm": 0.979292094707489,
"learning_rate": 6.739971715405684e-05,
"loss": 1.3057,
"mean_token_accuracy": 0.6723238781094552,
"num_tokens": 10978606.0,
"step": 3820
},
{
"entropy": 1.887803316116333,
"epoch": 4.954721862871928,
"grad_norm": 1.4358190298080444,
"learning_rate": 6.690500759695557e-05,
"loss": 1.6779,
"mean_token_accuracy": 0.6134289026260376,
"num_tokens": 10990333.0,
"step": 3830
},
{
"entropy": 2.7988963067531585,
"epoch": 4.967658473479949,
"grad_norm": 0.0,
"learning_rate": 6.641120596096729e-05,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 10990973.0,
"step": 3840
},
{
"entropy": 1.5671290338039399,
"epoch": 4.980595084087969,
"grad_norm": 0.697485625743866,
"learning_rate": 6.591832579294303e-05,
"loss": 1.0782,
"mean_token_accuracy": 0.5383081540465355,
"num_tokens": 11034414.0,
"step": 3850
},
{
"entropy": 1.7319936901330948,
"epoch": 4.99353169469599,
"grad_norm": 0.0,
"learning_rate": 6.542638061445447e-05,
"loss": 1.3846,
"mean_token_accuracy": 0.5769012212753296,
"num_tokens": 11050864.0,
"step": 3860
},
{
"entropy": 2.4691727608442307,
"epoch": 5.00646830530401,
"grad_norm": 0.49155521392822266,
"learning_rate": 6.493538392142287e-05,
"loss": 1.0145,
"mean_token_accuracy": 0.26990386173129083,
"num_tokens": 11109874.0,
"step": 3870
},
{
"entropy": 1.075531531870365,
"epoch": 5.019404915912031,
"grad_norm": 0.7045453190803528,
"learning_rate": 6.444534918374906e-05,
"loss": 1.0364,
"mean_token_accuracy": 0.7393457636237144,
"num_tokens": 11148394.0,
"step": 3880
},
{
"entropy": 1.1883759826421738,
"epoch": 5.032341526520052,
"grad_norm": 0.9995729327201843,
"learning_rate": 6.395628984494378e-05,
"loss": 1.2028,
"mean_token_accuracy": 0.6972079753875733,
"num_tokens": 11176092.0,
"step": 3890
},
{
"entropy": 1.7173998385667801,
"epoch": 5.045278137128072,
"grad_norm": 1.125909686088562,
"learning_rate": 6.346821932175873e-05,
"loss": 1.5967,
"mean_token_accuracy": 0.6245104640722274,
"num_tokens": 11192285.0,
"step": 3900
},
{
"epoch": 5.045278137128072,
"eval_entropy": 1.9103823839578518,
"eval_loss": 1.2630141973495483,
"eval_mean_token_accuracy": 0.4754580475043419,
"eval_num_tokens": 11192285.0,
"eval_runtime": 244.3056,
"eval_samples_per_second": 22.496,
"eval_steps_per_second": 1.408,
"step": 3900
},
{
"entropy": 2.6084256947040556,
"epoch": 5.058214747736093,
"grad_norm": 0.0,
"learning_rate": 6.298115100381882e-05,
"loss": 0.3442,
"mean_token_accuracy": 0.16731906533241273,
"num_tokens": 11193644.0,
"step": 3910
},
{
"entropy": 2.282520645856857,
"epoch": 5.071151358344114,
"grad_norm": 0.5869239568710327,
"learning_rate": 6.249509825325467e-05,
"loss": 0.9511,
"mean_token_accuracy": 0.28290636241436007,
"num_tokens": 11249840.0,
"step": 3920
},
{
"entropy": 1.016249306499958,
"epoch": 5.084087968952135,
"grad_norm": 0.7197193503379822,
"learning_rate": 6.201007440433588e-05,
"loss": 1.007,
"mean_token_accuracy": 0.7442266702651977,
"num_tokens": 11287639.0,
"step": 3930
},
{
"entropy": 1.2221685394644737,
"epoch": 5.097024579560156,
"grad_norm": 0.9447595477104187,
"learning_rate": 6.152609276310549e-05,
"loss": 1.187,
"mean_token_accuracy": 0.7011413544416427,
"num_tokens": 11315215.0,
"step": 3940
},
{
"entropy": 1.6715268433094024,
"epoch": 5.109961190168176,
"grad_norm": 1.0949913263320923,
"learning_rate": 6.104316660701485e-05,
"loss": 1.5623,
"mean_token_accuracy": 0.6256066203117371,
"num_tokens": 11332567.0,
"step": 3950
},
{
"entropy": 2.496020531654358,
"epoch": 5.1228978007761965,
"grad_norm": 0.0,
"learning_rate": 6.056130918455929e-05,
"loss": 0.4602,
"mean_token_accuracy": 0.21488995999097824,
"num_tokens": 11334364.0,
"step": 3960
},
{
"entropy": 2.2577121645212173,
"epoch": 5.135834411384217,
"grad_norm": 0.6211187243461609,
"learning_rate": 6.0080533714914766e-05,
"loss": 1.0081,
"mean_token_accuracy": 0.2705229982733727,
"num_tokens": 11391718.0,
"step": 3970
},
{
"entropy": 1.0153650417923927,
"epoch": 5.148771021992238,
"grad_norm": 0.649202287197113,
"learning_rate": 5.9600853387575163e-05,
"loss": 1.0426,
"mean_token_accuracy": 0.7383781686425209,
"num_tokens": 11430710.0,
"step": 3980
},
{
"entropy": 1.1217432379722596,
"epoch": 5.161707632600259,
"grad_norm": 0.9362276792526245,
"learning_rate": 5.912228136199038e-05,
"loss": 1.0765,
"mean_token_accuracy": 0.7234507903456688,
"num_tokens": 11459154.0,
"step": 3990
},
{
"entropy": 1.6653785824775695,
"epoch": 5.174644243208279,
"grad_norm": 1.2307344675064087,
"learning_rate": 5.864483076720555e-05,
"loss": 1.5669,
"mean_token_accuracy": 0.6285063222050666,
"num_tokens": 11476268.0,
"step": 4000
},
{
"entropy": 2.082801552116871,
"epoch": 5.1875808538163,
"grad_norm": 0.0,
"learning_rate": 5.81685147015006e-05,
"loss": 0.3513,
"mean_token_accuracy": 0.1956300586462021,
"num_tokens": 11477779.0,
"step": 4010
},
{
"entropy": 2.0466490238904953,
"epoch": 5.200517464424321,
"grad_norm": 0.5699072480201721,
"learning_rate": 5.769334623203095e-05,
"loss": 0.9736,
"mean_token_accuracy": 0.27822155207395555,
"num_tokens": 11531993.0,
"step": 4020
},
{
"entropy": 1.0089649006724357,
"epoch": 5.213454075032342,
"grad_norm": 0.6833609938621521,
"learning_rate": 5.7219338394469356e-05,
"loss": 1.0355,
"mean_token_accuracy": 0.7415396451950074,
"num_tokens": 11570430.0,
"step": 4030
},
{
"entropy": 1.1602358788251876,
"epoch": 5.226390685640363,
"grad_norm": 0.933566153049469,
"learning_rate": 5.674650419264782e-05,
"loss": 1.1016,
"mean_token_accuracy": 0.7183712035417557,
"num_tokens": 11598642.0,
"step": 4040
},
{
"entropy": 1.6275397926568984,
"epoch": 5.239327296248383,
"grad_norm": 1.2435181140899658,
"learning_rate": 5.6274856598201066e-05,
"loss": 1.5472,
"mean_token_accuracy": 0.6266872644424438,
"num_tokens": 11615900.0,
"step": 4050
},
{
"epoch": 5.239327296248383,
"eval_entropy": 1.7370388171700544,
"eval_loss": 1.2589974403381348,
"eval_mean_token_accuracy": 0.4773535789965197,
"eval_num_tokens": 11615900.0,
"eval_runtime": 242.7915,
"eval_samples_per_second": 22.637,
"eval_steps_per_second": 1.417,
"step": 4050
},
{
"entropy": 2.3815665364265444,
"epoch": 5.252263906856403,
"grad_norm": 0.0,
"learning_rate": 5.580440855021083e-05,
"loss": 0.4649,
"mean_token_accuracy": 0.19248609468340874,
"num_tokens": 11617642.0,
"step": 4060
},
{
"entropy": 2.2312158316373827,
"epoch": 5.265200517464424,
"grad_norm": 0.5702583193778992,
"learning_rate": 5.533517295485062e-05,
"loss": 0.9829,
"mean_token_accuracy": 0.27761168628931043,
"num_tokens": 11675101.0,
"step": 4070
},
{
"entropy": 1.0108808249235153,
"epoch": 5.278137128072445,
"grad_norm": 0.752931535243988,
"learning_rate": 5.486716268503182e-05,
"loss": 1.0438,
"mean_token_accuracy": 0.7365775972604751,
"num_tokens": 11713657.0,
"step": 4080
},
{
"entropy": 1.2421717032790185,
"epoch": 5.291073738680466,
"grad_norm": 0.9655210375785828,
"learning_rate": 5.440039058005047e-05,
"loss": 1.1822,
"mean_token_accuracy": 0.7000416114926338,
"num_tokens": 11741666.0,
"step": 4090
},
{
"entropy": 1.6973173677921296,
"epoch": 5.304010349288486,
"grad_norm": 1.5103716850280762,
"learning_rate": 5.393486944523505e-05,
"loss": 1.5623,
"mean_token_accuracy": 0.6223144173622132,
"num_tokens": 11758203.0,
"step": 4100
},
{
"entropy": 2.1967957854270934,
"epoch": 5.316946959896507,
"grad_norm": 0.0,
"learning_rate": 5.347061205159519e-05,
"loss": 0.2983,
"mean_token_accuracy": 0.15621012300252915,
"num_tokens": 11759461.0,
"step": 4110
},
{
"entropy": 1.981054452061653,
"epoch": 5.329883570504528,
"grad_norm": 0.6421746611595154,
"learning_rate": 5.3007631135471334e-05,
"loss": 0.9895,
"mean_token_accuracy": 0.2755612075328827,
"num_tokens": 11813613.0,
"step": 4120
},
{
"entropy": 0.9758897602558136,
"epoch": 5.342820181112549,
"grad_norm": 0.7207741141319275,
"learning_rate": 5.2545939398185284e-05,
"loss": 1.0031,
"mean_token_accuracy": 0.7456466734409333,
"num_tokens": 11852165.0,
"step": 4130
},
{
"entropy": 1.195047491788864,
"epoch": 5.3557567917205695,
"grad_norm": 0.9851743578910828,
"learning_rate": 5.208554950569178e-05,
"loss": 1.1364,
"mean_token_accuracy": 0.7128469496965408,
"num_tokens": 11880541.0,
"step": 4140
},
{
"entropy": 1.6736773550510406,
"epoch": 5.36869340232859,
"grad_norm": 1.2857285737991333,
"learning_rate": 5.1626474088231004e-05,
"loss": 1.6022,
"mean_token_accuracy": 0.6264947578310966,
"num_tokens": 11897978.0,
"step": 4150
},
{
"entropy": 2.3716455429792402,
"epoch": 5.38163001293661,
"grad_norm": 0.0,
"learning_rate": 5.116872573998217e-05,
"loss": 0.3852,
"mean_token_accuracy": 0.18944832757115365,
"num_tokens": 11899460.0,
"step": 4160
},
{
"entropy": 2.3163172632455824,
"epoch": 5.394566623544631,
"grad_norm": 0.60521000623703,
"learning_rate": 5.071231701871787e-05,
"loss": 0.9779,
"mean_token_accuracy": 0.27711123302578927,
"num_tokens": 11956251.0,
"step": 4170
},
{
"entropy": 1.026511162519455,
"epoch": 5.407503234152652,
"grad_norm": 0.7545950412750244,
"learning_rate": 5.025726044545968e-05,
"loss": 1.0516,
"mean_token_accuracy": 0.7328214541077613,
"num_tokens": 11995157.0,
"step": 4180
},
{
"entropy": 1.1451522946357726,
"epoch": 5.420439844760673,
"grad_norm": 0.9537347555160522,
"learning_rate": 4.980356850413472e-05,
"loss": 1.1319,
"mean_token_accuracy": 0.7138208642601966,
"num_tokens": 12023430.0,
"step": 4190
},
{
"entropy": 1.7249857246875764,
"epoch": 5.433376455368693,
"grad_norm": 1.279359221458435,
"learning_rate": 4.935125364123292e-05,
"loss": 1.6072,
"mean_token_accuracy": 0.6237147711217403,
"num_tokens": 12040024.0,
"step": 4200
},
{
"epoch": 5.433376455368693,
"eval_entropy": 1.8443097567488982,
"eval_loss": 1.2536410093307495,
"eval_mean_token_accuracy": 0.4748884228079818,
"eval_num_tokens": 12040024.0,
"eval_runtime": 241.7185,
"eval_samples_per_second": 22.737,
"eval_steps_per_second": 1.423,
"step": 4200
},
{
"entropy": 2.7387112855911253,
"epoch": 5.446313065976714,
"grad_norm": 0.0,
"learning_rate": 4.89003282654658e-05,
"loss": 0.389,
"mean_token_accuracy": 0.1823613777756691,
"num_tokens": 12041467.0,
"step": 4210
},
{
"entropy": 2.44253671169281,
"epoch": 5.459249676584735,
"grad_norm": 0.5715515613555908,
"learning_rate": 4.845080474742608e-05,
"loss": 0.9938,
"mean_token_accuracy": 0.2730660729110241,
"num_tokens": 12103775.0,
"step": 4220
},
{
"entropy": 1.003270110487938,
"epoch": 5.472186287192756,
"grad_norm": 0.7785800099372864,
"learning_rate": 4.800269541924799e-05,
"loss": 1.0184,
"mean_token_accuracy": 0.7413052409887314,
"num_tokens": 12143014.0,
"step": 4230
},
{
"entropy": 1.1527703180909157,
"epoch": 5.485122897800776,
"grad_norm": 0.9831658005714417,
"learning_rate": 4.7556012574269395e-05,
"loss": 1.1284,
"mean_token_accuracy": 0.7102037504315376,
"num_tokens": 12171448.0,
"step": 4240
},
{
"entropy": 1.7090917527675629,
"epoch": 5.498059508408797,
"grad_norm": 1.4465516805648804,
"learning_rate": 4.7110768466694224e-05,
"loss": 1.6112,
"mean_token_accuracy": 0.6218582183122635,
"num_tokens": 12188400.0,
"step": 4250
},
{
"entropy": 2.560487928986549,
"epoch": 5.510996119016817,
"grad_norm": 0.0,
"learning_rate": 4.666697531125627e-05,
"loss": 0.3879,
"mean_token_accuracy": 0.16174983084201813,
"num_tokens": 12189804.0,
"step": 4260
},
{
"entropy": 2.277393540740013,
"epoch": 5.523932729624838,
"grad_norm": 0.5197897553443909,
"learning_rate": 4.622464528288443e-05,
"loss": 1.027,
"mean_token_accuracy": 0.2683463282883167,
"num_tokens": 12249572.0,
"step": 4270
},
{
"entropy": 1.0234995201230048,
"epoch": 5.536869340232859,
"grad_norm": 0.7546108961105347,
"learning_rate": 4.578379051636832e-05,
"loss": 1.0282,
"mean_token_accuracy": 0.7406062006950378,
"num_tokens": 12288484.0,
"step": 4280
},
{
"entropy": 1.1632590115070343,
"epoch": 5.54980595084088,
"grad_norm": 1.0032302141189575,
"learning_rate": 4.534442310602559e-05,
"loss": 1.1404,
"mean_token_accuracy": 0.7092833399772644,
"num_tokens": 12316357.0,
"step": 4290
},
{
"entropy": 1.6969922810792923,
"epoch": 5.5627425614489,
"grad_norm": 1.356163740158081,
"learning_rate": 4.490655510537004e-05,
"loss": 1.5895,
"mean_token_accuracy": 0.6228079289197922,
"num_tokens": 12332741.0,
"step": 4300
},
{
"entropy": 2.227242410182953,
"epoch": 5.575679172056921,
"grad_norm": 0.0,
"learning_rate": 4.447019852678101e-05,
"loss": 0.3691,
"mean_token_accuracy": 0.18163795471191407,
"num_tokens": 12334119.0,
"step": 4310
},
{
"entropy": 2.0241008371114733,
"epoch": 5.588615782664942,
"grad_norm": 0.5737898945808411,
"learning_rate": 4.40353653411738e-05,
"loss": 0.956,
"mean_token_accuracy": 0.2796335697174072,
"num_tokens": 12386755.0,
"step": 4320
},
{
"entropy": 1.009289626777172,
"epoch": 5.6015523932729625,
"grad_norm": 0.7294387221336365,
"learning_rate": 4.360206747767122e-05,
"loss": 1.032,
"mean_token_accuracy": 0.7417484134435653,
"num_tokens": 12424985.0,
"step": 4330
},
{
"entropy": 1.1622566372156142,
"epoch": 5.614489003880983,
"grad_norm": 1.0234155654907227,
"learning_rate": 4.3170316823276424e-05,
"loss": 1.1576,
"mean_token_accuracy": 0.7061204954981803,
"num_tokens": 12452639.0,
"step": 4340
},
{
"entropy": 1.6782744824886322,
"epoch": 5.627425614489004,
"grad_norm": 1.4249658584594727,
"learning_rate": 4.274012522254674e-05,
"loss": 1.5881,
"mean_token_accuracy": 0.6237360410392284,
"num_tokens": 12469230.0,
"step": 4350
},
{
"epoch": 5.627425614489004,
"eval_entropy": 1.6210007650214573,
"eval_loss": 1.241470217704773,
"eval_mean_token_accuracy": 0.47642275673705475,
"eval_num_tokens": 12469230.0,
"eval_runtime": 246.3378,
"eval_samples_per_second": 22.311,
"eval_steps_per_second": 1.396,
"step": 4350
},
{
"entropy": 1.9652087688446045,
"epoch": 5.640362225097024,
"grad_norm": 0.0,
"learning_rate": 4.231150447726874e-05,
"loss": 0.3533,
"mean_token_accuracy": 0.19179367125034333,
"num_tokens": 12470690.0,
"step": 4360
},
{
"entropy": 1.9280417621135713,
"epoch": 5.653298835705045,
"grad_norm": 0.5711302757263184,
"learning_rate": 4.1884466346134466e-05,
"loss": 0.9704,
"mean_token_accuracy": 0.27944710552692414,
"num_tokens": 12525117.0,
"step": 4370
},
{
"entropy": 1.0357938587665558,
"epoch": 5.666235446313066,
"grad_norm": 0.6963515877723694,
"learning_rate": 4.145902254441888e-05,
"loss": 1.0365,
"mean_token_accuracy": 0.7398686364293099,
"num_tokens": 12563021.0,
"step": 4380
},
{
"entropy": 1.1490644261240959,
"epoch": 5.679172056921087,
"grad_norm": 0.9824443459510803,
"learning_rate": 4.1035184743658376e-05,
"loss": 1.1307,
"mean_token_accuracy": 0.7091254457831383,
"num_tokens": 12591024.0,
"step": 4390
},
{
"entropy": 1.68570619225502,
"epoch": 5.692108667529108,
"grad_norm": 1.2685192823410034,
"learning_rate": 4.0612964571330805e-05,
"loss": 1.5877,
"mean_token_accuracy": 0.6187320709228515,
"num_tokens": 12607889.0,
"step": 4400
},
{
"entropy": 1.995962232351303,
"epoch": 5.705045278137128,
"grad_norm": 0.0,
"learning_rate": 4.019237361053615e-05,
"loss": 0.4375,
"mean_token_accuracy": 0.1990293502807617,
"num_tokens": 12609477.0,
"step": 4410
},
{
"entropy": 2.0628999888896944,
"epoch": 5.717981888745149,
"grad_norm": 0.582778811454773,
"learning_rate": 3.977342339967902e-05,
"loss": 0.9965,
"mean_token_accuracy": 0.2732643634080887,
"num_tokens": 12668390.0,
"step": 4420
},
{
"entropy": 1.0030916407704353,
"epoch": 5.730918499353169,
"grad_norm": 0.7195892930030823,
"learning_rate": 3.935612543215216e-05,
"loss": 1.0055,
"mean_token_accuracy": 0.7438824102282524,
"num_tokens": 12707626.0,
"step": 4430
},
{
"entropy": 1.1245022103190423,
"epoch": 5.74385510996119,
"grad_norm": 0.9609954357147217,
"learning_rate": 3.8940491156020744e-05,
"loss": 1.0932,
"mean_token_accuracy": 0.7223910227417946,
"num_tokens": 12736376.0,
"step": 4440
},
{
"entropy": 1.653869342803955,
"epoch": 5.756791720569211,
"grad_norm": 1.3840677738189697,
"learning_rate": 3.852653197370885e-05,
"loss": 1.5745,
"mean_token_accuracy": 0.6224342837929726,
"num_tokens": 12753560.0,
"step": 4450
},
{
"entropy": 2.0997736901044846,
"epoch": 5.769728331177232,
"grad_norm": 0.0,
"learning_rate": 3.811425924168628e-05,
"loss": 0.4083,
"mean_token_accuracy": 0.17910270839929582,
"num_tokens": 12755081.0,
"step": 4460
},
{
"entropy": 2.0056353509426117,
"epoch": 5.782664941785252,
"grad_norm": 0.5901302099227905,
"learning_rate": 3.770368427015699e-05,
"loss": 0.9965,
"mean_token_accuracy": 0.2755757987499237,
"num_tokens": 12818062.0,
"step": 4470
},
{
"entropy": 0.9973522603511811,
"epoch": 5.795601552393273,
"grad_norm": 0.7053154110908508,
"learning_rate": 3.729481832274916e-05,
"loss": 1.0101,
"mean_token_accuracy": 0.7445162117481232,
"num_tokens": 12856675.0,
"step": 4480
},
{
"entropy": 1.158506852388382,
"epoch": 5.808538163001294,
"grad_norm": 1.0795212984085083,
"learning_rate": 3.688767261620578e-05,
"loss": 1.1325,
"mean_token_accuracy": 0.7126885786652565,
"num_tokens": 12884620.0,
"step": 4490
},
{
"entropy": 1.6880556523799897,
"epoch": 5.821474773609315,
"grad_norm": 1.5192304849624634,
"learning_rate": 3.64822583200772e-05,
"loss": 1.5872,
"mean_token_accuracy": 0.6223025761544705,
"num_tokens": 12901293.0,
"step": 4500
},
{
"epoch": 5.821474773609315,
"eval_entropy": 1.5741082594491715,
"eval_loss": 1.2425955533981323,
"eval_mean_token_accuracy": 0.4777476576178573,
"eval_num_tokens": 12901293.0,
"eval_runtime": 245.8608,
"eval_samples_per_second": 22.354,
"eval_steps_per_second": 1.399,
"step": 4500
},
{
"entropy": 1.8717746943235398,
"epoch": 5.834411384217335,
"grad_norm": 0.0,
"learning_rate": 3.607858655641457e-05,
"loss": 0.3819,
"mean_token_accuracy": 0.20605695247650146,
"num_tokens": 12902761.0,
"step": 4510
},
{
"entropy": 1.97312273979187,
"epoch": 5.8473479948253555,
"grad_norm": 0.5747093558311462,
"learning_rate": 3.56766683994648e-05,
"loss": 0.9997,
"mean_token_accuracy": 0.27485966980457305,
"num_tokens": 12956936.0,
"step": 4520
},
{
"entropy": 1.026018126308918,
"epoch": 5.860284605433376,
"grad_norm": 0.7504481077194214,
"learning_rate": 3.527651487536669e-05,
"loss": 1.044,
"mean_token_accuracy": 0.7389606684446335,
"num_tokens": 12995952.0,
"step": 4530
},
{
"entropy": 1.1011481299996375,
"epoch": 5.873221216041397,
"grad_norm": 0.9883886575698853,
"learning_rate": 3.487813696184852e-05,
"loss": 1.0814,
"mean_token_accuracy": 0.722546960413456,
"num_tokens": 13024545.0,
"step": 4540
},
{
"entropy": 1.6190055787563324,
"epoch": 5.886157826649418,
"grad_norm": 1.3633733987808228,
"learning_rate": 3.448154558792677e-05,
"loss": 1.5299,
"mean_token_accuracy": 0.6360443904995918,
"num_tokens": 13041707.0,
"step": 4550
},
{
"entropy": 1.919902539253235,
"epoch": 5.899094437257439,
"grad_norm": 0.0,
"learning_rate": 3.408675163360643e-05,
"loss": 0.3972,
"mean_token_accuracy": 0.18492977023124696,
"num_tokens": 13043179.0,
"step": 4560
},
{
"entropy": 1.9439027100801467,
"epoch": 5.912031047865459,
"grad_norm": 0.5576460957527161,
"learning_rate": 3.369376592958243e-05,
"loss": 1.0312,
"mean_token_accuracy": 0.2685145862400532,
"num_tokens": 13106663.0,
"step": 4570
},
{
"entropy": 1.0852982923388481,
"epoch": 5.92496765847348,
"grad_norm": 0.7461971044540405,
"learning_rate": 3.3302599256942524e-05,
"loss": 1.0907,
"mean_token_accuracy": 0.7287055298686027,
"num_tokens": 13146036.0,
"step": 4580
},
{
"entropy": 1.1466092258691787,
"epoch": 5.937904269081501,
"grad_norm": 0.9710547924041748,
"learning_rate": 3.2913262346871564e-05,
"loss": 1.118,
"mean_token_accuracy": 0.7170251324772835,
"num_tokens": 13175061.0,
"step": 4590
},
{
"entropy": 1.5422434598207473,
"epoch": 5.950840879689522,
"grad_norm": 1.2156635522842407,
"learning_rate": 3.252576588035703e-05,
"loss": 1.4615,
"mean_token_accuracy": 0.6465979412198066,
"num_tokens": 13192904.0,
"step": 4600
},
{
"entropy": 1.8891061872243882,
"epoch": 5.963777490297542,
"grad_norm": 0.0,
"learning_rate": 3.21401204878962e-05,
"loss": 0.4084,
"mean_token_accuracy": 0.20470450967550277,
"num_tokens": 13194636.0,
"step": 4610
},
{
"entropy": 1.679259254038334,
"epoch": 5.976714100905562,
"grad_norm": 0.7184410095214844,
"learning_rate": 3.175633674920415e-05,
"loss": 0.7382,
"mean_token_accuracy": 0.3269588887691498,
"num_tokens": 13232029.0,
"step": 4620
},
{
"entropy": 1.1688358381390571,
"epoch": 5.989650711513583,
"grad_norm": 0.9711093306541443,
"learning_rate": 3.1374425192923874e-05,
"loss": 1.1566,
"mean_token_accuracy": 0.7072150468826294,
"num_tokens": 13259115.0,
"step": 4630
},
{
"entropy": 1.995809930562973,
"epoch": 6.002587322121604,
"grad_norm": 0.3214434087276459,
"learning_rate": 3.099439629633738e-05,
"loss": 0.9081,
"mean_token_accuracy": 0.2743851698935032,
"num_tokens": 13302193.0,
"step": 4640
},
{
"entropy": 1.2387345060706139,
"epoch": 6.015523932729625,
"grad_norm": 0.7096182107925415,
"learning_rate": 3.061626048507794e-05,
"loss": 1.2251,
"mean_token_accuracy": 0.7026221588253975,
"num_tokens": 13349206.0,
"step": 4650
},
{
"epoch": 6.015523932729625,
"eval_entropy": 1.4673164599510127,
"eval_loss": 1.236427664756775,
"eval_mean_token_accuracy": 0.4835313937171947,
"eval_num_tokens": 13349206.0,
"eval_runtime": 245.226,
"eval_samples_per_second": 22.412,
"eval_steps_per_second": 1.403,
"step": 4650
},
{
"entropy": 1.0112595960497857,
"epoch": 6.028460543337646,
"grad_norm": 0.9612884521484375,
"learning_rate": 3.0240028132844577e-05,
"loss": 0.9916,
"mean_token_accuracy": 0.7466120198369026,
"num_tokens": 13380735.0,
"step": 4660
},
{
"entropy": 1.321917925775051,
"epoch": 6.041397153945666,
"grad_norm": 0.9298290014266968,
"learning_rate": 2.9865709561117093e-05,
"loss": 1.277,
"mean_token_accuracy": 0.6769641906023025,
"num_tokens": 13402259.0,
"step": 4670
},
{
"entropy": 1.9312127828598022,
"epoch": 6.054333764553687,
"grad_norm": 0.0,
"learning_rate": 2.949331503887296e-05,
"loss": 0.9373,
"mean_token_accuracy": 0.38414124920964243,
"num_tokens": 13406702.0,
"step": 4680
},
{
"entropy": 1.8519232898950577,
"epoch": 6.067270375161708,
"grad_norm": 0.3253900706768036,
"learning_rate": 2.9122854782305853e-05,
"loss": 0.4393,
"mean_token_accuracy": 0.10099697411060334,
"num_tokens": 13448471.0,
"step": 4690
},
{
"entropy": 1.2315872982144356,
"epoch": 6.0802069857697285,
"grad_norm": 0.7172207832336426,
"learning_rate": 2.8754338954545078e-05,
"loss": 1.2677,
"mean_token_accuracy": 0.6917841538786889,
"num_tokens": 13494707.0,
"step": 4700
},
{
"entropy": 1.078819990158081,
"epoch": 6.093143596377749,
"grad_norm": 0.9585686326026917,
"learning_rate": 2.8387777665376947e-05,
"loss": 1.0795,
"mean_token_accuracy": 0.7268196657299996,
"num_tokens": 13525272.0,
"step": 4710
},
{
"entropy": 1.439416041970253,
"epoch": 6.106080206985769,
"grad_norm": 0.967811107635498,
"learning_rate": 2.8023180970967333e-05,
"loss": 1.3684,
"mean_token_accuracy": 0.6664265364408493,
"num_tokens": 13545790.0,
"step": 4720
},
{
"entropy": 1.8261877298355103,
"epoch": 6.11901681759379,
"grad_norm": 0.0,
"learning_rate": 2.766055887358584e-05,
"loss": 0.8898,
"mean_token_accuracy": 0.34252284914255143,
"num_tokens": 13549613.0,
"step": 4730
},
{
"entropy": 1.8926386773586272,
"epoch": 6.131953428201811,
"grad_norm": 0.33156275749206543,
"learning_rate": 2.72999213213314e-05,
"loss": 0.438,
"mean_token_accuracy": 0.10151686370372773,
"num_tokens": 13586113.0,
"step": 4740
},
{
"entropy": 1.2399160832166671,
"epoch": 6.144890038809832,
"grad_norm": 0.7470856308937073,
"learning_rate": 2.6941278207859333e-05,
"loss": 1.2593,
"mean_token_accuracy": 0.6944727435708046,
"num_tokens": 13632230.0,
"step": 4750
},
{
"entropy": 1.008384570479393,
"epoch": 6.157826649417853,
"grad_norm": 0.992726743221283,
"learning_rate": 2.6584639372109942e-05,
"loss": 0.991,
"mean_token_accuracy": 0.7462219312787056,
"num_tokens": 13663326.0,
"step": 4760
},
{
"entropy": 1.338111485540867,
"epoch": 6.170763260025873,
"grad_norm": 1.0804771184921265,
"learning_rate": 2.623001459803861e-05,
"loss": 1.3146,
"mean_token_accuracy": 0.6769130662083626,
"num_tokens": 13684547.0,
"step": 4770
},
{
"entropy": 1.9144802495837212,
"epoch": 6.183699870633894,
"grad_norm": 0.0,
"learning_rate": 2.5877413614347358e-05,
"loss": 0.8822,
"mean_token_accuracy": 0.3425231367349625,
"num_tokens": 13688744.0,
"step": 4780
},
{
"entropy": 1.9466426759958266,
"epoch": 6.196636481241915,
"grad_norm": 0.30804237723350525,
"learning_rate": 2.5526846094217948e-05,
"loss": 0.4398,
"mean_token_accuracy": 0.10224909633398056,
"num_tokens": 13724520.0,
"step": 4790
},
{
"entropy": 1.2254028126597405,
"epoch": 6.2095730918499354,
"grad_norm": 0.7537704706192017,
"learning_rate": 2.5178321655046577e-05,
"loss": 1.2608,
"mean_token_accuracy": 0.6935150980949402,
"num_tokens": 13771548.0,
"step": 4800
},
{
"epoch": 6.2095730918499354,
"eval_entropy": 1.452152093482572,
"eval_loss": 1.2386506795883179,
"eval_mean_token_accuracy": 0.4820184623605983,
"eval_num_tokens": 13771548.0,
"eval_runtime": 247.597,
"eval_samples_per_second": 22.197,
"eval_steps_per_second": 1.389,
"step": 4800
},
{
"entropy": 1.0173511430621147,
"epoch": 6.222509702457956,
"grad_norm": 0.9651890993118286,
"learning_rate": 2.4831849858179913e-05,
"loss": 1.0019,
"mean_token_accuracy": 0.7469066709280014,
"num_tokens": 13802198.0,
"step": 4810
},
{
"entropy": 1.3790171161293983,
"epoch": 6.235446313065976,
"grad_norm": 1.0010708570480347,
"learning_rate": 2.448744020865299e-05,
"loss": 1.3194,
"mean_token_accuracy": 0.6719131916761398,
"num_tokens": 13823366.0,
"step": 4820
},
{
"entropy": 1.8534984111785888,
"epoch": 6.248382923673997,
"grad_norm": 0.0,
"learning_rate": 2.4145102154928156e-05,
"loss": 0.9022,
"mean_token_accuracy": 0.36226404309272764,
"num_tokens": 13827780.0,
"step": 4830
},
{
"entropy": 1.787733218073845,
"epoch": 6.261319534282018,
"grad_norm": 0.35094037652015686,
"learning_rate": 2.380484508863611e-05,
"loss": 0.4416,
"mean_token_accuracy": 0.10181766748428345,
"num_tokens": 13865780.0,
"step": 4840
},
{
"entropy": 1.2079532265663147,
"epoch": 6.274256144890039,
"grad_norm": 0.7374927997589111,
"learning_rate": 2.346667834431826e-05,
"loss": 1.2223,
"mean_token_accuracy": 0.704416724294424,
"num_tokens": 13911952.0,
"step": 4850
},
{
"entropy": 0.9947008207440377,
"epoch": 6.28719275549806,
"grad_norm": 0.9054901003837585,
"learning_rate": 2.3130611199170384e-05,
"loss": 0.9776,
"mean_token_accuracy": 0.7504064351320267,
"num_tokens": 13943487.0,
"step": 4860
},
{
"entropy": 1.312053567171097,
"epoch": 6.300129366106081,
"grad_norm": 1.030329704284668,
"learning_rate": 2.2796652872788448e-05,
"loss": 1.2766,
"mean_token_accuracy": 0.6872908189892769,
"num_tokens": 13965764.0,
"step": 4870
},
{
"entropy": 1.73905668258667,
"epoch": 6.313065976714101,
"grad_norm": 0.0,
"learning_rate": 2.246481252691548e-05,
"loss": 0.9456,
"mean_token_accuracy": 0.3810268484055996,
"num_tokens": 13970660.0,
"step": 4880
},
{
"entropy": 1.7431816905736923,
"epoch": 6.3260025873221215,
"grad_norm": 0.3432393968105316,
"learning_rate": 2.213509926519016e-05,
"loss": 0.4431,
"mean_token_accuracy": 0.103342554718256,
"num_tokens": 14010149.0,
"step": 4890
},
{
"entropy": 1.2463560298085212,
"epoch": 6.338939197930142,
"grad_norm": 0.7458313703536987,
"learning_rate": 2.1807522132897383e-05,
"loss": 1.2702,
"mean_token_accuracy": 0.6920596107840538,
"num_tokens": 14057120.0,
"step": 4900
},
{
"entropy": 1.027150359749794,
"epoch": 6.351875808538163,
"grad_norm": 0.8767898082733154,
"learning_rate": 2.148209011671979e-05,
"loss": 0.9989,
"mean_token_accuracy": 0.743067529797554,
"num_tokens": 14088988.0,
"step": 4910
},
{
"entropy": 1.3012418672442436,
"epoch": 6.364812419146183,
"grad_norm": 1.0773974657058716,
"learning_rate": 2.1158812144491357e-05,
"loss": 1.247,
"mean_token_accuracy": 0.6856265813112259,
"num_tokens": 14111094.0,
"step": 4920
},
{
"entropy": 1.7512285083532333,
"epoch": 6.377749029754204,
"grad_norm": 0.0,
"learning_rate": 2.0837697084952503e-05,
"loss": 0.9705,
"mean_token_accuracy": 0.38980276361107824,
"num_tokens": 14115970.0,
"step": 4930
},
{
"entropy": 1.7514180034399032,
"epoch": 6.390685640362225,
"grad_norm": 0.3412686288356781,
"learning_rate": 2.0518753747506748e-05,
"loss": 0.4438,
"mean_token_accuracy": 0.10270617604255676,
"num_tokens": 14151452.0,
"step": 4940
},
{
"entropy": 1.2002925127744675,
"epoch": 6.403622250970246,
"grad_norm": 0.7483528852462769,
"learning_rate": 2.0201990881979006e-05,
"loss": 1.2267,
"mean_token_accuracy": 0.7003540650010109,
"num_tokens": 14198038.0,
"step": 4950
},
{
"epoch": 6.403622250970246,
"eval_entropy": 1.4145794496979824,
"eval_loss": 1.2361637353897095,
"eval_mean_token_accuracy": 0.4807747915213884,
"eval_num_tokens": 14198038.0,
"eval_runtime": 239.3212,
"eval_samples_per_second": 22.965,
"eval_steps_per_second": 1.437,
"step": 4950
},
{
"entropy": 1.0035071596503258,
"epoch": 6.416558861578267,
"grad_norm": 0.9199973940849304,
"learning_rate": 1.9887417178375633e-05,
"loss": 0.9911,
"mean_token_accuracy": 0.7502464011311532,
"num_tokens": 14229396.0,
"step": 4960
},
{
"entropy": 1.3634681567549705,
"epoch": 6.429495472186288,
"grad_norm": 0.8955945372581482,
"learning_rate": 1.957504126664593e-05,
"loss": 1.3055,
"mean_token_accuracy": 0.677581375837326,
"num_tokens": 14251059.0,
"step": 4970
},
{
"entropy": 1.782031211256981,
"epoch": 6.442432082794308,
"grad_norm": 0.0,
"learning_rate": 1.9264871716445454e-05,
"loss": 1.0013,
"mean_token_accuracy": 0.41802891343832016,
"num_tokens": 14255872.0,
"step": 4980
},
{
"entropy": 1.542439764738083,
"epoch": 6.455368693402328,
"grad_norm": 0.34138184785842896,
"learning_rate": 1.89569170369009e-05,
"loss": 0.4513,
"mean_token_accuracy": 0.1006891518831253,
"num_tokens": 14297788.0,
"step": 4990
},
{
"entropy": 1.2497848883271216,
"epoch": 6.468305304010349,
"grad_norm": 0.7626767158508301,
"learning_rate": 1.865118567637667e-05,
"loss": 1.2743,
"mean_token_accuracy": 0.6893603593111038,
"num_tokens": 14345367.0,
"step": 5000
},
{
"entropy": 0.9866194486618042,
"epoch": 6.48124191461837,
"grad_norm": 1.0120469331741333,
"learning_rate": 1.834768602224307e-05,
"loss": 0.9661,
"mean_token_accuracy": 0.752055998146534,
"num_tokens": 14376619.0,
"step": 5010
},
{
"entropy": 1.2819917246699333,
"epoch": 6.494178525226391,
"grad_norm": 0.9832173585891724,
"learning_rate": 1.8046426400646244e-05,
"loss": 1.2393,
"mean_token_accuracy": 0.6865051403641701,
"num_tokens": 14398410.0,
"step": 5020
},
{
"entropy": 1.656550607085228,
"epoch": 6.507115135834411,
"grad_norm": 0.0,
"learning_rate": 1.774741507627984e-05,
"loss": 1.0363,
"mean_token_accuracy": 0.402515621483326,
"num_tokens": 14403699.0,
"step": 5030
},
{
"entropy": 1.4212503910064698,
"epoch": 6.520051746442432,
"grad_norm": 0.3207855820655823,
"learning_rate": 1.7450660252158015e-05,
"loss": 0.4273,
"mean_token_accuracy": 0.10288792848587036,
"num_tokens": 14446058.0,
"step": 5040
},
{
"entropy": 1.2183921545743943,
"epoch": 6.532988357050453,
"grad_norm": 0.7788935899734497,
"learning_rate": 1.71561700693907e-05,
"loss": 1.2401,
"mean_token_accuracy": 0.7000276446342468,
"num_tokens": 14492725.0,
"step": 5050
},
{
"entropy": 1.0459384858608245,
"epoch": 6.545924967658474,
"grad_norm": 0.9662116765975952,
"learning_rate": 1.6863952606960132e-05,
"loss": 1.037,
"mean_token_accuracy": 0.7341208711266518,
"num_tokens": 14523347.0,
"step": 5060
},
{
"entropy": 1.3962342336773872,
"epoch": 6.5588615782664945,
"grad_norm": 1.0042107105255127,
"learning_rate": 1.6574015881499106e-05,
"loss": 1.3439,
"mean_token_accuracy": 0.6732321053743362,
"num_tokens": 14543748.0,
"step": 5070
},
{
"entropy": 1.4976371228694916,
"epoch": 6.5717981888745145,
"grad_norm": 0.0,
"learning_rate": 1.6286367847071294e-05,
"loss": 0.8495,
"mean_token_accuracy": 0.37927755415439607,
"num_tokens": 14547526.0,
"step": 5080
},
{
"entropy": 1.378989189863205,
"epoch": 6.584734799482535,
"grad_norm": 0.35467758774757385,
"learning_rate": 1.6001016394952817e-05,
"loss": 0.436,
"mean_token_accuracy": 0.10404296517372132,
"num_tokens": 14587727.0,
"step": 5090
},
{
"entropy": 1.2019992262125014,
"epoch": 6.597671410090556,
"grad_norm": 0.7634411454200745,
"learning_rate": 1.5717969353415772e-05,
"loss": 1.2363,
"mean_token_accuracy": 0.7016454577445984,
"num_tokens": 14633377.0,
"step": 5100
},
{
"epoch": 6.597671410090556,
"eval_entropy": 1.3025533678226693,
"eval_loss": 1.2344391345977783,
"eval_mean_token_accuracy": 0.4806629490367202,
"eval_num_tokens": 14633377.0,
"eval_runtime": 243.0518,
"eval_samples_per_second": 22.612,
"eval_steps_per_second": 1.415,
"step": 5100
},
{
"entropy": 1.0161924228072166,
"epoch": 6.610608020698577,
"grad_norm": 1.0323160886764526,
"learning_rate": 1.5437234487513687e-05,
"loss": 0.9938,
"mean_token_accuracy": 0.747073483467102,
"num_tokens": 14664256.0,
"step": 5110
},
{
"entropy": 1.358753038942814,
"epoch": 6.623544631306598,
"grad_norm": 1.011472225189209,
"learning_rate": 1.5158819498868248e-05,
"loss": 1.3273,
"mean_token_accuracy": 0.6735880345106124,
"num_tokens": 14685452.0,
"step": 5120
},
{
"entropy": 1.5439666867256165,
"epoch": 6.636481241914618,
"grad_norm": 0.0,
"learning_rate": 1.4882732025458124e-05,
"loss": 0.8744,
"mean_token_accuracy": 0.35112617164850235,
"num_tokens": 14689408.0,
"step": 5130
},
{
"entropy": 1.490699003636837,
"epoch": 6.649417852522639,
"grad_norm": 0.33567583560943604,
"learning_rate": 1.4608979641409448e-05,
"loss": 0.4429,
"mean_token_accuracy": 0.10201395228505135,
"num_tokens": 14730607.0,
"step": 5140
},
{
"entropy": 1.1885226652026177,
"epoch": 6.66235446313066,
"grad_norm": 0.7712506055831909,
"learning_rate": 1.4337569856787958e-05,
"loss": 1.2014,
"mean_token_accuracy": 0.7031497925519943,
"num_tokens": 14775950.0,
"step": 5150
},
{
"entropy": 1.013894683122635,
"epoch": 6.675291073738681,
"grad_norm": 0.993394672870636,
"learning_rate": 1.406851011739303e-05,
"loss": 0.9995,
"mean_token_accuracy": 0.7462615251541138,
"num_tokens": 14806798.0,
"step": 5160
},
{
"entropy": 1.276303158700466,
"epoch": 6.6882276843467015,
"grad_norm": 0.9287812113761902,
"learning_rate": 1.3801807804553401e-05,
"loss": 1.2193,
"mean_token_accuracy": 0.701404669880867,
"num_tokens": 14828450.0,
"step": 5170
},
{
"entropy": 1.639420548081398,
"epoch": 6.701164294954722,
"grad_norm": 0.0,
"learning_rate": 1.3537470234924642e-05,
"loss": 0.9149,
"mean_token_accuracy": 0.36589213013648986,
"num_tokens": 14832909.0,
"step": 5180
},
{
"entropy": 1.5444379433989526,
"epoch": 6.714100905562742,
"grad_norm": 0.33196088671684265,
"learning_rate": 1.3275504660288462e-05,
"loss": 0.4502,
"mean_token_accuracy": 0.09918043613433838,
"num_tokens": 14875888.0,
"step": 5190
},
{
"entropy": 1.1905731126666068,
"epoch": 6.727037516170763,
"grad_norm": 0.7245560884475708,
"learning_rate": 1.3015918267353743e-05,
"loss": 1.2055,
"mean_token_accuracy": 0.7072307705879212,
"num_tokens": 14921555.0,
"step": 5200
},
{
"entropy": 1.0091575369238854,
"epoch": 6.739974126778784,
"grad_norm": 0.9656630158424377,
"learning_rate": 1.2758718177559403e-05,
"loss": 1.0059,
"mean_token_accuracy": 0.7457368150353432,
"num_tokens": 14952319.0,
"step": 5210
},
{
"entropy": 1.3768625631928444,
"epoch": 6.752910737386805,
"grad_norm": 1.0023345947265625,
"learning_rate": 1.2503911446879014e-05,
"loss": 1.3323,
"mean_token_accuracy": 0.6721446126699447,
"num_tokens": 14973360.0,
"step": 5220
},
{
"entropy": 1.706917905807495,
"epoch": 6.765847347994825,
"grad_norm": 0.0,
"learning_rate": 1.2251505065627211e-05,
"loss": 0.884,
"mean_token_accuracy": 0.34794071316719055,
"num_tokens": 14977368.0,
"step": 5230
},
{
"entropy": 1.6983414202928544,
"epoch": 6.778783958602846,
"grad_norm": 0.34029924869537354,
"learning_rate": 1.2001505958268045e-05,
"loss": 0.4392,
"mean_token_accuracy": 0.10167066529393196,
"num_tokens": 15016518.0,
"step": 5240
},
{
"entropy": 1.1760634392499925,
"epoch": 6.791720569210867,
"grad_norm": 0.7289795875549316,
"learning_rate": 1.1753920983224753e-05,
"loss": 1.2004,
"mean_token_accuracy": 0.7051770240068436,
"num_tokens": 15062291.0,
"step": 5250
},
{
"epoch": 6.791720569210867,
"eval_entropy": 1.3887645453214645,
"eval_loss": 1.2298688888549805,
"eval_mean_token_accuracy": 0.48596259925601093,
"eval_num_tokens": 15062291.0,
"eval_runtime": 246.7195,
"eval_samples_per_second": 22.276,
"eval_steps_per_second": 1.394,
"step": 5250
},
{
"entropy": 1.019908943772316,
"epoch": 6.8046571798188875,
"grad_norm": 1.0139966011047363,
"learning_rate": 1.1508756932691878e-05,
"loss": 1.016,
"mean_token_accuracy": 0.7411870285868645,
"num_tokens": 15093136.0,
"step": 5260
},
{
"entropy": 1.3366242468357086,
"epoch": 6.817593790426908,
"grad_norm": 1.015224814414978,
"learning_rate": 1.1266020532448863e-05,
"loss": 1.3099,
"mean_token_accuracy": 0.680339677631855,
"num_tokens": 15113801.0,
"step": 5270
},
{
"entropy": 1.7231854051351547,
"epoch": 6.830530401034929,
"grad_norm": 0.0,
"learning_rate": 1.1025718441675348e-05,
"loss": 0.8459,
"mean_token_accuracy": 0.34885319918394087,
"num_tokens": 15117501.0,
"step": 5280
},
{
"entropy": 1.8012044936418534,
"epoch": 6.843467011642949,
"grad_norm": 0.3444773256778717,
"learning_rate": 1.0787857252768807e-05,
"loss": 0.4338,
"mean_token_accuracy": 0.10217657834291458,
"num_tokens": 15154208.0,
"step": 5290
},
{
"entropy": 1.17054093927145,
"epoch": 6.85640362225097,
"grad_norm": 0.7941517233848572,
"learning_rate": 1.0552443491163422e-05,
"loss": 1.1874,
"mean_token_accuracy": 0.7076364248991013,
"num_tokens": 15199469.0,
"step": 5300
},
{
"entropy": 1.0057064607739448,
"epoch": 6.869340232858991,
"grad_norm": 0.8840006589889526,
"learning_rate": 1.0319483615151137e-05,
"loss": 0.981,
"mean_token_accuracy": 0.7503589361906051,
"num_tokens": 15230670.0,
"step": 5310
},
{
"entropy": 1.2563072219491005,
"epoch": 6.882276843467012,
"grad_norm": 1.0177907943725586,
"learning_rate": 1.0088984015704629e-05,
"loss": 1.2394,
"mean_token_accuracy": 0.6934975415468216,
"num_tokens": 15252641.0,
"step": 5320
},
{
"entropy": 1.8372395306825637,
"epoch": 6.895213454075033,
"grad_norm": 0.0,
"learning_rate": 9.860951016301756e-06,
"loss": 0.9875,
"mean_token_accuracy": 0.3743965640664101,
"num_tokens": 15257407.0,
"step": 5330
},
{
"entropy": 1.7831202149391174,
"epoch": 6.908150064683053,
"grad_norm": 0.3214081823825836,
"learning_rate": 9.635390872752237e-06,
"loss": 0.43,
"mean_token_accuracy": 0.10435229986906051,
"num_tokens": 15299860.0,
"step": 5340
},
{
"entropy": 1.2522226199507713,
"epoch": 6.921086675291074,
"grad_norm": 0.8021490573883057,
"learning_rate": 9.412309773025952e-06,
"loss": 1.2766,
"mean_token_accuracy": 0.6917116060853005,
"num_tokens": 15347391.0,
"step": 5350
},
{
"entropy": 1.0165240302681924,
"epoch": 6.9340232858990944,
"grad_norm": 0.9851676225662231,
"learning_rate": 9.191713837083238e-06,
"loss": 1.0192,
"mean_token_accuracy": 0.7415471941232681,
"num_tokens": 15379391.0,
"step": 5360
},
{
"entropy": 1.2651836022734642,
"epoch": 6.946959896507115,
"grad_norm": 1.12442946434021,
"learning_rate": 8.973609116706926e-06,
"loss": 1.2443,
"mean_token_accuracy": 0.6868803769350051,
"num_tokens": 15401606.0,
"step": 5370
},
{
"entropy": 1.7752905175089837,
"epoch": 6.959896507115136,
"grad_norm": 0.0,
"learning_rate": 8.758001595336418e-06,
"loss": 0.8999,
"mean_token_accuracy": 0.38887517899274826,
"num_tokens": 15406538.0,
"step": 5380
},
{
"entropy": 1.7106927633285522,
"epoch": 6.972833117723156,
"grad_norm": 0.5107993483543396,
"learning_rate": 8.544897187903423e-06,
"loss": 0.4117,
"mean_token_accuracy": 0.10680279433727265,
"num_tokens": 15432463.0,
"step": 5390
},
{
"entropy": 1.0609442353248597,
"epoch": 6.985769728331177,
"grad_norm": 1.095216155052185,
"learning_rate": 8.33430174066978e-06,
"loss": 1.0514,
"mean_token_accuracy": 0.7322214379906654,
"num_tokens": 15465365.0,
"step": 5400
},
{
"epoch": 6.985769728331177,
"eval_entropy": 1.3962991244571155,
"eval_loss": 1.2261559963226318,
"eval_mean_token_accuracy": 0.48680107668042183,
"eval_num_tokens": 15465365.0,
"eval_runtime": 244.9697,
"eval_samples_per_second": 22.435,
"eval_steps_per_second": 1.404,
"step": 5400
},
{
"entropy": 1.6976288080215454,
"epoch": 6.998706338939198,
"grad_norm": 0.0,
"learning_rate": 8.126221031067027e-06,
"loss": 0.7689,
"mean_token_accuracy": 0.2966282024979591,
"num_tokens": 15471588.0,
"step": 5410
},
{
"entropy": 1.497927661240101,
"epoch": 7.011642949547219,
"grad_norm": 0.7096975445747375,
"learning_rate": 7.920660767537901e-06,
"loss": 1.3894,
"mean_token_accuracy": 0.5761201746761799,
"num_tokens": 15542066.0,
"step": 5420
},
{
"entropy": 0.9780161440372467,
"epoch": 7.02457956015524,
"grad_norm": 0.9500054717063904,
"learning_rate": 7.717626589379789e-06,
"loss": 0.9513,
"mean_token_accuracy": 0.7568799629807472,
"num_tokens": 15575551.0,
"step": 5430
},
{
"entropy": 1.169414332509041,
"epoch": 7.03751617076326,
"grad_norm": 1.0309356451034546,
"learning_rate": 7.517124066589909e-06,
"loss": 1.1411,
"mean_token_accuracy": 0.711452366411686,
"num_tokens": 15599584.0,
"step": 5440
},
{
"entropy": 1.7210813522338868,
"epoch": 7.0504527813712805,
"grad_norm": 0.0,
"learning_rate": 7.319158699712669e-06,
"loss": 1.3323,
"mean_token_accuracy": 0.5859084717929364,
"num_tokens": 15608747.0,
"step": 5450
},
{
"entropy": 1.6397013187408447,
"epoch": 7.063389391979301,
"grad_norm": 0.0,
"learning_rate": 7.12373591968859e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 15609387.0,
"step": 5460
},
{
"entropy": 1.5039527043700218,
"epoch": 7.076326002587322,
"grad_norm": 0.772226870059967,
"learning_rate": 6.930861087705398e-06,
"loss": 1.3666,
"mean_token_accuracy": 0.5798796579241753,
"num_tokens": 15685497.0,
"step": 5470
},
{
"entropy": 0.9571346640586853,
"epoch": 7.089262613195343,
"grad_norm": 0.9899272918701172,
"learning_rate": 6.7405394950510345e-06,
"loss": 0.9525,
"mean_token_accuracy": 0.7557973235845565,
"num_tokens": 15718968.0,
"step": 5480
},
{
"entropy": 1.1604458332061767,
"epoch": 7.102199223803363,
"grad_norm": 1.072095513343811,
"learning_rate": 6.552776362968271e-06,
"loss": 1.1571,
"mean_token_accuracy": 0.7090446025133132,
"num_tokens": 15742748.0,
"step": 5490
},
{
"entropy": 1.7930972754955292,
"epoch": 7.115135834411384,
"grad_norm": 0.0,
"learning_rate": 6.367576842511735e-06,
"loss": 1.3237,
"mean_token_accuracy": 0.5362849146127701,
"num_tokens": 15751803.0,
"step": 5500
},
{
"entropy": 1.7586050003767013,
"epoch": 7.128072445019405,
"grad_norm": 0.0,
"learning_rate": 6.184946014406412e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 15752443.0,
"step": 5510
},
{
"entropy": 1.5453996002674102,
"epoch": 7.141009055627426,
"grad_norm": 0.7360463738441467,
"learning_rate": 6.004888888908256e-06,
"loss": 1.4109,
"mean_token_accuracy": 0.5700584821403026,
"num_tokens": 15828985.0,
"step": 5520
},
{
"entropy": 0.9328926429152489,
"epoch": 7.153945666235447,
"grad_norm": 0.9283819794654846,
"learning_rate": 5.827410405666911e-06,
"loss": 0.9175,
"mean_token_accuracy": 0.7673766747117042,
"num_tokens": 15862356.0,
"step": 5530
},
{
"entropy": 1.1613366797566413,
"epoch": 7.166882276843467,
"grad_norm": 1.0261551141738892,
"learning_rate": 5.652515433590033e-06,
"loss": 1.1253,
"mean_token_accuracy": 0.7124258697032928,
"num_tokens": 15886367.0,
"step": 5540
},
{
"entropy": 1.7588330313563347,
"epoch": 7.179818887451487,
"grad_norm": 0.00023454829351976514,
"learning_rate": 5.480208770709771e-06,
"loss": 1.4039,
"mean_token_accuracy": 0.5946097061038017,
"num_tokens": 15896207.0,
"step": 5550
},
{
"epoch": 7.179818887451487,
"eval_entropy": 1.393599722794322,
"eval_loss": 1.2324310541152954,
"eval_mean_token_accuracy": 0.4852820281372514,
"eval_num_tokens": 15896207.0,
"eval_runtime": 245.6246,
"eval_samples_per_second": 22.376,
"eval_steps_per_second": 1.401,
"step": 5550
},
{
"entropy": 1.7070483982563018,
"epoch": 7.192755498059508,
"grad_norm": 0.0,
"learning_rate": 5.310495144051142e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 15896847.0,
"step": 5560
},
{
"entropy": 1.557031399011612,
"epoch": 7.205692108667529,
"grad_norm": 0.7289990186691284,
"learning_rate": 5.143379209502352e-06,
"loss": 1.4125,
"mean_token_accuracy": 0.5720368728041649,
"num_tokens": 15976815.0,
"step": 5570
},
{
"entropy": 0.9512620970606804,
"epoch": 7.21862871927555,
"grad_norm": 0.9174538254737854,
"learning_rate": 4.978865551687062e-06,
"loss": 0.9534,
"mean_token_accuracy": 0.7580740317702294,
"num_tokens": 16010900.0,
"step": 5580
},
{
"entropy": 1.172946660220623,
"epoch": 7.231565329883571,
"grad_norm": 1.0972976684570312,
"learning_rate": 4.8169586838386346e-06,
"loss": 1.1532,
"mean_token_accuracy": 0.7079381376504899,
"num_tokens": 16035361.0,
"step": 5590
},
{
"entropy": 1.6811116263270378,
"epoch": 7.244501940491591,
"grad_norm": 0.0,
"learning_rate": 4.657663047676264e-06,
"loss": 1.2139,
"mean_token_accuracy": 0.5401002943515778,
"num_tokens": 16044571.0,
"step": 5600
},
{
"entropy": 1.6898091644048692,
"epoch": 7.257438551099612,
"grad_norm": 0.0,
"learning_rate": 4.500983013283188e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 16045211.0,
"step": 5610
},
{
"entropy": 1.5436704397201537,
"epoch": 7.270375161707633,
"grad_norm": 0.6892314553260803,
"learning_rate": 4.34692287898677e-06,
"loss": 1.4148,
"mean_token_accuracy": 0.5717164523899555,
"num_tokens": 16122336.0,
"step": 5620
},
{
"entropy": 0.9374915182590484,
"epoch": 7.2833117723156535,
"grad_norm": 0.9667730927467346,
"learning_rate": 4.195486871240562e-06,
"loss": 0.9394,
"mean_token_accuracy": 0.7627643913030624,
"num_tokens": 16156408.0,
"step": 5630
},
{
"entropy": 1.1849497631192207,
"epoch": 7.296248382923674,
"grad_norm": 1.1908502578735352,
"learning_rate": 4.046679144508392e-06,
"loss": 1.142,
"mean_token_accuracy": 0.7130326569080353,
"num_tokens": 16180323.0,
"step": 5640
},
{
"entropy": 1.829011231660843,
"epoch": 7.309184993531694,
"grad_norm": 0.0,
"learning_rate": 3.900503781150366e-06,
"loss": 1.4914,
"mean_token_accuracy": 0.5614617101848125,
"num_tokens": 16189805.0,
"step": 5650
},
{
"entropy": 1.7375122755765915,
"epoch": 7.322121604139715,
"grad_norm": 0.0,
"learning_rate": 3.7569647913109243e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 16190445.0,
"step": 5660
},
{
"entropy": 1.5200837269425391,
"epoch": 7.335058214747736,
"grad_norm": 0.7430135607719421,
"learning_rate": 3.6160661128087025e-06,
"loss": 1.397,
"mean_token_accuracy": 0.57365105971694,
"num_tokens": 16268426.0,
"step": 5670
},
{
"entropy": 0.9276900395751,
"epoch": 7.347994825355757,
"grad_norm": 0.9390348792076111,
"learning_rate": 3.4778116110286473e-06,
"loss": 0.9249,
"mean_token_accuracy": 0.7620738327503205,
"num_tokens": 16302856.0,
"step": 5680
},
{
"entropy": 1.1663517013192177,
"epoch": 7.360931435963778,
"grad_norm": 1.0117005109786987,
"learning_rate": 3.34220507881593e-06,
"loss": 1.1293,
"mean_token_accuracy": 0.7132649436593056,
"num_tokens": 16327211.0,
"step": 5690
},
{
"entropy": 1.7131205320358276,
"epoch": 7.373868046571798,
"grad_norm": 0.0,
"learning_rate": 3.209250236371797e-06,
"loss": 1.3032,
"mean_token_accuracy": 0.5476110517978668,
"num_tokens": 16336179.0,
"step": 5700
},
{
"epoch": 7.373868046571798,
"eval_entropy": 1.4321047376061595,
"eval_loss": 1.2324743270874023,
"eval_mean_token_accuracy": 0.48222382652551626,
"eval_num_tokens": 16336179.0,
"eval_runtime": 242.208,
"eval_samples_per_second": 22.691,
"eval_steps_per_second": 1.42,
"step": 5700
},
{
"entropy": 1.744317215681076,
"epoch": 7.386804657179819,
"grad_norm": 0.0,
"learning_rate": 3.0789507311516864e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 16336819.0,
"step": 5710
},
{
"entropy": 1.5234818816184998,
"epoch": 7.39974126778784,
"grad_norm": 0.7303734421730042,
"learning_rate": 2.9513101377650175e-06,
"loss": 1.3797,
"mean_token_accuracy": 0.5752100251615048,
"num_tokens": 16404914.0,
"step": 5720
},
{
"entropy": 0.9265442088246345,
"epoch": 7.4126778783958605,
"grad_norm": 0.8770347237586975,
"learning_rate": 2.8263319578771485e-06,
"loss": 0.9069,
"mean_token_accuracy": 0.7680046275258064,
"num_tokens": 16439389.0,
"step": 5730
},
{
"entropy": 1.1830172911286354,
"epoch": 7.425614489003881,
"grad_norm": 1.0386770963668823,
"learning_rate": 2.704019620113407e-06,
"loss": 1.1733,
"mean_token_accuracy": 0.7056162416934967,
"num_tokens": 16464458.0,
"step": 5740
},
{
"entropy": 1.7503404572606087,
"epoch": 7.438551099611901,
"grad_norm": 1.7682623863220215,
"learning_rate": 2.584376479964945e-06,
"loss": 1.4882,
"mean_token_accuracy": 0.6309158280491829,
"num_tokens": 16475591.0,
"step": 5750
},
{
"entropy": 1.7254247039556503,
"epoch": 7.451487710219922,
"grad_norm": 0.0,
"learning_rate": 2.4674058196966663e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 16476231.0,
"step": 5760
},
{
"entropy": 1.5136717677116394,
"epoch": 7.464424320827943,
"grad_norm": 0.7473369240760803,
"learning_rate": 2.353110848257267e-06,
"loss": 1.3413,
"mean_token_accuracy": 0.5824255973100663,
"num_tokens": 16552018.0,
"step": 5770
},
{
"entropy": 0.9227760046720505,
"epoch": 7.477360931435964,
"grad_norm": 0.982836902141571,
"learning_rate": 2.241494701191127e-06,
"loss": 0.9069,
"mean_token_accuracy": 0.7623407855629921,
"num_tokens": 16586256.0,
"step": 5780
},
{
"entropy": 1.1348280161619186,
"epoch": 7.490297542043985,
"grad_norm": 1.1100831031799316,
"learning_rate": 2.1325604405523334e-06,
"loss": 1.1069,
"mean_token_accuracy": 0.7201577231287957,
"num_tokens": 16610709.0,
"step": 5790
},
{
"entropy": 1.771338665485382,
"epoch": 7.503234152652006,
"grad_norm": 0.0,
"learning_rate": 2.026311054820629e-06,
"loss": 1.411,
"mean_token_accuracy": 0.5635204806923866,
"num_tokens": 16620269.0,
"step": 5800
},
{
"entropy": 1.7322617769241333,
"epoch": 7.516170763260026,
"grad_norm": 0.0,
"learning_rate": 1.922749458819506e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 16620909.0,
"step": 5810
},
{
"entropy": 1.4817992717027664,
"epoch": 7.5291073738680465,
"grad_norm": 0.756270170211792,
"learning_rate": 1.8218784936361644e-06,
"loss": 1.353,
"mean_token_accuracy": 0.5787275157868862,
"num_tokens": 16690569.0,
"step": 5820
},
{
"entropy": 0.9674558937549591,
"epoch": 7.542043984476067,
"grad_norm": 0.8812004327774048,
"learning_rate": 1.7237009265436032e-06,
"loss": 0.9613,
"mean_token_accuracy": 0.7560465827584266,
"num_tokens": 16724649.0,
"step": 5830
},
{
"entropy": 1.1716067418456078,
"epoch": 7.554980595084088,
"grad_norm": 1.0925747156143188,
"learning_rate": 1.6282194509247063e-06,
"loss": 1.1436,
"mean_token_accuracy": 0.7135581076145172,
"num_tokens": 16749582.0,
"step": 5840
},
{
"entropy": 1.6912678241729737,
"epoch": 7.567917205692108,
"grad_norm": 1.6889742612838745,
"learning_rate": 1.5354366861983438e-06,
"loss": 1.5003,
"mean_token_accuracy": 0.6513200134038926,
"num_tokens": 16760847.0,
"step": 5850
},
{
"epoch": 7.567917205692108,
"eval_entropy": 1.4259126506919084,
"eval_loss": 1.2301470041275024,
"eval_mean_token_accuracy": 0.4896806857093822,
"eval_num_tokens": 16760847.0,
"eval_runtime": 246.4439,
"eval_samples_per_second": 22.301,
"eval_steps_per_second": 1.396,
"step": 5850
},
{
"entropy": 1.7190734058618546,
"epoch": 7.580853816300129,
"grad_norm": 0.0,
"learning_rate": 1.4453551777475094e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 16761487.0,
"step": 5860
},
{
"entropy": 1.5320835530757904,
"epoch": 7.59379042690815,
"grad_norm": 0.7591171264648438,
"learning_rate": 1.3579773968495191e-06,
"loss": 1.3913,
"mean_token_accuracy": 0.5738878205418587,
"num_tokens": 16833368.0,
"step": 5870
},
{
"entropy": 0.9359873235225677,
"epoch": 7.606727037516171,
"grad_norm": 0.9182559847831726,
"learning_rate": 1.2733057406081438e-06,
"loss": 0.9307,
"mean_token_accuracy": 0.7633048981428147,
"num_tokens": 16867272.0,
"step": 5880
},
{
"entropy": 1.1327362582087517,
"epoch": 7.619663648124192,
"grad_norm": 1.0494729280471802,
"learning_rate": 1.1913425318879511e-06,
"loss": 1.1095,
"mean_token_accuracy": 0.7176593467593193,
"num_tokens": 16892030.0,
"step": 5890
},
{
"entropy": 1.7231059432029725,
"epoch": 7.632600258732213,
"grad_norm": 0.0,
"learning_rate": 1.1120900192505e-06,
"loss": 1.3184,
"mean_token_accuracy": 0.5641655296087265,
"num_tokens": 16901989.0,
"step": 5900
},
{
"entropy": 1.7543556302785874,
"epoch": 7.645536869340233,
"grad_norm": 0.0,
"learning_rate": 1.0355503768926466e-06,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 16902629.0,
"step": 5910
},
{
"entropy": 1.4874625369906425,
"epoch": 7.6584734799482534,
"grad_norm": 0.7250481843948364,
"learning_rate": 9.61725704587002e-07,
"loss": 1.3483,
"mean_token_accuracy": 0.5808299452066421,
"num_tokens": 16975429.0,
"step": 5920
},
{
"entropy": 0.940713207423687,
"epoch": 7.671410090556274,
"grad_norm": 0.9228203296661377,
"learning_rate": 8.906180276242015e-07,
"loss": 0.9271,
"mean_token_accuracy": 0.760072472691536,
"num_tokens": 17009886.0,
"step": 5930
},
{
"entropy": 1.1436687961220742,
"epoch": 7.684346701164295,
"grad_norm": 1.0997246503829956,
"learning_rate": 8.22229296757393e-07,
"loss": 1.1408,
"mean_token_accuracy": 0.7107081711292267,
"num_tokens": 17034678.0,
"step": 5940
},
{
"entropy": 1.73554485142231,
"epoch": 7.697283311772315,
"grad_norm": 0.0,
"learning_rate": 7.565613881487687e-07,
"loss": 1.365,
"mean_token_accuracy": 0.5842878207564354,
"num_tokens": 17044424.0,
"step": 5950
},
{
"entropy": 1.7472249418497086,
"epoch": 7.710219922380336,
"grad_norm": 0.0,
"learning_rate": 6.936161033180066e-07,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 17045064.0,
"step": 5960
},
{
"entropy": 1.5308178260922432,
"epoch": 7.723156532988357,
"grad_norm": 0.7369622588157654,
"learning_rate": 6.333951690929318e-07,
"loss": 1.3944,
"mean_token_accuracy": 0.571716184169054,
"num_tokens": 17121301.0,
"step": 5970
},
{
"entropy": 0.9166033461689949,
"epoch": 7.736093143596378,
"grad_norm": 0.8718245625495911,
"learning_rate": 5.759002375620548e-07,
"loss": 0.9191,
"mean_token_accuracy": 0.7659956023097039,
"num_tokens": 17155878.0,
"step": 5980
},
{
"entropy": 1.1351210102438927,
"epoch": 7.749029754204399,
"grad_norm": 1.1139835119247437,
"learning_rate": 5.211328860293519e-07,
"loss": 1.0937,
"mean_token_accuracy": 0.7179104581475257,
"num_tokens": 17180817.0,
"step": 5990
},
{
"entropy": 1.7042000949382783,
"epoch": 7.7619663648124195,
"grad_norm": 0.0,
"learning_rate": 4.6909461697088874e-07,
"loss": 1.2978,
"mean_token_accuracy": 0.5402273468673229,
"num_tokens": 17190238.0,
"step": 6000
},
{
"epoch": 7.7619663648124195,
"eval_entropy": 1.4222364893486334,
"eval_loss": 1.230813980102539,
"eval_mean_token_accuracy": 0.483534776973863,
"eval_num_tokens": 17190238.0,
"eval_runtime": 243.8499,
"eval_samples_per_second": 22.538,
"eval_steps_per_second": 1.411,
"step": 6000
},
{
"entropy": 1.7714763969182967,
"epoch": 7.7749029754204395,
"grad_norm": 0.0,
"learning_rate": 4.197868579936981e-07,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 17190878.0,
"step": 6010
},
{
"entropy": 1.5036073312163354,
"epoch": 7.78783958602846,
"grad_norm": 0.7586703896522522,
"learning_rate": 3.732109617965218e-07,
"loss": 1.3917,
"mean_token_accuracy": 0.5730986759066582,
"num_tokens": 17262910.0,
"step": 6020
},
{
"entropy": 0.9327082589268685,
"epoch": 7.800776196636481,
"grad_norm": 0.8685732483863831,
"learning_rate": 3.293682061327963e-07,
"loss": 0.9333,
"mean_token_accuracy": 0.7620440036058426,
"num_tokens": 17296857.0,
"step": 6030
},
{
"entropy": 1.177341391146183,
"epoch": 7.813712807244502,
"grad_norm": 1.1222566366195679,
"learning_rate": 2.882597937755249e-07,
"loss": 1.1641,
"mean_token_accuracy": 0.7064913615584374,
"num_tokens": 17321218.0,
"step": 6040
},
{
"entropy": 1.7008673965930938,
"epoch": 7.826649417852523,
"grad_norm": 0.0,
"learning_rate": 2.498868524843045e-07,
"loss": 1.2135,
"mean_token_accuracy": 0.5372394770383835,
"num_tokens": 17329684.0,
"step": 6050
},
{
"entropy": 1.7468272864818573,
"epoch": 7.839586028460543,
"grad_norm": 0.0,
"learning_rate": 2.1425043497439456e-07,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 17330324.0,
"step": 6060
},
{
"entropy": 1.5415200561285018,
"epoch": 7.852522639068564,
"grad_norm": 0.7708677649497986,
"learning_rate": 1.8135151888782899e-07,
"loss": 1.3837,
"mean_token_accuracy": 0.574844229221344,
"num_tokens": 17408721.0,
"step": 6070
},
{
"entropy": 0.9075698807835579,
"epoch": 7.865459249676585,
"grad_norm": 0.8989212512969971,
"learning_rate": 1.5119100676662667e-07,
"loss": 0.8899,
"mean_token_accuracy": 0.771544449031353,
"num_tokens": 17442757.0,
"step": 6080
},
{
"entropy": 1.1743381530046464,
"epoch": 7.878395860284606,
"grad_norm": 1.025661826133728,
"learning_rate": 1.2376972602795578e-07,
"loss": 1.1425,
"mean_token_accuracy": 0.7124027162790298,
"num_tokens": 17467049.0,
"step": 6090
},
{
"entropy": 1.7484049052000046,
"epoch": 7.8913324708926265,
"grad_norm": 0.0,
"learning_rate": 9.908842894151837e-08,
"loss": 1.3114,
"mean_token_accuracy": 0.5641379207372665,
"num_tokens": 17475616.0,
"step": 6100
},
{
"entropy": 1.7715317398309707,
"epoch": 7.904269081500646,
"grad_norm": 0.0,
"learning_rate": 7.714779260886707e-08,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 17476256.0,
"step": 6110
},
{
"entropy": 1.5000860661268234,
"epoch": 7.917205692108667,
"grad_norm": 0.7349119186401367,
"learning_rate": 5.7948418944842043e-08,
"loss": 1.3548,
"mean_token_accuracy": 0.5794984824955464,
"num_tokens": 17546950.0,
"step": 6120
},
{
"entropy": 0.9138670772314071,
"epoch": 7.930142302716688,
"grad_norm": 0.8542500138282776,
"learning_rate": 4.149083466105097e-08,
"loss": 0.9021,
"mean_token_accuracy": 0.770347698032856,
"num_tokens": 17581293.0,
"step": 6130
},
{
"entropy": 1.1947215780615807,
"epoch": 7.943078913324709,
"grad_norm": 1.0435749292373657,
"learning_rate": 2.7775491251413877e-08,
"loss": 1.1687,
"mean_token_accuracy": 0.7094842702150345,
"num_tokens": 17605803.0,
"step": 6140
},
{
"entropy": 1.6835207402706147,
"epoch": 7.95601552393273,
"grad_norm": 0.0,
"learning_rate": 1.6802764979817474e-08,
"loss": 1.1704,
"mean_token_accuracy": 0.5183229476213456,
"num_tokens": 17613695.0,
"step": 6150
},
{
"epoch": 7.95601552393273,
"eval_entropy": 1.4208284545429917,
"eval_loss": 1.2304351329803467,
"eval_mean_token_accuracy": 0.4861882030097551,
"eval_num_tokens": 17613695.0,
"eval_runtime": 244.9318,
"eval_samples_per_second": 22.439,
"eval_steps_per_second": 1.404,
"step": 6150
},
{
"entropy": 1.7820782691240311,
"epoch": 7.96895213454075,
"grad_norm": 0.0,
"learning_rate": 8.572956869734583e-09,
"loss": 0.0,
"mean_token_accuracy": 0.0,
"num_tokens": 17614335.0,
"step": 6160
},
{
"entropy": 1.2753556087613105,
"epoch": 7.981888745148771,
"grad_norm": 0.9358561635017395,
"learning_rate": 3.0862926959973617e-09,
"loss": 1.1173,
"mean_token_accuracy": 0.6308311700820923,
"num_tokens": 17667096.0,
"step": 6170
},
{
"entropy": 1.4832376271486283,
"epoch": 7.994825355756792,
"grad_norm": 0.0,
"learning_rate": 3.429229786133803e-10,
"loss": 1.055,
"mean_token_accuracy": 0.5700831845402717,
"num_tokens": 17681630.0,
"step": 6180
}
],
"logging_steps": 10,
"max_steps": 6184,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 600,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.974075450217726e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}