{ "best_global_step": 5400, "best_metric": 1.2261559963226318, "best_model_checkpoint": "./results-3/checkpoint-5400", "epoch": 8.0, "eval_steps": 150, "global_step": 6184, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.4760531455278396, "epoch": 0.0129366106080207, "grad_norm": 1.3410229682922363, "learning_rate": 9.67741935483871e-06, "loss": 3.8342, "mean_token_accuracy": 0.40634620636701585, "num_tokens": 77854.0, "step": 10 }, { "entropy": 1.4689971387386322, "epoch": 0.0258732212160414, "grad_norm": 1.4104728698730469, "learning_rate": 2.0430107526881722e-05, "loss": 4.4137, "mean_token_accuracy": 0.3765578977763653, "num_tokens": 111064.0, "step": 20 }, { "entropy": 1.893897533416748, "epoch": 0.03880983182406209, "grad_norm": 0.8629273772239685, "learning_rate": 3.118279569892473e-05, "loss": 3.8151, "mean_token_accuracy": 0.38278606086969375, "num_tokens": 134712.0, "step": 30 }, { "entropy": 4.312886017560959, "epoch": 0.0517464424320828, "grad_norm": 0.0, "learning_rate": 4.1935483870967746e-05, "loss": 3.7735, "mean_token_accuracy": 0.19467806722968817, "num_tokens": 142734.0, "step": 40 }, { "entropy": 8.096190857887269, "epoch": 0.0646830530401035, "grad_norm": 0.0, "learning_rate": 5.268817204301075e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 143374.0, "step": 50 }, { "entropy": 2.519210198521614, "epoch": 0.07761966364812418, "grad_norm": 0.46956390142440796, "learning_rate": 6.344086021505376e-05, "loss": 2.7759, "mean_token_accuracy": 0.4458329685032368, "num_tokens": 218138.0, "step": 60 }, { "entropy": 2.7062919318675993, "epoch": 0.09055627425614489, "grad_norm": 0.36261770129203796, "learning_rate": 7.419354838709677e-05, "loss": 2.5766, "mean_token_accuracy": 0.4825271964073181, "num_tokens": 250316.0, "step": 70 }, { "entropy": 2.5266534447669984, "epoch": 0.1034928848641656, "grad_norm": 0.39197003841400146, "learning_rate": 8.494623655913979e-05, "loss": 2.5861, "mean_token_accuracy": 0.47026830837130545, "num_tokens": 272857.0, "step": 80 }, { "entropy": 4.64949648976326, "epoch": 0.11642949547218628, "grad_norm": 0.0, "learning_rate": 9.56989247311828e-05, "loss": 2.5449, "mean_token_accuracy": 0.20907760383561252, "num_tokens": 279057.0, "step": 90 }, { "entropy": 6.761381912231445, "epoch": 0.129366106080207, "grad_norm": 0.0, "learning_rate": 0.0001064516129032258, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 279697.0, "step": 100 }, { "entropy": 2.331750747561455, "epoch": 0.1423027166882277, "grad_norm": 0.4349558353424072, "learning_rate": 0.00011720430107526883, "loss": 2.4607, "mean_token_accuracy": 0.4927462741732597, "num_tokens": 358859.0, "step": 110 }, { "entropy": 1.976218768954277, "epoch": 0.15523932729624837, "grad_norm": 0.24631856381893158, "learning_rate": 0.00012795698924731184, "loss": 2.038, "mean_token_accuracy": 0.564648849517107, "num_tokens": 391721.0, "step": 120 }, { "entropy": 2.3566002756357194, "epoch": 0.16817593790426907, "grad_norm": 0.33470404148101807, "learning_rate": 0.00013870967741935487, "loss": 2.3135, "mean_token_accuracy": 0.5072783440351486, "num_tokens": 415274.0, "step": 130 }, { "entropy": 3.8850305318832397, "epoch": 0.18111254851228978, "grad_norm": 0.0, "learning_rate": 0.00014946236559139787, "loss": 2.4748, "mean_token_accuracy": 0.29179108552634714, "num_tokens": 423127.0, "step": 140 }, { "entropy": 6.345981705188751, "epoch": 0.19404915912031048, "grad_norm": 0.0, "learning_rate": 0.00016021505376344087, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 423767.0, "step": 150 }, { "epoch": 0.19404915912031048, "eval_entropy": 3.4076465337082396, "eval_loss": 2.088292360305786, "eval_mean_token_accuracy": 0.3316028483731802, "eval_num_tokens": 423767.0, "eval_runtime": 243.9108, "eval_samples_per_second": 22.533, "eval_steps_per_second": 1.41, "step": 150 }, { "entropy": 2.2055542409420013, "epoch": 0.2069857697283312, "grad_norm": 0.31700077652931213, "learning_rate": 0.0001709677419354839, "loss": 2.4005, "mean_token_accuracy": 0.5007682546973229, "num_tokens": 500625.0, "step": 160 }, { "entropy": 1.8786041021347046, "epoch": 0.21992238033635186, "grad_norm": 0.24800752103328705, "learning_rate": 0.0001817204301075269, "loss": 1.8474, "mean_token_accuracy": 0.5935635283589363, "num_tokens": 534396.0, "step": 170 }, { "entropy": 2.263536959886551, "epoch": 0.23285899094437257, "grad_norm": 0.3183101415634155, "learning_rate": 0.00019247311827956992, "loss": 2.2154, "mean_token_accuracy": 0.518243944644928, "num_tokens": 558685.0, "step": 180 }, { "entropy": 4.052780479192734, "epoch": 0.24579560155239327, "grad_norm": 0.0, "learning_rate": 0.00019999987654768255, "loss": 2.3652, "mean_token_accuracy": 0.32749315425753595, "num_tokens": 566987.0, "step": 190 }, { "entropy": 4.421183264255523, "epoch": 0.258732212160414, "grad_norm": 0.0, "learning_rate": 0.0001999976818482961, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 567627.0, "step": 200 }, { "entropy": 2.0365082800388334, "epoch": 0.2716688227684347, "grad_norm": 0.2679975628852844, "learning_rate": 0.00019999274383338027, "loss": 2.1862, "mean_token_accuracy": 0.5347613260149956, "num_tokens": 644352.0, "step": 210 }, { "entropy": 1.8313011974096298, "epoch": 0.2846054333764554, "grad_norm": 0.2597528398036957, "learning_rate": 0.00019998506263840354, "loss": 1.8579, "mean_token_accuracy": 0.5869012281298638, "num_tokens": 676791.0, "step": 220 }, { "entropy": 2.229961010813713, "epoch": 0.2975420439844761, "grad_norm": 0.39198312163352966, "learning_rate": 0.00019997463847409023, "loss": 2.2158, "mean_token_accuracy": 0.5119729146361351, "num_tokens": 699604.0, "step": 230 }, { "entropy": 3.5435027480125427, "epoch": 0.31047865459249674, "grad_norm": 0.0, "learning_rate": 0.00019996147162641464, "loss": 2.2309, "mean_token_accuracy": 0.31516757532954215, "num_tokens": 706414.0, "step": 240 }, { "entropy": 3.784550839662552, "epoch": 0.32341526520051744, "grad_norm": 0.0, "learning_rate": 0.00019994556245659338, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 707054.0, "step": 250 }, { "entropy": 2.086453899741173, "epoch": 0.33635187580853815, "grad_norm": 0.2695913314819336, "learning_rate": 0.00019992691140107525, "loss": 2.2688, "mean_token_accuracy": 0.5183561690151691, "num_tokens": 787476.0, "step": 260 }, { "entropy": 1.80660640001297, "epoch": 0.34928848641655885, "grad_norm": 0.2775532603263855, "learning_rate": 0.0001999055189715294, "loss": 1.855, "mean_token_accuracy": 0.5896616145968437, "num_tokens": 820945.0, "step": 270 }, { "entropy": 2.265737462043762, "epoch": 0.36222509702457956, "grad_norm": 0.35880544781684875, "learning_rate": 0.0001998813857548313, "loss": 2.1884, "mean_token_accuracy": 0.5160560064017773, "num_tokens": 844570.0, "step": 280 }, { "entropy": 3.490731942653656, "epoch": 0.37516170763260026, "grad_norm": 0.0, "learning_rate": 0.0001998545124130466, "loss": 2.196, "mean_token_accuracy": 0.3669252373278141, "num_tokens": 852461.0, "step": 290 }, { "entropy": 3.8365337908267976, "epoch": 0.38809831824062097, "grad_norm": 0.0, "learning_rate": 0.00019982489968341292, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 853101.0, "step": 300 }, { "epoch": 0.38809831824062097, "eval_entropy": 2.108367764672568, "eval_loss": 1.785624623298645, "eval_mean_token_accuracy": 0.3863564946277197, "eval_num_tokens": 853101.0, "eval_runtime": 244.4512, "eval_samples_per_second": 22.483, "eval_steps_per_second": 1.407, "step": 300 }, { "entropy": 2.0010604202747344, "epoch": 0.40103492884864167, "grad_norm": 0.26067453622817993, "learning_rate": 0.00019979254837831976, "loss": 2.1888, "mean_token_accuracy": 0.527290866523981, "num_tokens": 932233.0, "step": 310 }, { "entropy": 1.8096002161502838, "epoch": 0.4139715394566624, "grad_norm": 0.3278159201145172, "learning_rate": 0.00019975745938528597, "loss": 1.8032, "mean_token_accuracy": 0.5965773060917854, "num_tokens": 965240.0, "step": 320 }, { "entropy": 2.239218121767044, "epoch": 0.4269081500646831, "grad_norm": 0.3497501611709595, "learning_rate": 0.00019971963366693574, "loss": 2.1853, "mean_token_accuracy": 0.5204933404922485, "num_tokens": 988836.0, "step": 330 }, { "entropy": 3.591762775182724, "epoch": 0.4398447606727037, "grad_norm": 0.0, "learning_rate": 0.0001996790722609719, "loss": 2.0384, "mean_token_accuracy": 0.3091650754213333, "num_tokens": 995598.0, "step": 340 }, { "entropy": 1.7911852180957795, "epoch": 0.45278137128072443, "grad_norm": 0.0, "learning_rate": 0.00019963577628014757, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 996238.0, "step": 350 }, { "entropy": 1.9936166375875473, "epoch": 0.46571798188874514, "grad_norm": 0.2826139032840729, "learning_rate": 0.00019958974691223572, "loss": 2.1339, "mean_token_accuracy": 0.5367397539317608, "num_tokens": 1068779.0, "step": 360 }, { "entropy": 1.7499752998352052, "epoch": 0.47865459249676584, "grad_norm": 0.25705066323280334, "learning_rate": 0.00019954098541999634, "loss": 1.7626, "mean_token_accuracy": 0.6045101627707481, "num_tokens": 1101822.0, "step": 370 }, { "entropy": 2.2398334205150605, "epoch": 0.49159120310478654, "grad_norm": 0.35060882568359375, "learning_rate": 0.00019948949314114208, "loss": 2.1407, "mean_token_accuracy": 0.5221379362046719, "num_tokens": 1125242.0, "step": 380 }, { "entropy": 3.20022537112236, "epoch": 0.5045278137128072, "grad_norm": 0.0, "learning_rate": 0.00019943527148830138, "loss": 2.1867, "mean_token_accuracy": 0.3573383778333664, "num_tokens": 1132694.0, "step": 390 }, { "entropy": 3.233865666389465, "epoch": 0.517464424320828, "grad_norm": 0.0, "learning_rate": 0.00019937832194897968, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 1133334.0, "step": 400 }, { "entropy": 1.883677563071251, "epoch": 0.5304010349288486, "grad_norm": 0.253384530544281, "learning_rate": 0.00019931864608551886, "loss": 2.065, "mean_token_accuracy": 0.5480175256729126, "num_tokens": 1208651.0, "step": 410 }, { "entropy": 1.8230630427598953, "epoch": 0.5433376455368694, "grad_norm": 0.27244824171066284, "learning_rate": 0.000199256245535054, "loss": 1.7993, "mean_token_accuracy": 0.5971413522958755, "num_tokens": 1241633.0, "step": 420 }, { "entropy": 2.1840337038040163, "epoch": 0.55627425614489, "grad_norm": 0.33489564061164856, "learning_rate": 0.00019919112200946878, "loss": 2.1355, "mean_token_accuracy": 0.523309488594532, "num_tokens": 1265245.0, "step": 430 }, { "entropy": 3.2613951563835144, "epoch": 0.5692108667529108, "grad_norm": 0.0, "learning_rate": 0.0001991232772953485, "loss": 2.0666, "mean_token_accuracy": 0.36050624772906303, "num_tokens": 1272655.0, "step": 440 }, { "entropy": 2.055793708562851, "epoch": 0.5821474773609314, "grad_norm": 0.0, "learning_rate": 0.0001990527132539308, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 1273295.0, "step": 450 }, { "epoch": 0.5821474773609314, "eval_entropy": 1.5776830232420633, "eval_loss": 1.6587693691253662, "eval_mean_token_accuracy": 0.40149001534595047, "eval_num_tokens": 1273295.0, "eval_runtime": 245.22, "eval_samples_per_second": 22.413, "eval_steps_per_second": 1.403, "step": 450 }, { "entropy": 1.9504007667303085, "epoch": 0.5950840879689522, "grad_norm": 0.2335178405046463, "learning_rate": 0.00019897943182105486, "loss": 2.1289, "mean_token_accuracy": 0.5388719126582145, "num_tokens": 1353662.0, "step": 460 }, { "entropy": 1.812851694226265, "epoch": 0.6080206985769728, "grad_norm": 0.27590492367744446, "learning_rate": 0.00019890343500710827, "loss": 1.79, "mean_token_accuracy": 0.5952848941087723, "num_tokens": 1386745.0, "step": 470 }, { "entropy": 2.1694509744644166, "epoch": 0.6209573091849935, "grad_norm": 0.36973315477371216, "learning_rate": 0.0001988247248969717, "loss": 2.1425, "mean_token_accuracy": 0.5235736042261123, "num_tokens": 1410114.0, "step": 480 }, { "entropy": 3.2446247756481172, "epoch": 0.6338939197930142, "grad_norm": 0.0, "learning_rate": 0.00019874330364996192, "loss": 2.0907, "mean_token_accuracy": 0.3589281477034092, "num_tokens": 1417385.0, "step": 490 }, { "entropy": 2.887257432937622, "epoch": 0.6468305304010349, "grad_norm": 0.0, "learning_rate": 0.00019865917349977242, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 1418025.0, "step": 500 }, { "entropy": 2.0031155347824097, "epoch": 0.6597671410090556, "grad_norm": 0.2290731519460678, "learning_rate": 0.00019857233675441217, "loss": 2.1288, "mean_token_accuracy": 0.5355072975158691, "num_tokens": 1498284.0, "step": 510 }, { "entropy": 1.7464266479015351, "epoch": 0.6727037516170763, "grad_norm": 0.27917975187301636, "learning_rate": 0.0001984827957961423, "loss": 1.7213, "mean_token_accuracy": 0.6062818467617035, "num_tokens": 1531645.0, "step": 520 }, { "entropy": 2.099287986755371, "epoch": 0.685640362225097, "grad_norm": 0.34847304224967957, "learning_rate": 0.00019839055308141078, "loss": 2.0957, "mean_token_accuracy": 0.5292750775814057, "num_tokens": 1555744.0, "step": 530 }, { "entropy": 2.987301951646805, "epoch": 0.6985769728331177, "grad_norm": 0.0, "learning_rate": 0.00019829561114078503, "loss": 2.035, "mean_token_accuracy": 0.35071768537163733, "num_tokens": 1563621.0, "step": 540 }, { "entropy": 1.7990713268518448, "epoch": 0.7115135834411385, "grad_norm": 0.0, "learning_rate": 0.00019819797257888237, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 1564261.0, "step": 550 }, { "entropy": 1.9829886645078658, "epoch": 0.7244501940491591, "grad_norm": 0.23086819052696228, "learning_rate": 0.00019809764007429874, "loss": 2.0682, "mean_token_accuracy": 0.546464990824461, "num_tokens": 1645469.0, "step": 560 }, { "entropy": 1.742349737882614, "epoch": 0.7373868046571799, "grad_norm": 0.2855489253997803, "learning_rate": 0.00019799461637953517, "loss": 1.7437, "mean_token_accuracy": 0.6023638218641281, "num_tokens": 1678187.0, "step": 570 }, { "entropy": 2.0789969861507416, "epoch": 0.7503234152652005, "grad_norm": 0.3439568877220154, "learning_rate": 0.00019788890432092211, "loss": 2.0849, "mean_token_accuracy": 0.5323359861969947, "num_tokens": 1701620.0, "step": 580 }, { "entropy": 3.068958950042725, "epoch": 0.7632600258732212, "grad_norm": 0.0, "learning_rate": 0.0001977805067985422, "loss": 2.0752, "mean_token_accuracy": 0.34963107854127884, "num_tokens": 1709495.0, "step": 590 }, { "entropy": 1.8244159191846847, "epoch": 0.7761966364812419, "grad_norm": 0.0, "learning_rate": 0.00019766942678615035, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 1710135.0, "step": 600 }, { "epoch": 0.7761966364812419, "eval_entropy": 1.7642724643959555, "eval_loss": 1.5706199407577515, "eval_mean_token_accuracy": 0.41619762856253356, "eval_num_tokens": 1710135.0, "eval_runtime": 241.9763, "eval_samples_per_second": 22.713, "eval_steps_per_second": 1.422, "step": 600 }, { "entropy": 2.0163265824317933, "epoch": 0.7891332470892626, "grad_norm": 0.21755698323249817, "learning_rate": 0.00019755566733109251, "loss": 2.083, "mean_token_accuracy": 0.5411292694509029, "num_tokens": 1791443.0, "step": 610 }, { "entropy": 1.7245848059654236, "epoch": 0.8020698576972833, "grad_norm": 0.288361519575119, "learning_rate": 0.0001974392315542218, "loss": 1.735, "mean_token_accuracy": 0.6052085891366005, "num_tokens": 1824564.0, "step": 620 }, { "entropy": 2.11205150783062, "epoch": 0.815006468305304, "grad_norm": 0.3383215069770813, "learning_rate": 0.000197320122649813, "loss": 2.1082, "mean_token_accuracy": 0.5229554586112499, "num_tokens": 1847974.0, "step": 630 }, { "entropy": 3.1667094111442564, "epoch": 0.8279430789133247, "grad_norm": 0.0, "learning_rate": 0.000197198343885475, "loss": 2.0386, "mean_token_accuracy": 0.3788307599723339, "num_tokens": 1855343.0, "step": 640 }, { "entropy": 4.338292050361633, "epoch": 0.8408796895213454, "grad_norm": 0.0, "learning_rate": 0.00019707389860206087, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 1855983.0, "step": 650 }, { "entropy": 2.0194253027439117, "epoch": 0.8538163001293662, "grad_norm": 0.2260085493326187, "learning_rate": 0.00019694679021357666, "loss": 2.0757, "mean_token_accuracy": 0.5414572946727276, "num_tokens": 1933686.0, "step": 660 }, { "entropy": 1.7457041829824447, "epoch": 0.8667529107373868, "grad_norm": 0.2763752341270447, "learning_rate": 0.00019681702220708725, "loss": 1.7265, "mean_token_accuracy": 0.6072784595191478, "num_tokens": 1967008.0, "step": 670 }, { "entropy": 2.0987232238054276, "epoch": 0.8796895213454075, "grad_norm": 0.3309071958065033, "learning_rate": 0.00019668459814262116, "loss": 2.0841, "mean_token_accuracy": 0.5245410539209843, "num_tokens": 1990659.0, "step": 680 }, { "entropy": 3.1821718513965607, "epoch": 0.8926261319534282, "grad_norm": 0.0, "learning_rate": 0.00019654952165307245, "loss": 2.229, "mean_token_accuracy": 0.3981798455119133, "num_tokens": 1999251.0, "step": 690 }, { "entropy": 1.533292955160141, "epoch": 0.9055627425614489, "grad_norm": 0.0, "learning_rate": 0.00019641179644410136, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 1999891.0, "step": 700 }, { "entropy": 1.9957170754671096, "epoch": 0.9184993531694696, "grad_norm": 0.24394062161445618, "learning_rate": 0.00019627142629403258, "loss": 2.0975, "mean_token_accuracy": 0.5407429985702038, "num_tokens": 2079895.0, "step": 710 }, { "entropy": 1.7518584847450256, "epoch": 0.9314359637774903, "grad_norm": 0.307822048664093, "learning_rate": 0.00019612841505375138, "loss": 1.7164, "mean_token_accuracy": 0.610467329621315, "num_tokens": 2113509.0, "step": 720 }, { "entropy": 2.1020208179950712, "epoch": 0.944372574385511, "grad_norm": 0.35130032896995544, "learning_rate": 0.0001959827666465984, "loss": 2.1253, "mean_token_accuracy": 0.5220636121928692, "num_tokens": 2137129.0, "step": 730 }, { "entropy": 2.8922512531280518, "epoch": 0.9573091849935317, "grad_norm": 0.0, "learning_rate": 0.00019583448506826155, "loss": 1.9805, "mean_token_accuracy": 0.3766542553901672, "num_tokens": 2144488.0, "step": 740 }, { "entropy": 2.719996190071106, "epoch": 0.9702457956015524, "grad_norm": 0.0, "learning_rate": 0.00019568357438666675, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 2145128.0, "step": 750 }, { "epoch": 0.9702457956015524, "eval_entropy": 2.1529439126336296, "eval_loss": 1.5584267377853394, "eval_mean_token_accuracy": 0.41127229698522144, "eval_num_tokens": 2145128.0, "eval_runtime": 239.9653, "eval_samples_per_second": 22.903, "eval_steps_per_second": 1.434, "step": 750 }, { "entropy": 1.9202934563159944, "epoch": 0.9831824062095731, "grad_norm": 0.28597503900527954, "learning_rate": 0.00019553003874186607, "loss": 1.9302, "mean_token_accuracy": 0.5697067268192768, "num_tokens": 2197523.0, "step": 760 }, { "entropy": 2.492697748541832, "epoch": 0.9961190168175937, "grad_norm": 0.0, "learning_rate": 0.00019537388234592442, "loss": 1.81, "mean_token_accuracy": 0.39367630481719973, "num_tokens": 2210056.0, "step": 770 }, { "entropy": 2.2332220911979674, "epoch": 1.0090556274256144, "grad_norm": 0.24866575002670288, "learning_rate": 0.00019521510948280373, "loss": 1.5005, "mean_token_accuracy": 0.36937303096055984, "num_tokens": 2275252.0, "step": 780 }, { "entropy": 1.6707671225070952, "epoch": 1.0219922380336353, "grad_norm": 0.26215294003486633, "learning_rate": 0.0001950537245082456, "loss": 1.6341, "mean_token_accuracy": 0.6254087015986443, "num_tokens": 2311716.0, "step": 790 }, { "entropy": 1.8872807383537293, "epoch": 1.034928848641656, "grad_norm": 0.36441880464553833, "learning_rate": 0.0001948897318496517, "loss": 1.8977, "mean_token_accuracy": 0.5622286461293697, "num_tokens": 2338280.0, "step": 800 }, { "entropy": 2.630810996890068, "epoch": 1.0478654592496766, "grad_norm": 0.9684458374977112, "learning_rate": 0.0001947231360059624, "loss": 2.4046, "mean_token_accuracy": 0.48553739935159684, "num_tokens": 2351659.0, "step": 810 }, { "entropy": 2.581965911388397, "epoch": 1.0608020698576972, "grad_norm": 0.0, "learning_rate": 0.0001945539415475333, "loss": 0.1693, "mean_token_accuracy": 0.06434160768985749, "num_tokens": 2352447.0, "step": 820 }, { "entropy": 2.1218140482902528, "epoch": 1.073738680465718, "grad_norm": 0.28001680970191956, "learning_rate": 0.00019438215311600989, "loss": 1.5396, "mean_token_accuracy": 0.3639061972498894, "num_tokens": 2421672.0, "step": 830 }, { "entropy": 1.6590570658445358, "epoch": 1.0866752910737387, "grad_norm": 0.27536195516586304, "learning_rate": 0.0001942077754242001, "loss": 1.5986, "mean_token_accuracy": 0.6285051852464676, "num_tokens": 2458016.0, "step": 840 }, { "entropy": 1.8610892415046691, "epoch": 1.0996119016817594, "grad_norm": 0.3670406937599182, "learning_rate": 0.00019403081325594516, "loss": 1.8678, "mean_token_accuracy": 0.5674182385206222, "num_tokens": 2484503.0, "step": 850 }, { "entropy": 2.5339868366718292, "epoch": 1.11254851228978, "grad_norm": 0.911289393901825, "learning_rate": 0.0001938512714659882, "loss": 2.3594, "mean_token_accuracy": 0.49951401725411415, "num_tokens": 2498485.0, "step": 860 }, { "entropy": 1.802379448711872, "epoch": 1.1254851228978007, "grad_norm": 0.0, "learning_rate": 0.00019366915497984126, "loss": 0.1255, "mean_token_accuracy": 0.04691708832979202, "num_tokens": 2499204.0, "step": 870 }, { "entropy": 1.716230283677578, "epoch": 1.1384217335058215, "grad_norm": 0.29532766342163086, "learning_rate": 0.00019348446879364998, "loss": 1.5067, "mean_token_accuracy": 0.3694909021258354, "num_tokens": 2567621.0, "step": 880 }, { "entropy": 1.6236608117818832, "epoch": 1.1513583441138422, "grad_norm": 0.29713669419288635, "learning_rate": 0.00019329721797405665, "loss": 1.5861, "mean_token_accuracy": 0.6327742949128151, "num_tokens": 2603962.0, "step": 890 }, { "entropy": 1.8917641669511795, "epoch": 1.1642949547218628, "grad_norm": 0.3658815324306488, "learning_rate": 0.00019310740765806112, "loss": 1.9243, "mean_token_accuracy": 0.5606695532798767, "num_tokens": 2630252.0, "step": 900 }, { "epoch": 1.1642949547218628, "eval_entropy": 1.9112664786882179, "eval_loss": 1.4807052612304688, "eval_mean_token_accuracy": 0.42875484818982523, "eval_num_tokens": 2630252.0, "eval_runtime": 244.6093, "eval_samples_per_second": 22.468, "eval_steps_per_second": 1.406, "step": 900 }, { "entropy": 2.6821564227342605, "epoch": 1.1772315653298835, "grad_norm": 1.0140999555587769, "learning_rate": 0.00019291504305288005, "loss": 2.4338, "mean_token_accuracy": 0.482094044983387, "num_tokens": 2643300.0, "step": 910 }, { "entropy": 2.024871030449867, "epoch": 1.1901681759379044, "grad_norm": 0.0, "learning_rate": 0.00019272012943580383, "loss": 0.088, "mean_token_accuracy": 0.05487980842590332, "num_tokens": 2644037.0, "step": 920 }, { "entropy": 1.9695144146680832, "epoch": 1.203104786545925, "grad_norm": 0.290670245885849, "learning_rate": 0.00019252267215405188, "loss": 1.523, "mean_token_accuracy": 0.36803208142518995, "num_tokens": 2711455.0, "step": 930 }, { "entropy": 1.634880828857422, "epoch": 1.2160413971539457, "grad_norm": 0.2892841100692749, "learning_rate": 0.00019232267662462618, "loss": 1.5725, "mean_token_accuracy": 0.6363927751779557, "num_tokens": 2747178.0, "step": 940 }, { "entropy": 1.8903283953666687, "epoch": 1.2289780077619663, "grad_norm": 0.3681142330169678, "learning_rate": 0.00019212014833416222, "loss": 1.9128, "mean_token_accuracy": 0.5572593852877616, "num_tokens": 2773302.0, "step": 950 }, { "entropy": 2.5646925628185273, "epoch": 1.2419146183699872, "grad_norm": 2.999826669692993, "learning_rate": 0.00019191509283877892, "loss": 2.3972, "mean_token_accuracy": 0.49176110327243805, "num_tokens": 2787000.0, "step": 960 }, { "entropy": 2.151153501868248, "epoch": 1.2548512289780078, "grad_norm": 0.0, "learning_rate": 0.00019170751576392587, "loss": 0.1193, "mean_token_accuracy": 0.044841271638870236, "num_tokens": 2787722.0, "step": 970 }, { "entropy": 1.8522070705890656, "epoch": 1.2677878395860285, "grad_norm": 0.2727435827255249, "learning_rate": 0.00019149742280422924, "loss": 1.5171, "mean_token_accuracy": 0.36686722859740256, "num_tokens": 2854084.0, "step": 980 }, { "entropy": 1.5743449032306671, "epoch": 1.2807244501940491, "grad_norm": 0.2871781289577484, "learning_rate": 0.00019128481972333544, "loss": 1.5921, "mean_token_accuracy": 0.6345128893852234, "num_tokens": 2890579.0, "step": 990 }, { "entropy": 1.969143381714821, "epoch": 1.2936610608020698, "grad_norm": 0.4106636643409729, "learning_rate": 0.00019106971235375298, "loss": 1.9566, "mean_token_accuracy": 0.5519939877092839, "num_tokens": 2917103.0, "step": 1000 }, { "entropy": 2.637175753712654, "epoch": 1.3065976714100906, "grad_norm": 0.956899881362915, "learning_rate": 0.0001908521065966926, "loss": 2.4367, "mean_token_accuracy": 0.47931770235300064, "num_tokens": 2930324.0, "step": 1010 }, { "entropy": 1.2107470080256462, "epoch": 1.3195342820181113, "grad_norm": 0.0, "learning_rate": 0.00019063200842190514, "loss": 0.1138, "mean_token_accuracy": 0.07033292502164841, "num_tokens": 2931098.0, "step": 1020 }, { "entropy": 1.5640547186136247, "epoch": 1.332470892626132, "grad_norm": 0.2837156057357788, "learning_rate": 0.00019040942386751804, "loss": 1.5281, "mean_token_accuracy": 0.368409526348114, "num_tokens": 2998986.0, "step": 1030 }, { "entropy": 1.6472883015871047, "epoch": 1.3454075032341526, "grad_norm": 0.31581056118011475, "learning_rate": 0.00019018435903986943, "loss": 1.6144, "mean_token_accuracy": 0.62486432492733, "num_tokens": 3035300.0, "step": 1040 }, { "entropy": 1.8509329915046693, "epoch": 1.3583441138421732, "grad_norm": 0.39050692319869995, "learning_rate": 0.00018995682011334087, "loss": 1.8415, "mean_token_accuracy": 0.5710361421108245, "num_tokens": 3062133.0, "step": 1050 }, { "epoch": 1.3583441138421732, "eval_entropy": 1.7658769363580749, "eval_loss": 1.464791178703308, "eval_mean_token_accuracy": 0.429339470125215, "eval_num_tokens": 3062133.0, "eval_runtime": 243.4077, "eval_samples_per_second": 22.579, "eval_steps_per_second": 1.413, "step": 1050 }, { "entropy": 2.4731887727975845, "epoch": 1.371280724450194, "grad_norm": 0.9063658714294434, "learning_rate": 0.00018972681333018776, "loss": 2.3412, "mean_token_accuracy": 0.4919880717992783, "num_tokens": 3076137.0, "step": 1060 }, { "entropy": 1.815966796875, "epoch": 1.3842173350582148, "grad_norm": 0.0, "learning_rate": 0.00018949434500036816, "loss": 0.2748, "mean_token_accuracy": 0.094140625, "num_tokens": 3077033.0, "step": 1070 }, { "entropy": 1.7788158431649208, "epoch": 1.3971539456662354, "grad_norm": 0.28700482845306396, "learning_rate": 0.0001892594215013697, "loss": 1.491, "mean_token_accuracy": 0.3707178644835949, "num_tokens": 3139012.0, "step": 1080 }, { "entropy": 1.5893326640129088, "epoch": 1.4100905562742563, "grad_norm": 0.3248252868652344, "learning_rate": 0.00018902204927803462, "loss": 1.5707, "mean_token_accuracy": 0.6353108420968056, "num_tokens": 3175132.0, "step": 1090 }, { "entropy": 1.8777880787849426, "epoch": 1.4230271668822767, "grad_norm": 0.4096948206424713, "learning_rate": 0.00018878223484238295, "loss": 1.9016, "mean_token_accuracy": 0.5628921225667, "num_tokens": 3201175.0, "step": 1100 }, { "entropy": 2.5787813514471054, "epoch": 1.4359637774902976, "grad_norm": 0.9349520206451416, "learning_rate": 0.00018853998477343385, "loss": 2.4275, "mean_token_accuracy": 0.4918954521417618, "num_tokens": 3213218.0, "step": 1110 }, { "entropy": 1.5953246742486953, "epoch": 1.4489003880983182, "grad_norm": 0.0, "learning_rate": 0.00018829530571702515, "loss": 0.0759, "mean_token_accuracy": 0.03794117569923401, "num_tokens": 3213902.0, "step": 1120 }, { "entropy": 1.7004274040460587, "epoch": 1.4618369987063389, "grad_norm": 0.28281426429748535, "learning_rate": 0.000188048204385631, "loss": 1.4741, "mean_token_accuracy": 0.37432471886277197, "num_tokens": 3278399.0, "step": 1130 }, { "entropy": 1.54911307990551, "epoch": 1.4747736093143597, "grad_norm": 0.3112603425979614, "learning_rate": 0.00018779868755817777, "loss": 1.529, "mean_token_accuracy": 0.6405477434396744, "num_tokens": 3314005.0, "step": 1140 }, { "entropy": 1.8169409155845642, "epoch": 1.4877102199223804, "grad_norm": 0.4136084020137787, "learning_rate": 0.00018754676207985798, "loss": 1.8563, "mean_token_accuracy": 0.5684241697192192, "num_tokens": 3339761.0, "step": 1150 }, { "entropy": 2.6468518733978272, "epoch": 1.500646830530401, "grad_norm": 0.9774990081787109, "learning_rate": 0.00018729243486194258, "loss": 2.4068, "mean_token_accuracy": 0.49020475447177886, "num_tokens": 3352396.0, "step": 1160 }, { "entropy": 1.844868466258049, "epoch": 1.5135834411384217, "grad_norm": 0.0, "learning_rate": 0.0001870357128815915, "loss": 0.1083, "mean_token_accuracy": 0.03311403542757034, "num_tokens": 3353089.0, "step": 1170 }, { "entropy": 1.77299522459507, "epoch": 1.5265200517464423, "grad_norm": 0.29017725586891174, "learning_rate": 0.00018677660318166178, "loss": 1.5134, "mean_token_accuracy": 0.37067501023411753, "num_tokens": 3417806.0, "step": 1180 }, { "entropy": 1.605825701355934, "epoch": 1.5394566623544632, "grad_norm": 0.3007327616214752, "learning_rate": 0.000186515112870515, "loss": 1.5754, "mean_token_accuracy": 0.6359535038471222, "num_tokens": 3453968.0, "step": 1190 }, { "entropy": 1.8059845566749573, "epoch": 1.5523932729624839, "grad_norm": 0.4170464277267456, "learning_rate": 0.0001862512491218217, "loss": 1.8209, "mean_token_accuracy": 0.5729366824030876, "num_tokens": 3480122.0, "step": 1200 }, { "epoch": 1.5523932729624839, "eval_entropy": 1.846903031302053, "eval_loss": 1.4494483470916748, "eval_mean_token_accuracy": 0.4338979678618353, "eval_num_tokens": 3480122.0, "eval_runtime": 245.7587, "eval_samples_per_second": 22.363, "eval_steps_per_second": 1.4, "step": 1200 }, { "entropy": 2.4776687741279604, "epoch": 1.5653298835705045, "grad_norm": 1.236024022102356, "learning_rate": 0.00018598501917436487, "loss": 2.2694, "mean_token_accuracy": 0.5161234959959984, "num_tokens": 3492043.0, "step": 1210 }, { "entropy": 2.871905821561813, "epoch": 1.5782664941785254, "grad_norm": 0.0, "learning_rate": 0.00018571643033184136, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 3492683.0, "step": 1220 }, { "entropy": 2.30782949924469, "epoch": 1.5912031047865458, "grad_norm": 0.3269418179988861, "learning_rate": 0.00018544548996266138, "loss": 1.4917, "mean_token_accuracy": 0.3702575147151947, "num_tokens": 3561621.0, "step": 1230 }, { "entropy": 1.5860986828804016, "epoch": 1.6041397153945667, "grad_norm": 0.33811113238334656, "learning_rate": 0.00018517220549974642, "loss": 1.5659, "mean_token_accuracy": 0.6364668473601341, "num_tokens": 3597551.0, "step": 1240 }, { "entropy": 1.8561100304126739, "epoch": 1.6170763260025873, "grad_norm": 0.4206816554069519, "learning_rate": 0.00018489658444032544, "loss": 1.8636, "mean_token_accuracy": 0.5685464948415756, "num_tokens": 3623516.0, "step": 1250 }, { "entropy": 2.475165989995003, "epoch": 1.630012936610608, "grad_norm": 0.9206745624542236, "learning_rate": 0.00018461863434572905, "loss": 2.3686, "mean_token_accuracy": 0.49352553114295006, "num_tokens": 3636662.0, "step": 1260 }, { "entropy": 1.5844505287706852, "epoch": 1.6429495472186288, "grad_norm": 0.0, "learning_rate": 0.0001843383628411821, "loss": 0.1782, "mean_token_accuracy": 0.08751860111951829, "num_tokens": 3637501.0, "step": 1270 }, { "entropy": 1.5500032015144825, "epoch": 1.6558861578266493, "grad_norm": 0.2985474169254303, "learning_rate": 0.00018405577761559453, "loss": 1.5005, "mean_token_accuracy": 0.3704367861151695, "num_tokens": 3705898.0, "step": 1280 }, { "entropy": 1.5747513711452483, "epoch": 1.6688227684346701, "grad_norm": 0.3510088622570038, "learning_rate": 0.0001837708864213505, "loss": 1.5586, "mean_token_accuracy": 0.6378742828965187, "num_tokens": 3742275.0, "step": 1290 }, { "entropy": 1.7819489419460297, "epoch": 1.6817593790426908, "grad_norm": 0.42687690258026123, "learning_rate": 0.00018348369707409546, "loss": 1.8096, "mean_token_accuracy": 0.5733471587300301, "num_tokens": 3768563.0, "step": 1300 }, { "entropy": 2.4534367620944977, "epoch": 1.6946959896507114, "grad_norm": 0.9902492165565491, "learning_rate": 0.00018319421745252208, "loss": 2.3035, "mean_token_accuracy": 0.49916471540927887, "num_tokens": 3782396.0, "step": 1310 }, { "entropy": 1.977598437666893, "epoch": 1.7076326002587323, "grad_norm": 0.0, "learning_rate": 0.00018290245549815385, "loss": 0.1527, "mean_token_accuracy": 0.0657636746764183, "num_tokens": 3783196.0, "step": 1320 }, { "entropy": 2.1555118948221206, "epoch": 1.720569210866753, "grad_norm": 0.3243282437324524, "learning_rate": 0.0001826084192151273, "loss": 1.5106, "mean_token_accuracy": 0.36851018443703654, "num_tokens": 3846769.0, "step": 1330 }, { "entropy": 1.5848265200853349, "epoch": 1.7335058214747736, "grad_norm": 0.32707569003105164, "learning_rate": 0.00018231211666997247, "loss": 1.5277, "mean_token_accuracy": 0.642450013756752, "num_tokens": 3882748.0, "step": 1340 }, { "entropy": 1.8691698461771011, "epoch": 1.7464424320827943, "grad_norm": 0.43988320231437683, "learning_rate": 0.00018201355599139154, "loss": 1.9016, "mean_token_accuracy": 0.56101154088974, "num_tokens": 3908934.0, "step": 1350 }, { "epoch": 1.7464424320827943, "eval_entropy": 1.7982253941685655, "eval_loss": 1.4296140670776367, "eval_mean_token_accuracy": 0.43251860254379204, "eval_num_tokens": 3908934.0, "eval_runtime": 245.0387, "eval_samples_per_second": 22.429, "eval_steps_per_second": 1.404, "step": 1350 }, { "entropy": 2.471151527762413, "epoch": 1.759379042690815, "grad_norm": 0.9302666187286377, "learning_rate": 0.0001817127453700358, "loss": 2.3247, "mean_token_accuracy": 0.5023237220942974, "num_tokens": 3922255.0, "step": 1360 }, { "entropy": 1.8378637909889222, "epoch": 1.7723156532988358, "grad_norm": 0.0, "learning_rate": 0.00018140969305828106, "loss": 0.0576, "mean_token_accuracy": 0.0373076930642128, "num_tokens": 3922926.0, "step": 1370 }, { "entropy": 1.7470036551356316, "epoch": 1.7852522639068564, "grad_norm": 0.3011367619037628, "learning_rate": 0.00018110440737000122, "loss": 1.4591, "mean_token_accuracy": 0.3771127283573151, "num_tokens": 3990074.0, "step": 1380 }, { "entropy": 1.5329654335975647, "epoch": 1.798188874514877, "grad_norm": 0.31504422426223755, "learning_rate": 0.00018079689668034005, "loss": 1.4973, "mean_token_accuracy": 0.6467197388410568, "num_tokens": 4026755.0, "step": 1390 }, { "entropy": 1.7885783523321153, "epoch": 1.811125485122898, "grad_norm": 0.42766207456588745, "learning_rate": 0.00018048716942548168, "loss": 1.8211, "mean_token_accuracy": 0.5723803475499153, "num_tokens": 4053589.0, "step": 1400 }, { "entropy": 2.405156469345093, "epoch": 1.8240620957309184, "grad_norm": 0.953956663608551, "learning_rate": 0.00018017523410241893, "loss": 2.2967, "mean_token_accuracy": 0.5070258714258671, "num_tokens": 4068297.0, "step": 1410 }, { "entropy": 1.202190825343132, "epoch": 1.8369987063389392, "grad_norm": 0.0, "learning_rate": 0.00017986109926872032, "loss": 0.2475, "mean_token_accuracy": 0.09388883709907532, "num_tokens": 4069205.0, "step": 1420 }, { "entropy": 1.8208864331245422, "epoch": 1.84993531694696, "grad_norm": 0.30337706208229065, "learning_rate": 0.00017954477354229536, "loss": 1.4609, "mean_token_accuracy": 0.3746915958821774, "num_tokens": 4135636.0, "step": 1430 }, { "entropy": 1.547205138206482, "epoch": 1.8628719275549805, "grad_norm": 0.3231499493122101, "learning_rate": 0.00017922626560115798, "loss": 1.5262, "mean_token_accuracy": 0.6422269076108933, "num_tokens": 4171871.0, "step": 1440 }, { "entropy": 1.8343932330608368, "epoch": 1.8758085381630014, "grad_norm": 0.45170995593070984, "learning_rate": 0.0001789055841831885, "loss": 1.8589, "mean_token_accuracy": 0.5682013630867004, "num_tokens": 4198004.0, "step": 1450 }, { "entropy": 2.4178356170654296, "epoch": 1.8887451487710218, "grad_norm": 1.1836594343185425, "learning_rate": 0.00017858273808589402, "loss": 2.219, "mean_token_accuracy": 0.5180532835423947, "num_tokens": 4210568.0, "step": 1460 }, { "entropy": 1.6111401319503784, "epoch": 1.9016817593790427, "grad_norm": 0.0, "learning_rate": 0.00017825773616616703, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 4211208.0, "step": 1470 }, { "entropy": 1.7321304202079773, "epoch": 1.9146183699870634, "grad_norm": 0.30086463689804077, "learning_rate": 0.0001779305873400423, "loss": 1.4654, "mean_token_accuracy": 0.3772578649222851, "num_tokens": 4279659.0, "step": 1480 }, { "entropy": 1.5423625767230988, "epoch": 1.927554980595084, "grad_norm": 0.33361881971359253, "learning_rate": 0.00017760130058245242, "loss": 1.4942, "mean_token_accuracy": 0.6453819587826729, "num_tokens": 4315273.0, "step": 1490 }, { "entropy": 1.8349818885326385, "epoch": 1.9404915912031049, "grad_norm": 0.4649695158004761, "learning_rate": 0.0001772698849269816, "loss": 1.8167, "mean_token_accuracy": 0.5768257766962052, "num_tokens": 4341460.0, "step": 1500 }, { "epoch": 1.9404915912031049, "eval_entropy": 1.7394565719851227, "eval_loss": 1.4138603210449219, "eval_mean_token_accuracy": 0.43571045708864237, "eval_num_tokens": 4341460.0, "eval_runtime": 245.3477, "eval_samples_per_second": 22.401, "eval_steps_per_second": 1.402, "step": 1500 }, { "entropy": 2.375590392947197, "epoch": 1.9534282018111255, "grad_norm": 0.9830443263053894, "learning_rate": 0.00017693634946561775, "loss": 2.2734, "mean_token_accuracy": 0.5091598987579345, "num_tokens": 4355559.0, "step": 1510 }, { "entropy": 2.5652388006448748, "epoch": 1.9663648124191462, "grad_norm": 0.0, "learning_rate": 0.00017660070334850304, "loss": 0.1559, "mean_token_accuracy": 0.07029985040426254, "num_tokens": 4356373.0, "step": 1520 }, { "entropy": 2.071737268567085, "epoch": 1.9793014230271668, "grad_norm": 0.33517006039619446, "learning_rate": 0.00017626295578368305, "loss": 1.2406, "mean_token_accuracy": 0.418473818898201, "num_tokens": 4398312.0, "step": 1530 }, { "entropy": 2.0724180698394776, "epoch": 1.9922380336351875, "grad_norm": 0.8757649660110474, "learning_rate": 0.00017592311603685393, "loss": 2.0395, "mean_token_accuracy": 0.5450932942330837, "num_tokens": 4419963.0, "step": 1540 }, { "entropy": 2.4648678690195083, "epoch": 2.0051746442432083, "grad_norm": 0.29208359122276306, "learning_rate": 0.00017558119343110838, "loss": 1.0811, "mean_token_accuracy": 0.2497500881552696, "num_tokens": 4466509.0, "step": 1550 }, { "entropy": 1.4679036349058152, "epoch": 2.0181112548512288, "grad_norm": 0.31473416090011597, "learning_rate": 0.00017523719734667973, "loss": 1.4439, "mean_token_accuracy": 0.6536323636770248, "num_tokens": 4506293.0, "step": 1560 }, { "entropy": 1.5847829729318619, "epoch": 2.0310478654592496, "grad_norm": 0.4749562740325928, "learning_rate": 0.0001748911372206848, "loss": 1.5723, "mean_token_accuracy": 0.6196332320570945, "num_tokens": 4535291.0, "step": 1570 }, { "entropy": 2.08716399371624, "epoch": 2.0439844760672705, "grad_norm": 0.6515533924102783, "learning_rate": 0.00017454302254686486, "loss": 2.0148, "mean_token_accuracy": 0.5413075156509877, "num_tokens": 4553239.0, "step": 1580 }, { "entropy": 2.5849849820137023, "epoch": 2.056921086675291, "grad_norm": 0.0, "learning_rate": 0.00017419286287532516, "loss": 0.7934, "mean_token_accuracy": 0.19277514591813089, "num_tokens": 4555288.0, "step": 1590 }, { "entropy": 2.378413477540016, "epoch": 2.069857697283312, "grad_norm": 0.28206267952919006, "learning_rate": 0.00017384066781227307, "loss": 0.9347, "mean_token_accuracy": 0.1983368217945099, "num_tokens": 4604552.0, "step": 1600 }, { "entropy": 1.5316940248012543, "epoch": 2.0827943078913327, "grad_norm": 0.33060652017593384, "learning_rate": 0.0001734864470197544, "loss": 1.5009, "mean_token_accuracy": 0.6414364308118821, "num_tokens": 4644766.0, "step": 1610 }, { "entropy": 1.6213007301092148, "epoch": 2.095730918499353, "grad_norm": 0.5015760064125061, "learning_rate": 0.00017313021021538844, "loss": 1.6038, "mean_token_accuracy": 0.6168796703219414, "num_tokens": 4673702.0, "step": 1620 }, { "entropy": 2.1126689702272414, "epoch": 2.108667529107374, "grad_norm": 0.7331583499908447, "learning_rate": 0.0001727719671721013, "loss": 2.0398, "mean_token_accuracy": 0.533772025257349, "num_tokens": 4691426.0, "step": 1630 }, { "entropy": 2.546648120880127, "epoch": 2.1216041397153944, "grad_norm": 0.0, "learning_rate": 0.0001724117277178579, "loss": 0.5647, "mean_token_accuracy": 0.1764907084405422, "num_tokens": 4693073.0, "step": 1640 }, { "entropy": 2.3147504776716232, "epoch": 2.1345407503234153, "grad_norm": 0.3223225474357605, "learning_rate": 0.0001720495017353922, "loss": 0.8825, "mean_token_accuracy": 0.2041303940117359, "num_tokens": 4745475.0, "step": 1650 }, { "epoch": 2.1345407503234153, "eval_entropy": 1.9804211553446083, "eval_loss": 1.4297912120819092, "eval_mean_token_accuracy": 0.43714187025677326, "eval_num_tokens": 4745475.0, "eval_runtime": 240.4238, "eval_samples_per_second": 22.86, "eval_steps_per_second": 1.431, "step": 1650 }, { "entropy": 1.5305457144975663, "epoch": 2.147477360931436, "grad_norm": 0.35115179419517517, "learning_rate": 0.00017168529916193614, "loss": 1.521, "mean_token_accuracy": 0.6396576210856437, "num_tokens": 4786054.0, "step": 1660 }, { "entropy": 1.5795167148113252, "epoch": 2.1604139715394566, "grad_norm": 0.50258469581604, "learning_rate": 0.00017131912998894717, "loss": 1.5679, "mean_token_accuracy": 0.6227076068520546, "num_tokens": 4815157.0, "step": 1670 }, { "entropy": 2.0948879569768906, "epoch": 2.1733505821474774, "grad_norm": 0.7732148766517639, "learning_rate": 0.0001709510042618339, "loss": 2.0484, "mean_token_accuracy": 0.539436261355877, "num_tokens": 4833514.0, "step": 1680 }, { "entropy": 2.419444125890732, "epoch": 2.186287192755498, "grad_norm": 0.0, "learning_rate": 0.00017058093207968067, "loss": 0.6193, "mean_token_accuracy": 0.19686403200030328, "num_tokens": 4835320.0, "step": 1690 }, { "entropy": 2.0893223583698273, "epoch": 2.1992238033635187, "grad_norm": 0.3104536831378937, "learning_rate": 0.0001702089235949705, "loss": 0.8909, "mean_token_accuracy": 0.20202562659978868, "num_tokens": 4887586.0, "step": 1700 }, { "entropy": 1.493579125404358, "epoch": 2.2121604139715396, "grad_norm": 0.35835903882980347, "learning_rate": 0.0001698349890133065, "loss": 1.5107, "mean_token_accuracy": 0.6415591448545456, "num_tokens": 4928021.0, "step": 1710 }, { "entropy": 1.599936455488205, "epoch": 2.22509702457956, "grad_norm": 0.5604035258293152, "learning_rate": 0.0001694591385931319, "loss": 1.5589, "mean_token_accuracy": 0.6183684885501861, "num_tokens": 4956628.0, "step": 1720 }, { "entropy": 2.0975252121686934, "epoch": 2.238033635187581, "grad_norm": 0.7757624983787537, "learning_rate": 0.00016908138264544874, "loss": 2.0586, "mean_token_accuracy": 0.537506015598774, "num_tokens": 4973976.0, "step": 1730 }, { "entropy": 2.402835935354233, "epoch": 2.2509702457956013, "grad_norm": 0.0, "learning_rate": 0.00016870173153353478, "loss": 0.7325, "mean_token_accuracy": 0.21586424633860588, "num_tokens": 4975943.0, "step": 1740 }, { "entropy": 1.8081632763147355, "epoch": 2.263906856403622, "grad_norm": 0.29493167996406555, "learning_rate": 0.0001683201956726593, "loss": 0.8952, "mean_token_accuracy": 0.20223823115229606, "num_tokens": 5031202.0, "step": 1750 }, { "entropy": 1.5031701743602752, "epoch": 2.276843467011643, "grad_norm": 0.3834936320781708, "learning_rate": 0.0001679367855297976, "loss": 1.5076, "mean_token_accuracy": 0.643890731036663, "num_tokens": 5071593.0, "step": 1760 }, { "entropy": 1.6009941071271896, "epoch": 2.2897800776196635, "grad_norm": 0.5210739374160767, "learning_rate": 0.0001675515116233434, "loss": 1.5777, "mean_token_accuracy": 0.6210601255297661, "num_tokens": 5100741.0, "step": 1770 }, { "entropy": 2.032317912578583, "epoch": 2.3027166882276844, "grad_norm": 0.6077569723129272, "learning_rate": 0.0001671643845228207, "loss": 1.9718, "mean_token_accuracy": 0.5442127160727978, "num_tokens": 5120288.0, "step": 1780 }, { "entropy": 1.8405873313546182, "epoch": 2.315653298835705, "grad_norm": 0.0, "learning_rate": 0.00016677541484859352, "loss": 0.9106, "mean_token_accuracy": 0.22827735766768456, "num_tokens": 5122772.0, "step": 1790 }, { "entropy": 1.2277291655540465, "epoch": 2.3285899094437257, "grad_norm": 0.2893352806568146, "learning_rate": 0.0001663846132715747, "loss": 0.9194, "mean_token_accuracy": 0.1989746630191803, "num_tokens": 5178960.0, "step": 1800 }, { "epoch": 2.3285899094437257, "eval_entropy": 1.3877028687748798, "eval_loss": 1.4073032140731812, "eval_mean_token_accuracy": 0.44012743035374685, "eval_num_tokens": 5178960.0, "eval_runtime": 243.8297, "eval_samples_per_second": 22.54, "eval_steps_per_second": 1.411, "step": 1800 }, { "entropy": 1.5012196868658065, "epoch": 2.3415265200517466, "grad_norm": 0.3776693344116211, "learning_rate": 0.00016599199051293314, "loss": 1.4982, "mean_token_accuracy": 0.644976706802845, "num_tokens": 5220342.0, "step": 1810 }, { "entropy": 1.6106306850910186, "epoch": 2.354463130659767, "grad_norm": 0.5475464463233948, "learning_rate": 0.0001655975573437996, "loss": 1.5526, "mean_token_accuracy": 0.6244173154234887, "num_tokens": 5249776.0, "step": 1820 }, { "entropy": 2.004978260397911, "epoch": 2.367399741267788, "grad_norm": 0.6898283958435059, "learning_rate": 0.0001652013245849714, "loss": 1.9472, "mean_token_accuracy": 0.557063739746809, "num_tokens": 5268417.0, "step": 1830 }, { "entropy": 2.327620804309845, "epoch": 2.3803363518758087, "grad_norm": 0.0, "learning_rate": 0.00016480330310661523, "loss": 0.7845, "mean_token_accuracy": 0.20984074249863624, "num_tokens": 5270607.0, "step": 1840 }, { "entropy": 2.509730467200279, "epoch": 2.393272962483829, "grad_norm": 0.30109038949012756, "learning_rate": 0.00016440350382796929, "loss": 0.9268, "mean_token_accuracy": 0.19716072604060172, "num_tokens": 5325120.0, "step": 1850 }, { "entropy": 1.480056384205818, "epoch": 2.40620957309185, "grad_norm": 0.36303573846817017, "learning_rate": 0.00016400193771704354, "loss": 1.4947, "mean_token_accuracy": 0.6465561181306839, "num_tokens": 5366273.0, "step": 1860 }, { "entropy": 1.5890043556690217, "epoch": 2.4191461836998704, "grad_norm": 0.5530393123626709, "learning_rate": 0.00016359861579031884, "loss": 1.5522, "mean_token_accuracy": 0.6297082543373108, "num_tokens": 5395726.0, "step": 1870 }, { "entropy": 2.038092666864395, "epoch": 2.4320827943078913, "grad_norm": 1.0535674095153809, "learning_rate": 0.00016319354911244468, "loss": 1.9806, "mean_token_accuracy": 0.5464614436030388, "num_tokens": 5414798.0, "step": 1880 }, { "entropy": 2.923152169585228, "epoch": 2.445019404915912, "grad_norm": 0.0, "learning_rate": 0.00016278674879593582, "loss": 0.7968, "mean_token_accuracy": 0.2314663991332054, "num_tokens": 5417197.0, "step": 1890 }, { "entropy": 2.655092605948448, "epoch": 2.4579560155239326, "grad_norm": 0.3218407928943634, "learning_rate": 0.00016237822600086716, "loss": 0.9259, "mean_token_accuracy": 0.19839748442173005, "num_tokens": 5470736.0, "step": 1900 }, { "entropy": 1.4376092582941056, "epoch": 2.4708926261319535, "grad_norm": 0.3781118094921112, "learning_rate": 0.00016196799193456785, "loss": 1.4415, "mean_token_accuracy": 0.6578261837363243, "num_tokens": 5511266.0, "step": 1910 }, { "entropy": 1.5665327340364457, "epoch": 2.4838292367399744, "grad_norm": 0.5386565327644348, "learning_rate": 0.00016155605785131357, "loss": 1.5497, "mean_token_accuracy": 0.6252920791506767, "num_tokens": 5541123.0, "step": 1920 }, { "entropy": 1.9834172219038009, "epoch": 2.496765847347995, "grad_norm": 0.6560537815093994, "learning_rate": 0.00016114243505201795, "loss": 1.9184, "mean_token_accuracy": 0.555550941824913, "num_tokens": 5561101.0, "step": 1930 }, { "entropy": 2.323999685049057, "epoch": 2.5097024579560157, "grad_norm": 0.0, "learning_rate": 0.0001607271348839226, "loss": 0.9326, "mean_token_accuracy": 0.2633499436080456, "num_tokens": 5564120.0, "step": 1940 }, { "entropy": 1.5099886417388917, "epoch": 2.522639068564036, "grad_norm": 0.39876788854599, "learning_rate": 0.00016031016874028557, "loss": 0.9269, "mean_token_accuracy": 0.20084442123770713, "num_tokens": 5613256.0, "step": 1950 }, { "epoch": 2.522639068564036, "eval_entropy": 1.3481496193034703, "eval_loss": 1.3939740657806396, "eval_mean_token_accuracy": 0.44758816895096804, "eval_num_tokens": 5613256.0, "eval_runtime": 246.9294, "eval_samples_per_second": 22.257, "eval_steps_per_second": 1.393, "step": 1950 }, { "entropy": 1.4310665398836135, "epoch": 2.535575679172057, "grad_norm": 0.39710840582847595, "learning_rate": 0.00015989154806006904, "loss": 1.4336, "mean_token_accuracy": 0.6602939382195473, "num_tokens": 5653638.0, "step": 1960 }, { "entropy": 1.5728681892156602, "epoch": 2.548512289780078, "grad_norm": 0.5568864941596985, "learning_rate": 0.00015947128432762536, "loss": 1.5237, "mean_token_accuracy": 0.627597238123417, "num_tokens": 5683333.0, "step": 1970 }, { "entropy": 1.9994044303894043, "epoch": 2.5614489003880982, "grad_norm": 0.6420727968215942, "learning_rate": 0.00015904938907238206, "loss": 1.9615, "mean_token_accuracy": 0.5487420856952667, "num_tokens": 5702066.0, "step": 1980 }, { "entropy": 2.452130767703056, "epoch": 2.574385510996119, "grad_norm": 0.0, "learning_rate": 0.00015862587386852541, "loss": 0.7703, "mean_token_accuracy": 0.2316281594336033, "num_tokens": 5704289.0, "step": 1990 }, { "entropy": 2.385006046295166, "epoch": 2.5873221216041395, "grad_norm": 0.3110261857509613, "learning_rate": 0.0001582007503346832, "loss": 0.9186, "mean_token_accuracy": 0.19861687943339348, "num_tokens": 5760847.0, "step": 2000 }, { "entropy": 1.4642044007778168, "epoch": 2.6002587322121604, "grad_norm": 0.38485661149024963, "learning_rate": 0.0001577740301336057, "loss": 1.4756, "mean_token_accuracy": 0.6492435604333877, "num_tokens": 5802455.0, "step": 2010 }, { "entropy": 1.5432655066251755, "epoch": 2.6131953428201813, "grad_norm": 0.6033521294593811, "learning_rate": 0.00015734572497184577, "loss": 1.5119, "mean_token_accuracy": 0.6332074150443077, "num_tokens": 5831848.0, "step": 2020 }, { "entropy": 2.0233444392681124, "epoch": 2.6261319534282017, "grad_norm": 0.7502851486206055, "learning_rate": 0.00015691584659943786, "loss": 1.9476, "mean_token_accuracy": 0.5473973207175732, "num_tokens": 5850975.0, "step": 2030 }, { "entropy": 2.2630896627902986, "epoch": 2.6390685640362226, "grad_norm": 0.0, "learning_rate": 0.0001564844068095755, "loss": 0.8525, "mean_token_accuracy": 0.23688365146517754, "num_tokens": 5853548.0, "step": 2040 }, { "entropy": 1.6931863486766816, "epoch": 2.652005174644243, "grad_norm": 0.3148477077484131, "learning_rate": 0.0001560514174382878, "loss": 0.8972, "mean_token_accuracy": 0.20218148753046988, "num_tokens": 5907614.0, "step": 2050 }, { "entropy": 1.4166515529155732, "epoch": 2.664941785252264, "grad_norm": 0.38905423879623413, "learning_rate": 0.0001556168903641148, "loss": 1.4368, "mean_token_accuracy": 0.6563202187418937, "num_tokens": 5947663.0, "step": 2060 }, { "entropy": 1.5506242126226426, "epoch": 2.6778783958602848, "grad_norm": 0.5905367136001587, "learning_rate": 0.00015518083750778157, "loss": 1.5309, "mean_token_accuracy": 0.6258940026164055, "num_tokens": 5976765.0, "step": 2070 }, { "entropy": 1.9377893030643463, "epoch": 2.690815006468305, "grad_norm": 0.6645969152450562, "learning_rate": 0.00015474327083187105, "loss": 1.9022, "mean_token_accuracy": 0.5610988035798072, "num_tokens": 5996303.0, "step": 2080 }, { "entropy": 2.6364343762397766, "epoch": 2.703751617076326, "grad_norm": 0.0, "learning_rate": 0.00015430420234049624, "loss": 1.038, "mean_token_accuracy": 0.2556902192533016, "num_tokens": 5999434.0, "step": 2090 }, { "entropy": 2.8424737572669985, "epoch": 2.7166882276843465, "grad_norm": 0.3264569938182831, "learning_rate": 0.00015386364407897035, "loss": 0.9078, "mean_token_accuracy": 0.20131859928369522, "num_tokens": 6051774.0, "step": 2100 }, { "epoch": 2.7166882276843465, "eval_entropy": 2.204050070671148, "eval_loss": 1.3715640306472778, "eval_mean_token_accuracy": 0.4440248931736447, "eval_num_tokens": 6051774.0, "eval_runtime": 244.556, "eval_samples_per_second": 22.473, "eval_steps_per_second": 1.407, "step": 2100 }, { "entropy": 1.4316389322280885, "epoch": 2.7296248382923674, "grad_norm": 0.3802427053451538, "learning_rate": 0.00015342160813347676, "loss": 1.4553, "mean_token_accuracy": 0.6519668206572533, "num_tokens": 6091750.0, "step": 2110 }, { "entropy": 1.5787472486495973, "epoch": 2.742561448900388, "grad_norm": 0.5799654126167297, "learning_rate": 0.00015297810663073743, "loss": 1.5507, "mean_token_accuracy": 0.6268433704972267, "num_tokens": 6120790.0, "step": 2120 }, { "entropy": 1.9796525478363036, "epoch": 2.7554980595084086, "grad_norm": 0.7903239727020264, "learning_rate": 0.00015253315173767993, "loss": 1.9383, "mean_token_accuracy": 0.5536467991769314, "num_tokens": 6139010.0, "step": 2130 }, { "entropy": 2.6805751383304597, "epoch": 2.7684346701164295, "grad_norm": 0.0, "learning_rate": 0.00015208675566110387, "loss": 0.7659, "mean_token_accuracy": 0.21504319161176683, "num_tokens": 6141159.0, "step": 2140 }, { "entropy": 2.1302292913198473, "epoch": 2.78137128072445, "grad_norm": 0.3743366003036499, "learning_rate": 0.0001516389306473461, "loss": 0.8888, "mean_token_accuracy": 0.20484731644392012, "num_tokens": 6191053.0, "step": 2150 }, { "entropy": 1.4483990609645843, "epoch": 2.794307891332471, "grad_norm": 0.3969733417034149, "learning_rate": 0.00015118968898194458, "loss": 1.443, "mean_token_accuracy": 0.6526175752282143, "num_tokens": 6230521.0, "step": 2160 }, { "entropy": 1.582485669851303, "epoch": 2.8072445019404917, "grad_norm": 0.6144042611122131, "learning_rate": 0.00015073904298930132, "loss": 1.5429, "mean_token_accuracy": 0.6261137276887894, "num_tokens": 6259286.0, "step": 2170 }, { "entropy": 1.970637395977974, "epoch": 2.8201811125485126, "grad_norm": 0.7516705393791199, "learning_rate": 0.00015028700503234447, "loss": 1.9348, "mean_token_accuracy": 0.5558973327279091, "num_tokens": 6277729.0, "step": 2180 }, { "entropy": 2.001736190915108, "epoch": 2.833117723156533, "grad_norm": 0.0, "learning_rate": 0.00014983358751218892, "loss": 0.736, "mean_token_accuracy": 0.19615912958979606, "num_tokens": 6279643.0, "step": 2190 }, { "entropy": 1.9369044452905655, "epoch": 2.8460543337645534, "grad_norm": 0.32840585708618164, "learning_rate": 0.00014937880286779629, "loss": 0.9147, "mean_token_accuracy": 0.19959167763590813, "num_tokens": 6336300.0, "step": 2200 }, { "entropy": 1.4088002383708953, "epoch": 2.8589909443725743, "grad_norm": 0.4119824767112732, "learning_rate": 0.00014892266357563358, "loss": 1.4187, "mean_token_accuracy": 0.6627781435847282, "num_tokens": 6375995.0, "step": 2210 }, { "entropy": 1.6024494558572768, "epoch": 2.871927554980595, "grad_norm": 0.5892689228057861, "learning_rate": 0.0001484651821493309, "loss": 1.5693, "mean_token_accuracy": 0.6204348549246788, "num_tokens": 6404526.0, "step": 2220 }, { "entropy": 2.072836604714394, "epoch": 2.884864165588616, "grad_norm": 0.7402485013008118, "learning_rate": 0.0001480063711393382, "loss": 2.0136, "mean_token_accuracy": 0.5476931251585484, "num_tokens": 6421889.0, "step": 2230 }, { "entropy": 1.5923803925514222, "epoch": 2.8978007761966365, "grad_norm": 0.0, "learning_rate": 0.00014754624313258102, "loss": 0.6735, "mean_token_accuracy": 0.20976952239871025, "num_tokens": 6423681.0, "step": 2240 }, { "entropy": 1.2221377216279508, "epoch": 2.9107373868046573, "grad_norm": 0.3352583050727844, "learning_rate": 0.00014708481075211498, "loss": 0.9037, "mean_token_accuracy": 0.20100481137633325, "num_tokens": 6474539.0, "step": 2250 }, { "epoch": 2.9107373868046573, "eval_entropy": 1.358256766096104, "eval_loss": 1.3591663837432861, "eval_mean_token_accuracy": 0.45166019766136656, "eval_num_tokens": 6474539.0, "eval_runtime": 241.3389, "eval_samples_per_second": 22.773, "eval_steps_per_second": 1.425, "step": 2250 }, { "entropy": 1.3933149039745332, "epoch": 2.9236739974126777, "grad_norm": 0.4007508456707001, "learning_rate": 0.00014662208665677966, "loss": 1.4101, "mean_token_accuracy": 0.6611413463950158, "num_tokens": 6514494.0, "step": 2260 }, { "entropy": 1.5439734548330306, "epoch": 2.9366106080206986, "grad_norm": 0.5625568628311157, "learning_rate": 0.0001461580835408513, "loss": 1.4993, "mean_token_accuracy": 0.6339735224843025, "num_tokens": 6543746.0, "step": 2270 }, { "entropy": 1.982978528738022, "epoch": 2.9495472186287195, "grad_norm": 0.7641308307647705, "learning_rate": 0.00014569281413369462, "loss": 1.9328, "mean_token_accuracy": 0.5539643183350563, "num_tokens": 6562759.0, "step": 2280 }, { "entropy": 1.5298347800970078, "epoch": 2.96248382923674, "grad_norm": 0.0, "learning_rate": 0.00014522629119941333, "loss": 0.766, "mean_token_accuracy": 0.21878809183835984, "num_tokens": 6564974.0, "step": 2290 }, { "entropy": 1.4145286485552788, "epoch": 2.975420439844761, "grad_norm": 0.4561901092529297, "learning_rate": 0.00014475852753650023, "loss": 0.7577, "mean_token_accuracy": 0.22906568124890328, "num_tokens": 6598409.0, "step": 2300 }, { "entropy": 1.5782025367021562, "epoch": 2.988357050452781, "grad_norm": 0.5903820991516113, "learning_rate": 0.000144289535977486, "loss": 1.554, "mean_token_accuracy": 0.6246525257825851, "num_tokens": 6627531.0, "step": 2310 }, { "entropy": 1.9433803856372833, "epoch": 3.001293661060802, "grad_norm": 0.13881655037403107, "learning_rate": 0.00014381932938858718, "loss": 0.9444, "mean_token_accuracy": 0.22419775873422623, "num_tokens": 6660338.0, "step": 2320 }, { "entropy": 1.621496966481209, "epoch": 3.014230271668823, "grad_norm": 0.42520761489868164, "learning_rate": 0.0001433479206693532, "loss": 1.6127, "mean_token_accuracy": 0.6233608849346638, "num_tokens": 6713107.0, "step": 2330 }, { "entropy": 1.338898405432701, "epoch": 3.0271668822768434, "grad_norm": 0.6367995738983154, "learning_rate": 0.0001428753227523124, "loss": 1.3191, "mean_token_accuracy": 0.67000552713871, "num_tokens": 6744799.0, "step": 2340 }, { "entropy": 1.590729820728302, "epoch": 3.0401034928848643, "grad_norm": 0.6899548172950745, "learning_rate": 0.0001424015486026174, "loss": 1.5648, "mean_token_accuracy": 0.618783813714981, "num_tokens": 6766726.0, "step": 2350 }, { "entropy": 1.977810901403427, "epoch": 3.0530401034928847, "grad_norm": 0.0, "learning_rate": 0.00014192661121768932, "loss": 1.3483, "mean_token_accuracy": 0.3756748877465725, "num_tokens": 6772184.0, "step": 2360 }, { "entropy": 1.1425089821219445, "epoch": 3.0659767141009056, "grad_norm": 0.1791164129972458, "learning_rate": 0.0001414505236268613, "loss": 0.2221, "mean_token_accuracy": 0.05023420602083206, "num_tokens": 6801985.0, "step": 2370 }, { "entropy": 1.534485575556755, "epoch": 3.0789133247089264, "grad_norm": 0.4513719975948334, "learning_rate": 0.00014097329889102084, "loss": 1.6302, "mean_token_accuracy": 0.6191562682390213, "num_tokens": 6853863.0, "step": 2380 }, { "entropy": 1.3535702049732208, "epoch": 3.091849935316947, "grad_norm": 0.6277197599411011, "learning_rate": 0.00014049495010225174, "loss": 1.2826, "mean_token_accuracy": 0.6846122413873672, "num_tokens": 6885860.0, "step": 2390 }, { "entropy": 1.611542597413063, "epoch": 3.1047865459249677, "grad_norm": 0.6629586219787598, "learning_rate": 0.00014001549038347488, "loss": 1.5841, "mean_token_accuracy": 0.6110770747065544, "num_tokens": 6907549.0, "step": 2400 }, { "epoch": 3.1047865459249677, "eval_entropy": 1.4435141939063405, "eval_loss": 1.3480572700500488, "eval_mean_token_accuracy": 0.45482284610354623, "eval_num_tokens": 6907549.0, "eval_runtime": 243.0256, "eval_samples_per_second": 22.615, "eval_steps_per_second": 1.415, "step": 2400 }, { "entropy": 2.002578613162041, "epoch": 3.117723156532988, "grad_norm": 0.0, "learning_rate": 0.00013953493288808804, "loss": 1.2204, "mean_token_accuracy": 0.3793766848742962, "num_tokens": 6912238.0, "step": 2410 }, { "entropy": 1.580290713906288, "epoch": 3.130659767141009, "grad_norm": 0.17965653538703918, "learning_rate": 0.00013905329079960522, "loss": 0.2405, "mean_token_accuracy": 0.04845013022422791, "num_tokens": 6941537.0, "step": 2420 }, { "entropy": 1.4815610826015473, "epoch": 3.14359637774903, "grad_norm": 0.46858540177345276, "learning_rate": 0.00013857057733129494, "loss": 1.5548, "mean_token_accuracy": 0.6307360790669918, "num_tokens": 6994352.0, "step": 2430 }, { "entropy": 1.3407190799713136, "epoch": 3.1565329883570503, "grad_norm": 0.6128517389297485, "learning_rate": 0.00013808680572581776, "loss": 1.2793, "mean_token_accuracy": 0.6835518077015876, "num_tokens": 7026544.0, "step": 2440 }, { "entropy": 1.6429592788219451, "epoch": 3.169469598965071, "grad_norm": 0.7309837937355042, "learning_rate": 0.0001376019892548629, "loss": 1.6028, "mean_token_accuracy": 0.6109883636236191, "num_tokens": 7049229.0, "step": 2450 }, { "entropy": 2.1930068999528887, "epoch": 3.1824062095730916, "grad_norm": 0.0, "learning_rate": 0.00013711614121878423, "loss": 1.3452, "mean_token_accuracy": 0.4032416954636574, "num_tokens": 7055638.0, "step": 2460 }, { "entropy": 2.582664442062378, "epoch": 3.1953428201811125, "grad_norm": 0.17951107025146484, "learning_rate": 0.00013662927494623528, "loss": 0.238, "mean_token_accuracy": 0.0486849807202816, "num_tokens": 7079933.0, "step": 2470 }, { "entropy": 1.4514012217521668, "epoch": 3.2082794307891334, "grad_norm": 0.48690128326416016, "learning_rate": 0.00013614140379380384, "loss": 1.5635, "mean_token_accuracy": 0.6299719527363777, "num_tokens": 7130984.0, "step": 2480 }, { "entropy": 1.3963081300258637, "epoch": 3.221216041397154, "grad_norm": 0.5850987434387207, "learning_rate": 0.00013565254114564522, "loss": 1.3093, "mean_token_accuracy": 0.6751079827547073, "num_tokens": 7162961.0, "step": 2490 }, { "entropy": 1.6287110567092895, "epoch": 3.2341526520051747, "grad_norm": 0.7363412976264954, "learning_rate": 0.00013516270041311523, "loss": 1.6109, "mean_token_accuracy": 0.6086324542760849, "num_tokens": 7185148.0, "step": 2500 }, { "entropy": 2.588909697532654, "epoch": 3.2470892626131955, "grad_norm": 0.0, "learning_rate": 0.0001346718950344023, "loss": 1.3295, "mean_token_accuracy": 0.36438525542616845, "num_tokens": 7190578.0, "step": 2510 }, { "entropy": 2.170939177274704, "epoch": 3.260025873221216, "grad_norm": 0.16089969873428345, "learning_rate": 0.00013418013847415875, "loss": 0.2333, "mean_token_accuracy": 0.04912624955177307, "num_tokens": 7223083.0, "step": 2520 }, { "entropy": 1.5124918982386588, "epoch": 3.272962483829237, "grad_norm": 0.48449796438217163, "learning_rate": 0.00013368744422313135, "loss": 1.5844, "mean_token_accuracy": 0.6292549699544907, "num_tokens": 7278262.0, "step": 2530 }, { "entropy": 1.3001452058553695, "epoch": 3.2858990944372573, "grad_norm": 0.6388899087905884, "learning_rate": 0.00013319382579779143, "loss": 1.2473, "mean_token_accuracy": 0.686492520570755, "num_tokens": 7310633.0, "step": 2540 }, { "entropy": 1.588513082265854, "epoch": 3.298835705045278, "grad_norm": 0.7601234316825867, "learning_rate": 0.00013269929673996372, "loss": 1.5813, "mean_token_accuracy": 0.6151460394263267, "num_tokens": 7333877.0, "step": 2550 }, { "epoch": 3.298835705045278, "eval_entropy": 1.50408104668523, "eval_loss": 1.3354183435440063, "eval_mean_token_accuracy": 0.4569617995862351, "eval_num_tokens": 7333877.0, "eval_runtime": 242.7951, "eval_samples_per_second": 22.636, "eval_steps_per_second": 1.417, "step": 2550 }, { "entropy": 1.8434918358922006, "epoch": 3.311772315653299, "grad_norm": 0.0, "learning_rate": 0.00013220387061645518, "loss": 1.2378, "mean_token_accuracy": 0.3966076374053955, "num_tokens": 7340126.0, "step": 2560 }, { "entropy": 2.0701662808656693, "epoch": 3.3247089262613194, "grad_norm": 0.1653972566127777, "learning_rate": 0.00013170756101868274, "loss": 0.2363, "mean_token_accuracy": 0.04905220568180084, "num_tokens": 7368440.0, "step": 2570 }, { "entropy": 1.521276581287384, "epoch": 3.3376455368693403, "grad_norm": 0.5110422372817993, "learning_rate": 0.00013121038156230021, "loss": 1.6069, "mean_token_accuracy": 0.6247900031507015, "num_tokens": 7422449.0, "step": 2580 }, { "entropy": 1.3473992764949798, "epoch": 3.350582147477361, "grad_norm": 0.5985650420188904, "learning_rate": 0.00013071234588682507, "loss": 1.2818, "mean_token_accuracy": 0.6814156129956246, "num_tokens": 7455078.0, "step": 2590 }, { "entropy": 1.5794302642345428, "epoch": 3.3635187580853816, "grad_norm": 0.7455780506134033, "learning_rate": 0.00013021346765526405, "loss": 1.5565, "mean_token_accuracy": 0.6210769057273865, "num_tokens": 7478151.0, "step": 2600 }, { "entropy": 2.400119936466217, "epoch": 3.3764553686934025, "grad_norm": 0.0, "learning_rate": 0.00012971376055373842, "loss": 1.3398, "mean_token_accuracy": 0.3794242724776268, "num_tokens": 7483907.0, "step": 2610 }, { "entropy": 2.360330358147621, "epoch": 3.389391979301423, "grad_norm": 0.16837802529335022, "learning_rate": 0.0001292132382911085, "loss": 0.231, "mean_token_accuracy": 0.04970394000411034, "num_tokens": 7511728.0, "step": 2620 }, { "entropy": 1.5115429222583772, "epoch": 3.4023285899094438, "grad_norm": 0.5140193700790405, "learning_rate": 0.00012871191459859754, "loss": 1.5844, "mean_token_accuracy": 0.626202804595232, "num_tokens": 7564367.0, "step": 2630 }, { "entropy": 1.33871136456728, "epoch": 3.4152652005174646, "grad_norm": 0.5856406092643738, "learning_rate": 0.00012820980322941506, "loss": 1.2772, "mean_token_accuracy": 0.6828064471483231, "num_tokens": 7596458.0, "step": 2640 }, { "entropy": 1.5606994718313216, "epoch": 3.428201811125485, "grad_norm": 0.7913902401924133, "learning_rate": 0.00012770691795837956, "loss": 1.5388, "mean_token_accuracy": 0.6267461031675339, "num_tokens": 7618937.0, "step": 2650 }, { "entropy": 2.3131509482860566, "epoch": 3.441138421733506, "grad_norm": 0.0, "learning_rate": 0.00012720327258154059, "loss": 1.3789, "mean_token_accuracy": 0.39152705743908883, "num_tokens": 7624946.0, "step": 2660 }, { "entropy": 2.270913216471672, "epoch": 3.4540750323415264, "grad_norm": 0.1674034297466278, "learning_rate": 0.00012669888091580033, "loss": 0.2283, "mean_token_accuracy": 0.05011768788099289, "num_tokens": 7655621.0, "step": 2670 }, { "entropy": 1.5039668411016465, "epoch": 3.4670116429495472, "grad_norm": 0.5039061307907104, "learning_rate": 0.00012619375679853435, "loss": 1.5889, "mean_token_accuracy": 0.6255090057849884, "num_tokens": 7706496.0, "step": 2680 }, { "entropy": 1.299958510696888, "epoch": 3.479948253557568, "grad_norm": 0.6249063611030579, "learning_rate": 0.0001256879140872123, "loss": 1.2262, "mean_token_accuracy": 0.6930169105529785, "num_tokens": 7738457.0, "step": 2690 }, { "entropy": 1.5891169756650925, "epoch": 3.4928848641655885, "grad_norm": 0.7654421925544739, "learning_rate": 0.00012518136665901755, "loss": 1.5485, "mean_token_accuracy": 0.6236635655164718, "num_tokens": 7760759.0, "step": 2700 }, { "epoch": 3.4928848641655885, "eval_entropy": 1.7460197186054185, "eval_loss": 1.3263978958129883, "eval_mean_token_accuracy": 0.45740372557626213, "eval_num_tokens": 7760759.0, "eval_runtime": 244.9238, "eval_samples_per_second": 22.44, "eval_steps_per_second": 1.405, "step": 2700 }, { "entropy": 2.4236282050609588, "epoch": 3.5058214747736094, "grad_norm": 0.0, "learning_rate": 0.00012467412841046644, "loss": 1.3685, "mean_token_accuracy": 0.38023146614432335, "num_tokens": 7766609.0, "step": 2710 }, { "entropy": 2.481502190232277, "epoch": 3.51875808538163, "grad_norm": 0.18167299032211304, "learning_rate": 0.00012416621325702723, "loss": 0.2353, "mean_token_accuracy": 0.049381527304649356, "num_tokens": 7796963.0, "step": 2720 }, { "entropy": 1.526540043950081, "epoch": 3.5316946959896507, "grad_norm": 0.5063906908035278, "learning_rate": 0.00012365763513273826, "loss": 1.6301, "mean_token_accuracy": 0.6226166233420372, "num_tokens": 7851436.0, "step": 2730 }, { "entropy": 1.3451905250549316, "epoch": 3.5446313065976716, "grad_norm": 0.591876208782196, "learning_rate": 0.0001231484079898255, "loss": 1.2804, "mean_token_accuracy": 0.6807183653116227, "num_tokens": 7883623.0, "step": 2740 }, { "entropy": 1.6224838614463806, "epoch": 3.557567917205692, "grad_norm": 0.8054526448249817, "learning_rate": 0.00012263854579832022, "loss": 1.5855, "mean_token_accuracy": 0.6138912171125412, "num_tokens": 7906065.0, "step": 2750 }, { "entropy": 2.2193833112716677, "epoch": 3.570504527813713, "grad_norm": 0.0, "learning_rate": 0.00012212806254567526, "loss": 1.3055, "mean_token_accuracy": 0.388429357111454, "num_tokens": 7911950.0, "step": 2760 }, { "entropy": 1.9380589336156846, "epoch": 3.5834411384217333, "grad_norm": 0.15811856091022491, "learning_rate": 0.00012161697223638162, "loss": 0.2486, "mean_token_accuracy": 0.048336771130561826, "num_tokens": 7944772.0, "step": 2770 }, { "entropy": 1.5291394203901292, "epoch": 3.596377749029754, "grad_norm": 0.5478163361549377, "learning_rate": 0.00012110528889158421, "loss": 1.6201, "mean_token_accuracy": 0.6210859633982182, "num_tokens": 7998744.0, "step": 2780 }, { "entropy": 1.3308267042040824, "epoch": 3.609314359637775, "grad_norm": 0.6494978070259094, "learning_rate": 0.00012059302654869707, "loss": 1.2747, "mean_token_accuracy": 0.6828291460871696, "num_tokens": 8030628.0, "step": 2790 }, { "entropy": 1.6048484414815902, "epoch": 3.6222509702457955, "grad_norm": 0.8232805132865906, "learning_rate": 0.00012008019926101837, "loss": 1.5858, "mean_token_accuracy": 0.614265987277031, "num_tokens": 8052959.0, "step": 2800 }, { "entropy": 2.457938811182976, "epoch": 3.6351875808538163, "grad_norm": 0.0, "learning_rate": 0.00011956682109734485, "loss": 1.3734, "mean_token_accuracy": 0.37425210550427435, "num_tokens": 8058605.0, "step": 2810 }, { "entropy": 2.780105286836624, "epoch": 3.6481241914618368, "grad_norm": 0.15952081978321075, "learning_rate": 0.0001190529061415859, "loss": 0.2238, "mean_token_accuracy": 0.0499541737139225, "num_tokens": 8088439.0, "step": 2820 }, { "entropy": 1.4993865296244622, "epoch": 3.6610608020698576, "grad_norm": 0.4854850769042969, "learning_rate": 0.0001185384684923772, "loss": 1.5841, "mean_token_accuracy": 0.6286533363163471, "num_tokens": 8140599.0, "step": 2830 }, { "entropy": 1.3472731560468674, "epoch": 3.6739974126778785, "grad_norm": 0.6306962966918945, "learning_rate": 0.00011802352226269375, "loss": 1.292, "mean_token_accuracy": 0.6775945991277694, "num_tokens": 8172688.0, "step": 2840 }, { "entropy": 1.5441134572029114, "epoch": 3.6869340232858994, "grad_norm": 0.8373256325721741, "learning_rate": 0.00011750808157946291, "loss": 1.5236, "mean_token_accuracy": 0.6226452678442002, "num_tokens": 8195667.0, "step": 2850 }, { "epoch": 3.6869340232858994, "eval_entropy": 2.019692697324032, "eval_loss": 1.3088935613632202, "eval_mean_token_accuracy": 0.45852816875937374, "eval_num_tokens": 8195667.0, "eval_runtime": 247.8075, "eval_samples_per_second": 22.179, "eval_steps_per_second": 1.388, "step": 2850 }, { "entropy": 2.331311251223087, "epoch": 3.69987063389392, "grad_norm": 0.0, "learning_rate": 0.00011699216058317686, "loss": 1.4345, "mean_token_accuracy": 0.42385049238801004, "num_tokens": 8202061.0, "step": 2860 }, { "entropy": 1.3996504232287408, "epoch": 3.71280724450194, "grad_norm": 0.16637884080410004, "learning_rate": 0.00011647577342750447, "loss": 0.232, "mean_token_accuracy": 0.05035848617553711, "num_tokens": 8229320.0, "step": 2870 }, { "entropy": 1.5440905675292016, "epoch": 3.725743855109961, "grad_norm": 0.5046349763870239, "learning_rate": 0.00011595893427890316, "loss": 1.6135, "mean_token_accuracy": 0.6227852456271649, "num_tokens": 8282159.0, "step": 2880 }, { "entropy": 1.313097244501114, "epoch": 3.738680465717982, "grad_norm": 0.6280332803726196, "learning_rate": 0.00011544165731623029, "loss": 1.283, "mean_token_accuracy": 0.6847794458270073, "num_tokens": 8314583.0, "step": 2890 }, { "entropy": 1.5734279870986938, "epoch": 3.751617076326003, "grad_norm": 0.8147013187408447, "learning_rate": 0.00011492395673035401, "loss": 1.5372, "mean_token_accuracy": 0.6240187495946884, "num_tokens": 8337156.0, "step": 2900 }, { "entropy": 1.903187246620655, "epoch": 3.7645536869340233, "grad_norm": 0.0, "learning_rate": 0.00011440584672376418, "loss": 1.3835, "mean_token_accuracy": 0.3674991957843304, "num_tokens": 8343309.0, "step": 2910 }, { "entropy": 1.1613501474261283, "epoch": 3.777490297542044, "grad_norm": 0.16990479826927185, "learning_rate": 0.00011388734151018252, "loss": 0.2192, "mean_token_accuracy": 0.050329743325710295, "num_tokens": 8374198.0, "step": 2920 }, { "entropy": 1.5224060222506524, "epoch": 3.7904269081500646, "grad_norm": 0.5338153839111328, "learning_rate": 0.00011336845531417286, "loss": 1.6167, "mean_token_accuracy": 0.6217537559568882, "num_tokens": 8426906.0, "step": 2930 }, { "entropy": 1.3422169074416161, "epoch": 3.8033635187580854, "grad_norm": 0.6484615802764893, "learning_rate": 0.00011284920237075076, "loss": 1.2771, "mean_token_accuracy": 0.6828199326992035, "num_tokens": 8458929.0, "step": 2940 }, { "entropy": 1.5778010010719299, "epoch": 3.8163001293661063, "grad_norm": 0.8282558917999268, "learning_rate": 0.00011232959692499308, "loss": 1.5224, "mean_token_accuracy": 0.6264667376875878, "num_tokens": 8481613.0, "step": 2950 }, { "entropy": 2.231258991360664, "epoch": 3.8292367399741267, "grad_norm": 0.0, "learning_rate": 0.00011180965323164719, "loss": 1.3715, "mean_token_accuracy": 0.4014947086572647, "num_tokens": 8487887.0, "step": 2960 }, { "entropy": 2.2951877444982527, "epoch": 3.8421733505821476, "grad_norm": 0.16264809668064117, "learning_rate": 0.00011128938555473976, "loss": 0.242, "mean_token_accuracy": 0.04751046672463417, "num_tokens": 8522204.0, "step": 2970 }, { "entropy": 1.505036623775959, "epoch": 3.855109961190168, "grad_norm": 0.5537543892860413, "learning_rate": 0.00011076880816718569, "loss": 1.5994, "mean_token_accuracy": 0.6235061697661877, "num_tokens": 8576399.0, "step": 2980 }, { "entropy": 1.306050930917263, "epoch": 3.868046571798189, "grad_norm": 0.6618802547454834, "learning_rate": 0.00011024793535039634, "loss": 1.2665, "mean_token_accuracy": 0.6823444902896881, "num_tokens": 8607791.0, "step": 2990 }, { "entropy": 1.5978755921125412, "epoch": 3.8809831824062098, "grad_norm": 0.756771445274353, "learning_rate": 0.00010972678139388784, "loss": 1.5231, "mean_token_accuracy": 0.6199123159050941, "num_tokens": 8629942.0, "step": 3000 }, { "epoch": 3.8809831824062098, "eval_entropy": 1.7341382033949675, "eval_loss": 1.2953605651855469, "eval_mean_token_accuracy": 0.4613482361269552, "eval_num_tokens": 8629942.0, "eval_runtime": 243.363, "eval_samples_per_second": 22.584, "eval_steps_per_second": 1.414, "step": 3000 }, { "entropy": 1.9563438802957536, "epoch": 3.89391979301423, "grad_norm": 0.0, "learning_rate": 0.00010920536059488904, "loss": 1.2245, "mean_token_accuracy": 0.35897522792220116, "num_tokens": 8635069.0, "step": 3010 }, { "entropy": 0.9117880932986736, "epoch": 3.906856403622251, "grad_norm": 0.16995865106582642, "learning_rate": 0.00010868368725794928, "loss": 0.2219, "mean_token_accuracy": 0.050884007662534717, "num_tokens": 8661156.0, "step": 3020 }, { "entropy": 1.5383384585380555, "epoch": 3.9197930142302715, "grad_norm": 0.5345892310142517, "learning_rate": 0.000108161775694546, "loss": 1.6123, "mean_token_accuracy": 0.6229903392493725, "num_tokens": 8713506.0, "step": 3030 }, { "entropy": 1.2795201033353805, "epoch": 3.9327296248382924, "grad_norm": 0.682775616645813, "learning_rate": 0.00010763964022269213, "loss": 1.2389, "mean_token_accuracy": 0.6921025589108467, "num_tokens": 8745762.0, "step": 3040 }, { "entropy": 1.585690438747406, "epoch": 3.9456662354463132, "grad_norm": 0.7901929616928101, "learning_rate": 0.00010711729516654311, "loss": 1.5575, "mean_token_accuracy": 0.6214944392442703, "num_tokens": 8768560.0, "step": 3050 }, { "entropy": 2.0845181226730345, "epoch": 3.9586028460543337, "grad_norm": 0.0, "learning_rate": 0.00010659475485600423, "loss": 1.4895, "mean_token_accuracy": 0.39826231375336646, "num_tokens": 8775063.0, "step": 3060 }, { "entropy": 2.2135625928640366, "epoch": 3.9715394566623545, "grad_norm": 0.212826207280159, "learning_rate": 0.00010607203362633728, "loss": 0.2226, "mean_token_accuracy": 0.051099646091461184, "num_tokens": 8793192.0, "step": 3070 }, { "entropy": 1.4032258987426758, "epoch": 3.984476067270375, "grad_norm": 0.6924927830696106, "learning_rate": 0.00010554914581776738, "loss": 1.4474, "mean_token_accuracy": 0.6517833903431892, "num_tokens": 8831113.0, "step": 3080 }, { "entropy": 2.1208325177431107, "epoch": 3.997412677878396, "grad_norm": 0.0, "learning_rate": 0.00010502610577508949, "loss": 1.1819, "mean_token_accuracy": 0.38025794699788096, "num_tokens": 8840822.0, "step": 3090 }, { "entropy": 1.8567550331354141, "epoch": 4.010349288486417, "grad_norm": 0.5068947076797485, "learning_rate": 0.00010450292784727496, "loss": 1.3687, "mean_token_accuracy": 0.48387093394994735, "num_tokens": 8907582.0, "step": 3100 }, { "entropy": 1.1507928803563119, "epoch": 4.023285899094438, "grad_norm": 0.6847311854362488, "learning_rate": 0.00010397962638707783, "loss": 1.129, "mean_token_accuracy": 0.7149621859192848, "num_tokens": 8942268.0, "step": 3110 }, { "entropy": 1.3405901521444321, "epoch": 4.0362225097024576, "grad_norm": 0.8465374112129211, "learning_rate": 0.00010345621575064117, "loss": 1.3204, "mean_token_accuracy": 0.6661748513579369, "num_tokens": 8967621.0, "step": 3120 }, { "entropy": 1.9997529834508896, "epoch": 4.049159120310478, "grad_norm": 1.2902584075927734, "learning_rate": 0.00010293271029710307, "loss": 1.7005, "mean_token_accuracy": 0.5859146490693092, "num_tokens": 8978783.0, "step": 3130 }, { "entropy": 2.575493034720421, "epoch": 4.062095730918499, "grad_norm": 0.0, "learning_rate": 0.00010240912438820289, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 8979423.0, "step": 3140 }, { "entropy": 1.8354697600007057, "epoch": 4.07503234152652, "grad_norm": 0.6097379326820374, "learning_rate": 0.00010188547238788713, "loss": 1.3617, "mean_token_accuracy": 0.4855068750679493, "num_tokens": 9049300.0, "step": 3150 }, { "epoch": 4.07503234152652, "eval_entropy": 1.8855025125450866, "eval_loss": 1.301902413368225, "eval_mean_token_accuracy": 0.46030220640606656, "eval_num_tokens": 9049300.0, "eval_runtime": 243.8279, "eval_samples_per_second": 22.54, "eval_steps_per_second": 1.411, "step": 3150 }, { "entropy": 1.140310089290142, "epoch": 4.087968952134541, "grad_norm": 0.6553735136985779, "learning_rate": 0.00010136176866191548, "loss": 1.109, "mean_token_accuracy": 0.7216179341077804, "num_tokens": 9083874.0, "step": 3160 }, { "entropy": 1.3620821744203568, "epoch": 4.100905562742561, "grad_norm": 0.9848551154136658, "learning_rate": 0.00010083802757746668, "loss": 1.2997, "mean_token_accuracy": 0.6707961618900299, "num_tokens": 9108826.0, "step": 3170 }, { "entropy": 2.078350791335106, "epoch": 4.113842173350582, "grad_norm": 0.9935686588287354, "learning_rate": 0.0001003142635027442, "loss": 1.6088, "mean_token_accuracy": 0.5507442288100719, "num_tokens": 9118696.0, "step": 3180 }, { "entropy": 1.528096930682659, "epoch": 4.126778783958603, "grad_norm": 0.0, "learning_rate": 9.979049080658242e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 9119336.0, "step": 3190 }, { "entropy": 1.4985127076506615, "epoch": 4.139715394566624, "grad_norm": 0.6286259889602661, "learning_rate": 9.926672385805207e-05, "loss": 1.4428, "mean_token_accuracy": 0.46830192804336546, "num_tokens": 9198456.0, "step": 3200 }, { "entropy": 1.1341844990849494, "epoch": 4.1526520051746445, "grad_norm": 0.6682960391044617, "learning_rate": 9.874297702606636e-05, "loss": 1.1144, "mean_token_accuracy": 0.7213881194591523, "num_tokens": 9234104.0, "step": 3210 }, { "entropy": 1.3693108260631561, "epoch": 4.165588615782665, "grad_norm": 0.8303619027137756, "learning_rate": 9.821926467898653e-05, "loss": 1.3216, "mean_token_accuracy": 0.6689239561557769, "num_tokens": 9259921.0, "step": 3220 }, { "entropy": 1.9042235404253005, "epoch": 4.178525226390685, "grad_norm": 1.645528793334961, "learning_rate": 9.769560118422773e-05, "loss": 1.7769, "mean_token_accuracy": 0.5957130216062069, "num_tokens": 9272479.0, "step": 3230 }, { "entropy": 0.9734129890799522, "epoch": 4.191461836998706, "grad_norm": 0.0, "learning_rate": 9.717200090786501e-05, "loss": 0.0492, "mean_token_accuracy": 0.03619047701358795, "num_tokens": 9273156.0, "step": 3240 }, { "entropy": 1.5239285960793496, "epoch": 4.204398447606727, "grad_norm": 0.6020880937576294, "learning_rate": 9.664847821423907e-05, "loss": 1.4046, "mean_token_accuracy": 0.47501309886574744, "num_tokens": 9347748.0, "step": 3250 }, { "entropy": 1.103029479086399, "epoch": 4.217335058214748, "grad_norm": 0.6547256708145142, "learning_rate": 9.612504746556215e-05, "loss": 1.0853, "mean_token_accuracy": 0.722417363524437, "num_tokens": 9382776.0, "step": 3260 }, { "entropy": 1.371236687898636, "epoch": 4.230271668822769, "grad_norm": 0.910345733165741, "learning_rate": 9.560172302152414e-05, "loss": 1.3338, "mean_token_accuracy": 0.6663747102022171, "num_tokens": 9408048.0, "step": 3270 }, { "entropy": 1.8871563643217086, "epoch": 4.243208279430789, "grad_norm": 1.3442589044570923, "learning_rate": 9.507851923889868e-05, "loss": 1.6856, "mean_token_accuracy": 0.5958636343479157, "num_tokens": 9419207.0, "step": 3280 }, { "entropy": 2.1561751127243043, "epoch": 4.25614489003881, "grad_norm": 0.0, "learning_rate": 9.455545047114901e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 9419847.0, "step": 3290 }, { "entropy": 1.766649141907692, "epoch": 4.269081500646831, "grad_norm": 0.6345491409301758, "learning_rate": 9.40325310680346e-05, "loss": 1.3764, "mean_token_accuracy": 0.48196633756160734, "num_tokens": 9491348.0, "step": 3300 }, { "epoch": 4.269081500646831, "eval_entropy": 1.759770261860171, "eval_loss": 1.3021514415740967, "eval_mean_token_accuracy": 0.4654658474894457, "eval_num_tokens": 9491348.0, "eval_runtime": 243.8603, "eval_samples_per_second": 22.537, "eval_steps_per_second": 1.411, "step": 3300 }, { "entropy": 1.0932901889085769, "epoch": 4.282018111254851, "grad_norm": 0.6778357028961182, "learning_rate": 9.350977537521717e-05, "loss": 1.0699, "mean_token_accuracy": 0.7278983518481255, "num_tokens": 9526419.0, "step": 3310 }, { "entropy": 1.3789748430252076, "epoch": 4.294954721862872, "grad_norm": 0.8899635672569275, "learning_rate": 9.298719773386724e-05, "loss": 1.3351, "mean_token_accuracy": 0.6661961570382118, "num_tokens": 9551892.0, "step": 3320 }, { "entropy": 1.957590714097023, "epoch": 4.307891332470892, "grad_norm": 1.470860481262207, "learning_rate": 9.246481248027077e-05, "loss": 1.7173, "mean_token_accuracy": 0.5974891498684883, "num_tokens": 9563515.0, "step": 3330 }, { "entropy": 2.714459627866745, "epoch": 4.320827943078913, "grad_norm": 0.0, "learning_rate": 9.194263394543575e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 9564155.0, "step": 3340 }, { "entropy": 1.8973265826702117, "epoch": 4.333764553686934, "grad_norm": 0.6255518198013306, "learning_rate": 9.14206764546991e-05, "loss": 1.4331, "mean_token_accuracy": 0.47237296029925346, "num_tokens": 9638156.0, "step": 3350 }, { "entropy": 1.113997830450535, "epoch": 4.346701164294955, "grad_norm": 0.6197985410690308, "learning_rate": 9.089895432733364e-05, "loss": 1.1138, "mean_token_accuracy": 0.7213677644729615, "num_tokens": 9674105.0, "step": 3360 }, { "entropy": 1.355890506505966, "epoch": 4.359637774902976, "grad_norm": 0.8531930446624756, "learning_rate": 9.037748187615538e-05, "loss": 1.3064, "mean_token_accuracy": 0.6726941719651223, "num_tokens": 9700126.0, "step": 3370 }, { "entropy": 1.9791965007781982, "epoch": 4.372574385510996, "grad_norm": 1.7110706567764282, "learning_rate": 8.985627340713061e-05, "loss": 1.6769, "mean_token_accuracy": 0.5642684459686279, "num_tokens": 9711816.0, "step": 3380 }, { "entropy": 3.160873770713806, "epoch": 4.385510996119017, "grad_norm": 0.0, "learning_rate": 8.933534321898367e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 9712456.0, "step": 3390 }, { "entropy": 2.013157232105732, "epoch": 4.3984476067270375, "grad_norm": 0.6276950240135193, "learning_rate": 8.881470560280465e-05, "loss": 1.4395, "mean_token_accuracy": 0.4699708536267281, "num_tokens": 9789047.0, "step": 3400 }, { "entropy": 1.0582531332969665, "epoch": 4.411384217335058, "grad_norm": 0.6762167811393738, "learning_rate": 8.829437484165718e-05, "loss": 1.0539, "mean_token_accuracy": 0.7299133688211441, "num_tokens": 9824536.0, "step": 3410 }, { "entropy": 1.3210778176784514, "epoch": 4.424320827943079, "grad_norm": 0.8756985664367676, "learning_rate": 8.777436521018676e-05, "loss": 1.2846, "mean_token_accuracy": 0.6797921255230903, "num_tokens": 9850555.0, "step": 3420 }, { "entropy": 1.8927232474088669, "epoch": 4.437257438551099, "grad_norm": 1.5375664234161377, "learning_rate": 8.725469097422912e-05, "loss": 1.7705, "mean_token_accuracy": 0.5886133186519146, "num_tokens": 9863603.0, "step": 3430 }, { "entropy": 2.54144030213356, "epoch": 4.45019404915912, "grad_norm": 0.0, "learning_rate": 8.673536639041864e-05, "loss": 0.0476, "mean_token_accuracy": 0.04354838728904724, "num_tokens": 9864278.0, "step": 3440 }, { "entropy": 1.6926740244030953, "epoch": 4.463130659767141, "grad_norm": 0.639385461807251, "learning_rate": 8.621640570579764e-05, "loss": 1.2832, "mean_token_accuracy": 0.502137529104948, "num_tokens": 9929876.0, "step": 3450 }, { "epoch": 4.463130659767141, "eval_entropy": 1.6399936731471572, "eval_loss": 1.2823114395141602, "eval_mean_token_accuracy": 0.4697489900471166, "eval_num_tokens": 9929876.0, "eval_runtime": 242.6114, "eval_samples_per_second": 22.654, "eval_steps_per_second": 1.418, "step": 3450 }, { "entropy": 1.0890112176537514, "epoch": 4.476067270375162, "grad_norm": 0.6899943351745605, "learning_rate": 8.56978231574252e-05, "loss": 1.0627, "mean_token_accuracy": 0.7313546255230904, "num_tokens": 9964211.0, "step": 3460 }, { "entropy": 1.3737705022096633, "epoch": 4.489003880983183, "grad_norm": 0.9175981879234314, "learning_rate": 8.517963297198672e-05, "loss": 1.3508, "mean_token_accuracy": 0.6623948410153389, "num_tokens": 9989036.0, "step": 3470 }, { "entropy": 1.8537749290466308, "epoch": 4.501940491591203, "grad_norm": 1.1406779289245605, "learning_rate": 8.466184936540351e-05, "loss": 1.6469, "mean_token_accuracy": 0.590015722811222, "num_tokens": 9999994.0, "step": 3480 }, { "entropy": 1.9705951809883118, "epoch": 4.514877102199224, "grad_norm": 0.0, "learning_rate": 8.414448654244297e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 10000634.0, "step": 3490 }, { "entropy": 1.7736740306019783, "epoch": 4.527813712807244, "grad_norm": 0.5741596817970276, "learning_rate": 8.362755869632883e-05, "loss": 1.418, "mean_token_accuracy": 0.4736007325351238, "num_tokens": 10069782.0, "step": 3500 }, { "entropy": 1.1099611327052117, "epoch": 4.540750323415265, "grad_norm": 0.6997600793838501, "learning_rate": 8.311108000835167e-05, "loss": 1.1002, "mean_token_accuracy": 0.7222409531474113, "num_tokens": 10105051.0, "step": 3510 }, { "entropy": 1.3370502710342407, "epoch": 4.553686934023286, "grad_norm": 0.9216951131820679, "learning_rate": 8.259506464747999e-05, "loss": 1.2856, "mean_token_accuracy": 0.6742190420627594, "num_tokens": 10129844.0, "step": 3520 }, { "entropy": 2.0127808332443236, "epoch": 4.566623544631307, "grad_norm": 1.644737958908081, "learning_rate": 8.207952676997153e-05, "loss": 1.7374, "mean_token_accuracy": 0.5706604786217213, "num_tokens": 10140891.0, "step": 3530 }, { "entropy": 2.3392362356185914, "epoch": 4.579560155239327, "grad_norm": 0.0, "learning_rate": 8.156448051898476e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 10141531.0, "step": 3540 }, { "entropy": 1.7776501581072808, "epoch": 4.592496765847348, "grad_norm": 0.6358464956283569, "learning_rate": 8.1049940024191e-05, "loss": 1.4156, "mean_token_accuracy": 0.47597954645752905, "num_tokens": 10208071.0, "step": 3550 }, { "entropy": 1.103192213177681, "epoch": 4.605433376455369, "grad_norm": 0.6968359351158142, "learning_rate": 8.053591940138686e-05, "loss": 1.096, "mean_token_accuracy": 0.7267577677965165, "num_tokens": 10242851.0, "step": 3560 }, { "entropy": 1.3612541019916535, "epoch": 4.61836998706339, "grad_norm": 0.9655300974845886, "learning_rate": 8.002243275210669e-05, "loss": 1.3057, "mean_token_accuracy": 0.672816789150238, "num_tokens": 10268178.0, "step": 3570 }, { "entropy": 1.932911714911461, "epoch": 4.63130659767141, "grad_norm": 1.2096027135849, "learning_rate": 7.950949416323612e-05, "loss": 1.7086, "mean_token_accuracy": 0.612860233336687, "num_tokens": 10279495.0, "step": 3580 }, { "entropy": 1.9618256837129593, "epoch": 4.6442432082794305, "grad_norm": 0.0, "learning_rate": 7.899711770662532e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 10280135.0, "step": 3590 }, { "entropy": 1.6968649536371232, "epoch": 4.657179818887451, "grad_norm": 0.6373590230941772, "learning_rate": 7.848531743870297e-05, "loss": 1.3993, "mean_token_accuracy": 0.4759638875722885, "num_tokens": 10346462.0, "step": 3600 }, { "epoch": 4.657179818887451, "eval_entropy": 1.577659371980401, "eval_loss": 1.273931622505188, "eval_mean_token_accuracy": 0.4723818853150967, "eval_num_tokens": 10346462.0, "eval_runtime": 245.6574, "eval_samples_per_second": 22.373, "eval_steps_per_second": 1.4, "step": 3600 }, { "entropy": 1.093438169360161, "epoch": 4.670116429495472, "grad_norm": 0.7240473628044128, "learning_rate": 7.797410740009084e-05, "loss": 1.0745, "mean_token_accuracy": 0.7309321075677871, "num_tokens": 10381489.0, "step": 3610 }, { "entropy": 1.37732635140419, "epoch": 4.683053040103493, "grad_norm": 0.9580934047698975, "learning_rate": 7.746350161521845e-05, "loss": 1.336, "mean_token_accuracy": 0.6637881115078926, "num_tokens": 10406795.0, "step": 3620 }, { "entropy": 1.9799594402313232, "epoch": 4.695989650711514, "grad_norm": 1.5260565280914307, "learning_rate": 7.695351409193823e-05, "loss": 1.7859, "mean_token_accuracy": 0.5888419583439827, "num_tokens": 10418685.0, "step": 3630 }, { "entropy": 1.8445574283599853, "epoch": 4.708926261319534, "grad_norm": 0.0, "learning_rate": 7.644415882114145e-05, "loss": 0.0354, "mean_token_accuracy": 0.04375, "num_tokens": 10419355.0, "step": 3640 }, { "entropy": 1.724594485759735, "epoch": 4.721862871927555, "grad_norm": 0.5997304320335388, "learning_rate": 7.593544977637436e-05, "loss": 1.4375, "mean_token_accuracy": 0.4693992160260677, "num_tokens": 10485312.0, "step": 3650 }, { "entropy": 1.079079033434391, "epoch": 4.734799482535576, "grad_norm": 0.6873499155044556, "learning_rate": 7.54274009134546e-05, "loss": 1.0708, "mean_token_accuracy": 0.7280381500720978, "num_tokens": 10520582.0, "step": 3660 }, { "entropy": 1.315394550561905, "epoch": 4.747736093143597, "grad_norm": 0.8612226843833923, "learning_rate": 7.492002617008866e-05, "loss": 1.2891, "mean_token_accuracy": 0.6757827803492547, "num_tokens": 10545966.0, "step": 3670 }, { "entropy": 1.840933558344841, "epoch": 4.760672703751617, "grad_norm": 0.7735125422477722, "learning_rate": 7.441333946548939e-05, "loss": 1.575, "mean_token_accuracy": 0.5655414000153541, "num_tokens": 10557080.0, "step": 3680 }, { "entropy": 1.232702499628067, "epoch": 4.773609314359637, "grad_norm": 0.0, "learning_rate": 7.390735469999398e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 10557720.0, "step": 3690 }, { "entropy": 1.5656249672174454, "epoch": 4.786545924967658, "grad_norm": 0.6145333051681519, "learning_rate": 7.340208575468291e-05, "loss": 1.4603, "mean_token_accuracy": 0.46657404825091364, "num_tokens": 10627563.0, "step": 3700 }, { "entropy": 1.0934513494372369, "epoch": 4.799482535575679, "grad_norm": 0.7226387858390808, "learning_rate": 7.289754649099897e-05, "loss": 1.0786, "mean_token_accuracy": 0.7299003899097443, "num_tokens": 10662880.0, "step": 3710 }, { "entropy": 1.3585843235254287, "epoch": 4.8124191461837, "grad_norm": 0.8521022796630859, "learning_rate": 7.239375075036697e-05, "loss": 1.3144, "mean_token_accuracy": 0.6705298006534577, "num_tokens": 10688600.0, "step": 3720 }, { "entropy": 1.8722685337066651, "epoch": 4.825355756791721, "grad_norm": 1.371882677078247, "learning_rate": 7.189071235381406e-05, "loss": 1.7141, "mean_token_accuracy": 0.604588358104229, "num_tokens": 10700334.0, "step": 3730 }, { "entropy": 1.860415416955948, "epoch": 4.838292367399741, "grad_norm": 0.0, "learning_rate": 7.138844510159069e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 10700974.0, "step": 3740 }, { "entropy": 1.68975418061018, "epoch": 4.851228978007762, "grad_norm": 0.6484793424606323, "learning_rate": 7.088696277279175e-05, "loss": 1.3382, "mean_token_accuracy": 0.4887751266360283, "num_tokens": 10771692.0, "step": 3750 }, { "epoch": 4.851228978007762, "eval_entropy": 1.7122522977202437, "eval_loss": 1.2648330926895142, "eval_mean_token_accuracy": 0.47577540377198263, "eval_num_tokens": 10771692.0, "eval_runtime": 244.9784, "eval_samples_per_second": 22.435, "eval_steps_per_second": 1.404, "step": 3750 }, { "entropy": 1.1040325671434403, "epoch": 4.864165588615783, "grad_norm": 0.7224993705749512, "learning_rate": 7.038627912497873e-05, "loss": 1.0872, "mean_token_accuracy": 0.7262751698493958, "num_tokens": 10806575.0, "step": 3760 }, { "entropy": 1.3863080263137817, "epoch": 4.8771021992238035, "grad_norm": 0.9205716252326965, "learning_rate": 6.988640789380241e-05, "loss": 1.3415, "mean_token_accuracy": 0.6670658677816391, "num_tokens": 10831607.0, "step": 3770 }, { "entropy": 1.986344888806343, "epoch": 4.890038809831824, "grad_norm": 1.2501696348190308, "learning_rate": 6.938736279262567e-05, "loss": 1.5931, "mean_token_accuracy": 0.5594463273882866, "num_tokens": 10842477.0, "step": 3780 }, { "entropy": 2.6916876256465914, "epoch": 4.902975420439844, "grad_norm": 0.0, "learning_rate": 6.888915751214774e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 10843117.0, "step": 3790 }, { "entropy": 1.8490806862711906, "epoch": 4.915912031047865, "grad_norm": 0.6139810085296631, "learning_rate": 6.83918057200283e-05, "loss": 1.3791, "mean_token_accuracy": 0.4822954162955284, "num_tokens": 10917288.0, "step": 3800 }, { "entropy": 1.063162423670292, "epoch": 4.928848641655886, "grad_norm": 0.7340760231018066, "learning_rate": 6.789532106051246e-05, "loss": 1.0523, "mean_token_accuracy": 0.7331129983067513, "num_tokens": 10952906.0, "step": 3810 }, { "entropy": 1.348393714427948, "epoch": 4.941785252263907, "grad_norm": 0.979292094707489, "learning_rate": 6.739971715405684e-05, "loss": 1.3057, "mean_token_accuracy": 0.6723238781094552, "num_tokens": 10978606.0, "step": 3820 }, { "entropy": 1.887803316116333, "epoch": 4.954721862871928, "grad_norm": 1.4358190298080444, "learning_rate": 6.690500759695557e-05, "loss": 1.6779, "mean_token_accuracy": 0.6134289026260376, "num_tokens": 10990333.0, "step": 3830 }, { "entropy": 2.7988963067531585, "epoch": 4.967658473479949, "grad_norm": 0.0, "learning_rate": 6.641120596096729e-05, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 10990973.0, "step": 3840 }, { "entropy": 1.5671290338039399, "epoch": 4.980595084087969, "grad_norm": 0.697485625743866, "learning_rate": 6.591832579294303e-05, "loss": 1.0782, "mean_token_accuracy": 0.5383081540465355, "num_tokens": 11034414.0, "step": 3850 }, { "entropy": 1.7319936901330948, "epoch": 4.99353169469599, "grad_norm": 0.0, "learning_rate": 6.542638061445447e-05, "loss": 1.3846, "mean_token_accuracy": 0.5769012212753296, "num_tokens": 11050864.0, "step": 3860 }, { "entropy": 2.4691727608442307, "epoch": 5.00646830530401, "grad_norm": 0.49155521392822266, "learning_rate": 6.493538392142287e-05, "loss": 1.0145, "mean_token_accuracy": 0.26990386173129083, "num_tokens": 11109874.0, "step": 3870 }, { "entropy": 1.075531531870365, "epoch": 5.019404915912031, "grad_norm": 0.7045453190803528, "learning_rate": 6.444534918374906e-05, "loss": 1.0364, "mean_token_accuracy": 0.7393457636237144, "num_tokens": 11148394.0, "step": 3880 }, { "entropy": 1.1883759826421738, "epoch": 5.032341526520052, "grad_norm": 0.9995729327201843, "learning_rate": 6.395628984494378e-05, "loss": 1.2028, "mean_token_accuracy": 0.6972079753875733, "num_tokens": 11176092.0, "step": 3890 }, { "entropy": 1.7173998385667801, "epoch": 5.045278137128072, "grad_norm": 1.125909686088562, "learning_rate": 6.346821932175873e-05, "loss": 1.5967, "mean_token_accuracy": 0.6245104640722274, "num_tokens": 11192285.0, "step": 3900 }, { "epoch": 5.045278137128072, "eval_entropy": 1.9103823839578518, "eval_loss": 1.2630141973495483, "eval_mean_token_accuracy": 0.4754580475043419, "eval_num_tokens": 11192285.0, "eval_runtime": 244.3056, "eval_samples_per_second": 22.496, "eval_steps_per_second": 1.408, "step": 3900 }, { "entropy": 2.6084256947040556, "epoch": 5.058214747736093, "grad_norm": 0.0, "learning_rate": 6.298115100381882e-05, "loss": 0.3442, "mean_token_accuracy": 0.16731906533241273, "num_tokens": 11193644.0, "step": 3910 }, { "entropy": 2.282520645856857, "epoch": 5.071151358344114, "grad_norm": 0.5869239568710327, "learning_rate": 6.249509825325467e-05, "loss": 0.9511, "mean_token_accuracy": 0.28290636241436007, "num_tokens": 11249840.0, "step": 3920 }, { "entropy": 1.016249306499958, "epoch": 5.084087968952135, "grad_norm": 0.7197193503379822, "learning_rate": 6.201007440433588e-05, "loss": 1.007, "mean_token_accuracy": 0.7442266702651977, "num_tokens": 11287639.0, "step": 3930 }, { "entropy": 1.2221685394644737, "epoch": 5.097024579560156, "grad_norm": 0.9447595477104187, "learning_rate": 6.152609276310549e-05, "loss": 1.187, "mean_token_accuracy": 0.7011413544416427, "num_tokens": 11315215.0, "step": 3940 }, { "entropy": 1.6715268433094024, "epoch": 5.109961190168176, "grad_norm": 1.0949913263320923, "learning_rate": 6.104316660701485e-05, "loss": 1.5623, "mean_token_accuracy": 0.6256066203117371, "num_tokens": 11332567.0, "step": 3950 }, { "entropy": 2.496020531654358, "epoch": 5.1228978007761965, "grad_norm": 0.0, "learning_rate": 6.056130918455929e-05, "loss": 0.4602, "mean_token_accuracy": 0.21488995999097824, "num_tokens": 11334364.0, "step": 3960 }, { "entropy": 2.2577121645212173, "epoch": 5.135834411384217, "grad_norm": 0.6211187243461609, "learning_rate": 6.0080533714914766e-05, "loss": 1.0081, "mean_token_accuracy": 0.2705229982733727, "num_tokens": 11391718.0, "step": 3970 }, { "entropy": 1.0153650417923927, "epoch": 5.148771021992238, "grad_norm": 0.649202287197113, "learning_rate": 5.9600853387575163e-05, "loss": 1.0426, "mean_token_accuracy": 0.7383781686425209, "num_tokens": 11430710.0, "step": 3980 }, { "entropy": 1.1217432379722596, "epoch": 5.161707632600259, "grad_norm": 0.9362276792526245, "learning_rate": 5.912228136199038e-05, "loss": 1.0765, "mean_token_accuracy": 0.7234507903456688, "num_tokens": 11459154.0, "step": 3990 }, { "entropy": 1.6653785824775695, "epoch": 5.174644243208279, "grad_norm": 1.2307344675064087, "learning_rate": 5.864483076720555e-05, "loss": 1.5669, "mean_token_accuracy": 0.6285063222050666, "num_tokens": 11476268.0, "step": 4000 }, { "entropy": 2.082801552116871, "epoch": 5.1875808538163, "grad_norm": 0.0, "learning_rate": 5.81685147015006e-05, "loss": 0.3513, "mean_token_accuracy": 0.1956300586462021, "num_tokens": 11477779.0, "step": 4010 }, { "entropy": 2.0466490238904953, "epoch": 5.200517464424321, "grad_norm": 0.5699072480201721, "learning_rate": 5.769334623203095e-05, "loss": 0.9736, "mean_token_accuracy": 0.27822155207395555, "num_tokens": 11531993.0, "step": 4020 }, { "entropy": 1.0089649006724357, "epoch": 5.213454075032342, "grad_norm": 0.6833609938621521, "learning_rate": 5.7219338394469356e-05, "loss": 1.0355, "mean_token_accuracy": 0.7415396451950074, "num_tokens": 11570430.0, "step": 4030 }, { "entropy": 1.1602358788251876, "epoch": 5.226390685640363, "grad_norm": 0.933566153049469, "learning_rate": 5.674650419264782e-05, "loss": 1.1016, "mean_token_accuracy": 0.7183712035417557, "num_tokens": 11598642.0, "step": 4040 }, { "entropy": 1.6275397926568984, "epoch": 5.239327296248383, "grad_norm": 1.2435181140899658, "learning_rate": 5.6274856598201066e-05, "loss": 1.5472, "mean_token_accuracy": 0.6266872644424438, "num_tokens": 11615900.0, "step": 4050 }, { "epoch": 5.239327296248383, "eval_entropy": 1.7370388171700544, "eval_loss": 1.2589974403381348, "eval_mean_token_accuracy": 0.4773535789965197, "eval_num_tokens": 11615900.0, "eval_runtime": 242.7915, "eval_samples_per_second": 22.637, "eval_steps_per_second": 1.417, "step": 4050 }, { "entropy": 2.3815665364265444, "epoch": 5.252263906856403, "grad_norm": 0.0, "learning_rate": 5.580440855021083e-05, "loss": 0.4649, "mean_token_accuracy": 0.19248609468340874, "num_tokens": 11617642.0, "step": 4060 }, { "entropy": 2.2312158316373827, "epoch": 5.265200517464424, "grad_norm": 0.5702583193778992, "learning_rate": 5.533517295485062e-05, "loss": 0.9829, "mean_token_accuracy": 0.27761168628931043, "num_tokens": 11675101.0, "step": 4070 }, { "entropy": 1.0108808249235153, "epoch": 5.278137128072445, "grad_norm": 0.752931535243988, "learning_rate": 5.486716268503182e-05, "loss": 1.0438, "mean_token_accuracy": 0.7365775972604751, "num_tokens": 11713657.0, "step": 4080 }, { "entropy": 1.2421717032790185, "epoch": 5.291073738680466, "grad_norm": 0.9655210375785828, "learning_rate": 5.440039058005047e-05, "loss": 1.1822, "mean_token_accuracy": 0.7000416114926338, "num_tokens": 11741666.0, "step": 4090 }, { "entropy": 1.6973173677921296, "epoch": 5.304010349288486, "grad_norm": 1.5103716850280762, "learning_rate": 5.393486944523505e-05, "loss": 1.5623, "mean_token_accuracy": 0.6223144173622132, "num_tokens": 11758203.0, "step": 4100 }, { "entropy": 2.1967957854270934, "epoch": 5.316946959896507, "grad_norm": 0.0, "learning_rate": 5.347061205159519e-05, "loss": 0.2983, "mean_token_accuracy": 0.15621012300252915, "num_tokens": 11759461.0, "step": 4110 }, { "entropy": 1.981054452061653, "epoch": 5.329883570504528, "grad_norm": 0.6421746611595154, "learning_rate": 5.3007631135471334e-05, "loss": 0.9895, "mean_token_accuracy": 0.2755612075328827, "num_tokens": 11813613.0, "step": 4120 }, { "entropy": 0.9758897602558136, "epoch": 5.342820181112549, "grad_norm": 0.7207741141319275, "learning_rate": 5.2545939398185284e-05, "loss": 1.0031, "mean_token_accuracy": 0.7456466734409333, "num_tokens": 11852165.0, "step": 4130 }, { "entropy": 1.195047491788864, "epoch": 5.3557567917205695, "grad_norm": 0.9851743578910828, "learning_rate": 5.208554950569178e-05, "loss": 1.1364, "mean_token_accuracy": 0.7128469496965408, "num_tokens": 11880541.0, "step": 4140 }, { "entropy": 1.6736773550510406, "epoch": 5.36869340232859, "grad_norm": 1.2857285737991333, "learning_rate": 5.1626474088231004e-05, "loss": 1.6022, "mean_token_accuracy": 0.6264947578310966, "num_tokens": 11897978.0, "step": 4150 }, { "entropy": 2.3716455429792402, "epoch": 5.38163001293661, "grad_norm": 0.0, "learning_rate": 5.116872573998217e-05, "loss": 0.3852, "mean_token_accuracy": 0.18944832757115365, "num_tokens": 11899460.0, "step": 4160 }, { "entropy": 2.3163172632455824, "epoch": 5.394566623544631, "grad_norm": 0.60521000623703, "learning_rate": 5.071231701871787e-05, "loss": 0.9779, "mean_token_accuracy": 0.27711123302578927, "num_tokens": 11956251.0, "step": 4170 }, { "entropy": 1.026511162519455, "epoch": 5.407503234152652, "grad_norm": 0.7545950412750244, "learning_rate": 5.025726044545968e-05, "loss": 1.0516, "mean_token_accuracy": 0.7328214541077613, "num_tokens": 11995157.0, "step": 4180 }, { "entropy": 1.1451522946357726, "epoch": 5.420439844760673, "grad_norm": 0.9537347555160522, "learning_rate": 4.980356850413472e-05, "loss": 1.1319, "mean_token_accuracy": 0.7138208642601966, "num_tokens": 12023430.0, "step": 4190 }, { "entropy": 1.7249857246875764, "epoch": 5.433376455368693, "grad_norm": 1.279359221458435, "learning_rate": 4.935125364123292e-05, "loss": 1.6072, "mean_token_accuracy": 0.6237147711217403, "num_tokens": 12040024.0, "step": 4200 }, { "epoch": 5.433376455368693, "eval_entropy": 1.8443097567488982, "eval_loss": 1.2536410093307495, "eval_mean_token_accuracy": 0.4748884228079818, "eval_num_tokens": 12040024.0, "eval_runtime": 241.7185, "eval_samples_per_second": 22.737, "eval_steps_per_second": 1.423, "step": 4200 }, { "entropy": 2.7387112855911253, "epoch": 5.446313065976714, "grad_norm": 0.0, "learning_rate": 4.89003282654658e-05, "loss": 0.389, "mean_token_accuracy": 0.1823613777756691, "num_tokens": 12041467.0, "step": 4210 }, { "entropy": 2.44253671169281, "epoch": 5.459249676584735, "grad_norm": 0.5715515613555908, "learning_rate": 4.845080474742608e-05, "loss": 0.9938, "mean_token_accuracy": 0.2730660729110241, "num_tokens": 12103775.0, "step": 4220 }, { "entropy": 1.003270110487938, "epoch": 5.472186287192756, "grad_norm": 0.7785800099372864, "learning_rate": 4.800269541924799e-05, "loss": 1.0184, "mean_token_accuracy": 0.7413052409887314, "num_tokens": 12143014.0, "step": 4230 }, { "entropy": 1.1527703180909157, "epoch": 5.485122897800776, "grad_norm": 0.9831658005714417, "learning_rate": 4.7556012574269395e-05, "loss": 1.1284, "mean_token_accuracy": 0.7102037504315376, "num_tokens": 12171448.0, "step": 4240 }, { "entropy": 1.7090917527675629, "epoch": 5.498059508408797, "grad_norm": 1.4465516805648804, "learning_rate": 4.7110768466694224e-05, "loss": 1.6112, "mean_token_accuracy": 0.6218582183122635, "num_tokens": 12188400.0, "step": 4250 }, { "entropy": 2.560487928986549, "epoch": 5.510996119016817, "grad_norm": 0.0, "learning_rate": 4.666697531125627e-05, "loss": 0.3879, "mean_token_accuracy": 0.16174983084201813, "num_tokens": 12189804.0, "step": 4260 }, { "entropy": 2.277393540740013, "epoch": 5.523932729624838, "grad_norm": 0.5197897553443909, "learning_rate": 4.622464528288443e-05, "loss": 1.027, "mean_token_accuracy": 0.2683463282883167, "num_tokens": 12249572.0, "step": 4270 }, { "entropy": 1.0234995201230048, "epoch": 5.536869340232859, "grad_norm": 0.7546108961105347, "learning_rate": 4.578379051636832e-05, "loss": 1.0282, "mean_token_accuracy": 0.7406062006950378, "num_tokens": 12288484.0, "step": 4280 }, { "entropy": 1.1632590115070343, "epoch": 5.54980595084088, "grad_norm": 1.0032302141189575, "learning_rate": 4.534442310602559e-05, "loss": 1.1404, "mean_token_accuracy": 0.7092833399772644, "num_tokens": 12316357.0, "step": 4290 }, { "entropy": 1.6969922810792923, "epoch": 5.5627425614489, "grad_norm": 1.356163740158081, "learning_rate": 4.490655510537004e-05, "loss": 1.5895, "mean_token_accuracy": 0.6228079289197922, "num_tokens": 12332741.0, "step": 4300 }, { "entropy": 2.227242410182953, "epoch": 5.575679172056921, "grad_norm": 0.0, "learning_rate": 4.447019852678101e-05, "loss": 0.3691, "mean_token_accuracy": 0.18163795471191407, "num_tokens": 12334119.0, "step": 4310 }, { "entropy": 2.0241008371114733, "epoch": 5.588615782664942, "grad_norm": 0.5737898945808411, "learning_rate": 4.40353653411738e-05, "loss": 0.956, "mean_token_accuracy": 0.2796335697174072, "num_tokens": 12386755.0, "step": 4320 }, { "entropy": 1.009289626777172, "epoch": 5.6015523932729625, "grad_norm": 0.7294387221336365, "learning_rate": 4.360206747767122e-05, "loss": 1.032, "mean_token_accuracy": 0.7417484134435653, "num_tokens": 12424985.0, "step": 4330 }, { "entropy": 1.1622566372156142, "epoch": 5.614489003880983, "grad_norm": 1.0234155654907227, "learning_rate": 4.3170316823276424e-05, "loss": 1.1576, "mean_token_accuracy": 0.7061204954981803, "num_tokens": 12452639.0, "step": 4340 }, { "entropy": 1.6782744824886322, "epoch": 5.627425614489004, "grad_norm": 1.4249658584594727, "learning_rate": 4.274012522254674e-05, "loss": 1.5881, "mean_token_accuracy": 0.6237360410392284, "num_tokens": 12469230.0, "step": 4350 }, { "epoch": 5.627425614489004, "eval_entropy": 1.6210007650214573, "eval_loss": 1.241470217704773, "eval_mean_token_accuracy": 0.47642275673705475, "eval_num_tokens": 12469230.0, "eval_runtime": 246.3378, "eval_samples_per_second": 22.311, "eval_steps_per_second": 1.396, "step": 4350 }, { "entropy": 1.9652087688446045, "epoch": 5.640362225097024, "grad_norm": 0.0, "learning_rate": 4.231150447726874e-05, "loss": 0.3533, "mean_token_accuracy": 0.19179367125034333, "num_tokens": 12470690.0, "step": 4360 }, { "entropy": 1.9280417621135713, "epoch": 5.653298835705045, "grad_norm": 0.5711302757263184, "learning_rate": 4.1884466346134466e-05, "loss": 0.9704, "mean_token_accuracy": 0.27944710552692414, "num_tokens": 12525117.0, "step": 4370 }, { "entropy": 1.0357938587665558, "epoch": 5.666235446313066, "grad_norm": 0.6963515877723694, "learning_rate": 4.145902254441888e-05, "loss": 1.0365, "mean_token_accuracy": 0.7398686364293099, "num_tokens": 12563021.0, "step": 4380 }, { "entropy": 1.1490644261240959, "epoch": 5.679172056921087, "grad_norm": 0.9824443459510803, "learning_rate": 4.1035184743658376e-05, "loss": 1.1307, "mean_token_accuracy": 0.7091254457831383, "num_tokens": 12591024.0, "step": 4390 }, { "entropy": 1.68570619225502, "epoch": 5.692108667529108, "grad_norm": 1.2685192823410034, "learning_rate": 4.0612964571330805e-05, "loss": 1.5877, "mean_token_accuracy": 0.6187320709228515, "num_tokens": 12607889.0, "step": 4400 }, { "entropy": 1.995962232351303, "epoch": 5.705045278137128, "grad_norm": 0.0, "learning_rate": 4.019237361053615e-05, "loss": 0.4375, "mean_token_accuracy": 0.1990293502807617, "num_tokens": 12609477.0, "step": 4410 }, { "entropy": 2.0628999888896944, "epoch": 5.717981888745149, "grad_norm": 0.582778811454773, "learning_rate": 3.977342339967902e-05, "loss": 0.9965, "mean_token_accuracy": 0.2732643634080887, "num_tokens": 12668390.0, "step": 4420 }, { "entropy": 1.0030916407704353, "epoch": 5.730918499353169, "grad_norm": 0.7195892930030823, "learning_rate": 3.935612543215216e-05, "loss": 1.0055, "mean_token_accuracy": 0.7438824102282524, "num_tokens": 12707626.0, "step": 4430 }, { "entropy": 1.1245022103190423, "epoch": 5.74385510996119, "grad_norm": 0.9609954357147217, "learning_rate": 3.8940491156020744e-05, "loss": 1.0932, "mean_token_accuracy": 0.7223910227417946, "num_tokens": 12736376.0, "step": 4440 }, { "entropy": 1.653869342803955, "epoch": 5.756791720569211, "grad_norm": 1.3840677738189697, "learning_rate": 3.852653197370885e-05, "loss": 1.5745, "mean_token_accuracy": 0.6224342837929726, "num_tokens": 12753560.0, "step": 4450 }, { "entropy": 2.0997736901044846, "epoch": 5.769728331177232, "grad_norm": 0.0, "learning_rate": 3.811425924168628e-05, "loss": 0.4083, "mean_token_accuracy": 0.17910270839929582, "num_tokens": 12755081.0, "step": 4460 }, { "entropy": 2.0056353509426117, "epoch": 5.782664941785252, "grad_norm": 0.5901302099227905, "learning_rate": 3.770368427015699e-05, "loss": 0.9965, "mean_token_accuracy": 0.2755757987499237, "num_tokens": 12818062.0, "step": 4470 }, { "entropy": 0.9973522603511811, "epoch": 5.795601552393273, "grad_norm": 0.7053154110908508, "learning_rate": 3.729481832274916e-05, "loss": 1.0101, "mean_token_accuracy": 0.7445162117481232, "num_tokens": 12856675.0, "step": 4480 }, { "entropy": 1.158506852388382, "epoch": 5.808538163001294, "grad_norm": 1.0795212984085083, "learning_rate": 3.688767261620578e-05, "loss": 1.1325, "mean_token_accuracy": 0.7126885786652565, "num_tokens": 12884620.0, "step": 4490 }, { "entropy": 1.6880556523799897, "epoch": 5.821474773609315, "grad_norm": 1.5192304849624634, "learning_rate": 3.64822583200772e-05, "loss": 1.5872, "mean_token_accuracy": 0.6223025761544705, "num_tokens": 12901293.0, "step": 4500 }, { "epoch": 5.821474773609315, "eval_entropy": 1.5741082594491715, "eval_loss": 1.2425955533981323, "eval_mean_token_accuracy": 0.4777476576178573, "eval_num_tokens": 12901293.0, "eval_runtime": 245.8608, "eval_samples_per_second": 22.354, "eval_steps_per_second": 1.399, "step": 4500 }, { "entropy": 1.8717746943235398, "epoch": 5.834411384217335, "grad_norm": 0.0, "learning_rate": 3.607858655641457e-05, "loss": 0.3819, "mean_token_accuracy": 0.20605695247650146, "num_tokens": 12902761.0, "step": 4510 }, { "entropy": 1.97312273979187, "epoch": 5.8473479948253555, "grad_norm": 0.5747093558311462, "learning_rate": 3.56766683994648e-05, "loss": 0.9997, "mean_token_accuracy": 0.27485966980457305, "num_tokens": 12956936.0, "step": 4520 }, { "entropy": 1.026018126308918, "epoch": 5.860284605433376, "grad_norm": 0.7504481077194214, "learning_rate": 3.527651487536669e-05, "loss": 1.044, "mean_token_accuracy": 0.7389606684446335, "num_tokens": 12995952.0, "step": 4530 }, { "entropy": 1.1011481299996375, "epoch": 5.873221216041397, "grad_norm": 0.9883886575698853, "learning_rate": 3.487813696184852e-05, "loss": 1.0814, "mean_token_accuracy": 0.722546960413456, "num_tokens": 13024545.0, "step": 4540 }, { "entropy": 1.6190055787563324, "epoch": 5.886157826649418, "grad_norm": 1.3633733987808228, "learning_rate": 3.448154558792677e-05, "loss": 1.5299, "mean_token_accuracy": 0.6360443904995918, "num_tokens": 13041707.0, "step": 4550 }, { "entropy": 1.919902539253235, "epoch": 5.899094437257439, "grad_norm": 0.0, "learning_rate": 3.408675163360643e-05, "loss": 0.3972, "mean_token_accuracy": 0.18492977023124696, "num_tokens": 13043179.0, "step": 4560 }, { "entropy": 1.9439027100801467, "epoch": 5.912031047865459, "grad_norm": 0.5576460957527161, "learning_rate": 3.369376592958243e-05, "loss": 1.0312, "mean_token_accuracy": 0.2685145862400532, "num_tokens": 13106663.0, "step": 4570 }, { "entropy": 1.0852982923388481, "epoch": 5.92496765847348, "grad_norm": 0.7461971044540405, "learning_rate": 3.3302599256942524e-05, "loss": 1.0907, "mean_token_accuracy": 0.7287055298686027, "num_tokens": 13146036.0, "step": 4580 }, { "entropy": 1.1466092258691787, "epoch": 5.937904269081501, "grad_norm": 0.9710547924041748, "learning_rate": 3.2913262346871564e-05, "loss": 1.118, "mean_token_accuracy": 0.7170251324772835, "num_tokens": 13175061.0, "step": 4590 }, { "entropy": 1.5422434598207473, "epoch": 5.950840879689522, "grad_norm": 1.2156635522842407, "learning_rate": 3.252576588035703e-05, "loss": 1.4615, "mean_token_accuracy": 0.6465979412198066, "num_tokens": 13192904.0, "step": 4600 }, { "entropy": 1.8891061872243882, "epoch": 5.963777490297542, "grad_norm": 0.0, "learning_rate": 3.21401204878962e-05, "loss": 0.4084, "mean_token_accuracy": 0.20470450967550277, "num_tokens": 13194636.0, "step": 4610 }, { "entropy": 1.679259254038334, "epoch": 5.976714100905562, "grad_norm": 0.7184410095214844, "learning_rate": 3.175633674920415e-05, "loss": 0.7382, "mean_token_accuracy": 0.3269588887691498, "num_tokens": 13232029.0, "step": 4620 }, { "entropy": 1.1688358381390571, "epoch": 5.989650711513583, "grad_norm": 0.9711093306541443, "learning_rate": 3.1374425192923874e-05, "loss": 1.1566, "mean_token_accuracy": 0.7072150468826294, "num_tokens": 13259115.0, "step": 4630 }, { "entropy": 1.995809930562973, "epoch": 6.002587322121604, "grad_norm": 0.3214434087276459, "learning_rate": 3.099439629633738e-05, "loss": 0.9081, "mean_token_accuracy": 0.2743851698935032, "num_tokens": 13302193.0, "step": 4640 }, { "entropy": 1.2387345060706139, "epoch": 6.015523932729625, "grad_norm": 0.7096182107925415, "learning_rate": 3.061626048507794e-05, "loss": 1.2251, "mean_token_accuracy": 0.7026221588253975, "num_tokens": 13349206.0, "step": 4650 }, { "epoch": 6.015523932729625, "eval_entropy": 1.4673164599510127, "eval_loss": 1.236427664756775, "eval_mean_token_accuracy": 0.4835313937171947, "eval_num_tokens": 13349206.0, "eval_runtime": 245.226, "eval_samples_per_second": 22.412, "eval_steps_per_second": 1.403, "step": 4650 }, { "entropy": 1.0112595960497857, "epoch": 6.028460543337646, "grad_norm": 0.9612884521484375, "learning_rate": 3.0240028132844577e-05, "loss": 0.9916, "mean_token_accuracy": 0.7466120198369026, "num_tokens": 13380735.0, "step": 4660 }, { "entropy": 1.321917925775051, "epoch": 6.041397153945666, "grad_norm": 0.9298290014266968, "learning_rate": 2.9865709561117093e-05, "loss": 1.277, "mean_token_accuracy": 0.6769641906023025, "num_tokens": 13402259.0, "step": 4670 }, { "entropy": 1.9312127828598022, "epoch": 6.054333764553687, "grad_norm": 0.0, "learning_rate": 2.949331503887296e-05, "loss": 0.9373, "mean_token_accuracy": 0.38414124920964243, "num_tokens": 13406702.0, "step": 4680 }, { "entropy": 1.8519232898950577, "epoch": 6.067270375161708, "grad_norm": 0.3253900706768036, "learning_rate": 2.9122854782305853e-05, "loss": 0.4393, "mean_token_accuracy": 0.10099697411060334, "num_tokens": 13448471.0, "step": 4690 }, { "entropy": 1.2315872982144356, "epoch": 6.0802069857697285, "grad_norm": 0.7172207832336426, "learning_rate": 2.8754338954545078e-05, "loss": 1.2677, "mean_token_accuracy": 0.6917841538786889, "num_tokens": 13494707.0, "step": 4700 }, { "entropy": 1.078819990158081, "epoch": 6.093143596377749, "grad_norm": 0.9585686326026917, "learning_rate": 2.8387777665376947e-05, "loss": 1.0795, "mean_token_accuracy": 0.7268196657299996, "num_tokens": 13525272.0, "step": 4710 }, { "entropy": 1.439416041970253, "epoch": 6.106080206985769, "grad_norm": 0.967811107635498, "learning_rate": 2.8023180970967333e-05, "loss": 1.3684, "mean_token_accuracy": 0.6664265364408493, "num_tokens": 13545790.0, "step": 4720 }, { "entropy": 1.8261877298355103, "epoch": 6.11901681759379, "grad_norm": 0.0, "learning_rate": 2.766055887358584e-05, "loss": 0.8898, "mean_token_accuracy": 0.34252284914255143, "num_tokens": 13549613.0, "step": 4730 }, { "entropy": 1.8926386773586272, "epoch": 6.131953428201811, "grad_norm": 0.33156275749206543, "learning_rate": 2.72999213213314e-05, "loss": 0.438, "mean_token_accuracy": 0.10151686370372773, "num_tokens": 13586113.0, "step": 4740 }, { "entropy": 1.2399160832166671, "epoch": 6.144890038809832, "grad_norm": 0.7470856308937073, "learning_rate": 2.6941278207859333e-05, "loss": 1.2593, "mean_token_accuracy": 0.6944727435708046, "num_tokens": 13632230.0, "step": 4750 }, { "entropy": 1.008384570479393, "epoch": 6.157826649417853, "grad_norm": 0.992726743221283, "learning_rate": 2.6584639372109942e-05, "loss": 0.991, "mean_token_accuracy": 0.7462219312787056, "num_tokens": 13663326.0, "step": 4760 }, { "entropy": 1.338111485540867, "epoch": 6.170763260025873, "grad_norm": 1.0804771184921265, "learning_rate": 2.623001459803861e-05, "loss": 1.3146, "mean_token_accuracy": 0.6769130662083626, "num_tokens": 13684547.0, "step": 4770 }, { "entropy": 1.9144802495837212, "epoch": 6.183699870633894, "grad_norm": 0.0, "learning_rate": 2.5877413614347358e-05, "loss": 0.8822, "mean_token_accuracy": 0.3425231367349625, "num_tokens": 13688744.0, "step": 4780 }, { "entropy": 1.9466426759958266, "epoch": 6.196636481241915, "grad_norm": 0.30804237723350525, "learning_rate": 2.5526846094217948e-05, "loss": 0.4398, "mean_token_accuracy": 0.10224909633398056, "num_tokens": 13724520.0, "step": 4790 }, { "entropy": 1.2254028126597405, "epoch": 6.2095730918499354, "grad_norm": 0.7537704706192017, "learning_rate": 2.5178321655046577e-05, "loss": 1.2608, "mean_token_accuracy": 0.6935150980949402, "num_tokens": 13771548.0, "step": 4800 }, { "epoch": 6.2095730918499354, "eval_entropy": 1.452152093482572, "eval_loss": 1.2386506795883179, "eval_mean_token_accuracy": 0.4820184623605983, "eval_num_tokens": 13771548.0, "eval_runtime": 247.597, "eval_samples_per_second": 22.197, "eval_steps_per_second": 1.389, "step": 4800 }, { "entropy": 1.0173511430621147, "epoch": 6.222509702457956, "grad_norm": 0.9651890993118286, "learning_rate": 2.4831849858179913e-05, "loss": 1.0019, "mean_token_accuracy": 0.7469066709280014, "num_tokens": 13802198.0, "step": 4810 }, { "entropy": 1.3790171161293983, "epoch": 6.235446313065976, "grad_norm": 1.0010708570480347, "learning_rate": 2.448744020865299e-05, "loss": 1.3194, "mean_token_accuracy": 0.6719131916761398, "num_tokens": 13823366.0, "step": 4820 }, { "entropy": 1.8534984111785888, "epoch": 6.248382923673997, "grad_norm": 0.0, "learning_rate": 2.4145102154928156e-05, "loss": 0.9022, "mean_token_accuracy": 0.36226404309272764, "num_tokens": 13827780.0, "step": 4830 }, { "entropy": 1.787733218073845, "epoch": 6.261319534282018, "grad_norm": 0.35094037652015686, "learning_rate": 2.380484508863611e-05, "loss": 0.4416, "mean_token_accuracy": 0.10181766748428345, "num_tokens": 13865780.0, "step": 4840 }, { "entropy": 1.2079532265663147, "epoch": 6.274256144890039, "grad_norm": 0.7374927997589111, "learning_rate": 2.346667834431826e-05, "loss": 1.2223, "mean_token_accuracy": 0.704416724294424, "num_tokens": 13911952.0, "step": 4850 }, { "entropy": 0.9947008207440377, "epoch": 6.28719275549806, "grad_norm": 0.9054901003837585, "learning_rate": 2.3130611199170384e-05, "loss": 0.9776, "mean_token_accuracy": 0.7504064351320267, "num_tokens": 13943487.0, "step": 4860 }, { "entropy": 1.312053567171097, "epoch": 6.300129366106081, "grad_norm": 1.030329704284668, "learning_rate": 2.2796652872788448e-05, "loss": 1.2766, "mean_token_accuracy": 0.6872908189892769, "num_tokens": 13965764.0, "step": 4870 }, { "entropy": 1.73905668258667, "epoch": 6.313065976714101, "grad_norm": 0.0, "learning_rate": 2.246481252691548e-05, "loss": 0.9456, "mean_token_accuracy": 0.3810268484055996, "num_tokens": 13970660.0, "step": 4880 }, { "entropy": 1.7431816905736923, "epoch": 6.3260025873221215, "grad_norm": 0.3432393968105316, "learning_rate": 2.213509926519016e-05, "loss": 0.4431, "mean_token_accuracy": 0.103342554718256, "num_tokens": 14010149.0, "step": 4890 }, { "entropy": 1.2463560298085212, "epoch": 6.338939197930142, "grad_norm": 0.7458313703536987, "learning_rate": 2.1807522132897383e-05, "loss": 1.2702, "mean_token_accuracy": 0.6920596107840538, "num_tokens": 14057120.0, "step": 4900 }, { "entropy": 1.027150359749794, "epoch": 6.351875808538163, "grad_norm": 0.8767898082733154, "learning_rate": 2.148209011671979e-05, "loss": 0.9989, "mean_token_accuracy": 0.743067529797554, "num_tokens": 14088988.0, "step": 4910 }, { "entropy": 1.3012418672442436, "epoch": 6.364812419146183, "grad_norm": 1.0773974657058716, "learning_rate": 2.1158812144491357e-05, "loss": 1.247, "mean_token_accuracy": 0.6856265813112259, "num_tokens": 14111094.0, "step": 4920 }, { "entropy": 1.7512285083532333, "epoch": 6.377749029754204, "grad_norm": 0.0, "learning_rate": 2.0837697084952503e-05, "loss": 0.9705, "mean_token_accuracy": 0.38980276361107824, "num_tokens": 14115970.0, "step": 4930 }, { "entropy": 1.7514180034399032, "epoch": 6.390685640362225, "grad_norm": 0.3412686288356781, "learning_rate": 2.0518753747506748e-05, "loss": 0.4438, "mean_token_accuracy": 0.10270617604255676, "num_tokens": 14151452.0, "step": 4940 }, { "entropy": 1.2002925127744675, "epoch": 6.403622250970246, "grad_norm": 0.7483528852462769, "learning_rate": 2.0201990881979006e-05, "loss": 1.2267, "mean_token_accuracy": 0.7003540650010109, "num_tokens": 14198038.0, "step": 4950 }, { "epoch": 6.403622250970246, "eval_entropy": 1.4145794496979824, "eval_loss": 1.2361637353897095, "eval_mean_token_accuracy": 0.4807747915213884, "eval_num_tokens": 14198038.0, "eval_runtime": 239.3212, "eval_samples_per_second": 22.965, "eval_steps_per_second": 1.437, "step": 4950 }, { "entropy": 1.0035071596503258, "epoch": 6.416558861578267, "grad_norm": 0.9199973940849304, "learning_rate": 1.9887417178375633e-05, "loss": 0.9911, "mean_token_accuracy": 0.7502464011311532, "num_tokens": 14229396.0, "step": 4960 }, { "entropy": 1.3634681567549705, "epoch": 6.429495472186288, "grad_norm": 0.8955945372581482, "learning_rate": 1.957504126664593e-05, "loss": 1.3055, "mean_token_accuracy": 0.677581375837326, "num_tokens": 14251059.0, "step": 4970 }, { "entropy": 1.782031211256981, "epoch": 6.442432082794308, "grad_norm": 0.0, "learning_rate": 1.9264871716445454e-05, "loss": 1.0013, "mean_token_accuracy": 0.41802891343832016, "num_tokens": 14255872.0, "step": 4980 }, { "entropy": 1.542439764738083, "epoch": 6.455368693402328, "grad_norm": 0.34138184785842896, "learning_rate": 1.89569170369009e-05, "loss": 0.4513, "mean_token_accuracy": 0.1006891518831253, "num_tokens": 14297788.0, "step": 4990 }, { "entropy": 1.2497848883271216, "epoch": 6.468305304010349, "grad_norm": 0.7626767158508301, "learning_rate": 1.865118567637667e-05, "loss": 1.2743, "mean_token_accuracy": 0.6893603593111038, "num_tokens": 14345367.0, "step": 5000 }, { "entropy": 0.9866194486618042, "epoch": 6.48124191461837, "grad_norm": 1.0120469331741333, "learning_rate": 1.834768602224307e-05, "loss": 0.9661, "mean_token_accuracy": 0.752055998146534, "num_tokens": 14376619.0, "step": 5010 }, { "entropy": 1.2819917246699333, "epoch": 6.494178525226391, "grad_norm": 0.9832173585891724, "learning_rate": 1.8046426400646244e-05, "loss": 1.2393, "mean_token_accuracy": 0.6865051403641701, "num_tokens": 14398410.0, "step": 5020 }, { "entropy": 1.656550607085228, "epoch": 6.507115135834411, "grad_norm": 0.0, "learning_rate": 1.774741507627984e-05, "loss": 1.0363, "mean_token_accuracy": 0.402515621483326, "num_tokens": 14403699.0, "step": 5030 }, { "entropy": 1.4212503910064698, "epoch": 6.520051746442432, "grad_norm": 0.3207855820655823, "learning_rate": 1.7450660252158015e-05, "loss": 0.4273, "mean_token_accuracy": 0.10288792848587036, "num_tokens": 14446058.0, "step": 5040 }, { "entropy": 1.2183921545743943, "epoch": 6.532988357050453, "grad_norm": 0.7788935899734497, "learning_rate": 1.71561700693907e-05, "loss": 1.2401, "mean_token_accuracy": 0.7000276446342468, "num_tokens": 14492725.0, "step": 5050 }, { "entropy": 1.0459384858608245, "epoch": 6.545924967658474, "grad_norm": 0.9662116765975952, "learning_rate": 1.6863952606960132e-05, "loss": 1.037, "mean_token_accuracy": 0.7341208711266518, "num_tokens": 14523347.0, "step": 5060 }, { "entropy": 1.3962342336773872, "epoch": 6.5588615782664945, "grad_norm": 1.0042107105255127, "learning_rate": 1.6574015881499106e-05, "loss": 1.3439, "mean_token_accuracy": 0.6732321053743362, "num_tokens": 14543748.0, "step": 5070 }, { "entropy": 1.4976371228694916, "epoch": 6.5717981888745145, "grad_norm": 0.0, "learning_rate": 1.6286367847071294e-05, "loss": 0.8495, "mean_token_accuracy": 0.37927755415439607, "num_tokens": 14547526.0, "step": 5080 }, { "entropy": 1.378989189863205, "epoch": 6.584734799482535, "grad_norm": 0.35467758774757385, "learning_rate": 1.6001016394952817e-05, "loss": 0.436, "mean_token_accuracy": 0.10404296517372132, "num_tokens": 14587727.0, "step": 5090 }, { "entropy": 1.2019992262125014, "epoch": 6.597671410090556, "grad_norm": 0.7634411454200745, "learning_rate": 1.5717969353415772e-05, "loss": 1.2363, "mean_token_accuracy": 0.7016454577445984, "num_tokens": 14633377.0, "step": 5100 }, { "epoch": 6.597671410090556, "eval_entropy": 1.3025533678226693, "eval_loss": 1.2344391345977783, "eval_mean_token_accuracy": 0.4806629490367202, "eval_num_tokens": 14633377.0, "eval_runtime": 243.0518, "eval_samples_per_second": 22.612, "eval_steps_per_second": 1.415, "step": 5100 }, { "entropy": 1.0161924228072166, "epoch": 6.610608020698577, "grad_norm": 1.0323160886764526, "learning_rate": 1.5437234487513687e-05, "loss": 0.9938, "mean_token_accuracy": 0.747073483467102, "num_tokens": 14664256.0, "step": 5110 }, { "entropy": 1.358753038942814, "epoch": 6.623544631306598, "grad_norm": 1.011472225189209, "learning_rate": 1.5158819498868248e-05, "loss": 1.3273, "mean_token_accuracy": 0.6735880345106124, "num_tokens": 14685452.0, "step": 5120 }, { "entropy": 1.5439666867256165, "epoch": 6.636481241914618, "grad_norm": 0.0, "learning_rate": 1.4882732025458124e-05, "loss": 0.8744, "mean_token_accuracy": 0.35112617164850235, "num_tokens": 14689408.0, "step": 5130 }, { "entropy": 1.490699003636837, "epoch": 6.649417852522639, "grad_norm": 0.33567583560943604, "learning_rate": 1.4608979641409448e-05, "loss": 0.4429, "mean_token_accuracy": 0.10201395228505135, "num_tokens": 14730607.0, "step": 5140 }, { "entropy": 1.1885226652026177, "epoch": 6.66235446313066, "grad_norm": 0.7712506055831909, "learning_rate": 1.4337569856787958e-05, "loss": 1.2014, "mean_token_accuracy": 0.7031497925519943, "num_tokens": 14775950.0, "step": 5150 }, { "entropy": 1.013894683122635, "epoch": 6.675291073738681, "grad_norm": 0.993394672870636, "learning_rate": 1.406851011739303e-05, "loss": 0.9995, "mean_token_accuracy": 0.7462615251541138, "num_tokens": 14806798.0, "step": 5160 }, { "entropy": 1.276303158700466, "epoch": 6.6882276843467015, "grad_norm": 0.9287812113761902, "learning_rate": 1.3801807804553401e-05, "loss": 1.2193, "mean_token_accuracy": 0.701404669880867, "num_tokens": 14828450.0, "step": 5170 }, { "entropy": 1.639420548081398, "epoch": 6.701164294954722, "grad_norm": 0.0, "learning_rate": 1.3537470234924642e-05, "loss": 0.9149, "mean_token_accuracy": 0.36589213013648986, "num_tokens": 14832909.0, "step": 5180 }, { "entropy": 1.5444379433989526, "epoch": 6.714100905562742, "grad_norm": 0.33196088671684265, "learning_rate": 1.3275504660288462e-05, "loss": 0.4502, "mean_token_accuracy": 0.09918043613433838, "num_tokens": 14875888.0, "step": 5190 }, { "entropy": 1.1905731126666068, "epoch": 6.727037516170763, "grad_norm": 0.7245560884475708, "learning_rate": 1.3015918267353743e-05, "loss": 1.2055, "mean_token_accuracy": 0.7072307705879212, "num_tokens": 14921555.0, "step": 5200 }, { "entropy": 1.0091575369238854, "epoch": 6.739974126778784, "grad_norm": 0.9656630158424377, "learning_rate": 1.2758718177559403e-05, "loss": 1.0059, "mean_token_accuracy": 0.7457368150353432, "num_tokens": 14952319.0, "step": 5210 }, { "entropy": 1.3768625631928444, "epoch": 6.752910737386805, "grad_norm": 1.0023345947265625, "learning_rate": 1.2503911446879014e-05, "loss": 1.3323, "mean_token_accuracy": 0.6721446126699447, "num_tokens": 14973360.0, "step": 5220 }, { "entropy": 1.706917905807495, "epoch": 6.765847347994825, "grad_norm": 0.0, "learning_rate": 1.2251505065627211e-05, "loss": 0.884, "mean_token_accuracy": 0.34794071316719055, "num_tokens": 14977368.0, "step": 5230 }, { "entropy": 1.6983414202928544, "epoch": 6.778783958602846, "grad_norm": 0.34029924869537354, "learning_rate": 1.2001505958268045e-05, "loss": 0.4392, "mean_token_accuracy": 0.10167066529393196, "num_tokens": 15016518.0, "step": 5240 }, { "entropy": 1.1760634392499925, "epoch": 6.791720569210867, "grad_norm": 0.7289795875549316, "learning_rate": 1.1753920983224753e-05, "loss": 1.2004, "mean_token_accuracy": 0.7051770240068436, "num_tokens": 15062291.0, "step": 5250 }, { "epoch": 6.791720569210867, "eval_entropy": 1.3887645453214645, "eval_loss": 1.2298688888549805, "eval_mean_token_accuracy": 0.48596259925601093, "eval_num_tokens": 15062291.0, "eval_runtime": 246.7195, "eval_samples_per_second": 22.276, "eval_steps_per_second": 1.394, "step": 5250 }, { "entropy": 1.019908943772316, "epoch": 6.8046571798188875, "grad_norm": 1.0139966011047363, "learning_rate": 1.1508756932691878e-05, "loss": 1.016, "mean_token_accuracy": 0.7411870285868645, "num_tokens": 15093136.0, "step": 5260 }, { "entropy": 1.3366242468357086, "epoch": 6.817593790426908, "grad_norm": 1.015224814414978, "learning_rate": 1.1266020532448863e-05, "loss": 1.3099, "mean_token_accuracy": 0.680339677631855, "num_tokens": 15113801.0, "step": 5270 }, { "entropy": 1.7231854051351547, "epoch": 6.830530401034929, "grad_norm": 0.0, "learning_rate": 1.1025718441675348e-05, "loss": 0.8459, "mean_token_accuracy": 0.34885319918394087, "num_tokens": 15117501.0, "step": 5280 }, { "entropy": 1.8012044936418534, "epoch": 6.843467011642949, "grad_norm": 0.3444773256778717, "learning_rate": 1.0787857252768807e-05, "loss": 0.4338, "mean_token_accuracy": 0.10217657834291458, "num_tokens": 15154208.0, "step": 5290 }, { "entropy": 1.17054093927145, "epoch": 6.85640362225097, "grad_norm": 0.7941517233848572, "learning_rate": 1.0552443491163422e-05, "loss": 1.1874, "mean_token_accuracy": 0.7076364248991013, "num_tokens": 15199469.0, "step": 5300 }, { "entropy": 1.0057064607739448, "epoch": 6.869340232858991, "grad_norm": 0.8840006589889526, "learning_rate": 1.0319483615151137e-05, "loss": 0.981, "mean_token_accuracy": 0.7503589361906051, "num_tokens": 15230670.0, "step": 5310 }, { "entropy": 1.2563072219491005, "epoch": 6.882276843467012, "grad_norm": 1.0177907943725586, "learning_rate": 1.0088984015704629e-05, "loss": 1.2394, "mean_token_accuracy": 0.6934975415468216, "num_tokens": 15252641.0, "step": 5320 }, { "entropy": 1.8372395306825637, "epoch": 6.895213454075033, "grad_norm": 0.0, "learning_rate": 9.860951016301756e-06, "loss": 0.9875, "mean_token_accuracy": 0.3743965640664101, "num_tokens": 15257407.0, "step": 5330 }, { "entropy": 1.7831202149391174, "epoch": 6.908150064683053, "grad_norm": 0.3214081823825836, "learning_rate": 9.635390872752237e-06, "loss": 0.43, "mean_token_accuracy": 0.10435229986906051, "num_tokens": 15299860.0, "step": 5340 }, { "entropy": 1.2522226199507713, "epoch": 6.921086675291074, "grad_norm": 0.8021490573883057, "learning_rate": 9.412309773025952e-06, "loss": 1.2766, "mean_token_accuracy": 0.6917116060853005, "num_tokens": 15347391.0, "step": 5350 }, { "entropy": 1.0165240302681924, "epoch": 6.9340232858990944, "grad_norm": 0.9851676225662231, "learning_rate": 9.191713837083238e-06, "loss": 1.0192, "mean_token_accuracy": 0.7415471941232681, "num_tokens": 15379391.0, "step": 5360 }, { "entropy": 1.2651836022734642, "epoch": 6.946959896507115, "grad_norm": 1.12442946434021, "learning_rate": 8.973609116706926e-06, "loss": 1.2443, "mean_token_accuracy": 0.6868803769350051, "num_tokens": 15401606.0, "step": 5370 }, { "entropy": 1.7752905175089837, "epoch": 6.959896507115136, "grad_norm": 0.0, "learning_rate": 8.758001595336418e-06, "loss": 0.8999, "mean_token_accuracy": 0.38887517899274826, "num_tokens": 15406538.0, "step": 5380 }, { "entropy": 1.7106927633285522, "epoch": 6.972833117723156, "grad_norm": 0.5107993483543396, "learning_rate": 8.544897187903423e-06, "loss": 0.4117, "mean_token_accuracy": 0.10680279433727265, "num_tokens": 15432463.0, "step": 5390 }, { "entropy": 1.0609442353248597, "epoch": 6.985769728331177, "grad_norm": 1.095216155052185, "learning_rate": 8.33430174066978e-06, "loss": 1.0514, "mean_token_accuracy": 0.7322214379906654, "num_tokens": 15465365.0, "step": 5400 }, { "epoch": 6.985769728331177, "eval_entropy": 1.3962991244571155, "eval_loss": 1.2261559963226318, "eval_mean_token_accuracy": 0.48680107668042183, "eval_num_tokens": 15465365.0, "eval_runtime": 244.9697, "eval_samples_per_second": 22.435, "eval_steps_per_second": 1.404, "step": 5400 }, { "entropy": 1.6976288080215454, "epoch": 6.998706338939198, "grad_norm": 0.0, "learning_rate": 8.126221031067027e-06, "loss": 0.7689, "mean_token_accuracy": 0.2966282024979591, "num_tokens": 15471588.0, "step": 5410 }, { "entropy": 1.497927661240101, "epoch": 7.011642949547219, "grad_norm": 0.7096975445747375, "learning_rate": 7.920660767537901e-06, "loss": 1.3894, "mean_token_accuracy": 0.5761201746761799, "num_tokens": 15542066.0, "step": 5420 }, { "entropy": 0.9780161440372467, "epoch": 7.02457956015524, "grad_norm": 0.9500054717063904, "learning_rate": 7.717626589379789e-06, "loss": 0.9513, "mean_token_accuracy": 0.7568799629807472, "num_tokens": 15575551.0, "step": 5430 }, { "entropy": 1.169414332509041, "epoch": 7.03751617076326, "grad_norm": 1.0309356451034546, "learning_rate": 7.517124066589909e-06, "loss": 1.1411, "mean_token_accuracy": 0.711452366411686, "num_tokens": 15599584.0, "step": 5440 }, { "entropy": 1.7210813522338868, "epoch": 7.0504527813712805, "grad_norm": 0.0, "learning_rate": 7.319158699712669e-06, "loss": 1.3323, "mean_token_accuracy": 0.5859084717929364, "num_tokens": 15608747.0, "step": 5450 }, { "entropy": 1.6397013187408447, "epoch": 7.063389391979301, "grad_norm": 0.0, "learning_rate": 7.12373591968859e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 15609387.0, "step": 5460 }, { "entropy": 1.5039527043700218, "epoch": 7.076326002587322, "grad_norm": 0.772226870059967, "learning_rate": 6.930861087705398e-06, "loss": 1.3666, "mean_token_accuracy": 0.5798796579241753, "num_tokens": 15685497.0, "step": 5470 }, { "entropy": 0.9571346640586853, "epoch": 7.089262613195343, "grad_norm": 0.9899272918701172, "learning_rate": 6.7405394950510345e-06, "loss": 0.9525, "mean_token_accuracy": 0.7557973235845565, "num_tokens": 15718968.0, "step": 5480 }, { "entropy": 1.1604458332061767, "epoch": 7.102199223803363, "grad_norm": 1.072095513343811, "learning_rate": 6.552776362968271e-06, "loss": 1.1571, "mean_token_accuracy": 0.7090446025133132, "num_tokens": 15742748.0, "step": 5490 }, { "entropy": 1.7930972754955292, "epoch": 7.115135834411384, "grad_norm": 0.0, "learning_rate": 6.367576842511735e-06, "loss": 1.3237, "mean_token_accuracy": 0.5362849146127701, "num_tokens": 15751803.0, "step": 5500 }, { "entropy": 1.7586050003767013, "epoch": 7.128072445019405, "grad_norm": 0.0, "learning_rate": 6.184946014406412e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 15752443.0, "step": 5510 }, { "entropy": 1.5453996002674102, "epoch": 7.141009055627426, "grad_norm": 0.7360463738441467, "learning_rate": 6.004888888908256e-06, "loss": 1.4109, "mean_token_accuracy": 0.5700584821403026, "num_tokens": 15828985.0, "step": 5520 }, { "entropy": 0.9328926429152489, "epoch": 7.153945666235447, "grad_norm": 0.9283819794654846, "learning_rate": 5.827410405666911e-06, "loss": 0.9175, "mean_token_accuracy": 0.7673766747117042, "num_tokens": 15862356.0, "step": 5530 }, { "entropy": 1.1613366797566413, "epoch": 7.166882276843467, "grad_norm": 1.0261551141738892, "learning_rate": 5.652515433590033e-06, "loss": 1.1253, "mean_token_accuracy": 0.7124258697032928, "num_tokens": 15886367.0, "step": 5540 }, { "entropy": 1.7588330313563347, "epoch": 7.179818887451487, "grad_norm": 0.00023454829351976514, "learning_rate": 5.480208770709771e-06, "loss": 1.4039, "mean_token_accuracy": 0.5946097061038017, "num_tokens": 15896207.0, "step": 5550 }, { "epoch": 7.179818887451487, "eval_entropy": 1.393599722794322, "eval_loss": 1.2324310541152954, "eval_mean_token_accuracy": 0.4852820281372514, "eval_num_tokens": 15896207.0, "eval_runtime": 245.6246, "eval_samples_per_second": 22.376, "eval_steps_per_second": 1.401, "step": 5550 }, { "entropy": 1.7070483982563018, "epoch": 7.192755498059508, "grad_norm": 0.0, "learning_rate": 5.310495144051142e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 15896847.0, "step": 5560 }, { "entropy": 1.557031399011612, "epoch": 7.205692108667529, "grad_norm": 0.7289990186691284, "learning_rate": 5.143379209502352e-06, "loss": 1.4125, "mean_token_accuracy": 0.5720368728041649, "num_tokens": 15976815.0, "step": 5570 }, { "entropy": 0.9512620970606804, "epoch": 7.21862871927555, "grad_norm": 0.9174538254737854, "learning_rate": 4.978865551687062e-06, "loss": 0.9534, "mean_token_accuracy": 0.7580740317702294, "num_tokens": 16010900.0, "step": 5580 }, { "entropy": 1.172946660220623, "epoch": 7.231565329883571, "grad_norm": 1.0972976684570312, "learning_rate": 4.8169586838386346e-06, "loss": 1.1532, "mean_token_accuracy": 0.7079381376504899, "num_tokens": 16035361.0, "step": 5590 }, { "entropy": 1.6811116263270378, "epoch": 7.244501940491591, "grad_norm": 0.0, "learning_rate": 4.657663047676264e-06, "loss": 1.2139, "mean_token_accuracy": 0.5401002943515778, "num_tokens": 16044571.0, "step": 5600 }, { "entropy": 1.6898091644048692, "epoch": 7.257438551099612, "grad_norm": 0.0, "learning_rate": 4.500983013283188e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 16045211.0, "step": 5610 }, { "entropy": 1.5436704397201537, "epoch": 7.270375161707633, "grad_norm": 0.6892314553260803, "learning_rate": 4.34692287898677e-06, "loss": 1.4148, "mean_token_accuracy": 0.5717164523899555, "num_tokens": 16122336.0, "step": 5620 }, { "entropy": 0.9374915182590484, "epoch": 7.2833117723156535, "grad_norm": 0.9667730927467346, "learning_rate": 4.195486871240562e-06, "loss": 0.9394, "mean_token_accuracy": 0.7627643913030624, "num_tokens": 16156408.0, "step": 5630 }, { "entropy": 1.1849497631192207, "epoch": 7.296248382923674, "grad_norm": 1.1908502578735352, "learning_rate": 4.046679144508392e-06, "loss": 1.142, "mean_token_accuracy": 0.7130326569080353, "num_tokens": 16180323.0, "step": 5640 }, { "entropy": 1.829011231660843, "epoch": 7.309184993531694, "grad_norm": 0.0, "learning_rate": 3.900503781150366e-06, "loss": 1.4914, "mean_token_accuracy": 0.5614617101848125, "num_tokens": 16189805.0, "step": 5650 }, { "entropy": 1.7375122755765915, "epoch": 7.322121604139715, "grad_norm": 0.0, "learning_rate": 3.7569647913109243e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 16190445.0, "step": 5660 }, { "entropy": 1.5200837269425391, "epoch": 7.335058214747736, "grad_norm": 0.7430135607719421, "learning_rate": 3.6160661128087025e-06, "loss": 1.397, "mean_token_accuracy": 0.57365105971694, "num_tokens": 16268426.0, "step": 5670 }, { "entropy": 0.9276900395751, "epoch": 7.347994825355757, "grad_norm": 0.9390348792076111, "learning_rate": 3.4778116110286473e-06, "loss": 0.9249, "mean_token_accuracy": 0.7620738327503205, "num_tokens": 16302856.0, "step": 5680 }, { "entropy": 1.1663517013192177, "epoch": 7.360931435963778, "grad_norm": 1.0117005109786987, "learning_rate": 3.34220507881593e-06, "loss": 1.1293, "mean_token_accuracy": 0.7132649436593056, "num_tokens": 16327211.0, "step": 5690 }, { "entropy": 1.7131205320358276, "epoch": 7.373868046571798, "grad_norm": 0.0, "learning_rate": 3.209250236371797e-06, "loss": 1.3032, "mean_token_accuracy": 0.5476110517978668, "num_tokens": 16336179.0, "step": 5700 }, { "epoch": 7.373868046571798, "eval_entropy": 1.4321047376061595, "eval_loss": 1.2324743270874023, "eval_mean_token_accuracy": 0.48222382652551626, "eval_num_tokens": 16336179.0, "eval_runtime": 242.208, "eval_samples_per_second": 22.691, "eval_steps_per_second": 1.42, "step": 5700 }, { "entropy": 1.744317215681076, "epoch": 7.386804657179819, "grad_norm": 0.0, "learning_rate": 3.0789507311516864e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 16336819.0, "step": 5710 }, { "entropy": 1.5234818816184998, "epoch": 7.39974126778784, "grad_norm": 0.7303734421730042, "learning_rate": 2.9513101377650175e-06, "loss": 1.3797, "mean_token_accuracy": 0.5752100251615048, "num_tokens": 16404914.0, "step": 5720 }, { "entropy": 0.9265442088246345, "epoch": 7.4126778783958605, "grad_norm": 0.8770347237586975, "learning_rate": 2.8263319578771485e-06, "loss": 0.9069, "mean_token_accuracy": 0.7680046275258064, "num_tokens": 16439389.0, "step": 5730 }, { "entropy": 1.1830172911286354, "epoch": 7.425614489003881, "grad_norm": 1.0386770963668823, "learning_rate": 2.704019620113407e-06, "loss": 1.1733, "mean_token_accuracy": 0.7056162416934967, "num_tokens": 16464458.0, "step": 5740 }, { "entropy": 1.7503404572606087, "epoch": 7.438551099611901, "grad_norm": 1.7682623863220215, "learning_rate": 2.584376479964945e-06, "loss": 1.4882, "mean_token_accuracy": 0.6309158280491829, "num_tokens": 16475591.0, "step": 5750 }, { "entropy": 1.7254247039556503, "epoch": 7.451487710219922, "grad_norm": 0.0, "learning_rate": 2.4674058196966663e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 16476231.0, "step": 5760 }, { "entropy": 1.5136717677116394, "epoch": 7.464424320827943, "grad_norm": 0.7473369240760803, "learning_rate": 2.353110848257267e-06, "loss": 1.3413, "mean_token_accuracy": 0.5824255973100663, "num_tokens": 16552018.0, "step": 5770 }, { "entropy": 0.9227760046720505, "epoch": 7.477360931435964, "grad_norm": 0.982836902141571, "learning_rate": 2.241494701191127e-06, "loss": 0.9069, "mean_token_accuracy": 0.7623407855629921, "num_tokens": 16586256.0, "step": 5780 }, { "entropy": 1.1348280161619186, "epoch": 7.490297542043985, "grad_norm": 1.1100831031799316, "learning_rate": 2.1325604405523334e-06, "loss": 1.1069, "mean_token_accuracy": 0.7201577231287957, "num_tokens": 16610709.0, "step": 5790 }, { "entropy": 1.771338665485382, "epoch": 7.503234152652006, "grad_norm": 0.0, "learning_rate": 2.026311054820629e-06, "loss": 1.411, "mean_token_accuracy": 0.5635204806923866, "num_tokens": 16620269.0, "step": 5800 }, { "entropy": 1.7322617769241333, "epoch": 7.516170763260026, "grad_norm": 0.0, "learning_rate": 1.922749458819506e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 16620909.0, "step": 5810 }, { "entropy": 1.4817992717027664, "epoch": 7.5291073738680465, "grad_norm": 0.756270170211792, "learning_rate": 1.8218784936361644e-06, "loss": 1.353, "mean_token_accuracy": 0.5787275157868862, "num_tokens": 16690569.0, "step": 5820 }, { "entropy": 0.9674558937549591, "epoch": 7.542043984476067, "grad_norm": 0.8812004327774048, "learning_rate": 1.7237009265436032e-06, "loss": 0.9613, "mean_token_accuracy": 0.7560465827584266, "num_tokens": 16724649.0, "step": 5830 }, { "entropy": 1.1716067418456078, "epoch": 7.554980595084088, "grad_norm": 1.0925747156143188, "learning_rate": 1.6282194509247063e-06, "loss": 1.1436, "mean_token_accuracy": 0.7135581076145172, "num_tokens": 16749582.0, "step": 5840 }, { "entropy": 1.6912678241729737, "epoch": 7.567917205692108, "grad_norm": 1.6889742612838745, "learning_rate": 1.5354366861983438e-06, "loss": 1.5003, "mean_token_accuracy": 0.6513200134038926, "num_tokens": 16760847.0, "step": 5850 }, { "epoch": 7.567917205692108, "eval_entropy": 1.4259126506919084, "eval_loss": 1.2301470041275024, "eval_mean_token_accuracy": 0.4896806857093822, "eval_num_tokens": 16760847.0, "eval_runtime": 246.4439, "eval_samples_per_second": 22.301, "eval_steps_per_second": 1.396, "step": 5850 }, { "entropy": 1.7190734058618546, "epoch": 7.580853816300129, "grad_norm": 0.0, "learning_rate": 1.4453551777475094e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 16761487.0, "step": 5860 }, { "entropy": 1.5320835530757904, "epoch": 7.59379042690815, "grad_norm": 0.7591171264648438, "learning_rate": 1.3579773968495191e-06, "loss": 1.3913, "mean_token_accuracy": 0.5738878205418587, "num_tokens": 16833368.0, "step": 5870 }, { "entropy": 0.9359873235225677, "epoch": 7.606727037516171, "grad_norm": 0.9182559847831726, "learning_rate": 1.2733057406081438e-06, "loss": 0.9307, "mean_token_accuracy": 0.7633048981428147, "num_tokens": 16867272.0, "step": 5880 }, { "entropy": 1.1327362582087517, "epoch": 7.619663648124192, "grad_norm": 1.0494729280471802, "learning_rate": 1.1913425318879511e-06, "loss": 1.1095, "mean_token_accuracy": 0.7176593467593193, "num_tokens": 16892030.0, "step": 5890 }, { "entropy": 1.7231059432029725, "epoch": 7.632600258732213, "grad_norm": 0.0, "learning_rate": 1.1120900192505e-06, "loss": 1.3184, "mean_token_accuracy": 0.5641655296087265, "num_tokens": 16901989.0, "step": 5900 }, { "entropy": 1.7543556302785874, "epoch": 7.645536869340233, "grad_norm": 0.0, "learning_rate": 1.0355503768926466e-06, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 16902629.0, "step": 5910 }, { "entropy": 1.4874625369906425, "epoch": 7.6584734799482534, "grad_norm": 0.7250481843948364, "learning_rate": 9.61725704587002e-07, "loss": 1.3483, "mean_token_accuracy": 0.5808299452066421, "num_tokens": 16975429.0, "step": 5920 }, { "entropy": 0.940713207423687, "epoch": 7.671410090556274, "grad_norm": 0.9228203296661377, "learning_rate": 8.906180276242015e-07, "loss": 0.9271, "mean_token_accuracy": 0.760072472691536, "num_tokens": 17009886.0, "step": 5930 }, { "entropy": 1.1436687961220742, "epoch": 7.684346701164295, "grad_norm": 1.0997246503829956, "learning_rate": 8.22229296757393e-07, "loss": 1.1408, "mean_token_accuracy": 0.7107081711292267, "num_tokens": 17034678.0, "step": 5940 }, { "entropy": 1.73554485142231, "epoch": 7.697283311772315, "grad_norm": 0.0, "learning_rate": 7.565613881487687e-07, "loss": 1.365, "mean_token_accuracy": 0.5842878207564354, "num_tokens": 17044424.0, "step": 5950 }, { "entropy": 1.7472249418497086, "epoch": 7.710219922380336, "grad_norm": 0.0, "learning_rate": 6.936161033180066e-07, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 17045064.0, "step": 5960 }, { "entropy": 1.5308178260922432, "epoch": 7.723156532988357, "grad_norm": 0.7369622588157654, "learning_rate": 6.333951690929318e-07, "loss": 1.3944, "mean_token_accuracy": 0.571716184169054, "num_tokens": 17121301.0, "step": 5970 }, { "entropy": 0.9166033461689949, "epoch": 7.736093143596378, "grad_norm": 0.8718245625495911, "learning_rate": 5.759002375620548e-07, "loss": 0.9191, "mean_token_accuracy": 0.7659956023097039, "num_tokens": 17155878.0, "step": 5980 }, { "entropy": 1.1351210102438927, "epoch": 7.749029754204399, "grad_norm": 1.1139835119247437, "learning_rate": 5.211328860293519e-07, "loss": 1.0937, "mean_token_accuracy": 0.7179104581475257, "num_tokens": 17180817.0, "step": 5990 }, { "entropy": 1.7042000949382783, "epoch": 7.7619663648124195, "grad_norm": 0.0, "learning_rate": 4.6909461697088874e-07, "loss": 1.2978, "mean_token_accuracy": 0.5402273468673229, "num_tokens": 17190238.0, "step": 6000 }, { "epoch": 7.7619663648124195, "eval_entropy": 1.4222364893486334, "eval_loss": 1.230813980102539, "eval_mean_token_accuracy": 0.483534776973863, "eval_num_tokens": 17190238.0, "eval_runtime": 243.8499, "eval_samples_per_second": 22.538, "eval_steps_per_second": 1.411, "step": 6000 }, { "entropy": 1.7714763969182967, "epoch": 7.7749029754204395, "grad_norm": 0.0, "learning_rate": 4.197868579936981e-07, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 17190878.0, "step": 6010 }, { "entropy": 1.5036073312163354, "epoch": 7.78783958602846, "grad_norm": 0.7586703896522522, "learning_rate": 3.732109617965218e-07, "loss": 1.3917, "mean_token_accuracy": 0.5730986759066582, "num_tokens": 17262910.0, "step": 6020 }, { "entropy": 0.9327082589268685, "epoch": 7.800776196636481, "grad_norm": 0.8685732483863831, "learning_rate": 3.293682061327963e-07, "loss": 0.9333, "mean_token_accuracy": 0.7620440036058426, "num_tokens": 17296857.0, "step": 6030 }, { "entropy": 1.177341391146183, "epoch": 7.813712807244502, "grad_norm": 1.1222566366195679, "learning_rate": 2.882597937755249e-07, "loss": 1.1641, "mean_token_accuracy": 0.7064913615584374, "num_tokens": 17321218.0, "step": 6040 }, { "entropy": 1.7008673965930938, "epoch": 7.826649417852523, "grad_norm": 0.0, "learning_rate": 2.498868524843045e-07, "loss": 1.2135, "mean_token_accuracy": 0.5372394770383835, "num_tokens": 17329684.0, "step": 6050 }, { "entropy": 1.7468272864818573, "epoch": 7.839586028460543, "grad_norm": 0.0, "learning_rate": 2.1425043497439456e-07, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 17330324.0, "step": 6060 }, { "entropy": 1.5415200561285018, "epoch": 7.852522639068564, "grad_norm": 0.7708677649497986, "learning_rate": 1.8135151888782899e-07, "loss": 1.3837, "mean_token_accuracy": 0.574844229221344, "num_tokens": 17408721.0, "step": 6070 }, { "entropy": 0.9075698807835579, "epoch": 7.865459249676585, "grad_norm": 0.8989212512969971, "learning_rate": 1.5119100676662667e-07, "loss": 0.8899, "mean_token_accuracy": 0.771544449031353, "num_tokens": 17442757.0, "step": 6080 }, { "entropy": 1.1743381530046464, "epoch": 7.878395860284606, "grad_norm": 1.025661826133728, "learning_rate": 1.2376972602795578e-07, "loss": 1.1425, "mean_token_accuracy": 0.7124027162790298, "num_tokens": 17467049.0, "step": 6090 }, { "entropy": 1.7484049052000046, "epoch": 7.8913324708926265, "grad_norm": 0.0, "learning_rate": 9.908842894151837e-08, "loss": 1.3114, "mean_token_accuracy": 0.5641379207372665, "num_tokens": 17475616.0, "step": 6100 }, { "entropy": 1.7715317398309707, "epoch": 7.904269081500646, "grad_norm": 0.0, "learning_rate": 7.714779260886707e-08, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 17476256.0, "step": 6110 }, { "entropy": 1.5000860661268234, "epoch": 7.917205692108667, "grad_norm": 0.7349119186401367, "learning_rate": 5.7948418944842043e-08, "loss": 1.3548, "mean_token_accuracy": 0.5794984824955464, "num_tokens": 17546950.0, "step": 6120 }, { "entropy": 0.9138670772314071, "epoch": 7.930142302716688, "grad_norm": 0.8542500138282776, "learning_rate": 4.149083466105097e-08, "loss": 0.9021, "mean_token_accuracy": 0.770347698032856, "num_tokens": 17581293.0, "step": 6130 }, { "entropy": 1.1947215780615807, "epoch": 7.943078913324709, "grad_norm": 1.0435749292373657, "learning_rate": 2.7775491251413877e-08, "loss": 1.1687, "mean_token_accuracy": 0.7094842702150345, "num_tokens": 17605803.0, "step": 6140 }, { "entropy": 1.6835207402706147, "epoch": 7.95601552393273, "grad_norm": 0.0, "learning_rate": 1.6802764979817474e-08, "loss": 1.1704, "mean_token_accuracy": 0.5183229476213456, "num_tokens": 17613695.0, "step": 6150 }, { "epoch": 7.95601552393273, "eval_entropy": 1.4208284545429917, "eval_loss": 1.2304351329803467, "eval_mean_token_accuracy": 0.4861882030097551, "eval_num_tokens": 17613695.0, "eval_runtime": 244.9318, "eval_samples_per_second": 22.439, "eval_steps_per_second": 1.404, "step": 6150 }, { "entropy": 1.7820782691240311, "epoch": 7.96895213454075, "grad_norm": 0.0, "learning_rate": 8.572956869734583e-09, "loss": 0.0, "mean_token_accuracy": 0.0, "num_tokens": 17614335.0, "step": 6160 }, { "entropy": 1.2753556087613105, "epoch": 7.981888745148771, "grad_norm": 0.9358561635017395, "learning_rate": 3.0862926959973617e-09, "loss": 1.1173, "mean_token_accuracy": 0.6308311700820923, "num_tokens": 17667096.0, "step": 6170 }, { "entropy": 1.4832376271486283, "epoch": 7.994825355756792, "grad_norm": 0.0, "learning_rate": 3.429229786133803e-10, "loss": 1.055, "mean_token_accuracy": 0.5700831845402717, "num_tokens": 17681630.0, "step": 6180 } ], "logging_steps": 10, "max_steps": 6184, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 600, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.974075450217726e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }