| { | |
| "best_global_step": 5400, | |
| "best_metric": 1.2261559963226318, | |
| "best_model_checkpoint": "./results-3/checkpoint-5400", | |
| "epoch": 8.0, | |
| "eval_steps": 150, | |
| "global_step": 6184, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.4760531455278396, | |
| "epoch": 0.0129366106080207, | |
| "grad_norm": 1.3410229682922363, | |
| "learning_rate": 9.67741935483871e-06, | |
| "loss": 3.8342, | |
| "mean_token_accuracy": 0.40634620636701585, | |
| "num_tokens": 77854.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.4689971387386322, | |
| "epoch": 0.0258732212160414, | |
| "grad_norm": 1.4104728698730469, | |
| "learning_rate": 2.0430107526881722e-05, | |
| "loss": 4.4137, | |
| "mean_token_accuracy": 0.3765578977763653, | |
| "num_tokens": 111064.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.893897533416748, | |
| "epoch": 0.03880983182406209, | |
| "grad_norm": 0.8629273772239685, | |
| "learning_rate": 3.118279569892473e-05, | |
| "loss": 3.8151, | |
| "mean_token_accuracy": 0.38278606086969375, | |
| "num_tokens": 134712.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 4.312886017560959, | |
| "epoch": 0.0517464424320828, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.1935483870967746e-05, | |
| "loss": 3.7735, | |
| "mean_token_accuracy": 0.19467806722968817, | |
| "num_tokens": 142734.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 8.096190857887269, | |
| "epoch": 0.0646830530401035, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.268817204301075e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 143374.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 2.519210198521614, | |
| "epoch": 0.07761966364812418, | |
| "grad_norm": 0.46956390142440796, | |
| "learning_rate": 6.344086021505376e-05, | |
| "loss": 2.7759, | |
| "mean_token_accuracy": 0.4458329685032368, | |
| "num_tokens": 218138.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 2.7062919318675993, | |
| "epoch": 0.09055627425614489, | |
| "grad_norm": 0.36261770129203796, | |
| "learning_rate": 7.419354838709677e-05, | |
| "loss": 2.5766, | |
| "mean_token_accuracy": 0.4825271964073181, | |
| "num_tokens": 250316.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 2.5266534447669984, | |
| "epoch": 0.1034928848641656, | |
| "grad_norm": 0.39197003841400146, | |
| "learning_rate": 8.494623655913979e-05, | |
| "loss": 2.5861, | |
| "mean_token_accuracy": 0.47026830837130545, | |
| "num_tokens": 272857.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 4.64949648976326, | |
| "epoch": 0.11642949547218628, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.56989247311828e-05, | |
| "loss": 2.5449, | |
| "mean_token_accuracy": 0.20907760383561252, | |
| "num_tokens": 279057.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 6.761381912231445, | |
| "epoch": 0.129366106080207, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001064516129032258, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 279697.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 2.331750747561455, | |
| "epoch": 0.1423027166882277, | |
| "grad_norm": 0.4349558353424072, | |
| "learning_rate": 0.00011720430107526883, | |
| "loss": 2.4607, | |
| "mean_token_accuracy": 0.4927462741732597, | |
| "num_tokens": 358859.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.976218768954277, | |
| "epoch": 0.15523932729624837, | |
| "grad_norm": 0.24631856381893158, | |
| "learning_rate": 0.00012795698924731184, | |
| "loss": 2.038, | |
| "mean_token_accuracy": 0.564648849517107, | |
| "num_tokens": 391721.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 2.3566002756357194, | |
| "epoch": 0.16817593790426907, | |
| "grad_norm": 0.33470404148101807, | |
| "learning_rate": 0.00013870967741935487, | |
| "loss": 2.3135, | |
| "mean_token_accuracy": 0.5072783440351486, | |
| "num_tokens": 415274.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 3.8850305318832397, | |
| "epoch": 0.18111254851228978, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014946236559139787, | |
| "loss": 2.4748, | |
| "mean_token_accuracy": 0.29179108552634714, | |
| "num_tokens": 423127.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 6.345981705188751, | |
| "epoch": 0.19404915912031048, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016021505376344087, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 423767.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.19404915912031048, | |
| "eval_entropy": 3.4076465337082396, | |
| "eval_loss": 2.088292360305786, | |
| "eval_mean_token_accuracy": 0.3316028483731802, | |
| "eval_num_tokens": 423767.0, | |
| "eval_runtime": 243.9108, | |
| "eval_samples_per_second": 22.533, | |
| "eval_steps_per_second": 1.41, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 2.2055542409420013, | |
| "epoch": 0.2069857697283312, | |
| "grad_norm": 0.31700077652931213, | |
| "learning_rate": 0.0001709677419354839, | |
| "loss": 2.4005, | |
| "mean_token_accuracy": 0.5007682546973229, | |
| "num_tokens": 500625.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.8786041021347046, | |
| "epoch": 0.21992238033635186, | |
| "grad_norm": 0.24800752103328705, | |
| "learning_rate": 0.0001817204301075269, | |
| "loss": 1.8474, | |
| "mean_token_accuracy": 0.5935635283589363, | |
| "num_tokens": 534396.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 2.263536959886551, | |
| "epoch": 0.23285899094437257, | |
| "grad_norm": 0.3183101415634155, | |
| "learning_rate": 0.00019247311827956992, | |
| "loss": 2.2154, | |
| "mean_token_accuracy": 0.518243944644928, | |
| "num_tokens": 558685.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 4.052780479192734, | |
| "epoch": 0.24579560155239327, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019999987654768255, | |
| "loss": 2.3652, | |
| "mean_token_accuracy": 0.32749315425753595, | |
| "num_tokens": 566987.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 4.421183264255523, | |
| "epoch": 0.258732212160414, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001999976818482961, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 567627.0, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 2.0365082800388334, | |
| "epoch": 0.2716688227684347, | |
| "grad_norm": 0.2679975628852844, | |
| "learning_rate": 0.00019999274383338027, | |
| "loss": 2.1862, | |
| "mean_token_accuracy": 0.5347613260149956, | |
| "num_tokens": 644352.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.8313011974096298, | |
| "epoch": 0.2846054333764554, | |
| "grad_norm": 0.2597528398036957, | |
| "learning_rate": 0.00019998506263840354, | |
| "loss": 1.8579, | |
| "mean_token_accuracy": 0.5869012281298638, | |
| "num_tokens": 676791.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 2.229961010813713, | |
| "epoch": 0.2975420439844761, | |
| "grad_norm": 0.39198312163352966, | |
| "learning_rate": 0.00019997463847409023, | |
| "loss": 2.2158, | |
| "mean_token_accuracy": 0.5119729146361351, | |
| "num_tokens": 699604.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 3.5435027480125427, | |
| "epoch": 0.31047865459249674, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019996147162641464, | |
| "loss": 2.2309, | |
| "mean_token_accuracy": 0.31516757532954215, | |
| "num_tokens": 706414.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 3.784550839662552, | |
| "epoch": 0.32341526520051744, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019994556245659338, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 707054.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 2.086453899741173, | |
| "epoch": 0.33635187580853815, | |
| "grad_norm": 0.2695913314819336, | |
| "learning_rate": 0.00019992691140107525, | |
| "loss": 2.2688, | |
| "mean_token_accuracy": 0.5183561690151691, | |
| "num_tokens": 787476.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 1.80660640001297, | |
| "epoch": 0.34928848641655885, | |
| "grad_norm": 0.2775532603263855, | |
| "learning_rate": 0.0001999055189715294, | |
| "loss": 1.855, | |
| "mean_token_accuracy": 0.5896616145968437, | |
| "num_tokens": 820945.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 2.265737462043762, | |
| "epoch": 0.36222509702457956, | |
| "grad_norm": 0.35880544781684875, | |
| "learning_rate": 0.0001998813857548313, | |
| "loss": 2.1884, | |
| "mean_token_accuracy": 0.5160560064017773, | |
| "num_tokens": 844570.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 3.490731942653656, | |
| "epoch": 0.37516170763260026, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001998545124130466, | |
| "loss": 2.196, | |
| "mean_token_accuracy": 0.3669252373278141, | |
| "num_tokens": 852461.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 3.8365337908267976, | |
| "epoch": 0.38809831824062097, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019982489968341292, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 853101.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.38809831824062097, | |
| "eval_entropy": 2.108367764672568, | |
| "eval_loss": 1.785624623298645, | |
| "eval_mean_token_accuracy": 0.3863564946277197, | |
| "eval_num_tokens": 853101.0, | |
| "eval_runtime": 244.4512, | |
| "eval_samples_per_second": 22.483, | |
| "eval_steps_per_second": 1.407, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 2.0010604202747344, | |
| "epoch": 0.40103492884864167, | |
| "grad_norm": 0.26067453622817993, | |
| "learning_rate": 0.00019979254837831976, | |
| "loss": 2.1888, | |
| "mean_token_accuracy": 0.527290866523981, | |
| "num_tokens": 932233.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 1.8096002161502838, | |
| "epoch": 0.4139715394566624, | |
| "grad_norm": 0.3278159201145172, | |
| "learning_rate": 0.00019975745938528597, | |
| "loss": 1.8032, | |
| "mean_token_accuracy": 0.5965773060917854, | |
| "num_tokens": 965240.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 2.239218121767044, | |
| "epoch": 0.4269081500646831, | |
| "grad_norm": 0.3497501611709595, | |
| "learning_rate": 0.00019971963366693574, | |
| "loss": 2.1853, | |
| "mean_token_accuracy": 0.5204933404922485, | |
| "num_tokens": 988836.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 3.591762775182724, | |
| "epoch": 0.4398447606727037, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001996790722609719, | |
| "loss": 2.0384, | |
| "mean_token_accuracy": 0.3091650754213333, | |
| "num_tokens": 995598.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 1.7911852180957795, | |
| "epoch": 0.45278137128072443, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019963577628014757, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 996238.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 1.9936166375875473, | |
| "epoch": 0.46571798188874514, | |
| "grad_norm": 0.2826139032840729, | |
| "learning_rate": 0.00019958974691223572, | |
| "loss": 2.1339, | |
| "mean_token_accuracy": 0.5367397539317608, | |
| "num_tokens": 1068779.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 1.7499752998352052, | |
| "epoch": 0.47865459249676584, | |
| "grad_norm": 0.25705066323280334, | |
| "learning_rate": 0.00019954098541999634, | |
| "loss": 1.7626, | |
| "mean_token_accuracy": 0.6045101627707481, | |
| "num_tokens": 1101822.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 2.2398334205150605, | |
| "epoch": 0.49159120310478654, | |
| "grad_norm": 0.35060882568359375, | |
| "learning_rate": 0.00019948949314114208, | |
| "loss": 2.1407, | |
| "mean_token_accuracy": 0.5221379362046719, | |
| "num_tokens": 1125242.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 3.20022537112236, | |
| "epoch": 0.5045278137128072, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019943527148830138, | |
| "loss": 2.1867, | |
| "mean_token_accuracy": 0.3573383778333664, | |
| "num_tokens": 1132694.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 3.233865666389465, | |
| "epoch": 0.517464424320828, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019937832194897968, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 1133334.0, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.883677563071251, | |
| "epoch": 0.5304010349288486, | |
| "grad_norm": 0.253384530544281, | |
| "learning_rate": 0.00019931864608551886, | |
| "loss": 2.065, | |
| "mean_token_accuracy": 0.5480175256729126, | |
| "num_tokens": 1208651.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.8230630427598953, | |
| "epoch": 0.5433376455368694, | |
| "grad_norm": 0.27244824171066284, | |
| "learning_rate": 0.000199256245535054, | |
| "loss": 1.7993, | |
| "mean_token_accuracy": 0.5971413522958755, | |
| "num_tokens": 1241633.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 2.1840337038040163, | |
| "epoch": 0.55627425614489, | |
| "grad_norm": 0.33489564061164856, | |
| "learning_rate": 0.00019919112200946878, | |
| "loss": 2.1355, | |
| "mean_token_accuracy": 0.523309488594532, | |
| "num_tokens": 1265245.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 3.2613951563835144, | |
| "epoch": 0.5692108667529108, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001991232772953485, | |
| "loss": 2.0666, | |
| "mean_token_accuracy": 0.36050624772906303, | |
| "num_tokens": 1272655.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 2.055793708562851, | |
| "epoch": 0.5821474773609314, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001990527132539308, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 1273295.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5821474773609314, | |
| "eval_entropy": 1.5776830232420633, | |
| "eval_loss": 1.6587693691253662, | |
| "eval_mean_token_accuracy": 0.40149001534595047, | |
| "eval_num_tokens": 1273295.0, | |
| "eval_runtime": 245.22, | |
| "eval_samples_per_second": 22.413, | |
| "eval_steps_per_second": 1.403, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.9504007667303085, | |
| "epoch": 0.5950840879689522, | |
| "grad_norm": 0.2335178405046463, | |
| "learning_rate": 0.00019897943182105486, | |
| "loss": 2.1289, | |
| "mean_token_accuracy": 0.5388719126582145, | |
| "num_tokens": 1353662.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.812851694226265, | |
| "epoch": 0.6080206985769728, | |
| "grad_norm": 0.27590492367744446, | |
| "learning_rate": 0.00019890343500710827, | |
| "loss": 1.79, | |
| "mean_token_accuracy": 0.5952848941087723, | |
| "num_tokens": 1386745.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 2.1694509744644166, | |
| "epoch": 0.6209573091849935, | |
| "grad_norm": 0.36973315477371216, | |
| "learning_rate": 0.0001988247248969717, | |
| "loss": 2.1425, | |
| "mean_token_accuracy": 0.5235736042261123, | |
| "num_tokens": 1410114.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 3.2446247756481172, | |
| "epoch": 0.6338939197930142, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019874330364996192, | |
| "loss": 2.0907, | |
| "mean_token_accuracy": 0.3589281477034092, | |
| "num_tokens": 1417385.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 2.887257432937622, | |
| "epoch": 0.6468305304010349, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019865917349977242, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 1418025.0, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 2.0031155347824097, | |
| "epoch": 0.6597671410090556, | |
| "grad_norm": 0.2290731519460678, | |
| "learning_rate": 0.00019857233675441217, | |
| "loss": 2.1288, | |
| "mean_token_accuracy": 0.5355072975158691, | |
| "num_tokens": 1498284.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.7464266479015351, | |
| "epoch": 0.6727037516170763, | |
| "grad_norm": 0.27917975187301636, | |
| "learning_rate": 0.0001984827957961423, | |
| "loss": 1.7213, | |
| "mean_token_accuracy": 0.6062818467617035, | |
| "num_tokens": 1531645.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 2.099287986755371, | |
| "epoch": 0.685640362225097, | |
| "grad_norm": 0.34847304224967957, | |
| "learning_rate": 0.00019839055308141078, | |
| "loss": 2.0957, | |
| "mean_token_accuracy": 0.5292750775814057, | |
| "num_tokens": 1555744.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 2.987301951646805, | |
| "epoch": 0.6985769728331177, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019829561114078503, | |
| "loss": 2.035, | |
| "mean_token_accuracy": 0.35071768537163733, | |
| "num_tokens": 1563621.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.7990713268518448, | |
| "epoch": 0.7115135834411385, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019819797257888237, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 1564261.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.9829886645078658, | |
| "epoch": 0.7244501940491591, | |
| "grad_norm": 0.23086819052696228, | |
| "learning_rate": 0.00019809764007429874, | |
| "loss": 2.0682, | |
| "mean_token_accuracy": 0.546464990824461, | |
| "num_tokens": 1645469.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.742349737882614, | |
| "epoch": 0.7373868046571799, | |
| "grad_norm": 0.2855489253997803, | |
| "learning_rate": 0.00019799461637953517, | |
| "loss": 1.7437, | |
| "mean_token_accuracy": 0.6023638218641281, | |
| "num_tokens": 1678187.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 2.0789969861507416, | |
| "epoch": 0.7503234152652005, | |
| "grad_norm": 0.3439568877220154, | |
| "learning_rate": 0.00019788890432092211, | |
| "loss": 2.0849, | |
| "mean_token_accuracy": 0.5323359861969947, | |
| "num_tokens": 1701620.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 3.068958950042725, | |
| "epoch": 0.7632600258732212, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001977805067985422, | |
| "loss": 2.0752, | |
| "mean_token_accuracy": 0.34963107854127884, | |
| "num_tokens": 1709495.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.8244159191846847, | |
| "epoch": 0.7761966364812419, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019766942678615035, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 1710135.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7761966364812419, | |
| "eval_entropy": 1.7642724643959555, | |
| "eval_loss": 1.5706199407577515, | |
| "eval_mean_token_accuracy": 0.41619762856253356, | |
| "eval_num_tokens": 1710135.0, | |
| "eval_runtime": 241.9763, | |
| "eval_samples_per_second": 22.713, | |
| "eval_steps_per_second": 1.422, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 2.0163265824317933, | |
| "epoch": 0.7891332470892626, | |
| "grad_norm": 0.21755698323249817, | |
| "learning_rate": 0.00019755566733109251, | |
| "loss": 2.083, | |
| "mean_token_accuracy": 0.5411292694509029, | |
| "num_tokens": 1791443.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.7245848059654236, | |
| "epoch": 0.8020698576972833, | |
| "grad_norm": 0.288361519575119, | |
| "learning_rate": 0.0001974392315542218, | |
| "loss": 1.735, | |
| "mean_token_accuracy": 0.6052085891366005, | |
| "num_tokens": 1824564.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 2.11205150783062, | |
| "epoch": 0.815006468305304, | |
| "grad_norm": 0.3383215069770813, | |
| "learning_rate": 0.000197320122649813, | |
| "loss": 2.1082, | |
| "mean_token_accuracy": 0.5229554586112499, | |
| "num_tokens": 1847974.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 3.1667094111442564, | |
| "epoch": 0.8279430789133247, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.000197198343885475, | |
| "loss": 2.0386, | |
| "mean_token_accuracy": 0.3788307599723339, | |
| "num_tokens": 1855343.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 4.338292050361633, | |
| "epoch": 0.8408796895213454, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019707389860206087, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 1855983.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 2.0194253027439117, | |
| "epoch": 0.8538163001293662, | |
| "grad_norm": 0.2260085493326187, | |
| "learning_rate": 0.00019694679021357666, | |
| "loss": 2.0757, | |
| "mean_token_accuracy": 0.5414572946727276, | |
| "num_tokens": 1933686.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.7457041829824447, | |
| "epoch": 0.8667529107373868, | |
| "grad_norm": 0.2763752341270447, | |
| "learning_rate": 0.00019681702220708725, | |
| "loss": 1.7265, | |
| "mean_token_accuracy": 0.6072784595191478, | |
| "num_tokens": 1967008.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 2.0987232238054276, | |
| "epoch": 0.8796895213454075, | |
| "grad_norm": 0.3309071958065033, | |
| "learning_rate": 0.00019668459814262116, | |
| "loss": 2.0841, | |
| "mean_token_accuracy": 0.5245410539209843, | |
| "num_tokens": 1990659.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 3.1821718513965607, | |
| "epoch": 0.8926261319534282, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019654952165307245, | |
| "loss": 2.229, | |
| "mean_token_accuracy": 0.3981798455119133, | |
| "num_tokens": 1999251.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.533292955160141, | |
| "epoch": 0.9055627425614489, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019641179644410136, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 1999891.0, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.9957170754671096, | |
| "epoch": 0.9184993531694696, | |
| "grad_norm": 0.24394062161445618, | |
| "learning_rate": 0.00019627142629403258, | |
| "loss": 2.0975, | |
| "mean_token_accuracy": 0.5407429985702038, | |
| "num_tokens": 2079895.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.7518584847450256, | |
| "epoch": 0.9314359637774903, | |
| "grad_norm": 0.307822048664093, | |
| "learning_rate": 0.00019612841505375138, | |
| "loss": 1.7164, | |
| "mean_token_accuracy": 0.610467329621315, | |
| "num_tokens": 2113509.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 2.1020208179950712, | |
| "epoch": 0.944372574385511, | |
| "grad_norm": 0.35130032896995544, | |
| "learning_rate": 0.0001959827666465984, | |
| "loss": 2.1253, | |
| "mean_token_accuracy": 0.5220636121928692, | |
| "num_tokens": 2137129.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 2.8922512531280518, | |
| "epoch": 0.9573091849935317, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019583448506826155, | |
| "loss": 1.9805, | |
| "mean_token_accuracy": 0.3766542553901672, | |
| "num_tokens": 2144488.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 2.719996190071106, | |
| "epoch": 0.9702457956015524, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019568357438666675, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 2145128.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.9702457956015524, | |
| "eval_entropy": 2.1529439126336296, | |
| "eval_loss": 1.5584267377853394, | |
| "eval_mean_token_accuracy": 0.41127229698522144, | |
| "eval_num_tokens": 2145128.0, | |
| "eval_runtime": 239.9653, | |
| "eval_samples_per_second": 22.903, | |
| "eval_steps_per_second": 1.434, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.9202934563159944, | |
| "epoch": 0.9831824062095731, | |
| "grad_norm": 0.28597503900527954, | |
| "learning_rate": 0.00019553003874186607, | |
| "loss": 1.9302, | |
| "mean_token_accuracy": 0.5697067268192768, | |
| "num_tokens": 2197523.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 2.492697748541832, | |
| "epoch": 0.9961190168175937, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019537388234592442, | |
| "loss": 1.81, | |
| "mean_token_accuracy": 0.39367630481719973, | |
| "num_tokens": 2210056.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 2.2332220911979674, | |
| "epoch": 1.0090556274256144, | |
| "grad_norm": 0.24866575002670288, | |
| "learning_rate": 0.00019521510948280373, | |
| "loss": 1.5005, | |
| "mean_token_accuracy": 0.36937303096055984, | |
| "num_tokens": 2275252.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.6707671225070952, | |
| "epoch": 1.0219922380336353, | |
| "grad_norm": 0.26215294003486633, | |
| "learning_rate": 0.0001950537245082456, | |
| "loss": 1.6341, | |
| "mean_token_accuracy": 0.6254087015986443, | |
| "num_tokens": 2311716.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.8872807383537293, | |
| "epoch": 1.034928848641656, | |
| "grad_norm": 0.36441880464553833, | |
| "learning_rate": 0.0001948897318496517, | |
| "loss": 1.8977, | |
| "mean_token_accuracy": 0.5622286461293697, | |
| "num_tokens": 2338280.0, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 2.630810996890068, | |
| "epoch": 1.0478654592496766, | |
| "grad_norm": 0.9684458374977112, | |
| "learning_rate": 0.0001947231360059624, | |
| "loss": 2.4046, | |
| "mean_token_accuracy": 0.48553739935159684, | |
| "num_tokens": 2351659.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 2.581965911388397, | |
| "epoch": 1.0608020698576972, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001945539415475333, | |
| "loss": 0.1693, | |
| "mean_token_accuracy": 0.06434160768985749, | |
| "num_tokens": 2352447.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 2.1218140482902528, | |
| "epoch": 1.073738680465718, | |
| "grad_norm": 0.28001680970191956, | |
| "learning_rate": 0.00019438215311600989, | |
| "loss": 1.5396, | |
| "mean_token_accuracy": 0.3639061972498894, | |
| "num_tokens": 2421672.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 1.6590570658445358, | |
| "epoch": 1.0866752910737387, | |
| "grad_norm": 0.27536195516586304, | |
| "learning_rate": 0.0001942077754242001, | |
| "loss": 1.5986, | |
| "mean_token_accuracy": 0.6285051852464676, | |
| "num_tokens": 2458016.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 1.8610892415046691, | |
| "epoch": 1.0996119016817594, | |
| "grad_norm": 0.3670406937599182, | |
| "learning_rate": 0.00019403081325594516, | |
| "loss": 1.8678, | |
| "mean_token_accuracy": 0.5674182385206222, | |
| "num_tokens": 2484503.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 2.5339868366718292, | |
| "epoch": 1.11254851228978, | |
| "grad_norm": 0.911289393901825, | |
| "learning_rate": 0.0001938512714659882, | |
| "loss": 2.3594, | |
| "mean_token_accuracy": 0.49951401725411415, | |
| "num_tokens": 2498485.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 1.802379448711872, | |
| "epoch": 1.1254851228978007, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019366915497984126, | |
| "loss": 0.1255, | |
| "mean_token_accuracy": 0.04691708832979202, | |
| "num_tokens": 2499204.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 1.716230283677578, | |
| "epoch": 1.1384217335058215, | |
| "grad_norm": 0.29532766342163086, | |
| "learning_rate": 0.00019348446879364998, | |
| "loss": 1.5067, | |
| "mean_token_accuracy": 0.3694909021258354, | |
| "num_tokens": 2567621.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 1.6236608117818832, | |
| "epoch": 1.1513583441138422, | |
| "grad_norm": 0.29713669419288635, | |
| "learning_rate": 0.00019329721797405665, | |
| "loss": 1.5861, | |
| "mean_token_accuracy": 0.6327742949128151, | |
| "num_tokens": 2603962.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 1.8917641669511795, | |
| "epoch": 1.1642949547218628, | |
| "grad_norm": 0.3658815324306488, | |
| "learning_rate": 0.00019310740765806112, | |
| "loss": 1.9243, | |
| "mean_token_accuracy": 0.5606695532798767, | |
| "num_tokens": 2630252.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.1642949547218628, | |
| "eval_entropy": 1.9112664786882179, | |
| "eval_loss": 1.4807052612304688, | |
| "eval_mean_token_accuracy": 0.42875484818982523, | |
| "eval_num_tokens": 2630252.0, | |
| "eval_runtime": 244.6093, | |
| "eval_samples_per_second": 22.468, | |
| "eval_steps_per_second": 1.406, | |
| "step": 900 | |
| }, | |
| { | |
| "entropy": 2.6821564227342605, | |
| "epoch": 1.1772315653298835, | |
| "grad_norm": 1.0140999555587769, | |
| "learning_rate": 0.00019291504305288005, | |
| "loss": 2.4338, | |
| "mean_token_accuracy": 0.482094044983387, | |
| "num_tokens": 2643300.0, | |
| "step": 910 | |
| }, | |
| { | |
| "entropy": 2.024871030449867, | |
| "epoch": 1.1901681759379044, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019272012943580383, | |
| "loss": 0.088, | |
| "mean_token_accuracy": 0.05487980842590332, | |
| "num_tokens": 2644037.0, | |
| "step": 920 | |
| }, | |
| { | |
| "entropy": 1.9695144146680832, | |
| "epoch": 1.203104786545925, | |
| "grad_norm": 0.290670245885849, | |
| "learning_rate": 0.00019252267215405188, | |
| "loss": 1.523, | |
| "mean_token_accuracy": 0.36803208142518995, | |
| "num_tokens": 2711455.0, | |
| "step": 930 | |
| }, | |
| { | |
| "entropy": 1.634880828857422, | |
| "epoch": 1.2160413971539457, | |
| "grad_norm": 0.2892841100692749, | |
| "learning_rate": 0.00019232267662462618, | |
| "loss": 1.5725, | |
| "mean_token_accuracy": 0.6363927751779557, | |
| "num_tokens": 2747178.0, | |
| "step": 940 | |
| }, | |
| { | |
| "entropy": 1.8903283953666687, | |
| "epoch": 1.2289780077619663, | |
| "grad_norm": 0.3681142330169678, | |
| "learning_rate": 0.00019212014833416222, | |
| "loss": 1.9128, | |
| "mean_token_accuracy": 0.5572593852877616, | |
| "num_tokens": 2773302.0, | |
| "step": 950 | |
| }, | |
| { | |
| "entropy": 2.5646925628185273, | |
| "epoch": 1.2419146183699872, | |
| "grad_norm": 2.999826669692993, | |
| "learning_rate": 0.00019191509283877892, | |
| "loss": 2.3972, | |
| "mean_token_accuracy": 0.49176110327243805, | |
| "num_tokens": 2787000.0, | |
| "step": 960 | |
| }, | |
| { | |
| "entropy": 2.151153501868248, | |
| "epoch": 1.2548512289780078, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019170751576392587, | |
| "loss": 0.1193, | |
| "mean_token_accuracy": 0.044841271638870236, | |
| "num_tokens": 2787722.0, | |
| "step": 970 | |
| }, | |
| { | |
| "entropy": 1.8522070705890656, | |
| "epoch": 1.2677878395860285, | |
| "grad_norm": 0.2727435827255249, | |
| "learning_rate": 0.00019149742280422924, | |
| "loss": 1.5171, | |
| "mean_token_accuracy": 0.36686722859740256, | |
| "num_tokens": 2854084.0, | |
| "step": 980 | |
| }, | |
| { | |
| "entropy": 1.5743449032306671, | |
| "epoch": 1.2807244501940491, | |
| "grad_norm": 0.2871781289577484, | |
| "learning_rate": 0.00019128481972333544, | |
| "loss": 1.5921, | |
| "mean_token_accuracy": 0.6345128893852234, | |
| "num_tokens": 2890579.0, | |
| "step": 990 | |
| }, | |
| { | |
| "entropy": 1.969143381714821, | |
| "epoch": 1.2936610608020698, | |
| "grad_norm": 0.4106636643409729, | |
| "learning_rate": 0.00019106971235375298, | |
| "loss": 1.9566, | |
| "mean_token_accuracy": 0.5519939877092839, | |
| "num_tokens": 2917103.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "entropy": 2.637175753712654, | |
| "epoch": 1.3065976714100906, | |
| "grad_norm": 0.956899881362915, | |
| "learning_rate": 0.0001908521065966926, | |
| "loss": 2.4367, | |
| "mean_token_accuracy": 0.47931770235300064, | |
| "num_tokens": 2930324.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "entropy": 1.2107470080256462, | |
| "epoch": 1.3195342820181113, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00019063200842190514, | |
| "loss": 0.1138, | |
| "mean_token_accuracy": 0.07033292502164841, | |
| "num_tokens": 2931098.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "entropy": 1.5640547186136247, | |
| "epoch": 1.332470892626132, | |
| "grad_norm": 0.2837156057357788, | |
| "learning_rate": 0.00019040942386751804, | |
| "loss": 1.5281, | |
| "mean_token_accuracy": 0.368409526348114, | |
| "num_tokens": 2998986.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "entropy": 1.6472883015871047, | |
| "epoch": 1.3454075032341526, | |
| "grad_norm": 0.31581056118011475, | |
| "learning_rate": 0.00019018435903986943, | |
| "loss": 1.6144, | |
| "mean_token_accuracy": 0.62486432492733, | |
| "num_tokens": 3035300.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "entropy": 1.8509329915046693, | |
| "epoch": 1.3583441138421732, | |
| "grad_norm": 0.39050692319869995, | |
| "learning_rate": 0.00018995682011334087, | |
| "loss": 1.8415, | |
| "mean_token_accuracy": 0.5710361421108245, | |
| "num_tokens": 3062133.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.3583441138421732, | |
| "eval_entropy": 1.7658769363580749, | |
| "eval_loss": 1.464791178703308, | |
| "eval_mean_token_accuracy": 0.429339470125215, | |
| "eval_num_tokens": 3062133.0, | |
| "eval_runtime": 243.4077, | |
| "eval_samples_per_second": 22.579, | |
| "eval_steps_per_second": 1.413, | |
| "step": 1050 | |
| }, | |
| { | |
| "entropy": 2.4731887727975845, | |
| "epoch": 1.371280724450194, | |
| "grad_norm": 0.9063658714294434, | |
| "learning_rate": 0.00018972681333018776, | |
| "loss": 2.3412, | |
| "mean_token_accuracy": 0.4919880717992783, | |
| "num_tokens": 3076137.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "entropy": 1.815966796875, | |
| "epoch": 1.3842173350582148, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018949434500036816, | |
| "loss": 0.2748, | |
| "mean_token_accuracy": 0.094140625, | |
| "num_tokens": 3077033.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "entropy": 1.7788158431649208, | |
| "epoch": 1.3971539456662354, | |
| "grad_norm": 0.28700482845306396, | |
| "learning_rate": 0.0001892594215013697, | |
| "loss": 1.491, | |
| "mean_token_accuracy": 0.3707178644835949, | |
| "num_tokens": 3139012.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "entropy": 1.5893326640129088, | |
| "epoch": 1.4100905562742563, | |
| "grad_norm": 0.3248252868652344, | |
| "learning_rate": 0.00018902204927803462, | |
| "loss": 1.5707, | |
| "mean_token_accuracy": 0.6353108420968056, | |
| "num_tokens": 3175132.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "entropy": 1.8777880787849426, | |
| "epoch": 1.4230271668822767, | |
| "grad_norm": 0.4096948206424713, | |
| "learning_rate": 0.00018878223484238295, | |
| "loss": 1.9016, | |
| "mean_token_accuracy": 0.5628921225667, | |
| "num_tokens": 3201175.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "entropy": 2.5787813514471054, | |
| "epoch": 1.4359637774902976, | |
| "grad_norm": 0.9349520206451416, | |
| "learning_rate": 0.00018853998477343385, | |
| "loss": 2.4275, | |
| "mean_token_accuracy": 0.4918954521417618, | |
| "num_tokens": 3213218.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "entropy": 1.5953246742486953, | |
| "epoch": 1.4489003880983182, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018829530571702515, | |
| "loss": 0.0759, | |
| "mean_token_accuracy": 0.03794117569923401, | |
| "num_tokens": 3213902.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "entropy": 1.7004274040460587, | |
| "epoch": 1.4618369987063389, | |
| "grad_norm": 0.28281426429748535, | |
| "learning_rate": 0.000188048204385631, | |
| "loss": 1.4741, | |
| "mean_token_accuracy": 0.37432471886277197, | |
| "num_tokens": 3278399.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "entropy": 1.54911307990551, | |
| "epoch": 1.4747736093143597, | |
| "grad_norm": 0.3112603425979614, | |
| "learning_rate": 0.00018779868755817777, | |
| "loss": 1.529, | |
| "mean_token_accuracy": 0.6405477434396744, | |
| "num_tokens": 3314005.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "entropy": 1.8169409155845642, | |
| "epoch": 1.4877102199223804, | |
| "grad_norm": 0.4136084020137787, | |
| "learning_rate": 0.00018754676207985798, | |
| "loss": 1.8563, | |
| "mean_token_accuracy": 0.5684241697192192, | |
| "num_tokens": 3339761.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "entropy": 2.6468518733978272, | |
| "epoch": 1.500646830530401, | |
| "grad_norm": 0.9774990081787109, | |
| "learning_rate": 0.00018729243486194258, | |
| "loss": 2.4068, | |
| "mean_token_accuracy": 0.49020475447177886, | |
| "num_tokens": 3352396.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "entropy": 1.844868466258049, | |
| "epoch": 1.5135834411384217, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001870357128815915, | |
| "loss": 0.1083, | |
| "mean_token_accuracy": 0.03311403542757034, | |
| "num_tokens": 3353089.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "entropy": 1.77299522459507, | |
| "epoch": 1.5265200517464423, | |
| "grad_norm": 0.29017725586891174, | |
| "learning_rate": 0.00018677660318166178, | |
| "loss": 1.5134, | |
| "mean_token_accuracy": 0.37067501023411753, | |
| "num_tokens": 3417806.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "entropy": 1.605825701355934, | |
| "epoch": 1.5394566623544632, | |
| "grad_norm": 0.3007327616214752, | |
| "learning_rate": 0.000186515112870515, | |
| "loss": 1.5754, | |
| "mean_token_accuracy": 0.6359535038471222, | |
| "num_tokens": 3453968.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "entropy": 1.8059845566749573, | |
| "epoch": 1.5523932729624839, | |
| "grad_norm": 0.4170464277267456, | |
| "learning_rate": 0.0001862512491218217, | |
| "loss": 1.8209, | |
| "mean_token_accuracy": 0.5729366824030876, | |
| "num_tokens": 3480122.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.5523932729624839, | |
| "eval_entropy": 1.846903031302053, | |
| "eval_loss": 1.4494483470916748, | |
| "eval_mean_token_accuracy": 0.4338979678618353, | |
| "eval_num_tokens": 3480122.0, | |
| "eval_runtime": 245.7587, | |
| "eval_samples_per_second": 22.363, | |
| "eval_steps_per_second": 1.4, | |
| "step": 1200 | |
| }, | |
| { | |
| "entropy": 2.4776687741279604, | |
| "epoch": 1.5653298835705045, | |
| "grad_norm": 1.236024022102356, | |
| "learning_rate": 0.00018598501917436487, | |
| "loss": 2.2694, | |
| "mean_token_accuracy": 0.5161234959959984, | |
| "num_tokens": 3492043.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "entropy": 2.871905821561813, | |
| "epoch": 1.5782664941785254, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018571643033184136, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 3492683.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "entropy": 2.30782949924469, | |
| "epoch": 1.5912031047865458, | |
| "grad_norm": 0.3269418179988861, | |
| "learning_rate": 0.00018544548996266138, | |
| "loss": 1.4917, | |
| "mean_token_accuracy": 0.3702575147151947, | |
| "num_tokens": 3561621.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "entropy": 1.5860986828804016, | |
| "epoch": 1.6041397153945667, | |
| "grad_norm": 0.33811113238334656, | |
| "learning_rate": 0.00018517220549974642, | |
| "loss": 1.5659, | |
| "mean_token_accuracy": 0.6364668473601341, | |
| "num_tokens": 3597551.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "entropy": 1.8561100304126739, | |
| "epoch": 1.6170763260025873, | |
| "grad_norm": 0.4206816554069519, | |
| "learning_rate": 0.00018489658444032544, | |
| "loss": 1.8636, | |
| "mean_token_accuracy": 0.5685464948415756, | |
| "num_tokens": 3623516.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "entropy": 2.475165989995003, | |
| "epoch": 1.630012936610608, | |
| "grad_norm": 0.9206745624542236, | |
| "learning_rate": 0.00018461863434572905, | |
| "loss": 2.3686, | |
| "mean_token_accuracy": 0.49352553114295006, | |
| "num_tokens": 3636662.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "entropy": 1.5844505287706852, | |
| "epoch": 1.6429495472186288, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001843383628411821, | |
| "loss": 0.1782, | |
| "mean_token_accuracy": 0.08751860111951829, | |
| "num_tokens": 3637501.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "entropy": 1.5500032015144825, | |
| "epoch": 1.6558861578266493, | |
| "grad_norm": 0.2985474169254303, | |
| "learning_rate": 0.00018405577761559453, | |
| "loss": 1.5005, | |
| "mean_token_accuracy": 0.3704367861151695, | |
| "num_tokens": 3705898.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "entropy": 1.5747513711452483, | |
| "epoch": 1.6688227684346701, | |
| "grad_norm": 0.3510088622570038, | |
| "learning_rate": 0.0001837708864213505, | |
| "loss": 1.5586, | |
| "mean_token_accuracy": 0.6378742828965187, | |
| "num_tokens": 3742275.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "entropy": 1.7819489419460297, | |
| "epoch": 1.6817593790426908, | |
| "grad_norm": 0.42687690258026123, | |
| "learning_rate": 0.00018348369707409546, | |
| "loss": 1.8096, | |
| "mean_token_accuracy": 0.5733471587300301, | |
| "num_tokens": 3768563.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "entropy": 2.4534367620944977, | |
| "epoch": 1.6946959896507114, | |
| "grad_norm": 0.9902492165565491, | |
| "learning_rate": 0.00018319421745252208, | |
| "loss": 2.3035, | |
| "mean_token_accuracy": 0.49916471540927887, | |
| "num_tokens": 3782396.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "entropy": 1.977598437666893, | |
| "epoch": 1.7076326002587323, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018290245549815385, | |
| "loss": 0.1527, | |
| "mean_token_accuracy": 0.0657636746764183, | |
| "num_tokens": 3783196.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "entropy": 2.1555118948221206, | |
| "epoch": 1.720569210866753, | |
| "grad_norm": 0.3243282437324524, | |
| "learning_rate": 0.0001826084192151273, | |
| "loss": 1.5106, | |
| "mean_token_accuracy": 0.36851018443703654, | |
| "num_tokens": 3846769.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "entropy": 1.5848265200853349, | |
| "epoch": 1.7335058214747736, | |
| "grad_norm": 0.32707569003105164, | |
| "learning_rate": 0.00018231211666997247, | |
| "loss": 1.5277, | |
| "mean_token_accuracy": 0.642450013756752, | |
| "num_tokens": 3882748.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "entropy": 1.8691698461771011, | |
| "epoch": 1.7464424320827943, | |
| "grad_norm": 0.43988320231437683, | |
| "learning_rate": 0.00018201355599139154, | |
| "loss": 1.9016, | |
| "mean_token_accuracy": 0.56101154088974, | |
| "num_tokens": 3908934.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.7464424320827943, | |
| "eval_entropy": 1.7982253941685655, | |
| "eval_loss": 1.4296140670776367, | |
| "eval_mean_token_accuracy": 0.43251860254379204, | |
| "eval_num_tokens": 3908934.0, | |
| "eval_runtime": 245.0387, | |
| "eval_samples_per_second": 22.429, | |
| "eval_steps_per_second": 1.404, | |
| "step": 1350 | |
| }, | |
| { | |
| "entropy": 2.471151527762413, | |
| "epoch": 1.759379042690815, | |
| "grad_norm": 0.9302666187286377, | |
| "learning_rate": 0.0001817127453700358, | |
| "loss": 2.3247, | |
| "mean_token_accuracy": 0.5023237220942974, | |
| "num_tokens": 3922255.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "entropy": 1.8378637909889222, | |
| "epoch": 1.7723156532988358, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00018140969305828106, | |
| "loss": 0.0576, | |
| "mean_token_accuracy": 0.0373076930642128, | |
| "num_tokens": 3922926.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "entropy": 1.7470036551356316, | |
| "epoch": 1.7852522639068564, | |
| "grad_norm": 0.3011367619037628, | |
| "learning_rate": 0.00018110440737000122, | |
| "loss": 1.4591, | |
| "mean_token_accuracy": 0.3771127283573151, | |
| "num_tokens": 3990074.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "entropy": 1.5329654335975647, | |
| "epoch": 1.798188874514877, | |
| "grad_norm": 0.31504422426223755, | |
| "learning_rate": 0.00018079689668034005, | |
| "loss": 1.4973, | |
| "mean_token_accuracy": 0.6467197388410568, | |
| "num_tokens": 4026755.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "entropy": 1.7885783523321153, | |
| "epoch": 1.811125485122898, | |
| "grad_norm": 0.42766207456588745, | |
| "learning_rate": 0.00018048716942548168, | |
| "loss": 1.8211, | |
| "mean_token_accuracy": 0.5723803475499153, | |
| "num_tokens": 4053589.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "entropy": 2.405156469345093, | |
| "epoch": 1.8240620957309184, | |
| "grad_norm": 0.953956663608551, | |
| "learning_rate": 0.00018017523410241893, | |
| "loss": 2.2967, | |
| "mean_token_accuracy": 0.5070258714258671, | |
| "num_tokens": 4068297.0, | |
| "step": 1410 | |
| }, | |
| { | |
| "entropy": 1.202190825343132, | |
| "epoch": 1.8369987063389392, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017986109926872032, | |
| "loss": 0.2475, | |
| "mean_token_accuracy": 0.09388883709907532, | |
| "num_tokens": 4069205.0, | |
| "step": 1420 | |
| }, | |
| { | |
| "entropy": 1.8208864331245422, | |
| "epoch": 1.84993531694696, | |
| "grad_norm": 0.30337706208229065, | |
| "learning_rate": 0.00017954477354229536, | |
| "loss": 1.4609, | |
| "mean_token_accuracy": 0.3746915958821774, | |
| "num_tokens": 4135636.0, | |
| "step": 1430 | |
| }, | |
| { | |
| "entropy": 1.547205138206482, | |
| "epoch": 1.8628719275549805, | |
| "grad_norm": 0.3231499493122101, | |
| "learning_rate": 0.00017922626560115798, | |
| "loss": 1.5262, | |
| "mean_token_accuracy": 0.6422269076108933, | |
| "num_tokens": 4171871.0, | |
| "step": 1440 | |
| }, | |
| { | |
| "entropy": 1.8343932330608368, | |
| "epoch": 1.8758085381630014, | |
| "grad_norm": 0.45170995593070984, | |
| "learning_rate": 0.0001789055841831885, | |
| "loss": 1.8589, | |
| "mean_token_accuracy": 0.5682013630867004, | |
| "num_tokens": 4198004.0, | |
| "step": 1450 | |
| }, | |
| { | |
| "entropy": 2.4178356170654296, | |
| "epoch": 1.8887451487710218, | |
| "grad_norm": 1.1836594343185425, | |
| "learning_rate": 0.00017858273808589402, | |
| "loss": 2.219, | |
| "mean_token_accuracy": 0.5180532835423947, | |
| "num_tokens": 4210568.0, | |
| "step": 1460 | |
| }, | |
| { | |
| "entropy": 1.6111401319503784, | |
| "epoch": 1.9016817593790427, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017825773616616703, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 4211208.0, | |
| "step": 1470 | |
| }, | |
| { | |
| "entropy": 1.7321304202079773, | |
| "epoch": 1.9146183699870634, | |
| "grad_norm": 0.30086463689804077, | |
| "learning_rate": 0.0001779305873400423, | |
| "loss": 1.4654, | |
| "mean_token_accuracy": 0.3772578649222851, | |
| "num_tokens": 4279659.0, | |
| "step": 1480 | |
| }, | |
| { | |
| "entropy": 1.5423625767230988, | |
| "epoch": 1.927554980595084, | |
| "grad_norm": 0.33361881971359253, | |
| "learning_rate": 0.00017760130058245242, | |
| "loss": 1.4942, | |
| "mean_token_accuracy": 0.6453819587826729, | |
| "num_tokens": 4315273.0, | |
| "step": 1490 | |
| }, | |
| { | |
| "entropy": 1.8349818885326385, | |
| "epoch": 1.9404915912031049, | |
| "grad_norm": 0.4649695158004761, | |
| "learning_rate": 0.0001772698849269816, | |
| "loss": 1.8167, | |
| "mean_token_accuracy": 0.5768257766962052, | |
| "num_tokens": 4341460.0, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.9404915912031049, | |
| "eval_entropy": 1.7394565719851227, | |
| "eval_loss": 1.4138603210449219, | |
| "eval_mean_token_accuracy": 0.43571045708864237, | |
| "eval_num_tokens": 4341460.0, | |
| "eval_runtime": 245.3477, | |
| "eval_samples_per_second": 22.401, | |
| "eval_steps_per_second": 1.402, | |
| "step": 1500 | |
| }, | |
| { | |
| "entropy": 2.375590392947197, | |
| "epoch": 1.9534282018111255, | |
| "grad_norm": 0.9830443263053894, | |
| "learning_rate": 0.00017693634946561775, | |
| "loss": 2.2734, | |
| "mean_token_accuracy": 0.5091598987579345, | |
| "num_tokens": 4355559.0, | |
| "step": 1510 | |
| }, | |
| { | |
| "entropy": 2.5652388006448748, | |
| "epoch": 1.9663648124191462, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017660070334850304, | |
| "loss": 0.1559, | |
| "mean_token_accuracy": 0.07029985040426254, | |
| "num_tokens": 4356373.0, | |
| "step": 1520 | |
| }, | |
| { | |
| "entropy": 2.071737268567085, | |
| "epoch": 1.9793014230271668, | |
| "grad_norm": 0.33517006039619446, | |
| "learning_rate": 0.00017626295578368305, | |
| "loss": 1.2406, | |
| "mean_token_accuracy": 0.418473818898201, | |
| "num_tokens": 4398312.0, | |
| "step": 1530 | |
| }, | |
| { | |
| "entropy": 2.0724180698394776, | |
| "epoch": 1.9922380336351875, | |
| "grad_norm": 0.8757649660110474, | |
| "learning_rate": 0.00017592311603685393, | |
| "loss": 2.0395, | |
| "mean_token_accuracy": 0.5450932942330837, | |
| "num_tokens": 4419963.0, | |
| "step": 1540 | |
| }, | |
| { | |
| "entropy": 2.4648678690195083, | |
| "epoch": 2.0051746442432083, | |
| "grad_norm": 0.29208359122276306, | |
| "learning_rate": 0.00017558119343110838, | |
| "loss": 1.0811, | |
| "mean_token_accuracy": 0.2497500881552696, | |
| "num_tokens": 4466509.0, | |
| "step": 1550 | |
| }, | |
| { | |
| "entropy": 1.4679036349058152, | |
| "epoch": 2.0181112548512288, | |
| "grad_norm": 0.31473416090011597, | |
| "learning_rate": 0.00017523719734667973, | |
| "loss": 1.4439, | |
| "mean_token_accuracy": 0.6536323636770248, | |
| "num_tokens": 4506293.0, | |
| "step": 1560 | |
| }, | |
| { | |
| "entropy": 1.5847829729318619, | |
| "epoch": 2.0310478654592496, | |
| "grad_norm": 0.4749562740325928, | |
| "learning_rate": 0.0001748911372206848, | |
| "loss": 1.5723, | |
| "mean_token_accuracy": 0.6196332320570945, | |
| "num_tokens": 4535291.0, | |
| "step": 1570 | |
| }, | |
| { | |
| "entropy": 2.08716399371624, | |
| "epoch": 2.0439844760672705, | |
| "grad_norm": 0.6515533924102783, | |
| "learning_rate": 0.00017454302254686486, | |
| "loss": 2.0148, | |
| "mean_token_accuracy": 0.5413075156509877, | |
| "num_tokens": 4553239.0, | |
| "step": 1580 | |
| }, | |
| { | |
| "entropy": 2.5849849820137023, | |
| "epoch": 2.056921086675291, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017419286287532516, | |
| "loss": 0.7934, | |
| "mean_token_accuracy": 0.19277514591813089, | |
| "num_tokens": 4555288.0, | |
| "step": 1590 | |
| }, | |
| { | |
| "entropy": 2.378413477540016, | |
| "epoch": 2.069857697283312, | |
| "grad_norm": 0.28206267952919006, | |
| "learning_rate": 0.00017384066781227307, | |
| "loss": 0.9347, | |
| "mean_token_accuracy": 0.1983368217945099, | |
| "num_tokens": 4604552.0, | |
| "step": 1600 | |
| }, | |
| { | |
| "entropy": 1.5316940248012543, | |
| "epoch": 2.0827943078913327, | |
| "grad_norm": 0.33060652017593384, | |
| "learning_rate": 0.0001734864470197544, | |
| "loss": 1.5009, | |
| "mean_token_accuracy": 0.6414364308118821, | |
| "num_tokens": 4644766.0, | |
| "step": 1610 | |
| }, | |
| { | |
| "entropy": 1.6213007301092148, | |
| "epoch": 2.095730918499353, | |
| "grad_norm": 0.5015760064125061, | |
| "learning_rate": 0.00017313021021538844, | |
| "loss": 1.6038, | |
| "mean_token_accuracy": 0.6168796703219414, | |
| "num_tokens": 4673702.0, | |
| "step": 1620 | |
| }, | |
| { | |
| "entropy": 2.1126689702272414, | |
| "epoch": 2.108667529107374, | |
| "grad_norm": 0.7331583499908447, | |
| "learning_rate": 0.0001727719671721013, | |
| "loss": 2.0398, | |
| "mean_token_accuracy": 0.533772025257349, | |
| "num_tokens": 4691426.0, | |
| "step": 1630 | |
| }, | |
| { | |
| "entropy": 2.546648120880127, | |
| "epoch": 2.1216041397153944, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001724117277178579, | |
| "loss": 0.5647, | |
| "mean_token_accuracy": 0.1764907084405422, | |
| "num_tokens": 4693073.0, | |
| "step": 1640 | |
| }, | |
| { | |
| "entropy": 2.3147504776716232, | |
| "epoch": 2.1345407503234153, | |
| "grad_norm": 0.3223225474357605, | |
| "learning_rate": 0.0001720495017353922, | |
| "loss": 0.8825, | |
| "mean_token_accuracy": 0.2041303940117359, | |
| "num_tokens": 4745475.0, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.1345407503234153, | |
| "eval_entropy": 1.9804211553446083, | |
| "eval_loss": 1.4297912120819092, | |
| "eval_mean_token_accuracy": 0.43714187025677326, | |
| "eval_num_tokens": 4745475.0, | |
| "eval_runtime": 240.4238, | |
| "eval_samples_per_second": 22.86, | |
| "eval_steps_per_second": 1.431, | |
| "step": 1650 | |
| }, | |
| { | |
| "entropy": 1.5305457144975663, | |
| "epoch": 2.147477360931436, | |
| "grad_norm": 0.35115179419517517, | |
| "learning_rate": 0.00017168529916193614, | |
| "loss": 1.521, | |
| "mean_token_accuracy": 0.6396576210856437, | |
| "num_tokens": 4786054.0, | |
| "step": 1660 | |
| }, | |
| { | |
| "entropy": 1.5795167148113252, | |
| "epoch": 2.1604139715394566, | |
| "grad_norm": 0.50258469581604, | |
| "learning_rate": 0.00017131912998894717, | |
| "loss": 1.5679, | |
| "mean_token_accuracy": 0.6227076068520546, | |
| "num_tokens": 4815157.0, | |
| "step": 1670 | |
| }, | |
| { | |
| "entropy": 2.0948879569768906, | |
| "epoch": 2.1733505821474774, | |
| "grad_norm": 0.7732148766517639, | |
| "learning_rate": 0.0001709510042618339, | |
| "loss": 2.0484, | |
| "mean_token_accuracy": 0.539436261355877, | |
| "num_tokens": 4833514.0, | |
| "step": 1680 | |
| }, | |
| { | |
| "entropy": 2.419444125890732, | |
| "epoch": 2.186287192755498, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00017058093207968067, | |
| "loss": 0.6193, | |
| "mean_token_accuracy": 0.19686403200030328, | |
| "num_tokens": 4835320.0, | |
| "step": 1690 | |
| }, | |
| { | |
| "entropy": 2.0893223583698273, | |
| "epoch": 2.1992238033635187, | |
| "grad_norm": 0.3104536831378937, | |
| "learning_rate": 0.0001702089235949705, | |
| "loss": 0.8909, | |
| "mean_token_accuracy": 0.20202562659978868, | |
| "num_tokens": 4887586.0, | |
| "step": 1700 | |
| }, | |
| { | |
| "entropy": 1.493579125404358, | |
| "epoch": 2.2121604139715396, | |
| "grad_norm": 0.35835903882980347, | |
| "learning_rate": 0.0001698349890133065, | |
| "loss": 1.5107, | |
| "mean_token_accuracy": 0.6415591448545456, | |
| "num_tokens": 4928021.0, | |
| "step": 1710 | |
| }, | |
| { | |
| "entropy": 1.599936455488205, | |
| "epoch": 2.22509702457956, | |
| "grad_norm": 0.5604035258293152, | |
| "learning_rate": 0.0001694591385931319, | |
| "loss": 1.5589, | |
| "mean_token_accuracy": 0.6183684885501861, | |
| "num_tokens": 4956628.0, | |
| "step": 1720 | |
| }, | |
| { | |
| "entropy": 2.0975252121686934, | |
| "epoch": 2.238033635187581, | |
| "grad_norm": 0.7757624983787537, | |
| "learning_rate": 0.00016908138264544874, | |
| "loss": 2.0586, | |
| "mean_token_accuracy": 0.537506015598774, | |
| "num_tokens": 4973976.0, | |
| "step": 1730 | |
| }, | |
| { | |
| "entropy": 2.402835935354233, | |
| "epoch": 2.2509702457956013, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016870173153353478, | |
| "loss": 0.7325, | |
| "mean_token_accuracy": 0.21586424633860588, | |
| "num_tokens": 4975943.0, | |
| "step": 1740 | |
| }, | |
| { | |
| "entropy": 1.8081632763147355, | |
| "epoch": 2.263906856403622, | |
| "grad_norm": 0.29493167996406555, | |
| "learning_rate": 0.0001683201956726593, | |
| "loss": 0.8952, | |
| "mean_token_accuracy": 0.20223823115229606, | |
| "num_tokens": 5031202.0, | |
| "step": 1750 | |
| }, | |
| { | |
| "entropy": 1.5031701743602752, | |
| "epoch": 2.276843467011643, | |
| "grad_norm": 0.3834936320781708, | |
| "learning_rate": 0.0001679367855297976, | |
| "loss": 1.5076, | |
| "mean_token_accuracy": 0.643890731036663, | |
| "num_tokens": 5071593.0, | |
| "step": 1760 | |
| }, | |
| { | |
| "entropy": 1.6009941071271896, | |
| "epoch": 2.2897800776196635, | |
| "grad_norm": 0.5210739374160767, | |
| "learning_rate": 0.0001675515116233434, | |
| "loss": 1.5777, | |
| "mean_token_accuracy": 0.6210601255297661, | |
| "num_tokens": 5100741.0, | |
| "step": 1770 | |
| }, | |
| { | |
| "entropy": 2.032317912578583, | |
| "epoch": 2.3027166882276844, | |
| "grad_norm": 0.6077569723129272, | |
| "learning_rate": 0.0001671643845228207, | |
| "loss": 1.9718, | |
| "mean_token_accuracy": 0.5442127160727978, | |
| "num_tokens": 5120288.0, | |
| "step": 1780 | |
| }, | |
| { | |
| "entropy": 1.8405873313546182, | |
| "epoch": 2.315653298835705, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016677541484859352, | |
| "loss": 0.9106, | |
| "mean_token_accuracy": 0.22827735766768456, | |
| "num_tokens": 5122772.0, | |
| "step": 1790 | |
| }, | |
| { | |
| "entropy": 1.2277291655540465, | |
| "epoch": 2.3285899094437257, | |
| "grad_norm": 0.2893352806568146, | |
| "learning_rate": 0.0001663846132715747, | |
| "loss": 0.9194, | |
| "mean_token_accuracy": 0.1989746630191803, | |
| "num_tokens": 5178960.0, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.3285899094437257, | |
| "eval_entropy": 1.3877028687748798, | |
| "eval_loss": 1.4073032140731812, | |
| "eval_mean_token_accuracy": 0.44012743035374685, | |
| "eval_num_tokens": 5178960.0, | |
| "eval_runtime": 243.8297, | |
| "eval_samples_per_second": 22.54, | |
| "eval_steps_per_second": 1.411, | |
| "step": 1800 | |
| }, | |
| { | |
| "entropy": 1.5012196868658065, | |
| "epoch": 2.3415265200517466, | |
| "grad_norm": 0.3776693344116211, | |
| "learning_rate": 0.00016599199051293314, | |
| "loss": 1.4982, | |
| "mean_token_accuracy": 0.644976706802845, | |
| "num_tokens": 5220342.0, | |
| "step": 1810 | |
| }, | |
| { | |
| "entropy": 1.6106306850910186, | |
| "epoch": 2.354463130659767, | |
| "grad_norm": 0.5475464463233948, | |
| "learning_rate": 0.0001655975573437996, | |
| "loss": 1.5526, | |
| "mean_token_accuracy": 0.6244173154234887, | |
| "num_tokens": 5249776.0, | |
| "step": 1820 | |
| }, | |
| { | |
| "entropy": 2.004978260397911, | |
| "epoch": 2.367399741267788, | |
| "grad_norm": 0.6898283958435059, | |
| "learning_rate": 0.0001652013245849714, | |
| "loss": 1.9472, | |
| "mean_token_accuracy": 0.557063739746809, | |
| "num_tokens": 5268417.0, | |
| "step": 1830 | |
| }, | |
| { | |
| "entropy": 2.327620804309845, | |
| "epoch": 2.3803363518758087, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016480330310661523, | |
| "loss": 0.7845, | |
| "mean_token_accuracy": 0.20984074249863624, | |
| "num_tokens": 5270607.0, | |
| "step": 1840 | |
| }, | |
| { | |
| "entropy": 2.509730467200279, | |
| "epoch": 2.393272962483829, | |
| "grad_norm": 0.30109038949012756, | |
| "learning_rate": 0.00016440350382796929, | |
| "loss": 0.9268, | |
| "mean_token_accuracy": 0.19716072604060172, | |
| "num_tokens": 5325120.0, | |
| "step": 1850 | |
| }, | |
| { | |
| "entropy": 1.480056384205818, | |
| "epoch": 2.40620957309185, | |
| "grad_norm": 0.36303573846817017, | |
| "learning_rate": 0.00016400193771704354, | |
| "loss": 1.4947, | |
| "mean_token_accuracy": 0.6465561181306839, | |
| "num_tokens": 5366273.0, | |
| "step": 1860 | |
| }, | |
| { | |
| "entropy": 1.5890043556690217, | |
| "epoch": 2.4191461836998704, | |
| "grad_norm": 0.5530393123626709, | |
| "learning_rate": 0.00016359861579031884, | |
| "loss": 1.5522, | |
| "mean_token_accuracy": 0.6297082543373108, | |
| "num_tokens": 5395726.0, | |
| "step": 1870 | |
| }, | |
| { | |
| "entropy": 2.038092666864395, | |
| "epoch": 2.4320827943078913, | |
| "grad_norm": 1.0535674095153809, | |
| "learning_rate": 0.00016319354911244468, | |
| "loss": 1.9806, | |
| "mean_token_accuracy": 0.5464614436030388, | |
| "num_tokens": 5414798.0, | |
| "step": 1880 | |
| }, | |
| { | |
| "entropy": 2.923152169585228, | |
| "epoch": 2.445019404915912, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00016278674879593582, | |
| "loss": 0.7968, | |
| "mean_token_accuracy": 0.2314663991332054, | |
| "num_tokens": 5417197.0, | |
| "step": 1890 | |
| }, | |
| { | |
| "entropy": 2.655092605948448, | |
| "epoch": 2.4579560155239326, | |
| "grad_norm": 0.3218407928943634, | |
| "learning_rate": 0.00016237822600086716, | |
| "loss": 0.9259, | |
| "mean_token_accuracy": 0.19839748442173005, | |
| "num_tokens": 5470736.0, | |
| "step": 1900 | |
| }, | |
| { | |
| "entropy": 1.4376092582941056, | |
| "epoch": 2.4708926261319535, | |
| "grad_norm": 0.3781118094921112, | |
| "learning_rate": 0.00016196799193456785, | |
| "loss": 1.4415, | |
| "mean_token_accuracy": 0.6578261837363243, | |
| "num_tokens": 5511266.0, | |
| "step": 1910 | |
| }, | |
| { | |
| "entropy": 1.5665327340364457, | |
| "epoch": 2.4838292367399744, | |
| "grad_norm": 0.5386565327644348, | |
| "learning_rate": 0.00016155605785131357, | |
| "loss": 1.5497, | |
| "mean_token_accuracy": 0.6252920791506767, | |
| "num_tokens": 5541123.0, | |
| "step": 1920 | |
| }, | |
| { | |
| "entropy": 1.9834172219038009, | |
| "epoch": 2.496765847347995, | |
| "grad_norm": 0.6560537815093994, | |
| "learning_rate": 0.00016114243505201795, | |
| "loss": 1.9184, | |
| "mean_token_accuracy": 0.555550941824913, | |
| "num_tokens": 5561101.0, | |
| "step": 1930 | |
| }, | |
| { | |
| "entropy": 2.323999685049057, | |
| "epoch": 2.5097024579560157, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001607271348839226, | |
| "loss": 0.9326, | |
| "mean_token_accuracy": 0.2633499436080456, | |
| "num_tokens": 5564120.0, | |
| "step": 1940 | |
| }, | |
| { | |
| "entropy": 1.5099886417388917, | |
| "epoch": 2.522639068564036, | |
| "grad_norm": 0.39876788854599, | |
| "learning_rate": 0.00016031016874028557, | |
| "loss": 0.9269, | |
| "mean_token_accuracy": 0.20084442123770713, | |
| "num_tokens": 5613256.0, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.522639068564036, | |
| "eval_entropy": 1.3481496193034703, | |
| "eval_loss": 1.3939740657806396, | |
| "eval_mean_token_accuracy": 0.44758816895096804, | |
| "eval_num_tokens": 5613256.0, | |
| "eval_runtime": 246.9294, | |
| "eval_samples_per_second": 22.257, | |
| "eval_steps_per_second": 1.393, | |
| "step": 1950 | |
| }, | |
| { | |
| "entropy": 1.4310665398836135, | |
| "epoch": 2.535575679172057, | |
| "grad_norm": 0.39710840582847595, | |
| "learning_rate": 0.00015989154806006904, | |
| "loss": 1.4336, | |
| "mean_token_accuracy": 0.6602939382195473, | |
| "num_tokens": 5653638.0, | |
| "step": 1960 | |
| }, | |
| { | |
| "entropy": 1.5728681892156602, | |
| "epoch": 2.548512289780078, | |
| "grad_norm": 0.5568864941596985, | |
| "learning_rate": 0.00015947128432762536, | |
| "loss": 1.5237, | |
| "mean_token_accuracy": 0.627597238123417, | |
| "num_tokens": 5683333.0, | |
| "step": 1970 | |
| }, | |
| { | |
| "entropy": 1.9994044303894043, | |
| "epoch": 2.5614489003880982, | |
| "grad_norm": 0.6420727968215942, | |
| "learning_rate": 0.00015904938907238206, | |
| "loss": 1.9615, | |
| "mean_token_accuracy": 0.5487420856952667, | |
| "num_tokens": 5702066.0, | |
| "step": 1980 | |
| }, | |
| { | |
| "entropy": 2.452130767703056, | |
| "epoch": 2.574385510996119, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015862587386852541, | |
| "loss": 0.7703, | |
| "mean_token_accuracy": 0.2316281594336033, | |
| "num_tokens": 5704289.0, | |
| "step": 1990 | |
| }, | |
| { | |
| "entropy": 2.385006046295166, | |
| "epoch": 2.5873221216041395, | |
| "grad_norm": 0.3110261857509613, | |
| "learning_rate": 0.0001582007503346832, | |
| "loss": 0.9186, | |
| "mean_token_accuracy": 0.19861687943339348, | |
| "num_tokens": 5760847.0, | |
| "step": 2000 | |
| }, | |
| { | |
| "entropy": 1.4642044007778168, | |
| "epoch": 2.6002587322121604, | |
| "grad_norm": 0.38485661149024963, | |
| "learning_rate": 0.0001577740301336057, | |
| "loss": 1.4756, | |
| "mean_token_accuracy": 0.6492435604333877, | |
| "num_tokens": 5802455.0, | |
| "step": 2010 | |
| }, | |
| { | |
| "entropy": 1.5432655066251755, | |
| "epoch": 2.6131953428201813, | |
| "grad_norm": 0.6033521294593811, | |
| "learning_rate": 0.00015734572497184577, | |
| "loss": 1.5119, | |
| "mean_token_accuracy": 0.6332074150443077, | |
| "num_tokens": 5831848.0, | |
| "step": 2020 | |
| }, | |
| { | |
| "entropy": 2.0233444392681124, | |
| "epoch": 2.6261319534282017, | |
| "grad_norm": 0.7502851486206055, | |
| "learning_rate": 0.00015691584659943786, | |
| "loss": 1.9476, | |
| "mean_token_accuracy": 0.5473973207175732, | |
| "num_tokens": 5850975.0, | |
| "step": 2030 | |
| }, | |
| { | |
| "entropy": 2.2630896627902986, | |
| "epoch": 2.6390685640362226, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001564844068095755, | |
| "loss": 0.8525, | |
| "mean_token_accuracy": 0.23688365146517754, | |
| "num_tokens": 5853548.0, | |
| "step": 2040 | |
| }, | |
| { | |
| "entropy": 1.6931863486766816, | |
| "epoch": 2.652005174644243, | |
| "grad_norm": 0.3148477077484131, | |
| "learning_rate": 0.0001560514174382878, | |
| "loss": 0.8972, | |
| "mean_token_accuracy": 0.20218148753046988, | |
| "num_tokens": 5907614.0, | |
| "step": 2050 | |
| }, | |
| { | |
| "entropy": 1.4166515529155732, | |
| "epoch": 2.664941785252264, | |
| "grad_norm": 0.38905423879623413, | |
| "learning_rate": 0.0001556168903641148, | |
| "loss": 1.4368, | |
| "mean_token_accuracy": 0.6563202187418937, | |
| "num_tokens": 5947663.0, | |
| "step": 2060 | |
| }, | |
| { | |
| "entropy": 1.5506242126226426, | |
| "epoch": 2.6778783958602848, | |
| "grad_norm": 0.5905367136001587, | |
| "learning_rate": 0.00015518083750778157, | |
| "loss": 1.5309, | |
| "mean_token_accuracy": 0.6258940026164055, | |
| "num_tokens": 5976765.0, | |
| "step": 2070 | |
| }, | |
| { | |
| "entropy": 1.9377893030643463, | |
| "epoch": 2.690815006468305, | |
| "grad_norm": 0.6645969152450562, | |
| "learning_rate": 0.00015474327083187105, | |
| "loss": 1.9022, | |
| "mean_token_accuracy": 0.5610988035798072, | |
| "num_tokens": 5996303.0, | |
| "step": 2080 | |
| }, | |
| { | |
| "entropy": 2.6364343762397766, | |
| "epoch": 2.703751617076326, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015430420234049624, | |
| "loss": 1.038, | |
| "mean_token_accuracy": 0.2556902192533016, | |
| "num_tokens": 5999434.0, | |
| "step": 2090 | |
| }, | |
| { | |
| "entropy": 2.8424737572669985, | |
| "epoch": 2.7166882276843465, | |
| "grad_norm": 0.3264569938182831, | |
| "learning_rate": 0.00015386364407897035, | |
| "loss": 0.9078, | |
| "mean_token_accuracy": 0.20131859928369522, | |
| "num_tokens": 6051774.0, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.7166882276843465, | |
| "eval_entropy": 2.204050070671148, | |
| "eval_loss": 1.3715640306472778, | |
| "eval_mean_token_accuracy": 0.4440248931736447, | |
| "eval_num_tokens": 6051774.0, | |
| "eval_runtime": 244.556, | |
| "eval_samples_per_second": 22.473, | |
| "eval_steps_per_second": 1.407, | |
| "step": 2100 | |
| }, | |
| { | |
| "entropy": 1.4316389322280885, | |
| "epoch": 2.7296248382923674, | |
| "grad_norm": 0.3802427053451538, | |
| "learning_rate": 0.00015342160813347676, | |
| "loss": 1.4553, | |
| "mean_token_accuracy": 0.6519668206572533, | |
| "num_tokens": 6091750.0, | |
| "step": 2110 | |
| }, | |
| { | |
| "entropy": 1.5787472486495973, | |
| "epoch": 2.742561448900388, | |
| "grad_norm": 0.5799654126167297, | |
| "learning_rate": 0.00015297810663073743, | |
| "loss": 1.5507, | |
| "mean_token_accuracy": 0.6268433704972267, | |
| "num_tokens": 6120790.0, | |
| "step": 2120 | |
| }, | |
| { | |
| "entropy": 1.9796525478363036, | |
| "epoch": 2.7554980595084086, | |
| "grad_norm": 0.7903239727020264, | |
| "learning_rate": 0.00015253315173767993, | |
| "loss": 1.9383, | |
| "mean_token_accuracy": 0.5536467991769314, | |
| "num_tokens": 6139010.0, | |
| "step": 2130 | |
| }, | |
| { | |
| "entropy": 2.6805751383304597, | |
| "epoch": 2.7684346701164295, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00015208675566110387, | |
| "loss": 0.7659, | |
| "mean_token_accuracy": 0.21504319161176683, | |
| "num_tokens": 6141159.0, | |
| "step": 2140 | |
| }, | |
| { | |
| "entropy": 2.1302292913198473, | |
| "epoch": 2.78137128072445, | |
| "grad_norm": 0.3743366003036499, | |
| "learning_rate": 0.0001516389306473461, | |
| "loss": 0.8888, | |
| "mean_token_accuracy": 0.20484731644392012, | |
| "num_tokens": 6191053.0, | |
| "step": 2150 | |
| }, | |
| { | |
| "entropy": 1.4483990609645843, | |
| "epoch": 2.794307891332471, | |
| "grad_norm": 0.3969733417034149, | |
| "learning_rate": 0.00015118968898194458, | |
| "loss": 1.443, | |
| "mean_token_accuracy": 0.6526175752282143, | |
| "num_tokens": 6230521.0, | |
| "step": 2160 | |
| }, | |
| { | |
| "entropy": 1.582485669851303, | |
| "epoch": 2.8072445019404917, | |
| "grad_norm": 0.6144042611122131, | |
| "learning_rate": 0.00015073904298930132, | |
| "loss": 1.5429, | |
| "mean_token_accuracy": 0.6261137276887894, | |
| "num_tokens": 6259286.0, | |
| "step": 2170 | |
| }, | |
| { | |
| "entropy": 1.970637395977974, | |
| "epoch": 2.8201811125485126, | |
| "grad_norm": 0.7516705393791199, | |
| "learning_rate": 0.00015028700503234447, | |
| "loss": 1.9348, | |
| "mean_token_accuracy": 0.5558973327279091, | |
| "num_tokens": 6277729.0, | |
| "step": 2180 | |
| }, | |
| { | |
| "entropy": 2.001736190915108, | |
| "epoch": 2.833117723156533, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014983358751218892, | |
| "loss": 0.736, | |
| "mean_token_accuracy": 0.19615912958979606, | |
| "num_tokens": 6279643.0, | |
| "step": 2190 | |
| }, | |
| { | |
| "entropy": 1.9369044452905655, | |
| "epoch": 2.8460543337645534, | |
| "grad_norm": 0.32840585708618164, | |
| "learning_rate": 0.00014937880286779629, | |
| "loss": 0.9147, | |
| "mean_token_accuracy": 0.19959167763590813, | |
| "num_tokens": 6336300.0, | |
| "step": 2200 | |
| }, | |
| { | |
| "entropy": 1.4088002383708953, | |
| "epoch": 2.8589909443725743, | |
| "grad_norm": 0.4119824767112732, | |
| "learning_rate": 0.00014892266357563358, | |
| "loss": 1.4187, | |
| "mean_token_accuracy": 0.6627781435847282, | |
| "num_tokens": 6375995.0, | |
| "step": 2210 | |
| }, | |
| { | |
| "entropy": 1.6024494558572768, | |
| "epoch": 2.871927554980595, | |
| "grad_norm": 0.5892689228057861, | |
| "learning_rate": 0.0001484651821493309, | |
| "loss": 1.5693, | |
| "mean_token_accuracy": 0.6204348549246788, | |
| "num_tokens": 6404526.0, | |
| "step": 2220 | |
| }, | |
| { | |
| "entropy": 2.072836604714394, | |
| "epoch": 2.884864165588616, | |
| "grad_norm": 0.7402485013008118, | |
| "learning_rate": 0.0001480063711393382, | |
| "loss": 2.0136, | |
| "mean_token_accuracy": 0.5476931251585484, | |
| "num_tokens": 6421889.0, | |
| "step": 2230 | |
| }, | |
| { | |
| "entropy": 1.5923803925514222, | |
| "epoch": 2.8978007761966365, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014754624313258102, | |
| "loss": 0.6735, | |
| "mean_token_accuracy": 0.20976952239871025, | |
| "num_tokens": 6423681.0, | |
| "step": 2240 | |
| }, | |
| { | |
| "entropy": 1.2221377216279508, | |
| "epoch": 2.9107373868046573, | |
| "grad_norm": 0.3352583050727844, | |
| "learning_rate": 0.00014708481075211498, | |
| "loss": 0.9037, | |
| "mean_token_accuracy": 0.20100481137633325, | |
| "num_tokens": 6474539.0, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 2.9107373868046573, | |
| "eval_entropy": 1.358256766096104, | |
| "eval_loss": 1.3591663837432861, | |
| "eval_mean_token_accuracy": 0.45166019766136656, | |
| "eval_num_tokens": 6474539.0, | |
| "eval_runtime": 241.3389, | |
| "eval_samples_per_second": 22.773, | |
| "eval_steps_per_second": 1.425, | |
| "step": 2250 | |
| }, | |
| { | |
| "entropy": 1.3933149039745332, | |
| "epoch": 2.9236739974126777, | |
| "grad_norm": 0.4007508456707001, | |
| "learning_rate": 0.00014662208665677966, | |
| "loss": 1.4101, | |
| "mean_token_accuracy": 0.6611413463950158, | |
| "num_tokens": 6514494.0, | |
| "step": 2260 | |
| }, | |
| { | |
| "entropy": 1.5439734548330306, | |
| "epoch": 2.9366106080206986, | |
| "grad_norm": 0.5625568628311157, | |
| "learning_rate": 0.0001461580835408513, | |
| "loss": 1.4993, | |
| "mean_token_accuracy": 0.6339735224843025, | |
| "num_tokens": 6543746.0, | |
| "step": 2270 | |
| }, | |
| { | |
| "entropy": 1.982978528738022, | |
| "epoch": 2.9495472186287195, | |
| "grad_norm": 0.7641308307647705, | |
| "learning_rate": 0.00014569281413369462, | |
| "loss": 1.9328, | |
| "mean_token_accuracy": 0.5539643183350563, | |
| "num_tokens": 6562759.0, | |
| "step": 2280 | |
| }, | |
| { | |
| "entropy": 1.5298347800970078, | |
| "epoch": 2.96248382923674, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014522629119941333, | |
| "loss": 0.766, | |
| "mean_token_accuracy": 0.21878809183835984, | |
| "num_tokens": 6564974.0, | |
| "step": 2290 | |
| }, | |
| { | |
| "entropy": 1.4145286485552788, | |
| "epoch": 2.975420439844761, | |
| "grad_norm": 0.4561901092529297, | |
| "learning_rate": 0.00014475852753650023, | |
| "loss": 0.7577, | |
| "mean_token_accuracy": 0.22906568124890328, | |
| "num_tokens": 6598409.0, | |
| "step": 2300 | |
| }, | |
| { | |
| "entropy": 1.5782025367021562, | |
| "epoch": 2.988357050452781, | |
| "grad_norm": 0.5903820991516113, | |
| "learning_rate": 0.000144289535977486, | |
| "loss": 1.554, | |
| "mean_token_accuracy": 0.6246525257825851, | |
| "num_tokens": 6627531.0, | |
| "step": 2310 | |
| }, | |
| { | |
| "entropy": 1.9433803856372833, | |
| "epoch": 3.001293661060802, | |
| "grad_norm": 0.13881655037403107, | |
| "learning_rate": 0.00014381932938858718, | |
| "loss": 0.9444, | |
| "mean_token_accuracy": 0.22419775873422623, | |
| "num_tokens": 6660338.0, | |
| "step": 2320 | |
| }, | |
| { | |
| "entropy": 1.621496966481209, | |
| "epoch": 3.014230271668823, | |
| "grad_norm": 0.42520761489868164, | |
| "learning_rate": 0.0001433479206693532, | |
| "loss": 1.6127, | |
| "mean_token_accuracy": 0.6233608849346638, | |
| "num_tokens": 6713107.0, | |
| "step": 2330 | |
| }, | |
| { | |
| "entropy": 1.338898405432701, | |
| "epoch": 3.0271668822768434, | |
| "grad_norm": 0.6367995738983154, | |
| "learning_rate": 0.0001428753227523124, | |
| "loss": 1.3191, | |
| "mean_token_accuracy": 0.67000552713871, | |
| "num_tokens": 6744799.0, | |
| "step": 2340 | |
| }, | |
| { | |
| "entropy": 1.590729820728302, | |
| "epoch": 3.0401034928848643, | |
| "grad_norm": 0.6899548172950745, | |
| "learning_rate": 0.0001424015486026174, | |
| "loss": 1.5648, | |
| "mean_token_accuracy": 0.618783813714981, | |
| "num_tokens": 6766726.0, | |
| "step": 2350 | |
| }, | |
| { | |
| "entropy": 1.977810901403427, | |
| "epoch": 3.0530401034928847, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00014192661121768932, | |
| "loss": 1.3483, | |
| "mean_token_accuracy": 0.3756748877465725, | |
| "num_tokens": 6772184.0, | |
| "step": 2360 | |
| }, | |
| { | |
| "entropy": 1.1425089821219445, | |
| "epoch": 3.0659767141009056, | |
| "grad_norm": 0.1791164129972458, | |
| "learning_rate": 0.0001414505236268613, | |
| "loss": 0.2221, | |
| "mean_token_accuracy": 0.05023420602083206, | |
| "num_tokens": 6801985.0, | |
| "step": 2370 | |
| }, | |
| { | |
| "entropy": 1.534485575556755, | |
| "epoch": 3.0789133247089264, | |
| "grad_norm": 0.4513719975948334, | |
| "learning_rate": 0.00014097329889102084, | |
| "loss": 1.6302, | |
| "mean_token_accuracy": 0.6191562682390213, | |
| "num_tokens": 6853863.0, | |
| "step": 2380 | |
| }, | |
| { | |
| "entropy": 1.3535702049732208, | |
| "epoch": 3.091849935316947, | |
| "grad_norm": 0.6277197599411011, | |
| "learning_rate": 0.00014049495010225174, | |
| "loss": 1.2826, | |
| "mean_token_accuracy": 0.6846122413873672, | |
| "num_tokens": 6885860.0, | |
| "step": 2390 | |
| }, | |
| { | |
| "entropy": 1.611542597413063, | |
| "epoch": 3.1047865459249677, | |
| "grad_norm": 0.6629586219787598, | |
| "learning_rate": 0.00014001549038347488, | |
| "loss": 1.5841, | |
| "mean_token_accuracy": 0.6110770747065544, | |
| "num_tokens": 6907549.0, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 3.1047865459249677, | |
| "eval_entropy": 1.4435141939063405, | |
| "eval_loss": 1.3480572700500488, | |
| "eval_mean_token_accuracy": 0.45482284610354623, | |
| "eval_num_tokens": 6907549.0, | |
| "eval_runtime": 243.0256, | |
| "eval_samples_per_second": 22.615, | |
| "eval_steps_per_second": 1.415, | |
| "step": 2400 | |
| }, | |
| { | |
| "entropy": 2.002578613162041, | |
| "epoch": 3.117723156532988, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013953493288808804, | |
| "loss": 1.2204, | |
| "mean_token_accuracy": 0.3793766848742962, | |
| "num_tokens": 6912238.0, | |
| "step": 2410 | |
| }, | |
| { | |
| "entropy": 1.580290713906288, | |
| "epoch": 3.130659767141009, | |
| "grad_norm": 0.17965653538703918, | |
| "learning_rate": 0.00013905329079960522, | |
| "loss": 0.2405, | |
| "mean_token_accuracy": 0.04845013022422791, | |
| "num_tokens": 6941537.0, | |
| "step": 2420 | |
| }, | |
| { | |
| "entropy": 1.4815610826015473, | |
| "epoch": 3.14359637774903, | |
| "grad_norm": 0.46858540177345276, | |
| "learning_rate": 0.00013857057733129494, | |
| "loss": 1.5548, | |
| "mean_token_accuracy": 0.6307360790669918, | |
| "num_tokens": 6994352.0, | |
| "step": 2430 | |
| }, | |
| { | |
| "entropy": 1.3407190799713136, | |
| "epoch": 3.1565329883570503, | |
| "grad_norm": 0.6128517389297485, | |
| "learning_rate": 0.00013808680572581776, | |
| "loss": 1.2793, | |
| "mean_token_accuracy": 0.6835518077015876, | |
| "num_tokens": 7026544.0, | |
| "step": 2440 | |
| }, | |
| { | |
| "entropy": 1.6429592788219451, | |
| "epoch": 3.169469598965071, | |
| "grad_norm": 0.7309837937355042, | |
| "learning_rate": 0.0001376019892548629, | |
| "loss": 1.6028, | |
| "mean_token_accuracy": 0.6109883636236191, | |
| "num_tokens": 7049229.0, | |
| "step": 2450 | |
| }, | |
| { | |
| "entropy": 2.1930068999528887, | |
| "epoch": 3.1824062095730916, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013711614121878423, | |
| "loss": 1.3452, | |
| "mean_token_accuracy": 0.4032416954636574, | |
| "num_tokens": 7055638.0, | |
| "step": 2460 | |
| }, | |
| { | |
| "entropy": 2.582664442062378, | |
| "epoch": 3.1953428201811125, | |
| "grad_norm": 0.17951107025146484, | |
| "learning_rate": 0.00013662927494623528, | |
| "loss": 0.238, | |
| "mean_token_accuracy": 0.0486849807202816, | |
| "num_tokens": 7079933.0, | |
| "step": 2470 | |
| }, | |
| { | |
| "entropy": 1.4514012217521668, | |
| "epoch": 3.2082794307891334, | |
| "grad_norm": 0.48690128326416016, | |
| "learning_rate": 0.00013614140379380384, | |
| "loss": 1.5635, | |
| "mean_token_accuracy": 0.6299719527363777, | |
| "num_tokens": 7130984.0, | |
| "step": 2480 | |
| }, | |
| { | |
| "entropy": 1.3963081300258637, | |
| "epoch": 3.221216041397154, | |
| "grad_norm": 0.5850987434387207, | |
| "learning_rate": 0.00013565254114564522, | |
| "loss": 1.3093, | |
| "mean_token_accuracy": 0.6751079827547073, | |
| "num_tokens": 7162961.0, | |
| "step": 2490 | |
| }, | |
| { | |
| "entropy": 1.6287110567092895, | |
| "epoch": 3.2341526520051747, | |
| "grad_norm": 0.7363412976264954, | |
| "learning_rate": 0.00013516270041311523, | |
| "loss": 1.6109, | |
| "mean_token_accuracy": 0.6086324542760849, | |
| "num_tokens": 7185148.0, | |
| "step": 2500 | |
| }, | |
| { | |
| "entropy": 2.588909697532654, | |
| "epoch": 3.2470892626131955, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.0001346718950344023, | |
| "loss": 1.3295, | |
| "mean_token_accuracy": 0.36438525542616845, | |
| "num_tokens": 7190578.0, | |
| "step": 2510 | |
| }, | |
| { | |
| "entropy": 2.170939177274704, | |
| "epoch": 3.260025873221216, | |
| "grad_norm": 0.16089969873428345, | |
| "learning_rate": 0.00013418013847415875, | |
| "loss": 0.2333, | |
| "mean_token_accuracy": 0.04912624955177307, | |
| "num_tokens": 7223083.0, | |
| "step": 2520 | |
| }, | |
| { | |
| "entropy": 1.5124918982386588, | |
| "epoch": 3.272962483829237, | |
| "grad_norm": 0.48449796438217163, | |
| "learning_rate": 0.00013368744422313135, | |
| "loss": 1.5844, | |
| "mean_token_accuracy": 0.6292549699544907, | |
| "num_tokens": 7278262.0, | |
| "step": 2530 | |
| }, | |
| { | |
| "entropy": 1.3001452058553695, | |
| "epoch": 3.2858990944372573, | |
| "grad_norm": 0.6388899087905884, | |
| "learning_rate": 0.00013319382579779143, | |
| "loss": 1.2473, | |
| "mean_token_accuracy": 0.686492520570755, | |
| "num_tokens": 7310633.0, | |
| "step": 2540 | |
| }, | |
| { | |
| "entropy": 1.588513082265854, | |
| "epoch": 3.298835705045278, | |
| "grad_norm": 0.7601234316825867, | |
| "learning_rate": 0.00013269929673996372, | |
| "loss": 1.5813, | |
| "mean_token_accuracy": 0.6151460394263267, | |
| "num_tokens": 7333877.0, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 3.298835705045278, | |
| "eval_entropy": 1.50408104668523, | |
| "eval_loss": 1.3354183435440063, | |
| "eval_mean_token_accuracy": 0.4569617995862351, | |
| "eval_num_tokens": 7333877.0, | |
| "eval_runtime": 242.7951, | |
| "eval_samples_per_second": 22.636, | |
| "eval_steps_per_second": 1.417, | |
| "step": 2550 | |
| }, | |
| { | |
| "entropy": 1.8434918358922006, | |
| "epoch": 3.311772315653299, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00013220387061645518, | |
| "loss": 1.2378, | |
| "mean_token_accuracy": 0.3966076374053955, | |
| "num_tokens": 7340126.0, | |
| "step": 2560 | |
| }, | |
| { | |
| "entropy": 2.0701662808656693, | |
| "epoch": 3.3247089262613194, | |
| "grad_norm": 0.1653972566127777, | |
| "learning_rate": 0.00013170756101868274, | |
| "loss": 0.2363, | |
| "mean_token_accuracy": 0.04905220568180084, | |
| "num_tokens": 7368440.0, | |
| "step": 2570 | |
| }, | |
| { | |
| "entropy": 1.521276581287384, | |
| "epoch": 3.3376455368693403, | |
| "grad_norm": 0.5110422372817993, | |
| "learning_rate": 0.00013121038156230021, | |
| "loss": 1.6069, | |
| "mean_token_accuracy": 0.6247900031507015, | |
| "num_tokens": 7422449.0, | |
| "step": 2580 | |
| }, | |
| { | |
| "entropy": 1.3473992764949798, | |
| "epoch": 3.350582147477361, | |
| "grad_norm": 0.5985650420188904, | |
| "learning_rate": 0.00013071234588682507, | |
| "loss": 1.2818, | |
| "mean_token_accuracy": 0.6814156129956246, | |
| "num_tokens": 7455078.0, | |
| "step": 2590 | |
| }, | |
| { | |
| "entropy": 1.5794302642345428, | |
| "epoch": 3.3635187580853816, | |
| "grad_norm": 0.7455780506134033, | |
| "learning_rate": 0.00013021346765526405, | |
| "loss": 1.5565, | |
| "mean_token_accuracy": 0.6210769057273865, | |
| "num_tokens": 7478151.0, | |
| "step": 2600 | |
| }, | |
| { | |
| "entropy": 2.400119936466217, | |
| "epoch": 3.3764553686934025, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012971376055373842, | |
| "loss": 1.3398, | |
| "mean_token_accuracy": 0.3794242724776268, | |
| "num_tokens": 7483907.0, | |
| "step": 2610 | |
| }, | |
| { | |
| "entropy": 2.360330358147621, | |
| "epoch": 3.389391979301423, | |
| "grad_norm": 0.16837802529335022, | |
| "learning_rate": 0.0001292132382911085, | |
| "loss": 0.231, | |
| "mean_token_accuracy": 0.04970394000411034, | |
| "num_tokens": 7511728.0, | |
| "step": 2620 | |
| }, | |
| { | |
| "entropy": 1.5115429222583772, | |
| "epoch": 3.4023285899094438, | |
| "grad_norm": 0.5140193700790405, | |
| "learning_rate": 0.00012871191459859754, | |
| "loss": 1.5844, | |
| "mean_token_accuracy": 0.626202804595232, | |
| "num_tokens": 7564367.0, | |
| "step": 2630 | |
| }, | |
| { | |
| "entropy": 1.33871136456728, | |
| "epoch": 3.4152652005174646, | |
| "grad_norm": 0.5856406092643738, | |
| "learning_rate": 0.00012820980322941506, | |
| "loss": 1.2772, | |
| "mean_token_accuracy": 0.6828064471483231, | |
| "num_tokens": 7596458.0, | |
| "step": 2640 | |
| }, | |
| { | |
| "entropy": 1.5606994718313216, | |
| "epoch": 3.428201811125485, | |
| "grad_norm": 0.7913902401924133, | |
| "learning_rate": 0.00012770691795837956, | |
| "loss": 1.5388, | |
| "mean_token_accuracy": 0.6267461031675339, | |
| "num_tokens": 7618937.0, | |
| "step": 2650 | |
| }, | |
| { | |
| "entropy": 2.3131509482860566, | |
| "epoch": 3.441138421733506, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012720327258154059, | |
| "loss": 1.3789, | |
| "mean_token_accuracy": 0.39152705743908883, | |
| "num_tokens": 7624946.0, | |
| "step": 2660 | |
| }, | |
| { | |
| "entropy": 2.270913216471672, | |
| "epoch": 3.4540750323415264, | |
| "grad_norm": 0.1674034297466278, | |
| "learning_rate": 0.00012669888091580033, | |
| "loss": 0.2283, | |
| "mean_token_accuracy": 0.05011768788099289, | |
| "num_tokens": 7655621.0, | |
| "step": 2670 | |
| }, | |
| { | |
| "entropy": 1.5039668411016465, | |
| "epoch": 3.4670116429495472, | |
| "grad_norm": 0.5039061307907104, | |
| "learning_rate": 0.00012619375679853435, | |
| "loss": 1.5889, | |
| "mean_token_accuracy": 0.6255090057849884, | |
| "num_tokens": 7706496.0, | |
| "step": 2680 | |
| }, | |
| { | |
| "entropy": 1.299958510696888, | |
| "epoch": 3.479948253557568, | |
| "grad_norm": 0.6249063611030579, | |
| "learning_rate": 0.0001256879140872123, | |
| "loss": 1.2262, | |
| "mean_token_accuracy": 0.6930169105529785, | |
| "num_tokens": 7738457.0, | |
| "step": 2690 | |
| }, | |
| { | |
| "entropy": 1.5891169756650925, | |
| "epoch": 3.4928848641655885, | |
| "grad_norm": 0.7654421925544739, | |
| "learning_rate": 0.00012518136665901755, | |
| "loss": 1.5485, | |
| "mean_token_accuracy": 0.6236635655164718, | |
| "num_tokens": 7760759.0, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 3.4928848641655885, | |
| "eval_entropy": 1.7460197186054185, | |
| "eval_loss": 1.3263978958129883, | |
| "eval_mean_token_accuracy": 0.45740372557626213, | |
| "eval_num_tokens": 7760759.0, | |
| "eval_runtime": 244.9238, | |
| "eval_samples_per_second": 22.44, | |
| "eval_steps_per_second": 1.405, | |
| "step": 2700 | |
| }, | |
| { | |
| "entropy": 2.4236282050609588, | |
| "epoch": 3.5058214747736094, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012467412841046644, | |
| "loss": 1.3685, | |
| "mean_token_accuracy": 0.38023146614432335, | |
| "num_tokens": 7766609.0, | |
| "step": 2710 | |
| }, | |
| { | |
| "entropy": 2.481502190232277, | |
| "epoch": 3.51875808538163, | |
| "grad_norm": 0.18167299032211304, | |
| "learning_rate": 0.00012416621325702723, | |
| "loss": 0.2353, | |
| "mean_token_accuracy": 0.049381527304649356, | |
| "num_tokens": 7796963.0, | |
| "step": 2720 | |
| }, | |
| { | |
| "entropy": 1.526540043950081, | |
| "epoch": 3.5316946959896507, | |
| "grad_norm": 0.5063906908035278, | |
| "learning_rate": 0.00012365763513273826, | |
| "loss": 1.6301, | |
| "mean_token_accuracy": 0.6226166233420372, | |
| "num_tokens": 7851436.0, | |
| "step": 2730 | |
| }, | |
| { | |
| "entropy": 1.3451905250549316, | |
| "epoch": 3.5446313065976716, | |
| "grad_norm": 0.591876208782196, | |
| "learning_rate": 0.0001231484079898255, | |
| "loss": 1.2804, | |
| "mean_token_accuracy": 0.6807183653116227, | |
| "num_tokens": 7883623.0, | |
| "step": 2740 | |
| }, | |
| { | |
| "entropy": 1.6224838614463806, | |
| "epoch": 3.557567917205692, | |
| "grad_norm": 0.8054526448249817, | |
| "learning_rate": 0.00012263854579832022, | |
| "loss": 1.5855, | |
| "mean_token_accuracy": 0.6138912171125412, | |
| "num_tokens": 7906065.0, | |
| "step": 2750 | |
| }, | |
| { | |
| "entropy": 2.2193833112716677, | |
| "epoch": 3.570504527813713, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00012212806254567526, | |
| "loss": 1.3055, | |
| "mean_token_accuracy": 0.388429357111454, | |
| "num_tokens": 7911950.0, | |
| "step": 2760 | |
| }, | |
| { | |
| "entropy": 1.9380589336156846, | |
| "epoch": 3.5834411384217333, | |
| "grad_norm": 0.15811856091022491, | |
| "learning_rate": 0.00012161697223638162, | |
| "loss": 0.2486, | |
| "mean_token_accuracy": 0.048336771130561826, | |
| "num_tokens": 7944772.0, | |
| "step": 2770 | |
| }, | |
| { | |
| "entropy": 1.5291394203901292, | |
| "epoch": 3.596377749029754, | |
| "grad_norm": 0.5478163361549377, | |
| "learning_rate": 0.00012110528889158421, | |
| "loss": 1.6201, | |
| "mean_token_accuracy": 0.6210859633982182, | |
| "num_tokens": 7998744.0, | |
| "step": 2780 | |
| }, | |
| { | |
| "entropy": 1.3308267042040824, | |
| "epoch": 3.609314359637775, | |
| "grad_norm": 0.6494978070259094, | |
| "learning_rate": 0.00012059302654869707, | |
| "loss": 1.2747, | |
| "mean_token_accuracy": 0.6828291460871696, | |
| "num_tokens": 8030628.0, | |
| "step": 2790 | |
| }, | |
| { | |
| "entropy": 1.6048484414815902, | |
| "epoch": 3.6222509702457955, | |
| "grad_norm": 0.8232805132865906, | |
| "learning_rate": 0.00012008019926101837, | |
| "loss": 1.5858, | |
| "mean_token_accuracy": 0.614265987277031, | |
| "num_tokens": 8052959.0, | |
| "step": 2800 | |
| }, | |
| { | |
| "entropy": 2.457938811182976, | |
| "epoch": 3.6351875808538163, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011956682109734485, | |
| "loss": 1.3734, | |
| "mean_token_accuracy": 0.37425210550427435, | |
| "num_tokens": 8058605.0, | |
| "step": 2810 | |
| }, | |
| { | |
| "entropy": 2.780105286836624, | |
| "epoch": 3.6481241914618368, | |
| "grad_norm": 0.15952081978321075, | |
| "learning_rate": 0.0001190529061415859, | |
| "loss": 0.2238, | |
| "mean_token_accuracy": 0.0499541737139225, | |
| "num_tokens": 8088439.0, | |
| "step": 2820 | |
| }, | |
| { | |
| "entropy": 1.4993865296244622, | |
| "epoch": 3.6610608020698576, | |
| "grad_norm": 0.4854850769042969, | |
| "learning_rate": 0.0001185384684923772, | |
| "loss": 1.5841, | |
| "mean_token_accuracy": 0.6286533363163471, | |
| "num_tokens": 8140599.0, | |
| "step": 2830 | |
| }, | |
| { | |
| "entropy": 1.3472731560468674, | |
| "epoch": 3.6739974126778785, | |
| "grad_norm": 0.6306962966918945, | |
| "learning_rate": 0.00011802352226269375, | |
| "loss": 1.292, | |
| "mean_token_accuracy": 0.6775945991277694, | |
| "num_tokens": 8172688.0, | |
| "step": 2840 | |
| }, | |
| { | |
| "entropy": 1.5441134572029114, | |
| "epoch": 3.6869340232858994, | |
| "grad_norm": 0.8373256325721741, | |
| "learning_rate": 0.00011750808157946291, | |
| "loss": 1.5236, | |
| "mean_token_accuracy": 0.6226452678442002, | |
| "num_tokens": 8195667.0, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 3.6869340232858994, | |
| "eval_entropy": 2.019692697324032, | |
| "eval_loss": 1.3088935613632202, | |
| "eval_mean_token_accuracy": 0.45852816875937374, | |
| "eval_num_tokens": 8195667.0, | |
| "eval_runtime": 247.8075, | |
| "eval_samples_per_second": 22.179, | |
| "eval_steps_per_second": 1.388, | |
| "step": 2850 | |
| }, | |
| { | |
| "entropy": 2.331311251223087, | |
| "epoch": 3.69987063389392, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011699216058317686, | |
| "loss": 1.4345, | |
| "mean_token_accuracy": 0.42385049238801004, | |
| "num_tokens": 8202061.0, | |
| "step": 2860 | |
| }, | |
| { | |
| "entropy": 1.3996504232287408, | |
| "epoch": 3.71280724450194, | |
| "grad_norm": 0.16637884080410004, | |
| "learning_rate": 0.00011647577342750447, | |
| "loss": 0.232, | |
| "mean_token_accuracy": 0.05035848617553711, | |
| "num_tokens": 8229320.0, | |
| "step": 2870 | |
| }, | |
| { | |
| "entropy": 1.5440905675292016, | |
| "epoch": 3.725743855109961, | |
| "grad_norm": 0.5046349763870239, | |
| "learning_rate": 0.00011595893427890316, | |
| "loss": 1.6135, | |
| "mean_token_accuracy": 0.6227852456271649, | |
| "num_tokens": 8282159.0, | |
| "step": 2880 | |
| }, | |
| { | |
| "entropy": 1.313097244501114, | |
| "epoch": 3.738680465717982, | |
| "grad_norm": 0.6280332803726196, | |
| "learning_rate": 0.00011544165731623029, | |
| "loss": 1.283, | |
| "mean_token_accuracy": 0.6847794458270073, | |
| "num_tokens": 8314583.0, | |
| "step": 2890 | |
| }, | |
| { | |
| "entropy": 1.5734279870986938, | |
| "epoch": 3.751617076326003, | |
| "grad_norm": 0.8147013187408447, | |
| "learning_rate": 0.00011492395673035401, | |
| "loss": 1.5372, | |
| "mean_token_accuracy": 0.6240187495946884, | |
| "num_tokens": 8337156.0, | |
| "step": 2900 | |
| }, | |
| { | |
| "entropy": 1.903187246620655, | |
| "epoch": 3.7645536869340233, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011440584672376418, | |
| "loss": 1.3835, | |
| "mean_token_accuracy": 0.3674991957843304, | |
| "num_tokens": 8343309.0, | |
| "step": 2910 | |
| }, | |
| { | |
| "entropy": 1.1613501474261283, | |
| "epoch": 3.777490297542044, | |
| "grad_norm": 0.16990479826927185, | |
| "learning_rate": 0.00011388734151018252, | |
| "loss": 0.2192, | |
| "mean_token_accuracy": 0.050329743325710295, | |
| "num_tokens": 8374198.0, | |
| "step": 2920 | |
| }, | |
| { | |
| "entropy": 1.5224060222506524, | |
| "epoch": 3.7904269081500646, | |
| "grad_norm": 0.5338153839111328, | |
| "learning_rate": 0.00011336845531417286, | |
| "loss": 1.6167, | |
| "mean_token_accuracy": 0.6217537559568882, | |
| "num_tokens": 8426906.0, | |
| "step": 2930 | |
| }, | |
| { | |
| "entropy": 1.3422169074416161, | |
| "epoch": 3.8033635187580854, | |
| "grad_norm": 0.6484615802764893, | |
| "learning_rate": 0.00011284920237075076, | |
| "loss": 1.2771, | |
| "mean_token_accuracy": 0.6828199326992035, | |
| "num_tokens": 8458929.0, | |
| "step": 2940 | |
| }, | |
| { | |
| "entropy": 1.5778010010719299, | |
| "epoch": 3.8163001293661063, | |
| "grad_norm": 0.8282558917999268, | |
| "learning_rate": 0.00011232959692499308, | |
| "loss": 1.5224, | |
| "mean_token_accuracy": 0.6264667376875878, | |
| "num_tokens": 8481613.0, | |
| "step": 2950 | |
| }, | |
| { | |
| "entropy": 2.231258991360664, | |
| "epoch": 3.8292367399741267, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00011180965323164719, | |
| "loss": 1.3715, | |
| "mean_token_accuracy": 0.4014947086572647, | |
| "num_tokens": 8487887.0, | |
| "step": 2960 | |
| }, | |
| { | |
| "entropy": 2.2951877444982527, | |
| "epoch": 3.8421733505821476, | |
| "grad_norm": 0.16264809668064117, | |
| "learning_rate": 0.00011128938555473976, | |
| "loss": 0.242, | |
| "mean_token_accuracy": 0.04751046672463417, | |
| "num_tokens": 8522204.0, | |
| "step": 2970 | |
| }, | |
| { | |
| "entropy": 1.505036623775959, | |
| "epoch": 3.855109961190168, | |
| "grad_norm": 0.5537543892860413, | |
| "learning_rate": 0.00011076880816718569, | |
| "loss": 1.5994, | |
| "mean_token_accuracy": 0.6235061697661877, | |
| "num_tokens": 8576399.0, | |
| "step": 2980 | |
| }, | |
| { | |
| "entropy": 1.306050930917263, | |
| "epoch": 3.868046571798189, | |
| "grad_norm": 0.6618802547454834, | |
| "learning_rate": 0.00011024793535039634, | |
| "loss": 1.2665, | |
| "mean_token_accuracy": 0.6823444902896881, | |
| "num_tokens": 8607791.0, | |
| "step": 2990 | |
| }, | |
| { | |
| "entropy": 1.5978755921125412, | |
| "epoch": 3.8809831824062098, | |
| "grad_norm": 0.756771445274353, | |
| "learning_rate": 0.00010972678139388784, | |
| "loss": 1.5231, | |
| "mean_token_accuracy": 0.6199123159050941, | |
| "num_tokens": 8629942.0, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.8809831824062098, | |
| "eval_entropy": 1.7341382033949675, | |
| "eval_loss": 1.2953605651855469, | |
| "eval_mean_token_accuracy": 0.4613482361269552, | |
| "eval_num_tokens": 8629942.0, | |
| "eval_runtime": 243.363, | |
| "eval_samples_per_second": 22.584, | |
| "eval_steps_per_second": 1.414, | |
| "step": 3000 | |
| }, | |
| { | |
| "entropy": 1.9563438802957536, | |
| "epoch": 3.89391979301423, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010920536059488904, | |
| "loss": 1.2245, | |
| "mean_token_accuracy": 0.35897522792220116, | |
| "num_tokens": 8635069.0, | |
| "step": 3010 | |
| }, | |
| { | |
| "entropy": 0.9117880932986736, | |
| "epoch": 3.906856403622251, | |
| "grad_norm": 0.16995865106582642, | |
| "learning_rate": 0.00010868368725794928, | |
| "loss": 0.2219, | |
| "mean_token_accuracy": 0.050884007662534717, | |
| "num_tokens": 8661156.0, | |
| "step": 3020 | |
| }, | |
| { | |
| "entropy": 1.5383384585380555, | |
| "epoch": 3.9197930142302715, | |
| "grad_norm": 0.5345892310142517, | |
| "learning_rate": 0.000108161775694546, | |
| "loss": 1.6123, | |
| "mean_token_accuracy": 0.6229903392493725, | |
| "num_tokens": 8713506.0, | |
| "step": 3030 | |
| }, | |
| { | |
| "entropy": 1.2795201033353805, | |
| "epoch": 3.9327296248382924, | |
| "grad_norm": 0.682775616645813, | |
| "learning_rate": 0.00010763964022269213, | |
| "loss": 1.2389, | |
| "mean_token_accuracy": 0.6921025589108467, | |
| "num_tokens": 8745762.0, | |
| "step": 3040 | |
| }, | |
| { | |
| "entropy": 1.585690438747406, | |
| "epoch": 3.9456662354463132, | |
| "grad_norm": 0.7901929616928101, | |
| "learning_rate": 0.00010711729516654311, | |
| "loss": 1.5575, | |
| "mean_token_accuracy": 0.6214944392442703, | |
| "num_tokens": 8768560.0, | |
| "step": 3050 | |
| }, | |
| { | |
| "entropy": 2.0845181226730345, | |
| "epoch": 3.9586028460543337, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010659475485600423, | |
| "loss": 1.4895, | |
| "mean_token_accuracy": 0.39826231375336646, | |
| "num_tokens": 8775063.0, | |
| "step": 3060 | |
| }, | |
| { | |
| "entropy": 2.2135625928640366, | |
| "epoch": 3.9715394566623545, | |
| "grad_norm": 0.212826207280159, | |
| "learning_rate": 0.00010607203362633728, | |
| "loss": 0.2226, | |
| "mean_token_accuracy": 0.051099646091461184, | |
| "num_tokens": 8793192.0, | |
| "step": 3070 | |
| }, | |
| { | |
| "entropy": 1.4032258987426758, | |
| "epoch": 3.984476067270375, | |
| "grad_norm": 0.6924927830696106, | |
| "learning_rate": 0.00010554914581776738, | |
| "loss": 1.4474, | |
| "mean_token_accuracy": 0.6517833903431892, | |
| "num_tokens": 8831113.0, | |
| "step": 3080 | |
| }, | |
| { | |
| "entropy": 2.1208325177431107, | |
| "epoch": 3.997412677878396, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010502610577508949, | |
| "loss": 1.1819, | |
| "mean_token_accuracy": 0.38025794699788096, | |
| "num_tokens": 8840822.0, | |
| "step": 3090 | |
| }, | |
| { | |
| "entropy": 1.8567550331354141, | |
| "epoch": 4.010349288486417, | |
| "grad_norm": 0.5068947076797485, | |
| "learning_rate": 0.00010450292784727496, | |
| "loss": 1.3687, | |
| "mean_token_accuracy": 0.48387093394994735, | |
| "num_tokens": 8907582.0, | |
| "step": 3100 | |
| }, | |
| { | |
| "entropy": 1.1507928803563119, | |
| "epoch": 4.023285899094438, | |
| "grad_norm": 0.6847311854362488, | |
| "learning_rate": 0.00010397962638707783, | |
| "loss": 1.129, | |
| "mean_token_accuracy": 0.7149621859192848, | |
| "num_tokens": 8942268.0, | |
| "step": 3110 | |
| }, | |
| { | |
| "entropy": 1.3405901521444321, | |
| "epoch": 4.0362225097024576, | |
| "grad_norm": 0.8465374112129211, | |
| "learning_rate": 0.00010345621575064117, | |
| "loss": 1.3204, | |
| "mean_token_accuracy": 0.6661748513579369, | |
| "num_tokens": 8967621.0, | |
| "step": 3120 | |
| }, | |
| { | |
| "entropy": 1.9997529834508896, | |
| "epoch": 4.049159120310478, | |
| "grad_norm": 1.2902584075927734, | |
| "learning_rate": 0.00010293271029710307, | |
| "loss": 1.7005, | |
| "mean_token_accuracy": 0.5859146490693092, | |
| "num_tokens": 8978783.0, | |
| "step": 3130 | |
| }, | |
| { | |
| "entropy": 2.575493034720421, | |
| "epoch": 4.062095730918499, | |
| "grad_norm": 0.0, | |
| "learning_rate": 0.00010240912438820289, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 8979423.0, | |
| "step": 3140 | |
| }, | |
| { | |
| "entropy": 1.8354697600007057, | |
| "epoch": 4.07503234152652, | |
| "grad_norm": 0.6097379326820374, | |
| "learning_rate": 0.00010188547238788713, | |
| "loss": 1.3617, | |
| "mean_token_accuracy": 0.4855068750679493, | |
| "num_tokens": 9049300.0, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 4.07503234152652, | |
| "eval_entropy": 1.8855025125450866, | |
| "eval_loss": 1.301902413368225, | |
| "eval_mean_token_accuracy": 0.46030220640606656, | |
| "eval_num_tokens": 9049300.0, | |
| "eval_runtime": 243.8279, | |
| "eval_samples_per_second": 22.54, | |
| "eval_steps_per_second": 1.411, | |
| "step": 3150 | |
| }, | |
| { | |
| "entropy": 1.140310089290142, | |
| "epoch": 4.087968952134541, | |
| "grad_norm": 0.6553735136985779, | |
| "learning_rate": 0.00010136176866191548, | |
| "loss": 1.109, | |
| "mean_token_accuracy": 0.7216179341077804, | |
| "num_tokens": 9083874.0, | |
| "step": 3160 | |
| }, | |
| { | |
| "entropy": 1.3620821744203568, | |
| "epoch": 4.100905562742561, | |
| "grad_norm": 0.9848551154136658, | |
| "learning_rate": 0.00010083802757746668, | |
| "loss": 1.2997, | |
| "mean_token_accuracy": 0.6707961618900299, | |
| "num_tokens": 9108826.0, | |
| "step": 3170 | |
| }, | |
| { | |
| "entropy": 2.078350791335106, | |
| "epoch": 4.113842173350582, | |
| "grad_norm": 0.9935686588287354, | |
| "learning_rate": 0.0001003142635027442, | |
| "loss": 1.6088, | |
| "mean_token_accuracy": 0.5507442288100719, | |
| "num_tokens": 9118696.0, | |
| "step": 3180 | |
| }, | |
| { | |
| "entropy": 1.528096930682659, | |
| "epoch": 4.126778783958603, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.979049080658242e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 9119336.0, | |
| "step": 3190 | |
| }, | |
| { | |
| "entropy": 1.4985127076506615, | |
| "epoch": 4.139715394566624, | |
| "grad_norm": 0.6286259889602661, | |
| "learning_rate": 9.926672385805207e-05, | |
| "loss": 1.4428, | |
| "mean_token_accuracy": 0.46830192804336546, | |
| "num_tokens": 9198456.0, | |
| "step": 3200 | |
| }, | |
| { | |
| "entropy": 1.1341844990849494, | |
| "epoch": 4.1526520051746445, | |
| "grad_norm": 0.6682960391044617, | |
| "learning_rate": 9.874297702606636e-05, | |
| "loss": 1.1144, | |
| "mean_token_accuracy": 0.7213881194591523, | |
| "num_tokens": 9234104.0, | |
| "step": 3210 | |
| }, | |
| { | |
| "entropy": 1.3693108260631561, | |
| "epoch": 4.165588615782665, | |
| "grad_norm": 0.8303619027137756, | |
| "learning_rate": 9.821926467898653e-05, | |
| "loss": 1.3216, | |
| "mean_token_accuracy": 0.6689239561557769, | |
| "num_tokens": 9259921.0, | |
| "step": 3220 | |
| }, | |
| { | |
| "entropy": 1.9042235404253005, | |
| "epoch": 4.178525226390685, | |
| "grad_norm": 1.645528793334961, | |
| "learning_rate": 9.769560118422773e-05, | |
| "loss": 1.7769, | |
| "mean_token_accuracy": 0.5957130216062069, | |
| "num_tokens": 9272479.0, | |
| "step": 3230 | |
| }, | |
| { | |
| "entropy": 0.9734129890799522, | |
| "epoch": 4.191461836998706, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.717200090786501e-05, | |
| "loss": 0.0492, | |
| "mean_token_accuracy": 0.03619047701358795, | |
| "num_tokens": 9273156.0, | |
| "step": 3240 | |
| }, | |
| { | |
| "entropy": 1.5239285960793496, | |
| "epoch": 4.204398447606727, | |
| "grad_norm": 0.6020880937576294, | |
| "learning_rate": 9.664847821423907e-05, | |
| "loss": 1.4046, | |
| "mean_token_accuracy": 0.47501309886574744, | |
| "num_tokens": 9347748.0, | |
| "step": 3250 | |
| }, | |
| { | |
| "entropy": 1.103029479086399, | |
| "epoch": 4.217335058214748, | |
| "grad_norm": 0.6547256708145142, | |
| "learning_rate": 9.612504746556215e-05, | |
| "loss": 1.0853, | |
| "mean_token_accuracy": 0.722417363524437, | |
| "num_tokens": 9382776.0, | |
| "step": 3260 | |
| }, | |
| { | |
| "entropy": 1.371236687898636, | |
| "epoch": 4.230271668822769, | |
| "grad_norm": 0.910345733165741, | |
| "learning_rate": 9.560172302152414e-05, | |
| "loss": 1.3338, | |
| "mean_token_accuracy": 0.6663747102022171, | |
| "num_tokens": 9408048.0, | |
| "step": 3270 | |
| }, | |
| { | |
| "entropy": 1.8871563643217086, | |
| "epoch": 4.243208279430789, | |
| "grad_norm": 1.3442589044570923, | |
| "learning_rate": 9.507851923889868e-05, | |
| "loss": 1.6856, | |
| "mean_token_accuracy": 0.5958636343479157, | |
| "num_tokens": 9419207.0, | |
| "step": 3280 | |
| }, | |
| { | |
| "entropy": 2.1561751127243043, | |
| "epoch": 4.25614489003881, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.455545047114901e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 9419847.0, | |
| "step": 3290 | |
| }, | |
| { | |
| "entropy": 1.766649141907692, | |
| "epoch": 4.269081500646831, | |
| "grad_norm": 0.6345491409301758, | |
| "learning_rate": 9.40325310680346e-05, | |
| "loss": 1.3764, | |
| "mean_token_accuracy": 0.48196633756160734, | |
| "num_tokens": 9491348.0, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 4.269081500646831, | |
| "eval_entropy": 1.759770261860171, | |
| "eval_loss": 1.3021514415740967, | |
| "eval_mean_token_accuracy": 0.4654658474894457, | |
| "eval_num_tokens": 9491348.0, | |
| "eval_runtime": 243.8603, | |
| "eval_samples_per_second": 22.537, | |
| "eval_steps_per_second": 1.411, | |
| "step": 3300 | |
| }, | |
| { | |
| "entropy": 1.0932901889085769, | |
| "epoch": 4.282018111254851, | |
| "grad_norm": 0.6778357028961182, | |
| "learning_rate": 9.350977537521717e-05, | |
| "loss": 1.0699, | |
| "mean_token_accuracy": 0.7278983518481255, | |
| "num_tokens": 9526419.0, | |
| "step": 3310 | |
| }, | |
| { | |
| "entropy": 1.3789748430252076, | |
| "epoch": 4.294954721862872, | |
| "grad_norm": 0.8899635672569275, | |
| "learning_rate": 9.298719773386724e-05, | |
| "loss": 1.3351, | |
| "mean_token_accuracy": 0.6661961570382118, | |
| "num_tokens": 9551892.0, | |
| "step": 3320 | |
| }, | |
| { | |
| "entropy": 1.957590714097023, | |
| "epoch": 4.307891332470892, | |
| "grad_norm": 1.470860481262207, | |
| "learning_rate": 9.246481248027077e-05, | |
| "loss": 1.7173, | |
| "mean_token_accuracy": 0.5974891498684883, | |
| "num_tokens": 9563515.0, | |
| "step": 3330 | |
| }, | |
| { | |
| "entropy": 2.714459627866745, | |
| "epoch": 4.320827943078913, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.194263394543575e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 9564155.0, | |
| "step": 3340 | |
| }, | |
| { | |
| "entropy": 1.8973265826702117, | |
| "epoch": 4.333764553686934, | |
| "grad_norm": 0.6255518198013306, | |
| "learning_rate": 9.14206764546991e-05, | |
| "loss": 1.4331, | |
| "mean_token_accuracy": 0.47237296029925346, | |
| "num_tokens": 9638156.0, | |
| "step": 3350 | |
| }, | |
| { | |
| "entropy": 1.113997830450535, | |
| "epoch": 4.346701164294955, | |
| "grad_norm": 0.6197985410690308, | |
| "learning_rate": 9.089895432733364e-05, | |
| "loss": 1.1138, | |
| "mean_token_accuracy": 0.7213677644729615, | |
| "num_tokens": 9674105.0, | |
| "step": 3360 | |
| }, | |
| { | |
| "entropy": 1.355890506505966, | |
| "epoch": 4.359637774902976, | |
| "grad_norm": 0.8531930446624756, | |
| "learning_rate": 9.037748187615538e-05, | |
| "loss": 1.3064, | |
| "mean_token_accuracy": 0.6726941719651223, | |
| "num_tokens": 9700126.0, | |
| "step": 3370 | |
| }, | |
| { | |
| "entropy": 1.9791965007781982, | |
| "epoch": 4.372574385510996, | |
| "grad_norm": 1.7110706567764282, | |
| "learning_rate": 8.985627340713061e-05, | |
| "loss": 1.6769, | |
| "mean_token_accuracy": 0.5642684459686279, | |
| "num_tokens": 9711816.0, | |
| "step": 3380 | |
| }, | |
| { | |
| "entropy": 3.160873770713806, | |
| "epoch": 4.385510996119017, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.933534321898367e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 9712456.0, | |
| "step": 3390 | |
| }, | |
| { | |
| "entropy": 2.013157232105732, | |
| "epoch": 4.3984476067270375, | |
| "grad_norm": 0.6276950240135193, | |
| "learning_rate": 8.881470560280465e-05, | |
| "loss": 1.4395, | |
| "mean_token_accuracy": 0.4699708536267281, | |
| "num_tokens": 9789047.0, | |
| "step": 3400 | |
| }, | |
| { | |
| "entropy": 1.0582531332969665, | |
| "epoch": 4.411384217335058, | |
| "grad_norm": 0.6762167811393738, | |
| "learning_rate": 8.829437484165718e-05, | |
| "loss": 1.0539, | |
| "mean_token_accuracy": 0.7299133688211441, | |
| "num_tokens": 9824536.0, | |
| "step": 3410 | |
| }, | |
| { | |
| "entropy": 1.3210778176784514, | |
| "epoch": 4.424320827943079, | |
| "grad_norm": 0.8756985664367676, | |
| "learning_rate": 8.777436521018676e-05, | |
| "loss": 1.2846, | |
| "mean_token_accuracy": 0.6797921255230903, | |
| "num_tokens": 9850555.0, | |
| "step": 3420 | |
| }, | |
| { | |
| "entropy": 1.8927232474088669, | |
| "epoch": 4.437257438551099, | |
| "grad_norm": 1.5375664234161377, | |
| "learning_rate": 8.725469097422912e-05, | |
| "loss": 1.7705, | |
| "mean_token_accuracy": 0.5886133186519146, | |
| "num_tokens": 9863603.0, | |
| "step": 3430 | |
| }, | |
| { | |
| "entropy": 2.54144030213356, | |
| "epoch": 4.45019404915912, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.673536639041864e-05, | |
| "loss": 0.0476, | |
| "mean_token_accuracy": 0.04354838728904724, | |
| "num_tokens": 9864278.0, | |
| "step": 3440 | |
| }, | |
| { | |
| "entropy": 1.6926740244030953, | |
| "epoch": 4.463130659767141, | |
| "grad_norm": 0.639385461807251, | |
| "learning_rate": 8.621640570579764e-05, | |
| "loss": 1.2832, | |
| "mean_token_accuracy": 0.502137529104948, | |
| "num_tokens": 9929876.0, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 4.463130659767141, | |
| "eval_entropy": 1.6399936731471572, | |
| "eval_loss": 1.2823114395141602, | |
| "eval_mean_token_accuracy": 0.4697489900471166, | |
| "eval_num_tokens": 9929876.0, | |
| "eval_runtime": 242.6114, | |
| "eval_samples_per_second": 22.654, | |
| "eval_steps_per_second": 1.418, | |
| "step": 3450 | |
| }, | |
| { | |
| "entropy": 1.0890112176537514, | |
| "epoch": 4.476067270375162, | |
| "grad_norm": 0.6899943351745605, | |
| "learning_rate": 8.56978231574252e-05, | |
| "loss": 1.0627, | |
| "mean_token_accuracy": 0.7313546255230904, | |
| "num_tokens": 9964211.0, | |
| "step": 3460 | |
| }, | |
| { | |
| "entropy": 1.3737705022096633, | |
| "epoch": 4.489003880983183, | |
| "grad_norm": 0.9175981879234314, | |
| "learning_rate": 8.517963297198672e-05, | |
| "loss": 1.3508, | |
| "mean_token_accuracy": 0.6623948410153389, | |
| "num_tokens": 9989036.0, | |
| "step": 3470 | |
| }, | |
| { | |
| "entropy": 1.8537749290466308, | |
| "epoch": 4.501940491591203, | |
| "grad_norm": 1.1406779289245605, | |
| "learning_rate": 8.466184936540351e-05, | |
| "loss": 1.6469, | |
| "mean_token_accuracy": 0.590015722811222, | |
| "num_tokens": 9999994.0, | |
| "step": 3480 | |
| }, | |
| { | |
| "entropy": 1.9705951809883118, | |
| "epoch": 4.514877102199224, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.414448654244297e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 10000634.0, | |
| "step": 3490 | |
| }, | |
| { | |
| "entropy": 1.7736740306019783, | |
| "epoch": 4.527813712807244, | |
| "grad_norm": 0.5741596817970276, | |
| "learning_rate": 8.362755869632883e-05, | |
| "loss": 1.418, | |
| "mean_token_accuracy": 0.4736007325351238, | |
| "num_tokens": 10069782.0, | |
| "step": 3500 | |
| }, | |
| { | |
| "entropy": 1.1099611327052117, | |
| "epoch": 4.540750323415265, | |
| "grad_norm": 0.6997600793838501, | |
| "learning_rate": 8.311108000835167e-05, | |
| "loss": 1.1002, | |
| "mean_token_accuracy": 0.7222409531474113, | |
| "num_tokens": 10105051.0, | |
| "step": 3510 | |
| }, | |
| { | |
| "entropy": 1.3370502710342407, | |
| "epoch": 4.553686934023286, | |
| "grad_norm": 0.9216951131820679, | |
| "learning_rate": 8.259506464747999e-05, | |
| "loss": 1.2856, | |
| "mean_token_accuracy": 0.6742190420627594, | |
| "num_tokens": 10129844.0, | |
| "step": 3520 | |
| }, | |
| { | |
| "entropy": 2.0127808332443236, | |
| "epoch": 4.566623544631307, | |
| "grad_norm": 1.644737958908081, | |
| "learning_rate": 8.207952676997153e-05, | |
| "loss": 1.7374, | |
| "mean_token_accuracy": 0.5706604786217213, | |
| "num_tokens": 10140891.0, | |
| "step": 3530 | |
| }, | |
| { | |
| "entropy": 2.3392362356185914, | |
| "epoch": 4.579560155239327, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.156448051898476e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 10141531.0, | |
| "step": 3540 | |
| }, | |
| { | |
| "entropy": 1.7776501581072808, | |
| "epoch": 4.592496765847348, | |
| "grad_norm": 0.6358464956283569, | |
| "learning_rate": 8.1049940024191e-05, | |
| "loss": 1.4156, | |
| "mean_token_accuracy": 0.47597954645752905, | |
| "num_tokens": 10208071.0, | |
| "step": 3550 | |
| }, | |
| { | |
| "entropy": 1.103192213177681, | |
| "epoch": 4.605433376455369, | |
| "grad_norm": 0.6968359351158142, | |
| "learning_rate": 8.053591940138686e-05, | |
| "loss": 1.096, | |
| "mean_token_accuracy": 0.7267577677965165, | |
| "num_tokens": 10242851.0, | |
| "step": 3560 | |
| }, | |
| { | |
| "entropy": 1.3612541019916535, | |
| "epoch": 4.61836998706339, | |
| "grad_norm": 0.9655300974845886, | |
| "learning_rate": 8.002243275210669e-05, | |
| "loss": 1.3057, | |
| "mean_token_accuracy": 0.672816789150238, | |
| "num_tokens": 10268178.0, | |
| "step": 3570 | |
| }, | |
| { | |
| "entropy": 1.932911714911461, | |
| "epoch": 4.63130659767141, | |
| "grad_norm": 1.2096027135849, | |
| "learning_rate": 7.950949416323612e-05, | |
| "loss": 1.7086, | |
| "mean_token_accuracy": 0.612860233336687, | |
| "num_tokens": 10279495.0, | |
| "step": 3580 | |
| }, | |
| { | |
| "entropy": 1.9618256837129593, | |
| "epoch": 4.6442432082794305, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.899711770662532e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 10280135.0, | |
| "step": 3590 | |
| }, | |
| { | |
| "entropy": 1.6968649536371232, | |
| "epoch": 4.657179818887451, | |
| "grad_norm": 0.6373590230941772, | |
| "learning_rate": 7.848531743870297e-05, | |
| "loss": 1.3993, | |
| "mean_token_accuracy": 0.4759638875722885, | |
| "num_tokens": 10346462.0, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 4.657179818887451, | |
| "eval_entropy": 1.577659371980401, | |
| "eval_loss": 1.273931622505188, | |
| "eval_mean_token_accuracy": 0.4723818853150967, | |
| "eval_num_tokens": 10346462.0, | |
| "eval_runtime": 245.6574, | |
| "eval_samples_per_second": 22.373, | |
| "eval_steps_per_second": 1.4, | |
| "step": 3600 | |
| }, | |
| { | |
| "entropy": 1.093438169360161, | |
| "epoch": 4.670116429495472, | |
| "grad_norm": 0.7240473628044128, | |
| "learning_rate": 7.797410740009084e-05, | |
| "loss": 1.0745, | |
| "mean_token_accuracy": 0.7309321075677871, | |
| "num_tokens": 10381489.0, | |
| "step": 3610 | |
| }, | |
| { | |
| "entropy": 1.37732635140419, | |
| "epoch": 4.683053040103493, | |
| "grad_norm": 0.9580934047698975, | |
| "learning_rate": 7.746350161521845e-05, | |
| "loss": 1.336, | |
| "mean_token_accuracy": 0.6637881115078926, | |
| "num_tokens": 10406795.0, | |
| "step": 3620 | |
| }, | |
| { | |
| "entropy": 1.9799594402313232, | |
| "epoch": 4.695989650711514, | |
| "grad_norm": 1.5260565280914307, | |
| "learning_rate": 7.695351409193823e-05, | |
| "loss": 1.7859, | |
| "mean_token_accuracy": 0.5888419583439827, | |
| "num_tokens": 10418685.0, | |
| "step": 3630 | |
| }, | |
| { | |
| "entropy": 1.8445574283599853, | |
| "epoch": 4.708926261319534, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.644415882114145e-05, | |
| "loss": 0.0354, | |
| "mean_token_accuracy": 0.04375, | |
| "num_tokens": 10419355.0, | |
| "step": 3640 | |
| }, | |
| { | |
| "entropy": 1.724594485759735, | |
| "epoch": 4.721862871927555, | |
| "grad_norm": 0.5997304320335388, | |
| "learning_rate": 7.593544977637436e-05, | |
| "loss": 1.4375, | |
| "mean_token_accuracy": 0.4693992160260677, | |
| "num_tokens": 10485312.0, | |
| "step": 3650 | |
| }, | |
| { | |
| "entropy": 1.079079033434391, | |
| "epoch": 4.734799482535576, | |
| "grad_norm": 0.6873499155044556, | |
| "learning_rate": 7.54274009134546e-05, | |
| "loss": 1.0708, | |
| "mean_token_accuracy": 0.7280381500720978, | |
| "num_tokens": 10520582.0, | |
| "step": 3660 | |
| }, | |
| { | |
| "entropy": 1.315394550561905, | |
| "epoch": 4.747736093143597, | |
| "grad_norm": 0.8612226843833923, | |
| "learning_rate": 7.492002617008866e-05, | |
| "loss": 1.2891, | |
| "mean_token_accuracy": 0.6757827803492547, | |
| "num_tokens": 10545966.0, | |
| "step": 3670 | |
| }, | |
| { | |
| "entropy": 1.840933558344841, | |
| "epoch": 4.760672703751617, | |
| "grad_norm": 0.7735125422477722, | |
| "learning_rate": 7.441333946548939e-05, | |
| "loss": 1.575, | |
| "mean_token_accuracy": 0.5655414000153541, | |
| "num_tokens": 10557080.0, | |
| "step": 3680 | |
| }, | |
| { | |
| "entropy": 1.232702499628067, | |
| "epoch": 4.773609314359637, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.390735469999398e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 10557720.0, | |
| "step": 3690 | |
| }, | |
| { | |
| "entropy": 1.5656249672174454, | |
| "epoch": 4.786545924967658, | |
| "grad_norm": 0.6145333051681519, | |
| "learning_rate": 7.340208575468291e-05, | |
| "loss": 1.4603, | |
| "mean_token_accuracy": 0.46657404825091364, | |
| "num_tokens": 10627563.0, | |
| "step": 3700 | |
| }, | |
| { | |
| "entropy": 1.0934513494372369, | |
| "epoch": 4.799482535575679, | |
| "grad_norm": 0.7226387858390808, | |
| "learning_rate": 7.289754649099897e-05, | |
| "loss": 1.0786, | |
| "mean_token_accuracy": 0.7299003899097443, | |
| "num_tokens": 10662880.0, | |
| "step": 3710 | |
| }, | |
| { | |
| "entropy": 1.3585843235254287, | |
| "epoch": 4.8124191461837, | |
| "grad_norm": 0.8521022796630859, | |
| "learning_rate": 7.239375075036697e-05, | |
| "loss": 1.3144, | |
| "mean_token_accuracy": 0.6705298006534577, | |
| "num_tokens": 10688600.0, | |
| "step": 3720 | |
| }, | |
| { | |
| "entropy": 1.8722685337066651, | |
| "epoch": 4.825355756791721, | |
| "grad_norm": 1.371882677078247, | |
| "learning_rate": 7.189071235381406e-05, | |
| "loss": 1.7141, | |
| "mean_token_accuracy": 0.604588358104229, | |
| "num_tokens": 10700334.0, | |
| "step": 3730 | |
| }, | |
| { | |
| "entropy": 1.860415416955948, | |
| "epoch": 4.838292367399741, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.138844510159069e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 10700974.0, | |
| "step": 3740 | |
| }, | |
| { | |
| "entropy": 1.68975418061018, | |
| "epoch": 4.851228978007762, | |
| "grad_norm": 0.6484793424606323, | |
| "learning_rate": 7.088696277279175e-05, | |
| "loss": 1.3382, | |
| "mean_token_accuracy": 0.4887751266360283, | |
| "num_tokens": 10771692.0, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 4.851228978007762, | |
| "eval_entropy": 1.7122522977202437, | |
| "eval_loss": 1.2648330926895142, | |
| "eval_mean_token_accuracy": 0.47577540377198263, | |
| "eval_num_tokens": 10771692.0, | |
| "eval_runtime": 244.9784, | |
| "eval_samples_per_second": 22.435, | |
| "eval_steps_per_second": 1.404, | |
| "step": 3750 | |
| }, | |
| { | |
| "entropy": 1.1040325671434403, | |
| "epoch": 4.864165588615783, | |
| "grad_norm": 0.7224993705749512, | |
| "learning_rate": 7.038627912497873e-05, | |
| "loss": 1.0872, | |
| "mean_token_accuracy": 0.7262751698493958, | |
| "num_tokens": 10806575.0, | |
| "step": 3760 | |
| }, | |
| { | |
| "entropy": 1.3863080263137817, | |
| "epoch": 4.8771021992238035, | |
| "grad_norm": 0.9205716252326965, | |
| "learning_rate": 6.988640789380241e-05, | |
| "loss": 1.3415, | |
| "mean_token_accuracy": 0.6670658677816391, | |
| "num_tokens": 10831607.0, | |
| "step": 3770 | |
| }, | |
| { | |
| "entropy": 1.986344888806343, | |
| "epoch": 4.890038809831824, | |
| "grad_norm": 1.2501696348190308, | |
| "learning_rate": 6.938736279262567e-05, | |
| "loss": 1.5931, | |
| "mean_token_accuracy": 0.5594463273882866, | |
| "num_tokens": 10842477.0, | |
| "step": 3780 | |
| }, | |
| { | |
| "entropy": 2.6916876256465914, | |
| "epoch": 4.902975420439844, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.888915751214774e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 10843117.0, | |
| "step": 3790 | |
| }, | |
| { | |
| "entropy": 1.8490806862711906, | |
| "epoch": 4.915912031047865, | |
| "grad_norm": 0.6139810085296631, | |
| "learning_rate": 6.83918057200283e-05, | |
| "loss": 1.3791, | |
| "mean_token_accuracy": 0.4822954162955284, | |
| "num_tokens": 10917288.0, | |
| "step": 3800 | |
| }, | |
| { | |
| "entropy": 1.063162423670292, | |
| "epoch": 4.928848641655886, | |
| "grad_norm": 0.7340760231018066, | |
| "learning_rate": 6.789532106051246e-05, | |
| "loss": 1.0523, | |
| "mean_token_accuracy": 0.7331129983067513, | |
| "num_tokens": 10952906.0, | |
| "step": 3810 | |
| }, | |
| { | |
| "entropy": 1.348393714427948, | |
| "epoch": 4.941785252263907, | |
| "grad_norm": 0.979292094707489, | |
| "learning_rate": 6.739971715405684e-05, | |
| "loss": 1.3057, | |
| "mean_token_accuracy": 0.6723238781094552, | |
| "num_tokens": 10978606.0, | |
| "step": 3820 | |
| }, | |
| { | |
| "entropy": 1.887803316116333, | |
| "epoch": 4.954721862871928, | |
| "grad_norm": 1.4358190298080444, | |
| "learning_rate": 6.690500759695557e-05, | |
| "loss": 1.6779, | |
| "mean_token_accuracy": 0.6134289026260376, | |
| "num_tokens": 10990333.0, | |
| "step": 3830 | |
| }, | |
| { | |
| "entropy": 2.7988963067531585, | |
| "epoch": 4.967658473479949, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.641120596096729e-05, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 10990973.0, | |
| "step": 3840 | |
| }, | |
| { | |
| "entropy": 1.5671290338039399, | |
| "epoch": 4.980595084087969, | |
| "grad_norm": 0.697485625743866, | |
| "learning_rate": 6.591832579294303e-05, | |
| "loss": 1.0782, | |
| "mean_token_accuracy": 0.5383081540465355, | |
| "num_tokens": 11034414.0, | |
| "step": 3850 | |
| }, | |
| { | |
| "entropy": 1.7319936901330948, | |
| "epoch": 4.99353169469599, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.542638061445447e-05, | |
| "loss": 1.3846, | |
| "mean_token_accuracy": 0.5769012212753296, | |
| "num_tokens": 11050864.0, | |
| "step": 3860 | |
| }, | |
| { | |
| "entropy": 2.4691727608442307, | |
| "epoch": 5.00646830530401, | |
| "grad_norm": 0.49155521392822266, | |
| "learning_rate": 6.493538392142287e-05, | |
| "loss": 1.0145, | |
| "mean_token_accuracy": 0.26990386173129083, | |
| "num_tokens": 11109874.0, | |
| "step": 3870 | |
| }, | |
| { | |
| "entropy": 1.075531531870365, | |
| "epoch": 5.019404915912031, | |
| "grad_norm": 0.7045453190803528, | |
| "learning_rate": 6.444534918374906e-05, | |
| "loss": 1.0364, | |
| "mean_token_accuracy": 0.7393457636237144, | |
| "num_tokens": 11148394.0, | |
| "step": 3880 | |
| }, | |
| { | |
| "entropy": 1.1883759826421738, | |
| "epoch": 5.032341526520052, | |
| "grad_norm": 0.9995729327201843, | |
| "learning_rate": 6.395628984494378e-05, | |
| "loss": 1.2028, | |
| "mean_token_accuracy": 0.6972079753875733, | |
| "num_tokens": 11176092.0, | |
| "step": 3890 | |
| }, | |
| { | |
| "entropy": 1.7173998385667801, | |
| "epoch": 5.045278137128072, | |
| "grad_norm": 1.125909686088562, | |
| "learning_rate": 6.346821932175873e-05, | |
| "loss": 1.5967, | |
| "mean_token_accuracy": 0.6245104640722274, | |
| "num_tokens": 11192285.0, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 5.045278137128072, | |
| "eval_entropy": 1.9103823839578518, | |
| "eval_loss": 1.2630141973495483, | |
| "eval_mean_token_accuracy": 0.4754580475043419, | |
| "eval_num_tokens": 11192285.0, | |
| "eval_runtime": 244.3056, | |
| "eval_samples_per_second": 22.496, | |
| "eval_steps_per_second": 1.408, | |
| "step": 3900 | |
| }, | |
| { | |
| "entropy": 2.6084256947040556, | |
| "epoch": 5.058214747736093, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.298115100381882e-05, | |
| "loss": 0.3442, | |
| "mean_token_accuracy": 0.16731906533241273, | |
| "num_tokens": 11193644.0, | |
| "step": 3910 | |
| }, | |
| { | |
| "entropy": 2.282520645856857, | |
| "epoch": 5.071151358344114, | |
| "grad_norm": 0.5869239568710327, | |
| "learning_rate": 6.249509825325467e-05, | |
| "loss": 0.9511, | |
| "mean_token_accuracy": 0.28290636241436007, | |
| "num_tokens": 11249840.0, | |
| "step": 3920 | |
| }, | |
| { | |
| "entropy": 1.016249306499958, | |
| "epoch": 5.084087968952135, | |
| "grad_norm": 0.7197193503379822, | |
| "learning_rate": 6.201007440433588e-05, | |
| "loss": 1.007, | |
| "mean_token_accuracy": 0.7442266702651977, | |
| "num_tokens": 11287639.0, | |
| "step": 3930 | |
| }, | |
| { | |
| "entropy": 1.2221685394644737, | |
| "epoch": 5.097024579560156, | |
| "grad_norm": 0.9447595477104187, | |
| "learning_rate": 6.152609276310549e-05, | |
| "loss": 1.187, | |
| "mean_token_accuracy": 0.7011413544416427, | |
| "num_tokens": 11315215.0, | |
| "step": 3940 | |
| }, | |
| { | |
| "entropy": 1.6715268433094024, | |
| "epoch": 5.109961190168176, | |
| "grad_norm": 1.0949913263320923, | |
| "learning_rate": 6.104316660701485e-05, | |
| "loss": 1.5623, | |
| "mean_token_accuracy": 0.6256066203117371, | |
| "num_tokens": 11332567.0, | |
| "step": 3950 | |
| }, | |
| { | |
| "entropy": 2.496020531654358, | |
| "epoch": 5.1228978007761965, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.056130918455929e-05, | |
| "loss": 0.4602, | |
| "mean_token_accuracy": 0.21488995999097824, | |
| "num_tokens": 11334364.0, | |
| "step": 3960 | |
| }, | |
| { | |
| "entropy": 2.2577121645212173, | |
| "epoch": 5.135834411384217, | |
| "grad_norm": 0.6211187243461609, | |
| "learning_rate": 6.0080533714914766e-05, | |
| "loss": 1.0081, | |
| "mean_token_accuracy": 0.2705229982733727, | |
| "num_tokens": 11391718.0, | |
| "step": 3970 | |
| }, | |
| { | |
| "entropy": 1.0153650417923927, | |
| "epoch": 5.148771021992238, | |
| "grad_norm": 0.649202287197113, | |
| "learning_rate": 5.9600853387575163e-05, | |
| "loss": 1.0426, | |
| "mean_token_accuracy": 0.7383781686425209, | |
| "num_tokens": 11430710.0, | |
| "step": 3980 | |
| }, | |
| { | |
| "entropy": 1.1217432379722596, | |
| "epoch": 5.161707632600259, | |
| "grad_norm": 0.9362276792526245, | |
| "learning_rate": 5.912228136199038e-05, | |
| "loss": 1.0765, | |
| "mean_token_accuracy": 0.7234507903456688, | |
| "num_tokens": 11459154.0, | |
| "step": 3990 | |
| }, | |
| { | |
| "entropy": 1.6653785824775695, | |
| "epoch": 5.174644243208279, | |
| "grad_norm": 1.2307344675064087, | |
| "learning_rate": 5.864483076720555e-05, | |
| "loss": 1.5669, | |
| "mean_token_accuracy": 0.6285063222050666, | |
| "num_tokens": 11476268.0, | |
| "step": 4000 | |
| }, | |
| { | |
| "entropy": 2.082801552116871, | |
| "epoch": 5.1875808538163, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.81685147015006e-05, | |
| "loss": 0.3513, | |
| "mean_token_accuracy": 0.1956300586462021, | |
| "num_tokens": 11477779.0, | |
| "step": 4010 | |
| }, | |
| { | |
| "entropy": 2.0466490238904953, | |
| "epoch": 5.200517464424321, | |
| "grad_norm": 0.5699072480201721, | |
| "learning_rate": 5.769334623203095e-05, | |
| "loss": 0.9736, | |
| "mean_token_accuracy": 0.27822155207395555, | |
| "num_tokens": 11531993.0, | |
| "step": 4020 | |
| }, | |
| { | |
| "entropy": 1.0089649006724357, | |
| "epoch": 5.213454075032342, | |
| "grad_norm": 0.6833609938621521, | |
| "learning_rate": 5.7219338394469356e-05, | |
| "loss": 1.0355, | |
| "mean_token_accuracy": 0.7415396451950074, | |
| "num_tokens": 11570430.0, | |
| "step": 4030 | |
| }, | |
| { | |
| "entropy": 1.1602358788251876, | |
| "epoch": 5.226390685640363, | |
| "grad_norm": 0.933566153049469, | |
| "learning_rate": 5.674650419264782e-05, | |
| "loss": 1.1016, | |
| "mean_token_accuracy": 0.7183712035417557, | |
| "num_tokens": 11598642.0, | |
| "step": 4040 | |
| }, | |
| { | |
| "entropy": 1.6275397926568984, | |
| "epoch": 5.239327296248383, | |
| "grad_norm": 1.2435181140899658, | |
| "learning_rate": 5.6274856598201066e-05, | |
| "loss": 1.5472, | |
| "mean_token_accuracy": 0.6266872644424438, | |
| "num_tokens": 11615900.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 5.239327296248383, | |
| "eval_entropy": 1.7370388171700544, | |
| "eval_loss": 1.2589974403381348, | |
| "eval_mean_token_accuracy": 0.4773535789965197, | |
| "eval_num_tokens": 11615900.0, | |
| "eval_runtime": 242.7915, | |
| "eval_samples_per_second": 22.637, | |
| "eval_steps_per_second": 1.417, | |
| "step": 4050 | |
| }, | |
| { | |
| "entropy": 2.3815665364265444, | |
| "epoch": 5.252263906856403, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.580440855021083e-05, | |
| "loss": 0.4649, | |
| "mean_token_accuracy": 0.19248609468340874, | |
| "num_tokens": 11617642.0, | |
| "step": 4060 | |
| }, | |
| { | |
| "entropy": 2.2312158316373827, | |
| "epoch": 5.265200517464424, | |
| "grad_norm": 0.5702583193778992, | |
| "learning_rate": 5.533517295485062e-05, | |
| "loss": 0.9829, | |
| "mean_token_accuracy": 0.27761168628931043, | |
| "num_tokens": 11675101.0, | |
| "step": 4070 | |
| }, | |
| { | |
| "entropy": 1.0108808249235153, | |
| "epoch": 5.278137128072445, | |
| "grad_norm": 0.752931535243988, | |
| "learning_rate": 5.486716268503182e-05, | |
| "loss": 1.0438, | |
| "mean_token_accuracy": 0.7365775972604751, | |
| "num_tokens": 11713657.0, | |
| "step": 4080 | |
| }, | |
| { | |
| "entropy": 1.2421717032790185, | |
| "epoch": 5.291073738680466, | |
| "grad_norm": 0.9655210375785828, | |
| "learning_rate": 5.440039058005047e-05, | |
| "loss": 1.1822, | |
| "mean_token_accuracy": 0.7000416114926338, | |
| "num_tokens": 11741666.0, | |
| "step": 4090 | |
| }, | |
| { | |
| "entropy": 1.6973173677921296, | |
| "epoch": 5.304010349288486, | |
| "grad_norm": 1.5103716850280762, | |
| "learning_rate": 5.393486944523505e-05, | |
| "loss": 1.5623, | |
| "mean_token_accuracy": 0.6223144173622132, | |
| "num_tokens": 11758203.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "entropy": 2.1967957854270934, | |
| "epoch": 5.316946959896507, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.347061205159519e-05, | |
| "loss": 0.2983, | |
| "mean_token_accuracy": 0.15621012300252915, | |
| "num_tokens": 11759461.0, | |
| "step": 4110 | |
| }, | |
| { | |
| "entropy": 1.981054452061653, | |
| "epoch": 5.329883570504528, | |
| "grad_norm": 0.6421746611595154, | |
| "learning_rate": 5.3007631135471334e-05, | |
| "loss": 0.9895, | |
| "mean_token_accuracy": 0.2755612075328827, | |
| "num_tokens": 11813613.0, | |
| "step": 4120 | |
| }, | |
| { | |
| "entropy": 0.9758897602558136, | |
| "epoch": 5.342820181112549, | |
| "grad_norm": 0.7207741141319275, | |
| "learning_rate": 5.2545939398185284e-05, | |
| "loss": 1.0031, | |
| "mean_token_accuracy": 0.7456466734409333, | |
| "num_tokens": 11852165.0, | |
| "step": 4130 | |
| }, | |
| { | |
| "entropy": 1.195047491788864, | |
| "epoch": 5.3557567917205695, | |
| "grad_norm": 0.9851743578910828, | |
| "learning_rate": 5.208554950569178e-05, | |
| "loss": 1.1364, | |
| "mean_token_accuracy": 0.7128469496965408, | |
| "num_tokens": 11880541.0, | |
| "step": 4140 | |
| }, | |
| { | |
| "entropy": 1.6736773550510406, | |
| "epoch": 5.36869340232859, | |
| "grad_norm": 1.2857285737991333, | |
| "learning_rate": 5.1626474088231004e-05, | |
| "loss": 1.6022, | |
| "mean_token_accuracy": 0.6264947578310966, | |
| "num_tokens": 11897978.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "entropy": 2.3716455429792402, | |
| "epoch": 5.38163001293661, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.116872573998217e-05, | |
| "loss": 0.3852, | |
| "mean_token_accuracy": 0.18944832757115365, | |
| "num_tokens": 11899460.0, | |
| "step": 4160 | |
| }, | |
| { | |
| "entropy": 2.3163172632455824, | |
| "epoch": 5.394566623544631, | |
| "grad_norm": 0.60521000623703, | |
| "learning_rate": 5.071231701871787e-05, | |
| "loss": 0.9779, | |
| "mean_token_accuracy": 0.27711123302578927, | |
| "num_tokens": 11956251.0, | |
| "step": 4170 | |
| }, | |
| { | |
| "entropy": 1.026511162519455, | |
| "epoch": 5.407503234152652, | |
| "grad_norm": 0.7545950412750244, | |
| "learning_rate": 5.025726044545968e-05, | |
| "loss": 1.0516, | |
| "mean_token_accuracy": 0.7328214541077613, | |
| "num_tokens": 11995157.0, | |
| "step": 4180 | |
| }, | |
| { | |
| "entropy": 1.1451522946357726, | |
| "epoch": 5.420439844760673, | |
| "grad_norm": 0.9537347555160522, | |
| "learning_rate": 4.980356850413472e-05, | |
| "loss": 1.1319, | |
| "mean_token_accuracy": 0.7138208642601966, | |
| "num_tokens": 12023430.0, | |
| "step": 4190 | |
| }, | |
| { | |
| "entropy": 1.7249857246875764, | |
| "epoch": 5.433376455368693, | |
| "grad_norm": 1.279359221458435, | |
| "learning_rate": 4.935125364123292e-05, | |
| "loss": 1.6072, | |
| "mean_token_accuracy": 0.6237147711217403, | |
| "num_tokens": 12040024.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 5.433376455368693, | |
| "eval_entropy": 1.8443097567488982, | |
| "eval_loss": 1.2536410093307495, | |
| "eval_mean_token_accuracy": 0.4748884228079818, | |
| "eval_num_tokens": 12040024.0, | |
| "eval_runtime": 241.7185, | |
| "eval_samples_per_second": 22.737, | |
| "eval_steps_per_second": 1.423, | |
| "step": 4200 | |
| }, | |
| { | |
| "entropy": 2.7387112855911253, | |
| "epoch": 5.446313065976714, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.89003282654658e-05, | |
| "loss": 0.389, | |
| "mean_token_accuracy": 0.1823613777756691, | |
| "num_tokens": 12041467.0, | |
| "step": 4210 | |
| }, | |
| { | |
| "entropy": 2.44253671169281, | |
| "epoch": 5.459249676584735, | |
| "grad_norm": 0.5715515613555908, | |
| "learning_rate": 4.845080474742608e-05, | |
| "loss": 0.9938, | |
| "mean_token_accuracy": 0.2730660729110241, | |
| "num_tokens": 12103775.0, | |
| "step": 4220 | |
| }, | |
| { | |
| "entropy": 1.003270110487938, | |
| "epoch": 5.472186287192756, | |
| "grad_norm": 0.7785800099372864, | |
| "learning_rate": 4.800269541924799e-05, | |
| "loss": 1.0184, | |
| "mean_token_accuracy": 0.7413052409887314, | |
| "num_tokens": 12143014.0, | |
| "step": 4230 | |
| }, | |
| { | |
| "entropy": 1.1527703180909157, | |
| "epoch": 5.485122897800776, | |
| "grad_norm": 0.9831658005714417, | |
| "learning_rate": 4.7556012574269395e-05, | |
| "loss": 1.1284, | |
| "mean_token_accuracy": 0.7102037504315376, | |
| "num_tokens": 12171448.0, | |
| "step": 4240 | |
| }, | |
| { | |
| "entropy": 1.7090917527675629, | |
| "epoch": 5.498059508408797, | |
| "grad_norm": 1.4465516805648804, | |
| "learning_rate": 4.7110768466694224e-05, | |
| "loss": 1.6112, | |
| "mean_token_accuracy": 0.6218582183122635, | |
| "num_tokens": 12188400.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "entropy": 2.560487928986549, | |
| "epoch": 5.510996119016817, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.666697531125627e-05, | |
| "loss": 0.3879, | |
| "mean_token_accuracy": 0.16174983084201813, | |
| "num_tokens": 12189804.0, | |
| "step": 4260 | |
| }, | |
| { | |
| "entropy": 2.277393540740013, | |
| "epoch": 5.523932729624838, | |
| "grad_norm": 0.5197897553443909, | |
| "learning_rate": 4.622464528288443e-05, | |
| "loss": 1.027, | |
| "mean_token_accuracy": 0.2683463282883167, | |
| "num_tokens": 12249572.0, | |
| "step": 4270 | |
| }, | |
| { | |
| "entropy": 1.0234995201230048, | |
| "epoch": 5.536869340232859, | |
| "grad_norm": 0.7546108961105347, | |
| "learning_rate": 4.578379051636832e-05, | |
| "loss": 1.0282, | |
| "mean_token_accuracy": 0.7406062006950378, | |
| "num_tokens": 12288484.0, | |
| "step": 4280 | |
| }, | |
| { | |
| "entropy": 1.1632590115070343, | |
| "epoch": 5.54980595084088, | |
| "grad_norm": 1.0032302141189575, | |
| "learning_rate": 4.534442310602559e-05, | |
| "loss": 1.1404, | |
| "mean_token_accuracy": 0.7092833399772644, | |
| "num_tokens": 12316357.0, | |
| "step": 4290 | |
| }, | |
| { | |
| "entropy": 1.6969922810792923, | |
| "epoch": 5.5627425614489, | |
| "grad_norm": 1.356163740158081, | |
| "learning_rate": 4.490655510537004e-05, | |
| "loss": 1.5895, | |
| "mean_token_accuracy": 0.6228079289197922, | |
| "num_tokens": 12332741.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "entropy": 2.227242410182953, | |
| "epoch": 5.575679172056921, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.447019852678101e-05, | |
| "loss": 0.3691, | |
| "mean_token_accuracy": 0.18163795471191407, | |
| "num_tokens": 12334119.0, | |
| "step": 4310 | |
| }, | |
| { | |
| "entropy": 2.0241008371114733, | |
| "epoch": 5.588615782664942, | |
| "grad_norm": 0.5737898945808411, | |
| "learning_rate": 4.40353653411738e-05, | |
| "loss": 0.956, | |
| "mean_token_accuracy": 0.2796335697174072, | |
| "num_tokens": 12386755.0, | |
| "step": 4320 | |
| }, | |
| { | |
| "entropy": 1.009289626777172, | |
| "epoch": 5.6015523932729625, | |
| "grad_norm": 0.7294387221336365, | |
| "learning_rate": 4.360206747767122e-05, | |
| "loss": 1.032, | |
| "mean_token_accuracy": 0.7417484134435653, | |
| "num_tokens": 12424985.0, | |
| "step": 4330 | |
| }, | |
| { | |
| "entropy": 1.1622566372156142, | |
| "epoch": 5.614489003880983, | |
| "grad_norm": 1.0234155654907227, | |
| "learning_rate": 4.3170316823276424e-05, | |
| "loss": 1.1576, | |
| "mean_token_accuracy": 0.7061204954981803, | |
| "num_tokens": 12452639.0, | |
| "step": 4340 | |
| }, | |
| { | |
| "entropy": 1.6782744824886322, | |
| "epoch": 5.627425614489004, | |
| "grad_norm": 1.4249658584594727, | |
| "learning_rate": 4.274012522254674e-05, | |
| "loss": 1.5881, | |
| "mean_token_accuracy": 0.6237360410392284, | |
| "num_tokens": 12469230.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 5.627425614489004, | |
| "eval_entropy": 1.6210007650214573, | |
| "eval_loss": 1.241470217704773, | |
| "eval_mean_token_accuracy": 0.47642275673705475, | |
| "eval_num_tokens": 12469230.0, | |
| "eval_runtime": 246.3378, | |
| "eval_samples_per_second": 22.311, | |
| "eval_steps_per_second": 1.396, | |
| "step": 4350 | |
| }, | |
| { | |
| "entropy": 1.9652087688446045, | |
| "epoch": 5.640362225097024, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.231150447726874e-05, | |
| "loss": 0.3533, | |
| "mean_token_accuracy": 0.19179367125034333, | |
| "num_tokens": 12470690.0, | |
| "step": 4360 | |
| }, | |
| { | |
| "entropy": 1.9280417621135713, | |
| "epoch": 5.653298835705045, | |
| "grad_norm": 0.5711302757263184, | |
| "learning_rate": 4.1884466346134466e-05, | |
| "loss": 0.9704, | |
| "mean_token_accuracy": 0.27944710552692414, | |
| "num_tokens": 12525117.0, | |
| "step": 4370 | |
| }, | |
| { | |
| "entropy": 1.0357938587665558, | |
| "epoch": 5.666235446313066, | |
| "grad_norm": 0.6963515877723694, | |
| "learning_rate": 4.145902254441888e-05, | |
| "loss": 1.0365, | |
| "mean_token_accuracy": 0.7398686364293099, | |
| "num_tokens": 12563021.0, | |
| "step": 4380 | |
| }, | |
| { | |
| "entropy": 1.1490644261240959, | |
| "epoch": 5.679172056921087, | |
| "grad_norm": 0.9824443459510803, | |
| "learning_rate": 4.1035184743658376e-05, | |
| "loss": 1.1307, | |
| "mean_token_accuracy": 0.7091254457831383, | |
| "num_tokens": 12591024.0, | |
| "step": 4390 | |
| }, | |
| { | |
| "entropy": 1.68570619225502, | |
| "epoch": 5.692108667529108, | |
| "grad_norm": 1.2685192823410034, | |
| "learning_rate": 4.0612964571330805e-05, | |
| "loss": 1.5877, | |
| "mean_token_accuracy": 0.6187320709228515, | |
| "num_tokens": 12607889.0, | |
| "step": 4400 | |
| }, | |
| { | |
| "entropy": 1.995962232351303, | |
| "epoch": 5.705045278137128, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.019237361053615e-05, | |
| "loss": 0.4375, | |
| "mean_token_accuracy": 0.1990293502807617, | |
| "num_tokens": 12609477.0, | |
| "step": 4410 | |
| }, | |
| { | |
| "entropy": 2.0628999888896944, | |
| "epoch": 5.717981888745149, | |
| "grad_norm": 0.582778811454773, | |
| "learning_rate": 3.977342339967902e-05, | |
| "loss": 0.9965, | |
| "mean_token_accuracy": 0.2732643634080887, | |
| "num_tokens": 12668390.0, | |
| "step": 4420 | |
| }, | |
| { | |
| "entropy": 1.0030916407704353, | |
| "epoch": 5.730918499353169, | |
| "grad_norm": 0.7195892930030823, | |
| "learning_rate": 3.935612543215216e-05, | |
| "loss": 1.0055, | |
| "mean_token_accuracy": 0.7438824102282524, | |
| "num_tokens": 12707626.0, | |
| "step": 4430 | |
| }, | |
| { | |
| "entropy": 1.1245022103190423, | |
| "epoch": 5.74385510996119, | |
| "grad_norm": 0.9609954357147217, | |
| "learning_rate": 3.8940491156020744e-05, | |
| "loss": 1.0932, | |
| "mean_token_accuracy": 0.7223910227417946, | |
| "num_tokens": 12736376.0, | |
| "step": 4440 | |
| }, | |
| { | |
| "entropy": 1.653869342803955, | |
| "epoch": 5.756791720569211, | |
| "grad_norm": 1.3840677738189697, | |
| "learning_rate": 3.852653197370885e-05, | |
| "loss": 1.5745, | |
| "mean_token_accuracy": 0.6224342837929726, | |
| "num_tokens": 12753560.0, | |
| "step": 4450 | |
| }, | |
| { | |
| "entropy": 2.0997736901044846, | |
| "epoch": 5.769728331177232, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.811425924168628e-05, | |
| "loss": 0.4083, | |
| "mean_token_accuracy": 0.17910270839929582, | |
| "num_tokens": 12755081.0, | |
| "step": 4460 | |
| }, | |
| { | |
| "entropy": 2.0056353509426117, | |
| "epoch": 5.782664941785252, | |
| "grad_norm": 0.5901302099227905, | |
| "learning_rate": 3.770368427015699e-05, | |
| "loss": 0.9965, | |
| "mean_token_accuracy": 0.2755757987499237, | |
| "num_tokens": 12818062.0, | |
| "step": 4470 | |
| }, | |
| { | |
| "entropy": 0.9973522603511811, | |
| "epoch": 5.795601552393273, | |
| "grad_norm": 0.7053154110908508, | |
| "learning_rate": 3.729481832274916e-05, | |
| "loss": 1.0101, | |
| "mean_token_accuracy": 0.7445162117481232, | |
| "num_tokens": 12856675.0, | |
| "step": 4480 | |
| }, | |
| { | |
| "entropy": 1.158506852388382, | |
| "epoch": 5.808538163001294, | |
| "grad_norm": 1.0795212984085083, | |
| "learning_rate": 3.688767261620578e-05, | |
| "loss": 1.1325, | |
| "mean_token_accuracy": 0.7126885786652565, | |
| "num_tokens": 12884620.0, | |
| "step": 4490 | |
| }, | |
| { | |
| "entropy": 1.6880556523799897, | |
| "epoch": 5.821474773609315, | |
| "grad_norm": 1.5192304849624634, | |
| "learning_rate": 3.64822583200772e-05, | |
| "loss": 1.5872, | |
| "mean_token_accuracy": 0.6223025761544705, | |
| "num_tokens": 12901293.0, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 5.821474773609315, | |
| "eval_entropy": 1.5741082594491715, | |
| "eval_loss": 1.2425955533981323, | |
| "eval_mean_token_accuracy": 0.4777476576178573, | |
| "eval_num_tokens": 12901293.0, | |
| "eval_runtime": 245.8608, | |
| "eval_samples_per_second": 22.354, | |
| "eval_steps_per_second": 1.399, | |
| "step": 4500 | |
| }, | |
| { | |
| "entropy": 1.8717746943235398, | |
| "epoch": 5.834411384217335, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.607858655641457e-05, | |
| "loss": 0.3819, | |
| "mean_token_accuracy": 0.20605695247650146, | |
| "num_tokens": 12902761.0, | |
| "step": 4510 | |
| }, | |
| { | |
| "entropy": 1.97312273979187, | |
| "epoch": 5.8473479948253555, | |
| "grad_norm": 0.5747093558311462, | |
| "learning_rate": 3.56766683994648e-05, | |
| "loss": 0.9997, | |
| "mean_token_accuracy": 0.27485966980457305, | |
| "num_tokens": 12956936.0, | |
| "step": 4520 | |
| }, | |
| { | |
| "entropy": 1.026018126308918, | |
| "epoch": 5.860284605433376, | |
| "grad_norm": 0.7504481077194214, | |
| "learning_rate": 3.527651487536669e-05, | |
| "loss": 1.044, | |
| "mean_token_accuracy": 0.7389606684446335, | |
| "num_tokens": 12995952.0, | |
| "step": 4530 | |
| }, | |
| { | |
| "entropy": 1.1011481299996375, | |
| "epoch": 5.873221216041397, | |
| "grad_norm": 0.9883886575698853, | |
| "learning_rate": 3.487813696184852e-05, | |
| "loss": 1.0814, | |
| "mean_token_accuracy": 0.722546960413456, | |
| "num_tokens": 13024545.0, | |
| "step": 4540 | |
| }, | |
| { | |
| "entropy": 1.6190055787563324, | |
| "epoch": 5.886157826649418, | |
| "grad_norm": 1.3633733987808228, | |
| "learning_rate": 3.448154558792677e-05, | |
| "loss": 1.5299, | |
| "mean_token_accuracy": 0.6360443904995918, | |
| "num_tokens": 13041707.0, | |
| "step": 4550 | |
| }, | |
| { | |
| "entropy": 1.919902539253235, | |
| "epoch": 5.899094437257439, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.408675163360643e-05, | |
| "loss": 0.3972, | |
| "mean_token_accuracy": 0.18492977023124696, | |
| "num_tokens": 13043179.0, | |
| "step": 4560 | |
| }, | |
| { | |
| "entropy": 1.9439027100801467, | |
| "epoch": 5.912031047865459, | |
| "grad_norm": 0.5576460957527161, | |
| "learning_rate": 3.369376592958243e-05, | |
| "loss": 1.0312, | |
| "mean_token_accuracy": 0.2685145862400532, | |
| "num_tokens": 13106663.0, | |
| "step": 4570 | |
| }, | |
| { | |
| "entropy": 1.0852982923388481, | |
| "epoch": 5.92496765847348, | |
| "grad_norm": 0.7461971044540405, | |
| "learning_rate": 3.3302599256942524e-05, | |
| "loss": 1.0907, | |
| "mean_token_accuracy": 0.7287055298686027, | |
| "num_tokens": 13146036.0, | |
| "step": 4580 | |
| }, | |
| { | |
| "entropy": 1.1466092258691787, | |
| "epoch": 5.937904269081501, | |
| "grad_norm": 0.9710547924041748, | |
| "learning_rate": 3.2913262346871564e-05, | |
| "loss": 1.118, | |
| "mean_token_accuracy": 0.7170251324772835, | |
| "num_tokens": 13175061.0, | |
| "step": 4590 | |
| }, | |
| { | |
| "entropy": 1.5422434598207473, | |
| "epoch": 5.950840879689522, | |
| "grad_norm": 1.2156635522842407, | |
| "learning_rate": 3.252576588035703e-05, | |
| "loss": 1.4615, | |
| "mean_token_accuracy": 0.6465979412198066, | |
| "num_tokens": 13192904.0, | |
| "step": 4600 | |
| }, | |
| { | |
| "entropy": 1.8891061872243882, | |
| "epoch": 5.963777490297542, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.21401204878962e-05, | |
| "loss": 0.4084, | |
| "mean_token_accuracy": 0.20470450967550277, | |
| "num_tokens": 13194636.0, | |
| "step": 4610 | |
| }, | |
| { | |
| "entropy": 1.679259254038334, | |
| "epoch": 5.976714100905562, | |
| "grad_norm": 0.7184410095214844, | |
| "learning_rate": 3.175633674920415e-05, | |
| "loss": 0.7382, | |
| "mean_token_accuracy": 0.3269588887691498, | |
| "num_tokens": 13232029.0, | |
| "step": 4620 | |
| }, | |
| { | |
| "entropy": 1.1688358381390571, | |
| "epoch": 5.989650711513583, | |
| "grad_norm": 0.9711093306541443, | |
| "learning_rate": 3.1374425192923874e-05, | |
| "loss": 1.1566, | |
| "mean_token_accuracy": 0.7072150468826294, | |
| "num_tokens": 13259115.0, | |
| "step": 4630 | |
| }, | |
| { | |
| "entropy": 1.995809930562973, | |
| "epoch": 6.002587322121604, | |
| "grad_norm": 0.3214434087276459, | |
| "learning_rate": 3.099439629633738e-05, | |
| "loss": 0.9081, | |
| "mean_token_accuracy": 0.2743851698935032, | |
| "num_tokens": 13302193.0, | |
| "step": 4640 | |
| }, | |
| { | |
| "entropy": 1.2387345060706139, | |
| "epoch": 6.015523932729625, | |
| "grad_norm": 0.7096182107925415, | |
| "learning_rate": 3.061626048507794e-05, | |
| "loss": 1.2251, | |
| "mean_token_accuracy": 0.7026221588253975, | |
| "num_tokens": 13349206.0, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 6.015523932729625, | |
| "eval_entropy": 1.4673164599510127, | |
| "eval_loss": 1.236427664756775, | |
| "eval_mean_token_accuracy": 0.4835313937171947, | |
| "eval_num_tokens": 13349206.0, | |
| "eval_runtime": 245.226, | |
| "eval_samples_per_second": 22.412, | |
| "eval_steps_per_second": 1.403, | |
| "step": 4650 | |
| }, | |
| { | |
| "entropy": 1.0112595960497857, | |
| "epoch": 6.028460543337646, | |
| "grad_norm": 0.9612884521484375, | |
| "learning_rate": 3.0240028132844577e-05, | |
| "loss": 0.9916, | |
| "mean_token_accuracy": 0.7466120198369026, | |
| "num_tokens": 13380735.0, | |
| "step": 4660 | |
| }, | |
| { | |
| "entropy": 1.321917925775051, | |
| "epoch": 6.041397153945666, | |
| "grad_norm": 0.9298290014266968, | |
| "learning_rate": 2.9865709561117093e-05, | |
| "loss": 1.277, | |
| "mean_token_accuracy": 0.6769641906023025, | |
| "num_tokens": 13402259.0, | |
| "step": 4670 | |
| }, | |
| { | |
| "entropy": 1.9312127828598022, | |
| "epoch": 6.054333764553687, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.949331503887296e-05, | |
| "loss": 0.9373, | |
| "mean_token_accuracy": 0.38414124920964243, | |
| "num_tokens": 13406702.0, | |
| "step": 4680 | |
| }, | |
| { | |
| "entropy": 1.8519232898950577, | |
| "epoch": 6.067270375161708, | |
| "grad_norm": 0.3253900706768036, | |
| "learning_rate": 2.9122854782305853e-05, | |
| "loss": 0.4393, | |
| "mean_token_accuracy": 0.10099697411060334, | |
| "num_tokens": 13448471.0, | |
| "step": 4690 | |
| }, | |
| { | |
| "entropy": 1.2315872982144356, | |
| "epoch": 6.0802069857697285, | |
| "grad_norm": 0.7172207832336426, | |
| "learning_rate": 2.8754338954545078e-05, | |
| "loss": 1.2677, | |
| "mean_token_accuracy": 0.6917841538786889, | |
| "num_tokens": 13494707.0, | |
| "step": 4700 | |
| }, | |
| { | |
| "entropy": 1.078819990158081, | |
| "epoch": 6.093143596377749, | |
| "grad_norm": 0.9585686326026917, | |
| "learning_rate": 2.8387777665376947e-05, | |
| "loss": 1.0795, | |
| "mean_token_accuracy": 0.7268196657299996, | |
| "num_tokens": 13525272.0, | |
| "step": 4710 | |
| }, | |
| { | |
| "entropy": 1.439416041970253, | |
| "epoch": 6.106080206985769, | |
| "grad_norm": 0.967811107635498, | |
| "learning_rate": 2.8023180970967333e-05, | |
| "loss": 1.3684, | |
| "mean_token_accuracy": 0.6664265364408493, | |
| "num_tokens": 13545790.0, | |
| "step": 4720 | |
| }, | |
| { | |
| "entropy": 1.8261877298355103, | |
| "epoch": 6.11901681759379, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.766055887358584e-05, | |
| "loss": 0.8898, | |
| "mean_token_accuracy": 0.34252284914255143, | |
| "num_tokens": 13549613.0, | |
| "step": 4730 | |
| }, | |
| { | |
| "entropy": 1.8926386773586272, | |
| "epoch": 6.131953428201811, | |
| "grad_norm": 0.33156275749206543, | |
| "learning_rate": 2.72999213213314e-05, | |
| "loss": 0.438, | |
| "mean_token_accuracy": 0.10151686370372773, | |
| "num_tokens": 13586113.0, | |
| "step": 4740 | |
| }, | |
| { | |
| "entropy": 1.2399160832166671, | |
| "epoch": 6.144890038809832, | |
| "grad_norm": 0.7470856308937073, | |
| "learning_rate": 2.6941278207859333e-05, | |
| "loss": 1.2593, | |
| "mean_token_accuracy": 0.6944727435708046, | |
| "num_tokens": 13632230.0, | |
| "step": 4750 | |
| }, | |
| { | |
| "entropy": 1.008384570479393, | |
| "epoch": 6.157826649417853, | |
| "grad_norm": 0.992726743221283, | |
| "learning_rate": 2.6584639372109942e-05, | |
| "loss": 0.991, | |
| "mean_token_accuracy": 0.7462219312787056, | |
| "num_tokens": 13663326.0, | |
| "step": 4760 | |
| }, | |
| { | |
| "entropy": 1.338111485540867, | |
| "epoch": 6.170763260025873, | |
| "grad_norm": 1.0804771184921265, | |
| "learning_rate": 2.623001459803861e-05, | |
| "loss": 1.3146, | |
| "mean_token_accuracy": 0.6769130662083626, | |
| "num_tokens": 13684547.0, | |
| "step": 4770 | |
| }, | |
| { | |
| "entropy": 1.9144802495837212, | |
| "epoch": 6.183699870633894, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.5877413614347358e-05, | |
| "loss": 0.8822, | |
| "mean_token_accuracy": 0.3425231367349625, | |
| "num_tokens": 13688744.0, | |
| "step": 4780 | |
| }, | |
| { | |
| "entropy": 1.9466426759958266, | |
| "epoch": 6.196636481241915, | |
| "grad_norm": 0.30804237723350525, | |
| "learning_rate": 2.5526846094217948e-05, | |
| "loss": 0.4398, | |
| "mean_token_accuracy": 0.10224909633398056, | |
| "num_tokens": 13724520.0, | |
| "step": 4790 | |
| }, | |
| { | |
| "entropy": 1.2254028126597405, | |
| "epoch": 6.2095730918499354, | |
| "grad_norm": 0.7537704706192017, | |
| "learning_rate": 2.5178321655046577e-05, | |
| "loss": 1.2608, | |
| "mean_token_accuracy": 0.6935150980949402, | |
| "num_tokens": 13771548.0, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 6.2095730918499354, | |
| "eval_entropy": 1.452152093482572, | |
| "eval_loss": 1.2386506795883179, | |
| "eval_mean_token_accuracy": 0.4820184623605983, | |
| "eval_num_tokens": 13771548.0, | |
| "eval_runtime": 247.597, | |
| "eval_samples_per_second": 22.197, | |
| "eval_steps_per_second": 1.389, | |
| "step": 4800 | |
| }, | |
| { | |
| "entropy": 1.0173511430621147, | |
| "epoch": 6.222509702457956, | |
| "grad_norm": 0.9651890993118286, | |
| "learning_rate": 2.4831849858179913e-05, | |
| "loss": 1.0019, | |
| "mean_token_accuracy": 0.7469066709280014, | |
| "num_tokens": 13802198.0, | |
| "step": 4810 | |
| }, | |
| { | |
| "entropy": 1.3790171161293983, | |
| "epoch": 6.235446313065976, | |
| "grad_norm": 1.0010708570480347, | |
| "learning_rate": 2.448744020865299e-05, | |
| "loss": 1.3194, | |
| "mean_token_accuracy": 0.6719131916761398, | |
| "num_tokens": 13823366.0, | |
| "step": 4820 | |
| }, | |
| { | |
| "entropy": 1.8534984111785888, | |
| "epoch": 6.248382923673997, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.4145102154928156e-05, | |
| "loss": 0.9022, | |
| "mean_token_accuracy": 0.36226404309272764, | |
| "num_tokens": 13827780.0, | |
| "step": 4830 | |
| }, | |
| { | |
| "entropy": 1.787733218073845, | |
| "epoch": 6.261319534282018, | |
| "grad_norm": 0.35094037652015686, | |
| "learning_rate": 2.380484508863611e-05, | |
| "loss": 0.4416, | |
| "mean_token_accuracy": 0.10181766748428345, | |
| "num_tokens": 13865780.0, | |
| "step": 4840 | |
| }, | |
| { | |
| "entropy": 1.2079532265663147, | |
| "epoch": 6.274256144890039, | |
| "grad_norm": 0.7374927997589111, | |
| "learning_rate": 2.346667834431826e-05, | |
| "loss": 1.2223, | |
| "mean_token_accuracy": 0.704416724294424, | |
| "num_tokens": 13911952.0, | |
| "step": 4850 | |
| }, | |
| { | |
| "entropy": 0.9947008207440377, | |
| "epoch": 6.28719275549806, | |
| "grad_norm": 0.9054901003837585, | |
| "learning_rate": 2.3130611199170384e-05, | |
| "loss": 0.9776, | |
| "mean_token_accuracy": 0.7504064351320267, | |
| "num_tokens": 13943487.0, | |
| "step": 4860 | |
| }, | |
| { | |
| "entropy": 1.312053567171097, | |
| "epoch": 6.300129366106081, | |
| "grad_norm": 1.030329704284668, | |
| "learning_rate": 2.2796652872788448e-05, | |
| "loss": 1.2766, | |
| "mean_token_accuracy": 0.6872908189892769, | |
| "num_tokens": 13965764.0, | |
| "step": 4870 | |
| }, | |
| { | |
| "entropy": 1.73905668258667, | |
| "epoch": 6.313065976714101, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.246481252691548e-05, | |
| "loss": 0.9456, | |
| "mean_token_accuracy": 0.3810268484055996, | |
| "num_tokens": 13970660.0, | |
| "step": 4880 | |
| }, | |
| { | |
| "entropy": 1.7431816905736923, | |
| "epoch": 6.3260025873221215, | |
| "grad_norm": 0.3432393968105316, | |
| "learning_rate": 2.213509926519016e-05, | |
| "loss": 0.4431, | |
| "mean_token_accuracy": 0.103342554718256, | |
| "num_tokens": 14010149.0, | |
| "step": 4890 | |
| }, | |
| { | |
| "entropy": 1.2463560298085212, | |
| "epoch": 6.338939197930142, | |
| "grad_norm": 0.7458313703536987, | |
| "learning_rate": 2.1807522132897383e-05, | |
| "loss": 1.2702, | |
| "mean_token_accuracy": 0.6920596107840538, | |
| "num_tokens": 14057120.0, | |
| "step": 4900 | |
| }, | |
| { | |
| "entropy": 1.027150359749794, | |
| "epoch": 6.351875808538163, | |
| "grad_norm": 0.8767898082733154, | |
| "learning_rate": 2.148209011671979e-05, | |
| "loss": 0.9989, | |
| "mean_token_accuracy": 0.743067529797554, | |
| "num_tokens": 14088988.0, | |
| "step": 4910 | |
| }, | |
| { | |
| "entropy": 1.3012418672442436, | |
| "epoch": 6.364812419146183, | |
| "grad_norm": 1.0773974657058716, | |
| "learning_rate": 2.1158812144491357e-05, | |
| "loss": 1.247, | |
| "mean_token_accuracy": 0.6856265813112259, | |
| "num_tokens": 14111094.0, | |
| "step": 4920 | |
| }, | |
| { | |
| "entropy": 1.7512285083532333, | |
| "epoch": 6.377749029754204, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.0837697084952503e-05, | |
| "loss": 0.9705, | |
| "mean_token_accuracy": 0.38980276361107824, | |
| "num_tokens": 14115970.0, | |
| "step": 4930 | |
| }, | |
| { | |
| "entropy": 1.7514180034399032, | |
| "epoch": 6.390685640362225, | |
| "grad_norm": 0.3412686288356781, | |
| "learning_rate": 2.0518753747506748e-05, | |
| "loss": 0.4438, | |
| "mean_token_accuracy": 0.10270617604255676, | |
| "num_tokens": 14151452.0, | |
| "step": 4940 | |
| }, | |
| { | |
| "entropy": 1.2002925127744675, | |
| "epoch": 6.403622250970246, | |
| "grad_norm": 0.7483528852462769, | |
| "learning_rate": 2.0201990881979006e-05, | |
| "loss": 1.2267, | |
| "mean_token_accuracy": 0.7003540650010109, | |
| "num_tokens": 14198038.0, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 6.403622250970246, | |
| "eval_entropy": 1.4145794496979824, | |
| "eval_loss": 1.2361637353897095, | |
| "eval_mean_token_accuracy": 0.4807747915213884, | |
| "eval_num_tokens": 14198038.0, | |
| "eval_runtime": 239.3212, | |
| "eval_samples_per_second": 22.965, | |
| "eval_steps_per_second": 1.437, | |
| "step": 4950 | |
| }, | |
| { | |
| "entropy": 1.0035071596503258, | |
| "epoch": 6.416558861578267, | |
| "grad_norm": 0.9199973940849304, | |
| "learning_rate": 1.9887417178375633e-05, | |
| "loss": 0.9911, | |
| "mean_token_accuracy": 0.7502464011311532, | |
| "num_tokens": 14229396.0, | |
| "step": 4960 | |
| }, | |
| { | |
| "entropy": 1.3634681567549705, | |
| "epoch": 6.429495472186288, | |
| "grad_norm": 0.8955945372581482, | |
| "learning_rate": 1.957504126664593e-05, | |
| "loss": 1.3055, | |
| "mean_token_accuracy": 0.677581375837326, | |
| "num_tokens": 14251059.0, | |
| "step": 4970 | |
| }, | |
| { | |
| "entropy": 1.782031211256981, | |
| "epoch": 6.442432082794308, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.9264871716445454e-05, | |
| "loss": 1.0013, | |
| "mean_token_accuracy": 0.41802891343832016, | |
| "num_tokens": 14255872.0, | |
| "step": 4980 | |
| }, | |
| { | |
| "entropy": 1.542439764738083, | |
| "epoch": 6.455368693402328, | |
| "grad_norm": 0.34138184785842896, | |
| "learning_rate": 1.89569170369009e-05, | |
| "loss": 0.4513, | |
| "mean_token_accuracy": 0.1006891518831253, | |
| "num_tokens": 14297788.0, | |
| "step": 4990 | |
| }, | |
| { | |
| "entropy": 1.2497848883271216, | |
| "epoch": 6.468305304010349, | |
| "grad_norm": 0.7626767158508301, | |
| "learning_rate": 1.865118567637667e-05, | |
| "loss": 1.2743, | |
| "mean_token_accuracy": 0.6893603593111038, | |
| "num_tokens": 14345367.0, | |
| "step": 5000 | |
| }, | |
| { | |
| "entropy": 0.9866194486618042, | |
| "epoch": 6.48124191461837, | |
| "grad_norm": 1.0120469331741333, | |
| "learning_rate": 1.834768602224307e-05, | |
| "loss": 0.9661, | |
| "mean_token_accuracy": 0.752055998146534, | |
| "num_tokens": 14376619.0, | |
| "step": 5010 | |
| }, | |
| { | |
| "entropy": 1.2819917246699333, | |
| "epoch": 6.494178525226391, | |
| "grad_norm": 0.9832173585891724, | |
| "learning_rate": 1.8046426400646244e-05, | |
| "loss": 1.2393, | |
| "mean_token_accuracy": 0.6865051403641701, | |
| "num_tokens": 14398410.0, | |
| "step": 5020 | |
| }, | |
| { | |
| "entropy": 1.656550607085228, | |
| "epoch": 6.507115135834411, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.774741507627984e-05, | |
| "loss": 1.0363, | |
| "mean_token_accuracy": 0.402515621483326, | |
| "num_tokens": 14403699.0, | |
| "step": 5030 | |
| }, | |
| { | |
| "entropy": 1.4212503910064698, | |
| "epoch": 6.520051746442432, | |
| "grad_norm": 0.3207855820655823, | |
| "learning_rate": 1.7450660252158015e-05, | |
| "loss": 0.4273, | |
| "mean_token_accuracy": 0.10288792848587036, | |
| "num_tokens": 14446058.0, | |
| "step": 5040 | |
| }, | |
| { | |
| "entropy": 1.2183921545743943, | |
| "epoch": 6.532988357050453, | |
| "grad_norm": 0.7788935899734497, | |
| "learning_rate": 1.71561700693907e-05, | |
| "loss": 1.2401, | |
| "mean_token_accuracy": 0.7000276446342468, | |
| "num_tokens": 14492725.0, | |
| "step": 5050 | |
| }, | |
| { | |
| "entropy": 1.0459384858608245, | |
| "epoch": 6.545924967658474, | |
| "grad_norm": 0.9662116765975952, | |
| "learning_rate": 1.6863952606960132e-05, | |
| "loss": 1.037, | |
| "mean_token_accuracy": 0.7341208711266518, | |
| "num_tokens": 14523347.0, | |
| "step": 5060 | |
| }, | |
| { | |
| "entropy": 1.3962342336773872, | |
| "epoch": 6.5588615782664945, | |
| "grad_norm": 1.0042107105255127, | |
| "learning_rate": 1.6574015881499106e-05, | |
| "loss": 1.3439, | |
| "mean_token_accuracy": 0.6732321053743362, | |
| "num_tokens": 14543748.0, | |
| "step": 5070 | |
| }, | |
| { | |
| "entropy": 1.4976371228694916, | |
| "epoch": 6.5717981888745145, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.6286367847071294e-05, | |
| "loss": 0.8495, | |
| "mean_token_accuracy": 0.37927755415439607, | |
| "num_tokens": 14547526.0, | |
| "step": 5080 | |
| }, | |
| { | |
| "entropy": 1.378989189863205, | |
| "epoch": 6.584734799482535, | |
| "grad_norm": 0.35467758774757385, | |
| "learning_rate": 1.6001016394952817e-05, | |
| "loss": 0.436, | |
| "mean_token_accuracy": 0.10404296517372132, | |
| "num_tokens": 14587727.0, | |
| "step": 5090 | |
| }, | |
| { | |
| "entropy": 1.2019992262125014, | |
| "epoch": 6.597671410090556, | |
| "grad_norm": 0.7634411454200745, | |
| "learning_rate": 1.5717969353415772e-05, | |
| "loss": 1.2363, | |
| "mean_token_accuracy": 0.7016454577445984, | |
| "num_tokens": 14633377.0, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 6.597671410090556, | |
| "eval_entropy": 1.3025533678226693, | |
| "eval_loss": 1.2344391345977783, | |
| "eval_mean_token_accuracy": 0.4806629490367202, | |
| "eval_num_tokens": 14633377.0, | |
| "eval_runtime": 243.0518, | |
| "eval_samples_per_second": 22.612, | |
| "eval_steps_per_second": 1.415, | |
| "step": 5100 | |
| }, | |
| { | |
| "entropy": 1.0161924228072166, | |
| "epoch": 6.610608020698577, | |
| "grad_norm": 1.0323160886764526, | |
| "learning_rate": 1.5437234487513687e-05, | |
| "loss": 0.9938, | |
| "mean_token_accuracy": 0.747073483467102, | |
| "num_tokens": 14664256.0, | |
| "step": 5110 | |
| }, | |
| { | |
| "entropy": 1.358753038942814, | |
| "epoch": 6.623544631306598, | |
| "grad_norm": 1.011472225189209, | |
| "learning_rate": 1.5158819498868248e-05, | |
| "loss": 1.3273, | |
| "mean_token_accuracy": 0.6735880345106124, | |
| "num_tokens": 14685452.0, | |
| "step": 5120 | |
| }, | |
| { | |
| "entropy": 1.5439666867256165, | |
| "epoch": 6.636481241914618, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.4882732025458124e-05, | |
| "loss": 0.8744, | |
| "mean_token_accuracy": 0.35112617164850235, | |
| "num_tokens": 14689408.0, | |
| "step": 5130 | |
| }, | |
| { | |
| "entropy": 1.490699003636837, | |
| "epoch": 6.649417852522639, | |
| "grad_norm": 0.33567583560943604, | |
| "learning_rate": 1.4608979641409448e-05, | |
| "loss": 0.4429, | |
| "mean_token_accuracy": 0.10201395228505135, | |
| "num_tokens": 14730607.0, | |
| "step": 5140 | |
| }, | |
| { | |
| "entropy": 1.1885226652026177, | |
| "epoch": 6.66235446313066, | |
| "grad_norm": 0.7712506055831909, | |
| "learning_rate": 1.4337569856787958e-05, | |
| "loss": 1.2014, | |
| "mean_token_accuracy": 0.7031497925519943, | |
| "num_tokens": 14775950.0, | |
| "step": 5150 | |
| }, | |
| { | |
| "entropy": 1.013894683122635, | |
| "epoch": 6.675291073738681, | |
| "grad_norm": 0.993394672870636, | |
| "learning_rate": 1.406851011739303e-05, | |
| "loss": 0.9995, | |
| "mean_token_accuracy": 0.7462615251541138, | |
| "num_tokens": 14806798.0, | |
| "step": 5160 | |
| }, | |
| { | |
| "entropy": 1.276303158700466, | |
| "epoch": 6.6882276843467015, | |
| "grad_norm": 0.9287812113761902, | |
| "learning_rate": 1.3801807804553401e-05, | |
| "loss": 1.2193, | |
| "mean_token_accuracy": 0.701404669880867, | |
| "num_tokens": 14828450.0, | |
| "step": 5170 | |
| }, | |
| { | |
| "entropy": 1.639420548081398, | |
| "epoch": 6.701164294954722, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.3537470234924642e-05, | |
| "loss": 0.9149, | |
| "mean_token_accuracy": 0.36589213013648986, | |
| "num_tokens": 14832909.0, | |
| "step": 5180 | |
| }, | |
| { | |
| "entropy": 1.5444379433989526, | |
| "epoch": 6.714100905562742, | |
| "grad_norm": 0.33196088671684265, | |
| "learning_rate": 1.3275504660288462e-05, | |
| "loss": 0.4502, | |
| "mean_token_accuracy": 0.09918043613433838, | |
| "num_tokens": 14875888.0, | |
| "step": 5190 | |
| }, | |
| { | |
| "entropy": 1.1905731126666068, | |
| "epoch": 6.727037516170763, | |
| "grad_norm": 0.7245560884475708, | |
| "learning_rate": 1.3015918267353743e-05, | |
| "loss": 1.2055, | |
| "mean_token_accuracy": 0.7072307705879212, | |
| "num_tokens": 14921555.0, | |
| "step": 5200 | |
| }, | |
| { | |
| "entropy": 1.0091575369238854, | |
| "epoch": 6.739974126778784, | |
| "grad_norm": 0.9656630158424377, | |
| "learning_rate": 1.2758718177559403e-05, | |
| "loss": 1.0059, | |
| "mean_token_accuracy": 0.7457368150353432, | |
| "num_tokens": 14952319.0, | |
| "step": 5210 | |
| }, | |
| { | |
| "entropy": 1.3768625631928444, | |
| "epoch": 6.752910737386805, | |
| "grad_norm": 1.0023345947265625, | |
| "learning_rate": 1.2503911446879014e-05, | |
| "loss": 1.3323, | |
| "mean_token_accuracy": 0.6721446126699447, | |
| "num_tokens": 14973360.0, | |
| "step": 5220 | |
| }, | |
| { | |
| "entropy": 1.706917905807495, | |
| "epoch": 6.765847347994825, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.2251505065627211e-05, | |
| "loss": 0.884, | |
| "mean_token_accuracy": 0.34794071316719055, | |
| "num_tokens": 14977368.0, | |
| "step": 5230 | |
| }, | |
| { | |
| "entropy": 1.6983414202928544, | |
| "epoch": 6.778783958602846, | |
| "grad_norm": 0.34029924869537354, | |
| "learning_rate": 1.2001505958268045e-05, | |
| "loss": 0.4392, | |
| "mean_token_accuracy": 0.10167066529393196, | |
| "num_tokens": 15016518.0, | |
| "step": 5240 | |
| }, | |
| { | |
| "entropy": 1.1760634392499925, | |
| "epoch": 6.791720569210867, | |
| "grad_norm": 0.7289795875549316, | |
| "learning_rate": 1.1753920983224753e-05, | |
| "loss": 1.2004, | |
| "mean_token_accuracy": 0.7051770240068436, | |
| "num_tokens": 15062291.0, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 6.791720569210867, | |
| "eval_entropy": 1.3887645453214645, | |
| "eval_loss": 1.2298688888549805, | |
| "eval_mean_token_accuracy": 0.48596259925601093, | |
| "eval_num_tokens": 15062291.0, | |
| "eval_runtime": 246.7195, | |
| "eval_samples_per_second": 22.276, | |
| "eval_steps_per_second": 1.394, | |
| "step": 5250 | |
| }, | |
| { | |
| "entropy": 1.019908943772316, | |
| "epoch": 6.8046571798188875, | |
| "grad_norm": 1.0139966011047363, | |
| "learning_rate": 1.1508756932691878e-05, | |
| "loss": 1.016, | |
| "mean_token_accuracy": 0.7411870285868645, | |
| "num_tokens": 15093136.0, | |
| "step": 5260 | |
| }, | |
| { | |
| "entropy": 1.3366242468357086, | |
| "epoch": 6.817593790426908, | |
| "grad_norm": 1.015224814414978, | |
| "learning_rate": 1.1266020532448863e-05, | |
| "loss": 1.3099, | |
| "mean_token_accuracy": 0.680339677631855, | |
| "num_tokens": 15113801.0, | |
| "step": 5270 | |
| }, | |
| { | |
| "entropy": 1.7231854051351547, | |
| "epoch": 6.830530401034929, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.1025718441675348e-05, | |
| "loss": 0.8459, | |
| "mean_token_accuracy": 0.34885319918394087, | |
| "num_tokens": 15117501.0, | |
| "step": 5280 | |
| }, | |
| { | |
| "entropy": 1.8012044936418534, | |
| "epoch": 6.843467011642949, | |
| "grad_norm": 0.3444773256778717, | |
| "learning_rate": 1.0787857252768807e-05, | |
| "loss": 0.4338, | |
| "mean_token_accuracy": 0.10217657834291458, | |
| "num_tokens": 15154208.0, | |
| "step": 5290 | |
| }, | |
| { | |
| "entropy": 1.17054093927145, | |
| "epoch": 6.85640362225097, | |
| "grad_norm": 0.7941517233848572, | |
| "learning_rate": 1.0552443491163422e-05, | |
| "loss": 1.1874, | |
| "mean_token_accuracy": 0.7076364248991013, | |
| "num_tokens": 15199469.0, | |
| "step": 5300 | |
| }, | |
| { | |
| "entropy": 1.0057064607739448, | |
| "epoch": 6.869340232858991, | |
| "grad_norm": 0.8840006589889526, | |
| "learning_rate": 1.0319483615151137e-05, | |
| "loss": 0.981, | |
| "mean_token_accuracy": 0.7503589361906051, | |
| "num_tokens": 15230670.0, | |
| "step": 5310 | |
| }, | |
| { | |
| "entropy": 1.2563072219491005, | |
| "epoch": 6.882276843467012, | |
| "grad_norm": 1.0177907943725586, | |
| "learning_rate": 1.0088984015704629e-05, | |
| "loss": 1.2394, | |
| "mean_token_accuracy": 0.6934975415468216, | |
| "num_tokens": 15252641.0, | |
| "step": 5320 | |
| }, | |
| { | |
| "entropy": 1.8372395306825637, | |
| "epoch": 6.895213454075033, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.860951016301756e-06, | |
| "loss": 0.9875, | |
| "mean_token_accuracy": 0.3743965640664101, | |
| "num_tokens": 15257407.0, | |
| "step": 5330 | |
| }, | |
| { | |
| "entropy": 1.7831202149391174, | |
| "epoch": 6.908150064683053, | |
| "grad_norm": 0.3214081823825836, | |
| "learning_rate": 9.635390872752237e-06, | |
| "loss": 0.43, | |
| "mean_token_accuracy": 0.10435229986906051, | |
| "num_tokens": 15299860.0, | |
| "step": 5340 | |
| }, | |
| { | |
| "entropy": 1.2522226199507713, | |
| "epoch": 6.921086675291074, | |
| "grad_norm": 0.8021490573883057, | |
| "learning_rate": 9.412309773025952e-06, | |
| "loss": 1.2766, | |
| "mean_token_accuracy": 0.6917116060853005, | |
| "num_tokens": 15347391.0, | |
| "step": 5350 | |
| }, | |
| { | |
| "entropy": 1.0165240302681924, | |
| "epoch": 6.9340232858990944, | |
| "grad_norm": 0.9851676225662231, | |
| "learning_rate": 9.191713837083238e-06, | |
| "loss": 1.0192, | |
| "mean_token_accuracy": 0.7415471941232681, | |
| "num_tokens": 15379391.0, | |
| "step": 5360 | |
| }, | |
| { | |
| "entropy": 1.2651836022734642, | |
| "epoch": 6.946959896507115, | |
| "grad_norm": 1.12442946434021, | |
| "learning_rate": 8.973609116706926e-06, | |
| "loss": 1.2443, | |
| "mean_token_accuracy": 0.6868803769350051, | |
| "num_tokens": 15401606.0, | |
| "step": 5370 | |
| }, | |
| { | |
| "entropy": 1.7752905175089837, | |
| "epoch": 6.959896507115136, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.758001595336418e-06, | |
| "loss": 0.8999, | |
| "mean_token_accuracy": 0.38887517899274826, | |
| "num_tokens": 15406538.0, | |
| "step": 5380 | |
| }, | |
| { | |
| "entropy": 1.7106927633285522, | |
| "epoch": 6.972833117723156, | |
| "grad_norm": 0.5107993483543396, | |
| "learning_rate": 8.544897187903423e-06, | |
| "loss": 0.4117, | |
| "mean_token_accuracy": 0.10680279433727265, | |
| "num_tokens": 15432463.0, | |
| "step": 5390 | |
| }, | |
| { | |
| "entropy": 1.0609442353248597, | |
| "epoch": 6.985769728331177, | |
| "grad_norm": 1.095216155052185, | |
| "learning_rate": 8.33430174066978e-06, | |
| "loss": 1.0514, | |
| "mean_token_accuracy": 0.7322214379906654, | |
| "num_tokens": 15465365.0, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 6.985769728331177, | |
| "eval_entropy": 1.3962991244571155, | |
| "eval_loss": 1.2261559963226318, | |
| "eval_mean_token_accuracy": 0.48680107668042183, | |
| "eval_num_tokens": 15465365.0, | |
| "eval_runtime": 244.9697, | |
| "eval_samples_per_second": 22.435, | |
| "eval_steps_per_second": 1.404, | |
| "step": 5400 | |
| }, | |
| { | |
| "entropy": 1.6976288080215454, | |
| "epoch": 6.998706338939198, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.126221031067027e-06, | |
| "loss": 0.7689, | |
| "mean_token_accuracy": 0.2966282024979591, | |
| "num_tokens": 15471588.0, | |
| "step": 5410 | |
| }, | |
| { | |
| "entropy": 1.497927661240101, | |
| "epoch": 7.011642949547219, | |
| "grad_norm": 0.7096975445747375, | |
| "learning_rate": 7.920660767537901e-06, | |
| "loss": 1.3894, | |
| "mean_token_accuracy": 0.5761201746761799, | |
| "num_tokens": 15542066.0, | |
| "step": 5420 | |
| }, | |
| { | |
| "entropy": 0.9780161440372467, | |
| "epoch": 7.02457956015524, | |
| "grad_norm": 0.9500054717063904, | |
| "learning_rate": 7.717626589379789e-06, | |
| "loss": 0.9513, | |
| "mean_token_accuracy": 0.7568799629807472, | |
| "num_tokens": 15575551.0, | |
| "step": 5430 | |
| }, | |
| { | |
| "entropy": 1.169414332509041, | |
| "epoch": 7.03751617076326, | |
| "grad_norm": 1.0309356451034546, | |
| "learning_rate": 7.517124066589909e-06, | |
| "loss": 1.1411, | |
| "mean_token_accuracy": 0.711452366411686, | |
| "num_tokens": 15599584.0, | |
| "step": 5440 | |
| }, | |
| { | |
| "entropy": 1.7210813522338868, | |
| "epoch": 7.0504527813712805, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.319158699712669e-06, | |
| "loss": 1.3323, | |
| "mean_token_accuracy": 0.5859084717929364, | |
| "num_tokens": 15608747.0, | |
| "step": 5450 | |
| }, | |
| { | |
| "entropy": 1.6397013187408447, | |
| "epoch": 7.063389391979301, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.12373591968859e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 15609387.0, | |
| "step": 5460 | |
| }, | |
| { | |
| "entropy": 1.5039527043700218, | |
| "epoch": 7.076326002587322, | |
| "grad_norm": 0.772226870059967, | |
| "learning_rate": 6.930861087705398e-06, | |
| "loss": 1.3666, | |
| "mean_token_accuracy": 0.5798796579241753, | |
| "num_tokens": 15685497.0, | |
| "step": 5470 | |
| }, | |
| { | |
| "entropy": 0.9571346640586853, | |
| "epoch": 7.089262613195343, | |
| "grad_norm": 0.9899272918701172, | |
| "learning_rate": 6.7405394950510345e-06, | |
| "loss": 0.9525, | |
| "mean_token_accuracy": 0.7557973235845565, | |
| "num_tokens": 15718968.0, | |
| "step": 5480 | |
| }, | |
| { | |
| "entropy": 1.1604458332061767, | |
| "epoch": 7.102199223803363, | |
| "grad_norm": 1.072095513343811, | |
| "learning_rate": 6.552776362968271e-06, | |
| "loss": 1.1571, | |
| "mean_token_accuracy": 0.7090446025133132, | |
| "num_tokens": 15742748.0, | |
| "step": 5490 | |
| }, | |
| { | |
| "entropy": 1.7930972754955292, | |
| "epoch": 7.115135834411384, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.367576842511735e-06, | |
| "loss": 1.3237, | |
| "mean_token_accuracy": 0.5362849146127701, | |
| "num_tokens": 15751803.0, | |
| "step": 5500 | |
| }, | |
| { | |
| "entropy": 1.7586050003767013, | |
| "epoch": 7.128072445019405, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.184946014406412e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 15752443.0, | |
| "step": 5510 | |
| }, | |
| { | |
| "entropy": 1.5453996002674102, | |
| "epoch": 7.141009055627426, | |
| "grad_norm": 0.7360463738441467, | |
| "learning_rate": 6.004888888908256e-06, | |
| "loss": 1.4109, | |
| "mean_token_accuracy": 0.5700584821403026, | |
| "num_tokens": 15828985.0, | |
| "step": 5520 | |
| }, | |
| { | |
| "entropy": 0.9328926429152489, | |
| "epoch": 7.153945666235447, | |
| "grad_norm": 0.9283819794654846, | |
| "learning_rate": 5.827410405666911e-06, | |
| "loss": 0.9175, | |
| "mean_token_accuracy": 0.7673766747117042, | |
| "num_tokens": 15862356.0, | |
| "step": 5530 | |
| }, | |
| { | |
| "entropy": 1.1613366797566413, | |
| "epoch": 7.166882276843467, | |
| "grad_norm": 1.0261551141738892, | |
| "learning_rate": 5.652515433590033e-06, | |
| "loss": 1.1253, | |
| "mean_token_accuracy": 0.7124258697032928, | |
| "num_tokens": 15886367.0, | |
| "step": 5540 | |
| }, | |
| { | |
| "entropy": 1.7588330313563347, | |
| "epoch": 7.179818887451487, | |
| "grad_norm": 0.00023454829351976514, | |
| "learning_rate": 5.480208770709771e-06, | |
| "loss": 1.4039, | |
| "mean_token_accuracy": 0.5946097061038017, | |
| "num_tokens": 15896207.0, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 7.179818887451487, | |
| "eval_entropy": 1.393599722794322, | |
| "eval_loss": 1.2324310541152954, | |
| "eval_mean_token_accuracy": 0.4852820281372514, | |
| "eval_num_tokens": 15896207.0, | |
| "eval_runtime": 245.6246, | |
| "eval_samples_per_second": 22.376, | |
| "eval_steps_per_second": 1.401, | |
| "step": 5550 | |
| }, | |
| { | |
| "entropy": 1.7070483982563018, | |
| "epoch": 7.192755498059508, | |
| "grad_norm": 0.0, | |
| "learning_rate": 5.310495144051142e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 15896847.0, | |
| "step": 5560 | |
| }, | |
| { | |
| "entropy": 1.557031399011612, | |
| "epoch": 7.205692108667529, | |
| "grad_norm": 0.7289990186691284, | |
| "learning_rate": 5.143379209502352e-06, | |
| "loss": 1.4125, | |
| "mean_token_accuracy": 0.5720368728041649, | |
| "num_tokens": 15976815.0, | |
| "step": 5570 | |
| }, | |
| { | |
| "entropy": 0.9512620970606804, | |
| "epoch": 7.21862871927555, | |
| "grad_norm": 0.9174538254737854, | |
| "learning_rate": 4.978865551687062e-06, | |
| "loss": 0.9534, | |
| "mean_token_accuracy": 0.7580740317702294, | |
| "num_tokens": 16010900.0, | |
| "step": 5580 | |
| }, | |
| { | |
| "entropy": 1.172946660220623, | |
| "epoch": 7.231565329883571, | |
| "grad_norm": 1.0972976684570312, | |
| "learning_rate": 4.8169586838386346e-06, | |
| "loss": 1.1532, | |
| "mean_token_accuracy": 0.7079381376504899, | |
| "num_tokens": 16035361.0, | |
| "step": 5590 | |
| }, | |
| { | |
| "entropy": 1.6811116263270378, | |
| "epoch": 7.244501940491591, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.657663047676264e-06, | |
| "loss": 1.2139, | |
| "mean_token_accuracy": 0.5401002943515778, | |
| "num_tokens": 16044571.0, | |
| "step": 5600 | |
| }, | |
| { | |
| "entropy": 1.6898091644048692, | |
| "epoch": 7.257438551099612, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.500983013283188e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 16045211.0, | |
| "step": 5610 | |
| }, | |
| { | |
| "entropy": 1.5436704397201537, | |
| "epoch": 7.270375161707633, | |
| "grad_norm": 0.6892314553260803, | |
| "learning_rate": 4.34692287898677e-06, | |
| "loss": 1.4148, | |
| "mean_token_accuracy": 0.5717164523899555, | |
| "num_tokens": 16122336.0, | |
| "step": 5620 | |
| }, | |
| { | |
| "entropy": 0.9374915182590484, | |
| "epoch": 7.2833117723156535, | |
| "grad_norm": 0.9667730927467346, | |
| "learning_rate": 4.195486871240562e-06, | |
| "loss": 0.9394, | |
| "mean_token_accuracy": 0.7627643913030624, | |
| "num_tokens": 16156408.0, | |
| "step": 5630 | |
| }, | |
| { | |
| "entropy": 1.1849497631192207, | |
| "epoch": 7.296248382923674, | |
| "grad_norm": 1.1908502578735352, | |
| "learning_rate": 4.046679144508392e-06, | |
| "loss": 1.142, | |
| "mean_token_accuracy": 0.7130326569080353, | |
| "num_tokens": 16180323.0, | |
| "step": 5640 | |
| }, | |
| { | |
| "entropy": 1.829011231660843, | |
| "epoch": 7.309184993531694, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.900503781150366e-06, | |
| "loss": 1.4914, | |
| "mean_token_accuracy": 0.5614617101848125, | |
| "num_tokens": 16189805.0, | |
| "step": 5650 | |
| }, | |
| { | |
| "entropy": 1.7375122755765915, | |
| "epoch": 7.322121604139715, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.7569647913109243e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 16190445.0, | |
| "step": 5660 | |
| }, | |
| { | |
| "entropy": 1.5200837269425391, | |
| "epoch": 7.335058214747736, | |
| "grad_norm": 0.7430135607719421, | |
| "learning_rate": 3.6160661128087025e-06, | |
| "loss": 1.397, | |
| "mean_token_accuracy": 0.57365105971694, | |
| "num_tokens": 16268426.0, | |
| "step": 5670 | |
| }, | |
| { | |
| "entropy": 0.9276900395751, | |
| "epoch": 7.347994825355757, | |
| "grad_norm": 0.9390348792076111, | |
| "learning_rate": 3.4778116110286473e-06, | |
| "loss": 0.9249, | |
| "mean_token_accuracy": 0.7620738327503205, | |
| "num_tokens": 16302856.0, | |
| "step": 5680 | |
| }, | |
| { | |
| "entropy": 1.1663517013192177, | |
| "epoch": 7.360931435963778, | |
| "grad_norm": 1.0117005109786987, | |
| "learning_rate": 3.34220507881593e-06, | |
| "loss": 1.1293, | |
| "mean_token_accuracy": 0.7132649436593056, | |
| "num_tokens": 16327211.0, | |
| "step": 5690 | |
| }, | |
| { | |
| "entropy": 1.7131205320358276, | |
| "epoch": 7.373868046571798, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.209250236371797e-06, | |
| "loss": 1.3032, | |
| "mean_token_accuracy": 0.5476110517978668, | |
| "num_tokens": 16336179.0, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 7.373868046571798, | |
| "eval_entropy": 1.4321047376061595, | |
| "eval_loss": 1.2324743270874023, | |
| "eval_mean_token_accuracy": 0.48222382652551626, | |
| "eval_num_tokens": 16336179.0, | |
| "eval_runtime": 242.208, | |
| "eval_samples_per_second": 22.691, | |
| "eval_steps_per_second": 1.42, | |
| "step": 5700 | |
| }, | |
| { | |
| "entropy": 1.744317215681076, | |
| "epoch": 7.386804657179819, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.0789507311516864e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 16336819.0, | |
| "step": 5710 | |
| }, | |
| { | |
| "entropy": 1.5234818816184998, | |
| "epoch": 7.39974126778784, | |
| "grad_norm": 0.7303734421730042, | |
| "learning_rate": 2.9513101377650175e-06, | |
| "loss": 1.3797, | |
| "mean_token_accuracy": 0.5752100251615048, | |
| "num_tokens": 16404914.0, | |
| "step": 5720 | |
| }, | |
| { | |
| "entropy": 0.9265442088246345, | |
| "epoch": 7.4126778783958605, | |
| "grad_norm": 0.8770347237586975, | |
| "learning_rate": 2.8263319578771485e-06, | |
| "loss": 0.9069, | |
| "mean_token_accuracy": 0.7680046275258064, | |
| "num_tokens": 16439389.0, | |
| "step": 5730 | |
| }, | |
| { | |
| "entropy": 1.1830172911286354, | |
| "epoch": 7.425614489003881, | |
| "grad_norm": 1.0386770963668823, | |
| "learning_rate": 2.704019620113407e-06, | |
| "loss": 1.1733, | |
| "mean_token_accuracy": 0.7056162416934967, | |
| "num_tokens": 16464458.0, | |
| "step": 5740 | |
| }, | |
| { | |
| "entropy": 1.7503404572606087, | |
| "epoch": 7.438551099611901, | |
| "grad_norm": 1.7682623863220215, | |
| "learning_rate": 2.584376479964945e-06, | |
| "loss": 1.4882, | |
| "mean_token_accuracy": 0.6309158280491829, | |
| "num_tokens": 16475591.0, | |
| "step": 5750 | |
| }, | |
| { | |
| "entropy": 1.7254247039556503, | |
| "epoch": 7.451487710219922, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.4674058196966663e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 16476231.0, | |
| "step": 5760 | |
| }, | |
| { | |
| "entropy": 1.5136717677116394, | |
| "epoch": 7.464424320827943, | |
| "grad_norm": 0.7473369240760803, | |
| "learning_rate": 2.353110848257267e-06, | |
| "loss": 1.3413, | |
| "mean_token_accuracy": 0.5824255973100663, | |
| "num_tokens": 16552018.0, | |
| "step": 5770 | |
| }, | |
| { | |
| "entropy": 0.9227760046720505, | |
| "epoch": 7.477360931435964, | |
| "grad_norm": 0.982836902141571, | |
| "learning_rate": 2.241494701191127e-06, | |
| "loss": 0.9069, | |
| "mean_token_accuracy": 0.7623407855629921, | |
| "num_tokens": 16586256.0, | |
| "step": 5780 | |
| }, | |
| { | |
| "entropy": 1.1348280161619186, | |
| "epoch": 7.490297542043985, | |
| "grad_norm": 1.1100831031799316, | |
| "learning_rate": 2.1325604405523334e-06, | |
| "loss": 1.1069, | |
| "mean_token_accuracy": 0.7201577231287957, | |
| "num_tokens": 16610709.0, | |
| "step": 5790 | |
| }, | |
| { | |
| "entropy": 1.771338665485382, | |
| "epoch": 7.503234152652006, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.026311054820629e-06, | |
| "loss": 1.411, | |
| "mean_token_accuracy": 0.5635204806923866, | |
| "num_tokens": 16620269.0, | |
| "step": 5800 | |
| }, | |
| { | |
| "entropy": 1.7322617769241333, | |
| "epoch": 7.516170763260026, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.922749458819506e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 16620909.0, | |
| "step": 5810 | |
| }, | |
| { | |
| "entropy": 1.4817992717027664, | |
| "epoch": 7.5291073738680465, | |
| "grad_norm": 0.756270170211792, | |
| "learning_rate": 1.8218784936361644e-06, | |
| "loss": 1.353, | |
| "mean_token_accuracy": 0.5787275157868862, | |
| "num_tokens": 16690569.0, | |
| "step": 5820 | |
| }, | |
| { | |
| "entropy": 0.9674558937549591, | |
| "epoch": 7.542043984476067, | |
| "grad_norm": 0.8812004327774048, | |
| "learning_rate": 1.7237009265436032e-06, | |
| "loss": 0.9613, | |
| "mean_token_accuracy": 0.7560465827584266, | |
| "num_tokens": 16724649.0, | |
| "step": 5830 | |
| }, | |
| { | |
| "entropy": 1.1716067418456078, | |
| "epoch": 7.554980595084088, | |
| "grad_norm": 1.0925747156143188, | |
| "learning_rate": 1.6282194509247063e-06, | |
| "loss": 1.1436, | |
| "mean_token_accuracy": 0.7135581076145172, | |
| "num_tokens": 16749582.0, | |
| "step": 5840 | |
| }, | |
| { | |
| "entropy": 1.6912678241729737, | |
| "epoch": 7.567917205692108, | |
| "grad_norm": 1.6889742612838745, | |
| "learning_rate": 1.5354366861983438e-06, | |
| "loss": 1.5003, | |
| "mean_token_accuracy": 0.6513200134038926, | |
| "num_tokens": 16760847.0, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 7.567917205692108, | |
| "eval_entropy": 1.4259126506919084, | |
| "eval_loss": 1.2301470041275024, | |
| "eval_mean_token_accuracy": 0.4896806857093822, | |
| "eval_num_tokens": 16760847.0, | |
| "eval_runtime": 246.4439, | |
| "eval_samples_per_second": 22.301, | |
| "eval_steps_per_second": 1.396, | |
| "step": 5850 | |
| }, | |
| { | |
| "entropy": 1.7190734058618546, | |
| "epoch": 7.580853816300129, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.4453551777475094e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 16761487.0, | |
| "step": 5860 | |
| }, | |
| { | |
| "entropy": 1.5320835530757904, | |
| "epoch": 7.59379042690815, | |
| "grad_norm": 0.7591171264648438, | |
| "learning_rate": 1.3579773968495191e-06, | |
| "loss": 1.3913, | |
| "mean_token_accuracy": 0.5738878205418587, | |
| "num_tokens": 16833368.0, | |
| "step": 5870 | |
| }, | |
| { | |
| "entropy": 0.9359873235225677, | |
| "epoch": 7.606727037516171, | |
| "grad_norm": 0.9182559847831726, | |
| "learning_rate": 1.2733057406081438e-06, | |
| "loss": 0.9307, | |
| "mean_token_accuracy": 0.7633048981428147, | |
| "num_tokens": 16867272.0, | |
| "step": 5880 | |
| }, | |
| { | |
| "entropy": 1.1327362582087517, | |
| "epoch": 7.619663648124192, | |
| "grad_norm": 1.0494729280471802, | |
| "learning_rate": 1.1913425318879511e-06, | |
| "loss": 1.1095, | |
| "mean_token_accuracy": 0.7176593467593193, | |
| "num_tokens": 16892030.0, | |
| "step": 5890 | |
| }, | |
| { | |
| "entropy": 1.7231059432029725, | |
| "epoch": 7.632600258732213, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.1120900192505e-06, | |
| "loss": 1.3184, | |
| "mean_token_accuracy": 0.5641655296087265, | |
| "num_tokens": 16901989.0, | |
| "step": 5900 | |
| }, | |
| { | |
| "entropy": 1.7543556302785874, | |
| "epoch": 7.645536869340233, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.0355503768926466e-06, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 16902629.0, | |
| "step": 5910 | |
| }, | |
| { | |
| "entropy": 1.4874625369906425, | |
| "epoch": 7.6584734799482534, | |
| "grad_norm": 0.7250481843948364, | |
| "learning_rate": 9.61725704587002e-07, | |
| "loss": 1.3483, | |
| "mean_token_accuracy": 0.5808299452066421, | |
| "num_tokens": 16975429.0, | |
| "step": 5920 | |
| }, | |
| { | |
| "entropy": 0.940713207423687, | |
| "epoch": 7.671410090556274, | |
| "grad_norm": 0.9228203296661377, | |
| "learning_rate": 8.906180276242015e-07, | |
| "loss": 0.9271, | |
| "mean_token_accuracy": 0.760072472691536, | |
| "num_tokens": 17009886.0, | |
| "step": 5930 | |
| }, | |
| { | |
| "entropy": 1.1436687961220742, | |
| "epoch": 7.684346701164295, | |
| "grad_norm": 1.0997246503829956, | |
| "learning_rate": 8.22229296757393e-07, | |
| "loss": 1.1408, | |
| "mean_token_accuracy": 0.7107081711292267, | |
| "num_tokens": 17034678.0, | |
| "step": 5940 | |
| }, | |
| { | |
| "entropy": 1.73554485142231, | |
| "epoch": 7.697283311772315, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.565613881487687e-07, | |
| "loss": 1.365, | |
| "mean_token_accuracy": 0.5842878207564354, | |
| "num_tokens": 17044424.0, | |
| "step": 5950 | |
| }, | |
| { | |
| "entropy": 1.7472249418497086, | |
| "epoch": 7.710219922380336, | |
| "grad_norm": 0.0, | |
| "learning_rate": 6.936161033180066e-07, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 17045064.0, | |
| "step": 5960 | |
| }, | |
| { | |
| "entropy": 1.5308178260922432, | |
| "epoch": 7.723156532988357, | |
| "grad_norm": 0.7369622588157654, | |
| "learning_rate": 6.333951690929318e-07, | |
| "loss": 1.3944, | |
| "mean_token_accuracy": 0.571716184169054, | |
| "num_tokens": 17121301.0, | |
| "step": 5970 | |
| }, | |
| { | |
| "entropy": 0.9166033461689949, | |
| "epoch": 7.736093143596378, | |
| "grad_norm": 0.8718245625495911, | |
| "learning_rate": 5.759002375620548e-07, | |
| "loss": 0.9191, | |
| "mean_token_accuracy": 0.7659956023097039, | |
| "num_tokens": 17155878.0, | |
| "step": 5980 | |
| }, | |
| { | |
| "entropy": 1.1351210102438927, | |
| "epoch": 7.749029754204399, | |
| "grad_norm": 1.1139835119247437, | |
| "learning_rate": 5.211328860293519e-07, | |
| "loss": 1.0937, | |
| "mean_token_accuracy": 0.7179104581475257, | |
| "num_tokens": 17180817.0, | |
| "step": 5990 | |
| }, | |
| { | |
| "entropy": 1.7042000949382783, | |
| "epoch": 7.7619663648124195, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.6909461697088874e-07, | |
| "loss": 1.2978, | |
| "mean_token_accuracy": 0.5402273468673229, | |
| "num_tokens": 17190238.0, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 7.7619663648124195, | |
| "eval_entropy": 1.4222364893486334, | |
| "eval_loss": 1.230813980102539, | |
| "eval_mean_token_accuracy": 0.483534776973863, | |
| "eval_num_tokens": 17190238.0, | |
| "eval_runtime": 243.8499, | |
| "eval_samples_per_second": 22.538, | |
| "eval_steps_per_second": 1.411, | |
| "step": 6000 | |
| }, | |
| { | |
| "entropy": 1.7714763969182967, | |
| "epoch": 7.7749029754204395, | |
| "grad_norm": 0.0, | |
| "learning_rate": 4.197868579936981e-07, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 17190878.0, | |
| "step": 6010 | |
| }, | |
| { | |
| "entropy": 1.5036073312163354, | |
| "epoch": 7.78783958602846, | |
| "grad_norm": 0.7586703896522522, | |
| "learning_rate": 3.732109617965218e-07, | |
| "loss": 1.3917, | |
| "mean_token_accuracy": 0.5730986759066582, | |
| "num_tokens": 17262910.0, | |
| "step": 6020 | |
| }, | |
| { | |
| "entropy": 0.9327082589268685, | |
| "epoch": 7.800776196636481, | |
| "grad_norm": 0.8685732483863831, | |
| "learning_rate": 3.293682061327963e-07, | |
| "loss": 0.9333, | |
| "mean_token_accuracy": 0.7620440036058426, | |
| "num_tokens": 17296857.0, | |
| "step": 6030 | |
| }, | |
| { | |
| "entropy": 1.177341391146183, | |
| "epoch": 7.813712807244502, | |
| "grad_norm": 1.1222566366195679, | |
| "learning_rate": 2.882597937755249e-07, | |
| "loss": 1.1641, | |
| "mean_token_accuracy": 0.7064913615584374, | |
| "num_tokens": 17321218.0, | |
| "step": 6040 | |
| }, | |
| { | |
| "entropy": 1.7008673965930938, | |
| "epoch": 7.826649417852523, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.498868524843045e-07, | |
| "loss": 1.2135, | |
| "mean_token_accuracy": 0.5372394770383835, | |
| "num_tokens": 17329684.0, | |
| "step": 6050 | |
| }, | |
| { | |
| "entropy": 1.7468272864818573, | |
| "epoch": 7.839586028460543, | |
| "grad_norm": 0.0, | |
| "learning_rate": 2.1425043497439456e-07, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 17330324.0, | |
| "step": 6060 | |
| }, | |
| { | |
| "entropy": 1.5415200561285018, | |
| "epoch": 7.852522639068564, | |
| "grad_norm": 0.7708677649497986, | |
| "learning_rate": 1.8135151888782899e-07, | |
| "loss": 1.3837, | |
| "mean_token_accuracy": 0.574844229221344, | |
| "num_tokens": 17408721.0, | |
| "step": 6070 | |
| }, | |
| { | |
| "entropy": 0.9075698807835579, | |
| "epoch": 7.865459249676585, | |
| "grad_norm": 0.8989212512969971, | |
| "learning_rate": 1.5119100676662667e-07, | |
| "loss": 0.8899, | |
| "mean_token_accuracy": 0.771544449031353, | |
| "num_tokens": 17442757.0, | |
| "step": 6080 | |
| }, | |
| { | |
| "entropy": 1.1743381530046464, | |
| "epoch": 7.878395860284606, | |
| "grad_norm": 1.025661826133728, | |
| "learning_rate": 1.2376972602795578e-07, | |
| "loss": 1.1425, | |
| "mean_token_accuracy": 0.7124027162790298, | |
| "num_tokens": 17467049.0, | |
| "step": 6090 | |
| }, | |
| { | |
| "entropy": 1.7484049052000046, | |
| "epoch": 7.8913324708926265, | |
| "grad_norm": 0.0, | |
| "learning_rate": 9.908842894151837e-08, | |
| "loss": 1.3114, | |
| "mean_token_accuracy": 0.5641379207372665, | |
| "num_tokens": 17475616.0, | |
| "step": 6100 | |
| }, | |
| { | |
| "entropy": 1.7715317398309707, | |
| "epoch": 7.904269081500646, | |
| "grad_norm": 0.0, | |
| "learning_rate": 7.714779260886707e-08, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 17476256.0, | |
| "step": 6110 | |
| }, | |
| { | |
| "entropy": 1.5000860661268234, | |
| "epoch": 7.917205692108667, | |
| "grad_norm": 0.7349119186401367, | |
| "learning_rate": 5.7948418944842043e-08, | |
| "loss": 1.3548, | |
| "mean_token_accuracy": 0.5794984824955464, | |
| "num_tokens": 17546950.0, | |
| "step": 6120 | |
| }, | |
| { | |
| "entropy": 0.9138670772314071, | |
| "epoch": 7.930142302716688, | |
| "grad_norm": 0.8542500138282776, | |
| "learning_rate": 4.149083466105097e-08, | |
| "loss": 0.9021, | |
| "mean_token_accuracy": 0.770347698032856, | |
| "num_tokens": 17581293.0, | |
| "step": 6130 | |
| }, | |
| { | |
| "entropy": 1.1947215780615807, | |
| "epoch": 7.943078913324709, | |
| "grad_norm": 1.0435749292373657, | |
| "learning_rate": 2.7775491251413877e-08, | |
| "loss": 1.1687, | |
| "mean_token_accuracy": 0.7094842702150345, | |
| "num_tokens": 17605803.0, | |
| "step": 6140 | |
| }, | |
| { | |
| "entropy": 1.6835207402706147, | |
| "epoch": 7.95601552393273, | |
| "grad_norm": 0.0, | |
| "learning_rate": 1.6802764979817474e-08, | |
| "loss": 1.1704, | |
| "mean_token_accuracy": 0.5183229476213456, | |
| "num_tokens": 17613695.0, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 7.95601552393273, | |
| "eval_entropy": 1.4208284545429917, | |
| "eval_loss": 1.2304351329803467, | |
| "eval_mean_token_accuracy": 0.4861882030097551, | |
| "eval_num_tokens": 17613695.0, | |
| "eval_runtime": 244.9318, | |
| "eval_samples_per_second": 22.439, | |
| "eval_steps_per_second": 1.404, | |
| "step": 6150 | |
| }, | |
| { | |
| "entropy": 1.7820782691240311, | |
| "epoch": 7.96895213454075, | |
| "grad_norm": 0.0, | |
| "learning_rate": 8.572956869734583e-09, | |
| "loss": 0.0, | |
| "mean_token_accuracy": 0.0, | |
| "num_tokens": 17614335.0, | |
| "step": 6160 | |
| }, | |
| { | |
| "entropy": 1.2753556087613105, | |
| "epoch": 7.981888745148771, | |
| "grad_norm": 0.9358561635017395, | |
| "learning_rate": 3.0862926959973617e-09, | |
| "loss": 1.1173, | |
| "mean_token_accuracy": 0.6308311700820923, | |
| "num_tokens": 17667096.0, | |
| "step": 6170 | |
| }, | |
| { | |
| "entropy": 1.4832376271486283, | |
| "epoch": 7.994825355756792, | |
| "grad_norm": 0.0, | |
| "learning_rate": 3.429229786133803e-10, | |
| "loss": 1.055, | |
| "mean_token_accuracy": 0.5700831845402717, | |
| "num_tokens": 17681630.0, | |
| "step": 6180 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 6184, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 600, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.974075450217726e+18, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |