| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.0, |
| "eval_steps": 1000, |
| "global_step": 26795, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.6402970719337463, |
| "epoch": 0.009330098899048329, |
| "grad_norm": 1.015625, |
| "learning_rate": 1.47e-05, |
| "loss": 0.778, |
| "mean_token_accuracy": 0.7212734770774841, |
| "num_tokens": 1548877.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.5775007322430611, |
| "epoch": 0.018660197798096658, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.97e-05, |
| "loss": 0.5918, |
| "mean_token_accuracy": 0.7543947434425354, |
| "num_tokens": 3018955.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.5692345011234283, |
| "epoch": 0.02799029669714499, |
| "grad_norm": 0.98046875, |
| "learning_rate": 2.999977554224005e-05, |
| "loss": 0.5835, |
| "mean_token_accuracy": 0.753638728260994, |
| "num_tokens": 4563356.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.570181995332241, |
| "epoch": 0.037320395596193316, |
| "grad_norm": 0.66015625, |
| "learning_rate": 2.999908376021796e-05, |
| "loss": 0.5835, |
| "mean_token_accuracy": 0.7521764719486237, |
| "num_tokens": 6114611.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.5772185748815537, |
| "epoch": 0.04665049449524165, |
| "grad_norm": 1.3125, |
| "learning_rate": 2.9997924584400694e-05, |
| "loss": 0.5841, |
| "mean_token_accuracy": 0.7549300968647004, |
| "num_tokens": 7635855.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.5723973855376243, |
| "epoch": 0.05598059339428998, |
| "grad_norm": 0.81640625, |
| "learning_rate": 2.9996298054923684e-05, |
| "loss": 0.5865, |
| "mean_token_accuracy": 0.7571439111232757, |
| "num_tokens": 9098584.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.5691715434193612, |
| "epoch": 0.06531069229333832, |
| "grad_norm": 0.625, |
| "learning_rate": 2.9994204228104075e-05, |
| "loss": 0.583, |
| "mean_token_accuracy": 0.7545554572343827, |
| "num_tokens": 10604199.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.5754752764105797, |
| "epoch": 0.07464079119238663, |
| "grad_norm": 1.25, |
| "learning_rate": 2.9991643176438752e-05, |
| "loss": 0.5852, |
| "mean_token_accuracy": 0.7542591279745102, |
| "num_tokens": 12131126.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.5670795711874962, |
| "epoch": 0.08397089009143496, |
| "grad_norm": 0.68359375, |
| "learning_rate": 2.9988614988601868e-05, |
| "loss": 0.5779, |
| "mean_token_accuracy": 0.7581483513116837, |
| "num_tokens": 13642307.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.5779001507163047, |
| "epoch": 0.0933009889904833, |
| "grad_norm": 0.431640625, |
| "learning_rate": 2.998511976944173e-05, |
| "loss": 0.5874, |
| "mean_token_accuracy": 0.7504317510128021, |
| "num_tokens": 15217615.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.5665371876955032, |
| "epoch": 0.10263108788953162, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.998115763997721e-05, |
| "loss": 0.5851, |
| "mean_token_accuracy": 0.7549385547637939, |
| "num_tokens": 16724810.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.5638798615336418, |
| "epoch": 0.11196118678857996, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.9976728737393515e-05, |
| "loss": 0.5756, |
| "mean_token_accuracy": 0.757421538233757, |
| "num_tokens": 18270004.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.5710726794600487, |
| "epoch": 0.12129128568762829, |
| "grad_norm": 0.76171875, |
| "learning_rate": 2.997183321503747e-05, |
| "loss": 0.5854, |
| "mean_token_accuracy": 0.7553139424324036, |
| "num_tokens": 19791398.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.5717519807815552, |
| "epoch": 0.13062138458667663, |
| "grad_norm": 1.28125, |
| "learning_rate": 2.9966471242412192e-05, |
| "loss": 0.5828, |
| "mean_token_accuracy": 0.7528650748729706, |
| "num_tokens": 21373941.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.5660536578297615, |
| "epoch": 0.13995148348572495, |
| "grad_norm": 1.265625, |
| "learning_rate": 2.996064300517122e-05, |
| "loss": 0.5858, |
| "mean_token_accuracy": 0.7544754481315613, |
| "num_tokens": 22872328.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.5696932604908943, |
| "epoch": 0.14928158238477326, |
| "grad_norm": 0.953125, |
| "learning_rate": 2.995434870511211e-05, |
| "loss": 0.5881, |
| "mean_token_accuracy": 0.7530353850126267, |
| "num_tokens": 24433349.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.5728991779685021, |
| "epoch": 0.1586116812838216, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.9947588560169395e-05, |
| "loss": 0.5841, |
| "mean_token_accuracy": 0.753363783955574, |
| "num_tokens": 26038373.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 0.5628881072998047, |
| "epoch": 0.16794178018286993, |
| "grad_norm": 0.6328125, |
| "learning_rate": 2.994036280440711e-05, |
| "loss": 0.573, |
| "mean_token_accuracy": 0.7580157500505448, |
| "num_tokens": 27568454.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 0.5649524646997451, |
| "epoch": 0.17727187908191827, |
| "grad_norm": 0.95703125, |
| "learning_rate": 2.9932671688010632e-05, |
| "loss": 0.5766, |
| "mean_token_accuracy": 0.7574625754356384, |
| "num_tokens": 29049728.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 0.5648883840441704, |
| "epoch": 0.1866019779809666, |
| "grad_norm": 0.99609375, |
| "learning_rate": 2.992451547727804e-05, |
| "loss": 0.5883, |
| "mean_token_accuracy": 0.7526043313741684, |
| "num_tokens": 30603198.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1866019779809666, |
| "eval_entropy": 0.5557291887768911, |
| "eval_loss": 0.5737926959991455, |
| "eval_mean_token_accuracy": 0.758812694921406, |
| "eval_num_tokens": 30603198.0, |
| "eval_runtime": 16.2769, |
| "eval_samples_per_second": 53.266, |
| "eval_steps_per_second": 6.697, |
| "step": 1000 |
| }, |
| { |
| "entropy": 0.5666726857423783, |
| "epoch": 0.19593207688001493, |
| "grad_norm": 0.609375, |
| "learning_rate": 2.9915894454610887e-05, |
| "loss": 0.5764, |
| "mean_token_accuracy": 0.7547625786066056, |
| "num_tokens": 32144851.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 0.561416018307209, |
| "epoch": 0.20526217577906325, |
| "grad_norm": 0.55859375, |
| "learning_rate": 2.990680891850444e-05, |
| "loss": 0.5794, |
| "mean_token_accuracy": 0.75654057264328, |
| "num_tokens": 33639533.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 0.5599774518609046, |
| "epoch": 0.2145922746781116, |
| "grad_norm": 1.4140625, |
| "learning_rate": 2.9897259183537322e-05, |
| "loss": 0.5772, |
| "mean_token_accuracy": 0.7588758039474487, |
| "num_tokens": 35130975.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 0.5628740054368973, |
| "epoch": 0.2239223735771599, |
| "grad_norm": 0.91015625, |
| "learning_rate": 2.9887245580360623e-05, |
| "loss": 0.581, |
| "mean_token_accuracy": 0.7554371774196624, |
| "num_tokens": 36651189.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 0.5624552240967751, |
| "epoch": 0.23325247247620826, |
| "grad_norm": 0.92578125, |
| "learning_rate": 2.9876768455686477e-05, |
| "loss": 0.5731, |
| "mean_token_accuracy": 0.7572768718004227, |
| "num_tokens": 38172935.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 0.5497725516557693, |
| "epoch": 0.24258257137525657, |
| "grad_norm": 0.8671875, |
| "learning_rate": 2.9865828172276023e-05, |
| "loss": 0.569, |
| "mean_token_accuracy": 0.7618441820144654, |
| "num_tokens": 39653342.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 0.5542441910505295, |
| "epoch": 0.2519126702743049, |
| "grad_norm": 0.84375, |
| "learning_rate": 2.9854425108926863e-05, |
| "loss": 0.5732, |
| "mean_token_accuracy": 0.762292046546936, |
| "num_tokens": 41090294.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 0.5551298156380653, |
| "epoch": 0.26124276917335326, |
| "grad_norm": 0.9375, |
| "learning_rate": 2.984255966045995e-05, |
| "loss": 0.5773, |
| "mean_token_accuracy": 0.755358315706253, |
| "num_tokens": 42625920.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 0.5527626049518585, |
| "epoch": 0.27057286807240155, |
| "grad_norm": 0.54296875, |
| "learning_rate": 2.9830232237705904e-05, |
| "loss": 0.5724, |
| "mean_token_accuracy": 0.7588168692588806, |
| "num_tokens": 44134645.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 0.5561913156509399, |
| "epoch": 0.2799029669714499, |
| "grad_norm": 0.396484375, |
| "learning_rate": 2.9817443267490797e-05, |
| "loss": 0.5742, |
| "mean_token_accuracy": 0.7577809965610505, |
| "num_tokens": 45605580.0, |
| "step": 1500 |
| }, |
| { |
| "entropy": 0.5598691233992577, |
| "epoch": 0.28923306587049824, |
| "grad_norm": 0.94921875, |
| "learning_rate": 2.9804193192621376e-05, |
| "loss": 0.5746, |
| "mean_token_accuracy": 0.7551334691047669, |
| "num_tokens": 47144264.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 0.554311693906784, |
| "epoch": 0.29856316476954653, |
| "grad_norm": 0.98828125, |
| "learning_rate": 2.979048247186972e-05, |
| "loss": 0.5691, |
| "mean_token_accuracy": 0.7589048826694489, |
| "num_tokens": 48660197.0, |
| "step": 1600 |
| }, |
| { |
| "entropy": 0.5676783239841461, |
| "epoch": 0.3078932636685949, |
| "grad_norm": 1.09375, |
| "learning_rate": 2.9776311579957372e-05, |
| "loss": 0.5797, |
| "mean_token_accuracy": 0.7567919301986694, |
| "num_tokens": 50152863.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 0.5468079242110252, |
| "epoch": 0.3172233625676432, |
| "grad_norm": 1.2421875, |
| "learning_rate": 2.976168100753889e-05, |
| "loss": 0.5677, |
| "mean_token_accuracy": 0.7582010948657989, |
| "num_tokens": 51722410.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 0.5553153255581855, |
| "epoch": 0.32655346146669156, |
| "grad_norm": 0.97265625, |
| "learning_rate": 2.974659126118485e-05, |
| "loss": 0.5696, |
| "mean_token_accuracy": 0.7591327953338624, |
| "num_tokens": 53257454.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 0.553457222878933, |
| "epoch": 0.33588356036573985, |
| "grad_norm": 0.49609375, |
| "learning_rate": 2.973104286336433e-05, |
| "loss": 0.5725, |
| "mean_token_accuracy": 0.7564568722248077, |
| "num_tokens": 54784162.0, |
| "step": 1800 |
| }, |
| { |
| "entropy": 0.5699556747078895, |
| "epoch": 0.3452136592647882, |
| "grad_norm": 0.69140625, |
| "learning_rate": 2.971503635242682e-05, |
| "loss": 0.5756, |
| "mean_token_accuracy": 0.7566489219665528, |
| "num_tokens": 56304516.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 0.5491332325339318, |
| "epoch": 0.35454375816383654, |
| "grad_norm": 0.7265625, |
| "learning_rate": 2.9698572282583534e-05, |
| "loss": 0.5683, |
| "mean_token_accuracy": 0.7583828049898148, |
| "num_tokens": 57819074.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 0.5511914587020874, |
| "epoch": 0.3638738570628849, |
| "grad_norm": 0.7265625, |
| "learning_rate": 2.9681651223888298e-05, |
| "loss": 0.571, |
| "mean_token_accuracy": 0.7572992449998855, |
| "num_tokens": 59346739.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 0.5618056333065033, |
| "epoch": 0.3732039559619332, |
| "grad_norm": 0.72265625, |
| "learning_rate": 2.966427376221774e-05, |
| "loss": 0.5792, |
| "mean_token_accuracy": 0.7539066845178604, |
| "num_tokens": 60876192.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.3732039559619332, |
| "eval_entropy": 0.5431396165572175, |
| "eval_loss": 0.5645309686660767, |
| "eval_mean_token_accuracy": 0.7612812196442841, |
| "eval_num_tokens": 60876192.0, |
| "eval_runtime": 16.0974, |
| "eval_samples_per_second": 53.86, |
| "eval_steps_per_second": 6.771, |
| "step": 2000 |
| }, |
| { |
| "entropy": 0.5605012658238411, |
| "epoch": 0.3825340548609815, |
| "grad_norm": 0.578125, |
| "learning_rate": 2.9646440499251056e-05, |
| "loss": 0.5912, |
| "mean_token_accuracy": 0.7569118171930314, |
| "num_tokens": 62368509.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 0.5493465921282769, |
| "epoch": 0.39186415376002987, |
| "grad_norm": 1.1875, |
| "learning_rate": 2.9628152052449148e-05, |
| "loss": 0.5668, |
| "mean_token_accuracy": 0.7606059044599534, |
| "num_tokens": 63850871.0, |
| "step": 2100 |
| }, |
| { |
| "entropy": 0.5519525390863419, |
| "epoch": 0.4011942526590782, |
| "grad_norm": 0.8359375, |
| "learning_rate": 2.960940905503325e-05, |
| "loss": 0.5736, |
| "mean_token_accuracy": 0.7569921463727951, |
| "num_tokens": 65368844.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 0.5381503540277481, |
| "epoch": 0.4105243515581265, |
| "grad_norm": 0.6953125, |
| "learning_rate": 2.9590212155963024e-05, |
| "loss": 0.5602, |
| "mean_token_accuracy": 0.7622320890426636, |
| "num_tokens": 66879984.0, |
| "step": 2200 |
| }, |
| { |
| "entropy": 0.5599228474497795, |
| "epoch": 0.41985445045717484, |
| "grad_norm": 0.5390625, |
| "learning_rate": 2.9570562019914053e-05, |
| "loss": 0.5736, |
| "mean_token_accuracy": 0.7590676909685135, |
| "num_tokens": 68396429.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 0.5572463124990463, |
| "epoch": 0.4291845493562232, |
| "grad_norm": 0.458984375, |
| "learning_rate": 2.9550459327254864e-05, |
| "loss": 0.5732, |
| "mean_token_accuracy": 0.7579269409179688, |
| "num_tokens": 69905569.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 0.5512515944242478, |
| "epoch": 0.43851464825527153, |
| "grad_norm": 0.83203125, |
| "learning_rate": 2.9529904774023353e-05, |
| "loss": 0.5715, |
| "mean_token_accuracy": 0.7546812242269516, |
| "num_tokens": 71496156.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 0.5452529183030128, |
| "epoch": 0.4478447471543198, |
| "grad_norm": 1.453125, |
| "learning_rate": 2.9508899071902684e-05, |
| "loss": 0.5667, |
| "mean_token_accuracy": 0.7611679089069366, |
| "num_tokens": 72993303.0, |
| "step": 2400 |
| }, |
| { |
| "entropy": 0.5496764704585075, |
| "epoch": 0.45717484605336817, |
| "grad_norm": 0.80078125, |
| "learning_rate": 2.9487442948196643e-05, |
| "loss": 0.5677, |
| "mean_token_accuracy": 0.7588638842105866, |
| "num_tokens": 74502630.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 0.5490121757984161, |
| "epoch": 0.4665049449524165, |
| "grad_norm": 0.75390625, |
| "learning_rate": 2.9465537145804476e-05, |
| "loss": 0.5685, |
| "mean_token_accuracy": 0.7586365014314651, |
| "num_tokens": 76031000.0, |
| "step": 2500 |
| }, |
| { |
| "entropy": 0.5405518284440041, |
| "epoch": 0.4758350438514648, |
| "grad_norm": 1.0234375, |
| "learning_rate": 2.944318242319515e-05, |
| "loss": 0.562, |
| "mean_token_accuracy": 0.7639656978845596, |
| "num_tokens": 77482894.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 0.5369963318109512, |
| "epoch": 0.48516514275051315, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.94203795543811e-05, |
| "loss": 0.5595, |
| "mean_token_accuracy": 0.7631033205986023, |
| "num_tokens": 78956007.0, |
| "step": 2600 |
| }, |
| { |
| "entropy": 0.5459193900227547, |
| "epoch": 0.4944952416495615, |
| "grad_norm": 0.63671875, |
| "learning_rate": 2.939712932889142e-05, |
| "loss": 0.5678, |
| "mean_token_accuracy": 0.7564943873882294, |
| "num_tokens": 80549485.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 0.5403485292196274, |
| "epoch": 0.5038253405486098, |
| "grad_norm": 0.73828125, |
| "learning_rate": 2.937343255174453e-05, |
| "loss": 0.5665, |
| "mean_token_accuracy": 0.7631701147556305, |
| "num_tokens": 81987729.0, |
| "step": 2700 |
| }, |
| { |
| "entropy": 0.5511571237444878, |
| "epoch": 0.5131554394476582, |
| "grad_norm": 0.875, |
| "learning_rate": 2.9349290043420315e-05, |
| "loss": 0.5718, |
| "mean_token_accuracy": 0.7589112591743469, |
| "num_tokens": 83566503.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 0.545387190580368, |
| "epoch": 0.5224855383467065, |
| "grad_norm": 1.0390625, |
| "learning_rate": 2.932470263983169e-05, |
| "loss": 0.578, |
| "mean_token_accuracy": 0.7594633424282073, |
| "num_tokens": 85045132.0, |
| "step": 2800 |
| }, |
| { |
| "entropy": 0.5400431799888611, |
| "epoch": 0.5318156372457548, |
| "grad_norm": 0.60546875, |
| "learning_rate": 2.929967119229569e-05, |
| "loss": 0.5639, |
| "mean_token_accuracy": 0.7595540487766266, |
| "num_tokens": 86552294.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 0.5613244980573654, |
| "epoch": 0.5411457361448031, |
| "grad_norm": 1.0, |
| "learning_rate": 2.9274196567503974e-05, |
| "loss": 0.5882, |
| "mean_token_accuracy": 0.7518465319275855, |
| "num_tokens": 88138088.0, |
| "step": 2900 |
| }, |
| { |
| "entropy": 0.5406323432922363, |
| "epoch": 0.5504758350438514, |
| "grad_norm": 0.68359375, |
| "learning_rate": 2.9248279647492817e-05, |
| "loss": 0.563, |
| "mean_token_accuracy": 0.7594792503118515, |
| "num_tokens": 89629470.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 0.5328826600313187, |
| "epoch": 0.5598059339428998, |
| "grad_norm": 0.7890625, |
| "learning_rate": 2.9221921329612568e-05, |
| "loss": 0.559, |
| "mean_token_accuracy": 0.7616329395771027, |
| "num_tokens": 91153981.0, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.5598059339428998, |
| "eval_entropy": 0.5315866710942819, |
| "eval_loss": 0.5568196773529053, |
| "eval_mean_token_accuracy": 0.7634841575535065, |
| "eval_num_tokens": 91153981.0, |
| "eval_runtime": 16.3141, |
| "eval_samples_per_second": 53.144, |
| "eval_steps_per_second": 6.681, |
| "step": 3000 |
| }, |
| { |
| "entropy": 0.5549379280209541, |
| "epoch": 0.5691360328419481, |
| "grad_norm": 0.640625, |
| "learning_rate": 2.9195122526496596e-05, |
| "loss": 0.571, |
| "mean_token_accuracy": 0.7589174765348434, |
| "num_tokens": 92704541.0, |
| "step": 3050 |
| }, |
| { |
| "entropy": 0.5324976027011872, |
| "epoch": 0.5784661317409965, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.9167884166029674e-05, |
| "loss": 0.558, |
| "mean_token_accuracy": 0.766141871213913, |
| "num_tokens": 94114581.0, |
| "step": 3100 |
| }, |
| { |
| "entropy": 0.5359655514359474, |
| "epoch": 0.5877962306400448, |
| "grad_norm": 1.0546875, |
| "learning_rate": 2.9140207191315857e-05, |
| "loss": 0.5609, |
| "mean_token_accuracy": 0.7602073633670807, |
| "num_tokens": 95640629.0, |
| "step": 3150 |
| }, |
| { |
| "entropy": 0.5375589004158974, |
| "epoch": 0.5971263295390931, |
| "grad_norm": 1.0078125, |
| "learning_rate": 2.911209256064584e-05, |
| "loss": 0.5567, |
| "mean_token_accuracy": 0.7607348054647446, |
| "num_tokens": 97223569.0, |
| "step": 3200 |
| }, |
| { |
| "entropy": 0.5388145217299461, |
| "epoch": 0.6064564284381414, |
| "grad_norm": 0.73046875, |
| "learning_rate": 2.9083541247463754e-05, |
| "loss": 0.5612, |
| "mean_token_accuracy": 0.7596866941452026, |
| "num_tokens": 98767227.0, |
| "step": 3250 |
| }, |
| { |
| "entropy": 0.5369637748599052, |
| "epoch": 0.6157865273371897, |
| "grad_norm": 0.56640625, |
| "learning_rate": 2.9054554240333478e-05, |
| "loss": 0.5601, |
| "mean_token_accuracy": 0.7642514258623123, |
| "num_tokens": 100228436.0, |
| "step": 3300 |
| }, |
| { |
| "entropy": 0.5306126582622528, |
| "epoch": 0.6251166262362381, |
| "grad_norm": 1.3515625, |
| "learning_rate": 2.9025132542904414e-05, |
| "loss": 0.5548, |
| "mean_token_accuracy": 0.7639524918794632, |
| "num_tokens": 101762895.0, |
| "step": 3350 |
| }, |
| { |
| "entropy": 0.532108125090599, |
| "epoch": 0.6344467251352864, |
| "grad_norm": 0.48828125, |
| "learning_rate": 2.8995277173876718e-05, |
| "loss": 0.5565, |
| "mean_token_accuracy": 0.7622706252336502, |
| "num_tokens": 103288607.0, |
| "step": 3400 |
| }, |
| { |
| "entropy": 0.530606449842453, |
| "epoch": 0.6437768240343348, |
| "grad_norm": 1.1796875, |
| "learning_rate": 2.896498916696605e-05, |
| "loss": 0.5598, |
| "mean_token_accuracy": 0.763903112411499, |
| "num_tokens": 104756099.0, |
| "step": 3450 |
| }, |
| { |
| "entropy": 0.5285736629366875, |
| "epoch": 0.6531069229333831, |
| "grad_norm": 1.0703125, |
| "learning_rate": 2.8934269570867776e-05, |
| "loss": 0.5536, |
| "mean_token_accuracy": 0.7639499133825303, |
| "num_tokens": 106245797.0, |
| "step": 3500 |
| }, |
| { |
| "entropy": 0.5446360909938812, |
| "epoch": 0.6624370218324315, |
| "grad_norm": 2.125, |
| "learning_rate": 2.890311944922064e-05, |
| "loss": 0.5789, |
| "mean_token_accuracy": 0.7568975293636322, |
| "num_tokens": 107770481.0, |
| "step": 3550 |
| }, |
| { |
| "entropy": 0.5234782636165619, |
| "epoch": 0.6717671207314797, |
| "grad_norm": 1.7578125, |
| "learning_rate": 2.8871539880569963e-05, |
| "loss": 0.5532, |
| "mean_token_accuracy": 0.7649819606542587, |
| "num_tokens": 109269688.0, |
| "step": 3600 |
| }, |
| { |
| "entropy": 0.5393576291203499, |
| "epoch": 0.681097219630528, |
| "grad_norm": 0.69140625, |
| "learning_rate": 2.8839531958330277e-05, |
| "loss": 0.5617, |
| "mean_token_accuracy": 0.759439873099327, |
| "num_tokens": 110803400.0, |
| "step": 3650 |
| }, |
| { |
| "entropy": 0.539838764667511, |
| "epoch": 0.6904273185295764, |
| "grad_norm": 0.76171875, |
| "learning_rate": 2.880709679074749e-05, |
| "loss": 0.5631, |
| "mean_token_accuracy": 0.760960082411766, |
| "num_tokens": 112340326.0, |
| "step": 3700 |
| }, |
| { |
| "entropy": 0.5437505677342415, |
| "epoch": 0.6997574174286247, |
| "grad_norm": 0.9296875, |
| "learning_rate": 2.8774235500860494e-05, |
| "loss": 0.5656, |
| "mean_token_accuracy": 0.7594961816072464, |
| "num_tokens": 113873379.0, |
| "step": 3750 |
| }, |
| { |
| "entropy": 0.5357860559225083, |
| "epoch": 0.7090875163276731, |
| "grad_norm": 0.640625, |
| "learning_rate": 2.874094922646229e-05, |
| "loss": 0.5611, |
| "mean_token_accuracy": 0.7590708369016648, |
| "num_tokens": 115408557.0, |
| "step": 3800 |
| }, |
| { |
| "entropy": 0.5340870246291161, |
| "epoch": 0.7184176152267214, |
| "grad_norm": 0.953125, |
| "learning_rate": 2.870723912006058e-05, |
| "loss": 0.5552, |
| "mean_token_accuracy": 0.765527902841568, |
| "num_tokens": 116891513.0, |
| "step": 3850 |
| }, |
| { |
| "entropy": 0.5364308878779411, |
| "epoch": 0.7277477141257698, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.867310634883789e-05, |
| "loss": 0.5518, |
| "mean_token_accuracy": 0.7665286004543305, |
| "num_tokens": 118406575.0, |
| "step": 3900 |
| }, |
| { |
| "entropy": 0.5389542949199676, |
| "epoch": 0.7370778130248181, |
| "grad_norm": 0.390625, |
| "learning_rate": 2.863855209461113e-05, |
| "loss": 0.5628, |
| "mean_token_accuracy": 0.7604682886600495, |
| "num_tokens": 119888774.0, |
| "step": 3950 |
| }, |
| { |
| "entropy": 0.5373398035764694, |
| "epoch": 0.7464079119238664, |
| "grad_norm": 0.6484375, |
| "learning_rate": 2.8603577553790682e-05, |
| "loss": 0.5654, |
| "mean_token_accuracy": 0.7639918619394303, |
| "num_tokens": 121314895.0, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.7464079119238664, |
| "eval_entropy": 0.5262507380695518, |
| "eval_loss": 0.5490807890892029, |
| "eval_mean_token_accuracy": 0.7650549843770649, |
| "eval_num_tokens": 121314895.0, |
| "eval_runtime": 16.0402, |
| "eval_samples_per_second": 54.052, |
| "eval_steps_per_second": 6.795, |
| "step": 4000 |
| }, |
| { |
| "entropy": 0.5303604575991631, |
| "epoch": 0.7557380108229147, |
| "grad_norm": 0.65234375, |
| "learning_rate": 2.8568183937338984e-05, |
| "loss": 0.5609, |
| "mean_token_accuracy": 0.7618992066383362, |
| "num_tokens": 122793213.0, |
| "step": 4050 |
| }, |
| { |
| "entropy": 0.5326658990979195, |
| "epoch": 0.765068109721963, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.8532372470728608e-05, |
| "loss": 0.5552, |
| "mean_token_accuracy": 0.7656506180763245, |
| "num_tokens": 124227341.0, |
| "step": 4100 |
| }, |
| { |
| "entropy": 0.5209727981686592, |
| "epoch": 0.7743982086210114, |
| "grad_norm": 0.69140625, |
| "learning_rate": 2.8496144393899784e-05, |
| "loss": 0.5516, |
| "mean_token_accuracy": 0.7649285507202148, |
| "num_tokens": 125707121.0, |
| "step": 4150 |
| }, |
| { |
| "entropy": 0.5248242399096489, |
| "epoch": 0.7837283075200597, |
| "grad_norm": 0.734375, |
| "learning_rate": 2.8459500961217533e-05, |
| "loss": 0.5534, |
| "mean_token_accuracy": 0.7604096215963364, |
| "num_tokens": 127238194.0, |
| "step": 4200 |
| }, |
| { |
| "entropy": 0.532697811126709, |
| "epoch": 0.7930584064191081, |
| "grad_norm": 0.80078125, |
| "learning_rate": 2.842244344142819e-05, |
| "loss": 0.5622, |
| "mean_token_accuracy": 0.7587612766027451, |
| "num_tokens": 128737550.0, |
| "step": 4250 |
| }, |
| { |
| "entropy": 0.5316076844930648, |
| "epoch": 0.8023885053181564, |
| "grad_norm": 0.515625, |
| "learning_rate": 2.8384973117615488e-05, |
| "loss": 0.5562, |
| "mean_token_accuracy": 0.7626278126239776, |
| "num_tokens": 130249756.0, |
| "step": 4300 |
| }, |
| { |
| "entropy": 0.5310768684744835, |
| "epoch": 0.8117186042172047, |
| "grad_norm": 0.78515625, |
| "learning_rate": 2.8347091287156136e-05, |
| "loss": 0.5575, |
| "mean_token_accuracy": 0.7627124708890914, |
| "num_tokens": 131739377.0, |
| "step": 4350 |
| }, |
| { |
| "entropy": 0.5297018462419509, |
| "epoch": 0.821048703116253, |
| "grad_norm": 0.72265625, |
| "learning_rate": 2.8308799261674898e-05, |
| "loss": 0.5556, |
| "mean_token_accuracy": 0.7631601667404175, |
| "num_tokens": 133264527.0, |
| "step": 4400 |
| }, |
| { |
| "entropy": 0.5304474216699601, |
| "epoch": 0.8303788020153013, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.8270098366999166e-05, |
| "loss": 0.5597, |
| "mean_token_accuracy": 0.7665414202213288, |
| "num_tokens": 134690231.0, |
| "step": 4450 |
| }, |
| { |
| "entropy": 0.5239725235104561, |
| "epoch": 0.8397089009143497, |
| "grad_norm": 0.796875, |
| "learning_rate": 2.8230989943113075e-05, |
| "loss": 0.5517, |
| "mean_token_accuracy": 0.763014947772026, |
| "num_tokens": 136226470.0, |
| "step": 4500 |
| }, |
| { |
| "entropy": 0.527112789452076, |
| "epoch": 0.849038999813398, |
| "grad_norm": 0.84375, |
| "learning_rate": 2.8191475344111103e-05, |
| "loss": 0.5524, |
| "mean_token_accuracy": 0.7613210624456406, |
| "num_tokens": 137780275.0, |
| "step": 4550 |
| }, |
| { |
| "entropy": 0.5286294043064117, |
| "epoch": 0.8583690987124464, |
| "grad_norm": 0.68359375, |
| "learning_rate": 2.8151555938151165e-05, |
| "loss": 0.557, |
| "mean_token_accuracy": 0.7616584074497222, |
| "num_tokens": 139330494.0, |
| "step": 4600 |
| }, |
| { |
| "entropy": 0.5276841628551483, |
| "epoch": 0.8676991976114947, |
| "grad_norm": 0.609375, |
| "learning_rate": 2.811123310740726e-05, |
| "loss": 0.5495, |
| "mean_token_accuracy": 0.7647832882404327, |
| "num_tokens": 140815597.0, |
| "step": 4650 |
| }, |
| { |
| "entropy": 0.5373046767711639, |
| "epoch": 0.8770292965105431, |
| "grad_norm": 0.67578125, |
| "learning_rate": 2.807050824802163e-05, |
| "loss": 0.5624, |
| "mean_token_accuracy": 0.758348998427391, |
| "num_tokens": 142383943.0, |
| "step": 4700 |
| }, |
| { |
| "entropy": 0.5215965616703033, |
| "epoch": 0.8863593954095913, |
| "grad_norm": 0.6171875, |
| "learning_rate": 2.802938277005638e-05, |
| "loss": 0.5462, |
| "mean_token_accuracy": 0.7664449107646942, |
| "num_tokens": 143879403.0, |
| "step": 4750 |
| }, |
| { |
| "entropy": 0.5327892461419106, |
| "epoch": 0.8956894943086396, |
| "grad_norm": 0.38671875, |
| "learning_rate": 2.7987858097444688e-05, |
| "loss": 0.5618, |
| "mean_token_accuracy": 0.7579188454151153, |
| "num_tokens": 145455384.0, |
| "step": 4800 |
| }, |
| { |
| "entropy": 0.5209793072938919, |
| "epoch": 0.905019593207688, |
| "grad_norm": 0.90625, |
| "learning_rate": 2.794593566794149e-05, |
| "loss": 0.5502, |
| "mean_token_accuracy": 0.7619763416051865, |
| "num_tokens": 147010897.0, |
| "step": 4850 |
| }, |
| { |
| "entropy": 0.5109856846928597, |
| "epoch": 0.9143496921067363, |
| "grad_norm": 0.4375, |
| "learning_rate": 2.7903616933073712e-05, |
| "loss": 0.5471, |
| "mean_token_accuracy": 0.7652358949184418, |
| "num_tokens": 148509259.0, |
| "step": 4900 |
| }, |
| { |
| "entropy": 0.5274619281291961, |
| "epoch": 0.9236797910057847, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.786090335808998e-05, |
| "loss": 0.5546, |
| "mean_token_accuracy": 0.7621842390298843, |
| "num_tokens": 149982645.0, |
| "step": 4950 |
| }, |
| { |
| "entropy": 0.5342949241399765, |
| "epoch": 0.933009889904833, |
| "grad_norm": 0.58984375, |
| "learning_rate": 2.7817796421909922e-05, |
| "loss": 0.5682, |
| "mean_token_accuracy": 0.7593452525138855, |
| "num_tokens": 151532149.0, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.933009889904833, |
| "eval_entropy": 0.5146727717797691, |
| "eval_loss": 0.5413097739219666, |
| "eval_mean_token_accuracy": 0.767384243120841, |
| "eval_num_tokens": 151532149.0, |
| "eval_runtime": 16.1495, |
| "eval_samples_per_second": 53.686, |
| "eval_steps_per_second": 6.749, |
| "step": 5000 |
| }, |
| { |
| "entropy": 0.5231131237745285, |
| "epoch": 0.9423399888038814, |
| "grad_norm": 0.408203125, |
| "learning_rate": 2.7774297617072963e-05, |
| "loss": 0.554, |
| "mean_token_accuracy": 0.7625243580341339, |
| "num_tokens": 153059857.0, |
| "step": 5050 |
| }, |
| { |
| "entropy": 0.5220058736205101, |
| "epoch": 0.9516700877029296, |
| "grad_norm": 1.140625, |
| "learning_rate": 2.7730408449686593e-05, |
| "loss": 0.5496, |
| "mean_token_accuracy": 0.7626436889171601, |
| "num_tokens": 154578128.0, |
| "step": 5100 |
| }, |
| { |
| "entropy": 0.5289176645874977, |
| "epoch": 0.961000186601978, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.7686130439374304e-05, |
| "loss": 0.5555, |
| "mean_token_accuracy": 0.7623570781946182, |
| "num_tokens": 156133259.0, |
| "step": 5150 |
| }, |
| { |
| "entropy": 0.5238713613152504, |
| "epoch": 0.9703302855010263, |
| "grad_norm": 0.92578125, |
| "learning_rate": 2.7641465119222893e-05, |
| "loss": 0.5525, |
| "mean_token_accuracy": 0.7626954644918442, |
| "num_tokens": 157655593.0, |
| "step": 5200 |
| }, |
| { |
| "entropy": 0.526420825123787, |
| "epoch": 0.9796603844000746, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.7596414035729436e-05, |
| "loss": 0.5523, |
| "mean_token_accuracy": 0.7634574353694916, |
| "num_tokens": 159173170.0, |
| "step": 5250 |
| }, |
| { |
| "entropy": 0.5410353738069534, |
| "epoch": 0.988990483299123, |
| "grad_norm": 0.70703125, |
| "learning_rate": 2.755097874874772e-05, |
| "loss": 0.5616, |
| "mean_token_accuracy": 0.7588550513982772, |
| "num_tokens": 160746023.0, |
| "step": 5300 |
| }, |
| { |
| "entropy": 0.5253653234243393, |
| "epoch": 0.9983205821981713, |
| "grad_norm": 0.984375, |
| "learning_rate": 2.7505160831434235e-05, |
| "loss": 0.5538, |
| "mean_token_accuracy": 0.7632267904281617, |
| "num_tokens": 162234796.0, |
| "step": 5350 |
| }, |
| { |
| "entropy": 0.4956328600645065, |
| "epoch": 1.0076506810972197, |
| "grad_norm": 2.28125, |
| "learning_rate": 2.7458961870193697e-05, |
| "loss": 0.5385, |
| "mean_token_accuracy": 0.7668155688047409, |
| "num_tokens": 163759549.0, |
| "step": 5400 |
| }, |
| { |
| "entropy": 0.4777234876155853, |
| "epoch": 1.016980779996268, |
| "grad_norm": 0.8203125, |
| "learning_rate": 2.741238346462415e-05, |
| "loss": 0.5267, |
| "mean_token_accuracy": 0.7712256401777268, |
| "num_tokens": 165218690.0, |
| "step": 5450 |
| }, |
| { |
| "entropy": 0.4960213273763657, |
| "epoch": 1.0263108788953164, |
| "grad_norm": 0.45703125, |
| "learning_rate": 2.7365427227461538e-05, |
| "loss": 0.5316, |
| "mean_token_accuracy": 0.7680883568525314, |
| "num_tokens": 166779383.0, |
| "step": 5500 |
| }, |
| { |
| "entropy": 0.48663112640380857, |
| "epoch": 1.0356409777943647, |
| "grad_norm": 0.66015625, |
| "learning_rate": 2.731809478452392e-05, |
| "loss": 0.5271, |
| "mean_token_accuracy": 0.772123327255249, |
| "num_tokens": 168262830.0, |
| "step": 5550 |
| }, |
| { |
| "entropy": 0.501155666410923, |
| "epoch": 1.044971076693413, |
| "grad_norm": 0.984375, |
| "learning_rate": 2.72703877746551e-05, |
| "loss": 0.5439, |
| "mean_token_accuracy": 0.7648611211776734, |
| "num_tokens": 169844757.0, |
| "step": 5600 |
| }, |
| { |
| "entropy": 0.5043464726209641, |
| "epoch": 1.0543011755924612, |
| "grad_norm": 0.69140625, |
| "learning_rate": 2.7222307849667976e-05, |
| "loss": 0.5458, |
| "mean_token_accuracy": 0.7605859559774398, |
| "num_tokens": 171432040.0, |
| "step": 5650 |
| }, |
| { |
| "entropy": 0.4868525117635727, |
| "epoch": 1.0636312744915095, |
| "grad_norm": 0.81640625, |
| "learning_rate": 2.7173856674287276e-05, |
| "loss": 0.532, |
| "mean_token_accuracy": 0.7668398702144623, |
| "num_tokens": 172949249.0, |
| "step": 5700 |
| }, |
| { |
| "entropy": 0.48061770796775816, |
| "epoch": 1.0729613733905579, |
| "grad_norm": 0.412109375, |
| "learning_rate": 2.7125035926091948e-05, |
| "loss": 0.5247, |
| "mean_token_accuracy": 0.7707643383741378, |
| "num_tokens": 174427624.0, |
| "step": 5750 |
| }, |
| { |
| "entropy": 0.49181210845708845, |
| "epoch": 1.0822914722896062, |
| "grad_norm": 0.703125, |
| "learning_rate": 2.7075847295457074e-05, |
| "loss": 0.5387, |
| "mean_token_accuracy": 0.7684424781799316, |
| "num_tokens": 175889073.0, |
| "step": 5800 |
| }, |
| { |
| "entropy": 0.47926140516996385, |
| "epoch": 1.0916215711886545, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.702629248549533e-05, |
| "loss": 0.5243, |
| "mean_token_accuracy": 0.7723050940036774, |
| "num_tokens": 177318508.0, |
| "step": 5850 |
| }, |
| { |
| "entropy": 0.49550765454769136, |
| "epoch": 1.100951670087703, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.6976373211998036e-05, |
| "loss": 0.5369, |
| "mean_token_accuracy": 0.7672231763601303, |
| "num_tokens": 178841949.0, |
| "step": 5900 |
| }, |
| { |
| "entropy": 0.4972337147593498, |
| "epoch": 1.1102817689867512, |
| "grad_norm": 0.57421875, |
| "learning_rate": 2.6926091203375736e-05, |
| "loss": 0.5359, |
| "mean_token_accuracy": 0.7673702806234359, |
| "num_tokens": 180355456.0, |
| "step": 5950 |
| }, |
| { |
| "entropy": 0.49163941740989686, |
| "epoch": 1.1196118678857996, |
| "grad_norm": 0.640625, |
| "learning_rate": 2.6875448200598356e-05, |
| "loss": 0.53, |
| "mean_token_accuracy": 0.76685063123703, |
| "num_tokens": 181895417.0, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.1196118678857996, |
| "eval_entropy": 0.49277083069906324, |
| "eval_loss": 0.5361812114715576, |
| "eval_mean_token_accuracy": 0.7686252342451603, |
| "eval_num_tokens": 181895417.0, |
| "eval_runtime": 16.2137, |
| "eval_samples_per_second": 53.473, |
| "eval_steps_per_second": 6.723, |
| "step": 6000 |
| }, |
| { |
| "entropy": 0.494234202504158, |
| "epoch": 1.128941966784848, |
| "grad_norm": 0.67578125, |
| "learning_rate": 2.682444595713492e-05, |
| "loss": 0.534, |
| "mean_token_accuracy": 0.7677116429805756, |
| "num_tokens": 183441033.0, |
| "step": 6050 |
| }, |
| { |
| "entropy": 0.4924536618590355, |
| "epoch": 1.1382720656838963, |
| "grad_norm": 1.4375, |
| "learning_rate": 2.6773086238892847e-05, |
| "loss": 0.5325, |
| "mean_token_accuracy": 0.7706644636392593, |
| "num_tokens": 184930303.0, |
| "step": 6100 |
| }, |
| { |
| "entropy": 0.49195749253034593, |
| "epoch": 1.1476021645829446, |
| "grad_norm": 0.921875, |
| "learning_rate": 2.67213708241568e-05, |
| "loss": 0.5323, |
| "mean_token_accuracy": 0.769943385720253, |
| "num_tokens": 186461707.0, |
| "step": 6150 |
| }, |
| { |
| "entropy": 0.4922616305947304, |
| "epoch": 1.156932263481993, |
| "grad_norm": 1.046875, |
| "learning_rate": 2.666930150352712e-05, |
| "loss": 0.5331, |
| "mean_token_accuracy": 0.765140592455864, |
| "num_tokens": 188051673.0, |
| "step": 6200 |
| }, |
| { |
| "entropy": 0.4984354588389397, |
| "epoch": 1.1662623623810413, |
| "grad_norm": 0.98046875, |
| "learning_rate": 2.661688007985782e-05, |
| "loss": 0.5366, |
| "mean_token_accuracy": 0.7678897380828857, |
| "num_tokens": 189540976.0, |
| "step": 6250 |
| }, |
| { |
| "entropy": 0.4744232183694839, |
| "epoch": 1.1755924612800897, |
| "grad_norm": 0.79296875, |
| "learning_rate": 2.6564108368194174e-05, |
| "loss": 0.5188, |
| "mean_token_accuracy": 0.7747587919235229, |
| "num_tokens": 190972681.0, |
| "step": 6300 |
| }, |
| { |
| "entropy": 0.48836414963006974, |
| "epoch": 1.184922560179138, |
| "grad_norm": 0.7265625, |
| "learning_rate": 2.6510988195709867e-05, |
| "loss": 0.5945, |
| "mean_token_accuracy": 0.7682410633563995, |
| "num_tokens": 192480163.0, |
| "step": 6350 |
| }, |
| { |
| "entropy": 0.481321419775486, |
| "epoch": 1.1942526590781863, |
| "grad_norm": 0.58984375, |
| "learning_rate": 2.6457521401643724e-05, |
| "loss": 0.5223, |
| "mean_token_accuracy": 0.7744923168420792, |
| "num_tokens": 193963819.0, |
| "step": 6400 |
| }, |
| { |
| "entropy": 0.4845267793536186, |
| "epoch": 1.2035827579772345, |
| "grad_norm": 0.73828125, |
| "learning_rate": 2.640370983723605e-05, |
| "loss": 0.5331, |
| "mean_token_accuracy": 0.7681414604187011, |
| "num_tokens": 195501297.0, |
| "step": 6450 |
| }, |
| { |
| "entropy": 0.4920358270406723, |
| "epoch": 1.2129128568762828, |
| "grad_norm": 1.1171875, |
| "learning_rate": 2.6349555365664496e-05, |
| "loss": 0.5315, |
| "mean_token_accuracy": 0.7675491815805435, |
| "num_tokens": 197047201.0, |
| "step": 6500 |
| }, |
| { |
| "entropy": 0.4892509970068932, |
| "epoch": 1.2222429557753312, |
| "grad_norm": 0.46875, |
| "learning_rate": 2.6295059861979585e-05, |
| "loss": 0.5324, |
| "mean_token_accuracy": 0.7672034209966659, |
| "num_tokens": 198554820.0, |
| "step": 6550 |
| }, |
| { |
| "entropy": 0.48003152668476107, |
| "epoch": 1.2315730546743795, |
| "grad_norm": 0.82421875, |
| "learning_rate": 2.6240225213039762e-05, |
| "loss": 0.5256, |
| "mean_token_accuracy": 0.7719682443141938, |
| "num_tokens": 200055450.0, |
| "step": 6600 |
| }, |
| { |
| "entropy": 0.49669885337352754, |
| "epoch": 1.2409031535734278, |
| "grad_norm": 0.5390625, |
| "learning_rate": 2.6185053317446094e-05, |
| "loss": 0.5394, |
| "mean_token_accuracy": 0.7652375429868699, |
| "num_tokens": 201621193.0, |
| "step": 6650 |
| }, |
| { |
| "entropy": 0.47468378067016603, |
| "epoch": 1.2502332524724762, |
| "grad_norm": 0.298828125, |
| "learning_rate": 2.6129546085476494e-05, |
| "loss": 0.5181, |
| "mean_token_accuracy": 0.7740450286865235, |
| "num_tokens": 203115630.0, |
| "step": 6700 |
| }, |
| { |
| "entropy": 0.48166996002197265, |
| "epoch": 1.2595633513715245, |
| "grad_norm": 0.53515625, |
| "learning_rate": 2.6073705439019604e-05, |
| "loss": 0.5235, |
| "mean_token_accuracy": 0.7715310126543045, |
| "num_tokens": 204659783.0, |
| "step": 6750 |
| }, |
| { |
| "entropy": 0.487423982322216, |
| "epoch": 1.2688934502705729, |
| "grad_norm": 0.94140625, |
| "learning_rate": 2.6017533311508262e-05, |
| "loss": 0.5271, |
| "mean_token_accuracy": 0.7717793607711791, |
| "num_tokens": 206207207.0, |
| "step": 6800 |
| }, |
| { |
| "entropy": 0.490067283809185, |
| "epoch": 1.2782235491696212, |
| "grad_norm": 0.98046875, |
| "learning_rate": 2.5961031647852525e-05, |
| "loss": 0.5291, |
| "mean_token_accuracy": 0.7670228743553161, |
| "num_tokens": 207776741.0, |
| "step": 6850 |
| }, |
| { |
| "entropy": 0.47844831019639966, |
| "epoch": 1.2875536480686696, |
| "grad_norm": 0.60546875, |
| "learning_rate": 2.590420240437236e-05, |
| "loss": 0.5286, |
| "mean_token_accuracy": 0.7723558592796326, |
| "num_tokens": 209300786.0, |
| "step": 6900 |
| }, |
| { |
| "entropy": 0.4823366206884384, |
| "epoch": 1.296883746967718, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.584704754872988e-05, |
| "loss": 0.5294, |
| "mean_token_accuracy": 0.7727249205112457, |
| "num_tokens": 210784669.0, |
| "step": 6950 |
| }, |
| { |
| "entropy": 0.4943872797489166, |
| "epoch": 1.3062138458667663, |
| "grad_norm": 0.8046875, |
| "learning_rate": 2.578956905986124e-05, |
| "loss": 0.5374, |
| "mean_token_accuracy": 0.7674384766817093, |
| "num_tokens": 212346372.0, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.3062138458667663, |
| "eval_entropy": 0.48885188408947866, |
| "eval_loss": 0.5305144190788269, |
| "eval_mean_token_accuracy": 0.7706459596616413, |
| "eval_num_tokens": 212346372.0, |
| "eval_runtime": 16.2232, |
| "eval_samples_per_second": 53.442, |
| "eval_steps_per_second": 6.719, |
| "step": 7000 |
| }, |
| { |
| "entropy": 0.48752534478902815, |
| "epoch": 1.3155439447658146, |
| "grad_norm": 0.609375, |
| "learning_rate": 2.573176892790812e-05, |
| "loss": 0.5315, |
| "mean_token_accuracy": 0.7696154469251633, |
| "num_tokens": 213860843.0, |
| "step": 7050 |
| }, |
| { |
| "entropy": 0.4900174245238304, |
| "epoch": 1.3248740436648627, |
| "grad_norm": 0.53515625, |
| "learning_rate": 2.567364915414877e-05, |
| "loss": 0.5292, |
| "mean_token_accuracy": 0.7694221770763398, |
| "num_tokens": 215367383.0, |
| "step": 7100 |
| }, |
| { |
| "entropy": 0.48771278649568556, |
| "epoch": 1.334204142563911, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.5615211750928794e-05, |
| "loss": 0.5353, |
| "mean_token_accuracy": 0.7697239458560944, |
| "num_tokens": 216845421.0, |
| "step": 7150 |
| }, |
| { |
| "entropy": 0.47983416020870207, |
| "epoch": 1.3435342414629594, |
| "grad_norm": 0.6015625, |
| "learning_rate": 2.555645874159141e-05, |
| "loss": 0.5234, |
| "mean_token_accuracy": 0.7707635217905044, |
| "num_tokens": 218372903.0, |
| "step": 7200 |
| }, |
| { |
| "entropy": 0.47865484192967417, |
| "epoch": 1.3528643403620078, |
| "grad_norm": 0.59765625, |
| "learning_rate": 2.549739216040743e-05, |
| "loss": 0.5221, |
| "mean_token_accuracy": 0.7720851230621338, |
| "num_tokens": 219883662.0, |
| "step": 7250 |
| }, |
| { |
| "entropy": 0.4899566939473152, |
| "epoch": 1.362194439261056, |
| "grad_norm": 0.453125, |
| "learning_rate": 2.5438014052504802e-05, |
| "loss": 0.532, |
| "mean_token_accuracy": 0.7674814122915268, |
| "num_tokens": 221426709.0, |
| "step": 7300 |
| }, |
| { |
| "entropy": 0.4817213848233223, |
| "epoch": 1.3715245381601044, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.5378326473797818e-05, |
| "loss": 0.5265, |
| "mean_token_accuracy": 0.769315534234047, |
| "num_tokens": 222950520.0, |
| "step": 7350 |
| }, |
| { |
| "entropy": 0.4752693668007851, |
| "epoch": 1.3808546370591528, |
| "grad_norm": 0.5859375, |
| "learning_rate": 2.5318331490915925e-05, |
| "loss": 0.5195, |
| "mean_token_accuracy": 0.7739131230115891, |
| "num_tokens": 224448854.0, |
| "step": 7400 |
| }, |
| { |
| "entropy": 0.4732086658477783, |
| "epoch": 1.3901847359582011, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.525803118113215e-05, |
| "loss": 0.5225, |
| "mean_token_accuracy": 0.7735245388746261, |
| "num_tokens": 225932263.0, |
| "step": 7450 |
| }, |
| { |
| "entropy": 0.48352263927459715, |
| "epoch": 1.3995148348572495, |
| "grad_norm": 0.546875, |
| "learning_rate": 2.5197427632291214e-05, |
| "loss": 0.531, |
| "mean_token_accuracy": 0.7676544332504273, |
| "num_tokens": 227452316.0, |
| "step": 7500 |
| }, |
| { |
| "entropy": 0.48029813915491104, |
| "epoch": 1.4088449337562978, |
| "grad_norm": 0.61328125, |
| "learning_rate": 2.513652294273721e-05, |
| "loss": 0.5257, |
| "mean_token_accuracy": 0.7688204735517502, |
| "num_tokens": 228995142.0, |
| "step": 7550 |
| }, |
| { |
| "entropy": 0.49092908948659897, |
| "epoch": 1.4181750326553462, |
| "grad_norm": 0.90625, |
| "learning_rate": 2.507531922124096e-05, |
| "loss": 0.5515, |
| "mean_token_accuracy": 0.7691348105669021, |
| "num_tokens": 230473945.0, |
| "step": 7600 |
| }, |
| { |
| "entropy": 0.46930390000343325, |
| "epoch": 1.4275051315543945, |
| "grad_norm": 0.91015625, |
| "learning_rate": 2.501381858692701e-05, |
| "loss": 0.5192, |
| "mean_token_accuracy": 0.7731947559118271, |
| "num_tokens": 231969989.0, |
| "step": 7650 |
| }, |
| { |
| "entropy": 0.4841861927509308, |
| "epoch": 1.4368352304534429, |
| "grad_norm": 0.74609375, |
| "learning_rate": 2.495202316920024e-05, |
| "loss": 0.5281, |
| "mean_token_accuracy": 0.7702859449386597, |
| "num_tokens": 233496905.0, |
| "step": 7700 |
| }, |
| { |
| "entropy": 0.47611463099718093, |
| "epoch": 1.4461653293524912, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.488993510767214e-05, |
| "loss": 0.5292, |
| "mean_token_accuracy": 0.7676136875152588, |
| "num_tokens": 235025876.0, |
| "step": 7750 |
| }, |
| { |
| "entropy": 0.480805746614933, |
| "epoch": 1.4554954282515395, |
| "grad_norm": 0.447265625, |
| "learning_rate": 2.4827556552086753e-05, |
| "loss": 0.5228, |
| "mean_token_accuracy": 0.7714346569776535, |
| "num_tokens": 236573132.0, |
| "step": 7800 |
| }, |
| { |
| "entropy": 0.4773865479230881, |
| "epoch": 1.464825527150588, |
| "grad_norm": 0.8828125, |
| "learning_rate": 2.47648896622462e-05, |
| "loss": 0.5233, |
| "mean_token_accuracy": 0.7716735368967056, |
| "num_tokens": 238108249.0, |
| "step": 7850 |
| }, |
| { |
| "entropy": 0.49567407727241514, |
| "epoch": 1.4741556260496362, |
| "grad_norm": 0.49609375, |
| "learning_rate": 2.4701936607935922e-05, |
| "loss": 0.548, |
| "mean_token_accuracy": 0.7675740510225296, |
| "num_tokens": 239609374.0, |
| "step": 7900 |
| }, |
| { |
| "entropy": 0.47016422227025034, |
| "epoch": 1.4834857249486846, |
| "grad_norm": 0.65625, |
| "learning_rate": 2.463869956884957e-05, |
| "loss": 0.5141, |
| "mean_token_accuracy": 0.7800671440362931, |
| "num_tokens": 241035036.0, |
| "step": 7950 |
| }, |
| { |
| "entropy": 0.4691419780254364, |
| "epoch": 1.4928158238477327, |
| "grad_norm": 0.6640625, |
| "learning_rate": 2.457518073451348e-05, |
| "loss": 0.5183, |
| "mean_token_accuracy": 0.7747016477584839, |
| "num_tokens": 242482052.0, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.4928158238477327, |
| "eval_entropy": 0.48408404293410273, |
| "eval_loss": 0.5262120962142944, |
| "eval_mean_token_accuracy": 0.7715235532970603, |
| "eval_num_tokens": 242482052.0, |
| "eval_runtime": 16.1685, |
| "eval_samples_per_second": 53.623, |
| "eval_steps_per_second": 6.741, |
| "step": 8000 |
| }, |
| { |
| "entropy": 0.48640842020511627, |
| "epoch": 1.5021459227467813, |
| "grad_norm": 0.8671875, |
| "learning_rate": 2.451138230421094e-05, |
| "loss": 0.5291, |
| "mean_token_accuracy": 0.768261170387268, |
| "num_tokens": 244051944.0, |
| "step": 8050 |
| }, |
| { |
| "entropy": 0.4861644932627678, |
| "epoch": 1.5114760216458294, |
| "grad_norm": 0.86328125, |
| "learning_rate": 2.4447306486905965e-05, |
| "loss": 0.5303, |
| "mean_token_accuracy": 0.7654324793815612, |
| "num_tokens": 245667242.0, |
| "step": 8100 |
| }, |
| { |
| "entropy": 0.46581872284412384, |
| "epoch": 1.5208061205448777, |
| "grad_norm": 0.64453125, |
| "learning_rate": 2.4382955501166878e-05, |
| "loss": 0.517, |
| "mean_token_accuracy": 0.7791347569227218, |
| "num_tokens": 247084677.0, |
| "step": 8150 |
| }, |
| { |
| "entropy": 0.48655702769756315, |
| "epoch": 1.530136219443926, |
| "grad_norm": 0.5859375, |
| "learning_rate": 2.4318331575089437e-05, |
| "loss": 0.5283, |
| "mean_token_accuracy": 0.7677739357948303, |
| "num_tokens": 248666161.0, |
| "step": 8200 |
| }, |
| { |
| "entropy": 0.48474507868289946, |
| "epoch": 1.5394663183429744, |
| "grad_norm": 0.83203125, |
| "learning_rate": 2.425343694621974e-05, |
| "loss": 0.5218, |
| "mean_token_accuracy": 0.7710594099760055, |
| "num_tokens": 250228344.0, |
| "step": 8250 |
| }, |
| { |
| "entropy": 0.501356900036335, |
| "epoch": 1.5487964172420228, |
| "grad_norm": 1.09375, |
| "learning_rate": 2.418827386147672e-05, |
| "loss": 0.5391, |
| "mean_token_accuracy": 0.7636065500974655, |
| "num_tokens": 251842667.0, |
| "step": 8300 |
| }, |
| { |
| "entropy": 0.48092112705111506, |
| "epoch": 1.5581265161410711, |
| "grad_norm": 0.65625, |
| "learning_rate": 2.4122844577074344e-05, |
| "loss": 0.5246, |
| "mean_token_accuracy": 0.7711791855096817, |
| "num_tokens": 253387970.0, |
| "step": 8350 |
| }, |
| { |
| "entropy": 0.4703651532530785, |
| "epoch": 1.5674566150401195, |
| "grad_norm": 0.76171875, |
| "learning_rate": 2.4057151358443537e-05, |
| "loss": 0.523, |
| "mean_token_accuracy": 0.7739822679758072, |
| "num_tokens": 254893911.0, |
| "step": 8400 |
| }, |
| { |
| "entropy": 0.48381629049777986, |
| "epoch": 1.5767867139391678, |
| "grad_norm": 0.5390625, |
| "learning_rate": 2.3991196480153678e-05, |
| "loss": 0.5295, |
| "mean_token_accuracy": 0.7688455355167388, |
| "num_tokens": 256399394.0, |
| "step": 8450 |
| }, |
| { |
| "entropy": 0.4885594379901886, |
| "epoch": 1.586116812838216, |
| "grad_norm": 0.73828125, |
| "learning_rate": 2.39249822258339e-05, |
| "loss": 0.5269, |
| "mean_token_accuracy": 0.7714411211013794, |
| "num_tokens": 257935448.0, |
| "step": 8500 |
| }, |
| { |
| "entropy": 0.48577941954135895, |
| "epoch": 1.5954469117372643, |
| "grad_norm": 0.8828125, |
| "learning_rate": 2.3858510888093997e-05, |
| "loss": 0.5231, |
| "mean_token_accuracy": 0.7718437218666077, |
| "num_tokens": 259486589.0, |
| "step": 8550 |
| }, |
| { |
| "entropy": 0.48262311398983004, |
| "epoch": 1.6047770106363126, |
| "grad_norm": 0.578125, |
| "learning_rate": 2.3791784768445045e-05, |
| "loss": 0.5248, |
| "mean_token_accuracy": 0.7686738175153732, |
| "num_tokens": 261065847.0, |
| "step": 8600 |
| }, |
| { |
| "entropy": 0.471617269217968, |
| "epoch": 1.614107109535361, |
| "grad_norm": 0.9765625, |
| "learning_rate": 2.3724806177219723e-05, |
| "loss": 0.5203, |
| "mean_token_accuracy": 0.772919489145279, |
| "num_tokens": 262600411.0, |
| "step": 8650 |
| }, |
| { |
| "entropy": 0.47281612068414686, |
| "epoch": 1.6234372084344093, |
| "grad_norm": 1.9140625, |
| "learning_rate": 2.36575774334923e-05, |
| "loss": 0.5169, |
| "mean_token_accuracy": 0.774740971326828, |
| "num_tokens": 264141493.0, |
| "step": 8700 |
| }, |
| { |
| "entropy": 0.47484747022390367, |
| "epoch": 1.6327673073334577, |
| "grad_norm": 0.609375, |
| "learning_rate": 2.359010086499838e-05, |
| "loss": 0.5184, |
| "mean_token_accuracy": 0.7732387953996658, |
| "num_tokens": 265636375.0, |
| "step": 8750 |
| }, |
| { |
| "entropy": 0.47206893771886826, |
| "epoch": 1.642097406232506, |
| "grad_norm": 0.7890625, |
| "learning_rate": 2.352237880805426e-05, |
| "loss": 0.5303, |
| "mean_token_accuracy": 0.7746653699874878, |
| "num_tokens": 267090177.0, |
| "step": 8800 |
| }, |
| { |
| "entropy": 0.4694141258299351, |
| "epoch": 1.6514275051315543, |
| "grad_norm": 0.64453125, |
| "learning_rate": 2.3454413607476044e-05, |
| "loss": 0.5199, |
| "mean_token_accuracy": 0.7748447281122207, |
| "num_tokens": 268551821.0, |
| "step": 8850 |
| }, |
| { |
| "entropy": 0.47594692051410675, |
| "epoch": 1.6607576040306027, |
| "grad_norm": 1.125, |
| "learning_rate": 2.3386207616498503e-05, |
| "loss": 0.5186, |
| "mean_token_accuracy": 0.773201887011528, |
| "num_tokens": 270084312.0, |
| "step": 8900 |
| }, |
| { |
| "entropy": 0.47182066380977633, |
| "epoch": 1.670087702929651, |
| "grad_norm": 0.66796875, |
| "learning_rate": 2.331776319669354e-05, |
| "loss": 0.5286, |
| "mean_token_accuracy": 0.7718379843235016, |
| "num_tokens": 271546211.0, |
| "step": 8950 |
| }, |
| { |
| "entropy": 0.4656666761636734, |
| "epoch": 1.6794178018286994, |
| "grad_norm": 0.490234375, |
| "learning_rate": 2.324908271788844e-05, |
| "loss": 0.5158, |
| "mean_token_accuracy": 0.7762594664096832, |
| "num_tokens": 272998807.0, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.6794178018286994, |
| "eval_entropy": 0.4748075161505183, |
| "eval_loss": 0.5214188694953918, |
| "eval_mean_token_accuracy": 0.7731986155203723, |
| "eval_num_tokens": 272998807.0, |
| "eval_runtime": 16.1272, |
| "eval_samples_per_second": 53.76, |
| "eval_steps_per_second": 6.759, |
| "step": 9000 |
| }, |
| { |
| "entropy": 0.4896981066465378, |
| "epoch": 1.6887479007277477, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.3180168558083844e-05, |
| "loss": 0.5287, |
| "mean_token_accuracy": 0.7671917879581451, |
| "num_tokens": 274587533.0, |
| "step": 9050 |
| }, |
| { |
| "entropy": 0.4765408200025558, |
| "epoch": 1.698077999626796, |
| "grad_norm": 1.921875, |
| "learning_rate": 2.311102310337136e-05, |
| "loss": 0.5188, |
| "mean_token_accuracy": 0.7724708420038223, |
| "num_tokens": 276131737.0, |
| "step": 9100 |
| }, |
| { |
| "entropy": 0.4824476379156113, |
| "epoch": 1.7074080985258444, |
| "grad_norm": 0.46875, |
| "learning_rate": 2.304164874785101e-05, |
| "loss": 0.5267, |
| "mean_token_accuracy": 0.7684733641147613, |
| "num_tokens": 277715304.0, |
| "step": 9150 |
| }, |
| { |
| "entropy": 0.48801319271326066, |
| "epoch": 1.7167381974248928, |
| "grad_norm": 0.7890625, |
| "learning_rate": 2.297204789354827e-05, |
| "loss": 0.5345, |
| "mean_token_accuracy": 0.7730580461025238, |
| "num_tokens": 279224176.0, |
| "step": 9200 |
| }, |
| { |
| "entropy": 0.47024243041872976, |
| "epoch": 1.726068296323941, |
| "grad_norm": 0.70703125, |
| "learning_rate": 2.2902222950330966e-05, |
| "loss": 0.5208, |
| "mean_token_accuracy": 0.7733591181039811, |
| "num_tokens": 280746272.0, |
| "step": 9250 |
| }, |
| { |
| "entropy": 0.4675961661338806, |
| "epoch": 1.7353983952229894, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.283217633582578e-05, |
| "loss": 0.5249, |
| "mean_token_accuracy": 0.7734372174739838, |
| "num_tokens": 282236117.0, |
| "step": 9300 |
| }, |
| { |
| "entropy": 0.48016302675008776, |
| "epoch": 1.7447284941220378, |
| "grad_norm": 1.4921875, |
| "learning_rate": 2.276191047533458e-05, |
| "loss": 0.5231, |
| "mean_token_accuracy": 0.770177600979805, |
| "num_tokens": 283794545.0, |
| "step": 9350 |
| }, |
| { |
| "entropy": 0.47259823501110076, |
| "epoch": 1.7540585930210861, |
| "grad_norm": 0.71484375, |
| "learning_rate": 2.269142780175042e-05, |
| "loss": 0.5192, |
| "mean_token_accuracy": 0.7728340399265289, |
| "num_tokens": 285333807.0, |
| "step": 9400 |
| }, |
| { |
| "entropy": 0.47532046377658843, |
| "epoch": 1.7633886919201345, |
| "grad_norm": 0.86328125, |
| "learning_rate": 2.2620730755473328e-05, |
| "loss": 0.5259, |
| "mean_token_accuracy": 0.7706443351507187, |
| "num_tokens": 286852697.0, |
| "step": 9450 |
| }, |
| { |
| "entropy": 0.4715300849080086, |
| "epoch": 1.7727187908191828, |
| "grad_norm": 0.6640625, |
| "learning_rate": 2.2549821784325787e-05, |
| "loss": 0.5226, |
| "mean_token_accuracy": 0.7710268515348434, |
| "num_tokens": 288390169.0, |
| "step": 9500 |
| }, |
| { |
| "entropy": 0.45817104071378706, |
| "epoch": 1.7820488897182312, |
| "grad_norm": 0.65234375, |
| "learning_rate": 2.2478703343467995e-05, |
| "loss": 0.5084, |
| "mean_token_accuracy": 0.7808880287408829, |
| "num_tokens": 289825517.0, |
| "step": 9550 |
| }, |
| { |
| "entropy": 0.46380849391222, |
| "epoch": 1.7913789886172793, |
| "grad_norm": 0.609375, |
| "learning_rate": 2.2407377895312848e-05, |
| "loss": 0.522, |
| "mean_token_accuracy": 0.7710244971513748, |
| "num_tokens": 291345905.0, |
| "step": 9600 |
| }, |
| { |
| "entropy": 0.4663406872749329, |
| "epoch": 1.8007090875163276, |
| "grad_norm": 0.97265625, |
| "learning_rate": 2.2335847909440697e-05, |
| "loss": 0.5174, |
| "mean_token_accuracy": 0.7765918165445328, |
| "num_tokens": 292843956.0, |
| "step": 9650 |
| }, |
| { |
| "entropy": 0.46827891767024993, |
| "epoch": 1.810039186415376, |
| "grad_norm": 1.4765625, |
| "learning_rate": 2.226411586251381e-05, |
| "loss": 0.5135, |
| "mean_token_accuracy": 0.7766558998823165, |
| "num_tokens": 294304944.0, |
| "step": 9700 |
| }, |
| { |
| "entropy": 0.4667487397789955, |
| "epoch": 1.8193692853144243, |
| "grad_norm": 0.75390625, |
| "learning_rate": 2.2192184238190666e-05, |
| "loss": 0.5173, |
| "mean_token_accuracy": 0.7746782380342484, |
| "num_tokens": 295807703.0, |
| "step": 9750 |
| }, |
| { |
| "entropy": 0.4796573233604431, |
| "epoch": 1.8286993842134727, |
| "grad_norm": 0.80078125, |
| "learning_rate": 2.2120055527039914e-05, |
| "loss": 0.5265, |
| "mean_token_accuracy": 0.7696005600690842, |
| "num_tokens": 297318135.0, |
| "step": 9800 |
| }, |
| { |
| "entropy": 0.47267288982868194, |
| "epoch": 1.838029483112521, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.2047732226454157e-05, |
| "loss": 0.5153, |
| "mean_token_accuracy": 0.7761634987592697, |
| "num_tokens": 298791062.0, |
| "step": 9850 |
| }, |
| { |
| "entropy": 0.47834465265274045, |
| "epoch": 1.8473595820115694, |
| "grad_norm": 0.625, |
| "learning_rate": 2.1975216840563502e-05, |
| "loss": 0.5222, |
| "mean_token_accuracy": 0.7747543674707412, |
| "num_tokens": 300275324.0, |
| "step": 9900 |
| }, |
| { |
| "entropy": 0.46263691544532776, |
| "epoch": 1.8566896809106177, |
| "grad_norm": 1.0, |
| "learning_rate": 2.1902511880148835e-05, |
| "loss": 0.5148, |
| "mean_token_accuracy": 0.7750599044561386, |
| "num_tokens": 301729884.0, |
| "step": 9950 |
| }, |
| { |
| "entropy": 0.4763942888379097, |
| "epoch": 1.8660197798096658, |
| "grad_norm": 0.796875, |
| "learning_rate": 2.1829619862554877e-05, |
| "loss": 0.5183, |
| "mean_token_accuracy": 0.7708618581295014, |
| "num_tokens": 303287949.0, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.8660197798096658, |
| "eval_entropy": 0.4685867871713201, |
| "eval_loss": 0.5169408321380615, |
| "eval_mean_token_accuracy": 0.7741667956387231, |
| "eval_num_tokens": 303287949.0, |
| "eval_runtime": 16.0644, |
| "eval_samples_per_second": 53.97, |
| "eval_steps_per_second": 6.785, |
| "step": 10000 |
| }, |
| { |
| "entropy": 0.47606048226356507, |
| "epoch": 1.8753498787087142, |
| "grad_norm": 1.109375, |
| "learning_rate": 2.175654331160305e-05, |
| "loss": 0.5189, |
| "mean_token_accuracy": 0.7730025327205658, |
| "num_tokens": 304818318.0, |
| "step": 10050 |
| }, |
| { |
| "entropy": 0.4616896215081215, |
| "epoch": 1.8846799776077625, |
| "grad_norm": 0.9140625, |
| "learning_rate": 2.168328475750408e-05, |
| "loss": 0.5089, |
| "mean_token_accuracy": 0.7780868858098984, |
| "num_tokens": 306331367.0, |
| "step": 10100 |
| }, |
| { |
| "entropy": 0.47027878910303117, |
| "epoch": 1.8940100765068109, |
| "grad_norm": 0.53125, |
| "learning_rate": 2.160984673677039e-05, |
| "loss": 0.5186, |
| "mean_token_accuracy": 0.772971043586731, |
| "num_tokens": 307830293.0, |
| "step": 10150 |
| }, |
| { |
| "entropy": 0.4677613499760628, |
| "epoch": 1.9033401754058592, |
| "grad_norm": 0.376953125, |
| "learning_rate": 2.153623179212827e-05, |
| "loss": 0.5163, |
| "mean_token_accuracy": 0.7740986323356629, |
| "num_tokens": 309360930.0, |
| "step": 10200 |
| }, |
| { |
| "entropy": 0.47715963318943977, |
| "epoch": 1.9126702743049075, |
| "grad_norm": 0.73046875, |
| "learning_rate": 2.146244247242985e-05, |
| "loss": 0.5271, |
| "mean_token_accuracy": 0.7717376494407654, |
| "num_tokens": 310855697.0, |
| "step": 10250 |
| }, |
| { |
| "entropy": 0.46572228729724885, |
| "epoch": 1.922000373203956, |
| "grad_norm": 0.71875, |
| "learning_rate": 2.1388481332564835e-05, |
| "loss": 0.5145, |
| "mean_token_accuracy": 0.774823442697525, |
| "num_tokens": 312370909.0, |
| "step": 10300 |
| }, |
| { |
| "entropy": 0.46713058680295944, |
| "epoch": 1.9313304721030042, |
| "grad_norm": 0.404296875, |
| "learning_rate": 2.1314350933372053e-05, |
| "loss": 0.5129, |
| "mean_token_accuracy": 0.7726266753673553, |
| "num_tokens": 313900324.0, |
| "step": 10350 |
| }, |
| { |
| "entropy": 0.4743108308315277, |
| "epoch": 1.9406605710020526, |
| "grad_norm": 0.8359375, |
| "learning_rate": 2.1240053841550792e-05, |
| "loss": 0.5226, |
| "mean_token_accuracy": 0.7713726377487182, |
| "num_tokens": 315441715.0, |
| "step": 10400 |
| }, |
| { |
| "entropy": 0.4706794250011444, |
| "epoch": 1.949990669901101, |
| "grad_norm": 0.9453125, |
| "learning_rate": 2.1165592629571923e-05, |
| "loss": 0.517, |
| "mean_token_accuracy": 0.7740881043672562, |
| "num_tokens": 316910090.0, |
| "step": 10450 |
| }, |
| { |
| "entropy": 0.4839508882164955, |
| "epoch": 1.9593207688001493, |
| "grad_norm": 0.81640625, |
| "learning_rate": 2.1090969875588827e-05, |
| "loss": 0.5236, |
| "mean_token_accuracy": 0.7707830715179443, |
| "num_tokens": 318480506.0, |
| "step": 10500 |
| }, |
| { |
| "entropy": 0.4602850756049156, |
| "epoch": 1.9686508676991976, |
| "grad_norm": 2.34375, |
| "learning_rate": 2.1016188163348126e-05, |
| "loss": 0.5121, |
| "mean_token_accuracy": 0.7764248877763749, |
| "num_tokens": 320008211.0, |
| "step": 10550 |
| }, |
| { |
| "entropy": 0.4730991995334625, |
| "epoch": 1.977980966598246, |
| "grad_norm": 0.51171875, |
| "learning_rate": 2.0941250082100253e-05, |
| "loss": 0.5288, |
| "mean_token_accuracy": 0.7702510052919388, |
| "num_tokens": 321525032.0, |
| "step": 10600 |
| }, |
| { |
| "entropy": 0.446844310760498, |
| "epoch": 1.9873110654972943, |
| "grad_norm": 0.296875, |
| "learning_rate": 2.0866158226509758e-05, |
| "loss": 0.5058, |
| "mean_token_accuracy": 0.7796232843399048, |
| "num_tokens": 322954203.0, |
| "step": 10650 |
| }, |
| { |
| "entropy": 0.4643040466308594, |
| "epoch": 1.9966411643963426, |
| "grad_norm": 1.0, |
| "learning_rate": 2.07909151965655e-05, |
| "loss": 0.5125, |
| "mean_token_accuracy": 0.7746139895915986, |
| "num_tokens": 324473634.0, |
| "step": 10700 |
| }, |
| { |
| "entropy": 0.46121720626950263, |
| "epoch": 2.005971263295391, |
| "grad_norm": 0.7109375, |
| "learning_rate": 2.071552359749062e-05, |
| "loss": 0.5151, |
| "mean_token_accuracy": 0.7720960187911987, |
| "num_tokens": 325999882.0, |
| "step": 10750 |
| }, |
| { |
| "entropy": 0.425725160241127, |
| "epoch": 2.0153013621944393, |
| "grad_norm": 0.474609375, |
| "learning_rate": 2.063998603965232e-05, |
| "loss": 0.4962, |
| "mean_token_accuracy": 0.7846533066034317, |
| "num_tokens": 327442405.0, |
| "step": 10800 |
| }, |
| { |
| "entropy": 0.45527834951877594, |
| "epoch": 2.0246314610934877, |
| "grad_norm": 1.1328125, |
| "learning_rate": 2.056430513847151e-05, |
| "loss": 0.5079, |
| "mean_token_accuracy": 0.7760635191202163, |
| "num_tokens": 329012059.0, |
| "step": 10850 |
| }, |
| { |
| "entropy": 0.4397959718108177, |
| "epoch": 2.033961559992536, |
| "grad_norm": 1.03125, |
| "learning_rate": 2.0488483514332225e-05, |
| "loss": 0.4988, |
| "mean_token_accuracy": 0.7809435164928437, |
| "num_tokens": 330491757.0, |
| "step": 10900 |
| }, |
| { |
| "entropy": 0.4280716378986835, |
| "epoch": 2.0432916588915844, |
| "grad_norm": 0.796875, |
| "learning_rate": 2.041252379249091e-05, |
| "loss": 0.4888, |
| "mean_token_accuracy": 0.785589964389801, |
| "num_tokens": 331979812.0, |
| "step": 10950 |
| }, |
| { |
| "entropy": 0.44335421919822693, |
| "epoch": 2.0526217577906327, |
| "grad_norm": 0.5703125, |
| "learning_rate": 2.0336428602985527e-05, |
| "loss": 0.5037, |
| "mean_token_accuracy": 0.7756373131275177, |
| "num_tokens": 333477541.0, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.0526217577906327, |
| "eval_entropy": 0.45860785820068567, |
| "eval_loss": 0.5141582489013672, |
| "eval_mean_token_accuracy": 0.7748899443433919, |
| "eval_num_tokens": 333477541.0, |
| "eval_runtime": 16.1759, |
| "eval_samples_per_second": 53.598, |
| "eval_steps_per_second": 6.738, |
| "step": 11000 |
| }, |
| { |
| "entropy": 0.43529929786920546, |
| "epoch": 2.061951856689681, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.026020058054448e-05, |
| "loss": 0.4936, |
| "mean_token_accuracy": 0.7826171773672104, |
| "num_tokens": 334939322.0, |
| "step": 11050 |
| }, |
| { |
| "entropy": 0.4596153527498245, |
| "epoch": 2.0712819555887294, |
| "grad_norm": 1.234375, |
| "learning_rate": 2.018384236449539e-05, |
| "loss": 0.5192, |
| "mean_token_accuracy": 0.7747271412611008, |
| "num_tokens": 336492472.0, |
| "step": 11100 |
| }, |
| { |
| "entropy": 0.45726164370775224, |
| "epoch": 2.0806120544877778, |
| "grad_norm": 0.3828125, |
| "learning_rate": 2.0107356598673732e-05, |
| "loss": 0.5075, |
| "mean_token_accuracy": 0.7738985830545425, |
| "num_tokens": 338077203.0, |
| "step": 11150 |
| }, |
| { |
| "entropy": 0.4586531579494476, |
| "epoch": 2.089942153386826, |
| "grad_norm": 0.423828125, |
| "learning_rate": 2.0030745931331256e-05, |
| "loss": 0.5128, |
| "mean_token_accuracy": 0.7716344100236893, |
| "num_tokens": 339676503.0, |
| "step": 11200 |
| }, |
| { |
| "entropy": 0.44181185990571975, |
| "epoch": 2.0992722522858744, |
| "grad_norm": 0.703125, |
| "learning_rate": 1.995401301504434e-05, |
| "loss": 0.5033, |
| "mean_token_accuracy": 0.7782072865962982, |
| "num_tokens": 341201747.0, |
| "step": 11250 |
| }, |
| { |
| "entropy": 0.431854664683342, |
| "epoch": 2.1086023511849223, |
| "grad_norm": 1.0390625, |
| "learning_rate": 1.9877160506622106e-05, |
| "loss": 0.49, |
| "mean_token_accuracy": 0.7848282158374786, |
| "num_tokens": 342665361.0, |
| "step": 11300 |
| }, |
| { |
| "entropy": 0.43534082144498826, |
| "epoch": 2.1179324500839707, |
| "grad_norm": 0.75, |
| "learning_rate": 1.9800191067014458e-05, |
| "loss": 0.4917, |
| "mean_token_accuracy": 0.7822025471925735, |
| "num_tokens": 344149761.0, |
| "step": 11350 |
| }, |
| { |
| "entropy": 0.444383510351181, |
| "epoch": 2.127262548983019, |
| "grad_norm": 0.494140625, |
| "learning_rate": 1.9723107361219928e-05, |
| "loss": 0.5044, |
| "mean_token_accuracy": 0.7769185125827789, |
| "num_tokens": 345653004.0, |
| "step": 11400 |
| }, |
| { |
| "entropy": 0.4527071109414101, |
| "epoch": 2.1365926478820674, |
| "grad_norm": 0.62890625, |
| "learning_rate": 1.964591205819343e-05, |
| "loss": 0.5026, |
| "mean_token_accuracy": 0.7761050814390182, |
| "num_tokens": 347228235.0, |
| "step": 11450 |
| }, |
| { |
| "entropy": 0.4523302459716797, |
| "epoch": 2.1459227467811157, |
| "grad_norm": 0.97265625, |
| "learning_rate": 1.9568607830753818e-05, |
| "loss": 0.5175, |
| "mean_token_accuracy": 0.777667219042778, |
| "num_tokens": 348730988.0, |
| "step": 11500 |
| }, |
| { |
| "entropy": 0.44508363455533984, |
| "epoch": 2.155252845680164, |
| "grad_norm": 1.3984375, |
| "learning_rate": 1.9491197355491355e-05, |
| "loss": 0.5016, |
| "mean_token_accuracy": 0.7767183601856231, |
| "num_tokens": 350265615.0, |
| "step": 11550 |
| }, |
| { |
| "entropy": 0.44807049065828325, |
| "epoch": 2.1645829445792124, |
| "grad_norm": 1.4921875, |
| "learning_rate": 1.941368331267506e-05, |
| "loss": 0.5179, |
| "mean_token_accuracy": 0.7722341948747635, |
| "num_tokens": 351794521.0, |
| "step": 11600 |
| }, |
| { |
| "entropy": 0.44992916941642763, |
| "epoch": 2.1739130434782608, |
| "grad_norm": 0.62109375, |
| "learning_rate": 1.9336068386159866e-05, |
| "loss": 0.5013, |
| "mean_token_accuracy": 0.7756163114309311, |
| "num_tokens": 353325027.0, |
| "step": 11650 |
| }, |
| { |
| "entropy": 0.4497057408094406, |
| "epoch": 2.183243142377309, |
| "grad_norm": 0.486328125, |
| "learning_rate": 1.9258355263293722e-05, |
| "loss": 0.5101, |
| "mean_token_accuracy": 0.7762971234321594, |
| "num_tokens": 354798870.0, |
| "step": 11700 |
| }, |
| { |
| "entropy": 0.4416424559056759, |
| "epoch": 2.1925732412763574, |
| "grad_norm": 0.83984375, |
| "learning_rate": 1.9180546634824542e-05, |
| "loss": 0.4978, |
| "mean_token_accuracy": 0.780478093624115, |
| "num_tokens": 356285178.0, |
| "step": 11750 |
| }, |
| { |
| "entropy": 0.4536583548784256, |
| "epoch": 2.201903340175406, |
| "grad_norm": 1.1328125, |
| "learning_rate": 1.910264519480704e-05, |
| "loss": 0.5081, |
| "mean_token_accuracy": 0.7730959630012513, |
| "num_tokens": 357853469.0, |
| "step": 11800 |
| }, |
| { |
| "entropy": 0.432369647026062, |
| "epoch": 2.211233439074454, |
| "grad_norm": 1.5703125, |
| "learning_rate": 1.902465364050943e-05, |
| "loss": 0.4962, |
| "mean_token_accuracy": 0.7790928614139557, |
| "num_tokens": 359347327.0, |
| "step": 11850 |
| }, |
| { |
| "entropy": 0.4379693388938904, |
| "epoch": 2.2205635379735025, |
| "grad_norm": 0.5078125, |
| "learning_rate": 1.894657467232007e-05, |
| "loss": 0.4922, |
| "mean_token_accuracy": 0.7815437364578247, |
| "num_tokens": 360883930.0, |
| "step": 11900 |
| }, |
| { |
| "entropy": 0.4471505701541901, |
| "epoch": 2.229893636872551, |
| "grad_norm": 0.75390625, |
| "learning_rate": 1.8868410993653945e-05, |
| "loss": 0.5073, |
| "mean_token_accuracy": 0.776453862786293, |
| "num_tokens": 362395340.0, |
| "step": 11950 |
| }, |
| { |
| "entropy": 0.45013984248042105, |
| "epoch": 2.239223735771599, |
| "grad_norm": 0.65234375, |
| "learning_rate": 1.879016531085905e-05, |
| "loss": 0.5059, |
| "mean_token_accuracy": 0.7744881427288055, |
| "num_tokens": 363975143.0, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.239223735771599, |
| "eval_entropy": 0.45516312778542894, |
| "eval_loss": 0.5110519528388977, |
| "eval_mean_token_accuracy": 0.7761742434370409, |
| "eval_num_tokens": 363975143.0, |
| "eval_runtime": 16.1613, |
| "eval_samples_per_second": 53.647, |
| "eval_steps_per_second": 6.744, |
| "step": 12000 |
| }, |
| { |
| "entropy": 0.46731566220521925, |
| "epoch": 2.2485538346706475, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.871184033312272e-05, |
| "loss": 0.5183, |
| "mean_token_accuracy": 0.7688807338476181, |
| "num_tokens": 365608309.0, |
| "step": 12050 |
| }, |
| { |
| "entropy": 0.43275348499417304, |
| "epoch": 2.257883933569696, |
| "grad_norm": 0.609375, |
| "learning_rate": 1.86334387723778e-05, |
| "loss": 0.5003, |
| "mean_token_accuracy": 0.7791133409738541, |
| "num_tokens": 367079142.0, |
| "step": 12100 |
| }, |
| { |
| "entropy": 0.4450415739417076, |
| "epoch": 2.267214032468744, |
| "grad_norm": 0.8046875, |
| "learning_rate": 1.8554963343208748e-05, |
| "loss": 0.508, |
| "mean_token_accuracy": 0.7741218858957291, |
| "num_tokens": 368613238.0, |
| "step": 12150 |
| }, |
| { |
| "entropy": 0.4465838612616062, |
| "epoch": 2.2765441313677925, |
| "grad_norm": 0.578125, |
| "learning_rate": 1.8476416762757647e-05, |
| "loss": 0.5025, |
| "mean_token_accuracy": 0.7775521212816239, |
| "num_tokens": 370170401.0, |
| "step": 12200 |
| }, |
| { |
| "entropy": 0.44225269854068755, |
| "epoch": 2.285874230266841, |
| "grad_norm": 0.57421875, |
| "learning_rate": 1.8397801750630147e-05, |
| "loss": 0.496, |
| "mean_token_accuracy": 0.7806216114759446, |
| "num_tokens": 371657034.0, |
| "step": 12250 |
| }, |
| { |
| "entropy": 0.4700926415622234, |
| "epoch": 2.2952043291658892, |
| "grad_norm": 0.55078125, |
| "learning_rate": 1.831912102880127e-05, |
| "loss": 0.5264, |
| "mean_token_accuracy": 0.7698476028442383, |
| "num_tokens": 373254414.0, |
| "step": 12300 |
| }, |
| { |
| "entropy": 0.4466784715652466, |
| "epoch": 2.3045344280649376, |
| "grad_norm": 1.1953125, |
| "learning_rate": 1.8240377321521187e-05, |
| "loss": 0.5059, |
| "mean_token_accuracy": 0.775156998038292, |
| "num_tokens": 374801131.0, |
| "step": 12350 |
| }, |
| { |
| "entropy": 0.4366114428639412, |
| "epoch": 2.313864526963986, |
| "grad_norm": 0.43359375, |
| "learning_rate": 1.816157335522088e-05, |
| "loss": 0.4963, |
| "mean_token_accuracy": 0.7822502106428146, |
| "num_tokens": 376328664.0, |
| "step": 12400 |
| }, |
| { |
| "entropy": 0.4395353105664253, |
| "epoch": 2.3231946258630343, |
| "grad_norm": 2.25, |
| "learning_rate": 1.808271185841774e-05, |
| "loss": 0.499, |
| "mean_token_accuracy": 0.7797509133815765, |
| "num_tokens": 377858310.0, |
| "step": 12450 |
| }, |
| { |
| "entropy": 0.42968779906630516, |
| "epoch": 2.3325247247620826, |
| "grad_norm": 1.1015625, |
| "learning_rate": 1.8003795561621118e-05, |
| "loss": 0.4905, |
| "mean_token_accuracy": 0.7836663633584976, |
| "num_tokens": 379330727.0, |
| "step": 12500 |
| }, |
| { |
| "entropy": 0.4358790573477745, |
| "epoch": 2.341854823661131, |
| "grad_norm": 0.95703125, |
| "learning_rate": 1.792482719723774e-05, |
| "loss": 0.4952, |
| "mean_token_accuracy": 0.7849816447496414, |
| "num_tokens": 380789154.0, |
| "step": 12550 |
| }, |
| { |
| "entropy": 0.4382615853846073, |
| "epoch": 2.3511849225601793, |
| "grad_norm": 0.71484375, |
| "learning_rate": 1.7845809499477147e-05, |
| "loss": 0.5003, |
| "mean_token_accuracy": 0.777313020825386, |
| "num_tokens": 382304834.0, |
| "step": 12600 |
| }, |
| { |
| "entropy": 0.4590240094065666, |
| "epoch": 2.3605150214592276, |
| "grad_norm": 0.7109375, |
| "learning_rate": 1.7766745204257005e-05, |
| "loss": 0.5087, |
| "mean_token_accuracy": 0.7742151153087616, |
| "num_tokens": 383864187.0, |
| "step": 12650 |
| }, |
| { |
| "entropy": 0.44439409762620924, |
| "epoch": 2.369845120358276, |
| "grad_norm": 0.92578125, |
| "learning_rate": 1.7687637049108356e-05, |
| "loss": 0.5074, |
| "mean_token_accuracy": 0.7769033217430115, |
| "num_tokens": 385371218.0, |
| "step": 12700 |
| }, |
| { |
| "entropy": 0.45312762558460234, |
| "epoch": 2.3791752192573243, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.7608487773080876e-05, |
| "loss": 0.5138, |
| "mean_token_accuracy": 0.7725106239318847, |
| "num_tokens": 386905152.0, |
| "step": 12750 |
| }, |
| { |
| "entropy": 0.4439894749224186, |
| "epoch": 2.3885053181563727, |
| "grad_norm": 0.62109375, |
| "learning_rate": 1.7529300116648006e-05, |
| "loss": 0.5065, |
| "mean_token_accuracy": 0.777021074295044, |
| "num_tokens": 388444369.0, |
| "step": 12800 |
| }, |
| { |
| "entropy": 0.42678302526474, |
| "epoch": 2.397835417055421, |
| "grad_norm": 1.5390625, |
| "learning_rate": 1.745007682161206e-05, |
| "loss": 0.4882, |
| "mean_token_accuracy": 0.7850719147920608, |
| "num_tokens": 389939944.0, |
| "step": 12850 |
| }, |
| { |
| "entropy": 0.43243088483810427, |
| "epoch": 2.407165515954469, |
| "grad_norm": 1.0625, |
| "learning_rate": 1.7370820631009338e-05, |
| "loss": 0.4964, |
| "mean_token_accuracy": 0.7798051989078522, |
| "num_tokens": 391456345.0, |
| "step": 12900 |
| }, |
| { |
| "entropy": 0.4408321186900139, |
| "epoch": 2.4164956148535173, |
| "grad_norm": 0.88671875, |
| "learning_rate": 1.729153428901509e-05, |
| "loss": 0.5048, |
| "mean_token_accuracy": 0.7777973639965058, |
| "num_tokens": 392972606.0, |
| "step": 12950 |
| }, |
| { |
| "entropy": 0.42640509456396103, |
| "epoch": 2.4258257137525656, |
| "grad_norm": 0.54296875, |
| "learning_rate": 1.721222054084855e-05, |
| "loss": 0.489, |
| "mean_token_accuracy": 0.7861467552185059, |
| "num_tokens": 394419739.0, |
| "step": 13000 |
| }, |
| { |
| "epoch": 2.4258257137525656, |
| "eval_entropy": 0.45086843404201193, |
| "eval_loss": 0.5095834136009216, |
| "eval_mean_token_accuracy": 0.7765465198306862, |
| "eval_num_tokens": 394419739.0, |
| "eval_runtime": 16.3, |
| "eval_samples_per_second": 53.19, |
| "eval_steps_per_second": 6.687, |
| "step": 13000 |
| }, |
| { |
| "entropy": 0.4339776523411274, |
| "epoch": 2.435155812651614, |
| "grad_norm": 1.1171875, |
| "learning_rate": 1.7132882132677856e-05, |
| "loss": 0.4952, |
| "mean_token_accuracy": 0.7817716175317764, |
| "num_tokens": 395915283.0, |
| "step": 13050 |
| }, |
| { |
| "entropy": 0.43754623234272005, |
| "epoch": 2.4444859115506623, |
| "grad_norm": 0.447265625, |
| "learning_rate": 1.7053521811524983e-05, |
| "loss": 0.5022, |
| "mean_token_accuracy": 0.7792576867341995, |
| "num_tokens": 397406785.0, |
| "step": 13100 |
| }, |
| { |
| "entropy": 0.4319652807712555, |
| "epoch": 2.4538160104497106, |
| "grad_norm": 0.423828125, |
| "learning_rate": 1.6974142325170614e-05, |
| "loss": 0.4932, |
| "mean_token_accuracy": 0.780916188955307, |
| "num_tokens": 398889013.0, |
| "step": 13150 |
| }, |
| { |
| "entropy": 0.42943828999996186, |
| "epoch": 2.463146109348759, |
| "grad_norm": 1.703125, |
| "learning_rate": 1.6894746422059023e-05, |
| "loss": 0.4874, |
| "mean_token_accuracy": 0.7860763943195344, |
| "num_tokens": 400360522.0, |
| "step": 13200 |
| }, |
| { |
| "entropy": 0.4486216183006764, |
| "epoch": 2.4724762082478073, |
| "grad_norm": 0.55859375, |
| "learning_rate": 1.6815336851202897e-05, |
| "loss": 0.503, |
| "mean_token_accuracy": 0.7766870594024659, |
| "num_tokens": 401889271.0, |
| "step": 13250 |
| }, |
| { |
| "entropy": 0.4432876881957054, |
| "epoch": 2.4818063071468557, |
| "grad_norm": 0.73046875, |
| "learning_rate": 1.6735916362088154e-05, |
| "loss": 0.4969, |
| "mean_token_accuracy": 0.7788010305166244, |
| "num_tokens": 403427748.0, |
| "step": 13300 |
| }, |
| { |
| "entropy": 0.44121997892856596, |
| "epoch": 2.491136406045904, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.6656487704578733e-05, |
| "loss": 0.4999, |
| "mean_token_accuracy": 0.7776626753807068, |
| "num_tokens": 404954195.0, |
| "step": 13350 |
| }, |
| { |
| "entropy": 0.433184619396925, |
| "epoch": 2.5004665049449524, |
| "grad_norm": 0.91015625, |
| "learning_rate": 1.6577053628821423e-05, |
| "loss": 0.4912, |
| "mean_token_accuracy": 0.7811095035076141, |
| "num_tokens": 406493981.0, |
| "step": 13400 |
| }, |
| { |
| "entropy": 0.42312393710017204, |
| "epoch": 2.5097966038440007, |
| "grad_norm": 0.73046875, |
| "learning_rate": 1.6497616885150602e-05, |
| "loss": 0.4876, |
| "mean_token_accuracy": 0.7851846623420715, |
| "num_tokens": 407934358.0, |
| "step": 13450 |
| }, |
| { |
| "entropy": 0.4381246021389961, |
| "epoch": 2.519126702743049, |
| "grad_norm": 0.55078125, |
| "learning_rate": 1.6418180223993015e-05, |
| "loss": 0.4935, |
| "mean_token_accuracy": 0.7794178009033204, |
| "num_tokens": 409440914.0, |
| "step": 13500 |
| }, |
| { |
| "entropy": 0.44361667945981026, |
| "epoch": 2.5284568016420974, |
| "grad_norm": 0.69140625, |
| "learning_rate": 1.6338746395772556e-05, |
| "loss": 0.4969, |
| "mean_token_accuracy": 0.7793952637910843, |
| "num_tokens": 410989159.0, |
| "step": 13550 |
| }, |
| { |
| "entropy": 0.43151887714862824, |
| "epoch": 2.5377869005411458, |
| "grad_norm": 0.94140625, |
| "learning_rate": 1.625931815081504e-05, |
| "loss": 0.4952, |
| "mean_token_accuracy": 0.7826283901929856, |
| "num_tokens": 412472158.0, |
| "step": 13600 |
| }, |
| { |
| "entropy": 0.44044260889291764, |
| "epoch": 2.547116999440194, |
| "grad_norm": 0.71875, |
| "learning_rate": 1.6179898239252952e-05, |
| "loss": 0.4989, |
| "mean_token_accuracy": 0.7796867018938065, |
| "num_tokens": 414005272.0, |
| "step": 13650 |
| }, |
| { |
| "entropy": 0.43727659299969673, |
| "epoch": 2.5564470983392424, |
| "grad_norm": 0.5390625, |
| "learning_rate": 1.6100489410930248e-05, |
| "loss": 0.4983, |
| "mean_token_accuracy": 0.7779977285861969, |
| "num_tokens": 415515917.0, |
| "step": 13700 |
| }, |
| { |
| "entropy": 0.4339106129109859, |
| "epoch": 2.565777197238291, |
| "grad_norm": 1.21875, |
| "learning_rate": 1.602109441530714e-05, |
| "loss": 0.5021, |
| "mean_token_accuracy": 0.7849010616540909, |
| "num_tokens": 416950426.0, |
| "step": 13750 |
| }, |
| { |
| "entropy": 0.4218116353452206, |
| "epoch": 2.575107296137339, |
| "grad_norm": 0.82421875, |
| "learning_rate": 1.5941716001364893e-05, |
| "loss": 0.4868, |
| "mean_token_accuracy": 0.7843046194314957, |
| "num_tokens": 418436936.0, |
| "step": 13800 |
| }, |
| { |
| "entropy": 0.42178965732455254, |
| "epoch": 2.5844373950363875, |
| "grad_norm": 0.73046875, |
| "learning_rate": 1.5862356917510624e-05, |
| "loss": 0.4819, |
| "mean_token_accuracy": 0.7881238484382629, |
| "num_tokens": 419883666.0, |
| "step": 13850 |
| }, |
| { |
| "entropy": 0.43385469675064087, |
| "epoch": 2.593767493935436, |
| "grad_norm": 0.88671875, |
| "learning_rate": 1.5783019911482207e-05, |
| "loss": 0.4999, |
| "mean_token_accuracy": 0.7784091866016388, |
| "num_tokens": 421397686.0, |
| "step": 13900 |
| }, |
| { |
| "entropy": 0.4416196349263191, |
| "epoch": 2.603097592834484, |
| "grad_norm": 1.0078125, |
| "learning_rate": 1.570370773025306e-05, |
| "loss": 0.5006, |
| "mean_token_accuracy": 0.7770800250768661, |
| "num_tokens": 422952565.0, |
| "step": 13950 |
| }, |
| { |
| "entropy": 0.4428327572345734, |
| "epoch": 2.6124276917335325, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.5624423119937062e-05, |
| "loss": 0.4991, |
| "mean_token_accuracy": 0.7780466437339782, |
| "num_tokens": 424490641.0, |
| "step": 14000 |
| }, |
| { |
| "epoch": 2.6124276917335325, |
| "eval_entropy": 0.4452941595414363, |
| "eval_loss": 0.507188618183136, |
| "eval_mean_token_accuracy": 0.7770852725440209, |
| "eval_num_tokens": 424490641.0, |
| "eval_runtime": 16.1783, |
| "eval_samples_per_second": 53.59, |
| "eval_steps_per_second": 6.737, |
| "step": 14000 |
| }, |
| { |
| "entropy": 0.4200237849354744, |
| "epoch": 2.621757790632581, |
| "grad_norm": 0.48046875, |
| "learning_rate": 1.554516882569349e-05, |
| "loss": 0.4903, |
| "mean_token_accuracy": 0.787417265176773, |
| "num_tokens": 425904302.0, |
| "step": 14050 |
| }, |
| { |
| "entropy": 0.4459881857037544, |
| "epoch": 2.631087889531629, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.5465947591631947e-05, |
| "loss": 0.5028, |
| "mean_token_accuracy": 0.7770888382196426, |
| "num_tokens": 427427333.0, |
| "step": 14100 |
| }, |
| { |
| "entropy": 0.44237791940569876, |
| "epoch": 2.640417988430677, |
| "grad_norm": 0.515625, |
| "learning_rate": 1.5386762160717355e-05, |
| "loss": 0.5098, |
| "mean_token_accuracy": 0.779816085100174, |
| "num_tokens": 428936850.0, |
| "step": 14150 |
| }, |
| { |
| "entropy": 0.4365511977672577, |
| "epoch": 2.6497480873297254, |
| "grad_norm": 0.6796875, |
| "learning_rate": 1.5307615274674984e-05, |
| "loss": 0.5003, |
| "mean_token_accuracy": 0.7807082986831665, |
| "num_tokens": 430411091.0, |
| "step": 14200 |
| }, |
| { |
| "entropy": 0.45521570563316344, |
| "epoch": 2.659078186228774, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.522850967389552e-05, |
| "loss": 0.5169, |
| "mean_token_accuracy": 0.774719997048378, |
| "num_tokens": 431926456.0, |
| "step": 14250 |
| }, |
| { |
| "entropy": 0.43436133086681367, |
| "epoch": 2.668408285127822, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.5149448097340193e-05, |
| "loss": 0.4954, |
| "mean_token_accuracy": 0.7820281451940536, |
| "num_tokens": 433433882.0, |
| "step": 14300 |
| }, |
| { |
| "entropy": 0.4265731783211231, |
| "epoch": 2.6777383840268705, |
| "grad_norm": 0.75, |
| "learning_rate": 1.5070433282445917e-05, |
| "loss": 0.4952, |
| "mean_token_accuracy": 0.7826755654811859, |
| "num_tokens": 434847537.0, |
| "step": 14350 |
| }, |
| { |
| "entropy": 0.4315530589222908, |
| "epoch": 2.687068482925919, |
| "grad_norm": 0.92578125, |
| "learning_rate": 1.4991467965030544e-05, |
| "loss": 0.5031, |
| "mean_token_accuracy": 0.7789240056276321, |
| "num_tokens": 436306220.0, |
| "step": 14400 |
| }, |
| { |
| "entropy": 0.4366298992931843, |
| "epoch": 2.696398581824967, |
| "grad_norm": 0.75390625, |
| "learning_rate": 1.4912554879198106e-05, |
| "loss": 0.4974, |
| "mean_token_accuracy": 0.7800502121448517, |
| "num_tokens": 437854187.0, |
| "step": 14450 |
| }, |
| { |
| "entropy": 0.429113384783268, |
| "epoch": 2.7057286807240155, |
| "grad_norm": 0.94921875, |
| "learning_rate": 1.4833696757244162e-05, |
| "loss": 0.496, |
| "mean_token_accuracy": 0.7821574300527573, |
| "num_tokens": 439322082.0, |
| "step": 14500 |
| }, |
| { |
| "entropy": 0.45050988361239436, |
| "epoch": 2.715058779623064, |
| "grad_norm": 0.765625, |
| "learning_rate": 1.475489632956121e-05, |
| "loss": 0.5181, |
| "mean_token_accuracy": 0.7749482518434525, |
| "num_tokens": 440931906.0, |
| "step": 14550 |
| }, |
| { |
| "entropy": 0.45400950565934184, |
| "epoch": 2.724388878522112, |
| "grad_norm": 0.77734375, |
| "learning_rate": 1.4676156324544123e-05, |
| "loss": 0.5091, |
| "mean_token_accuracy": 0.7732315069437027, |
| "num_tokens": 442503054.0, |
| "step": 14600 |
| }, |
| { |
| "entropy": 0.42937296599149705, |
| "epoch": 2.7337189774211605, |
| "grad_norm": 0.68359375, |
| "learning_rate": 1.4597479468495688e-05, |
| "loss": 0.4923, |
| "mean_token_accuracy": 0.7824376839399337, |
| "num_tokens": 443993311.0, |
| "step": 14650 |
| }, |
| { |
| "entropy": 0.44196070849895475, |
| "epoch": 2.743049076320209, |
| "grad_norm": 0.45703125, |
| "learning_rate": 1.4518868485532235e-05, |
| "loss": 0.5056, |
| "mean_token_accuracy": 0.7760562229156495, |
| "num_tokens": 445522017.0, |
| "step": 14700 |
| }, |
| { |
| "entropy": 0.44267056584358216, |
| "epoch": 2.7523791752192572, |
| "grad_norm": 1.0703125, |
| "learning_rate": 1.4440326097489263e-05, |
| "loss": 0.4978, |
| "mean_token_accuracy": 0.7780000925064087, |
| "num_tokens": 447063778.0, |
| "step": 14750 |
| }, |
| { |
| "entropy": 0.4377539825439453, |
| "epoch": 2.7617092741183056, |
| "grad_norm": 0.73828125, |
| "learning_rate": 1.436185502382728e-05, |
| "loss": 0.494, |
| "mean_token_accuracy": 0.7799159944057464, |
| "num_tokens": 448574576.0, |
| "step": 14800 |
| }, |
| { |
| "entropy": 0.43343299850821493, |
| "epoch": 2.771039373017354, |
| "grad_norm": 0.71484375, |
| "learning_rate": 1.428345798153756e-05, |
| "loss": 0.4936, |
| "mean_token_accuracy": 0.7831123304367066, |
| "num_tokens": 450066594.0, |
| "step": 14850 |
| }, |
| { |
| "entropy": 0.430605805516243, |
| "epoch": 2.7803694719164023, |
| "grad_norm": 0.3046875, |
| "learning_rate": 1.4205137685048111e-05, |
| "loss": 0.495, |
| "mean_token_accuracy": 0.7821512734889984, |
| "num_tokens": 451568798.0, |
| "step": 14900 |
| }, |
| { |
| "entropy": 0.43412218809127806, |
| "epoch": 2.7896995708154506, |
| "grad_norm": 1.4140625, |
| "learning_rate": 1.4126896846129701e-05, |
| "loss": 0.4975, |
| "mean_token_accuracy": 0.7786238652467727, |
| "num_tokens": 453062565.0, |
| "step": 14950 |
| }, |
| { |
| "entropy": 0.4382785783708096, |
| "epoch": 2.799029669714499, |
| "grad_norm": 0.828125, |
| "learning_rate": 1.4048738173801939e-05, |
| "loss": 0.4948, |
| "mean_token_accuracy": 0.7823023611307144, |
| "num_tokens": 454574611.0, |
| "step": 15000 |
| }, |
| { |
| "epoch": 2.799029669714499, |
| "eval_entropy": 0.44517116043545785, |
| "eval_loss": 0.5036894679069519, |
| "eval_mean_token_accuracy": 0.7785478141329704, |
| "eval_num_tokens": 454574611.0, |
| "eval_runtime": 16.6862, |
| "eval_samples_per_second": 51.959, |
| "eval_steps_per_second": 6.532, |
| "step": 15000 |
| }, |
| { |
| "entropy": 0.42489848256111146, |
| "epoch": 2.8083597686135473, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.3970664374239483e-05, |
| "loss": 0.495, |
| "mean_token_accuracy": 0.7826015204191208, |
| "num_tokens": 456088007.0, |
| "step": 15050 |
| }, |
| { |
| "entropy": 0.4351053491234779, |
| "epoch": 2.8176898675125956, |
| "grad_norm": 0.62890625, |
| "learning_rate": 1.3892678150678369e-05, |
| "loss": 0.4909, |
| "mean_token_accuracy": 0.7825271499156952, |
| "num_tokens": 457634148.0, |
| "step": 15100 |
| }, |
| { |
| "entropy": 0.4429400071501732, |
| "epoch": 2.827019966411644, |
| "grad_norm": 0.7265625, |
| "learning_rate": 1.3814782203322367e-05, |
| "loss": 0.5001, |
| "mean_token_accuracy": 0.7786577945947647, |
| "num_tokens": 459151611.0, |
| "step": 15150 |
| }, |
| { |
| "entropy": 0.4451659436523914, |
| "epoch": 2.8363500653106923, |
| "grad_norm": 0.30078125, |
| "learning_rate": 1.3736979229249526e-05, |
| "loss": 0.5041, |
| "mean_token_accuracy": 0.7781134587526322, |
| "num_tokens": 460711762.0, |
| "step": 15200 |
| }, |
| { |
| "entropy": 0.4428605942428112, |
| "epoch": 2.8456801642097407, |
| "grad_norm": 0.5625, |
| "learning_rate": 1.3659271922318776e-05, |
| "loss": 0.5088, |
| "mean_token_accuracy": 0.7755980342626572, |
| "num_tokens": 462239645.0, |
| "step": 15250 |
| }, |
| { |
| "entropy": 0.44321066468954085, |
| "epoch": 2.855010263108789, |
| "grad_norm": 1.90625, |
| "learning_rate": 1.3581662973076661e-05, |
| "loss": 0.5057, |
| "mean_token_accuracy": 0.7742398703098297, |
| "num_tokens": 463780352.0, |
| "step": 15300 |
| }, |
| { |
| "entropy": 0.43519594669342043, |
| "epoch": 2.8643403620078374, |
| "grad_norm": 0.8359375, |
| "learning_rate": 1.3504155068664164e-05, |
| "loss": 0.4946, |
| "mean_token_accuracy": 0.7806341469287872, |
| "num_tokens": 465306354.0, |
| "step": 15350 |
| }, |
| { |
| "entropy": 0.452918638586998, |
| "epoch": 2.8736704609068857, |
| "grad_norm": 0.625, |
| "learning_rate": 1.3426750892723697e-05, |
| "loss": 0.5097, |
| "mean_token_accuracy": 0.7724657821655273, |
| "num_tokens": 466885785.0, |
| "step": 15400 |
| }, |
| { |
| "entropy": 0.43585652247071266, |
| "epoch": 2.883000559805934, |
| "grad_norm": 0.57421875, |
| "learning_rate": 1.3349453125306156e-05, |
| "loss": 0.4973, |
| "mean_token_accuracy": 0.7797726893424988, |
| "num_tokens": 468383581.0, |
| "step": 15450 |
| }, |
| { |
| "entropy": 0.43165954887866975, |
| "epoch": 2.8923306587049824, |
| "grad_norm": 0.68359375, |
| "learning_rate": 1.3272264442778136e-05, |
| "loss": 0.4937, |
| "mean_token_accuracy": 0.7796162796020508, |
| "num_tokens": 469917430.0, |
| "step": 15500 |
| }, |
| { |
| "entropy": 0.4401167546212673, |
| "epoch": 2.9016607576040307, |
| "grad_norm": 0.66015625, |
| "learning_rate": 1.319518751772927e-05, |
| "loss": 0.5006, |
| "mean_token_accuracy": 0.7786802816390991, |
| "num_tokens": 471424114.0, |
| "step": 15550 |
| }, |
| { |
| "entropy": 0.45204014524817465, |
| "epoch": 2.910990856503079, |
| "grad_norm": 1.28125, |
| "learning_rate": 1.3118225018879684e-05, |
| "loss": 0.5071, |
| "mean_token_accuracy": 0.7716954737901688, |
| "num_tokens": 473004348.0, |
| "step": 15600 |
| }, |
| { |
| "entropy": 0.43205771446228025, |
| "epoch": 2.9203209554021274, |
| "grad_norm": 1.6484375, |
| "learning_rate": 1.3041379610987594e-05, |
| "loss": 0.4982, |
| "mean_token_accuracy": 0.7792713183164597, |
| "num_tokens": 474501801.0, |
| "step": 15650 |
| }, |
| { |
| "entropy": 0.43552364617586137, |
| "epoch": 2.929651054301176, |
| "grad_norm": 0.79296875, |
| "learning_rate": 1.2964653954757057e-05, |
| "loss": 0.4991, |
| "mean_token_accuracy": 0.7788824599981308, |
| "num_tokens": 476043852.0, |
| "step": 15700 |
| }, |
| { |
| "entropy": 0.44703464940190313, |
| "epoch": 2.938981153200224, |
| "grad_norm": 0.490234375, |
| "learning_rate": 1.2888050706745822e-05, |
| "loss": 0.5064, |
| "mean_token_accuracy": 0.7750799888372422, |
| "num_tokens": 477598151.0, |
| "step": 15750 |
| }, |
| { |
| "entropy": 0.43971230536699296, |
| "epoch": 2.9483112520992725, |
| "grad_norm": 0.67578125, |
| "learning_rate": 1.2811572519273378e-05, |
| "loss": 0.5041, |
| "mean_token_accuracy": 0.7766543072462082, |
| "num_tokens": 479148774.0, |
| "step": 15800 |
| }, |
| { |
| "entropy": 0.447616363465786, |
| "epoch": 2.957641350998321, |
| "grad_norm": 1.3046875, |
| "learning_rate": 1.2735222040329087e-05, |
| "loss": 0.5084, |
| "mean_token_accuracy": 0.7735405403375626, |
| "num_tokens": 480694879.0, |
| "step": 15850 |
| }, |
| { |
| "entropy": 0.4212371516227722, |
| "epoch": 2.966971449897369, |
| "grad_norm": 2.265625, |
| "learning_rate": 1.2659001913480522e-05, |
| "loss": 0.4861, |
| "mean_token_accuracy": 0.785834304690361, |
| "num_tokens": 482165709.0, |
| "step": 15900 |
| }, |
| { |
| "entropy": 0.41868353605270386, |
| "epoch": 2.9763015487964175, |
| "grad_norm": 1.234375, |
| "learning_rate": 1.2582914777781937e-05, |
| "loss": 0.4843, |
| "mean_token_accuracy": 0.7870692014694214, |
| "num_tokens": 483643611.0, |
| "step": 15950 |
| }, |
| { |
| "entropy": 0.44808956772089004, |
| "epoch": 2.9856316476954654, |
| "grad_norm": 0.65234375, |
| "learning_rate": 1.2506963267682884e-05, |
| "loss": 0.5052, |
| "mean_token_accuracy": 0.7751398229598999, |
| "num_tokens": 485205508.0, |
| "step": 16000 |
| }, |
| { |
| "epoch": 2.9856316476954654, |
| "eval_entropy": 0.44267906945779784, |
| "eval_loss": 0.501590371131897, |
| "eval_mean_token_accuracy": 0.7786080602112166, |
| "eval_num_tokens": 485205508.0, |
| "eval_runtime": 16.2258, |
| "eval_samples_per_second": 53.433, |
| "eval_steps_per_second": 6.718, |
| "step": 16000 |
| }, |
| { |
| "entropy": 0.4390178045630455, |
| "epoch": 2.9949617465945138, |
| "grad_norm": 1.8515625, |
| "learning_rate": 1.2431150012936982e-05, |
| "loss": 0.5016, |
| "mean_token_accuracy": 0.7765721970796585, |
| "num_tokens": 486717739.0, |
| "step": 16050 |
| }, |
| { |
| "entropy": 0.43280943170189856, |
| "epoch": 3.004291845493562, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.2355477638510904e-05, |
| "loss": 0.5016, |
| "mean_token_accuracy": 0.7793767899274826, |
| "num_tokens": 488220825.0, |
| "step": 16100 |
| }, |
| { |
| "entropy": 0.41857230544090274, |
| "epoch": 3.0136219443926104, |
| "grad_norm": 0.96875, |
| "learning_rate": 1.2279948764493463e-05, |
| "loss": 0.4857, |
| "mean_token_accuracy": 0.7855171990394593, |
| "num_tokens": 489714834.0, |
| "step": 16150 |
| }, |
| { |
| "entropy": 0.4343470679223537, |
| "epoch": 3.022952043291659, |
| "grad_norm": 0.57421875, |
| "learning_rate": 1.220456600600488e-05, |
| "loss": 0.4952, |
| "mean_token_accuracy": 0.7775657117366791, |
| "num_tokens": 491256234.0, |
| "step": 16200 |
| }, |
| { |
| "entropy": 0.42674369990825656, |
| "epoch": 3.032282142190707, |
| "grad_norm": 0.76171875, |
| "learning_rate": 1.2129331973106275e-05, |
| "loss": 0.4875, |
| "mean_token_accuracy": 0.7840164464712143, |
| "num_tokens": 492813353.0, |
| "step": 16250 |
| }, |
| { |
| "entropy": 0.42470823049545287, |
| "epoch": 3.0416122410897555, |
| "grad_norm": 1.078125, |
| "learning_rate": 1.2054249270709271e-05, |
| "loss": 0.4898, |
| "mean_token_accuracy": 0.7800269144773483, |
| "num_tokens": 494302175.0, |
| "step": 16300 |
| }, |
| { |
| "entropy": 0.42407613903284075, |
| "epoch": 3.050942339988804, |
| "grad_norm": 0.7109375, |
| "learning_rate": 1.1979320498485797e-05, |
| "loss": 0.4849, |
| "mean_token_accuracy": 0.782460133433342, |
| "num_tokens": 495800714.0, |
| "step": 16350 |
| }, |
| { |
| "entropy": 0.4180370423197746, |
| "epoch": 3.060272438887852, |
| "grad_norm": 1.109375, |
| "learning_rate": 1.1904548250778101e-05, |
| "loss": 0.4882, |
| "mean_token_accuracy": 0.7806040924787522, |
| "num_tokens": 497321561.0, |
| "step": 16400 |
| }, |
| { |
| "entropy": 0.42719221144914626, |
| "epoch": 3.0696025377869005, |
| "grad_norm": 0.66015625, |
| "learning_rate": 1.1829935116508903e-05, |
| "loss": 0.4905, |
| "mean_token_accuracy": 0.7813178312778473, |
| "num_tokens": 498854626.0, |
| "step": 16450 |
| }, |
| { |
| "entropy": 0.4178078393638134, |
| "epoch": 3.078932636685949, |
| "grad_norm": 0.76953125, |
| "learning_rate": 1.175548367909175e-05, |
| "loss": 0.485, |
| "mean_token_accuracy": 0.783756075501442, |
| "num_tokens": 500380642.0, |
| "step": 16500 |
| }, |
| { |
| "entropy": 0.4012910355627537, |
| "epoch": 3.088262735584997, |
| "grad_norm": 1.1328125, |
| "learning_rate": 1.1681196516341603e-05, |
| "loss": 0.4694, |
| "mean_token_accuracy": 0.7900429528951645, |
| "num_tokens": 501885081.0, |
| "step": 16550 |
| }, |
| { |
| "entropy": 0.41146292686462405, |
| "epoch": 3.0975928344840455, |
| "grad_norm": 0.46484375, |
| "learning_rate": 1.1607076200385529e-05, |
| "loss": 0.4784, |
| "mean_token_accuracy": 0.7883473831415176, |
| "num_tokens": 503332757.0, |
| "step": 16600 |
| }, |
| { |
| "entropy": 0.4057569517195225, |
| "epoch": 3.106922933383094, |
| "grad_norm": 0.326171875, |
| "learning_rate": 1.1533125297573703e-05, |
| "loss": 0.4788, |
| "mean_token_accuracy": 0.788205589056015, |
| "num_tokens": 504782228.0, |
| "step": 16650 |
| }, |
| { |
| "entropy": 0.41990657716989516, |
| "epoch": 3.1162530322821422, |
| "grad_norm": 0.57421875, |
| "learning_rate": 1.1459346368390504e-05, |
| "loss": 0.4849, |
| "mean_token_accuracy": 0.782880739569664, |
| "num_tokens": 506282342.0, |
| "step": 16700 |
| }, |
| { |
| "entropy": 0.4344457286596298, |
| "epoch": 3.1255831311811906, |
| "grad_norm": 0.412109375, |
| "learning_rate": 1.1385741967365869e-05, |
| "loss": 0.492, |
| "mean_token_accuracy": 0.779918566942215, |
| "num_tokens": 507797216.0, |
| "step": 16750 |
| }, |
| { |
| "entropy": 0.425162510573864, |
| "epoch": 3.134913230080239, |
| "grad_norm": 0.6640625, |
| "learning_rate": 1.131231464298687e-05, |
| "loss": 0.4889, |
| "mean_token_accuracy": 0.7809261924028397, |
| "num_tokens": 509307379.0, |
| "step": 16800 |
| }, |
| { |
| "entropy": 0.4163677006959915, |
| "epoch": 3.1442433289792873, |
| "grad_norm": 0.96484375, |
| "learning_rate": 1.1239066937609447e-05, |
| "loss": 0.4903, |
| "mean_token_accuracy": 0.7836281234025955, |
| "num_tokens": 510827157.0, |
| "step": 16850 |
| }, |
| { |
| "entropy": 0.41290561139583587, |
| "epoch": 3.1535734278783356, |
| "grad_norm": 0.2890625, |
| "learning_rate": 1.1166001387370388e-05, |
| "loss": 0.4915, |
| "mean_token_accuracy": 0.7884875816106797, |
| "num_tokens": 512261808.0, |
| "step": 16900 |
| }, |
| { |
| "entropy": 0.41839638456702233, |
| "epoch": 3.162903526777384, |
| "grad_norm": 0.48046875, |
| "learning_rate": 1.1093120522099535e-05, |
| "loss": 0.4855, |
| "mean_token_accuracy": 0.7832108342647552, |
| "num_tokens": 513784693.0, |
| "step": 16950 |
| }, |
| { |
| "entropy": 0.4217331621050835, |
| "epoch": 3.1722336256764323, |
| "grad_norm": 1.0234375, |
| "learning_rate": 1.1020426865232167e-05, |
| "loss": 0.4965, |
| "mean_token_accuracy": 0.7817919147014618, |
| "num_tokens": 515320766.0, |
| "step": 17000 |
| }, |
| { |
| "epoch": 3.1722336256764323, |
| "eval_entropy": 0.4328604711031695, |
| "eval_loss": 0.501544177532196, |
| "eval_mean_token_accuracy": 0.7793953303897053, |
| "eval_num_tokens": 515320766.0, |
| "eval_runtime": 16.0856, |
| "eval_samples_per_second": 53.899, |
| "eval_steps_per_second": 6.776, |
| "step": 17000 |
| }, |
| { |
| "entropy": 0.4205896918475628, |
| "epoch": 3.1815637245754806, |
| "grad_norm": 1.140625, |
| "learning_rate": 1.0947922933721634e-05, |
| "loss": 0.4908, |
| "mean_token_accuracy": 0.7806319332122803, |
| "num_tokens": 516850357.0, |
| "step": 17050 |
| }, |
| { |
| "entropy": 0.4369584143161774, |
| "epoch": 3.190893823474529, |
| "grad_norm": 0.6796875, |
| "learning_rate": 1.0875611237952227e-05, |
| "loss": 0.5028, |
| "mean_token_accuracy": 0.7777555876970291, |
| "num_tokens": 518426223.0, |
| "step": 17100 |
| }, |
| { |
| "entropy": 0.42318257100880147, |
| "epoch": 3.2002239223735773, |
| "grad_norm": 0.578125, |
| "learning_rate": 1.0803494281652234e-05, |
| "loss": 0.4893, |
| "mean_token_accuracy": 0.7821793848276138, |
| "num_tokens": 519954467.0, |
| "step": 17150 |
| }, |
| { |
| "entropy": 0.43405372768640516, |
| "epoch": 3.2095540212726257, |
| "grad_norm": 1.1484375, |
| "learning_rate": 1.0731574561807265e-05, |
| "loss": 0.4994, |
| "mean_token_accuracy": 0.7766797173023224, |
| "num_tokens": 521503586.0, |
| "step": 17200 |
| }, |
| { |
| "entropy": 0.42191226355731487, |
| "epoch": 3.218884120171674, |
| "grad_norm": 0.66796875, |
| "learning_rate": 1.0659854568573787e-05, |
| "loss": 0.4846, |
| "mean_token_accuracy": 0.7843712168931961, |
| "num_tokens": 523039085.0, |
| "step": 17250 |
| }, |
| { |
| "entropy": 0.4242657233774662, |
| "epoch": 3.228214219070722, |
| "grad_norm": 0.47265625, |
| "learning_rate": 1.058833678519293e-05, |
| "loss": 0.4901, |
| "mean_token_accuracy": 0.7792167681455612, |
| "num_tokens": 524604685.0, |
| "step": 17300 |
| }, |
| { |
| "entropy": 0.40574381925165653, |
| "epoch": 3.2375443179697703, |
| "grad_norm": 0.76171875, |
| "learning_rate": 1.051702368790447e-05, |
| "loss": 0.4739, |
| "mean_token_accuracy": 0.789325880408287, |
| "num_tokens": 526100115.0, |
| "step": 17350 |
| }, |
| { |
| "entropy": 0.41887831434607503, |
| "epoch": 3.2468744168688186, |
| "grad_norm": 1.2265625, |
| "learning_rate": 1.0445917745861102e-05, |
| "loss": 0.4818, |
| "mean_token_accuracy": 0.7836150753498078, |
| "num_tokens": 527595996.0, |
| "step": 17400 |
| }, |
| { |
| "entropy": 0.40637139081954954, |
| "epoch": 3.256204515767867, |
| "grad_norm": 0.6015625, |
| "learning_rate": 1.0375021421042974e-05, |
| "loss": 0.4733, |
| "mean_token_accuracy": 0.7896361410617828, |
| "num_tokens": 529079715.0, |
| "step": 17450 |
| }, |
| { |
| "entropy": 0.4036496952176094, |
| "epoch": 3.2655346146669153, |
| "grad_norm": 0.298828125, |
| "learning_rate": 1.030433716817241e-05, |
| "loss": 0.4819, |
| "mean_token_accuracy": 0.7864921605587005, |
| "num_tokens": 530573725.0, |
| "step": 17500 |
| }, |
| { |
| "entropy": 0.41815487191081047, |
| "epoch": 3.2748647135659636, |
| "grad_norm": 0.90234375, |
| "learning_rate": 1.0233867434628938e-05, |
| "loss": 0.4813, |
| "mean_token_accuracy": 0.7873369532823563, |
| "num_tokens": 532067163.0, |
| "step": 17550 |
| }, |
| { |
| "entropy": 0.4044397334754467, |
| "epoch": 3.284194812465012, |
| "grad_norm": 1.2265625, |
| "learning_rate": 1.0163614660364547e-05, |
| "loss": 0.4802, |
| "mean_token_accuracy": 0.7873817622661591, |
| "num_tokens": 533512972.0, |
| "step": 17600 |
| }, |
| { |
| "entropy": 0.4253586496412754, |
| "epoch": 3.2935249113640603, |
| "grad_norm": 0.494140625, |
| "learning_rate": 1.0093581277819186e-05, |
| "loss": 0.4906, |
| "mean_token_accuracy": 0.7798972427845001, |
| "num_tokens": 535070119.0, |
| "step": 17650 |
| }, |
| { |
| "entropy": 0.41822995960712434, |
| "epoch": 3.3028550102631087, |
| "grad_norm": 0.8046875, |
| "learning_rate": 1.0023769711836586e-05, |
| "loss": 0.4878, |
| "mean_token_accuracy": 0.7849278378486634, |
| "num_tokens": 536578335.0, |
| "step": 17700 |
| }, |
| { |
| "entropy": 0.4179137668013573, |
| "epoch": 3.312185109162157, |
| "grad_norm": 0.953125, |
| "learning_rate": 9.95418237958026e-06, |
| "loss": 0.4858, |
| "mean_token_accuracy": 0.7825618571043015, |
| "num_tokens": 538111995.0, |
| "step": 17750 |
| }, |
| { |
| "entropy": 0.4254558201134205, |
| "epoch": 3.3215152080612054, |
| "grad_norm": 0.5625, |
| "learning_rate": 9.88482169044983e-06, |
| "loss": 0.488, |
| "mean_token_accuracy": 0.7804460400342941, |
| "num_tokens": 539663342.0, |
| "step": 17800 |
| }, |
| { |
| "entropy": 0.4253497390449047, |
| "epoch": 3.3308453069602537, |
| "grad_norm": 1.15625, |
| "learning_rate": 9.815690045997598e-06, |
| "loss": 0.4877, |
| "mean_token_accuracy": 0.7805328375101089, |
| "num_tokens": 541217582.0, |
| "step": 17850 |
| }, |
| { |
| "entropy": 0.41277687311172484, |
| "epoch": 3.340175405859302, |
| "grad_norm": 1.0625, |
| "learning_rate": 9.746789839845406e-06, |
| "loss": 0.4874, |
| "mean_token_accuracy": 0.7856322342157364, |
| "num_tokens": 542694785.0, |
| "step": 17900 |
| }, |
| { |
| "entropy": 0.4202821546792984, |
| "epoch": 3.3495055047583504, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.67812345760174e-06, |
| "loss": 0.4955, |
| "mean_token_accuracy": 0.7834526008367538, |
| "num_tokens": 544234574.0, |
| "step": 17950 |
| }, |
| { |
| "entropy": 0.41871184706687925, |
| "epoch": 3.3588356036573987, |
| "grad_norm": 0.50390625, |
| "learning_rate": 9.609693276779152e-06, |
| "loss": 0.4849, |
| "mean_token_accuracy": 0.7832159209251404, |
| "num_tokens": 545775120.0, |
| "step": 18000 |
| }, |
| { |
| "epoch": 3.3588356036573987, |
| "eval_entropy": 0.4325732376870759, |
| "eval_loss": 0.5006441473960876, |
| "eval_mean_token_accuracy": 0.7796490897826098, |
| "eval_num_tokens": 545775120.0, |
| "eval_runtime": 16.1616, |
| "eval_samples_per_second": 53.646, |
| "eval_steps_per_second": 6.744, |
| "step": 18000 |
| }, |
| { |
| "entropy": 0.42268205478787424, |
| "epoch": 3.368165702556447, |
| "grad_norm": 0.77734375, |
| "learning_rate": 9.541501666711921e-06, |
| "loss": 0.4778, |
| "mean_token_accuracy": 0.7864983838796615, |
| "num_tokens": 547311884.0, |
| "step": 18050 |
| }, |
| { |
| "entropy": 0.42853294894099236, |
| "epoch": 3.3774958014554954, |
| "grad_norm": 0.53515625, |
| "learning_rate": 9.473550988474026e-06, |
| "loss": 0.4904, |
| "mean_token_accuracy": 0.7811102610826492, |
| "num_tokens": 548872868.0, |
| "step": 18100 |
| }, |
| { |
| "entropy": 0.43325465768575666, |
| "epoch": 3.386825900354544, |
| "grad_norm": 0.890625, |
| "learning_rate": 9.4058435947974e-06, |
| "loss": 0.4935, |
| "mean_token_accuracy": 0.7787714445590973, |
| "num_tokens": 550487269.0, |
| "step": 18150 |
| }, |
| { |
| "entropy": 0.42910467088222504, |
| "epoch": 3.396155999253592, |
| "grad_norm": 0.87109375, |
| "learning_rate": 9.338381829990456e-06, |
| "loss": 0.4903, |
| "mean_token_accuracy": 0.7796713817119598, |
| "num_tokens": 552030450.0, |
| "step": 18200 |
| }, |
| { |
| "entropy": 0.41868083611130713, |
| "epoch": 3.4054860981526405, |
| "grad_norm": 0.388671875, |
| "learning_rate": 9.271168029856928e-06, |
| "loss": 0.4823, |
| "mean_token_accuracy": 0.7847259587049484, |
| "num_tokens": 553519745.0, |
| "step": 18250 |
| }, |
| { |
| "entropy": 0.41218777537345885, |
| "epoch": 3.414816197051689, |
| "grad_norm": 1.203125, |
| "learning_rate": 9.204204521615007e-06, |
| "loss": 0.4803, |
| "mean_token_accuracy": 0.7875041741132737, |
| "num_tokens": 555001060.0, |
| "step": 18300 |
| }, |
| { |
| "entropy": 0.4341864985227585, |
| "epoch": 3.424146295950737, |
| "grad_norm": 0.63671875, |
| "learning_rate": 9.13749362381673e-06, |
| "loss": 0.4935, |
| "mean_token_accuracy": 0.7791831189393997, |
| "num_tokens": 556547827.0, |
| "step": 18350 |
| }, |
| { |
| "entropy": 0.4298609687387943, |
| "epoch": 3.4334763948497855, |
| "grad_norm": 0.39453125, |
| "learning_rate": 9.07103764626773e-06, |
| "loss": 0.49, |
| "mean_token_accuracy": 0.7830371624231338, |
| "num_tokens": 558106220.0, |
| "step": 18400 |
| }, |
| { |
| "entropy": 0.4168567133694887, |
| "epoch": 3.442806493748834, |
| "grad_norm": 0.7265625, |
| "learning_rate": 9.00483888994725e-06, |
| "loss": 0.4787, |
| "mean_token_accuracy": 0.7888282573223114, |
| "num_tokens": 559579247.0, |
| "step": 18450 |
| }, |
| { |
| "entropy": 0.4177075420320034, |
| "epoch": 3.452136592647882, |
| "grad_norm": 0.5078125, |
| "learning_rate": 8.938899646928482e-06, |
| "loss": 0.4841, |
| "mean_token_accuracy": 0.7841418528556824, |
| "num_tokens": 561118173.0, |
| "step": 18500 |
| }, |
| { |
| "entropy": 0.42074175730347635, |
| "epoch": 3.4614666915469305, |
| "grad_norm": 1.390625, |
| "learning_rate": 8.8732222002992e-06, |
| "loss": 0.4943, |
| "mean_token_accuracy": 0.7784902250766754, |
| "num_tokens": 562667212.0, |
| "step": 18550 |
| }, |
| { |
| "entropy": 0.41027487240731714, |
| "epoch": 3.470796790445979, |
| "grad_norm": 0.80859375, |
| "learning_rate": 8.807808824082699e-06, |
| "loss": 0.479, |
| "mean_token_accuracy": 0.7876066911220551, |
| "num_tokens": 564131244.0, |
| "step": 18600 |
| }, |
| { |
| "entropy": 0.4196596726775169, |
| "epoch": 3.4801268893450272, |
| "grad_norm": 0.765625, |
| "learning_rate": 8.742661783159075e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.7828355920314789, |
| "num_tokens": 565650541.0, |
| "step": 18650 |
| }, |
| { |
| "entropy": 0.42029051125049594, |
| "epoch": 3.4894569882440756, |
| "grad_norm": 0.89453125, |
| "learning_rate": 8.677783333186817e-06, |
| "loss": 0.4907, |
| "mean_token_accuracy": 0.7820263600349426, |
| "num_tokens": 567162445.0, |
| "step": 18700 |
| }, |
| { |
| "entropy": 0.42464843273162844, |
| "epoch": 3.498787087143124, |
| "grad_norm": 0.49609375, |
| "learning_rate": 8.61317572052467e-06, |
| "loss": 0.4888, |
| "mean_token_accuracy": 0.7788948893547059, |
| "num_tokens": 568688382.0, |
| "step": 18750 |
| }, |
| { |
| "entropy": 0.426156534999609, |
| "epoch": 3.5081171860421723, |
| "grad_norm": 0.78515625, |
| "learning_rate": 8.548841182153889e-06, |
| "loss": 0.4944, |
| "mean_token_accuracy": 0.7789571231603623, |
| "num_tokens": 570273693.0, |
| "step": 18800 |
| }, |
| { |
| "entropy": 0.4215981301665306, |
| "epoch": 3.5174472849412206, |
| "grad_norm": 0.88671875, |
| "learning_rate": 8.484781945600765e-06, |
| "loss": 0.4901, |
| "mean_token_accuracy": 0.781245459318161, |
| "num_tokens": 571787716.0, |
| "step": 18850 |
| }, |
| { |
| "entropy": 0.4165229081362486, |
| "epoch": 3.526777383840269, |
| "grad_norm": 0.7265625, |
| "learning_rate": 8.421000228859513e-06, |
| "loss": 0.483, |
| "mean_token_accuracy": 0.7865099585056305, |
| "num_tokens": 573264670.0, |
| "step": 18900 |
| }, |
| { |
| "entropy": 0.4245117911696434, |
| "epoch": 3.5361074827393173, |
| "grad_norm": 0.486328125, |
| "learning_rate": 8.35749824031547e-06, |
| "loss": 0.4938, |
| "mean_token_accuracy": 0.778894921541214, |
| "num_tokens": 574817675.0, |
| "step": 18950 |
| }, |
| { |
| "entropy": 0.417805362790823, |
| "epoch": 3.5454375816383656, |
| "grad_norm": 0.423828125, |
| "learning_rate": 8.294278178668643e-06, |
| "loss": 0.4872, |
| "mean_token_accuracy": 0.7830862325429916, |
| "num_tokens": 576331491.0, |
| "step": 19000 |
| }, |
| { |
| "epoch": 3.5454375816383656, |
| "eval_entropy": 0.4310347127531647, |
| "eval_loss": 0.499735027551651, |
| "eval_mean_token_accuracy": 0.779860559953462, |
| "eval_num_tokens": 576331491.0, |
| "eval_runtime": 16.1624, |
| "eval_samples_per_second": 53.643, |
| "eval_steps_per_second": 6.744, |
| "step": 19000 |
| }, |
| { |
| "entropy": 0.4081883782148361, |
| "epoch": 3.5547676805374135, |
| "grad_norm": 0.9375, |
| "learning_rate": 8.231342232857553e-06, |
| "loss": 0.4705, |
| "mean_token_accuracy": 0.7903977036476135, |
| "num_tokens": 577775016.0, |
| "step": 19050 |
| }, |
| { |
| "entropy": 0.4005671316385269, |
| "epoch": 3.564097779436462, |
| "grad_norm": 0.875, |
| "learning_rate": 8.16869258198347e-06, |
| "loss": 0.4737, |
| "mean_token_accuracy": 0.7880425137281418, |
| "num_tokens": 579207311.0, |
| "step": 19100 |
| }, |
| { |
| "entropy": 0.41074585855007173, |
| "epoch": 3.5734278783355102, |
| "grad_norm": 0.87109375, |
| "learning_rate": 8.106331395234957e-06, |
| "loss": 0.4797, |
| "mean_token_accuracy": 0.786340873837471, |
| "num_tokens": 580709130.0, |
| "step": 19150 |
| }, |
| { |
| "entropy": 0.4269941046833992, |
| "epoch": 3.5827579772345586, |
| "grad_norm": 0.5234375, |
| "learning_rate": 8.044260831812762e-06, |
| "loss": 0.4965, |
| "mean_token_accuracy": 0.7799100142717361, |
| "num_tokens": 582263480.0, |
| "step": 19200 |
| }, |
| { |
| "entropy": 0.40871032655239103, |
| "epoch": 3.592088076133607, |
| "grad_norm": 0.89453125, |
| "learning_rate": 7.982483040855052e-06, |
| "loss": 0.4885, |
| "mean_token_accuracy": 0.7831970340013504, |
| "num_tokens": 583753874.0, |
| "step": 19250 |
| }, |
| { |
| "entropy": 0.41553946167230604, |
| "epoch": 3.6014181750326553, |
| "grad_norm": 1.1875, |
| "learning_rate": 7.921000161363023e-06, |
| "loss": 0.4892, |
| "mean_token_accuracy": 0.7814596974849701, |
| "num_tokens": 585269107.0, |
| "step": 19300 |
| }, |
| { |
| "entropy": 0.4114553180336952, |
| "epoch": 3.6107482739317036, |
| "grad_norm": 0.73828125, |
| "learning_rate": 7.859814322126803e-06, |
| "loss": 0.5044, |
| "mean_token_accuracy": 0.7843243205547332, |
| "num_tokens": 586764493.0, |
| "step": 19350 |
| }, |
| { |
| "entropy": 0.4229819716513157, |
| "epoch": 3.620078372830752, |
| "grad_norm": 0.5, |
| "learning_rate": 7.798927641651787e-06, |
| "loss": 0.493, |
| "mean_token_accuracy": 0.7810362190008163, |
| "num_tokens": 588285178.0, |
| "step": 19400 |
| }, |
| { |
| "entropy": 0.43099318161606787, |
| "epoch": 3.6294084717298003, |
| "grad_norm": 0.466796875, |
| "learning_rate": 7.738342228085244e-06, |
| "loss": 0.4921, |
| "mean_token_accuracy": 0.7791334927082062, |
| "num_tokens": 589860202.0, |
| "step": 19450 |
| }, |
| { |
| "entropy": 0.4297554486989975, |
| "epoch": 3.6387385706288486, |
| "grad_norm": 0.6796875, |
| "learning_rate": 7.678060179143354e-06, |
| "loss": 0.4965, |
| "mean_token_accuracy": 0.7779843896627426, |
| "num_tokens": 591386894.0, |
| "step": 19500 |
| }, |
| { |
| "entropy": 0.4341718791425228, |
| "epoch": 3.648068669527897, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.618083582038559e-06, |
| "loss": 0.4973, |
| "mean_token_accuracy": 0.7771023726463318, |
| "num_tokens": 592966640.0, |
| "step": 19550 |
| }, |
| { |
| "entropy": 0.43129597157239913, |
| "epoch": 3.6573987684269453, |
| "grad_norm": 0.45703125, |
| "learning_rate": 7.558414513407309e-06, |
| "loss": 0.4966, |
| "mean_token_accuracy": 0.7794705367088318, |
| "num_tokens": 594551411.0, |
| "step": 19600 |
| }, |
| { |
| "entropy": 0.416672650128603, |
| "epoch": 3.6667288673259937, |
| "grad_norm": 0.5390625, |
| "learning_rate": 7.499055039238146e-06, |
| "loss": 0.4847, |
| "mean_token_accuracy": 0.7843469917774201, |
| "num_tokens": 596025749.0, |
| "step": 19650 |
| }, |
| { |
| "entropy": 0.4332768340408802, |
| "epoch": 3.676058966225042, |
| "grad_norm": 0.65625, |
| "learning_rate": 7.4400072148001895e-06, |
| "loss": 0.4897, |
| "mean_token_accuracy": 0.7826856952905655, |
| "num_tokens": 597572029.0, |
| "step": 19700 |
| }, |
| { |
| "entropy": 0.4150558638572693, |
| "epoch": 3.6853890651240904, |
| "grad_norm": 0.5, |
| "learning_rate": 7.381273084571959e-06, |
| "loss": 0.4844, |
| "mean_token_accuracy": 0.785199624300003, |
| "num_tokens": 599047819.0, |
| "step": 19750 |
| }, |
| { |
| "entropy": 0.4170726762712002, |
| "epoch": 3.6947191640231387, |
| "grad_norm": 0.7890625, |
| "learning_rate": 7.322854682170584e-06, |
| "loss": 0.4918, |
| "mean_token_accuracy": 0.7805746030807496, |
| "num_tokens": 600528500.0, |
| "step": 19800 |
| }, |
| { |
| "entropy": 0.43822780847549436, |
| "epoch": 3.704049262922187, |
| "grad_norm": 0.77734375, |
| "learning_rate": 7.264754030281405e-06, |
| "loss": 0.4994, |
| "mean_token_accuracy": 0.7803661721944809, |
| "num_tokens": 602042275.0, |
| "step": 19850 |
| }, |
| { |
| "entropy": 0.41316955953836443, |
| "epoch": 3.7133793618212354, |
| "grad_norm": 0.78515625, |
| "learning_rate": 7.2069731405879325e-06, |
| "loss": 0.4822, |
| "mean_token_accuracy": 0.7859818071126938, |
| "num_tokens": 603534613.0, |
| "step": 19900 |
| }, |
| { |
| "entropy": 0.43403887152671816, |
| "epoch": 3.7227094607202837, |
| "grad_norm": 0.48046875, |
| "learning_rate": 7.149514013702186e-06, |
| "loss": 0.499, |
| "mean_token_accuracy": 0.7757015681266785, |
| "num_tokens": 605111735.0, |
| "step": 19950 |
| }, |
| { |
| "entropy": 0.41590030148625373, |
| "epoch": 3.732039559619332, |
| "grad_norm": 0.57421875, |
| "learning_rate": 7.092378639095451e-06, |
| "loss": 0.4829, |
| "mean_token_accuracy": 0.7830818378925324, |
| "num_tokens": 606616332.0, |
| "step": 20000 |
| }, |
| { |
| "epoch": 3.732039559619332, |
| "eval_entropy": 0.42986192684107966, |
| "eval_loss": 0.4989897310733795, |
| "eval_mean_token_accuracy": 0.779869570097792, |
| "eval_num_tokens": 606616332.0, |
| "eval_runtime": 16.2831, |
| "eval_samples_per_second": 53.246, |
| "eval_steps_per_second": 6.694, |
| "step": 20000 |
| }, |
| { |
| "entropy": 0.41359525367617606, |
| "epoch": 3.7413696585183804, |
| "grad_norm": 0.5078125, |
| "learning_rate": 7.0355689950293636e-06, |
| "loss": 0.4858, |
| "mean_token_accuracy": 0.7825395846366883, |
| "num_tokens": 608094829.0, |
| "step": 20050 |
| }, |
| { |
| "entropy": 0.4180074107646942, |
| "epoch": 3.750699757417429, |
| "grad_norm": 0.5390625, |
| "learning_rate": 6.979087048487432e-06, |
| "loss": 0.4865, |
| "mean_token_accuracy": 0.7824456262588501, |
| "num_tokens": 609631728.0, |
| "step": 20100 |
| }, |
| { |
| "entropy": 0.4242085382342339, |
| "epoch": 3.7600298563164767, |
| "grad_norm": 0.6015625, |
| "learning_rate": 6.922934755106929e-06, |
| "loss": 0.4895, |
| "mean_token_accuracy": 0.7821110928058624, |
| "num_tokens": 611154863.0, |
| "step": 20150 |
| }, |
| { |
| "entropy": 0.4358582437038422, |
| "epoch": 3.769359955215525, |
| "grad_norm": 0.66796875, |
| "learning_rate": 6.867114059111178e-06, |
| "loss": 0.4957, |
| "mean_token_accuracy": 0.7778018289804458, |
| "num_tokens": 612699583.0, |
| "step": 20200 |
| }, |
| { |
| "entropy": 0.4160428442060947, |
| "epoch": 3.7786900541145734, |
| "grad_norm": 0.71484375, |
| "learning_rate": 6.81162689324224e-06, |
| "loss": 0.4816, |
| "mean_token_accuracy": 0.7859532070159913, |
| "num_tokens": 614179127.0, |
| "step": 20250 |
| }, |
| { |
| "entropy": 0.42236584216356277, |
| "epoch": 3.7880201530136217, |
| "grad_norm": 0.404296875, |
| "learning_rate": 6.756475178693988e-06, |
| "loss": 0.497, |
| "mean_token_accuracy": 0.77904121696949, |
| "num_tokens": 615681957.0, |
| "step": 20300 |
| }, |
| { |
| "entropy": 0.4261379070580006, |
| "epoch": 3.79735025191267, |
| "grad_norm": 0.33984375, |
| "learning_rate": 6.701660825045599e-06, |
| "loss": 0.491, |
| "mean_token_accuracy": 0.7808138716220856, |
| "num_tokens": 617206693.0, |
| "step": 20350 |
| }, |
| { |
| "entropy": 0.41188708037137983, |
| "epoch": 3.8066803508117184, |
| "grad_norm": 0.443359375, |
| "learning_rate": 6.64718573019542e-06, |
| "loss": 0.482, |
| "mean_token_accuracy": 0.7834567302465438, |
| "num_tokens": 618685882.0, |
| "step": 20400 |
| }, |
| { |
| "entropy": 0.42921313650906084, |
| "epoch": 3.8160104497107667, |
| "grad_norm": 0.326171875, |
| "learning_rate": 6.593051780295262e-06, |
| "loss": 0.4886, |
| "mean_token_accuracy": 0.7812140667438507, |
| "num_tokens": 620242613.0, |
| "step": 20450 |
| }, |
| { |
| "entropy": 0.4060940612852573, |
| "epoch": 3.825340548609815, |
| "grad_norm": 0.80859375, |
| "learning_rate": 6.5392608496851006e-06, |
| "loss": 0.4804, |
| "mean_token_accuracy": 0.7881927186250687, |
| "num_tokens": 621695607.0, |
| "step": 20500 |
| }, |
| { |
| "entropy": 0.41289987429976466, |
| "epoch": 3.8346706475088634, |
| "grad_norm": 0.4921875, |
| "learning_rate": 6.48581480082817e-06, |
| "loss": 0.4814, |
| "mean_token_accuracy": 0.7860061007738114, |
| "num_tokens": 623202780.0, |
| "step": 20550 |
| }, |
| { |
| "entropy": 0.4154231162369251, |
| "epoch": 3.844000746407912, |
| "grad_norm": 0.81640625, |
| "learning_rate": 6.432715484246474e-06, |
| "loss": 0.4822, |
| "mean_token_accuracy": 0.7877626097202302, |
| "num_tokens": 624719320.0, |
| "step": 20600 |
| }, |
| { |
| "entropy": 0.43937826111912726, |
| "epoch": 3.85333084530696, |
| "grad_norm": 0.54296875, |
| "learning_rate": 6.379964738456737e-06, |
| "loss": 0.5081, |
| "mean_token_accuracy": 0.7722378653287888, |
| "num_tokens": 626304789.0, |
| "step": 20650 |
| }, |
| { |
| "entropy": 0.4095226752758026, |
| "epoch": 3.8626609442060085, |
| "grad_norm": 0.431640625, |
| "learning_rate": 6.3275643899067095e-06, |
| "loss": 0.4808, |
| "mean_token_accuracy": 0.784711457490921, |
| "num_tokens": 627775886.0, |
| "step": 20700 |
| }, |
| { |
| "entropy": 0.42841140910983083, |
| "epoch": 3.871991043105057, |
| "grad_norm": 0.7265625, |
| "learning_rate": 6.275516252911957e-06, |
| "loss": 0.495, |
| "mean_token_accuracy": 0.7787529402971267, |
| "num_tokens": 629320324.0, |
| "step": 20750 |
| }, |
| { |
| "entropy": 0.4122103577852249, |
| "epoch": 3.881321142004105, |
| "grad_norm": 0.8125, |
| "learning_rate": 6.223822129593035e-06, |
| "loss": 0.4872, |
| "mean_token_accuracy": 0.7841408705711365, |
| "num_tokens": 630822612.0, |
| "step": 20800 |
| }, |
| { |
| "entropy": 0.42884288884699345, |
| "epoch": 3.8906512409031535, |
| "grad_norm": 0.97265625, |
| "learning_rate": 6.172483809813082e-06, |
| "loss": 0.4946, |
| "mean_token_accuracy": 0.7817574542760849, |
| "num_tokens": 632330762.0, |
| "step": 20850 |
| }, |
| { |
| "entropy": 0.42925117403268814, |
| "epoch": 3.899981339802202, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.121503071115863e-06, |
| "loss": 0.4968, |
| "mean_token_accuracy": 0.7788661700487137, |
| "num_tokens": 633862126.0, |
| "step": 20900 |
| }, |
| { |
| "entropy": 0.4173249228298664, |
| "epoch": 3.90931143870125, |
| "grad_norm": 0.48828125, |
| "learning_rate": 6.0708816786642055e-06, |
| "loss": 0.4896, |
| "mean_token_accuracy": 0.7805528366565704, |
| "num_tokens": 635369892.0, |
| "step": 20950 |
| }, |
| { |
| "entropy": 0.42858991749584674, |
| "epoch": 3.9186415376002985, |
| "grad_norm": 0.89453125, |
| "learning_rate": 6.0206213851789065e-06, |
| "loss": 0.4922, |
| "mean_token_accuracy": 0.7816365206241608, |
| "num_tokens": 636904691.0, |
| "step": 21000 |
| }, |
| { |
| "epoch": 3.9186415376002985, |
| "eval_entropy": 0.4298953804947914, |
| "eval_loss": 0.4981645345687866, |
| "eval_mean_token_accuracy": 0.7802923715442692, |
| "eval_num_tokens": 636904691.0, |
| "eval_runtime": 16.1855, |
| "eval_samples_per_second": 53.566, |
| "eval_steps_per_second": 6.734, |
| "step": 21000 |
| }, |
| { |
| "entropy": 0.42679292619228365, |
| "epoch": 3.927971636499347, |
| "grad_norm": 0.7109375, |
| "learning_rate": 5.970723930878021e-06, |
| "loss": 0.4965, |
| "mean_token_accuracy": 0.7826879835128784, |
| "num_tokens": 638381309.0, |
| "step": 21050 |
| }, |
| { |
| "entropy": 0.4261575947701931, |
| "epoch": 3.9373017353983952, |
| "grad_norm": 1.03125, |
| "learning_rate": 5.921191043416619e-06, |
| "loss": 0.4962, |
| "mean_token_accuracy": 0.7802698415517807, |
| "num_tokens": 639907942.0, |
| "step": 21100 |
| }, |
| { |
| "entropy": 0.3956790755689144, |
| "epoch": 3.9466318342974436, |
| "grad_norm": 0.7265625, |
| "learning_rate": 5.87202443782697e-06, |
| "loss": 0.4689, |
| "mean_token_accuracy": 0.7924928772449493, |
| "num_tokens": 641357830.0, |
| "step": 21150 |
| }, |
| { |
| "entropy": 0.4151885700970888, |
| "epoch": 3.955961933196492, |
| "grad_norm": 0.62890625, |
| "learning_rate": 5.823225816459159e-06, |
| "loss": 0.4835, |
| "mean_token_accuracy": 0.784718370437622, |
| "num_tokens": 642852716.0, |
| "step": 21200 |
| }, |
| { |
| "entropy": 0.4133622221648693, |
| "epoch": 3.9652920320955403, |
| "grad_norm": 0.56640625, |
| "learning_rate": 5.774796868922148e-06, |
| "loss": 0.4817, |
| "mean_token_accuracy": 0.7855832195281982, |
| "num_tokens": 644369211.0, |
| "step": 21250 |
| }, |
| { |
| "entropy": 0.4185038904845715, |
| "epoch": 3.9746221309945886, |
| "grad_norm": 0.6953125, |
| "learning_rate": 5.726739272025258e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.7824643701314926, |
| "num_tokens": 645872051.0, |
| "step": 21300 |
| }, |
| { |
| "entropy": 0.4287873014807701, |
| "epoch": 3.983952229893637, |
| "grad_norm": 0.87890625, |
| "learning_rate": 5.679054689720142e-06, |
| "loss": 0.4919, |
| "mean_token_accuracy": 0.7797924143075943, |
| "num_tokens": 647435628.0, |
| "step": 21350 |
| }, |
| { |
| "entropy": 0.42188764035701753, |
| "epoch": 3.9932823287926853, |
| "grad_norm": 0.8828125, |
| "learning_rate": 5.631744773043137e-06, |
| "loss": 0.4953, |
| "mean_token_accuracy": 0.7852949523925781, |
| "num_tokens": 648930592.0, |
| "step": 21400 |
| }, |
| { |
| "entropy": 0.4236401343345642, |
| "epoch": 4.002612427691734, |
| "grad_norm": 0.412109375, |
| "learning_rate": 5.584811160058123e-06, |
| "loss": 0.487, |
| "mean_token_accuracy": 0.7832311981916428, |
| "num_tokens": 650447365.0, |
| "step": 21450 |
| }, |
| { |
| "entropy": 0.4323525831103325, |
| "epoch": 4.011942526590782, |
| "grad_norm": 0.396484375, |
| "learning_rate": 5.5382554757998e-06, |
| "loss": 0.4934, |
| "mean_token_accuracy": 0.7795790702104568, |
| "num_tokens": 651990713.0, |
| "step": 21500 |
| }, |
| { |
| "entropy": 0.39284430593252184, |
| "epoch": 4.02127262548983, |
| "grad_norm": 0.6171875, |
| "learning_rate": 5.492079332217413e-06, |
| "loss": 0.4657, |
| "mean_token_accuracy": 0.7923216378688812, |
| "num_tokens": 653445504.0, |
| "step": 21550 |
| }, |
| { |
| "entropy": 0.4254847614467144, |
| "epoch": 4.030602724388879, |
| "grad_norm": 0.58203125, |
| "learning_rate": 5.446284328118956e-06, |
| "loss": 0.4899, |
| "mean_token_accuracy": 0.7845428907871246, |
| "num_tokens": 654991638.0, |
| "step": 21600 |
| }, |
| { |
| "entropy": 0.41144783079624175, |
| "epoch": 4.039932823287927, |
| "grad_norm": 0.48828125, |
| "learning_rate": 5.4008720491158105e-06, |
| "loss": 0.4816, |
| "mean_token_accuracy": 0.7855917030572891, |
| "num_tokens": 656505470.0, |
| "step": 21650 |
| }, |
| { |
| "entropy": 0.41339424341917036, |
| "epoch": 4.049262922186975, |
| "grad_norm": 0.7890625, |
| "learning_rate": 5.355844067567827e-06, |
| "loss": 0.487, |
| "mean_token_accuracy": 0.7834293109178543, |
| "num_tokens": 658059888.0, |
| "step": 21700 |
| }, |
| { |
| "entropy": 0.43376033812761305, |
| "epoch": 4.058593021086024, |
| "grad_norm": 0.55859375, |
| "learning_rate": 5.311201942528911e-06, |
| "loss": 0.5004, |
| "mean_token_accuracy": 0.7760592538118363, |
| "num_tokens": 659625009.0, |
| "step": 21750 |
| }, |
| { |
| "entropy": 0.4149209675192833, |
| "epoch": 4.067923119985072, |
| "grad_norm": 0.67578125, |
| "learning_rate": 5.266947219693018e-06, |
| "loss": 0.4821, |
| "mean_token_accuracy": 0.7843904966115951, |
| "num_tokens": 661130951.0, |
| "step": 21800 |
| }, |
| { |
| "entropy": 0.40518364384770394, |
| "epoch": 4.07725321888412, |
| "grad_norm": 0.322265625, |
| "learning_rate": 5.2230814313406564e-06, |
| "loss": 0.4804, |
| "mean_token_accuracy": 0.7845012962818145, |
| "num_tokens": 662681099.0, |
| "step": 21850 |
| }, |
| { |
| "entropy": 0.4218353702127933, |
| "epoch": 4.086583317783169, |
| "grad_norm": 0.89453125, |
| "learning_rate": 5.179606096285814e-06, |
| "loss": 0.4867, |
| "mean_token_accuracy": 0.7828029912710189, |
| "num_tokens": 664238581.0, |
| "step": 21900 |
| }, |
| { |
| "entropy": 0.4128453577309847, |
| "epoch": 4.095913416682217, |
| "grad_norm": 0.9296875, |
| "learning_rate": 5.136522719823388e-06, |
| "loss": 0.4777, |
| "mean_token_accuracy": 0.7887437117099761, |
| "num_tokens": 665738717.0, |
| "step": 21950 |
| }, |
| { |
| "entropy": 0.4112414425611496, |
| "epoch": 4.105243515581265, |
| "grad_norm": 0.490234375, |
| "learning_rate": 5.093832793677053e-06, |
| "loss": 0.4858, |
| "mean_token_accuracy": 0.7840606135129928, |
| "num_tokens": 667221460.0, |
| "step": 22000 |
| }, |
| { |
| "epoch": 4.105243515581265, |
| "eval_entropy": 0.4262016671239783, |
| "eval_loss": 0.49918287992477417, |
| "eval_mean_token_accuracy": 0.7801302873760189, |
| "eval_num_tokens": 667221460.0, |
| "eval_runtime": 16.1734, |
| "eval_samples_per_second": 53.607, |
| "eval_steps_per_second": 6.739, |
| "step": 22000 |
| }, |
| { |
| "entropy": 0.4187443208694458, |
| "epoch": 4.114573614480314, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.051537795947614e-06, |
| "loss": 0.4833, |
| "mean_token_accuracy": 0.783671562075615, |
| "num_tokens": 668746696.0, |
| "step": 22050 |
| }, |
| { |
| "entropy": 0.4121988409757614, |
| "epoch": 4.123903713379362, |
| "grad_norm": 0.5390625, |
| "learning_rate": 5.009639191061831e-06, |
| "loss": 0.479, |
| "mean_token_accuracy": 0.7846984696388245, |
| "num_tokens": 670258347.0, |
| "step": 22100 |
| }, |
| { |
| "entropy": 0.41994678273797037, |
| "epoch": 4.1332338122784105, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.968138429721715e-06, |
| "loss": 0.4852, |
| "mean_token_accuracy": 0.7819422298669815, |
| "num_tokens": 671783187.0, |
| "step": 22150 |
| }, |
| { |
| "entropy": 0.40594893679022787, |
| "epoch": 4.142563911177459, |
| "grad_norm": 0.423828125, |
| "learning_rate": 4.9270369488543e-06, |
| "loss": 0.4737, |
| "mean_token_accuracy": 0.7885110950469971, |
| "num_tokens": 673297222.0, |
| "step": 22200 |
| }, |
| { |
| "entropy": 0.4166788300871849, |
| "epoch": 4.151894010076507, |
| "grad_norm": 0.76953125, |
| "learning_rate": 4.886336171561883e-06, |
| "loss": 0.4809, |
| "mean_token_accuracy": 0.783810424208641, |
| "num_tokens": 674805400.0, |
| "step": 22250 |
| }, |
| { |
| "entropy": 0.4085456937551498, |
| "epoch": 4.1612241089755555, |
| "grad_norm": 0.4765625, |
| "learning_rate": 4.846037507072753e-06, |
| "loss": 0.4769, |
| "mean_token_accuracy": 0.7872794485092163, |
| "num_tokens": 676275357.0, |
| "step": 22300 |
| }, |
| { |
| "entropy": 0.4122830269485712, |
| "epoch": 4.170554207874604, |
| "grad_norm": 0.71484375, |
| "learning_rate": 4.806142350692409e-06, |
| "loss": 0.4821, |
| "mean_token_accuracy": 0.7875606679916382, |
| "num_tokens": 677776471.0, |
| "step": 22350 |
| }, |
| { |
| "entropy": 0.4124917629361153, |
| "epoch": 4.179884306773652, |
| "grad_norm": 0.46875, |
| "learning_rate": 4.766652083755236e-06, |
| "loss": 0.4826, |
| "mean_token_accuracy": 0.7857430422306061, |
| "num_tokens": 679312867.0, |
| "step": 22400 |
| }, |
| { |
| "entropy": 0.3990041773021221, |
| "epoch": 4.1892144056727005, |
| "grad_norm": 0.84765625, |
| "learning_rate": 4.727568073576675e-06, |
| "loss": 0.4759, |
| "mean_token_accuracy": 0.7875368773937226, |
| "num_tokens": 680753964.0, |
| "step": 22450 |
| }, |
| { |
| "entropy": 0.41429411858320236, |
| "epoch": 4.198544504571749, |
| "grad_norm": 0.9921875, |
| "learning_rate": 4.688891673405898e-06, |
| "loss": 0.4866, |
| "mean_token_accuracy": 0.7836510550975799, |
| "num_tokens": 682282646.0, |
| "step": 22500 |
| }, |
| { |
| "entropy": 0.4262200190126896, |
| "epoch": 4.207874603470797, |
| "grad_norm": 0.494140625, |
| "learning_rate": 4.650624222378934e-06, |
| "loss": 0.4956, |
| "mean_token_accuracy": 0.7772960156202317, |
| "num_tokens": 683822441.0, |
| "step": 22550 |
| }, |
| { |
| "entropy": 0.4214519140869379, |
| "epoch": 4.217204702369845, |
| "grad_norm": 0.578125, |
| "learning_rate": 4.6127670454723106e-06, |
| "loss": 0.4895, |
| "mean_token_accuracy": 0.7828962570428848, |
| "num_tokens": 685322393.0, |
| "step": 22600 |
| }, |
| { |
| "entropy": 0.41216524183750153, |
| "epoch": 4.226534801268893, |
| "grad_norm": 0.671875, |
| "learning_rate": 4.575321453457185e-06, |
| "loss": 0.4801, |
| "mean_token_accuracy": 0.7840064114332199, |
| "num_tokens": 686841081.0, |
| "step": 22650 |
| }, |
| { |
| "entropy": 0.41634872302412984, |
| "epoch": 4.235864900167941, |
| "grad_norm": 0.7109375, |
| "learning_rate": 4.53828874285395e-06, |
| "loss": 0.4867, |
| "mean_token_accuracy": 0.7810469180345535, |
| "num_tokens": 688411100.0, |
| "step": 22700 |
| }, |
| { |
| "entropy": 0.4129963879287243, |
| "epoch": 4.24519499906699, |
| "grad_norm": 0.546875, |
| "learning_rate": 4.501670195887344e-06, |
| "loss": 0.4887, |
| "mean_token_accuracy": 0.785649740099907, |
| "num_tokens": 689911759.0, |
| "step": 22750 |
| }, |
| { |
| "entropy": 0.4034364421665668, |
| "epoch": 4.254525097966038, |
| "grad_norm": 0.56640625, |
| "learning_rate": 4.465467080442056e-06, |
| "loss": 0.4727, |
| "mean_token_accuracy": 0.7873591876029968, |
| "num_tokens": 691405944.0, |
| "step": 22800 |
| }, |
| { |
| "entropy": 0.41453997910022733, |
| "epoch": 4.263855196865086, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.4296806500188296e-06, |
| "loss": 0.4843, |
| "mean_token_accuracy": 0.7840264475345612, |
| "num_tokens": 692922273.0, |
| "step": 22850 |
| }, |
| { |
| "entropy": 0.4219287024438381, |
| "epoch": 4.273185295764135, |
| "grad_norm": 0.435546875, |
| "learning_rate": 4.394312143691058e-06, |
| "loss": 0.4909, |
| "mean_token_accuracy": 0.7800329983234405, |
| "num_tokens": 694464854.0, |
| "step": 22900 |
| }, |
| { |
| "entropy": 0.41194928884506227, |
| "epoch": 4.282515394663183, |
| "grad_norm": 0.5078125, |
| "learning_rate": 4.359362786061886e-06, |
| "loss": 0.4874, |
| "mean_token_accuracy": 0.7856051474809647, |
| "num_tokens": 695964788.0, |
| "step": 22950 |
| }, |
| { |
| "entropy": 0.41727676048874857, |
| "epoch": 4.291845493562231, |
| "grad_norm": 0.94140625, |
| "learning_rate": 4.324833787221808e-06, |
| "loss": 0.4872, |
| "mean_token_accuracy": 0.7820509207248688, |
| "num_tokens": 697478901.0, |
| "step": 23000 |
| }, |
| { |
| "epoch": 4.291845493562231, |
| "eval_entropy": 0.42559362865916084, |
| "eval_loss": 0.4991084337234497, |
| "eval_mean_token_accuracy": 0.7802343352125325, |
| "eval_num_tokens": 697478901.0, |
| "eval_runtime": 16.0896, |
| "eval_samples_per_second": 53.886, |
| "eval_steps_per_second": 6.775, |
| "step": 23000 |
| }, |
| { |
| "entropy": 0.42197408616542814, |
| "epoch": 4.30117559246128, |
| "grad_norm": 0.80859375, |
| "learning_rate": 4.290726342706758e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.7817700058221817, |
| "num_tokens": 699021137.0, |
| "step": 23050 |
| }, |
| { |
| "entropy": 0.4093606770038605, |
| "epoch": 4.310505691360328, |
| "grad_norm": 0.9609375, |
| "learning_rate": 4.257041633456738e-06, |
| "loss": 0.4802, |
| "mean_token_accuracy": 0.7852538430690765, |
| "num_tokens": 700512894.0, |
| "step": 23100 |
| }, |
| { |
| "entropy": 0.42187732078135015, |
| "epoch": 4.3198357902593765, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.223780825774913e-06, |
| "loss": 0.4865, |
| "mean_token_accuracy": 0.7813728898763657, |
| "num_tokens": 702059517.0, |
| "step": 23150 |
| }, |
| { |
| "entropy": 0.4125171934068203, |
| "epoch": 4.329165889158425, |
| "grad_norm": 0.5859375, |
| "learning_rate": 4.1909450712872285e-06, |
| "loss": 0.4905, |
| "mean_token_accuracy": 0.7848614448308945, |
| "num_tokens": 703551158.0, |
| "step": 23200 |
| }, |
| { |
| "entropy": 0.4088340279459953, |
| "epoch": 4.338495988057473, |
| "grad_norm": 0.76171875, |
| "learning_rate": 4.158535506902543e-06, |
| "loss": 0.4786, |
| "mean_token_accuracy": 0.7860189139842987, |
| "num_tokens": 705046371.0, |
| "step": 23250 |
| }, |
| { |
| "entropy": 0.4250268609821796, |
| "epoch": 4.3478260869565215, |
| "grad_norm": 0.423828125, |
| "learning_rate": 4.1265532547732586e-06, |
| "loss": 0.4883, |
| "mean_token_accuracy": 0.7804222059249878, |
| "num_tokens": 706651598.0, |
| "step": 23300 |
| }, |
| { |
| "entropy": 0.42093482360243795, |
| "epoch": 4.35715618585557, |
| "grad_norm": 0.29296875, |
| "learning_rate": 4.094999422256478e-06, |
| "loss": 0.4919, |
| "mean_token_accuracy": 0.7815903490781784, |
| "num_tokens": 708166771.0, |
| "step": 23350 |
| }, |
| { |
| "entropy": 0.40209699779748914, |
| "epoch": 4.366486284754618, |
| "grad_norm": 0.9453125, |
| "learning_rate": 4.063875101875644e-06, |
| "loss": 0.4719, |
| "mean_token_accuracy": 0.7900790423154831, |
| "num_tokens": 709635262.0, |
| "step": 23400 |
| }, |
| { |
| "entropy": 0.411265781968832, |
| "epoch": 4.3758163836536665, |
| "grad_norm": 0.81640625, |
| "learning_rate": 4.033181371282729e-06, |
| "loss": 0.4774, |
| "mean_token_accuracy": 0.7869658744335175, |
| "num_tokens": 711131934.0, |
| "step": 23450 |
| }, |
| { |
| "entropy": 0.41227160826325415, |
| "epoch": 4.385146482552715, |
| "grad_norm": 1.6953125, |
| "learning_rate": 4.002919293220917e-06, |
| "loss": 0.4809, |
| "mean_token_accuracy": 0.7838652014732361, |
| "num_tokens": 712614938.0, |
| "step": 23500 |
| }, |
| { |
| "entropy": 0.4009349416196346, |
| "epoch": 4.394476581451763, |
| "grad_norm": 0.73046875, |
| "learning_rate": 3.973089915487803e-06, |
| "loss": 0.4823, |
| "mean_token_accuracy": 0.7867969334125519, |
| "num_tokens": 714060976.0, |
| "step": 23550 |
| }, |
| { |
| "entropy": 0.43059924483299256, |
| "epoch": 4.403806680350812, |
| "grad_norm": 0.8125, |
| "learning_rate": 3.943694270899114e-06, |
| "loss": 0.5009, |
| "mean_token_accuracy": 0.7788757783174515, |
| "num_tokens": 715598099.0, |
| "step": 23600 |
| }, |
| { |
| "entropy": 0.41181773334741595, |
| "epoch": 4.41313677924986, |
| "grad_norm": 0.380859375, |
| "learning_rate": 3.914733377252963e-06, |
| "loss": 0.4793, |
| "mean_token_accuracy": 0.786955691576004, |
| "num_tokens": 717099596.0, |
| "step": 23650 |
| }, |
| { |
| "entropy": 0.3970365000516176, |
| "epoch": 4.422466878148908, |
| "grad_norm": 0.60546875, |
| "learning_rate": 3.886208237294589e-06, |
| "loss": 0.4733, |
| "mean_token_accuracy": 0.7909884482622147, |
| "num_tokens": 718556937.0, |
| "step": 23700 |
| }, |
| { |
| "entropy": 0.4260143294930458, |
| "epoch": 4.431796977047957, |
| "grad_norm": 0.73828125, |
| "learning_rate": 3.858119838681645e-06, |
| "loss": 0.4887, |
| "mean_token_accuracy": 0.7821561121940612, |
| "num_tokens": 720105007.0, |
| "step": 23750 |
| }, |
| { |
| "entropy": 0.4135922496020794, |
| "epoch": 4.441127075947005, |
| "grad_norm": 0.6640625, |
| "learning_rate": 3.830469153950008e-06, |
| "loss": 0.4802, |
| "mean_token_accuracy": 0.7842557770013809, |
| "num_tokens": 721637538.0, |
| "step": 23800 |
| }, |
| { |
| "entropy": 0.4049617177248001, |
| "epoch": 4.450457174846053, |
| "grad_norm": 0.72265625, |
| "learning_rate": 3.803257140480098e-06, |
| "loss": 0.4855, |
| "mean_token_accuracy": 0.7906941068172455, |
| "num_tokens": 723121765.0, |
| "step": 23850 |
| }, |
| { |
| "entropy": 0.40623982638120654, |
| "epoch": 4.459787273745102, |
| "grad_norm": 1.0390625, |
| "learning_rate": 3.776484740463726e-06, |
| "loss": 0.4776, |
| "mean_token_accuracy": 0.785234968662262, |
| "num_tokens": 724610759.0, |
| "step": 23900 |
| }, |
| { |
| "entropy": 0.4155420292913914, |
| "epoch": 4.46911737264415, |
| "grad_norm": 0.6484375, |
| "learning_rate": 3.7501528808714883e-06, |
| "loss": 0.4854, |
| "mean_token_accuracy": 0.7824651861190796, |
| "num_tokens": 726130784.0, |
| "step": 23950 |
| }, |
| { |
| "entropy": 0.41285816714167595, |
| "epoch": 4.478447471543198, |
| "grad_norm": 0.5859375, |
| "learning_rate": 3.7242624734206554e-06, |
| "loss": 0.4816, |
| "mean_token_accuracy": 0.785009593963623, |
| "num_tokens": 727648846.0, |
| "step": 24000 |
| }, |
| { |
| "epoch": 4.478447471543198, |
| "eval_entropy": 0.4252187248763688, |
| "eval_loss": 0.49884533882141113, |
| "eval_mean_token_accuracy": 0.7805168371681773, |
| "eval_num_tokens": 727648846.0, |
| "eval_runtime": 16.6326, |
| "eval_samples_per_second": 52.126, |
| "eval_steps_per_second": 6.553, |
| "step": 24000 |
| }, |
| { |
| "entropy": 0.417481614202261, |
| "epoch": 4.487777570442247, |
| "grad_norm": 0.59375, |
| "learning_rate": 3.6988144145436063e-06, |
| "loss": 0.4834, |
| "mean_token_accuracy": 0.7845394277572632, |
| "num_tokens": 729133282.0, |
| "step": 24050 |
| }, |
| { |
| "entropy": 0.42680337965488435, |
| "epoch": 4.497107669341295, |
| "grad_norm": 1.0, |
| "learning_rate": 3.6738095853567963e-06, |
| "loss": 0.4951, |
| "mean_token_accuracy": 0.7786115556955338, |
| "num_tokens": 730690418.0, |
| "step": 24100 |
| }, |
| { |
| "entropy": 0.4081826032698154, |
| "epoch": 4.506437768240343, |
| "grad_norm": 0.3984375, |
| "learning_rate": 3.6492488516302438e-06, |
| "loss": 0.4776, |
| "mean_token_accuracy": 0.7866486293077469, |
| "num_tokens": 732175841.0, |
| "step": 24150 |
| }, |
| { |
| "entropy": 0.4243287441134453, |
| "epoch": 4.515767867139392, |
| "grad_norm": 0.439453125, |
| "learning_rate": 3.625133063757556e-06, |
| "loss": 0.4862, |
| "mean_token_accuracy": 0.7824915134906769, |
| "num_tokens": 733702523.0, |
| "step": 24200 |
| }, |
| { |
| "entropy": 0.40577477023005487, |
| "epoch": 4.52509796603844, |
| "grad_norm": 0.81640625, |
| "learning_rate": 3.6014630567264895e-06, |
| "loss": 0.4744, |
| "mean_token_accuracy": 0.788703248500824, |
| "num_tokens": 735166918.0, |
| "step": 24250 |
| }, |
| { |
| "entropy": 0.4156419275701046, |
| "epoch": 4.534428064937488, |
| "grad_norm": 0.6484375, |
| "learning_rate": 3.578239650090026e-06, |
| "loss": 0.4787, |
| "mean_token_accuracy": 0.7852990156412125, |
| "num_tokens": 736643884.0, |
| "step": 24300 |
| }, |
| { |
| "entropy": 0.39459425553679467, |
| "epoch": 4.543758163836537, |
| "grad_norm": 0.5546875, |
| "learning_rate": 3.555463647938016e-06, |
| "loss": 0.4681, |
| "mean_token_accuracy": 0.792020954489708, |
| "num_tokens": 738110498.0, |
| "step": 24350 |
| }, |
| { |
| "entropy": 0.4069563465565443, |
| "epoch": 4.553088262735585, |
| "grad_norm": 0.416015625, |
| "learning_rate": 3.533135838869318e-06, |
| "loss": 0.4753, |
| "mean_token_accuracy": 0.7872859954833984, |
| "num_tokens": 739623859.0, |
| "step": 24400 |
| }, |
| { |
| "entropy": 0.4120598857104778, |
| "epoch": 4.562418361634633, |
| "grad_norm": 0.40234375, |
| "learning_rate": 3.5112569959645072e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.7832210195064545, |
| "num_tokens": 741111342.0, |
| "step": 24450 |
| }, |
| { |
| "entropy": 0.411198351085186, |
| "epoch": 4.571748460533682, |
| "grad_norm": 0.84375, |
| "learning_rate": 3.4898278767591007e-06, |
| "loss": 0.4821, |
| "mean_token_accuracy": 0.7840376651287079, |
| "num_tokens": 742634170.0, |
| "step": 24500 |
| }, |
| { |
| "entropy": 0.41103513091802596, |
| "epoch": 4.58107855943273, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.4688492232173343e-06, |
| "loss": 0.4797, |
| "mean_token_accuracy": 0.785804370045662, |
| "num_tokens": 744158302.0, |
| "step": 24550 |
| }, |
| { |
| "entropy": 0.41402764439582823, |
| "epoch": 4.5904086583317785, |
| "grad_norm": 1.4921875, |
| "learning_rate": 3.448321761706467e-06, |
| "loss": 0.4883, |
| "mean_token_accuracy": 0.78047394156456, |
| "num_tokens": 745688655.0, |
| "step": 24600 |
| }, |
| { |
| "entropy": 0.4192537406086922, |
| "epoch": 4.599738757230827, |
| "grad_norm": 0.67578125, |
| "learning_rate": 3.428246202971639e-06, |
| "loss": 0.484, |
| "mean_token_accuracy": 0.7821294456720352, |
| "num_tokens": 747226392.0, |
| "step": 24650 |
| }, |
| { |
| "entropy": 0.3943482875823975, |
| "epoch": 4.609068856129875, |
| "grad_norm": 0.7734375, |
| "learning_rate": 3.408623242111255e-06, |
| "loss": 0.475, |
| "mean_token_accuracy": 0.7858047294616699, |
| "num_tokens": 748701674.0, |
| "step": 24700 |
| }, |
| { |
| "entropy": 0.39550173744559286, |
| "epoch": 4.6183989550289235, |
| "grad_norm": 0.3515625, |
| "learning_rate": 3.389453558552918e-06, |
| "loss": 0.4673, |
| "mean_token_accuracy": 0.7902515822649002, |
| "num_tokens": 750138953.0, |
| "step": 24750 |
| }, |
| { |
| "entropy": 0.41649769321084024, |
| "epoch": 4.627729053927972, |
| "grad_norm": 0.6484375, |
| "learning_rate": 3.37073781602991e-06, |
| "loss": 0.4877, |
| "mean_token_accuracy": 0.7842141664028168, |
| "num_tokens": 751652780.0, |
| "step": 24800 |
| }, |
| { |
| "entropy": 0.41776536986231805, |
| "epoch": 4.63705915282702, |
| "grad_norm": 0.44140625, |
| "learning_rate": 3.3524766625582052e-06, |
| "loss": 0.4836, |
| "mean_token_accuracy": 0.7836203473806381, |
| "num_tokens": 753230769.0, |
| "step": 24850 |
| }, |
| { |
| "entropy": 0.4045536919683218, |
| "epoch": 4.6463892517260685, |
| "grad_norm": 0.6875, |
| "learning_rate": 3.334670730414037e-06, |
| "loss": 0.4825, |
| "mean_token_accuracy": 0.7888794159889221, |
| "num_tokens": 754719857.0, |
| "step": 24900 |
| }, |
| { |
| "entropy": 0.4182581885159016, |
| "epoch": 4.655719350625117, |
| "grad_norm": 0.408203125, |
| "learning_rate": 3.3173206361120026e-06, |
| "loss": 0.4862, |
| "mean_token_accuracy": 0.7835237330198288, |
| "num_tokens": 756242660.0, |
| "step": 24950 |
| }, |
| { |
| "entropy": 0.41097776919603346, |
| "epoch": 4.665049449524165, |
| "grad_norm": 0.4921875, |
| "learning_rate": 3.3004269803837223e-06, |
| "loss": 0.4833, |
| "mean_token_accuracy": 0.7847666722536087, |
| "num_tokens": 757779830.0, |
| "step": 25000 |
| }, |
| { |
| "epoch": 4.665049449524165, |
| "eval_entropy": 0.425675423730404, |
| "eval_loss": 0.49867185950279236, |
| "eval_mean_token_accuracy": 0.7805509616475587, |
| "eval_num_tokens": 757779830.0, |
| "eval_runtime": 16.0252, |
| "eval_samples_per_second": 54.102, |
| "eval_steps_per_second": 6.802, |
| "step": 25000 |
| }, |
| { |
| "entropy": 0.4132141026854515, |
| "epoch": 4.674379548423214, |
| "grad_norm": 1.2109375, |
| "learning_rate": 3.2839903481570305e-06, |
| "loss": 0.4843, |
| "mean_token_accuracy": 0.7841847789287567, |
| "num_tokens": 759296685.0, |
| "step": 25050 |
| }, |
| { |
| "entropy": 0.40370859257876873, |
| "epoch": 4.683709647322262, |
| "grad_norm": 0.4765625, |
| "learning_rate": 3.268011308535733e-06, |
| "loss": 0.4746, |
| "mean_token_accuracy": 0.7883295321464538, |
| "num_tokens": 760792002.0, |
| "step": 25100 |
| }, |
| { |
| "entropy": 0.4057003001868725, |
| "epoch": 4.69303974622131, |
| "grad_norm": 0.55859375, |
| "learning_rate": 3.252490414779895e-06, |
| "loss": 0.4792, |
| "mean_token_accuracy": 0.7840817874670029, |
| "num_tokens": 762288394.0, |
| "step": 25150 |
| }, |
| { |
| "entropy": 0.4204439736157656, |
| "epoch": 4.702369845120359, |
| "grad_norm": 0.671875, |
| "learning_rate": 3.2374282042866876e-06, |
| "loss": 0.4853, |
| "mean_token_accuracy": 0.7837431621551514, |
| "num_tokens": 763765331.0, |
| "step": 25200 |
| }, |
| { |
| "entropy": 0.4071824544668198, |
| "epoch": 4.711699944019407, |
| "grad_norm": 1.1796875, |
| "learning_rate": 3.2228251985717824e-06, |
| "loss": 0.4852, |
| "mean_token_accuracy": 0.7848959761857986, |
| "num_tokens": 765243737.0, |
| "step": 25250 |
| }, |
| { |
| "entropy": 0.43803592413663867, |
| "epoch": 4.721030042918455, |
| "grad_norm": 0.8515625, |
| "learning_rate": 3.208681903251291e-06, |
| "loss": 0.4973, |
| "mean_token_accuracy": 0.7763216584920883, |
| "num_tokens": 766812993.0, |
| "step": 25300 |
| }, |
| { |
| "entropy": 0.4035507388412952, |
| "epoch": 4.730360141817504, |
| "grad_norm": 0.8359375, |
| "learning_rate": 3.1949988080242665e-06, |
| "loss": 0.4751, |
| "mean_token_accuracy": 0.7876280504465103, |
| "num_tokens": 768318546.0, |
| "step": 25350 |
| }, |
| { |
| "entropy": 0.41653711020946504, |
| "epoch": 4.739690240716552, |
| "grad_norm": 0.625, |
| "learning_rate": 3.181776386655733e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.7842164701223373, |
| "num_tokens": 769827713.0, |
| "step": 25400 |
| }, |
| { |
| "entropy": 0.43134715765714643, |
| "epoch": 4.749020339615599, |
| "grad_norm": 0.66015625, |
| "learning_rate": 3.1690150969603e-06, |
| "loss": 0.4975, |
| "mean_token_accuracy": 0.7809195184707641, |
| "num_tokens": 771418246.0, |
| "step": 25450 |
| }, |
| { |
| "entropy": 0.4131816050410271, |
| "epoch": 4.758350438514649, |
| "grad_norm": 0.384765625, |
| "learning_rate": 3.1567153807862953e-06, |
| "loss": 0.4821, |
| "mean_token_accuracy": 0.7868939906358718, |
| "num_tokens": 772904428.0, |
| "step": 25500 |
| }, |
| { |
| "entropy": 0.4172322556376457, |
| "epoch": 4.767680537413696, |
| "grad_norm": 0.458984375, |
| "learning_rate": 3.1448776640004756e-06, |
| "loss": 0.4864, |
| "mean_token_accuracy": 0.7821826964616776, |
| "num_tokens": 774432393.0, |
| "step": 25550 |
| }, |
| { |
| "entropy": 0.410623489767313, |
| "epoch": 4.777010636312745, |
| "grad_norm": 0.7578125, |
| "learning_rate": 3.133502356473279e-06, |
| "loss": 0.4765, |
| "mean_token_accuracy": 0.7861051166057587, |
| "num_tokens": 775955056.0, |
| "step": 25600 |
| }, |
| { |
| "entropy": 0.4126855818927288, |
| "epoch": 4.786340735211793, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.1225898520646354e-06, |
| "loss": 0.4791, |
| "mean_token_accuracy": 0.7849954336881637, |
| "num_tokens": 777481570.0, |
| "step": 25650 |
| }, |
| { |
| "entropy": 0.42760392755270005, |
| "epoch": 4.795670834110842, |
| "grad_norm": 0.54296875, |
| "learning_rate": 3.112140528610325e-06, |
| "loss": 0.4888, |
| "mean_token_accuracy": 0.7792460584640503, |
| "num_tokens": 779084872.0, |
| "step": 25700 |
| }, |
| { |
| "entropy": 0.4246444535255432, |
| "epoch": 4.8050009330098895, |
| "grad_norm": 0.58203125, |
| "learning_rate": 3.102154747908898e-06, |
| "loss": 0.4901, |
| "mean_token_accuracy": 0.779129432439804, |
| "num_tokens": 780621011.0, |
| "step": 25750 |
| }, |
| { |
| "entropy": 0.4207975560426712, |
| "epoch": 4.814331031908938, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.0926328557091484e-06, |
| "loss": 0.4829, |
| "mean_token_accuracy": 0.7824300426244736, |
| "num_tokens": 782204557.0, |
| "step": 25800 |
| }, |
| { |
| "entropy": 0.40202761128544806, |
| "epoch": 4.823661130807986, |
| "grad_norm": 0.65625, |
| "learning_rate": 3.0835751816981437e-06, |
| "loss": 0.4742, |
| "mean_token_accuracy": 0.7883801186084747, |
| "num_tokens": 783710597.0, |
| "step": 25850 |
| }, |
| { |
| "entropy": 0.4042065401375294, |
| "epoch": 4.8329912297070345, |
| "grad_norm": 0.890625, |
| "learning_rate": 3.0749820394898103e-06, |
| "loss": 0.4773, |
| "mean_token_accuracy": 0.7863856315612793, |
| "num_tokens": 785204376.0, |
| "step": 25900 |
| }, |
| { |
| "entropy": 0.4113363729417324, |
| "epoch": 4.842321328606083, |
| "grad_norm": 0.42578125, |
| "learning_rate": 3.066853726614068e-06, |
| "loss": 0.4836, |
| "mean_token_accuracy": 0.7829122406244278, |
| "num_tokens": 786745388.0, |
| "step": 25950 |
| }, |
| { |
| "entropy": 0.4050539457052946, |
| "epoch": 4.851651427505131, |
| "grad_norm": 1.0546875, |
| "learning_rate": 3.0591905245065378e-06, |
| "loss": 0.4782, |
| "mean_token_accuracy": 0.7893619048595428, |
| "num_tokens": 788238814.0, |
| "step": 26000 |
| }, |
| { |
| "epoch": 4.851651427505131, |
| "eval_entropy": 0.42556045840092754, |
| "eval_loss": 0.4986078143119812, |
| "eval_mean_token_accuracy": 0.7803256960090147, |
| "eval_num_tokens": 788238814.0, |
| "eval_runtime": 16.1117, |
| "eval_samples_per_second": 53.812, |
| "eval_steps_per_second": 6.765, |
| "step": 26000 |
| }, |
| { |
| "entropy": 0.4172664260864258, |
| "epoch": 4.86098152640418, |
| "grad_norm": 0.8828125, |
| "learning_rate": 3.0519926984987924e-06, |
| "loss": 0.4896, |
| "mean_token_accuracy": 0.781678112745285, |
| "num_tokens": 789737207.0, |
| "step": 26050 |
| }, |
| { |
| "entropy": 0.4131707660853863, |
| "epoch": 4.870311625303228, |
| "grad_norm": 0.484375, |
| "learning_rate": 3.045260497809169e-06, |
| "loss": 0.4816, |
| "mean_token_accuracy": 0.7855587202310562, |
| "num_tokens": 791272449.0, |
| "step": 26100 |
| }, |
| { |
| "entropy": 0.438031694740057, |
| "epoch": 4.879641724202276, |
| "grad_norm": 0.43359375, |
| "learning_rate": 3.0389941555341412e-06, |
| "loss": 0.4988, |
| "mean_token_accuracy": 0.7749890965223313, |
| "num_tokens": 792843109.0, |
| "step": 26150 |
| }, |
| { |
| "entropy": 0.41347076088190077, |
| "epoch": 4.888971823101325, |
| "grad_norm": 0.66015625, |
| "learning_rate": 3.03319388864025e-06, |
| "loss": 0.4862, |
| "mean_token_accuracy": 0.782042904496193, |
| "num_tokens": 794349700.0, |
| "step": 26200 |
| }, |
| { |
| "entropy": 0.4158082590997219, |
| "epoch": 4.898301922000373, |
| "grad_norm": 0.5, |
| "learning_rate": 3.0278598979565877e-06, |
| "loss": 0.4888, |
| "mean_token_accuracy": 0.7814363497495651, |
| "num_tokens": 795874145.0, |
| "step": 26250 |
| }, |
| { |
| "entropy": 0.4144511626660824, |
| "epoch": 4.907632020899421, |
| "grad_norm": 0.373046875, |
| "learning_rate": 3.0229923681678497e-06, |
| "loss": 0.4811, |
| "mean_token_accuracy": 0.7852653992176056, |
| "num_tokens": 797377284.0, |
| "step": 26300 |
| }, |
| { |
| "entropy": 0.4087340448796749, |
| "epoch": 4.91696211979847, |
| "grad_norm": 0.3828125, |
| "learning_rate": 3.018591467807935e-06, |
| "loss": 0.4829, |
| "mean_token_accuracy": 0.7855605220794678, |
| "num_tokens": 798886169.0, |
| "step": 26350 |
| }, |
| { |
| "entropy": 0.4149911729991436, |
| "epoch": 4.926292218697518, |
| "grad_norm": 0.478515625, |
| "learning_rate": 3.0146573492541123e-06, |
| "loss": 0.4865, |
| "mean_token_accuracy": 0.7815834748744964, |
| "num_tokens": 800437490.0, |
| "step": 26400 |
| }, |
| { |
| "entropy": 0.4159658246487379, |
| "epoch": 4.935622317596566, |
| "grad_norm": 0.498046875, |
| "learning_rate": 3.0111901487217452e-06, |
| "loss": 0.485, |
| "mean_token_accuracy": 0.7831065011024475, |
| "num_tokens": 801983835.0, |
| "step": 26450 |
| }, |
| { |
| "entropy": 0.4220912031829357, |
| "epoch": 4.944952416495615, |
| "grad_norm": 0.51171875, |
| "learning_rate": 3.008189986259573e-06, |
| "loss": 0.4914, |
| "mean_token_accuracy": 0.7806734621524811, |
| "num_tokens": 803525044.0, |
| "step": 26500 |
| }, |
| { |
| "entropy": 0.4241555346548557, |
| "epoch": 4.954282515394663, |
| "grad_norm": 0.45703125, |
| "learning_rate": 3.0056569657455626e-06, |
| "loss": 0.4879, |
| "mean_token_accuracy": 0.7803041088581085, |
| "num_tokens": 805085203.0, |
| "step": 26550 |
| }, |
| { |
| "entropy": 0.4313288567960262, |
| "epoch": 4.963612614293711, |
| "grad_norm": 0.546875, |
| "learning_rate": 3.0035911748832985e-06, |
| "loss": 0.4982, |
| "mean_token_accuracy": 0.7780981206893921, |
| "num_tokens": 806639306.0, |
| "step": 26600 |
| }, |
| { |
| "entropy": 0.40001085847616197, |
| "epoch": 4.97294271319276, |
| "grad_norm": 0.6328125, |
| "learning_rate": 3.0019926851989556e-06, |
| "loss": 0.4735, |
| "mean_token_accuracy": 0.7876440799236297, |
| "num_tokens": 808101409.0, |
| "step": 26650 |
| }, |
| { |
| "entropy": 0.4143665814399719, |
| "epoch": 4.982272812091808, |
| "grad_norm": 1.0, |
| "learning_rate": 3.000861552038823e-06, |
| "loss": 0.4829, |
| "mean_token_accuracy": 0.7828416174650192, |
| "num_tokens": 809661206.0, |
| "step": 26700 |
| }, |
| { |
| "entropy": 0.410667944252491, |
| "epoch": 4.991602910990856, |
| "grad_norm": 0.28125, |
| "learning_rate": 3.0001978145673808e-06, |
| "loss": 0.4815, |
| "mean_token_accuracy": 0.7867605596780777, |
| "num_tokens": 811153343.0, |
| "step": 26750 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 26795, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.4288887116625084e+19, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|