{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 1000, "global_step": 26795, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.6402970719337463, "epoch": 0.009330098899048329, "grad_norm": 1.015625, "learning_rate": 1.47e-05, "loss": 0.778, "mean_token_accuracy": 0.7212734770774841, "num_tokens": 1548877.0, "step": 50 }, { "entropy": 0.5775007322430611, "epoch": 0.018660197798096658, "grad_norm": 1.109375, "learning_rate": 2.97e-05, "loss": 0.5918, "mean_token_accuracy": 0.7543947434425354, "num_tokens": 3018955.0, "step": 100 }, { "entropy": 0.5692345011234283, "epoch": 0.02799029669714499, "grad_norm": 0.98046875, "learning_rate": 2.999977554224005e-05, "loss": 0.5835, "mean_token_accuracy": 0.753638728260994, "num_tokens": 4563356.0, "step": 150 }, { "entropy": 0.570181995332241, "epoch": 0.037320395596193316, "grad_norm": 0.66015625, "learning_rate": 2.999908376021796e-05, "loss": 0.5835, "mean_token_accuracy": 0.7521764719486237, "num_tokens": 6114611.0, "step": 200 }, { "entropy": 0.5772185748815537, "epoch": 0.04665049449524165, "grad_norm": 1.3125, "learning_rate": 2.9997924584400694e-05, "loss": 0.5841, "mean_token_accuracy": 0.7549300968647004, "num_tokens": 7635855.0, "step": 250 }, { "entropy": 0.5723973855376243, "epoch": 0.05598059339428998, "grad_norm": 0.81640625, "learning_rate": 2.9996298054923684e-05, "loss": 0.5865, "mean_token_accuracy": 0.7571439111232757, "num_tokens": 9098584.0, "step": 300 }, { "entropy": 0.5691715434193612, "epoch": 0.06531069229333832, "grad_norm": 0.625, "learning_rate": 2.9994204228104075e-05, "loss": 0.583, "mean_token_accuracy": 0.7545554572343827, "num_tokens": 10604199.0, "step": 350 }, { "entropy": 0.5754752764105797, "epoch": 0.07464079119238663, "grad_norm": 1.25, "learning_rate": 2.9991643176438752e-05, "loss": 0.5852, "mean_token_accuracy": 0.7542591279745102, "num_tokens": 12131126.0, "step": 400 }, { "entropy": 0.5670795711874962, "epoch": 0.08397089009143496, "grad_norm": 0.68359375, "learning_rate": 2.9988614988601868e-05, "loss": 0.5779, "mean_token_accuracy": 0.7581483513116837, "num_tokens": 13642307.0, "step": 450 }, { "entropy": 0.5779001507163047, "epoch": 0.0933009889904833, "grad_norm": 0.431640625, "learning_rate": 2.998511976944173e-05, "loss": 0.5874, "mean_token_accuracy": 0.7504317510128021, "num_tokens": 15217615.0, "step": 500 }, { "entropy": 0.5665371876955032, "epoch": 0.10263108788953162, "grad_norm": 0.9140625, "learning_rate": 2.998115763997721e-05, "loss": 0.5851, "mean_token_accuracy": 0.7549385547637939, "num_tokens": 16724810.0, "step": 550 }, { "entropy": 0.5638798615336418, "epoch": 0.11196118678857996, "grad_norm": 1.046875, "learning_rate": 2.9976728737393515e-05, "loss": 0.5756, "mean_token_accuracy": 0.757421538233757, "num_tokens": 18270004.0, "step": 600 }, { "entropy": 0.5710726794600487, "epoch": 0.12129128568762829, "grad_norm": 0.76171875, "learning_rate": 2.997183321503747e-05, "loss": 0.5854, "mean_token_accuracy": 0.7553139424324036, "num_tokens": 19791398.0, "step": 650 }, { "entropy": 0.5717519807815552, "epoch": 0.13062138458667663, "grad_norm": 1.28125, "learning_rate": 2.9966471242412192e-05, "loss": 0.5828, "mean_token_accuracy": 0.7528650748729706, "num_tokens": 21373941.0, "step": 700 }, { "entropy": 0.5660536578297615, "epoch": 0.13995148348572495, "grad_norm": 1.265625, "learning_rate": 2.996064300517122e-05, "loss": 0.5858, "mean_token_accuracy": 0.7544754481315613, "num_tokens": 22872328.0, "step": 750 }, { "entropy": 0.5696932604908943, "epoch": 0.14928158238477326, "grad_norm": 0.953125, "learning_rate": 2.995434870511211e-05, "loss": 0.5881, "mean_token_accuracy": 0.7530353850126267, "num_tokens": 24433349.0, "step": 800 }, { "entropy": 0.5728991779685021, "epoch": 0.1586116812838216, "grad_norm": 1.140625, "learning_rate": 2.9947588560169395e-05, "loss": 0.5841, "mean_token_accuracy": 0.753363783955574, "num_tokens": 26038373.0, "step": 850 }, { "entropy": 0.5628881072998047, "epoch": 0.16794178018286993, "grad_norm": 0.6328125, "learning_rate": 2.994036280440711e-05, "loss": 0.573, "mean_token_accuracy": 0.7580157500505448, "num_tokens": 27568454.0, "step": 900 }, { "entropy": 0.5649524646997451, "epoch": 0.17727187908191827, "grad_norm": 0.95703125, "learning_rate": 2.9932671688010632e-05, "loss": 0.5766, "mean_token_accuracy": 0.7574625754356384, "num_tokens": 29049728.0, "step": 950 }, { "entropy": 0.5648883840441704, "epoch": 0.1866019779809666, "grad_norm": 0.99609375, "learning_rate": 2.992451547727804e-05, "loss": 0.5883, "mean_token_accuracy": 0.7526043313741684, "num_tokens": 30603198.0, "step": 1000 }, { "epoch": 0.1866019779809666, "eval_entropy": 0.5557291887768911, "eval_loss": 0.5737926959991455, "eval_mean_token_accuracy": 0.758812694921406, "eval_num_tokens": 30603198.0, "eval_runtime": 16.2769, "eval_samples_per_second": 53.266, "eval_steps_per_second": 6.697, "step": 1000 }, { "entropy": 0.5666726857423783, "epoch": 0.19593207688001493, "grad_norm": 0.609375, "learning_rate": 2.9915894454610887e-05, "loss": 0.5764, "mean_token_accuracy": 0.7547625786066056, "num_tokens": 32144851.0, "step": 1050 }, { "entropy": 0.561416018307209, "epoch": 0.20526217577906325, "grad_norm": 0.55859375, "learning_rate": 2.990680891850444e-05, "loss": 0.5794, "mean_token_accuracy": 0.75654057264328, "num_tokens": 33639533.0, "step": 1100 }, { "entropy": 0.5599774518609046, "epoch": 0.2145922746781116, "grad_norm": 1.4140625, "learning_rate": 2.9897259183537322e-05, "loss": 0.5772, "mean_token_accuracy": 0.7588758039474487, "num_tokens": 35130975.0, "step": 1150 }, { "entropy": 0.5628740054368973, "epoch": 0.2239223735771599, "grad_norm": 0.91015625, "learning_rate": 2.9887245580360623e-05, "loss": 0.581, "mean_token_accuracy": 0.7554371774196624, "num_tokens": 36651189.0, "step": 1200 }, { "entropy": 0.5624552240967751, "epoch": 0.23325247247620826, "grad_norm": 0.92578125, "learning_rate": 2.9876768455686477e-05, "loss": 0.5731, "mean_token_accuracy": 0.7572768718004227, "num_tokens": 38172935.0, "step": 1250 }, { "entropy": 0.5497725516557693, "epoch": 0.24258257137525657, "grad_norm": 0.8671875, "learning_rate": 2.9865828172276023e-05, "loss": 0.569, "mean_token_accuracy": 0.7618441820144654, "num_tokens": 39653342.0, "step": 1300 }, { "entropy": 0.5542441910505295, "epoch": 0.2519126702743049, "grad_norm": 0.84375, "learning_rate": 2.9854425108926863e-05, "loss": 0.5732, "mean_token_accuracy": 0.762292046546936, "num_tokens": 41090294.0, "step": 1350 }, { "entropy": 0.5551298156380653, "epoch": 0.26124276917335326, "grad_norm": 0.9375, "learning_rate": 2.984255966045995e-05, "loss": 0.5773, "mean_token_accuracy": 0.755358315706253, "num_tokens": 42625920.0, "step": 1400 }, { "entropy": 0.5527626049518585, "epoch": 0.27057286807240155, "grad_norm": 0.54296875, "learning_rate": 2.9830232237705904e-05, "loss": 0.5724, "mean_token_accuracy": 0.7588168692588806, "num_tokens": 44134645.0, "step": 1450 }, { "entropy": 0.5561913156509399, "epoch": 0.2799029669714499, "grad_norm": 0.396484375, "learning_rate": 2.9817443267490797e-05, "loss": 0.5742, "mean_token_accuracy": 0.7577809965610505, "num_tokens": 45605580.0, "step": 1500 }, { "entropy": 0.5598691233992577, "epoch": 0.28923306587049824, "grad_norm": 0.94921875, "learning_rate": 2.9804193192621376e-05, "loss": 0.5746, "mean_token_accuracy": 0.7551334691047669, "num_tokens": 47144264.0, "step": 1550 }, { "entropy": 0.554311693906784, "epoch": 0.29856316476954653, "grad_norm": 0.98828125, "learning_rate": 2.979048247186972e-05, "loss": 0.5691, "mean_token_accuracy": 0.7589048826694489, "num_tokens": 48660197.0, "step": 1600 }, { "entropy": 0.5676783239841461, "epoch": 0.3078932636685949, "grad_norm": 1.09375, "learning_rate": 2.9776311579957372e-05, "loss": 0.5797, "mean_token_accuracy": 0.7567919301986694, "num_tokens": 50152863.0, "step": 1650 }, { "entropy": 0.5468079242110252, "epoch": 0.3172233625676432, "grad_norm": 1.2421875, "learning_rate": 2.976168100753889e-05, "loss": 0.5677, "mean_token_accuracy": 0.7582010948657989, "num_tokens": 51722410.0, "step": 1700 }, { "entropy": 0.5553153255581855, "epoch": 0.32655346146669156, "grad_norm": 0.97265625, "learning_rate": 2.974659126118485e-05, "loss": 0.5696, "mean_token_accuracy": 0.7591327953338624, "num_tokens": 53257454.0, "step": 1750 }, { "entropy": 0.553457222878933, "epoch": 0.33588356036573985, "grad_norm": 0.49609375, "learning_rate": 2.973104286336433e-05, "loss": 0.5725, "mean_token_accuracy": 0.7564568722248077, "num_tokens": 54784162.0, "step": 1800 }, { "entropy": 0.5699556747078895, "epoch": 0.3452136592647882, "grad_norm": 0.69140625, "learning_rate": 2.971503635242682e-05, "loss": 0.5756, "mean_token_accuracy": 0.7566489219665528, "num_tokens": 56304516.0, "step": 1850 }, { "entropy": 0.5491332325339318, "epoch": 0.35454375816383654, "grad_norm": 0.7265625, "learning_rate": 2.9698572282583534e-05, "loss": 0.5683, "mean_token_accuracy": 0.7583828049898148, "num_tokens": 57819074.0, "step": 1900 }, { "entropy": 0.5511914587020874, "epoch": 0.3638738570628849, "grad_norm": 0.7265625, "learning_rate": 2.9681651223888298e-05, "loss": 0.571, "mean_token_accuracy": 0.7572992449998855, "num_tokens": 59346739.0, "step": 1950 }, { "entropy": 0.5618056333065033, "epoch": 0.3732039559619332, "grad_norm": 0.72265625, "learning_rate": 2.966427376221774e-05, "loss": 0.5792, "mean_token_accuracy": 0.7539066845178604, "num_tokens": 60876192.0, "step": 2000 }, { "epoch": 0.3732039559619332, "eval_entropy": 0.5431396165572175, "eval_loss": 0.5645309686660767, "eval_mean_token_accuracy": 0.7612812196442841, "eval_num_tokens": 60876192.0, "eval_runtime": 16.0974, "eval_samples_per_second": 53.86, "eval_steps_per_second": 6.771, "step": 2000 }, { "entropy": 0.5605012658238411, "epoch": 0.3825340548609815, "grad_norm": 0.578125, "learning_rate": 2.9646440499251056e-05, "loss": 0.5912, "mean_token_accuracy": 0.7569118171930314, "num_tokens": 62368509.0, "step": 2050 }, { "entropy": 0.5493465921282769, "epoch": 0.39186415376002987, "grad_norm": 1.1875, "learning_rate": 2.9628152052449148e-05, "loss": 0.5668, "mean_token_accuracy": 0.7606059044599534, "num_tokens": 63850871.0, "step": 2100 }, { "entropy": 0.5519525390863419, "epoch": 0.4011942526590782, "grad_norm": 0.8359375, "learning_rate": 2.960940905503325e-05, "loss": 0.5736, "mean_token_accuracy": 0.7569921463727951, "num_tokens": 65368844.0, "step": 2150 }, { "entropy": 0.5381503540277481, "epoch": 0.4105243515581265, "grad_norm": 0.6953125, "learning_rate": 2.9590212155963024e-05, "loss": 0.5602, "mean_token_accuracy": 0.7622320890426636, "num_tokens": 66879984.0, "step": 2200 }, { "entropy": 0.5599228474497795, "epoch": 0.41985445045717484, "grad_norm": 0.5390625, "learning_rate": 2.9570562019914053e-05, "loss": 0.5736, "mean_token_accuracy": 0.7590676909685135, "num_tokens": 68396429.0, "step": 2250 }, { "entropy": 0.5572463124990463, "epoch": 0.4291845493562232, "grad_norm": 0.458984375, "learning_rate": 2.9550459327254864e-05, "loss": 0.5732, "mean_token_accuracy": 0.7579269409179688, "num_tokens": 69905569.0, "step": 2300 }, { "entropy": 0.5512515944242478, "epoch": 0.43851464825527153, "grad_norm": 0.83203125, "learning_rate": 2.9529904774023353e-05, "loss": 0.5715, "mean_token_accuracy": 0.7546812242269516, "num_tokens": 71496156.0, "step": 2350 }, { "entropy": 0.5452529183030128, "epoch": 0.4478447471543198, "grad_norm": 1.453125, "learning_rate": 2.9508899071902684e-05, "loss": 0.5667, "mean_token_accuracy": 0.7611679089069366, "num_tokens": 72993303.0, "step": 2400 }, { "entropy": 0.5496764704585075, "epoch": 0.45717484605336817, "grad_norm": 0.80078125, "learning_rate": 2.9487442948196643e-05, "loss": 0.5677, "mean_token_accuracy": 0.7588638842105866, "num_tokens": 74502630.0, "step": 2450 }, { "entropy": 0.5490121757984161, "epoch": 0.4665049449524165, "grad_norm": 0.75390625, "learning_rate": 2.9465537145804476e-05, "loss": 0.5685, "mean_token_accuracy": 0.7586365014314651, "num_tokens": 76031000.0, "step": 2500 }, { "entropy": 0.5405518284440041, "epoch": 0.4758350438514648, "grad_norm": 1.0234375, "learning_rate": 2.944318242319515e-05, "loss": 0.562, "mean_token_accuracy": 0.7639656978845596, "num_tokens": 77482894.0, "step": 2550 }, { "entropy": 0.5369963318109512, "epoch": 0.48516514275051315, "grad_norm": 1.2109375, "learning_rate": 2.94203795543811e-05, "loss": 0.5595, "mean_token_accuracy": 0.7631033205986023, "num_tokens": 78956007.0, "step": 2600 }, { "entropy": 0.5459193900227547, "epoch": 0.4944952416495615, "grad_norm": 0.63671875, "learning_rate": 2.939712932889142e-05, "loss": 0.5678, "mean_token_accuracy": 0.7564943873882294, "num_tokens": 80549485.0, "step": 2650 }, { "entropy": 0.5403485292196274, "epoch": 0.5038253405486098, "grad_norm": 0.73828125, "learning_rate": 2.937343255174453e-05, "loss": 0.5665, "mean_token_accuracy": 0.7631701147556305, "num_tokens": 81987729.0, "step": 2700 }, { "entropy": 0.5511571237444878, "epoch": 0.5131554394476582, "grad_norm": 0.875, "learning_rate": 2.9349290043420315e-05, "loss": 0.5718, "mean_token_accuracy": 0.7589112591743469, "num_tokens": 83566503.0, "step": 2750 }, { "entropy": 0.545387190580368, "epoch": 0.5224855383467065, "grad_norm": 1.0390625, "learning_rate": 2.932470263983169e-05, "loss": 0.578, "mean_token_accuracy": 0.7594633424282073, "num_tokens": 85045132.0, "step": 2800 }, { "entropy": 0.5400431799888611, "epoch": 0.5318156372457548, "grad_norm": 0.60546875, "learning_rate": 2.929967119229569e-05, "loss": 0.5639, "mean_token_accuracy": 0.7595540487766266, "num_tokens": 86552294.0, "step": 2850 }, { "entropy": 0.5613244980573654, "epoch": 0.5411457361448031, "grad_norm": 1.0, "learning_rate": 2.9274196567503974e-05, "loss": 0.5882, "mean_token_accuracy": 0.7518465319275855, "num_tokens": 88138088.0, "step": 2900 }, { "entropy": 0.5406323432922363, "epoch": 0.5504758350438514, "grad_norm": 0.68359375, "learning_rate": 2.9248279647492817e-05, "loss": 0.563, "mean_token_accuracy": 0.7594792503118515, "num_tokens": 89629470.0, "step": 2950 }, { "entropy": 0.5328826600313187, "epoch": 0.5598059339428998, "grad_norm": 0.7890625, "learning_rate": 2.9221921329612568e-05, "loss": 0.559, "mean_token_accuracy": 0.7616329395771027, "num_tokens": 91153981.0, "step": 3000 }, { "epoch": 0.5598059339428998, "eval_entropy": 0.5315866710942819, "eval_loss": 0.5568196773529053, "eval_mean_token_accuracy": 0.7634841575535065, "eval_num_tokens": 91153981.0, "eval_runtime": 16.3141, "eval_samples_per_second": 53.144, "eval_steps_per_second": 6.681, "step": 3000 }, { "entropy": 0.5549379280209541, "epoch": 0.5691360328419481, "grad_norm": 0.640625, "learning_rate": 2.9195122526496596e-05, "loss": 0.571, "mean_token_accuracy": 0.7589174765348434, "num_tokens": 92704541.0, "step": 3050 }, { "entropy": 0.5324976027011872, "epoch": 0.5784661317409965, "grad_norm": 0.9140625, "learning_rate": 2.9167884166029674e-05, "loss": 0.558, "mean_token_accuracy": 0.766141871213913, "num_tokens": 94114581.0, "step": 3100 }, { "entropy": 0.5359655514359474, "epoch": 0.5877962306400448, "grad_norm": 1.0546875, "learning_rate": 2.9140207191315857e-05, "loss": 0.5609, "mean_token_accuracy": 0.7602073633670807, "num_tokens": 95640629.0, "step": 3150 }, { "entropy": 0.5375589004158974, "epoch": 0.5971263295390931, "grad_norm": 1.0078125, "learning_rate": 2.911209256064584e-05, "loss": 0.5567, "mean_token_accuracy": 0.7607348054647446, "num_tokens": 97223569.0, "step": 3200 }, { "entropy": 0.5388145217299461, "epoch": 0.6064564284381414, "grad_norm": 0.73046875, "learning_rate": 2.9083541247463754e-05, "loss": 0.5612, "mean_token_accuracy": 0.7596866941452026, "num_tokens": 98767227.0, "step": 3250 }, { "entropy": 0.5369637748599052, "epoch": 0.6157865273371897, "grad_norm": 0.56640625, "learning_rate": 2.9054554240333478e-05, "loss": 0.5601, "mean_token_accuracy": 0.7642514258623123, "num_tokens": 100228436.0, "step": 3300 }, { "entropy": 0.5306126582622528, "epoch": 0.6251166262362381, "grad_norm": 1.3515625, "learning_rate": 2.9025132542904414e-05, "loss": 0.5548, "mean_token_accuracy": 0.7639524918794632, "num_tokens": 101762895.0, "step": 3350 }, { "entropy": 0.532108125090599, "epoch": 0.6344467251352864, "grad_norm": 0.48828125, "learning_rate": 2.8995277173876718e-05, "loss": 0.5565, "mean_token_accuracy": 0.7622706252336502, "num_tokens": 103288607.0, "step": 3400 }, { "entropy": 0.530606449842453, "epoch": 0.6437768240343348, "grad_norm": 1.1796875, "learning_rate": 2.896498916696605e-05, "loss": 0.5598, "mean_token_accuracy": 0.763903112411499, "num_tokens": 104756099.0, "step": 3450 }, { "entropy": 0.5285736629366875, "epoch": 0.6531069229333831, "grad_norm": 1.0703125, "learning_rate": 2.8934269570867776e-05, "loss": 0.5536, "mean_token_accuracy": 0.7639499133825303, "num_tokens": 106245797.0, "step": 3500 }, { "entropy": 0.5446360909938812, "epoch": 0.6624370218324315, "grad_norm": 2.125, "learning_rate": 2.890311944922064e-05, "loss": 0.5789, "mean_token_accuracy": 0.7568975293636322, "num_tokens": 107770481.0, "step": 3550 }, { "entropy": 0.5234782636165619, "epoch": 0.6717671207314797, "grad_norm": 1.7578125, "learning_rate": 2.8871539880569963e-05, "loss": 0.5532, "mean_token_accuracy": 0.7649819606542587, "num_tokens": 109269688.0, "step": 3600 }, { "entropy": 0.5393576291203499, "epoch": 0.681097219630528, "grad_norm": 0.69140625, "learning_rate": 2.8839531958330277e-05, "loss": 0.5617, "mean_token_accuracy": 0.759439873099327, "num_tokens": 110803400.0, "step": 3650 }, { "entropy": 0.539838764667511, "epoch": 0.6904273185295764, "grad_norm": 0.76171875, "learning_rate": 2.880709679074749e-05, "loss": 0.5631, "mean_token_accuracy": 0.760960082411766, "num_tokens": 112340326.0, "step": 3700 }, { "entropy": 0.5437505677342415, "epoch": 0.6997574174286247, "grad_norm": 0.9296875, "learning_rate": 2.8774235500860494e-05, "loss": 0.5656, "mean_token_accuracy": 0.7594961816072464, "num_tokens": 113873379.0, "step": 3750 }, { "entropy": 0.5357860559225083, "epoch": 0.7090875163276731, "grad_norm": 0.640625, "learning_rate": 2.874094922646229e-05, "loss": 0.5611, "mean_token_accuracy": 0.7590708369016648, "num_tokens": 115408557.0, "step": 3800 }, { "entropy": 0.5340870246291161, "epoch": 0.7184176152267214, "grad_norm": 0.953125, "learning_rate": 2.870723912006058e-05, "loss": 0.5552, "mean_token_accuracy": 0.765527902841568, "num_tokens": 116891513.0, "step": 3850 }, { "entropy": 0.5364308878779411, "epoch": 0.7277477141257698, "grad_norm": 1.109375, "learning_rate": 2.867310634883789e-05, "loss": 0.5518, "mean_token_accuracy": 0.7665286004543305, "num_tokens": 118406575.0, "step": 3900 }, { "entropy": 0.5389542949199676, "epoch": 0.7370778130248181, "grad_norm": 0.390625, "learning_rate": 2.863855209461113e-05, "loss": 0.5628, "mean_token_accuracy": 0.7604682886600495, "num_tokens": 119888774.0, "step": 3950 }, { "entropy": 0.5373398035764694, "epoch": 0.7464079119238664, "grad_norm": 0.6484375, "learning_rate": 2.8603577553790682e-05, "loss": 0.5654, "mean_token_accuracy": 0.7639918619394303, "num_tokens": 121314895.0, "step": 4000 }, { "epoch": 0.7464079119238664, "eval_entropy": 0.5262507380695518, "eval_loss": 0.5490807890892029, "eval_mean_token_accuracy": 0.7650549843770649, "eval_num_tokens": 121314895.0, "eval_runtime": 16.0402, "eval_samples_per_second": 54.052, "eval_steps_per_second": 6.795, "step": 4000 }, { "entropy": 0.5303604575991631, "epoch": 0.7557380108229147, "grad_norm": 0.65234375, "learning_rate": 2.8568183937338984e-05, "loss": 0.5609, "mean_token_accuracy": 0.7618992066383362, "num_tokens": 122793213.0, "step": 4050 }, { "entropy": 0.5326658990979195, "epoch": 0.765068109721963, "grad_norm": 1.1171875, "learning_rate": 2.8532372470728608e-05, "loss": 0.5552, "mean_token_accuracy": 0.7656506180763245, "num_tokens": 124227341.0, "step": 4100 }, { "entropy": 0.5209727981686592, "epoch": 0.7743982086210114, "grad_norm": 0.69140625, "learning_rate": 2.8496144393899784e-05, "loss": 0.5516, "mean_token_accuracy": 0.7649285507202148, "num_tokens": 125707121.0, "step": 4150 }, { "entropy": 0.5248242399096489, "epoch": 0.7837283075200597, "grad_norm": 0.734375, "learning_rate": 2.8459500961217533e-05, "loss": 0.5534, "mean_token_accuracy": 0.7604096215963364, "num_tokens": 127238194.0, "step": 4200 }, { "entropy": 0.532697811126709, "epoch": 0.7930584064191081, "grad_norm": 0.80078125, "learning_rate": 2.842244344142819e-05, "loss": 0.5622, "mean_token_accuracy": 0.7587612766027451, "num_tokens": 128737550.0, "step": 4250 }, { "entropy": 0.5316076844930648, "epoch": 0.8023885053181564, "grad_norm": 0.515625, "learning_rate": 2.8384973117615488e-05, "loss": 0.5562, "mean_token_accuracy": 0.7626278126239776, "num_tokens": 130249756.0, "step": 4300 }, { "entropy": 0.5310768684744835, "epoch": 0.8117186042172047, "grad_norm": 0.78515625, "learning_rate": 2.8347091287156136e-05, "loss": 0.5575, "mean_token_accuracy": 0.7627124708890914, "num_tokens": 131739377.0, "step": 4350 }, { "entropy": 0.5297018462419509, "epoch": 0.821048703116253, "grad_norm": 0.72265625, "learning_rate": 2.8308799261674898e-05, "loss": 0.5556, "mean_token_accuracy": 0.7631601667404175, "num_tokens": 133264527.0, "step": 4400 }, { "entropy": 0.5304474216699601, "epoch": 0.8303788020153013, "grad_norm": 1.140625, "learning_rate": 2.8270098366999166e-05, "loss": 0.5597, "mean_token_accuracy": 0.7665414202213288, "num_tokens": 134690231.0, "step": 4450 }, { "entropy": 0.5239725235104561, "epoch": 0.8397089009143497, "grad_norm": 0.796875, "learning_rate": 2.8230989943113075e-05, "loss": 0.5517, "mean_token_accuracy": 0.763014947772026, "num_tokens": 136226470.0, "step": 4500 }, { "entropy": 0.527112789452076, "epoch": 0.849038999813398, "grad_norm": 0.84375, "learning_rate": 2.8191475344111103e-05, "loss": 0.5524, "mean_token_accuracy": 0.7613210624456406, "num_tokens": 137780275.0, "step": 4550 }, { "entropy": 0.5286294043064117, "epoch": 0.8583690987124464, "grad_norm": 0.68359375, "learning_rate": 2.8151555938151165e-05, "loss": 0.557, "mean_token_accuracy": 0.7616584074497222, "num_tokens": 139330494.0, "step": 4600 }, { "entropy": 0.5276841628551483, "epoch": 0.8676991976114947, "grad_norm": 0.609375, "learning_rate": 2.811123310740726e-05, "loss": 0.5495, "mean_token_accuracy": 0.7647832882404327, "num_tokens": 140815597.0, "step": 4650 }, { "entropy": 0.5373046767711639, "epoch": 0.8770292965105431, "grad_norm": 0.67578125, "learning_rate": 2.807050824802163e-05, "loss": 0.5624, "mean_token_accuracy": 0.758348998427391, "num_tokens": 142383943.0, "step": 4700 }, { "entropy": 0.5215965616703033, "epoch": 0.8863593954095913, "grad_norm": 0.6171875, "learning_rate": 2.802938277005638e-05, "loss": 0.5462, "mean_token_accuracy": 0.7664449107646942, "num_tokens": 143879403.0, "step": 4750 }, { "entropy": 0.5327892461419106, "epoch": 0.8956894943086396, "grad_norm": 0.38671875, "learning_rate": 2.7987858097444688e-05, "loss": 0.5618, "mean_token_accuracy": 0.7579188454151153, "num_tokens": 145455384.0, "step": 4800 }, { "entropy": 0.5209793072938919, "epoch": 0.905019593207688, "grad_norm": 0.90625, "learning_rate": 2.794593566794149e-05, "loss": 0.5502, "mean_token_accuracy": 0.7619763416051865, "num_tokens": 147010897.0, "step": 4850 }, { "entropy": 0.5109856846928597, "epoch": 0.9143496921067363, "grad_norm": 0.4375, "learning_rate": 2.7903616933073712e-05, "loss": 0.5471, "mean_token_accuracy": 0.7652358949184418, "num_tokens": 148509259.0, "step": 4900 }, { "entropy": 0.5274619281291961, "epoch": 0.9236797910057847, "grad_norm": 1.1171875, "learning_rate": 2.786090335808998e-05, "loss": 0.5546, "mean_token_accuracy": 0.7621842390298843, "num_tokens": 149982645.0, "step": 4950 }, { "entropy": 0.5342949241399765, "epoch": 0.933009889904833, "grad_norm": 0.58984375, "learning_rate": 2.7817796421909922e-05, "loss": 0.5682, "mean_token_accuracy": 0.7593452525138855, "num_tokens": 151532149.0, "step": 5000 }, { "epoch": 0.933009889904833, "eval_entropy": 0.5146727717797691, "eval_loss": 0.5413097739219666, "eval_mean_token_accuracy": 0.767384243120841, "eval_num_tokens": 151532149.0, "eval_runtime": 16.1495, "eval_samples_per_second": 53.686, "eval_steps_per_second": 6.749, "step": 5000 }, { "entropy": 0.5231131237745285, "epoch": 0.9423399888038814, "grad_norm": 0.408203125, "learning_rate": 2.7774297617072963e-05, "loss": 0.554, "mean_token_accuracy": 0.7625243580341339, "num_tokens": 153059857.0, "step": 5050 }, { "entropy": 0.5220058736205101, "epoch": 0.9516700877029296, "grad_norm": 1.140625, "learning_rate": 2.7730408449686593e-05, "loss": 0.5496, "mean_token_accuracy": 0.7626436889171601, "num_tokens": 154578128.0, "step": 5100 }, { "entropy": 0.5289176645874977, "epoch": 0.961000186601978, "grad_norm": 1.046875, "learning_rate": 2.7686130439374304e-05, "loss": 0.5555, "mean_token_accuracy": 0.7623570781946182, "num_tokens": 156133259.0, "step": 5150 }, { "entropy": 0.5238713613152504, "epoch": 0.9703302855010263, "grad_norm": 0.92578125, "learning_rate": 2.7641465119222893e-05, "loss": 0.5525, "mean_token_accuracy": 0.7626954644918442, "num_tokens": 157655593.0, "step": 5200 }, { "entropy": 0.526420825123787, "epoch": 0.9796603844000746, "grad_norm": 0.53125, "learning_rate": 2.7596414035729436e-05, "loss": 0.5523, "mean_token_accuracy": 0.7634574353694916, "num_tokens": 159173170.0, "step": 5250 }, { "entropy": 0.5410353738069534, "epoch": 0.988990483299123, "grad_norm": 0.70703125, "learning_rate": 2.755097874874772e-05, "loss": 0.5616, "mean_token_accuracy": 0.7588550513982772, "num_tokens": 160746023.0, "step": 5300 }, { "entropy": 0.5253653234243393, "epoch": 0.9983205821981713, "grad_norm": 0.984375, "learning_rate": 2.7505160831434235e-05, "loss": 0.5538, "mean_token_accuracy": 0.7632267904281617, "num_tokens": 162234796.0, "step": 5350 }, { "entropy": 0.4956328600645065, "epoch": 1.0076506810972197, "grad_norm": 2.28125, "learning_rate": 2.7458961870193697e-05, "loss": 0.5385, "mean_token_accuracy": 0.7668155688047409, "num_tokens": 163759549.0, "step": 5400 }, { "entropy": 0.4777234876155853, "epoch": 1.016980779996268, "grad_norm": 0.8203125, "learning_rate": 2.741238346462415e-05, "loss": 0.5267, "mean_token_accuracy": 0.7712256401777268, "num_tokens": 165218690.0, "step": 5450 }, { "entropy": 0.4960213273763657, "epoch": 1.0263108788953164, "grad_norm": 0.45703125, "learning_rate": 2.7365427227461538e-05, "loss": 0.5316, "mean_token_accuracy": 0.7680883568525314, "num_tokens": 166779383.0, "step": 5500 }, { "entropy": 0.48663112640380857, "epoch": 1.0356409777943647, "grad_norm": 0.66015625, "learning_rate": 2.731809478452392e-05, "loss": 0.5271, "mean_token_accuracy": 0.772123327255249, "num_tokens": 168262830.0, "step": 5550 }, { "entropy": 0.501155666410923, "epoch": 1.044971076693413, "grad_norm": 0.984375, "learning_rate": 2.72703877746551e-05, "loss": 0.5439, "mean_token_accuracy": 0.7648611211776734, "num_tokens": 169844757.0, "step": 5600 }, { "entropy": 0.5043464726209641, "epoch": 1.0543011755924612, "grad_norm": 0.69140625, "learning_rate": 2.7222307849667976e-05, "loss": 0.5458, "mean_token_accuracy": 0.7605859559774398, "num_tokens": 171432040.0, "step": 5650 }, { "entropy": 0.4868525117635727, "epoch": 1.0636312744915095, "grad_norm": 0.81640625, "learning_rate": 2.7173856674287276e-05, "loss": 0.532, "mean_token_accuracy": 0.7668398702144623, "num_tokens": 172949249.0, "step": 5700 }, { "entropy": 0.48061770796775816, "epoch": 1.0729613733905579, "grad_norm": 0.412109375, "learning_rate": 2.7125035926091948e-05, "loss": 0.5247, "mean_token_accuracy": 0.7707643383741378, "num_tokens": 174427624.0, "step": 5750 }, { "entropy": 0.49181210845708845, "epoch": 1.0822914722896062, "grad_norm": 0.703125, "learning_rate": 2.7075847295457074e-05, "loss": 0.5387, "mean_token_accuracy": 0.7684424781799316, "num_tokens": 175889073.0, "step": 5800 }, { "entropy": 0.47926140516996385, "epoch": 1.0916215711886545, "grad_norm": 0.828125, "learning_rate": 2.702629248549533e-05, "loss": 0.5243, "mean_token_accuracy": 0.7723050940036774, "num_tokens": 177318508.0, "step": 5850 }, { "entropy": 0.49550765454769136, "epoch": 1.100951670087703, "grad_norm": 1.046875, "learning_rate": 2.6976373211998036e-05, "loss": 0.5369, "mean_token_accuracy": 0.7672231763601303, "num_tokens": 178841949.0, "step": 5900 }, { "entropy": 0.4972337147593498, "epoch": 1.1102817689867512, "grad_norm": 0.57421875, "learning_rate": 2.6926091203375736e-05, "loss": 0.5359, "mean_token_accuracy": 0.7673702806234359, "num_tokens": 180355456.0, "step": 5950 }, { "entropy": 0.49163941740989686, "epoch": 1.1196118678857996, "grad_norm": 0.640625, "learning_rate": 2.6875448200598356e-05, "loss": 0.53, "mean_token_accuracy": 0.76685063123703, "num_tokens": 181895417.0, "step": 6000 }, { "epoch": 1.1196118678857996, "eval_entropy": 0.49277083069906324, "eval_loss": 0.5361812114715576, "eval_mean_token_accuracy": 0.7686252342451603, "eval_num_tokens": 181895417.0, "eval_runtime": 16.2137, "eval_samples_per_second": 53.473, "eval_steps_per_second": 6.723, "step": 6000 }, { "entropy": 0.494234202504158, "epoch": 1.128941966784848, "grad_norm": 0.67578125, "learning_rate": 2.682444595713492e-05, "loss": 0.534, "mean_token_accuracy": 0.7677116429805756, "num_tokens": 183441033.0, "step": 6050 }, { "entropy": 0.4924536618590355, "epoch": 1.1382720656838963, "grad_norm": 1.4375, "learning_rate": 2.6773086238892847e-05, "loss": 0.5325, "mean_token_accuracy": 0.7706644636392593, "num_tokens": 184930303.0, "step": 6100 }, { "entropy": 0.49195749253034593, "epoch": 1.1476021645829446, "grad_norm": 0.921875, "learning_rate": 2.67213708241568e-05, "loss": 0.5323, "mean_token_accuracy": 0.769943385720253, "num_tokens": 186461707.0, "step": 6150 }, { "entropy": 0.4922616305947304, "epoch": 1.156932263481993, "grad_norm": 1.046875, "learning_rate": 2.666930150352712e-05, "loss": 0.5331, "mean_token_accuracy": 0.765140592455864, "num_tokens": 188051673.0, "step": 6200 }, { "entropy": 0.4984354588389397, "epoch": 1.1662623623810413, "grad_norm": 0.98046875, "learning_rate": 2.661688007985782e-05, "loss": 0.5366, "mean_token_accuracy": 0.7678897380828857, "num_tokens": 189540976.0, "step": 6250 }, { "entropy": 0.4744232183694839, "epoch": 1.1755924612800897, "grad_norm": 0.79296875, "learning_rate": 2.6564108368194174e-05, "loss": 0.5188, "mean_token_accuracy": 0.7747587919235229, "num_tokens": 190972681.0, "step": 6300 }, { "entropy": 0.48836414963006974, "epoch": 1.184922560179138, "grad_norm": 0.7265625, "learning_rate": 2.6510988195709867e-05, "loss": 0.5945, "mean_token_accuracy": 0.7682410633563995, "num_tokens": 192480163.0, "step": 6350 }, { "entropy": 0.481321419775486, "epoch": 1.1942526590781863, "grad_norm": 0.58984375, "learning_rate": 2.6457521401643724e-05, "loss": 0.5223, "mean_token_accuracy": 0.7744923168420792, "num_tokens": 193963819.0, "step": 6400 }, { "entropy": 0.4845267793536186, "epoch": 1.2035827579772345, "grad_norm": 0.73828125, "learning_rate": 2.640370983723605e-05, "loss": 0.5331, "mean_token_accuracy": 0.7681414604187011, "num_tokens": 195501297.0, "step": 6450 }, { "entropy": 0.4920358270406723, "epoch": 1.2129128568762828, "grad_norm": 1.1171875, "learning_rate": 2.6349555365664496e-05, "loss": 0.5315, "mean_token_accuracy": 0.7675491815805435, "num_tokens": 197047201.0, "step": 6500 }, { "entropy": 0.4892509970068932, "epoch": 1.2222429557753312, "grad_norm": 0.46875, "learning_rate": 2.6295059861979585e-05, "loss": 0.5324, "mean_token_accuracy": 0.7672034209966659, "num_tokens": 198554820.0, "step": 6550 }, { "entropy": 0.48003152668476107, "epoch": 1.2315730546743795, "grad_norm": 0.82421875, "learning_rate": 2.6240225213039762e-05, "loss": 0.5256, "mean_token_accuracy": 0.7719682443141938, "num_tokens": 200055450.0, "step": 6600 }, { "entropy": 0.49669885337352754, "epoch": 1.2409031535734278, "grad_norm": 0.5390625, "learning_rate": 2.6185053317446094e-05, "loss": 0.5394, "mean_token_accuracy": 0.7652375429868699, "num_tokens": 201621193.0, "step": 6650 }, { "entropy": 0.47468378067016603, "epoch": 1.2502332524724762, "grad_norm": 0.298828125, "learning_rate": 2.6129546085476494e-05, "loss": 0.5181, "mean_token_accuracy": 0.7740450286865235, "num_tokens": 203115630.0, "step": 6700 }, { "entropy": 0.48166996002197265, "epoch": 1.2595633513715245, "grad_norm": 0.53515625, "learning_rate": 2.6073705439019604e-05, "loss": 0.5235, "mean_token_accuracy": 0.7715310126543045, "num_tokens": 204659783.0, "step": 6750 }, { "entropy": 0.487423982322216, "epoch": 1.2688934502705729, "grad_norm": 0.94140625, "learning_rate": 2.6017533311508262e-05, "loss": 0.5271, "mean_token_accuracy": 0.7717793607711791, "num_tokens": 206207207.0, "step": 6800 }, { "entropy": 0.490067283809185, "epoch": 1.2782235491696212, "grad_norm": 0.98046875, "learning_rate": 2.5961031647852525e-05, "loss": 0.5291, "mean_token_accuracy": 0.7670228743553161, "num_tokens": 207776741.0, "step": 6850 }, { "entropy": 0.47844831019639966, "epoch": 1.2875536480686696, "grad_norm": 0.60546875, "learning_rate": 2.590420240437236e-05, "loss": 0.5286, "mean_token_accuracy": 0.7723558592796326, "num_tokens": 209300786.0, "step": 6900 }, { "entropy": 0.4823366206884384, "epoch": 1.296883746967718, "grad_norm": 0.828125, "learning_rate": 2.584704754872988e-05, "loss": 0.5294, "mean_token_accuracy": 0.7727249205112457, "num_tokens": 210784669.0, "step": 6950 }, { "entropy": 0.4943872797489166, "epoch": 1.3062138458667663, "grad_norm": 0.8046875, "learning_rate": 2.578956905986124e-05, "loss": 0.5374, "mean_token_accuracy": 0.7674384766817093, "num_tokens": 212346372.0, "step": 7000 }, { "epoch": 1.3062138458667663, "eval_entropy": 0.48885188408947866, "eval_loss": 0.5305144190788269, "eval_mean_token_accuracy": 0.7706459596616413, "eval_num_tokens": 212346372.0, "eval_runtime": 16.2232, "eval_samples_per_second": 53.442, "eval_steps_per_second": 6.719, "step": 7000 }, { "entropy": 0.48752534478902815, "epoch": 1.3155439447658146, "grad_norm": 0.609375, "learning_rate": 2.573176892790812e-05, "loss": 0.5315, "mean_token_accuracy": 0.7696154469251633, "num_tokens": 213860843.0, "step": 7050 }, { "entropy": 0.4900174245238304, "epoch": 1.3248740436648627, "grad_norm": 0.53515625, "learning_rate": 2.567364915414877e-05, "loss": 0.5292, "mean_token_accuracy": 0.7694221770763398, "num_tokens": 215367383.0, "step": 7100 }, { "entropy": 0.48771278649568556, "epoch": 1.334204142563911, "grad_norm": 1.5859375, "learning_rate": 2.5615211750928794e-05, "loss": 0.5353, "mean_token_accuracy": 0.7697239458560944, "num_tokens": 216845421.0, "step": 7150 }, { "entropy": 0.47983416020870207, "epoch": 1.3435342414629594, "grad_norm": 0.6015625, "learning_rate": 2.555645874159141e-05, "loss": 0.5234, "mean_token_accuracy": 0.7707635217905044, "num_tokens": 218372903.0, "step": 7200 }, { "entropy": 0.47865484192967417, "epoch": 1.3528643403620078, "grad_norm": 0.59765625, "learning_rate": 2.549739216040743e-05, "loss": 0.5221, "mean_token_accuracy": 0.7720851230621338, "num_tokens": 219883662.0, "step": 7250 }, { "entropy": 0.4899566939473152, "epoch": 1.362194439261056, "grad_norm": 0.453125, "learning_rate": 2.5438014052504802e-05, "loss": 0.532, "mean_token_accuracy": 0.7674814122915268, "num_tokens": 221426709.0, "step": 7300 }, { "entropy": 0.4817213848233223, "epoch": 1.3715245381601044, "grad_norm": 0.53125, "learning_rate": 2.5378326473797818e-05, "loss": 0.5265, "mean_token_accuracy": 0.769315534234047, "num_tokens": 222950520.0, "step": 7350 }, { "entropy": 0.4752693668007851, "epoch": 1.3808546370591528, "grad_norm": 0.5859375, "learning_rate": 2.5318331490915925e-05, "loss": 0.5195, "mean_token_accuracy": 0.7739131230115891, "num_tokens": 224448854.0, "step": 7400 }, { "entropy": 0.4732086658477783, "epoch": 1.3901847359582011, "grad_norm": 0.9140625, "learning_rate": 2.525803118113215e-05, "loss": 0.5225, "mean_token_accuracy": 0.7735245388746261, "num_tokens": 225932263.0, "step": 7450 }, { "entropy": 0.48352263927459715, "epoch": 1.3995148348572495, "grad_norm": 0.546875, "learning_rate": 2.5197427632291214e-05, "loss": 0.531, "mean_token_accuracy": 0.7676544332504273, "num_tokens": 227452316.0, "step": 7500 }, { "entropy": 0.48029813915491104, "epoch": 1.4088449337562978, "grad_norm": 0.61328125, "learning_rate": 2.513652294273721e-05, "loss": 0.5257, "mean_token_accuracy": 0.7688204735517502, "num_tokens": 228995142.0, "step": 7550 }, { "entropy": 0.49092908948659897, "epoch": 1.4181750326553462, "grad_norm": 0.90625, "learning_rate": 2.507531922124096e-05, "loss": 0.5515, "mean_token_accuracy": 0.7691348105669021, "num_tokens": 230473945.0, "step": 7600 }, { "entropy": 0.46930390000343325, "epoch": 1.4275051315543945, "grad_norm": 0.91015625, "learning_rate": 2.501381858692701e-05, "loss": 0.5192, "mean_token_accuracy": 0.7731947559118271, "num_tokens": 231969989.0, "step": 7650 }, { "entropy": 0.4841861927509308, "epoch": 1.4368352304534429, "grad_norm": 0.74609375, "learning_rate": 2.495202316920024e-05, "loss": 0.5281, "mean_token_accuracy": 0.7702859449386597, "num_tokens": 233496905.0, "step": 7700 }, { "entropy": 0.47611463099718093, "epoch": 1.4461653293524912, "grad_norm": 0.9140625, "learning_rate": 2.488993510767214e-05, "loss": 0.5292, "mean_token_accuracy": 0.7676136875152588, "num_tokens": 235025876.0, "step": 7750 }, { "entropy": 0.480805746614933, "epoch": 1.4554954282515395, "grad_norm": 0.447265625, "learning_rate": 2.4827556552086753e-05, "loss": 0.5228, "mean_token_accuracy": 0.7714346569776535, "num_tokens": 236573132.0, "step": 7800 }, { "entropy": 0.4773865479230881, "epoch": 1.464825527150588, "grad_norm": 0.8828125, "learning_rate": 2.47648896622462e-05, "loss": 0.5233, "mean_token_accuracy": 0.7716735368967056, "num_tokens": 238108249.0, "step": 7850 }, { "entropy": 0.49567407727241514, "epoch": 1.4741556260496362, "grad_norm": 0.49609375, "learning_rate": 2.4701936607935922e-05, "loss": 0.548, "mean_token_accuracy": 0.7675740510225296, "num_tokens": 239609374.0, "step": 7900 }, { "entropy": 0.47016422227025034, "epoch": 1.4834857249486846, "grad_norm": 0.65625, "learning_rate": 2.463869956884957e-05, "loss": 0.5141, "mean_token_accuracy": 0.7800671440362931, "num_tokens": 241035036.0, "step": 7950 }, { "entropy": 0.4691419780254364, "epoch": 1.4928158238477327, "grad_norm": 0.6640625, "learning_rate": 2.457518073451348e-05, "loss": 0.5183, "mean_token_accuracy": 0.7747016477584839, "num_tokens": 242482052.0, "step": 8000 }, { "epoch": 1.4928158238477327, "eval_entropy": 0.48408404293410273, "eval_loss": 0.5262120962142944, "eval_mean_token_accuracy": 0.7715235532970603, "eval_num_tokens": 242482052.0, "eval_runtime": 16.1685, "eval_samples_per_second": 53.623, "eval_steps_per_second": 6.741, "step": 8000 }, { "entropy": 0.48640842020511627, "epoch": 1.5021459227467813, "grad_norm": 0.8671875, "learning_rate": 2.451138230421094e-05, "loss": 0.5291, "mean_token_accuracy": 0.768261170387268, "num_tokens": 244051944.0, "step": 8050 }, { "entropy": 0.4861644932627678, "epoch": 1.5114760216458294, "grad_norm": 0.86328125, "learning_rate": 2.4447306486905965e-05, "loss": 0.5303, "mean_token_accuracy": 0.7654324793815612, "num_tokens": 245667242.0, "step": 8100 }, { "entropy": 0.46581872284412384, "epoch": 1.5208061205448777, "grad_norm": 0.64453125, "learning_rate": 2.4382955501166878e-05, "loss": 0.517, "mean_token_accuracy": 0.7791347569227218, "num_tokens": 247084677.0, "step": 8150 }, { "entropy": 0.48655702769756315, "epoch": 1.530136219443926, "grad_norm": 0.5859375, "learning_rate": 2.4318331575089437e-05, "loss": 0.5283, "mean_token_accuracy": 0.7677739357948303, "num_tokens": 248666161.0, "step": 8200 }, { "entropy": 0.48474507868289946, "epoch": 1.5394663183429744, "grad_norm": 0.83203125, "learning_rate": 2.425343694621974e-05, "loss": 0.5218, "mean_token_accuracy": 0.7710594099760055, "num_tokens": 250228344.0, "step": 8250 }, { "entropy": 0.501356900036335, "epoch": 1.5487964172420228, "grad_norm": 1.09375, "learning_rate": 2.418827386147672e-05, "loss": 0.5391, "mean_token_accuracy": 0.7636065500974655, "num_tokens": 251842667.0, "step": 8300 }, { "entropy": 0.48092112705111506, "epoch": 1.5581265161410711, "grad_norm": 0.65625, "learning_rate": 2.4122844577074344e-05, "loss": 0.5246, "mean_token_accuracy": 0.7711791855096817, "num_tokens": 253387970.0, "step": 8350 }, { "entropy": 0.4703651532530785, "epoch": 1.5674566150401195, "grad_norm": 0.76171875, "learning_rate": 2.4057151358443537e-05, "loss": 0.523, "mean_token_accuracy": 0.7739822679758072, "num_tokens": 254893911.0, "step": 8400 }, { "entropy": 0.48381629049777986, "epoch": 1.5767867139391678, "grad_norm": 0.5390625, "learning_rate": 2.3991196480153678e-05, "loss": 0.5295, "mean_token_accuracy": 0.7688455355167388, "num_tokens": 256399394.0, "step": 8450 }, { "entropy": 0.4885594379901886, "epoch": 1.586116812838216, "grad_norm": 0.73828125, "learning_rate": 2.39249822258339e-05, "loss": 0.5269, "mean_token_accuracy": 0.7714411211013794, "num_tokens": 257935448.0, "step": 8500 }, { "entropy": 0.48577941954135895, "epoch": 1.5954469117372643, "grad_norm": 0.8828125, "learning_rate": 2.3858510888093997e-05, "loss": 0.5231, "mean_token_accuracy": 0.7718437218666077, "num_tokens": 259486589.0, "step": 8550 }, { "entropy": 0.48262311398983004, "epoch": 1.6047770106363126, "grad_norm": 0.578125, "learning_rate": 2.3791784768445045e-05, "loss": 0.5248, "mean_token_accuracy": 0.7686738175153732, "num_tokens": 261065847.0, "step": 8600 }, { "entropy": 0.471617269217968, "epoch": 1.614107109535361, "grad_norm": 0.9765625, "learning_rate": 2.3724806177219723e-05, "loss": 0.5203, "mean_token_accuracy": 0.772919489145279, "num_tokens": 262600411.0, "step": 8650 }, { "entropy": 0.47281612068414686, "epoch": 1.6234372084344093, "grad_norm": 1.9140625, "learning_rate": 2.36575774334923e-05, "loss": 0.5169, "mean_token_accuracy": 0.774740971326828, "num_tokens": 264141493.0, "step": 8700 }, { "entropy": 0.47484747022390367, "epoch": 1.6327673073334577, "grad_norm": 0.609375, "learning_rate": 2.359010086499838e-05, "loss": 0.5184, "mean_token_accuracy": 0.7732387953996658, "num_tokens": 265636375.0, "step": 8750 }, { "entropy": 0.47206893771886826, "epoch": 1.642097406232506, "grad_norm": 0.7890625, "learning_rate": 2.352237880805426e-05, "loss": 0.5303, "mean_token_accuracy": 0.7746653699874878, "num_tokens": 267090177.0, "step": 8800 }, { "entropy": 0.4694141258299351, "epoch": 1.6514275051315543, "grad_norm": 0.64453125, "learning_rate": 2.3454413607476044e-05, "loss": 0.5199, "mean_token_accuracy": 0.7748447281122207, "num_tokens": 268551821.0, "step": 8850 }, { "entropy": 0.47594692051410675, "epoch": 1.6607576040306027, "grad_norm": 1.125, "learning_rate": 2.3386207616498503e-05, "loss": 0.5186, "mean_token_accuracy": 0.773201887011528, "num_tokens": 270084312.0, "step": 8900 }, { "entropy": 0.47182066380977633, "epoch": 1.670087702929651, "grad_norm": 0.66796875, "learning_rate": 2.331776319669354e-05, "loss": 0.5286, "mean_token_accuracy": 0.7718379843235016, "num_tokens": 271546211.0, "step": 8950 }, { "entropy": 0.4656666761636734, "epoch": 1.6794178018286994, "grad_norm": 0.490234375, "learning_rate": 2.324908271788844e-05, "loss": 0.5158, "mean_token_accuracy": 0.7762594664096832, "num_tokens": 272998807.0, "step": 9000 }, { "epoch": 1.6794178018286994, "eval_entropy": 0.4748075161505183, "eval_loss": 0.5214188694953918, "eval_mean_token_accuracy": 0.7731986155203723, "eval_num_tokens": 272998807.0, "eval_runtime": 16.1272, "eval_samples_per_second": 53.76, "eval_steps_per_second": 6.759, "step": 9000 }, { "entropy": 0.4896981066465378, "epoch": 1.6887479007277477, "grad_norm": 0.828125, "learning_rate": 2.3180168558083844e-05, "loss": 0.5287, "mean_token_accuracy": 0.7671917879581451, "num_tokens": 274587533.0, "step": 9050 }, { "entropy": 0.4765408200025558, "epoch": 1.698077999626796, "grad_norm": 1.921875, "learning_rate": 2.311102310337136e-05, "loss": 0.5188, "mean_token_accuracy": 0.7724708420038223, "num_tokens": 276131737.0, "step": 9100 }, { "entropy": 0.4824476379156113, "epoch": 1.7074080985258444, "grad_norm": 0.46875, "learning_rate": 2.304164874785101e-05, "loss": 0.5267, "mean_token_accuracy": 0.7684733641147613, "num_tokens": 277715304.0, "step": 9150 }, { "entropy": 0.48801319271326066, "epoch": 1.7167381974248928, "grad_norm": 0.7890625, "learning_rate": 2.297204789354827e-05, "loss": 0.5345, "mean_token_accuracy": 0.7730580461025238, "num_tokens": 279224176.0, "step": 9200 }, { "entropy": 0.47024243041872976, "epoch": 1.726068296323941, "grad_norm": 0.70703125, "learning_rate": 2.2902222950330966e-05, "loss": 0.5208, "mean_token_accuracy": 0.7733591181039811, "num_tokens": 280746272.0, "step": 9250 }, { "entropy": 0.4675961661338806, "epoch": 1.7353983952229894, "grad_norm": 0.9140625, "learning_rate": 2.283217633582578e-05, "loss": 0.5249, "mean_token_accuracy": 0.7734372174739838, "num_tokens": 282236117.0, "step": 9300 }, { "entropy": 0.48016302675008776, "epoch": 1.7447284941220378, "grad_norm": 1.4921875, "learning_rate": 2.276191047533458e-05, "loss": 0.5231, "mean_token_accuracy": 0.770177600979805, "num_tokens": 283794545.0, "step": 9350 }, { "entropy": 0.47259823501110076, "epoch": 1.7540585930210861, "grad_norm": 0.71484375, "learning_rate": 2.269142780175042e-05, "loss": 0.5192, "mean_token_accuracy": 0.7728340399265289, "num_tokens": 285333807.0, "step": 9400 }, { "entropy": 0.47532046377658843, "epoch": 1.7633886919201345, "grad_norm": 0.86328125, "learning_rate": 2.2620730755473328e-05, "loss": 0.5259, "mean_token_accuracy": 0.7706443351507187, "num_tokens": 286852697.0, "step": 9450 }, { "entropy": 0.4715300849080086, "epoch": 1.7727187908191828, "grad_norm": 0.6640625, "learning_rate": 2.2549821784325787e-05, "loss": 0.5226, "mean_token_accuracy": 0.7710268515348434, "num_tokens": 288390169.0, "step": 9500 }, { "entropy": 0.45817104071378706, "epoch": 1.7820488897182312, "grad_norm": 0.65234375, "learning_rate": 2.2478703343467995e-05, "loss": 0.5084, "mean_token_accuracy": 0.7808880287408829, "num_tokens": 289825517.0, "step": 9550 }, { "entropy": 0.46380849391222, "epoch": 1.7913789886172793, "grad_norm": 0.609375, "learning_rate": 2.2407377895312848e-05, "loss": 0.522, "mean_token_accuracy": 0.7710244971513748, "num_tokens": 291345905.0, "step": 9600 }, { "entropy": 0.4663406872749329, "epoch": 1.8007090875163276, "grad_norm": 0.97265625, "learning_rate": 2.2335847909440697e-05, "loss": 0.5174, "mean_token_accuracy": 0.7765918165445328, "num_tokens": 292843956.0, "step": 9650 }, { "entropy": 0.46827891767024993, "epoch": 1.810039186415376, "grad_norm": 1.4765625, "learning_rate": 2.226411586251381e-05, "loss": 0.5135, "mean_token_accuracy": 0.7766558998823165, "num_tokens": 294304944.0, "step": 9700 }, { "entropy": 0.4667487397789955, "epoch": 1.8193692853144243, "grad_norm": 0.75390625, "learning_rate": 2.2192184238190666e-05, "loss": 0.5173, "mean_token_accuracy": 0.7746782380342484, "num_tokens": 295807703.0, "step": 9750 }, { "entropy": 0.4796573233604431, "epoch": 1.8286993842134727, "grad_norm": 0.80078125, "learning_rate": 2.2120055527039914e-05, "loss": 0.5265, "mean_token_accuracy": 0.7696005600690842, "num_tokens": 297318135.0, "step": 9800 }, { "entropy": 0.47267288982868194, "epoch": 1.838029483112521, "grad_norm": 0.9140625, "learning_rate": 2.2047732226454157e-05, "loss": 0.5153, "mean_token_accuracy": 0.7761634987592697, "num_tokens": 298791062.0, "step": 9850 }, { "entropy": 0.47834465265274045, "epoch": 1.8473595820115694, "grad_norm": 0.625, "learning_rate": 2.1975216840563502e-05, "loss": 0.5222, "mean_token_accuracy": 0.7747543674707412, "num_tokens": 300275324.0, "step": 9900 }, { "entropy": 0.46263691544532776, "epoch": 1.8566896809106177, "grad_norm": 1.0, "learning_rate": 2.1902511880148835e-05, "loss": 0.5148, "mean_token_accuracy": 0.7750599044561386, "num_tokens": 301729884.0, "step": 9950 }, { "entropy": 0.4763942888379097, "epoch": 1.8660197798096658, "grad_norm": 0.796875, "learning_rate": 2.1829619862554877e-05, "loss": 0.5183, "mean_token_accuracy": 0.7708618581295014, "num_tokens": 303287949.0, "step": 10000 }, { "epoch": 1.8660197798096658, "eval_entropy": 0.4685867871713201, "eval_loss": 0.5169408321380615, "eval_mean_token_accuracy": 0.7741667956387231, "eval_num_tokens": 303287949.0, "eval_runtime": 16.0644, "eval_samples_per_second": 53.97, "eval_steps_per_second": 6.785, "step": 10000 }, { "entropy": 0.47606048226356507, "epoch": 1.8753498787087142, "grad_norm": 1.109375, "learning_rate": 2.175654331160305e-05, "loss": 0.5189, "mean_token_accuracy": 0.7730025327205658, "num_tokens": 304818318.0, "step": 10050 }, { "entropy": 0.4616896215081215, "epoch": 1.8846799776077625, "grad_norm": 0.9140625, "learning_rate": 2.168328475750408e-05, "loss": 0.5089, "mean_token_accuracy": 0.7780868858098984, "num_tokens": 306331367.0, "step": 10100 }, { "entropy": 0.47027878910303117, "epoch": 1.8940100765068109, "grad_norm": 0.53125, "learning_rate": 2.160984673677039e-05, "loss": 0.5186, "mean_token_accuracy": 0.772971043586731, "num_tokens": 307830293.0, "step": 10150 }, { "entropy": 0.4677613499760628, "epoch": 1.9033401754058592, "grad_norm": 0.376953125, "learning_rate": 2.153623179212827e-05, "loss": 0.5163, "mean_token_accuracy": 0.7740986323356629, "num_tokens": 309360930.0, "step": 10200 }, { "entropy": 0.47715963318943977, "epoch": 1.9126702743049075, "grad_norm": 0.73046875, "learning_rate": 2.146244247242985e-05, "loss": 0.5271, "mean_token_accuracy": 0.7717376494407654, "num_tokens": 310855697.0, "step": 10250 }, { "entropy": 0.46572228729724885, "epoch": 1.922000373203956, "grad_norm": 0.71875, "learning_rate": 2.1388481332564835e-05, "loss": 0.5145, "mean_token_accuracy": 0.774823442697525, "num_tokens": 312370909.0, "step": 10300 }, { "entropy": 0.46713058680295944, "epoch": 1.9313304721030042, "grad_norm": 0.404296875, "learning_rate": 2.1314350933372053e-05, "loss": 0.5129, "mean_token_accuracy": 0.7726266753673553, "num_tokens": 313900324.0, "step": 10350 }, { "entropy": 0.4743108308315277, "epoch": 1.9406605710020526, "grad_norm": 0.8359375, "learning_rate": 2.1240053841550792e-05, "loss": 0.5226, "mean_token_accuracy": 0.7713726377487182, "num_tokens": 315441715.0, "step": 10400 }, { "entropy": 0.4706794250011444, "epoch": 1.949990669901101, "grad_norm": 0.9453125, "learning_rate": 2.1165592629571923e-05, "loss": 0.517, "mean_token_accuracy": 0.7740881043672562, "num_tokens": 316910090.0, "step": 10450 }, { "entropy": 0.4839508882164955, "epoch": 1.9593207688001493, "grad_norm": 0.81640625, "learning_rate": 2.1090969875588827e-05, "loss": 0.5236, "mean_token_accuracy": 0.7707830715179443, "num_tokens": 318480506.0, "step": 10500 }, { "entropy": 0.4602850756049156, "epoch": 1.9686508676991976, "grad_norm": 2.34375, "learning_rate": 2.1016188163348126e-05, "loss": 0.5121, "mean_token_accuracy": 0.7764248877763749, "num_tokens": 320008211.0, "step": 10550 }, { "entropy": 0.4730991995334625, "epoch": 1.977980966598246, "grad_norm": 0.51171875, "learning_rate": 2.0941250082100253e-05, "loss": 0.5288, "mean_token_accuracy": 0.7702510052919388, "num_tokens": 321525032.0, "step": 10600 }, { "entropy": 0.446844310760498, "epoch": 1.9873110654972943, "grad_norm": 0.296875, "learning_rate": 2.0866158226509758e-05, "loss": 0.5058, "mean_token_accuracy": 0.7796232843399048, "num_tokens": 322954203.0, "step": 10650 }, { "entropy": 0.4643040466308594, "epoch": 1.9966411643963426, "grad_norm": 1.0, "learning_rate": 2.07909151965655e-05, "loss": 0.5125, "mean_token_accuracy": 0.7746139895915986, "num_tokens": 324473634.0, "step": 10700 }, { "entropy": 0.46121720626950263, "epoch": 2.005971263295391, "grad_norm": 0.7109375, "learning_rate": 2.071552359749062e-05, "loss": 0.5151, "mean_token_accuracy": 0.7720960187911987, "num_tokens": 325999882.0, "step": 10750 }, { "entropy": 0.425725160241127, "epoch": 2.0153013621944393, "grad_norm": 0.474609375, "learning_rate": 2.063998603965232e-05, "loss": 0.4962, "mean_token_accuracy": 0.7846533066034317, "num_tokens": 327442405.0, "step": 10800 }, { "entropy": 0.45527834951877594, "epoch": 2.0246314610934877, "grad_norm": 1.1328125, "learning_rate": 2.056430513847151e-05, "loss": 0.5079, "mean_token_accuracy": 0.7760635191202163, "num_tokens": 329012059.0, "step": 10850 }, { "entropy": 0.4397959718108177, "epoch": 2.033961559992536, "grad_norm": 1.03125, "learning_rate": 2.0488483514332225e-05, "loss": 0.4988, "mean_token_accuracy": 0.7809435164928437, "num_tokens": 330491757.0, "step": 10900 }, { "entropy": 0.4280716378986835, "epoch": 2.0432916588915844, "grad_norm": 0.796875, "learning_rate": 2.041252379249091e-05, "loss": 0.4888, "mean_token_accuracy": 0.785589964389801, "num_tokens": 331979812.0, "step": 10950 }, { "entropy": 0.44335421919822693, "epoch": 2.0526217577906327, "grad_norm": 0.5703125, "learning_rate": 2.0336428602985527e-05, "loss": 0.5037, "mean_token_accuracy": 0.7756373131275177, "num_tokens": 333477541.0, "step": 11000 }, { "epoch": 2.0526217577906327, "eval_entropy": 0.45860785820068567, "eval_loss": 0.5141582489013672, "eval_mean_token_accuracy": 0.7748899443433919, "eval_num_tokens": 333477541.0, "eval_runtime": 16.1759, "eval_samples_per_second": 53.598, "eval_steps_per_second": 6.738, "step": 11000 }, { "entropy": 0.43529929786920546, "epoch": 2.061951856689681, "grad_norm": 1.2109375, "learning_rate": 2.026020058054448e-05, "loss": 0.4936, "mean_token_accuracy": 0.7826171773672104, "num_tokens": 334939322.0, "step": 11050 }, { "entropy": 0.4596153527498245, "epoch": 2.0712819555887294, "grad_norm": 1.234375, "learning_rate": 2.018384236449539e-05, "loss": 0.5192, "mean_token_accuracy": 0.7747271412611008, "num_tokens": 336492472.0, "step": 11100 }, { "entropy": 0.45726164370775224, "epoch": 2.0806120544877778, "grad_norm": 0.3828125, "learning_rate": 2.0107356598673732e-05, "loss": 0.5075, "mean_token_accuracy": 0.7738985830545425, "num_tokens": 338077203.0, "step": 11150 }, { "entropy": 0.4586531579494476, "epoch": 2.089942153386826, "grad_norm": 0.423828125, "learning_rate": 2.0030745931331256e-05, "loss": 0.5128, "mean_token_accuracy": 0.7716344100236893, "num_tokens": 339676503.0, "step": 11200 }, { "entropy": 0.44181185990571975, "epoch": 2.0992722522858744, "grad_norm": 0.703125, "learning_rate": 1.995401301504434e-05, "loss": 0.5033, "mean_token_accuracy": 0.7782072865962982, "num_tokens": 341201747.0, "step": 11250 }, { "entropy": 0.431854664683342, "epoch": 2.1086023511849223, "grad_norm": 1.0390625, "learning_rate": 1.9877160506622106e-05, "loss": 0.49, "mean_token_accuracy": 0.7848282158374786, "num_tokens": 342665361.0, "step": 11300 }, { "entropy": 0.43534082144498826, "epoch": 2.1179324500839707, "grad_norm": 0.75, "learning_rate": 1.9800191067014458e-05, "loss": 0.4917, "mean_token_accuracy": 0.7822025471925735, "num_tokens": 344149761.0, "step": 11350 }, { "entropy": 0.444383510351181, "epoch": 2.127262548983019, "grad_norm": 0.494140625, "learning_rate": 1.9723107361219928e-05, "loss": 0.5044, "mean_token_accuracy": 0.7769185125827789, "num_tokens": 345653004.0, "step": 11400 }, { "entropy": 0.4527071109414101, "epoch": 2.1365926478820674, "grad_norm": 0.62890625, "learning_rate": 1.964591205819343e-05, "loss": 0.5026, "mean_token_accuracy": 0.7761050814390182, "num_tokens": 347228235.0, "step": 11450 }, { "entropy": 0.4523302459716797, "epoch": 2.1459227467811157, "grad_norm": 0.97265625, "learning_rate": 1.9568607830753818e-05, "loss": 0.5175, "mean_token_accuracy": 0.777667219042778, "num_tokens": 348730988.0, "step": 11500 }, { "entropy": 0.44508363455533984, "epoch": 2.155252845680164, "grad_norm": 1.3984375, "learning_rate": 1.9491197355491355e-05, "loss": 0.5016, "mean_token_accuracy": 0.7767183601856231, "num_tokens": 350265615.0, "step": 11550 }, { "entropy": 0.44807049065828325, "epoch": 2.1645829445792124, "grad_norm": 1.4921875, "learning_rate": 1.941368331267506e-05, "loss": 0.5179, "mean_token_accuracy": 0.7722341948747635, "num_tokens": 351794521.0, "step": 11600 }, { "entropy": 0.44992916941642763, "epoch": 2.1739130434782608, "grad_norm": 0.62109375, "learning_rate": 1.9336068386159866e-05, "loss": 0.5013, "mean_token_accuracy": 0.7756163114309311, "num_tokens": 353325027.0, "step": 11650 }, { "entropy": 0.4497057408094406, "epoch": 2.183243142377309, "grad_norm": 0.486328125, "learning_rate": 1.9258355263293722e-05, "loss": 0.5101, "mean_token_accuracy": 0.7762971234321594, "num_tokens": 354798870.0, "step": 11700 }, { "entropy": 0.4416424559056759, "epoch": 2.1925732412763574, "grad_norm": 0.83984375, "learning_rate": 1.9180546634824542e-05, "loss": 0.4978, "mean_token_accuracy": 0.780478093624115, "num_tokens": 356285178.0, "step": 11750 }, { "entropy": 0.4536583548784256, "epoch": 2.201903340175406, "grad_norm": 1.1328125, "learning_rate": 1.910264519480704e-05, "loss": 0.5081, "mean_token_accuracy": 0.7730959630012513, "num_tokens": 357853469.0, "step": 11800 }, { "entropy": 0.432369647026062, "epoch": 2.211233439074454, "grad_norm": 1.5703125, "learning_rate": 1.902465364050943e-05, "loss": 0.4962, "mean_token_accuracy": 0.7790928614139557, "num_tokens": 359347327.0, "step": 11850 }, { "entropy": 0.4379693388938904, "epoch": 2.2205635379735025, "grad_norm": 0.5078125, "learning_rate": 1.894657467232007e-05, "loss": 0.4922, "mean_token_accuracy": 0.7815437364578247, "num_tokens": 360883930.0, "step": 11900 }, { "entropy": 0.4471505701541901, "epoch": 2.229893636872551, "grad_norm": 0.75390625, "learning_rate": 1.8868410993653945e-05, "loss": 0.5073, "mean_token_accuracy": 0.776453862786293, "num_tokens": 362395340.0, "step": 11950 }, { "entropy": 0.45013984248042105, "epoch": 2.239223735771599, "grad_norm": 0.65234375, "learning_rate": 1.879016531085905e-05, "loss": 0.5059, "mean_token_accuracy": 0.7744881427288055, "num_tokens": 363975143.0, "step": 12000 }, { "epoch": 2.239223735771599, "eval_entropy": 0.45516312778542894, "eval_loss": 0.5110519528388977, "eval_mean_token_accuracy": 0.7761742434370409, "eval_num_tokens": 363975143.0, "eval_runtime": 16.1613, "eval_samples_per_second": 53.647, "eval_steps_per_second": 6.744, "step": 12000 }, { "entropy": 0.46731566220521925, "epoch": 2.2485538346706475, "grad_norm": 0.95703125, "learning_rate": 1.871184033312272e-05, "loss": 0.5183, "mean_token_accuracy": 0.7688807338476181, "num_tokens": 365608309.0, "step": 12050 }, { "entropy": 0.43275348499417304, "epoch": 2.257883933569696, "grad_norm": 0.609375, "learning_rate": 1.86334387723778e-05, "loss": 0.5003, "mean_token_accuracy": 0.7791133409738541, "num_tokens": 367079142.0, "step": 12100 }, { "entropy": 0.4450415739417076, "epoch": 2.267214032468744, "grad_norm": 0.8046875, "learning_rate": 1.8554963343208748e-05, "loss": 0.508, "mean_token_accuracy": 0.7741218858957291, "num_tokens": 368613238.0, "step": 12150 }, { "entropy": 0.4465838612616062, "epoch": 2.2765441313677925, "grad_norm": 0.578125, "learning_rate": 1.8476416762757647e-05, "loss": 0.5025, "mean_token_accuracy": 0.7775521212816239, "num_tokens": 370170401.0, "step": 12200 }, { "entropy": 0.44225269854068755, "epoch": 2.285874230266841, "grad_norm": 0.57421875, "learning_rate": 1.8397801750630147e-05, "loss": 0.496, "mean_token_accuracy": 0.7806216114759446, "num_tokens": 371657034.0, "step": 12250 }, { "entropy": 0.4700926415622234, "epoch": 2.2952043291658892, "grad_norm": 0.55078125, "learning_rate": 1.831912102880127e-05, "loss": 0.5264, "mean_token_accuracy": 0.7698476028442383, "num_tokens": 373254414.0, "step": 12300 }, { "entropy": 0.4466784715652466, "epoch": 2.3045344280649376, "grad_norm": 1.1953125, "learning_rate": 1.8240377321521187e-05, "loss": 0.5059, "mean_token_accuracy": 0.775156998038292, "num_tokens": 374801131.0, "step": 12350 }, { "entropy": 0.4366114428639412, "epoch": 2.313864526963986, "grad_norm": 0.43359375, "learning_rate": 1.816157335522088e-05, "loss": 0.4963, "mean_token_accuracy": 0.7822502106428146, "num_tokens": 376328664.0, "step": 12400 }, { "entropy": 0.4395353105664253, "epoch": 2.3231946258630343, "grad_norm": 2.25, "learning_rate": 1.808271185841774e-05, "loss": 0.499, "mean_token_accuracy": 0.7797509133815765, "num_tokens": 377858310.0, "step": 12450 }, { "entropy": 0.42968779906630516, "epoch": 2.3325247247620826, "grad_norm": 1.1015625, "learning_rate": 1.8003795561621118e-05, "loss": 0.4905, "mean_token_accuracy": 0.7836663633584976, "num_tokens": 379330727.0, "step": 12500 }, { "entropy": 0.4358790573477745, "epoch": 2.341854823661131, "grad_norm": 0.95703125, "learning_rate": 1.792482719723774e-05, "loss": 0.4952, "mean_token_accuracy": 0.7849816447496414, "num_tokens": 380789154.0, "step": 12550 }, { "entropy": 0.4382615853846073, "epoch": 2.3511849225601793, "grad_norm": 0.71484375, "learning_rate": 1.7845809499477147e-05, "loss": 0.5003, "mean_token_accuracy": 0.777313020825386, "num_tokens": 382304834.0, "step": 12600 }, { "entropy": 0.4590240094065666, "epoch": 2.3605150214592276, "grad_norm": 0.7109375, "learning_rate": 1.7766745204257005e-05, "loss": 0.5087, "mean_token_accuracy": 0.7742151153087616, "num_tokens": 383864187.0, "step": 12650 }, { "entropy": 0.44439409762620924, "epoch": 2.369845120358276, "grad_norm": 0.92578125, "learning_rate": 1.7687637049108356e-05, "loss": 0.5074, "mean_token_accuracy": 0.7769033217430115, "num_tokens": 385371218.0, "step": 12700 }, { "entropy": 0.45312762558460234, "epoch": 2.3791752192573243, "grad_norm": 1.078125, "learning_rate": 1.7608487773080876e-05, "loss": 0.5138, "mean_token_accuracy": 0.7725106239318847, "num_tokens": 386905152.0, "step": 12750 }, { "entropy": 0.4439894749224186, "epoch": 2.3885053181563727, "grad_norm": 0.62109375, "learning_rate": 1.7529300116648006e-05, "loss": 0.5065, "mean_token_accuracy": 0.777021074295044, "num_tokens": 388444369.0, "step": 12800 }, { "entropy": 0.42678302526474, "epoch": 2.397835417055421, "grad_norm": 1.5390625, "learning_rate": 1.745007682161206e-05, "loss": 0.4882, "mean_token_accuracy": 0.7850719147920608, "num_tokens": 389939944.0, "step": 12850 }, { "entropy": 0.43243088483810427, "epoch": 2.407165515954469, "grad_norm": 1.0625, "learning_rate": 1.7370820631009338e-05, "loss": 0.4964, "mean_token_accuracy": 0.7798051989078522, "num_tokens": 391456345.0, "step": 12900 }, { "entropy": 0.4408321186900139, "epoch": 2.4164956148535173, "grad_norm": 0.88671875, "learning_rate": 1.729153428901509e-05, "loss": 0.5048, "mean_token_accuracy": 0.7777973639965058, "num_tokens": 392972606.0, "step": 12950 }, { "entropy": 0.42640509456396103, "epoch": 2.4258257137525656, "grad_norm": 0.54296875, "learning_rate": 1.721222054084855e-05, "loss": 0.489, "mean_token_accuracy": 0.7861467552185059, "num_tokens": 394419739.0, "step": 13000 }, { "epoch": 2.4258257137525656, "eval_entropy": 0.45086843404201193, "eval_loss": 0.5095834136009216, "eval_mean_token_accuracy": 0.7765465198306862, "eval_num_tokens": 394419739.0, "eval_runtime": 16.3, "eval_samples_per_second": 53.19, "eval_steps_per_second": 6.687, "step": 13000 }, { "entropy": 0.4339776523411274, "epoch": 2.435155812651614, "grad_norm": 1.1171875, "learning_rate": 1.7132882132677856e-05, "loss": 0.4952, "mean_token_accuracy": 0.7817716175317764, "num_tokens": 395915283.0, "step": 13050 }, { "entropy": 0.43754623234272005, "epoch": 2.4444859115506623, "grad_norm": 0.447265625, "learning_rate": 1.7053521811524983e-05, "loss": 0.5022, "mean_token_accuracy": 0.7792576867341995, "num_tokens": 397406785.0, "step": 13100 }, { "entropy": 0.4319652807712555, "epoch": 2.4538160104497106, "grad_norm": 0.423828125, "learning_rate": 1.6974142325170614e-05, "loss": 0.4932, "mean_token_accuracy": 0.780916188955307, "num_tokens": 398889013.0, "step": 13150 }, { "entropy": 0.42943828999996186, "epoch": 2.463146109348759, "grad_norm": 1.703125, "learning_rate": 1.6894746422059023e-05, "loss": 0.4874, "mean_token_accuracy": 0.7860763943195344, "num_tokens": 400360522.0, "step": 13200 }, { "entropy": 0.4486216183006764, "epoch": 2.4724762082478073, "grad_norm": 0.55859375, "learning_rate": 1.6815336851202897e-05, "loss": 0.503, "mean_token_accuracy": 0.7766870594024659, "num_tokens": 401889271.0, "step": 13250 }, { "entropy": 0.4432876881957054, "epoch": 2.4818063071468557, "grad_norm": 0.73046875, "learning_rate": 1.6735916362088154e-05, "loss": 0.4969, "mean_token_accuracy": 0.7788010305166244, "num_tokens": 403427748.0, "step": 13300 }, { "entropy": 0.44121997892856596, "epoch": 2.491136406045904, "grad_norm": 1.0078125, "learning_rate": 1.6656487704578733e-05, "loss": 0.4999, "mean_token_accuracy": 0.7776626753807068, "num_tokens": 404954195.0, "step": 13350 }, { "entropy": 0.433184619396925, "epoch": 2.5004665049449524, "grad_norm": 0.91015625, "learning_rate": 1.6577053628821423e-05, "loss": 0.4912, "mean_token_accuracy": 0.7811095035076141, "num_tokens": 406493981.0, "step": 13400 }, { "entropy": 0.42312393710017204, "epoch": 2.5097966038440007, "grad_norm": 0.73046875, "learning_rate": 1.6497616885150602e-05, "loss": 0.4876, "mean_token_accuracy": 0.7851846623420715, "num_tokens": 407934358.0, "step": 13450 }, { "entropy": 0.4381246021389961, "epoch": 2.519126702743049, "grad_norm": 0.55078125, "learning_rate": 1.6418180223993015e-05, "loss": 0.4935, "mean_token_accuracy": 0.7794178009033204, "num_tokens": 409440914.0, "step": 13500 }, { "entropy": 0.44361667945981026, "epoch": 2.5284568016420974, "grad_norm": 0.69140625, "learning_rate": 1.6338746395772556e-05, "loss": 0.4969, "mean_token_accuracy": 0.7793952637910843, "num_tokens": 410989159.0, "step": 13550 }, { "entropy": 0.43151887714862824, "epoch": 2.5377869005411458, "grad_norm": 0.94140625, "learning_rate": 1.625931815081504e-05, "loss": 0.4952, "mean_token_accuracy": 0.7826283901929856, "num_tokens": 412472158.0, "step": 13600 }, { "entropy": 0.44044260889291764, "epoch": 2.547116999440194, "grad_norm": 0.71875, "learning_rate": 1.6179898239252952e-05, "loss": 0.4989, "mean_token_accuracy": 0.7796867018938065, "num_tokens": 414005272.0, "step": 13650 }, { "entropy": 0.43727659299969673, "epoch": 2.5564470983392424, "grad_norm": 0.5390625, "learning_rate": 1.6100489410930248e-05, "loss": 0.4983, "mean_token_accuracy": 0.7779977285861969, "num_tokens": 415515917.0, "step": 13700 }, { "entropy": 0.4339106129109859, "epoch": 2.565777197238291, "grad_norm": 1.21875, "learning_rate": 1.602109441530714e-05, "loss": 0.5021, "mean_token_accuracy": 0.7849010616540909, "num_tokens": 416950426.0, "step": 13750 }, { "entropy": 0.4218116353452206, "epoch": 2.575107296137339, "grad_norm": 0.82421875, "learning_rate": 1.5941716001364893e-05, "loss": 0.4868, "mean_token_accuracy": 0.7843046194314957, "num_tokens": 418436936.0, "step": 13800 }, { "entropy": 0.42178965732455254, "epoch": 2.5844373950363875, "grad_norm": 0.73046875, "learning_rate": 1.5862356917510624e-05, "loss": 0.4819, "mean_token_accuracy": 0.7881238484382629, "num_tokens": 419883666.0, "step": 13850 }, { "entropy": 0.43385469675064087, "epoch": 2.593767493935436, "grad_norm": 0.88671875, "learning_rate": 1.5783019911482207e-05, "loss": 0.4999, "mean_token_accuracy": 0.7784091866016388, "num_tokens": 421397686.0, "step": 13900 }, { "entropy": 0.4416196349263191, "epoch": 2.603097592834484, "grad_norm": 1.0078125, "learning_rate": 1.570370773025306e-05, "loss": 0.5006, "mean_token_accuracy": 0.7770800250768661, "num_tokens": 422952565.0, "step": 13950 }, { "entropy": 0.4428327572345734, "epoch": 2.6124276917335325, "grad_norm": 1.109375, "learning_rate": 1.5624423119937062e-05, "loss": 0.4991, "mean_token_accuracy": 0.7780466437339782, "num_tokens": 424490641.0, "step": 14000 }, { "epoch": 2.6124276917335325, "eval_entropy": 0.4452941595414363, "eval_loss": 0.507188618183136, "eval_mean_token_accuracy": 0.7770852725440209, "eval_num_tokens": 424490641.0, "eval_runtime": 16.1783, "eval_samples_per_second": 53.59, "eval_steps_per_second": 6.737, "step": 14000 }, { "entropy": 0.4200237849354744, "epoch": 2.621757790632581, "grad_norm": 0.48046875, "learning_rate": 1.554516882569349e-05, "loss": 0.4903, "mean_token_accuracy": 0.787417265176773, "num_tokens": 425904302.0, "step": 14050 }, { "entropy": 0.4459881857037544, "epoch": 2.631087889531629, "grad_norm": 0.96875, "learning_rate": 1.5465947591631947e-05, "loss": 0.5028, "mean_token_accuracy": 0.7770888382196426, "num_tokens": 427427333.0, "step": 14100 }, { "entropy": 0.44237791940569876, "epoch": 2.640417988430677, "grad_norm": 0.515625, "learning_rate": 1.5386762160717355e-05, "loss": 0.5098, "mean_token_accuracy": 0.779816085100174, "num_tokens": 428936850.0, "step": 14150 }, { "entropy": 0.4365511977672577, "epoch": 2.6497480873297254, "grad_norm": 0.6796875, "learning_rate": 1.5307615274674984e-05, "loss": 0.5003, "mean_token_accuracy": 0.7807082986831665, "num_tokens": 430411091.0, "step": 14200 }, { "entropy": 0.45521570563316344, "epoch": 2.659078186228774, "grad_norm": 1.0703125, "learning_rate": 1.522850967389552e-05, "loss": 0.5169, "mean_token_accuracy": 0.774719997048378, "num_tokens": 431926456.0, "step": 14250 }, { "entropy": 0.43436133086681367, "epoch": 2.668408285127822, "grad_norm": 1.0703125, "learning_rate": 1.5149448097340193e-05, "loss": 0.4954, "mean_token_accuracy": 0.7820281451940536, "num_tokens": 433433882.0, "step": 14300 }, { "entropy": 0.4265731783211231, "epoch": 2.6777383840268705, "grad_norm": 0.75, "learning_rate": 1.5070433282445917e-05, "loss": 0.4952, "mean_token_accuracy": 0.7826755654811859, "num_tokens": 434847537.0, "step": 14350 }, { "entropy": 0.4315530589222908, "epoch": 2.687068482925919, "grad_norm": 0.92578125, "learning_rate": 1.4991467965030544e-05, "loss": 0.5031, "mean_token_accuracy": 0.7789240056276321, "num_tokens": 436306220.0, "step": 14400 }, { "entropy": 0.4366298992931843, "epoch": 2.696398581824967, "grad_norm": 0.75390625, "learning_rate": 1.4912554879198106e-05, "loss": 0.4974, "mean_token_accuracy": 0.7800502121448517, "num_tokens": 437854187.0, "step": 14450 }, { "entropy": 0.429113384783268, "epoch": 2.7057286807240155, "grad_norm": 0.94921875, "learning_rate": 1.4833696757244162e-05, "loss": 0.496, "mean_token_accuracy": 0.7821574300527573, "num_tokens": 439322082.0, "step": 14500 }, { "entropy": 0.45050988361239436, "epoch": 2.715058779623064, "grad_norm": 0.765625, "learning_rate": 1.475489632956121e-05, "loss": 0.5181, "mean_token_accuracy": 0.7749482518434525, "num_tokens": 440931906.0, "step": 14550 }, { "entropy": 0.45400950565934184, "epoch": 2.724388878522112, "grad_norm": 0.77734375, "learning_rate": 1.4676156324544123e-05, "loss": 0.5091, "mean_token_accuracy": 0.7732315069437027, "num_tokens": 442503054.0, "step": 14600 }, { "entropy": 0.42937296599149705, "epoch": 2.7337189774211605, "grad_norm": 0.68359375, "learning_rate": 1.4597479468495688e-05, "loss": 0.4923, "mean_token_accuracy": 0.7824376839399337, "num_tokens": 443993311.0, "step": 14650 }, { "entropy": 0.44196070849895475, "epoch": 2.743049076320209, "grad_norm": 0.45703125, "learning_rate": 1.4518868485532235e-05, "loss": 0.5056, "mean_token_accuracy": 0.7760562229156495, "num_tokens": 445522017.0, "step": 14700 }, { "entropy": 0.44267056584358216, "epoch": 2.7523791752192572, "grad_norm": 1.0703125, "learning_rate": 1.4440326097489263e-05, "loss": 0.4978, "mean_token_accuracy": 0.7780000925064087, "num_tokens": 447063778.0, "step": 14750 }, { "entropy": 0.4377539825439453, "epoch": 2.7617092741183056, "grad_norm": 0.73828125, "learning_rate": 1.436185502382728e-05, "loss": 0.494, "mean_token_accuracy": 0.7799159944057464, "num_tokens": 448574576.0, "step": 14800 }, { "entropy": 0.43343299850821493, "epoch": 2.771039373017354, "grad_norm": 0.71484375, "learning_rate": 1.428345798153756e-05, "loss": 0.4936, "mean_token_accuracy": 0.7831123304367066, "num_tokens": 450066594.0, "step": 14850 }, { "entropy": 0.430605805516243, "epoch": 2.7803694719164023, "grad_norm": 0.3046875, "learning_rate": 1.4205137685048111e-05, "loss": 0.495, "mean_token_accuracy": 0.7821512734889984, "num_tokens": 451568798.0, "step": 14900 }, { "entropy": 0.43412218809127806, "epoch": 2.7896995708154506, "grad_norm": 1.4140625, "learning_rate": 1.4126896846129701e-05, "loss": 0.4975, "mean_token_accuracy": 0.7786238652467727, "num_tokens": 453062565.0, "step": 14950 }, { "entropy": 0.4382785783708096, "epoch": 2.799029669714499, "grad_norm": 0.828125, "learning_rate": 1.4048738173801939e-05, "loss": 0.4948, "mean_token_accuracy": 0.7823023611307144, "num_tokens": 454574611.0, "step": 15000 }, { "epoch": 2.799029669714499, "eval_entropy": 0.44517116043545785, "eval_loss": 0.5036894679069519, "eval_mean_token_accuracy": 0.7785478141329704, "eval_num_tokens": 454574611.0, "eval_runtime": 16.6862, "eval_samples_per_second": 51.959, "eval_steps_per_second": 6.532, "step": 15000 }, { "entropy": 0.42489848256111146, "epoch": 2.8083597686135473, "grad_norm": 1.109375, "learning_rate": 1.3970664374239483e-05, "loss": 0.495, "mean_token_accuracy": 0.7826015204191208, "num_tokens": 456088007.0, "step": 15050 }, { "entropy": 0.4351053491234779, "epoch": 2.8176898675125956, "grad_norm": 0.62890625, "learning_rate": 1.3892678150678369e-05, "loss": 0.4909, "mean_token_accuracy": 0.7825271499156952, "num_tokens": 457634148.0, "step": 15100 }, { "entropy": 0.4429400071501732, "epoch": 2.827019966411644, "grad_norm": 0.7265625, "learning_rate": 1.3814782203322367e-05, "loss": 0.5001, "mean_token_accuracy": 0.7786577945947647, "num_tokens": 459151611.0, "step": 15150 }, { "entropy": 0.4451659436523914, "epoch": 2.8363500653106923, "grad_norm": 0.30078125, "learning_rate": 1.3736979229249526e-05, "loss": 0.5041, "mean_token_accuracy": 0.7781134587526322, "num_tokens": 460711762.0, "step": 15200 }, { "entropy": 0.4428605942428112, "epoch": 2.8456801642097407, "grad_norm": 0.5625, "learning_rate": 1.3659271922318776e-05, "loss": 0.5088, "mean_token_accuracy": 0.7755980342626572, "num_tokens": 462239645.0, "step": 15250 }, { "entropy": 0.44321066468954085, "epoch": 2.855010263108789, "grad_norm": 1.90625, "learning_rate": 1.3581662973076661e-05, "loss": 0.5057, "mean_token_accuracy": 0.7742398703098297, "num_tokens": 463780352.0, "step": 15300 }, { "entropy": 0.43519594669342043, "epoch": 2.8643403620078374, "grad_norm": 0.8359375, "learning_rate": 1.3504155068664164e-05, "loss": 0.4946, "mean_token_accuracy": 0.7806341469287872, "num_tokens": 465306354.0, "step": 15350 }, { "entropy": 0.452918638586998, "epoch": 2.8736704609068857, "grad_norm": 0.625, "learning_rate": 1.3426750892723697e-05, "loss": 0.5097, "mean_token_accuracy": 0.7724657821655273, "num_tokens": 466885785.0, "step": 15400 }, { "entropy": 0.43585652247071266, "epoch": 2.883000559805934, "grad_norm": 0.57421875, "learning_rate": 1.3349453125306156e-05, "loss": 0.4973, "mean_token_accuracy": 0.7797726893424988, "num_tokens": 468383581.0, "step": 15450 }, { "entropy": 0.43165954887866975, "epoch": 2.8923306587049824, "grad_norm": 0.68359375, "learning_rate": 1.3272264442778136e-05, "loss": 0.4937, "mean_token_accuracy": 0.7796162796020508, "num_tokens": 469917430.0, "step": 15500 }, { "entropy": 0.4401167546212673, "epoch": 2.9016607576040307, "grad_norm": 0.66015625, "learning_rate": 1.319518751772927e-05, "loss": 0.5006, "mean_token_accuracy": 0.7786802816390991, "num_tokens": 471424114.0, "step": 15550 }, { "entropy": 0.45204014524817465, "epoch": 2.910990856503079, "grad_norm": 1.28125, "learning_rate": 1.3118225018879684e-05, "loss": 0.5071, "mean_token_accuracy": 0.7716954737901688, "num_tokens": 473004348.0, "step": 15600 }, { "entropy": 0.43205771446228025, "epoch": 2.9203209554021274, "grad_norm": 1.6484375, "learning_rate": 1.3041379610987594e-05, "loss": 0.4982, "mean_token_accuracy": 0.7792713183164597, "num_tokens": 474501801.0, "step": 15650 }, { "entropy": 0.43552364617586137, "epoch": 2.929651054301176, "grad_norm": 0.79296875, "learning_rate": 1.2964653954757057e-05, "loss": 0.4991, "mean_token_accuracy": 0.7788824599981308, "num_tokens": 476043852.0, "step": 15700 }, { "entropy": 0.44703464940190313, "epoch": 2.938981153200224, "grad_norm": 0.490234375, "learning_rate": 1.2888050706745822e-05, "loss": 0.5064, "mean_token_accuracy": 0.7750799888372422, "num_tokens": 477598151.0, "step": 15750 }, { "entropy": 0.43971230536699296, "epoch": 2.9483112520992725, "grad_norm": 0.67578125, "learning_rate": 1.2811572519273378e-05, "loss": 0.5041, "mean_token_accuracy": 0.7766543072462082, "num_tokens": 479148774.0, "step": 15800 }, { "entropy": 0.447616363465786, "epoch": 2.957641350998321, "grad_norm": 1.3046875, "learning_rate": 1.2735222040329087e-05, "loss": 0.5084, "mean_token_accuracy": 0.7735405403375626, "num_tokens": 480694879.0, "step": 15850 }, { "entropy": 0.4212371516227722, "epoch": 2.966971449897369, "grad_norm": 2.265625, "learning_rate": 1.2659001913480522e-05, "loss": 0.4861, "mean_token_accuracy": 0.785834304690361, "num_tokens": 482165709.0, "step": 15900 }, { "entropy": 0.41868353605270386, "epoch": 2.9763015487964175, "grad_norm": 1.234375, "learning_rate": 1.2582914777781937e-05, "loss": 0.4843, "mean_token_accuracy": 0.7870692014694214, "num_tokens": 483643611.0, "step": 15950 }, { "entropy": 0.44808956772089004, "epoch": 2.9856316476954654, "grad_norm": 0.65234375, "learning_rate": 1.2506963267682884e-05, "loss": 0.5052, "mean_token_accuracy": 0.7751398229598999, "num_tokens": 485205508.0, "step": 16000 }, { "epoch": 2.9856316476954654, "eval_entropy": 0.44267906945779784, "eval_loss": 0.501590371131897, "eval_mean_token_accuracy": 0.7786080602112166, "eval_num_tokens": 485205508.0, "eval_runtime": 16.2258, "eval_samples_per_second": 53.433, "eval_steps_per_second": 6.718, "step": 16000 }, { "entropy": 0.4390178045630455, "epoch": 2.9949617465945138, "grad_norm": 1.8515625, "learning_rate": 1.2431150012936982e-05, "loss": 0.5016, "mean_token_accuracy": 0.7765721970796585, "num_tokens": 486717739.0, "step": 16050 }, { "entropy": 0.43280943170189856, "epoch": 3.004291845493562, "grad_norm": 1.109375, "learning_rate": 1.2355477638510904e-05, "loss": 0.5016, "mean_token_accuracy": 0.7793767899274826, "num_tokens": 488220825.0, "step": 16100 }, { "entropy": 0.41857230544090274, "epoch": 3.0136219443926104, "grad_norm": 0.96875, "learning_rate": 1.2279948764493463e-05, "loss": 0.4857, "mean_token_accuracy": 0.7855171990394593, "num_tokens": 489714834.0, "step": 16150 }, { "entropy": 0.4343470679223537, "epoch": 3.022952043291659, "grad_norm": 0.57421875, "learning_rate": 1.220456600600488e-05, "loss": 0.4952, "mean_token_accuracy": 0.7775657117366791, "num_tokens": 491256234.0, "step": 16200 }, { "entropy": 0.42674369990825656, "epoch": 3.032282142190707, "grad_norm": 0.76171875, "learning_rate": 1.2129331973106275e-05, "loss": 0.4875, "mean_token_accuracy": 0.7840164464712143, "num_tokens": 492813353.0, "step": 16250 }, { "entropy": 0.42470823049545287, "epoch": 3.0416122410897555, "grad_norm": 1.078125, "learning_rate": 1.2054249270709271e-05, "loss": 0.4898, "mean_token_accuracy": 0.7800269144773483, "num_tokens": 494302175.0, "step": 16300 }, { "entropy": 0.42407613903284075, "epoch": 3.050942339988804, "grad_norm": 0.7109375, "learning_rate": 1.1979320498485797e-05, "loss": 0.4849, "mean_token_accuracy": 0.782460133433342, "num_tokens": 495800714.0, "step": 16350 }, { "entropy": 0.4180370423197746, "epoch": 3.060272438887852, "grad_norm": 1.109375, "learning_rate": 1.1904548250778101e-05, "loss": 0.4882, "mean_token_accuracy": 0.7806040924787522, "num_tokens": 497321561.0, "step": 16400 }, { "entropy": 0.42719221144914626, "epoch": 3.0696025377869005, "grad_norm": 0.66015625, "learning_rate": 1.1829935116508903e-05, "loss": 0.4905, "mean_token_accuracy": 0.7813178312778473, "num_tokens": 498854626.0, "step": 16450 }, { "entropy": 0.4178078393638134, "epoch": 3.078932636685949, "grad_norm": 0.76953125, "learning_rate": 1.175548367909175e-05, "loss": 0.485, "mean_token_accuracy": 0.783756075501442, "num_tokens": 500380642.0, "step": 16500 }, { "entropy": 0.4012910355627537, "epoch": 3.088262735584997, "grad_norm": 1.1328125, "learning_rate": 1.1681196516341603e-05, "loss": 0.4694, "mean_token_accuracy": 0.7900429528951645, "num_tokens": 501885081.0, "step": 16550 }, { "entropy": 0.41146292686462405, "epoch": 3.0975928344840455, "grad_norm": 0.46484375, "learning_rate": 1.1607076200385529e-05, "loss": 0.4784, "mean_token_accuracy": 0.7883473831415176, "num_tokens": 503332757.0, "step": 16600 }, { "entropy": 0.4057569517195225, "epoch": 3.106922933383094, "grad_norm": 0.326171875, "learning_rate": 1.1533125297573703e-05, "loss": 0.4788, "mean_token_accuracy": 0.788205589056015, "num_tokens": 504782228.0, "step": 16650 }, { "entropy": 0.41990657716989516, "epoch": 3.1162530322821422, "grad_norm": 0.57421875, "learning_rate": 1.1459346368390504e-05, "loss": 0.4849, "mean_token_accuracy": 0.782880739569664, "num_tokens": 506282342.0, "step": 16700 }, { "entropy": 0.4344457286596298, "epoch": 3.1255831311811906, "grad_norm": 0.412109375, "learning_rate": 1.1385741967365869e-05, "loss": 0.492, "mean_token_accuracy": 0.779918566942215, "num_tokens": 507797216.0, "step": 16750 }, { "entropy": 0.425162510573864, "epoch": 3.134913230080239, "grad_norm": 0.6640625, "learning_rate": 1.131231464298687e-05, "loss": 0.4889, "mean_token_accuracy": 0.7809261924028397, "num_tokens": 509307379.0, "step": 16800 }, { "entropy": 0.4163677006959915, "epoch": 3.1442433289792873, "grad_norm": 0.96484375, "learning_rate": 1.1239066937609447e-05, "loss": 0.4903, "mean_token_accuracy": 0.7836281234025955, "num_tokens": 510827157.0, "step": 16850 }, { "entropy": 0.41290561139583587, "epoch": 3.1535734278783356, "grad_norm": 0.2890625, "learning_rate": 1.1166001387370388e-05, "loss": 0.4915, "mean_token_accuracy": 0.7884875816106797, "num_tokens": 512261808.0, "step": 16900 }, { "entropy": 0.41839638456702233, "epoch": 3.162903526777384, "grad_norm": 0.48046875, "learning_rate": 1.1093120522099535e-05, "loss": 0.4855, "mean_token_accuracy": 0.7832108342647552, "num_tokens": 513784693.0, "step": 16950 }, { "entropy": 0.4217331621050835, "epoch": 3.1722336256764323, "grad_norm": 1.0234375, "learning_rate": 1.1020426865232167e-05, "loss": 0.4965, "mean_token_accuracy": 0.7817919147014618, "num_tokens": 515320766.0, "step": 17000 }, { "epoch": 3.1722336256764323, "eval_entropy": 0.4328604711031695, "eval_loss": 0.501544177532196, "eval_mean_token_accuracy": 0.7793953303897053, "eval_num_tokens": 515320766.0, "eval_runtime": 16.0856, "eval_samples_per_second": 53.899, "eval_steps_per_second": 6.776, "step": 17000 }, { "entropy": 0.4205896918475628, "epoch": 3.1815637245754806, "grad_norm": 1.140625, "learning_rate": 1.0947922933721634e-05, "loss": 0.4908, "mean_token_accuracy": 0.7806319332122803, "num_tokens": 516850357.0, "step": 17050 }, { "entropy": 0.4369584143161774, "epoch": 3.190893823474529, "grad_norm": 0.6796875, "learning_rate": 1.0875611237952227e-05, "loss": 0.5028, "mean_token_accuracy": 0.7777555876970291, "num_tokens": 518426223.0, "step": 17100 }, { "entropy": 0.42318257100880147, "epoch": 3.2002239223735773, "grad_norm": 0.578125, "learning_rate": 1.0803494281652234e-05, "loss": 0.4893, "mean_token_accuracy": 0.7821793848276138, "num_tokens": 519954467.0, "step": 17150 }, { "entropy": 0.43405372768640516, "epoch": 3.2095540212726257, "grad_norm": 1.1484375, "learning_rate": 1.0731574561807265e-05, "loss": 0.4994, "mean_token_accuracy": 0.7766797173023224, "num_tokens": 521503586.0, "step": 17200 }, { "entropy": 0.42191226355731487, "epoch": 3.218884120171674, "grad_norm": 0.66796875, "learning_rate": 1.0659854568573787e-05, "loss": 0.4846, "mean_token_accuracy": 0.7843712168931961, "num_tokens": 523039085.0, "step": 17250 }, { "entropy": 0.4242657233774662, "epoch": 3.228214219070722, "grad_norm": 0.47265625, "learning_rate": 1.058833678519293e-05, "loss": 0.4901, "mean_token_accuracy": 0.7792167681455612, "num_tokens": 524604685.0, "step": 17300 }, { "entropy": 0.40574381925165653, "epoch": 3.2375443179697703, "grad_norm": 0.76171875, "learning_rate": 1.051702368790447e-05, "loss": 0.4739, "mean_token_accuracy": 0.789325880408287, "num_tokens": 526100115.0, "step": 17350 }, { "entropy": 0.41887831434607503, "epoch": 3.2468744168688186, "grad_norm": 1.2265625, "learning_rate": 1.0445917745861102e-05, "loss": 0.4818, "mean_token_accuracy": 0.7836150753498078, "num_tokens": 527595996.0, "step": 17400 }, { "entropy": 0.40637139081954954, "epoch": 3.256204515767867, "grad_norm": 0.6015625, "learning_rate": 1.0375021421042974e-05, "loss": 0.4733, "mean_token_accuracy": 0.7896361410617828, "num_tokens": 529079715.0, "step": 17450 }, { "entropy": 0.4036496952176094, "epoch": 3.2655346146669153, "grad_norm": 0.298828125, "learning_rate": 1.030433716817241e-05, "loss": 0.4819, "mean_token_accuracy": 0.7864921605587005, "num_tokens": 530573725.0, "step": 17500 }, { "entropy": 0.41815487191081047, "epoch": 3.2748647135659636, "grad_norm": 0.90234375, "learning_rate": 1.0233867434628938e-05, "loss": 0.4813, "mean_token_accuracy": 0.7873369532823563, "num_tokens": 532067163.0, "step": 17550 }, { "entropy": 0.4044397334754467, "epoch": 3.284194812465012, "grad_norm": 1.2265625, "learning_rate": 1.0163614660364547e-05, "loss": 0.4802, "mean_token_accuracy": 0.7873817622661591, "num_tokens": 533512972.0, "step": 17600 }, { "entropy": 0.4253586496412754, "epoch": 3.2935249113640603, "grad_norm": 0.494140625, "learning_rate": 1.0093581277819186e-05, "loss": 0.4906, "mean_token_accuracy": 0.7798972427845001, "num_tokens": 535070119.0, "step": 17650 }, { "entropy": 0.41822995960712434, "epoch": 3.3028550102631087, "grad_norm": 0.8046875, "learning_rate": 1.0023769711836586e-05, "loss": 0.4878, "mean_token_accuracy": 0.7849278378486634, "num_tokens": 536578335.0, "step": 17700 }, { "entropy": 0.4179137668013573, "epoch": 3.312185109162157, "grad_norm": 0.953125, "learning_rate": 9.95418237958026e-06, "loss": 0.4858, "mean_token_accuracy": 0.7825618571043015, "num_tokens": 538111995.0, "step": 17750 }, { "entropy": 0.4254558201134205, "epoch": 3.3215152080612054, "grad_norm": 0.5625, "learning_rate": 9.88482169044983e-06, "loss": 0.488, "mean_token_accuracy": 0.7804460400342941, "num_tokens": 539663342.0, "step": 17800 }, { "entropy": 0.4253497390449047, "epoch": 3.3308453069602537, "grad_norm": 1.15625, "learning_rate": 9.815690045997598e-06, "loss": 0.4877, "mean_token_accuracy": 0.7805328375101089, "num_tokens": 541217582.0, "step": 17850 }, { "entropy": 0.41277687311172484, "epoch": 3.340175405859302, "grad_norm": 1.0625, "learning_rate": 9.746789839845406e-06, "loss": 0.4874, "mean_token_accuracy": 0.7856322342157364, "num_tokens": 542694785.0, "step": 17900 }, { "entropy": 0.4202821546792984, "epoch": 3.3495055047583504, "grad_norm": 1.2265625, "learning_rate": 9.67812345760174e-06, "loss": 0.4955, "mean_token_accuracy": 0.7834526008367538, "num_tokens": 544234574.0, "step": 17950 }, { "entropy": 0.41871184706687925, "epoch": 3.3588356036573987, "grad_norm": 0.50390625, "learning_rate": 9.609693276779152e-06, "loss": 0.4849, "mean_token_accuracy": 0.7832159209251404, "num_tokens": 545775120.0, "step": 18000 }, { "epoch": 3.3588356036573987, "eval_entropy": 0.4325732376870759, "eval_loss": 0.5006441473960876, "eval_mean_token_accuracy": 0.7796490897826098, "eval_num_tokens": 545775120.0, "eval_runtime": 16.1616, "eval_samples_per_second": 53.646, "eval_steps_per_second": 6.744, "step": 18000 }, { "entropy": 0.42268205478787424, "epoch": 3.368165702556447, "grad_norm": 0.77734375, "learning_rate": 9.541501666711921e-06, "loss": 0.4778, "mean_token_accuracy": 0.7864983838796615, "num_tokens": 547311884.0, "step": 18050 }, { "entropy": 0.42853294894099236, "epoch": 3.3774958014554954, "grad_norm": 0.53515625, "learning_rate": 9.473550988474026e-06, "loss": 0.4904, "mean_token_accuracy": 0.7811102610826492, "num_tokens": 548872868.0, "step": 18100 }, { "entropy": 0.43325465768575666, "epoch": 3.386825900354544, "grad_norm": 0.890625, "learning_rate": 9.4058435947974e-06, "loss": 0.4935, "mean_token_accuracy": 0.7787714445590973, "num_tokens": 550487269.0, "step": 18150 }, { "entropy": 0.42910467088222504, "epoch": 3.396155999253592, "grad_norm": 0.87109375, "learning_rate": 9.338381829990456e-06, "loss": 0.4903, "mean_token_accuracy": 0.7796713817119598, "num_tokens": 552030450.0, "step": 18200 }, { "entropy": 0.41868083611130713, "epoch": 3.4054860981526405, "grad_norm": 0.388671875, "learning_rate": 9.271168029856928e-06, "loss": 0.4823, "mean_token_accuracy": 0.7847259587049484, "num_tokens": 553519745.0, "step": 18250 }, { "entropy": 0.41218777537345885, "epoch": 3.414816197051689, "grad_norm": 1.203125, "learning_rate": 9.204204521615007e-06, "loss": 0.4803, "mean_token_accuracy": 0.7875041741132737, "num_tokens": 555001060.0, "step": 18300 }, { "entropy": 0.4341864985227585, "epoch": 3.424146295950737, "grad_norm": 0.63671875, "learning_rate": 9.13749362381673e-06, "loss": 0.4935, "mean_token_accuracy": 0.7791831189393997, "num_tokens": 556547827.0, "step": 18350 }, { "entropy": 0.4298609687387943, "epoch": 3.4334763948497855, "grad_norm": 0.39453125, "learning_rate": 9.07103764626773e-06, "loss": 0.49, "mean_token_accuracy": 0.7830371624231338, "num_tokens": 558106220.0, "step": 18400 }, { "entropy": 0.4168567133694887, "epoch": 3.442806493748834, "grad_norm": 0.7265625, "learning_rate": 9.00483888994725e-06, "loss": 0.4787, "mean_token_accuracy": 0.7888282573223114, "num_tokens": 559579247.0, "step": 18450 }, { "entropy": 0.4177075420320034, "epoch": 3.452136592647882, "grad_norm": 0.5078125, "learning_rate": 8.938899646928482e-06, "loss": 0.4841, "mean_token_accuracy": 0.7841418528556824, "num_tokens": 561118173.0, "step": 18500 }, { "entropy": 0.42074175730347635, "epoch": 3.4614666915469305, "grad_norm": 1.390625, "learning_rate": 8.8732222002992e-06, "loss": 0.4943, "mean_token_accuracy": 0.7784902250766754, "num_tokens": 562667212.0, "step": 18550 }, { "entropy": 0.41027487240731714, "epoch": 3.470796790445979, "grad_norm": 0.80859375, "learning_rate": 8.807808824082699e-06, "loss": 0.479, "mean_token_accuracy": 0.7876066911220551, "num_tokens": 564131244.0, "step": 18600 }, { "entropy": 0.4196596726775169, "epoch": 3.4801268893450272, "grad_norm": 0.765625, "learning_rate": 8.742661783159075e-06, "loss": 0.4859, "mean_token_accuracy": 0.7828355920314789, "num_tokens": 565650541.0, "step": 18650 }, { "entropy": 0.42029051125049594, "epoch": 3.4894569882440756, "grad_norm": 0.89453125, "learning_rate": 8.677783333186817e-06, "loss": 0.4907, "mean_token_accuracy": 0.7820263600349426, "num_tokens": 567162445.0, "step": 18700 }, { "entropy": 0.42464843273162844, "epoch": 3.498787087143124, "grad_norm": 0.49609375, "learning_rate": 8.61317572052467e-06, "loss": 0.4888, "mean_token_accuracy": 0.7788948893547059, "num_tokens": 568688382.0, "step": 18750 }, { "entropy": 0.426156534999609, "epoch": 3.5081171860421723, "grad_norm": 0.78515625, "learning_rate": 8.548841182153889e-06, "loss": 0.4944, "mean_token_accuracy": 0.7789571231603623, "num_tokens": 570273693.0, "step": 18800 }, { "entropy": 0.4215981301665306, "epoch": 3.5174472849412206, "grad_norm": 0.88671875, "learning_rate": 8.484781945600765e-06, "loss": 0.4901, "mean_token_accuracy": 0.781245459318161, "num_tokens": 571787716.0, "step": 18850 }, { "entropy": 0.4165229081362486, "epoch": 3.526777383840269, "grad_norm": 0.7265625, "learning_rate": 8.421000228859513e-06, "loss": 0.483, "mean_token_accuracy": 0.7865099585056305, "num_tokens": 573264670.0, "step": 18900 }, { "entropy": 0.4245117911696434, "epoch": 3.5361074827393173, "grad_norm": 0.486328125, "learning_rate": 8.35749824031547e-06, "loss": 0.4938, "mean_token_accuracy": 0.778894921541214, "num_tokens": 574817675.0, "step": 18950 }, { "entropy": 0.417805362790823, "epoch": 3.5454375816383656, "grad_norm": 0.423828125, "learning_rate": 8.294278178668643e-06, "loss": 0.4872, "mean_token_accuracy": 0.7830862325429916, "num_tokens": 576331491.0, "step": 19000 }, { "epoch": 3.5454375816383656, "eval_entropy": 0.4310347127531647, "eval_loss": 0.499735027551651, "eval_mean_token_accuracy": 0.779860559953462, "eval_num_tokens": 576331491.0, "eval_runtime": 16.1624, "eval_samples_per_second": 53.643, "eval_steps_per_second": 6.744, "step": 19000 }, { "entropy": 0.4081883782148361, "epoch": 3.5547676805374135, "grad_norm": 0.9375, "learning_rate": 8.231342232857553e-06, "loss": 0.4705, "mean_token_accuracy": 0.7903977036476135, "num_tokens": 577775016.0, "step": 19050 }, { "entropy": 0.4005671316385269, "epoch": 3.564097779436462, "grad_norm": 0.875, "learning_rate": 8.16869258198347e-06, "loss": 0.4737, "mean_token_accuracy": 0.7880425137281418, "num_tokens": 579207311.0, "step": 19100 }, { "entropy": 0.41074585855007173, "epoch": 3.5734278783355102, "grad_norm": 0.87109375, "learning_rate": 8.106331395234957e-06, "loss": 0.4797, "mean_token_accuracy": 0.786340873837471, "num_tokens": 580709130.0, "step": 19150 }, { "entropy": 0.4269941046833992, "epoch": 3.5827579772345586, "grad_norm": 0.5234375, "learning_rate": 8.044260831812762e-06, "loss": 0.4965, "mean_token_accuracy": 0.7799100142717361, "num_tokens": 582263480.0, "step": 19200 }, { "entropy": 0.40871032655239103, "epoch": 3.592088076133607, "grad_norm": 0.89453125, "learning_rate": 7.982483040855052e-06, "loss": 0.4885, "mean_token_accuracy": 0.7831970340013504, "num_tokens": 583753874.0, "step": 19250 }, { "entropy": 0.41553946167230604, "epoch": 3.6014181750326553, "grad_norm": 1.1875, "learning_rate": 7.921000161363023e-06, "loss": 0.4892, "mean_token_accuracy": 0.7814596974849701, "num_tokens": 585269107.0, "step": 19300 }, { "entropy": 0.4114553180336952, "epoch": 3.6107482739317036, "grad_norm": 0.73828125, "learning_rate": 7.859814322126803e-06, "loss": 0.5044, "mean_token_accuracy": 0.7843243205547332, "num_tokens": 586764493.0, "step": 19350 }, { "entropy": 0.4229819716513157, "epoch": 3.620078372830752, "grad_norm": 0.5, "learning_rate": 7.798927641651787e-06, "loss": 0.493, "mean_token_accuracy": 0.7810362190008163, "num_tokens": 588285178.0, "step": 19400 }, { "entropy": 0.43099318161606787, "epoch": 3.6294084717298003, "grad_norm": 0.466796875, "learning_rate": 7.738342228085244e-06, "loss": 0.4921, "mean_token_accuracy": 0.7791334927082062, "num_tokens": 589860202.0, "step": 19450 }, { "entropy": 0.4297554486989975, "epoch": 3.6387385706288486, "grad_norm": 0.6796875, "learning_rate": 7.678060179143354e-06, "loss": 0.4965, "mean_token_accuracy": 0.7779843896627426, "num_tokens": 591386894.0, "step": 19500 }, { "entropy": 0.4341718791425228, "epoch": 3.648068669527897, "grad_norm": 1.1171875, "learning_rate": 7.618083582038559e-06, "loss": 0.4973, "mean_token_accuracy": 0.7771023726463318, "num_tokens": 592966640.0, "step": 19550 }, { "entropy": 0.43129597157239913, "epoch": 3.6573987684269453, "grad_norm": 0.45703125, "learning_rate": 7.558414513407309e-06, "loss": 0.4966, "mean_token_accuracy": 0.7794705367088318, "num_tokens": 594551411.0, "step": 19600 }, { "entropy": 0.416672650128603, "epoch": 3.6667288673259937, "grad_norm": 0.5390625, "learning_rate": 7.499055039238146e-06, "loss": 0.4847, "mean_token_accuracy": 0.7843469917774201, "num_tokens": 596025749.0, "step": 19650 }, { "entropy": 0.4332768340408802, "epoch": 3.676058966225042, "grad_norm": 0.65625, "learning_rate": 7.4400072148001895e-06, "loss": 0.4897, "mean_token_accuracy": 0.7826856952905655, "num_tokens": 597572029.0, "step": 19700 }, { "entropy": 0.4150558638572693, "epoch": 3.6853890651240904, "grad_norm": 0.5, "learning_rate": 7.381273084571959e-06, "loss": 0.4844, "mean_token_accuracy": 0.785199624300003, "num_tokens": 599047819.0, "step": 19750 }, { "entropy": 0.4170726762712002, "epoch": 3.6947191640231387, "grad_norm": 0.7890625, "learning_rate": 7.322854682170584e-06, "loss": 0.4918, "mean_token_accuracy": 0.7805746030807496, "num_tokens": 600528500.0, "step": 19800 }, { "entropy": 0.43822780847549436, "epoch": 3.704049262922187, "grad_norm": 0.77734375, "learning_rate": 7.264754030281405e-06, "loss": 0.4994, "mean_token_accuracy": 0.7803661721944809, "num_tokens": 602042275.0, "step": 19850 }, { "entropy": 0.41316955953836443, "epoch": 3.7133793618212354, "grad_norm": 0.78515625, "learning_rate": 7.2069731405879325e-06, "loss": 0.4822, "mean_token_accuracy": 0.7859818071126938, "num_tokens": 603534613.0, "step": 19900 }, { "entropy": 0.43403887152671816, "epoch": 3.7227094607202837, "grad_norm": 0.48046875, "learning_rate": 7.149514013702186e-06, "loss": 0.499, "mean_token_accuracy": 0.7757015681266785, "num_tokens": 605111735.0, "step": 19950 }, { "entropy": 0.41590030148625373, "epoch": 3.732039559619332, "grad_norm": 0.57421875, "learning_rate": 7.092378639095451e-06, "loss": 0.4829, "mean_token_accuracy": 0.7830818378925324, "num_tokens": 606616332.0, "step": 20000 }, { "epoch": 3.732039559619332, "eval_entropy": 0.42986192684107966, "eval_loss": 0.4989897310733795, "eval_mean_token_accuracy": 0.779869570097792, "eval_num_tokens": 606616332.0, "eval_runtime": 16.2831, "eval_samples_per_second": 53.246, "eval_steps_per_second": 6.694, "step": 20000 }, { "entropy": 0.41359525367617606, "epoch": 3.7413696585183804, "grad_norm": 0.5078125, "learning_rate": 7.0355689950293636e-06, "loss": 0.4858, "mean_token_accuracy": 0.7825395846366883, "num_tokens": 608094829.0, "step": 20050 }, { "entropy": 0.4180074107646942, "epoch": 3.750699757417429, "grad_norm": 0.5390625, "learning_rate": 6.979087048487432e-06, "loss": 0.4865, "mean_token_accuracy": 0.7824456262588501, "num_tokens": 609631728.0, "step": 20100 }, { "entropy": 0.4242085382342339, "epoch": 3.7600298563164767, "grad_norm": 0.6015625, "learning_rate": 6.922934755106929e-06, "loss": 0.4895, "mean_token_accuracy": 0.7821110928058624, "num_tokens": 611154863.0, "step": 20150 }, { "entropy": 0.4358582437038422, "epoch": 3.769359955215525, "grad_norm": 0.66796875, "learning_rate": 6.867114059111178e-06, "loss": 0.4957, "mean_token_accuracy": 0.7778018289804458, "num_tokens": 612699583.0, "step": 20200 }, { "entropy": 0.4160428442060947, "epoch": 3.7786900541145734, "grad_norm": 0.71484375, "learning_rate": 6.81162689324224e-06, "loss": 0.4816, "mean_token_accuracy": 0.7859532070159913, "num_tokens": 614179127.0, "step": 20250 }, { "entropy": 0.42236584216356277, "epoch": 3.7880201530136217, "grad_norm": 0.404296875, "learning_rate": 6.756475178693988e-06, "loss": 0.497, "mean_token_accuracy": 0.77904121696949, "num_tokens": 615681957.0, "step": 20300 }, { "entropy": 0.4261379070580006, "epoch": 3.79735025191267, "grad_norm": 0.33984375, "learning_rate": 6.701660825045599e-06, "loss": 0.491, "mean_token_accuracy": 0.7808138716220856, "num_tokens": 617206693.0, "step": 20350 }, { "entropy": 0.41188708037137983, "epoch": 3.8066803508117184, "grad_norm": 0.443359375, "learning_rate": 6.64718573019542e-06, "loss": 0.482, "mean_token_accuracy": 0.7834567302465438, "num_tokens": 618685882.0, "step": 20400 }, { "entropy": 0.42921313650906084, "epoch": 3.8160104497107667, "grad_norm": 0.326171875, "learning_rate": 6.593051780295262e-06, "loss": 0.4886, "mean_token_accuracy": 0.7812140667438507, "num_tokens": 620242613.0, "step": 20450 }, { "entropy": 0.4060940612852573, "epoch": 3.825340548609815, "grad_norm": 0.80859375, "learning_rate": 6.5392608496851006e-06, "loss": 0.4804, "mean_token_accuracy": 0.7881927186250687, "num_tokens": 621695607.0, "step": 20500 }, { "entropy": 0.41289987429976466, "epoch": 3.8346706475088634, "grad_norm": 0.4921875, "learning_rate": 6.48581480082817e-06, "loss": 0.4814, "mean_token_accuracy": 0.7860061007738114, "num_tokens": 623202780.0, "step": 20550 }, { "entropy": 0.4154231162369251, "epoch": 3.844000746407912, "grad_norm": 0.81640625, "learning_rate": 6.432715484246474e-06, "loss": 0.4822, "mean_token_accuracy": 0.7877626097202302, "num_tokens": 624719320.0, "step": 20600 }, { "entropy": 0.43937826111912726, "epoch": 3.85333084530696, "grad_norm": 0.54296875, "learning_rate": 6.379964738456737e-06, "loss": 0.5081, "mean_token_accuracy": 0.7722378653287888, "num_tokens": 626304789.0, "step": 20650 }, { "entropy": 0.4095226752758026, "epoch": 3.8626609442060085, "grad_norm": 0.431640625, "learning_rate": 6.3275643899067095e-06, "loss": 0.4808, "mean_token_accuracy": 0.784711457490921, "num_tokens": 627775886.0, "step": 20700 }, { "entropy": 0.42841140910983083, "epoch": 3.871991043105057, "grad_norm": 0.7265625, "learning_rate": 6.275516252911957e-06, "loss": 0.495, "mean_token_accuracy": 0.7787529402971267, "num_tokens": 629320324.0, "step": 20750 }, { "entropy": 0.4122103577852249, "epoch": 3.881321142004105, "grad_norm": 0.8125, "learning_rate": 6.223822129593035e-06, "loss": 0.4872, "mean_token_accuracy": 0.7841408705711365, "num_tokens": 630822612.0, "step": 20800 }, { "entropy": 0.42884288884699345, "epoch": 3.8906512409031535, "grad_norm": 0.97265625, "learning_rate": 6.172483809813082e-06, "loss": 0.4946, "mean_token_accuracy": 0.7817574542760849, "num_tokens": 632330762.0, "step": 20850 }, { "entropy": 0.42925117403268814, "epoch": 3.899981339802202, "grad_norm": 0.53125, "learning_rate": 6.121503071115863e-06, "loss": 0.4968, "mean_token_accuracy": 0.7788661700487137, "num_tokens": 633862126.0, "step": 20900 }, { "entropy": 0.4173249228298664, "epoch": 3.90931143870125, "grad_norm": 0.48828125, "learning_rate": 6.0708816786642055e-06, "loss": 0.4896, "mean_token_accuracy": 0.7805528366565704, "num_tokens": 635369892.0, "step": 20950 }, { "entropy": 0.42858991749584674, "epoch": 3.9186415376002985, "grad_norm": 0.89453125, "learning_rate": 6.0206213851789065e-06, "loss": 0.4922, "mean_token_accuracy": 0.7816365206241608, "num_tokens": 636904691.0, "step": 21000 }, { "epoch": 3.9186415376002985, "eval_entropy": 0.4298953804947914, "eval_loss": 0.4981645345687866, "eval_mean_token_accuracy": 0.7802923715442692, "eval_num_tokens": 636904691.0, "eval_runtime": 16.1855, "eval_samples_per_second": 53.566, "eval_steps_per_second": 6.734, "step": 21000 }, { "entropy": 0.42679292619228365, "epoch": 3.927971636499347, "grad_norm": 0.7109375, "learning_rate": 5.970723930878021e-06, "loss": 0.4965, "mean_token_accuracy": 0.7826879835128784, "num_tokens": 638381309.0, "step": 21050 }, { "entropy": 0.4261575947701931, "epoch": 3.9373017353983952, "grad_norm": 1.03125, "learning_rate": 5.921191043416619e-06, "loss": 0.4962, "mean_token_accuracy": 0.7802698415517807, "num_tokens": 639907942.0, "step": 21100 }, { "entropy": 0.3956790755689144, "epoch": 3.9466318342974436, "grad_norm": 0.7265625, "learning_rate": 5.87202443782697e-06, "loss": 0.4689, "mean_token_accuracy": 0.7924928772449493, "num_tokens": 641357830.0, "step": 21150 }, { "entropy": 0.4151885700970888, "epoch": 3.955961933196492, "grad_norm": 0.62890625, "learning_rate": 5.823225816459159e-06, "loss": 0.4835, "mean_token_accuracy": 0.784718370437622, "num_tokens": 642852716.0, "step": 21200 }, { "entropy": 0.4133622221648693, "epoch": 3.9652920320955403, "grad_norm": 0.56640625, "learning_rate": 5.774796868922148e-06, "loss": 0.4817, "mean_token_accuracy": 0.7855832195281982, "num_tokens": 644369211.0, "step": 21250 }, { "entropy": 0.4185038904845715, "epoch": 3.9746221309945886, "grad_norm": 0.6953125, "learning_rate": 5.726739272025258e-06, "loss": 0.4859, "mean_token_accuracy": 0.7824643701314926, "num_tokens": 645872051.0, "step": 21300 }, { "entropy": 0.4287873014807701, "epoch": 3.983952229893637, "grad_norm": 0.87890625, "learning_rate": 5.679054689720142e-06, "loss": 0.4919, "mean_token_accuracy": 0.7797924143075943, "num_tokens": 647435628.0, "step": 21350 }, { "entropy": 0.42188764035701753, "epoch": 3.9932823287926853, "grad_norm": 0.8828125, "learning_rate": 5.631744773043137e-06, "loss": 0.4953, "mean_token_accuracy": 0.7852949523925781, "num_tokens": 648930592.0, "step": 21400 }, { "entropy": 0.4236401343345642, "epoch": 4.002612427691734, "grad_norm": 0.412109375, "learning_rate": 5.584811160058123e-06, "loss": 0.487, "mean_token_accuracy": 0.7832311981916428, "num_tokens": 650447365.0, "step": 21450 }, { "entropy": 0.4323525831103325, "epoch": 4.011942526590782, "grad_norm": 0.396484375, "learning_rate": 5.5382554757998e-06, "loss": 0.4934, "mean_token_accuracy": 0.7795790702104568, "num_tokens": 651990713.0, "step": 21500 }, { "entropy": 0.39284430593252184, "epoch": 4.02127262548983, "grad_norm": 0.6171875, "learning_rate": 5.492079332217413e-06, "loss": 0.4657, "mean_token_accuracy": 0.7923216378688812, "num_tokens": 653445504.0, "step": 21550 }, { "entropy": 0.4254847614467144, "epoch": 4.030602724388879, "grad_norm": 0.58203125, "learning_rate": 5.446284328118956e-06, "loss": 0.4899, "mean_token_accuracy": 0.7845428907871246, "num_tokens": 654991638.0, "step": 21600 }, { "entropy": 0.41144783079624175, "epoch": 4.039932823287927, "grad_norm": 0.48828125, "learning_rate": 5.4008720491158105e-06, "loss": 0.4816, "mean_token_accuracy": 0.7855917030572891, "num_tokens": 656505470.0, "step": 21650 }, { "entropy": 0.41339424341917036, "epoch": 4.049262922186975, "grad_norm": 0.7890625, "learning_rate": 5.355844067567827e-06, "loss": 0.487, "mean_token_accuracy": 0.7834293109178543, "num_tokens": 658059888.0, "step": 21700 }, { "entropy": 0.43376033812761305, "epoch": 4.058593021086024, "grad_norm": 0.55859375, "learning_rate": 5.311201942528911e-06, "loss": 0.5004, "mean_token_accuracy": 0.7760592538118363, "num_tokens": 659625009.0, "step": 21750 }, { "entropy": 0.4149209675192833, "epoch": 4.067923119985072, "grad_norm": 0.67578125, "learning_rate": 5.266947219693018e-06, "loss": 0.4821, "mean_token_accuracy": 0.7843904966115951, "num_tokens": 661130951.0, "step": 21800 }, { "entropy": 0.40518364384770394, "epoch": 4.07725321888412, "grad_norm": 0.322265625, "learning_rate": 5.2230814313406564e-06, "loss": 0.4804, "mean_token_accuracy": 0.7845012962818145, "num_tokens": 662681099.0, "step": 21850 }, { "entropy": 0.4218353702127933, "epoch": 4.086583317783169, "grad_norm": 0.89453125, "learning_rate": 5.179606096285814e-06, "loss": 0.4867, "mean_token_accuracy": 0.7828029912710189, "num_tokens": 664238581.0, "step": 21900 }, { "entropy": 0.4128453577309847, "epoch": 4.095913416682217, "grad_norm": 0.9296875, "learning_rate": 5.136522719823388e-06, "loss": 0.4777, "mean_token_accuracy": 0.7887437117099761, "num_tokens": 665738717.0, "step": 21950 }, { "entropy": 0.4112414425611496, "epoch": 4.105243515581265, "grad_norm": 0.490234375, "learning_rate": 5.093832793677053e-06, "loss": 0.4858, "mean_token_accuracy": 0.7840606135129928, "num_tokens": 667221460.0, "step": 22000 }, { "epoch": 4.105243515581265, "eval_entropy": 0.4262016671239783, "eval_loss": 0.49918287992477417, "eval_mean_token_accuracy": 0.7801302873760189, "eval_num_tokens": 667221460.0, "eval_runtime": 16.1734, "eval_samples_per_second": 53.607, "eval_steps_per_second": 6.739, "step": 22000 }, { "entropy": 0.4187443208694458, "epoch": 4.114573614480314, "grad_norm": 1.09375, "learning_rate": 5.051537795947614e-06, "loss": 0.4833, "mean_token_accuracy": 0.783671562075615, "num_tokens": 668746696.0, "step": 22050 }, { "entropy": 0.4121988409757614, "epoch": 4.123903713379362, "grad_norm": 0.5390625, "learning_rate": 5.009639191061831e-06, "loss": 0.479, "mean_token_accuracy": 0.7846984696388245, "num_tokens": 670258347.0, "step": 22100 }, { "entropy": 0.41994678273797037, "epoch": 4.1332338122784105, "grad_norm": 1.1171875, "learning_rate": 4.968138429721715e-06, "loss": 0.4852, "mean_token_accuracy": 0.7819422298669815, "num_tokens": 671783187.0, "step": 22150 }, { "entropy": 0.40594893679022787, "epoch": 4.142563911177459, "grad_norm": 0.423828125, "learning_rate": 4.9270369488543e-06, "loss": 0.4737, "mean_token_accuracy": 0.7885110950469971, "num_tokens": 673297222.0, "step": 22200 }, { "entropy": 0.4166788300871849, "epoch": 4.151894010076507, "grad_norm": 0.76953125, "learning_rate": 4.886336171561883e-06, "loss": 0.4809, "mean_token_accuracy": 0.783810424208641, "num_tokens": 674805400.0, "step": 22250 }, { "entropy": 0.4085456937551498, "epoch": 4.1612241089755555, "grad_norm": 0.4765625, "learning_rate": 4.846037507072753e-06, "loss": 0.4769, "mean_token_accuracy": 0.7872794485092163, "num_tokens": 676275357.0, "step": 22300 }, { "entropy": 0.4122830269485712, "epoch": 4.170554207874604, "grad_norm": 0.71484375, "learning_rate": 4.806142350692409e-06, "loss": 0.4821, "mean_token_accuracy": 0.7875606679916382, "num_tokens": 677776471.0, "step": 22350 }, { "entropy": 0.4124917629361153, "epoch": 4.179884306773652, "grad_norm": 0.46875, "learning_rate": 4.766652083755236e-06, "loss": 0.4826, "mean_token_accuracy": 0.7857430422306061, "num_tokens": 679312867.0, "step": 22400 }, { "entropy": 0.3990041773021221, "epoch": 4.1892144056727005, "grad_norm": 0.84765625, "learning_rate": 4.727568073576675e-06, "loss": 0.4759, "mean_token_accuracy": 0.7875368773937226, "num_tokens": 680753964.0, "step": 22450 }, { "entropy": 0.41429411858320236, "epoch": 4.198544504571749, "grad_norm": 0.9921875, "learning_rate": 4.688891673405898e-06, "loss": 0.4866, "mean_token_accuracy": 0.7836510550975799, "num_tokens": 682282646.0, "step": 22500 }, { "entropy": 0.4262200190126896, "epoch": 4.207874603470797, "grad_norm": 0.494140625, "learning_rate": 4.650624222378934e-06, "loss": 0.4956, "mean_token_accuracy": 0.7772960156202317, "num_tokens": 683822441.0, "step": 22550 }, { "entropy": 0.4214519140869379, "epoch": 4.217204702369845, "grad_norm": 0.578125, "learning_rate": 4.6127670454723106e-06, "loss": 0.4895, "mean_token_accuracy": 0.7828962570428848, "num_tokens": 685322393.0, "step": 22600 }, { "entropy": 0.41216524183750153, "epoch": 4.226534801268893, "grad_norm": 0.671875, "learning_rate": 4.575321453457185e-06, "loss": 0.4801, "mean_token_accuracy": 0.7840064114332199, "num_tokens": 686841081.0, "step": 22650 }, { "entropy": 0.41634872302412984, "epoch": 4.235864900167941, "grad_norm": 0.7109375, "learning_rate": 4.53828874285395e-06, "loss": 0.4867, "mean_token_accuracy": 0.7810469180345535, "num_tokens": 688411100.0, "step": 22700 }, { "entropy": 0.4129963879287243, "epoch": 4.24519499906699, "grad_norm": 0.546875, "learning_rate": 4.501670195887344e-06, "loss": 0.4887, "mean_token_accuracy": 0.785649740099907, "num_tokens": 689911759.0, "step": 22750 }, { "entropy": 0.4034364421665668, "epoch": 4.254525097966038, "grad_norm": 0.56640625, "learning_rate": 4.465467080442056e-06, "loss": 0.4727, "mean_token_accuracy": 0.7873591876029968, "num_tokens": 691405944.0, "step": 22800 }, { "entropy": 0.41453997910022733, "epoch": 4.263855196865086, "grad_norm": 1.1015625, "learning_rate": 4.4296806500188296e-06, "loss": 0.4843, "mean_token_accuracy": 0.7840264475345612, "num_tokens": 692922273.0, "step": 22850 }, { "entropy": 0.4219287024438381, "epoch": 4.273185295764135, "grad_norm": 0.435546875, "learning_rate": 4.394312143691058e-06, "loss": 0.4909, "mean_token_accuracy": 0.7800329983234405, "num_tokens": 694464854.0, "step": 22900 }, { "entropy": 0.41194928884506227, "epoch": 4.282515394663183, "grad_norm": 0.5078125, "learning_rate": 4.359362786061886e-06, "loss": 0.4874, "mean_token_accuracy": 0.7856051474809647, "num_tokens": 695964788.0, "step": 22950 }, { "entropy": 0.41727676048874857, "epoch": 4.291845493562231, "grad_norm": 0.94140625, "learning_rate": 4.324833787221808e-06, "loss": 0.4872, "mean_token_accuracy": 0.7820509207248688, "num_tokens": 697478901.0, "step": 23000 }, { "epoch": 4.291845493562231, "eval_entropy": 0.42559362865916084, "eval_loss": 0.4991084337234497, "eval_mean_token_accuracy": 0.7802343352125325, "eval_num_tokens": 697478901.0, "eval_runtime": 16.0896, "eval_samples_per_second": 53.886, "eval_steps_per_second": 6.775, "step": 23000 }, { "entropy": 0.42197408616542814, "epoch": 4.30117559246128, "grad_norm": 0.80859375, "learning_rate": 4.290726342706758e-06, "loss": 0.4859, "mean_token_accuracy": 0.7817700058221817, "num_tokens": 699021137.0, "step": 23050 }, { "entropy": 0.4093606770038605, "epoch": 4.310505691360328, "grad_norm": 0.9609375, "learning_rate": 4.257041633456738e-06, "loss": 0.4802, "mean_token_accuracy": 0.7852538430690765, "num_tokens": 700512894.0, "step": 23100 }, { "entropy": 0.42187732078135015, "epoch": 4.3198357902593765, "grad_norm": 1.171875, "learning_rate": 4.223780825774913e-06, "loss": 0.4865, "mean_token_accuracy": 0.7813728898763657, "num_tokens": 702059517.0, "step": 23150 }, { "entropy": 0.4125171934068203, "epoch": 4.329165889158425, "grad_norm": 0.5859375, "learning_rate": 4.1909450712872285e-06, "loss": 0.4905, "mean_token_accuracy": 0.7848614448308945, "num_tokens": 703551158.0, "step": 23200 }, { "entropy": 0.4088340279459953, "epoch": 4.338495988057473, "grad_norm": 0.76171875, "learning_rate": 4.158535506902543e-06, "loss": 0.4786, "mean_token_accuracy": 0.7860189139842987, "num_tokens": 705046371.0, "step": 23250 }, { "entropy": 0.4250268609821796, "epoch": 4.3478260869565215, "grad_norm": 0.423828125, "learning_rate": 4.1265532547732586e-06, "loss": 0.4883, "mean_token_accuracy": 0.7804222059249878, "num_tokens": 706651598.0, "step": 23300 }, { "entropy": 0.42093482360243795, "epoch": 4.35715618585557, "grad_norm": 0.29296875, "learning_rate": 4.094999422256478e-06, "loss": 0.4919, "mean_token_accuracy": 0.7815903490781784, "num_tokens": 708166771.0, "step": 23350 }, { "entropy": 0.40209699779748914, "epoch": 4.366486284754618, "grad_norm": 0.9453125, "learning_rate": 4.063875101875644e-06, "loss": 0.4719, "mean_token_accuracy": 0.7900790423154831, "num_tokens": 709635262.0, "step": 23400 }, { "entropy": 0.411265781968832, "epoch": 4.3758163836536665, "grad_norm": 0.81640625, "learning_rate": 4.033181371282729e-06, "loss": 0.4774, "mean_token_accuracy": 0.7869658744335175, "num_tokens": 711131934.0, "step": 23450 }, { "entropy": 0.41227160826325415, "epoch": 4.385146482552715, "grad_norm": 1.6953125, "learning_rate": 4.002919293220917e-06, "loss": 0.4809, "mean_token_accuracy": 0.7838652014732361, "num_tokens": 712614938.0, "step": 23500 }, { "entropy": 0.4009349416196346, "epoch": 4.394476581451763, "grad_norm": 0.73046875, "learning_rate": 3.973089915487803e-06, "loss": 0.4823, "mean_token_accuracy": 0.7867969334125519, "num_tokens": 714060976.0, "step": 23550 }, { "entropy": 0.43059924483299256, "epoch": 4.403806680350812, "grad_norm": 0.8125, "learning_rate": 3.943694270899114e-06, "loss": 0.5009, "mean_token_accuracy": 0.7788757783174515, "num_tokens": 715598099.0, "step": 23600 }, { "entropy": 0.41181773334741595, "epoch": 4.41313677924986, "grad_norm": 0.380859375, "learning_rate": 3.914733377252963e-06, "loss": 0.4793, "mean_token_accuracy": 0.786955691576004, "num_tokens": 717099596.0, "step": 23650 }, { "entropy": 0.3970365000516176, "epoch": 4.422466878148908, "grad_norm": 0.60546875, "learning_rate": 3.886208237294589e-06, "loss": 0.4733, "mean_token_accuracy": 0.7909884482622147, "num_tokens": 718556937.0, "step": 23700 }, { "entropy": 0.4260143294930458, "epoch": 4.431796977047957, "grad_norm": 0.73828125, "learning_rate": 3.858119838681645e-06, "loss": 0.4887, "mean_token_accuracy": 0.7821561121940612, "num_tokens": 720105007.0, "step": 23750 }, { "entropy": 0.4135922496020794, "epoch": 4.441127075947005, "grad_norm": 0.6640625, "learning_rate": 3.830469153950008e-06, "loss": 0.4802, "mean_token_accuracy": 0.7842557770013809, "num_tokens": 721637538.0, "step": 23800 }, { "entropy": 0.4049617177248001, "epoch": 4.450457174846053, "grad_norm": 0.72265625, "learning_rate": 3.803257140480098e-06, "loss": 0.4855, "mean_token_accuracy": 0.7906941068172455, "num_tokens": 723121765.0, "step": 23850 }, { "entropy": 0.40623982638120654, "epoch": 4.459787273745102, "grad_norm": 1.0390625, "learning_rate": 3.776484740463726e-06, "loss": 0.4776, "mean_token_accuracy": 0.785234968662262, "num_tokens": 724610759.0, "step": 23900 }, { "entropy": 0.4155420292913914, "epoch": 4.46911737264415, "grad_norm": 0.6484375, "learning_rate": 3.7501528808714883e-06, "loss": 0.4854, "mean_token_accuracy": 0.7824651861190796, "num_tokens": 726130784.0, "step": 23950 }, { "entropy": 0.41285816714167595, "epoch": 4.478447471543198, "grad_norm": 0.5859375, "learning_rate": 3.7242624734206554e-06, "loss": 0.4816, "mean_token_accuracy": 0.785009593963623, "num_tokens": 727648846.0, "step": 24000 }, { "epoch": 4.478447471543198, "eval_entropy": 0.4252187248763688, "eval_loss": 0.49884533882141113, "eval_mean_token_accuracy": 0.7805168371681773, "eval_num_tokens": 727648846.0, "eval_runtime": 16.6326, "eval_samples_per_second": 52.126, "eval_steps_per_second": 6.553, "step": 24000 }, { "entropy": 0.417481614202261, "epoch": 4.487777570442247, "grad_norm": 0.59375, "learning_rate": 3.6988144145436063e-06, "loss": 0.4834, "mean_token_accuracy": 0.7845394277572632, "num_tokens": 729133282.0, "step": 24050 }, { "entropy": 0.42680337965488435, "epoch": 4.497107669341295, "grad_norm": 1.0, "learning_rate": 3.6738095853567963e-06, "loss": 0.4951, "mean_token_accuracy": 0.7786115556955338, "num_tokens": 730690418.0, "step": 24100 }, { "entropy": 0.4081826032698154, "epoch": 4.506437768240343, "grad_norm": 0.3984375, "learning_rate": 3.6492488516302438e-06, "loss": 0.4776, "mean_token_accuracy": 0.7866486293077469, "num_tokens": 732175841.0, "step": 24150 }, { "entropy": 0.4243287441134453, "epoch": 4.515767867139392, "grad_norm": 0.439453125, "learning_rate": 3.625133063757556e-06, "loss": 0.4862, "mean_token_accuracy": 0.7824915134906769, "num_tokens": 733702523.0, "step": 24200 }, { "entropy": 0.40577477023005487, "epoch": 4.52509796603844, "grad_norm": 0.81640625, "learning_rate": 3.6014630567264895e-06, "loss": 0.4744, "mean_token_accuracy": 0.788703248500824, "num_tokens": 735166918.0, "step": 24250 }, { "entropy": 0.4156419275701046, "epoch": 4.534428064937488, "grad_norm": 0.6484375, "learning_rate": 3.578239650090026e-06, "loss": 0.4787, "mean_token_accuracy": 0.7852990156412125, "num_tokens": 736643884.0, "step": 24300 }, { "entropy": 0.39459425553679467, "epoch": 4.543758163836537, "grad_norm": 0.5546875, "learning_rate": 3.555463647938016e-06, "loss": 0.4681, "mean_token_accuracy": 0.792020954489708, "num_tokens": 738110498.0, "step": 24350 }, { "entropy": 0.4069563465565443, "epoch": 4.553088262735585, "grad_norm": 0.416015625, "learning_rate": 3.533135838869318e-06, "loss": 0.4753, "mean_token_accuracy": 0.7872859954833984, "num_tokens": 739623859.0, "step": 24400 }, { "entropy": 0.4120598857104778, "epoch": 4.562418361634633, "grad_norm": 0.40234375, "learning_rate": 3.5112569959645072e-06, "loss": 0.4859, "mean_token_accuracy": 0.7832210195064545, "num_tokens": 741111342.0, "step": 24450 }, { "entropy": 0.411198351085186, "epoch": 4.571748460533682, "grad_norm": 0.84375, "learning_rate": 3.4898278767591007e-06, "loss": 0.4821, "mean_token_accuracy": 0.7840376651287079, "num_tokens": 742634170.0, "step": 24500 }, { "entropy": 0.41103513091802596, "epoch": 4.58107855943273, "grad_norm": 0.53515625, "learning_rate": 3.4688492232173343e-06, "loss": 0.4797, "mean_token_accuracy": 0.785804370045662, "num_tokens": 744158302.0, "step": 24550 }, { "entropy": 0.41402764439582823, "epoch": 4.5904086583317785, "grad_norm": 1.4921875, "learning_rate": 3.448321761706467e-06, "loss": 0.4883, "mean_token_accuracy": 0.78047394156456, "num_tokens": 745688655.0, "step": 24600 }, { "entropy": 0.4192537406086922, "epoch": 4.599738757230827, "grad_norm": 0.67578125, "learning_rate": 3.428246202971639e-06, "loss": 0.484, "mean_token_accuracy": 0.7821294456720352, "num_tokens": 747226392.0, "step": 24650 }, { "entropy": 0.3943482875823975, "epoch": 4.609068856129875, "grad_norm": 0.7734375, "learning_rate": 3.408623242111255e-06, "loss": 0.475, "mean_token_accuracy": 0.7858047294616699, "num_tokens": 748701674.0, "step": 24700 }, { "entropy": 0.39550173744559286, "epoch": 4.6183989550289235, "grad_norm": 0.3515625, "learning_rate": 3.389453558552918e-06, "loss": 0.4673, "mean_token_accuracy": 0.7902515822649002, "num_tokens": 750138953.0, "step": 24750 }, { "entropy": 0.41649769321084024, "epoch": 4.627729053927972, "grad_norm": 0.6484375, "learning_rate": 3.37073781602991e-06, "loss": 0.4877, "mean_token_accuracy": 0.7842141664028168, "num_tokens": 751652780.0, "step": 24800 }, { "entropy": 0.41776536986231805, "epoch": 4.63705915282702, "grad_norm": 0.44140625, "learning_rate": 3.3524766625582052e-06, "loss": 0.4836, "mean_token_accuracy": 0.7836203473806381, "num_tokens": 753230769.0, "step": 24850 }, { "entropy": 0.4045536919683218, "epoch": 4.6463892517260685, "grad_norm": 0.6875, "learning_rate": 3.334670730414037e-06, "loss": 0.4825, "mean_token_accuracy": 0.7888794159889221, "num_tokens": 754719857.0, "step": 24900 }, { "entropy": 0.4182581885159016, "epoch": 4.655719350625117, "grad_norm": 0.408203125, "learning_rate": 3.3173206361120026e-06, "loss": 0.4862, "mean_token_accuracy": 0.7835237330198288, "num_tokens": 756242660.0, "step": 24950 }, { "entropy": 0.41097776919603346, "epoch": 4.665049449524165, "grad_norm": 0.4921875, "learning_rate": 3.3004269803837223e-06, "loss": 0.4833, "mean_token_accuracy": 0.7847666722536087, "num_tokens": 757779830.0, "step": 25000 }, { "epoch": 4.665049449524165, "eval_entropy": 0.425675423730404, "eval_loss": 0.49867185950279236, "eval_mean_token_accuracy": 0.7805509616475587, "eval_num_tokens": 757779830.0, "eval_runtime": 16.0252, "eval_samples_per_second": 54.102, "eval_steps_per_second": 6.802, "step": 25000 }, { "entropy": 0.4132141026854515, "epoch": 4.674379548423214, "grad_norm": 1.2109375, "learning_rate": 3.2839903481570305e-06, "loss": 0.4843, "mean_token_accuracy": 0.7841847789287567, "num_tokens": 759296685.0, "step": 25050 }, { "entropy": 0.40370859257876873, "epoch": 4.683709647322262, "grad_norm": 0.4765625, "learning_rate": 3.268011308535733e-06, "loss": 0.4746, "mean_token_accuracy": 0.7883295321464538, "num_tokens": 760792002.0, "step": 25100 }, { "entropy": 0.4057003001868725, "epoch": 4.69303974622131, "grad_norm": 0.55859375, "learning_rate": 3.252490414779895e-06, "loss": 0.4792, "mean_token_accuracy": 0.7840817874670029, "num_tokens": 762288394.0, "step": 25150 }, { "entropy": 0.4204439736157656, "epoch": 4.702369845120359, "grad_norm": 0.671875, "learning_rate": 3.2374282042866876e-06, "loss": 0.4853, "mean_token_accuracy": 0.7837431621551514, "num_tokens": 763765331.0, "step": 25200 }, { "entropy": 0.4071824544668198, "epoch": 4.711699944019407, "grad_norm": 1.1796875, "learning_rate": 3.2228251985717824e-06, "loss": 0.4852, "mean_token_accuracy": 0.7848959761857986, "num_tokens": 765243737.0, "step": 25250 }, { "entropy": 0.43803592413663867, "epoch": 4.721030042918455, "grad_norm": 0.8515625, "learning_rate": 3.208681903251291e-06, "loss": 0.4973, "mean_token_accuracy": 0.7763216584920883, "num_tokens": 766812993.0, "step": 25300 }, { "entropy": 0.4035507388412952, "epoch": 4.730360141817504, "grad_norm": 0.8359375, "learning_rate": 3.1949988080242665e-06, "loss": 0.4751, "mean_token_accuracy": 0.7876280504465103, "num_tokens": 768318546.0, "step": 25350 }, { "entropy": 0.41653711020946504, "epoch": 4.739690240716552, "grad_norm": 0.625, "learning_rate": 3.181776386655733e-06, "loss": 0.4859, "mean_token_accuracy": 0.7842164701223373, "num_tokens": 769827713.0, "step": 25400 }, { "entropy": 0.43134715765714643, "epoch": 4.749020339615599, "grad_norm": 0.66015625, "learning_rate": 3.1690150969603e-06, "loss": 0.4975, "mean_token_accuracy": 0.7809195184707641, "num_tokens": 771418246.0, "step": 25450 }, { "entropy": 0.4131816050410271, "epoch": 4.758350438514649, "grad_norm": 0.384765625, "learning_rate": 3.1567153807862953e-06, "loss": 0.4821, "mean_token_accuracy": 0.7868939906358718, "num_tokens": 772904428.0, "step": 25500 }, { "entropy": 0.4172322556376457, "epoch": 4.767680537413696, "grad_norm": 0.458984375, "learning_rate": 3.1448776640004756e-06, "loss": 0.4864, "mean_token_accuracy": 0.7821826964616776, "num_tokens": 774432393.0, "step": 25550 }, { "entropy": 0.410623489767313, "epoch": 4.777010636312745, "grad_norm": 0.7578125, "learning_rate": 3.133502356473279e-06, "loss": 0.4765, "mean_token_accuracy": 0.7861051166057587, "num_tokens": 775955056.0, "step": 25600 }, { "entropy": 0.4126855818927288, "epoch": 4.786340735211793, "grad_norm": 1.1875, "learning_rate": 3.1225898520646354e-06, "loss": 0.4791, "mean_token_accuracy": 0.7849954336881637, "num_tokens": 777481570.0, "step": 25650 }, { "entropy": 0.42760392755270005, "epoch": 4.795670834110842, "grad_norm": 0.54296875, "learning_rate": 3.112140528610325e-06, "loss": 0.4888, "mean_token_accuracy": 0.7792460584640503, "num_tokens": 779084872.0, "step": 25700 }, { "entropy": 0.4246444535255432, "epoch": 4.8050009330098895, "grad_norm": 0.58203125, "learning_rate": 3.102154747908898e-06, "loss": 0.4901, "mean_token_accuracy": 0.779129432439804, "num_tokens": 780621011.0, "step": 25750 }, { "entropy": 0.4207975560426712, "epoch": 4.814331031908938, "grad_norm": 0.53515625, "learning_rate": 3.0926328557091484e-06, "loss": 0.4829, "mean_token_accuracy": 0.7824300426244736, "num_tokens": 782204557.0, "step": 25800 }, { "entropy": 0.40202761128544806, "epoch": 4.823661130807986, "grad_norm": 0.65625, "learning_rate": 3.0835751816981437e-06, "loss": 0.4742, "mean_token_accuracy": 0.7883801186084747, "num_tokens": 783710597.0, "step": 25850 }, { "entropy": 0.4042065401375294, "epoch": 4.8329912297070345, "grad_norm": 0.890625, "learning_rate": 3.0749820394898103e-06, "loss": 0.4773, "mean_token_accuracy": 0.7863856315612793, "num_tokens": 785204376.0, "step": 25900 }, { "entropy": 0.4113363729417324, "epoch": 4.842321328606083, "grad_norm": 0.42578125, "learning_rate": 3.066853726614068e-06, "loss": 0.4836, "mean_token_accuracy": 0.7829122406244278, "num_tokens": 786745388.0, "step": 25950 }, { "entropy": 0.4050539457052946, "epoch": 4.851651427505131, "grad_norm": 1.0546875, "learning_rate": 3.0591905245065378e-06, "loss": 0.4782, "mean_token_accuracy": 0.7893619048595428, "num_tokens": 788238814.0, "step": 26000 }, { "epoch": 4.851651427505131, "eval_entropy": 0.42556045840092754, "eval_loss": 0.4986078143119812, "eval_mean_token_accuracy": 0.7803256960090147, "eval_num_tokens": 788238814.0, "eval_runtime": 16.1117, "eval_samples_per_second": 53.812, "eval_steps_per_second": 6.765, "step": 26000 }, { "entropy": 0.4172664260864258, "epoch": 4.86098152640418, "grad_norm": 0.8828125, "learning_rate": 3.0519926984987924e-06, "loss": 0.4896, "mean_token_accuracy": 0.781678112745285, "num_tokens": 789737207.0, "step": 26050 }, { "entropy": 0.4131707660853863, "epoch": 4.870311625303228, "grad_norm": 0.484375, "learning_rate": 3.045260497809169e-06, "loss": 0.4816, "mean_token_accuracy": 0.7855587202310562, "num_tokens": 791272449.0, "step": 26100 }, { "entropy": 0.438031694740057, "epoch": 4.879641724202276, "grad_norm": 0.43359375, "learning_rate": 3.0389941555341412e-06, "loss": 0.4988, "mean_token_accuracy": 0.7749890965223313, "num_tokens": 792843109.0, "step": 26150 }, { "entropy": 0.41347076088190077, "epoch": 4.888971823101325, "grad_norm": 0.66015625, "learning_rate": 3.03319388864025e-06, "loss": 0.4862, "mean_token_accuracy": 0.782042904496193, "num_tokens": 794349700.0, "step": 26200 }, { "entropy": 0.4158082590997219, "epoch": 4.898301922000373, "grad_norm": 0.5, "learning_rate": 3.0278598979565877e-06, "loss": 0.4888, "mean_token_accuracy": 0.7814363497495651, "num_tokens": 795874145.0, "step": 26250 }, { "entropy": 0.4144511626660824, "epoch": 4.907632020899421, "grad_norm": 0.373046875, "learning_rate": 3.0229923681678497e-06, "loss": 0.4811, "mean_token_accuracy": 0.7852653992176056, "num_tokens": 797377284.0, "step": 26300 }, { "entropy": 0.4087340448796749, "epoch": 4.91696211979847, "grad_norm": 0.3828125, "learning_rate": 3.018591467807935e-06, "loss": 0.4829, "mean_token_accuracy": 0.7855605220794678, "num_tokens": 798886169.0, "step": 26350 }, { "entropy": 0.4149911729991436, "epoch": 4.926292218697518, "grad_norm": 0.478515625, "learning_rate": 3.0146573492541123e-06, "loss": 0.4865, "mean_token_accuracy": 0.7815834748744964, "num_tokens": 800437490.0, "step": 26400 }, { "entropy": 0.4159658246487379, "epoch": 4.935622317596566, "grad_norm": 0.498046875, "learning_rate": 3.0111901487217452e-06, "loss": 0.485, "mean_token_accuracy": 0.7831065011024475, "num_tokens": 801983835.0, "step": 26450 }, { "entropy": 0.4220912031829357, "epoch": 4.944952416495615, "grad_norm": 0.51171875, "learning_rate": 3.008189986259573e-06, "loss": 0.4914, "mean_token_accuracy": 0.7806734621524811, "num_tokens": 803525044.0, "step": 26500 }, { "entropy": 0.4241555346548557, "epoch": 4.954282515394663, "grad_norm": 0.45703125, "learning_rate": 3.0056569657455626e-06, "loss": 0.4879, "mean_token_accuracy": 0.7803041088581085, "num_tokens": 805085203.0, "step": 26550 }, { "entropy": 0.4313288567960262, "epoch": 4.963612614293711, "grad_norm": 0.546875, "learning_rate": 3.0035911748832985e-06, "loss": 0.4982, "mean_token_accuracy": 0.7780981206893921, "num_tokens": 806639306.0, "step": 26600 }, { "entropy": 0.40001085847616197, "epoch": 4.97294271319276, "grad_norm": 0.6328125, "learning_rate": 3.0019926851989556e-06, "loss": 0.4735, "mean_token_accuracy": 0.7876440799236297, "num_tokens": 808101409.0, "step": 26650 }, { "entropy": 0.4143665814399719, "epoch": 4.982272812091808, "grad_norm": 1.0, "learning_rate": 3.000861552038823e-06, "loss": 0.4829, "mean_token_accuracy": 0.7828416174650192, "num_tokens": 809661206.0, "step": 26700 }, { "entropy": 0.410667944252491, "epoch": 4.991602910990856, "grad_norm": 0.28125, "learning_rate": 3.0001978145673808e-06, "loss": 0.4815, "mean_token_accuracy": 0.7867605596780777, "num_tokens": 811153343.0, "step": 26750 } ], "logging_steps": 50, "max_steps": 26795, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4288887116625084e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }