test-11-20-01 / trainer_state.json
robiulawalgunjon's picture
Upload folder using huggingface_hub
bcf993b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 1000,
"global_step": 26795,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 0.6402970719337463,
"epoch": 0.009330098899048329,
"grad_norm": 1.015625,
"learning_rate": 1.47e-05,
"loss": 0.778,
"mean_token_accuracy": 0.7212734770774841,
"num_tokens": 1548877.0,
"step": 50
},
{
"entropy": 0.5775007322430611,
"epoch": 0.018660197798096658,
"grad_norm": 1.109375,
"learning_rate": 2.97e-05,
"loss": 0.5918,
"mean_token_accuracy": 0.7543947434425354,
"num_tokens": 3018955.0,
"step": 100
},
{
"entropy": 0.5692345011234283,
"epoch": 0.02799029669714499,
"grad_norm": 0.98046875,
"learning_rate": 2.999977554224005e-05,
"loss": 0.5835,
"mean_token_accuracy": 0.753638728260994,
"num_tokens": 4563356.0,
"step": 150
},
{
"entropy": 0.570181995332241,
"epoch": 0.037320395596193316,
"grad_norm": 0.66015625,
"learning_rate": 2.999908376021796e-05,
"loss": 0.5835,
"mean_token_accuracy": 0.7521764719486237,
"num_tokens": 6114611.0,
"step": 200
},
{
"entropy": 0.5772185748815537,
"epoch": 0.04665049449524165,
"grad_norm": 1.3125,
"learning_rate": 2.9997924584400694e-05,
"loss": 0.5841,
"mean_token_accuracy": 0.7549300968647004,
"num_tokens": 7635855.0,
"step": 250
},
{
"entropy": 0.5723973855376243,
"epoch": 0.05598059339428998,
"grad_norm": 0.81640625,
"learning_rate": 2.9996298054923684e-05,
"loss": 0.5865,
"mean_token_accuracy": 0.7571439111232757,
"num_tokens": 9098584.0,
"step": 300
},
{
"entropy": 0.5691715434193612,
"epoch": 0.06531069229333832,
"grad_norm": 0.625,
"learning_rate": 2.9994204228104075e-05,
"loss": 0.583,
"mean_token_accuracy": 0.7545554572343827,
"num_tokens": 10604199.0,
"step": 350
},
{
"entropy": 0.5754752764105797,
"epoch": 0.07464079119238663,
"grad_norm": 1.25,
"learning_rate": 2.9991643176438752e-05,
"loss": 0.5852,
"mean_token_accuracy": 0.7542591279745102,
"num_tokens": 12131126.0,
"step": 400
},
{
"entropy": 0.5670795711874962,
"epoch": 0.08397089009143496,
"grad_norm": 0.68359375,
"learning_rate": 2.9988614988601868e-05,
"loss": 0.5779,
"mean_token_accuracy": 0.7581483513116837,
"num_tokens": 13642307.0,
"step": 450
},
{
"entropy": 0.5779001507163047,
"epoch": 0.0933009889904833,
"grad_norm": 0.431640625,
"learning_rate": 2.998511976944173e-05,
"loss": 0.5874,
"mean_token_accuracy": 0.7504317510128021,
"num_tokens": 15217615.0,
"step": 500
},
{
"entropy": 0.5665371876955032,
"epoch": 0.10263108788953162,
"grad_norm": 0.9140625,
"learning_rate": 2.998115763997721e-05,
"loss": 0.5851,
"mean_token_accuracy": 0.7549385547637939,
"num_tokens": 16724810.0,
"step": 550
},
{
"entropy": 0.5638798615336418,
"epoch": 0.11196118678857996,
"grad_norm": 1.046875,
"learning_rate": 2.9976728737393515e-05,
"loss": 0.5756,
"mean_token_accuracy": 0.757421538233757,
"num_tokens": 18270004.0,
"step": 600
},
{
"entropy": 0.5710726794600487,
"epoch": 0.12129128568762829,
"grad_norm": 0.76171875,
"learning_rate": 2.997183321503747e-05,
"loss": 0.5854,
"mean_token_accuracy": 0.7553139424324036,
"num_tokens": 19791398.0,
"step": 650
},
{
"entropy": 0.5717519807815552,
"epoch": 0.13062138458667663,
"grad_norm": 1.28125,
"learning_rate": 2.9966471242412192e-05,
"loss": 0.5828,
"mean_token_accuracy": 0.7528650748729706,
"num_tokens": 21373941.0,
"step": 700
},
{
"entropy": 0.5660536578297615,
"epoch": 0.13995148348572495,
"grad_norm": 1.265625,
"learning_rate": 2.996064300517122e-05,
"loss": 0.5858,
"mean_token_accuracy": 0.7544754481315613,
"num_tokens": 22872328.0,
"step": 750
},
{
"entropy": 0.5696932604908943,
"epoch": 0.14928158238477326,
"grad_norm": 0.953125,
"learning_rate": 2.995434870511211e-05,
"loss": 0.5881,
"mean_token_accuracy": 0.7530353850126267,
"num_tokens": 24433349.0,
"step": 800
},
{
"entropy": 0.5728991779685021,
"epoch": 0.1586116812838216,
"grad_norm": 1.140625,
"learning_rate": 2.9947588560169395e-05,
"loss": 0.5841,
"mean_token_accuracy": 0.753363783955574,
"num_tokens": 26038373.0,
"step": 850
},
{
"entropy": 0.5628881072998047,
"epoch": 0.16794178018286993,
"grad_norm": 0.6328125,
"learning_rate": 2.994036280440711e-05,
"loss": 0.573,
"mean_token_accuracy": 0.7580157500505448,
"num_tokens": 27568454.0,
"step": 900
},
{
"entropy": 0.5649524646997451,
"epoch": 0.17727187908191827,
"grad_norm": 0.95703125,
"learning_rate": 2.9932671688010632e-05,
"loss": 0.5766,
"mean_token_accuracy": 0.7574625754356384,
"num_tokens": 29049728.0,
"step": 950
},
{
"entropy": 0.5648883840441704,
"epoch": 0.1866019779809666,
"grad_norm": 0.99609375,
"learning_rate": 2.992451547727804e-05,
"loss": 0.5883,
"mean_token_accuracy": 0.7526043313741684,
"num_tokens": 30603198.0,
"step": 1000
},
{
"epoch": 0.1866019779809666,
"eval_entropy": 0.5557291887768911,
"eval_loss": 0.5737926959991455,
"eval_mean_token_accuracy": 0.758812694921406,
"eval_num_tokens": 30603198.0,
"eval_runtime": 16.2769,
"eval_samples_per_second": 53.266,
"eval_steps_per_second": 6.697,
"step": 1000
},
{
"entropy": 0.5666726857423783,
"epoch": 0.19593207688001493,
"grad_norm": 0.609375,
"learning_rate": 2.9915894454610887e-05,
"loss": 0.5764,
"mean_token_accuracy": 0.7547625786066056,
"num_tokens": 32144851.0,
"step": 1050
},
{
"entropy": 0.561416018307209,
"epoch": 0.20526217577906325,
"grad_norm": 0.55859375,
"learning_rate": 2.990680891850444e-05,
"loss": 0.5794,
"mean_token_accuracy": 0.75654057264328,
"num_tokens": 33639533.0,
"step": 1100
},
{
"entropy": 0.5599774518609046,
"epoch": 0.2145922746781116,
"grad_norm": 1.4140625,
"learning_rate": 2.9897259183537322e-05,
"loss": 0.5772,
"mean_token_accuracy": 0.7588758039474487,
"num_tokens": 35130975.0,
"step": 1150
},
{
"entropy": 0.5628740054368973,
"epoch": 0.2239223735771599,
"grad_norm": 0.91015625,
"learning_rate": 2.9887245580360623e-05,
"loss": 0.581,
"mean_token_accuracy": 0.7554371774196624,
"num_tokens": 36651189.0,
"step": 1200
},
{
"entropy": 0.5624552240967751,
"epoch": 0.23325247247620826,
"grad_norm": 0.92578125,
"learning_rate": 2.9876768455686477e-05,
"loss": 0.5731,
"mean_token_accuracy": 0.7572768718004227,
"num_tokens": 38172935.0,
"step": 1250
},
{
"entropy": 0.5497725516557693,
"epoch": 0.24258257137525657,
"grad_norm": 0.8671875,
"learning_rate": 2.9865828172276023e-05,
"loss": 0.569,
"mean_token_accuracy": 0.7618441820144654,
"num_tokens": 39653342.0,
"step": 1300
},
{
"entropy": 0.5542441910505295,
"epoch": 0.2519126702743049,
"grad_norm": 0.84375,
"learning_rate": 2.9854425108926863e-05,
"loss": 0.5732,
"mean_token_accuracy": 0.762292046546936,
"num_tokens": 41090294.0,
"step": 1350
},
{
"entropy": 0.5551298156380653,
"epoch": 0.26124276917335326,
"grad_norm": 0.9375,
"learning_rate": 2.984255966045995e-05,
"loss": 0.5773,
"mean_token_accuracy": 0.755358315706253,
"num_tokens": 42625920.0,
"step": 1400
},
{
"entropy": 0.5527626049518585,
"epoch": 0.27057286807240155,
"grad_norm": 0.54296875,
"learning_rate": 2.9830232237705904e-05,
"loss": 0.5724,
"mean_token_accuracy": 0.7588168692588806,
"num_tokens": 44134645.0,
"step": 1450
},
{
"entropy": 0.5561913156509399,
"epoch": 0.2799029669714499,
"grad_norm": 0.396484375,
"learning_rate": 2.9817443267490797e-05,
"loss": 0.5742,
"mean_token_accuracy": 0.7577809965610505,
"num_tokens": 45605580.0,
"step": 1500
},
{
"entropy": 0.5598691233992577,
"epoch": 0.28923306587049824,
"grad_norm": 0.94921875,
"learning_rate": 2.9804193192621376e-05,
"loss": 0.5746,
"mean_token_accuracy": 0.7551334691047669,
"num_tokens": 47144264.0,
"step": 1550
},
{
"entropy": 0.554311693906784,
"epoch": 0.29856316476954653,
"grad_norm": 0.98828125,
"learning_rate": 2.979048247186972e-05,
"loss": 0.5691,
"mean_token_accuracy": 0.7589048826694489,
"num_tokens": 48660197.0,
"step": 1600
},
{
"entropy": 0.5676783239841461,
"epoch": 0.3078932636685949,
"grad_norm": 1.09375,
"learning_rate": 2.9776311579957372e-05,
"loss": 0.5797,
"mean_token_accuracy": 0.7567919301986694,
"num_tokens": 50152863.0,
"step": 1650
},
{
"entropy": 0.5468079242110252,
"epoch": 0.3172233625676432,
"grad_norm": 1.2421875,
"learning_rate": 2.976168100753889e-05,
"loss": 0.5677,
"mean_token_accuracy": 0.7582010948657989,
"num_tokens": 51722410.0,
"step": 1700
},
{
"entropy": 0.5553153255581855,
"epoch": 0.32655346146669156,
"grad_norm": 0.97265625,
"learning_rate": 2.974659126118485e-05,
"loss": 0.5696,
"mean_token_accuracy": 0.7591327953338624,
"num_tokens": 53257454.0,
"step": 1750
},
{
"entropy": 0.553457222878933,
"epoch": 0.33588356036573985,
"grad_norm": 0.49609375,
"learning_rate": 2.973104286336433e-05,
"loss": 0.5725,
"mean_token_accuracy": 0.7564568722248077,
"num_tokens": 54784162.0,
"step": 1800
},
{
"entropy": 0.5699556747078895,
"epoch": 0.3452136592647882,
"grad_norm": 0.69140625,
"learning_rate": 2.971503635242682e-05,
"loss": 0.5756,
"mean_token_accuracy": 0.7566489219665528,
"num_tokens": 56304516.0,
"step": 1850
},
{
"entropy": 0.5491332325339318,
"epoch": 0.35454375816383654,
"grad_norm": 0.7265625,
"learning_rate": 2.9698572282583534e-05,
"loss": 0.5683,
"mean_token_accuracy": 0.7583828049898148,
"num_tokens": 57819074.0,
"step": 1900
},
{
"entropy": 0.5511914587020874,
"epoch": 0.3638738570628849,
"grad_norm": 0.7265625,
"learning_rate": 2.9681651223888298e-05,
"loss": 0.571,
"mean_token_accuracy": 0.7572992449998855,
"num_tokens": 59346739.0,
"step": 1950
},
{
"entropy": 0.5618056333065033,
"epoch": 0.3732039559619332,
"grad_norm": 0.72265625,
"learning_rate": 2.966427376221774e-05,
"loss": 0.5792,
"mean_token_accuracy": 0.7539066845178604,
"num_tokens": 60876192.0,
"step": 2000
},
{
"epoch": 0.3732039559619332,
"eval_entropy": 0.5431396165572175,
"eval_loss": 0.5645309686660767,
"eval_mean_token_accuracy": 0.7612812196442841,
"eval_num_tokens": 60876192.0,
"eval_runtime": 16.0974,
"eval_samples_per_second": 53.86,
"eval_steps_per_second": 6.771,
"step": 2000
},
{
"entropy": 0.5605012658238411,
"epoch": 0.3825340548609815,
"grad_norm": 0.578125,
"learning_rate": 2.9646440499251056e-05,
"loss": 0.5912,
"mean_token_accuracy": 0.7569118171930314,
"num_tokens": 62368509.0,
"step": 2050
},
{
"entropy": 0.5493465921282769,
"epoch": 0.39186415376002987,
"grad_norm": 1.1875,
"learning_rate": 2.9628152052449148e-05,
"loss": 0.5668,
"mean_token_accuracy": 0.7606059044599534,
"num_tokens": 63850871.0,
"step": 2100
},
{
"entropy": 0.5519525390863419,
"epoch": 0.4011942526590782,
"grad_norm": 0.8359375,
"learning_rate": 2.960940905503325e-05,
"loss": 0.5736,
"mean_token_accuracy": 0.7569921463727951,
"num_tokens": 65368844.0,
"step": 2150
},
{
"entropy": 0.5381503540277481,
"epoch": 0.4105243515581265,
"grad_norm": 0.6953125,
"learning_rate": 2.9590212155963024e-05,
"loss": 0.5602,
"mean_token_accuracy": 0.7622320890426636,
"num_tokens": 66879984.0,
"step": 2200
},
{
"entropy": 0.5599228474497795,
"epoch": 0.41985445045717484,
"grad_norm": 0.5390625,
"learning_rate": 2.9570562019914053e-05,
"loss": 0.5736,
"mean_token_accuracy": 0.7590676909685135,
"num_tokens": 68396429.0,
"step": 2250
},
{
"entropy": 0.5572463124990463,
"epoch": 0.4291845493562232,
"grad_norm": 0.458984375,
"learning_rate": 2.9550459327254864e-05,
"loss": 0.5732,
"mean_token_accuracy": 0.7579269409179688,
"num_tokens": 69905569.0,
"step": 2300
},
{
"entropy": 0.5512515944242478,
"epoch": 0.43851464825527153,
"grad_norm": 0.83203125,
"learning_rate": 2.9529904774023353e-05,
"loss": 0.5715,
"mean_token_accuracy": 0.7546812242269516,
"num_tokens": 71496156.0,
"step": 2350
},
{
"entropy": 0.5452529183030128,
"epoch": 0.4478447471543198,
"grad_norm": 1.453125,
"learning_rate": 2.9508899071902684e-05,
"loss": 0.5667,
"mean_token_accuracy": 0.7611679089069366,
"num_tokens": 72993303.0,
"step": 2400
},
{
"entropy": 0.5496764704585075,
"epoch": 0.45717484605336817,
"grad_norm": 0.80078125,
"learning_rate": 2.9487442948196643e-05,
"loss": 0.5677,
"mean_token_accuracy": 0.7588638842105866,
"num_tokens": 74502630.0,
"step": 2450
},
{
"entropy": 0.5490121757984161,
"epoch": 0.4665049449524165,
"grad_norm": 0.75390625,
"learning_rate": 2.9465537145804476e-05,
"loss": 0.5685,
"mean_token_accuracy": 0.7586365014314651,
"num_tokens": 76031000.0,
"step": 2500
},
{
"entropy": 0.5405518284440041,
"epoch": 0.4758350438514648,
"grad_norm": 1.0234375,
"learning_rate": 2.944318242319515e-05,
"loss": 0.562,
"mean_token_accuracy": 0.7639656978845596,
"num_tokens": 77482894.0,
"step": 2550
},
{
"entropy": 0.5369963318109512,
"epoch": 0.48516514275051315,
"grad_norm": 1.2109375,
"learning_rate": 2.94203795543811e-05,
"loss": 0.5595,
"mean_token_accuracy": 0.7631033205986023,
"num_tokens": 78956007.0,
"step": 2600
},
{
"entropy": 0.5459193900227547,
"epoch": 0.4944952416495615,
"grad_norm": 0.63671875,
"learning_rate": 2.939712932889142e-05,
"loss": 0.5678,
"mean_token_accuracy": 0.7564943873882294,
"num_tokens": 80549485.0,
"step": 2650
},
{
"entropy": 0.5403485292196274,
"epoch": 0.5038253405486098,
"grad_norm": 0.73828125,
"learning_rate": 2.937343255174453e-05,
"loss": 0.5665,
"mean_token_accuracy": 0.7631701147556305,
"num_tokens": 81987729.0,
"step": 2700
},
{
"entropy": 0.5511571237444878,
"epoch": 0.5131554394476582,
"grad_norm": 0.875,
"learning_rate": 2.9349290043420315e-05,
"loss": 0.5718,
"mean_token_accuracy": 0.7589112591743469,
"num_tokens": 83566503.0,
"step": 2750
},
{
"entropy": 0.545387190580368,
"epoch": 0.5224855383467065,
"grad_norm": 1.0390625,
"learning_rate": 2.932470263983169e-05,
"loss": 0.578,
"mean_token_accuracy": 0.7594633424282073,
"num_tokens": 85045132.0,
"step": 2800
},
{
"entropy": 0.5400431799888611,
"epoch": 0.5318156372457548,
"grad_norm": 0.60546875,
"learning_rate": 2.929967119229569e-05,
"loss": 0.5639,
"mean_token_accuracy": 0.7595540487766266,
"num_tokens": 86552294.0,
"step": 2850
},
{
"entropy": 0.5613244980573654,
"epoch": 0.5411457361448031,
"grad_norm": 1.0,
"learning_rate": 2.9274196567503974e-05,
"loss": 0.5882,
"mean_token_accuracy": 0.7518465319275855,
"num_tokens": 88138088.0,
"step": 2900
},
{
"entropy": 0.5406323432922363,
"epoch": 0.5504758350438514,
"grad_norm": 0.68359375,
"learning_rate": 2.9248279647492817e-05,
"loss": 0.563,
"mean_token_accuracy": 0.7594792503118515,
"num_tokens": 89629470.0,
"step": 2950
},
{
"entropy": 0.5328826600313187,
"epoch": 0.5598059339428998,
"grad_norm": 0.7890625,
"learning_rate": 2.9221921329612568e-05,
"loss": 0.559,
"mean_token_accuracy": 0.7616329395771027,
"num_tokens": 91153981.0,
"step": 3000
},
{
"epoch": 0.5598059339428998,
"eval_entropy": 0.5315866710942819,
"eval_loss": 0.5568196773529053,
"eval_mean_token_accuracy": 0.7634841575535065,
"eval_num_tokens": 91153981.0,
"eval_runtime": 16.3141,
"eval_samples_per_second": 53.144,
"eval_steps_per_second": 6.681,
"step": 3000
},
{
"entropy": 0.5549379280209541,
"epoch": 0.5691360328419481,
"grad_norm": 0.640625,
"learning_rate": 2.9195122526496596e-05,
"loss": 0.571,
"mean_token_accuracy": 0.7589174765348434,
"num_tokens": 92704541.0,
"step": 3050
},
{
"entropy": 0.5324976027011872,
"epoch": 0.5784661317409965,
"grad_norm": 0.9140625,
"learning_rate": 2.9167884166029674e-05,
"loss": 0.558,
"mean_token_accuracy": 0.766141871213913,
"num_tokens": 94114581.0,
"step": 3100
},
{
"entropy": 0.5359655514359474,
"epoch": 0.5877962306400448,
"grad_norm": 1.0546875,
"learning_rate": 2.9140207191315857e-05,
"loss": 0.5609,
"mean_token_accuracy": 0.7602073633670807,
"num_tokens": 95640629.0,
"step": 3150
},
{
"entropy": 0.5375589004158974,
"epoch": 0.5971263295390931,
"grad_norm": 1.0078125,
"learning_rate": 2.911209256064584e-05,
"loss": 0.5567,
"mean_token_accuracy": 0.7607348054647446,
"num_tokens": 97223569.0,
"step": 3200
},
{
"entropy": 0.5388145217299461,
"epoch": 0.6064564284381414,
"grad_norm": 0.73046875,
"learning_rate": 2.9083541247463754e-05,
"loss": 0.5612,
"mean_token_accuracy": 0.7596866941452026,
"num_tokens": 98767227.0,
"step": 3250
},
{
"entropy": 0.5369637748599052,
"epoch": 0.6157865273371897,
"grad_norm": 0.56640625,
"learning_rate": 2.9054554240333478e-05,
"loss": 0.5601,
"mean_token_accuracy": 0.7642514258623123,
"num_tokens": 100228436.0,
"step": 3300
},
{
"entropy": 0.5306126582622528,
"epoch": 0.6251166262362381,
"grad_norm": 1.3515625,
"learning_rate": 2.9025132542904414e-05,
"loss": 0.5548,
"mean_token_accuracy": 0.7639524918794632,
"num_tokens": 101762895.0,
"step": 3350
},
{
"entropy": 0.532108125090599,
"epoch": 0.6344467251352864,
"grad_norm": 0.48828125,
"learning_rate": 2.8995277173876718e-05,
"loss": 0.5565,
"mean_token_accuracy": 0.7622706252336502,
"num_tokens": 103288607.0,
"step": 3400
},
{
"entropy": 0.530606449842453,
"epoch": 0.6437768240343348,
"grad_norm": 1.1796875,
"learning_rate": 2.896498916696605e-05,
"loss": 0.5598,
"mean_token_accuracy": 0.763903112411499,
"num_tokens": 104756099.0,
"step": 3450
},
{
"entropy": 0.5285736629366875,
"epoch": 0.6531069229333831,
"grad_norm": 1.0703125,
"learning_rate": 2.8934269570867776e-05,
"loss": 0.5536,
"mean_token_accuracy": 0.7639499133825303,
"num_tokens": 106245797.0,
"step": 3500
},
{
"entropy": 0.5446360909938812,
"epoch": 0.6624370218324315,
"grad_norm": 2.125,
"learning_rate": 2.890311944922064e-05,
"loss": 0.5789,
"mean_token_accuracy": 0.7568975293636322,
"num_tokens": 107770481.0,
"step": 3550
},
{
"entropy": 0.5234782636165619,
"epoch": 0.6717671207314797,
"grad_norm": 1.7578125,
"learning_rate": 2.8871539880569963e-05,
"loss": 0.5532,
"mean_token_accuracy": 0.7649819606542587,
"num_tokens": 109269688.0,
"step": 3600
},
{
"entropy": 0.5393576291203499,
"epoch": 0.681097219630528,
"grad_norm": 0.69140625,
"learning_rate": 2.8839531958330277e-05,
"loss": 0.5617,
"mean_token_accuracy": 0.759439873099327,
"num_tokens": 110803400.0,
"step": 3650
},
{
"entropy": 0.539838764667511,
"epoch": 0.6904273185295764,
"grad_norm": 0.76171875,
"learning_rate": 2.880709679074749e-05,
"loss": 0.5631,
"mean_token_accuracy": 0.760960082411766,
"num_tokens": 112340326.0,
"step": 3700
},
{
"entropy": 0.5437505677342415,
"epoch": 0.6997574174286247,
"grad_norm": 0.9296875,
"learning_rate": 2.8774235500860494e-05,
"loss": 0.5656,
"mean_token_accuracy": 0.7594961816072464,
"num_tokens": 113873379.0,
"step": 3750
},
{
"entropy": 0.5357860559225083,
"epoch": 0.7090875163276731,
"grad_norm": 0.640625,
"learning_rate": 2.874094922646229e-05,
"loss": 0.5611,
"mean_token_accuracy": 0.7590708369016648,
"num_tokens": 115408557.0,
"step": 3800
},
{
"entropy": 0.5340870246291161,
"epoch": 0.7184176152267214,
"grad_norm": 0.953125,
"learning_rate": 2.870723912006058e-05,
"loss": 0.5552,
"mean_token_accuracy": 0.765527902841568,
"num_tokens": 116891513.0,
"step": 3850
},
{
"entropy": 0.5364308878779411,
"epoch": 0.7277477141257698,
"grad_norm": 1.109375,
"learning_rate": 2.867310634883789e-05,
"loss": 0.5518,
"mean_token_accuracy": 0.7665286004543305,
"num_tokens": 118406575.0,
"step": 3900
},
{
"entropy": 0.5389542949199676,
"epoch": 0.7370778130248181,
"grad_norm": 0.390625,
"learning_rate": 2.863855209461113e-05,
"loss": 0.5628,
"mean_token_accuracy": 0.7604682886600495,
"num_tokens": 119888774.0,
"step": 3950
},
{
"entropy": 0.5373398035764694,
"epoch": 0.7464079119238664,
"grad_norm": 0.6484375,
"learning_rate": 2.8603577553790682e-05,
"loss": 0.5654,
"mean_token_accuracy": 0.7639918619394303,
"num_tokens": 121314895.0,
"step": 4000
},
{
"epoch": 0.7464079119238664,
"eval_entropy": 0.5262507380695518,
"eval_loss": 0.5490807890892029,
"eval_mean_token_accuracy": 0.7650549843770649,
"eval_num_tokens": 121314895.0,
"eval_runtime": 16.0402,
"eval_samples_per_second": 54.052,
"eval_steps_per_second": 6.795,
"step": 4000
},
{
"entropy": 0.5303604575991631,
"epoch": 0.7557380108229147,
"grad_norm": 0.65234375,
"learning_rate": 2.8568183937338984e-05,
"loss": 0.5609,
"mean_token_accuracy": 0.7618992066383362,
"num_tokens": 122793213.0,
"step": 4050
},
{
"entropy": 0.5326658990979195,
"epoch": 0.765068109721963,
"grad_norm": 1.1171875,
"learning_rate": 2.8532372470728608e-05,
"loss": 0.5552,
"mean_token_accuracy": 0.7656506180763245,
"num_tokens": 124227341.0,
"step": 4100
},
{
"entropy": 0.5209727981686592,
"epoch": 0.7743982086210114,
"grad_norm": 0.69140625,
"learning_rate": 2.8496144393899784e-05,
"loss": 0.5516,
"mean_token_accuracy": 0.7649285507202148,
"num_tokens": 125707121.0,
"step": 4150
},
{
"entropy": 0.5248242399096489,
"epoch": 0.7837283075200597,
"grad_norm": 0.734375,
"learning_rate": 2.8459500961217533e-05,
"loss": 0.5534,
"mean_token_accuracy": 0.7604096215963364,
"num_tokens": 127238194.0,
"step": 4200
},
{
"entropy": 0.532697811126709,
"epoch": 0.7930584064191081,
"grad_norm": 0.80078125,
"learning_rate": 2.842244344142819e-05,
"loss": 0.5622,
"mean_token_accuracy": 0.7587612766027451,
"num_tokens": 128737550.0,
"step": 4250
},
{
"entropy": 0.5316076844930648,
"epoch": 0.8023885053181564,
"grad_norm": 0.515625,
"learning_rate": 2.8384973117615488e-05,
"loss": 0.5562,
"mean_token_accuracy": 0.7626278126239776,
"num_tokens": 130249756.0,
"step": 4300
},
{
"entropy": 0.5310768684744835,
"epoch": 0.8117186042172047,
"grad_norm": 0.78515625,
"learning_rate": 2.8347091287156136e-05,
"loss": 0.5575,
"mean_token_accuracy": 0.7627124708890914,
"num_tokens": 131739377.0,
"step": 4350
},
{
"entropy": 0.5297018462419509,
"epoch": 0.821048703116253,
"grad_norm": 0.72265625,
"learning_rate": 2.8308799261674898e-05,
"loss": 0.5556,
"mean_token_accuracy": 0.7631601667404175,
"num_tokens": 133264527.0,
"step": 4400
},
{
"entropy": 0.5304474216699601,
"epoch": 0.8303788020153013,
"grad_norm": 1.140625,
"learning_rate": 2.8270098366999166e-05,
"loss": 0.5597,
"mean_token_accuracy": 0.7665414202213288,
"num_tokens": 134690231.0,
"step": 4450
},
{
"entropy": 0.5239725235104561,
"epoch": 0.8397089009143497,
"grad_norm": 0.796875,
"learning_rate": 2.8230989943113075e-05,
"loss": 0.5517,
"mean_token_accuracy": 0.763014947772026,
"num_tokens": 136226470.0,
"step": 4500
},
{
"entropy": 0.527112789452076,
"epoch": 0.849038999813398,
"grad_norm": 0.84375,
"learning_rate": 2.8191475344111103e-05,
"loss": 0.5524,
"mean_token_accuracy": 0.7613210624456406,
"num_tokens": 137780275.0,
"step": 4550
},
{
"entropy": 0.5286294043064117,
"epoch": 0.8583690987124464,
"grad_norm": 0.68359375,
"learning_rate": 2.8151555938151165e-05,
"loss": 0.557,
"mean_token_accuracy": 0.7616584074497222,
"num_tokens": 139330494.0,
"step": 4600
},
{
"entropy": 0.5276841628551483,
"epoch": 0.8676991976114947,
"grad_norm": 0.609375,
"learning_rate": 2.811123310740726e-05,
"loss": 0.5495,
"mean_token_accuracy": 0.7647832882404327,
"num_tokens": 140815597.0,
"step": 4650
},
{
"entropy": 0.5373046767711639,
"epoch": 0.8770292965105431,
"grad_norm": 0.67578125,
"learning_rate": 2.807050824802163e-05,
"loss": 0.5624,
"mean_token_accuracy": 0.758348998427391,
"num_tokens": 142383943.0,
"step": 4700
},
{
"entropy": 0.5215965616703033,
"epoch": 0.8863593954095913,
"grad_norm": 0.6171875,
"learning_rate": 2.802938277005638e-05,
"loss": 0.5462,
"mean_token_accuracy": 0.7664449107646942,
"num_tokens": 143879403.0,
"step": 4750
},
{
"entropy": 0.5327892461419106,
"epoch": 0.8956894943086396,
"grad_norm": 0.38671875,
"learning_rate": 2.7987858097444688e-05,
"loss": 0.5618,
"mean_token_accuracy": 0.7579188454151153,
"num_tokens": 145455384.0,
"step": 4800
},
{
"entropy": 0.5209793072938919,
"epoch": 0.905019593207688,
"grad_norm": 0.90625,
"learning_rate": 2.794593566794149e-05,
"loss": 0.5502,
"mean_token_accuracy": 0.7619763416051865,
"num_tokens": 147010897.0,
"step": 4850
},
{
"entropy": 0.5109856846928597,
"epoch": 0.9143496921067363,
"grad_norm": 0.4375,
"learning_rate": 2.7903616933073712e-05,
"loss": 0.5471,
"mean_token_accuracy": 0.7652358949184418,
"num_tokens": 148509259.0,
"step": 4900
},
{
"entropy": 0.5274619281291961,
"epoch": 0.9236797910057847,
"grad_norm": 1.1171875,
"learning_rate": 2.786090335808998e-05,
"loss": 0.5546,
"mean_token_accuracy": 0.7621842390298843,
"num_tokens": 149982645.0,
"step": 4950
},
{
"entropy": 0.5342949241399765,
"epoch": 0.933009889904833,
"grad_norm": 0.58984375,
"learning_rate": 2.7817796421909922e-05,
"loss": 0.5682,
"mean_token_accuracy": 0.7593452525138855,
"num_tokens": 151532149.0,
"step": 5000
},
{
"epoch": 0.933009889904833,
"eval_entropy": 0.5146727717797691,
"eval_loss": 0.5413097739219666,
"eval_mean_token_accuracy": 0.767384243120841,
"eval_num_tokens": 151532149.0,
"eval_runtime": 16.1495,
"eval_samples_per_second": 53.686,
"eval_steps_per_second": 6.749,
"step": 5000
},
{
"entropy": 0.5231131237745285,
"epoch": 0.9423399888038814,
"grad_norm": 0.408203125,
"learning_rate": 2.7774297617072963e-05,
"loss": 0.554,
"mean_token_accuracy": 0.7625243580341339,
"num_tokens": 153059857.0,
"step": 5050
},
{
"entropy": 0.5220058736205101,
"epoch": 0.9516700877029296,
"grad_norm": 1.140625,
"learning_rate": 2.7730408449686593e-05,
"loss": 0.5496,
"mean_token_accuracy": 0.7626436889171601,
"num_tokens": 154578128.0,
"step": 5100
},
{
"entropy": 0.5289176645874977,
"epoch": 0.961000186601978,
"grad_norm": 1.046875,
"learning_rate": 2.7686130439374304e-05,
"loss": 0.5555,
"mean_token_accuracy": 0.7623570781946182,
"num_tokens": 156133259.0,
"step": 5150
},
{
"entropy": 0.5238713613152504,
"epoch": 0.9703302855010263,
"grad_norm": 0.92578125,
"learning_rate": 2.7641465119222893e-05,
"loss": 0.5525,
"mean_token_accuracy": 0.7626954644918442,
"num_tokens": 157655593.0,
"step": 5200
},
{
"entropy": 0.526420825123787,
"epoch": 0.9796603844000746,
"grad_norm": 0.53125,
"learning_rate": 2.7596414035729436e-05,
"loss": 0.5523,
"mean_token_accuracy": 0.7634574353694916,
"num_tokens": 159173170.0,
"step": 5250
},
{
"entropy": 0.5410353738069534,
"epoch": 0.988990483299123,
"grad_norm": 0.70703125,
"learning_rate": 2.755097874874772e-05,
"loss": 0.5616,
"mean_token_accuracy": 0.7588550513982772,
"num_tokens": 160746023.0,
"step": 5300
},
{
"entropy": 0.5253653234243393,
"epoch": 0.9983205821981713,
"grad_norm": 0.984375,
"learning_rate": 2.7505160831434235e-05,
"loss": 0.5538,
"mean_token_accuracy": 0.7632267904281617,
"num_tokens": 162234796.0,
"step": 5350
},
{
"entropy": 0.4956328600645065,
"epoch": 1.0076506810972197,
"grad_norm": 2.28125,
"learning_rate": 2.7458961870193697e-05,
"loss": 0.5385,
"mean_token_accuracy": 0.7668155688047409,
"num_tokens": 163759549.0,
"step": 5400
},
{
"entropy": 0.4777234876155853,
"epoch": 1.016980779996268,
"grad_norm": 0.8203125,
"learning_rate": 2.741238346462415e-05,
"loss": 0.5267,
"mean_token_accuracy": 0.7712256401777268,
"num_tokens": 165218690.0,
"step": 5450
},
{
"entropy": 0.4960213273763657,
"epoch": 1.0263108788953164,
"grad_norm": 0.45703125,
"learning_rate": 2.7365427227461538e-05,
"loss": 0.5316,
"mean_token_accuracy": 0.7680883568525314,
"num_tokens": 166779383.0,
"step": 5500
},
{
"entropy": 0.48663112640380857,
"epoch": 1.0356409777943647,
"grad_norm": 0.66015625,
"learning_rate": 2.731809478452392e-05,
"loss": 0.5271,
"mean_token_accuracy": 0.772123327255249,
"num_tokens": 168262830.0,
"step": 5550
},
{
"entropy": 0.501155666410923,
"epoch": 1.044971076693413,
"grad_norm": 0.984375,
"learning_rate": 2.72703877746551e-05,
"loss": 0.5439,
"mean_token_accuracy": 0.7648611211776734,
"num_tokens": 169844757.0,
"step": 5600
},
{
"entropy": 0.5043464726209641,
"epoch": 1.0543011755924612,
"grad_norm": 0.69140625,
"learning_rate": 2.7222307849667976e-05,
"loss": 0.5458,
"mean_token_accuracy": 0.7605859559774398,
"num_tokens": 171432040.0,
"step": 5650
},
{
"entropy": 0.4868525117635727,
"epoch": 1.0636312744915095,
"grad_norm": 0.81640625,
"learning_rate": 2.7173856674287276e-05,
"loss": 0.532,
"mean_token_accuracy": 0.7668398702144623,
"num_tokens": 172949249.0,
"step": 5700
},
{
"entropy": 0.48061770796775816,
"epoch": 1.0729613733905579,
"grad_norm": 0.412109375,
"learning_rate": 2.7125035926091948e-05,
"loss": 0.5247,
"mean_token_accuracy": 0.7707643383741378,
"num_tokens": 174427624.0,
"step": 5750
},
{
"entropy": 0.49181210845708845,
"epoch": 1.0822914722896062,
"grad_norm": 0.703125,
"learning_rate": 2.7075847295457074e-05,
"loss": 0.5387,
"mean_token_accuracy": 0.7684424781799316,
"num_tokens": 175889073.0,
"step": 5800
},
{
"entropy": 0.47926140516996385,
"epoch": 1.0916215711886545,
"grad_norm": 0.828125,
"learning_rate": 2.702629248549533e-05,
"loss": 0.5243,
"mean_token_accuracy": 0.7723050940036774,
"num_tokens": 177318508.0,
"step": 5850
},
{
"entropy": 0.49550765454769136,
"epoch": 1.100951670087703,
"grad_norm": 1.046875,
"learning_rate": 2.6976373211998036e-05,
"loss": 0.5369,
"mean_token_accuracy": 0.7672231763601303,
"num_tokens": 178841949.0,
"step": 5900
},
{
"entropy": 0.4972337147593498,
"epoch": 1.1102817689867512,
"grad_norm": 0.57421875,
"learning_rate": 2.6926091203375736e-05,
"loss": 0.5359,
"mean_token_accuracy": 0.7673702806234359,
"num_tokens": 180355456.0,
"step": 5950
},
{
"entropy": 0.49163941740989686,
"epoch": 1.1196118678857996,
"grad_norm": 0.640625,
"learning_rate": 2.6875448200598356e-05,
"loss": 0.53,
"mean_token_accuracy": 0.76685063123703,
"num_tokens": 181895417.0,
"step": 6000
},
{
"epoch": 1.1196118678857996,
"eval_entropy": 0.49277083069906324,
"eval_loss": 0.5361812114715576,
"eval_mean_token_accuracy": 0.7686252342451603,
"eval_num_tokens": 181895417.0,
"eval_runtime": 16.2137,
"eval_samples_per_second": 53.473,
"eval_steps_per_second": 6.723,
"step": 6000
},
{
"entropy": 0.494234202504158,
"epoch": 1.128941966784848,
"grad_norm": 0.67578125,
"learning_rate": 2.682444595713492e-05,
"loss": 0.534,
"mean_token_accuracy": 0.7677116429805756,
"num_tokens": 183441033.0,
"step": 6050
},
{
"entropy": 0.4924536618590355,
"epoch": 1.1382720656838963,
"grad_norm": 1.4375,
"learning_rate": 2.6773086238892847e-05,
"loss": 0.5325,
"mean_token_accuracy": 0.7706644636392593,
"num_tokens": 184930303.0,
"step": 6100
},
{
"entropy": 0.49195749253034593,
"epoch": 1.1476021645829446,
"grad_norm": 0.921875,
"learning_rate": 2.67213708241568e-05,
"loss": 0.5323,
"mean_token_accuracy": 0.769943385720253,
"num_tokens": 186461707.0,
"step": 6150
},
{
"entropy": 0.4922616305947304,
"epoch": 1.156932263481993,
"grad_norm": 1.046875,
"learning_rate": 2.666930150352712e-05,
"loss": 0.5331,
"mean_token_accuracy": 0.765140592455864,
"num_tokens": 188051673.0,
"step": 6200
},
{
"entropy": 0.4984354588389397,
"epoch": 1.1662623623810413,
"grad_norm": 0.98046875,
"learning_rate": 2.661688007985782e-05,
"loss": 0.5366,
"mean_token_accuracy": 0.7678897380828857,
"num_tokens": 189540976.0,
"step": 6250
},
{
"entropy": 0.4744232183694839,
"epoch": 1.1755924612800897,
"grad_norm": 0.79296875,
"learning_rate": 2.6564108368194174e-05,
"loss": 0.5188,
"mean_token_accuracy": 0.7747587919235229,
"num_tokens": 190972681.0,
"step": 6300
},
{
"entropy": 0.48836414963006974,
"epoch": 1.184922560179138,
"grad_norm": 0.7265625,
"learning_rate": 2.6510988195709867e-05,
"loss": 0.5945,
"mean_token_accuracy": 0.7682410633563995,
"num_tokens": 192480163.0,
"step": 6350
},
{
"entropy": 0.481321419775486,
"epoch": 1.1942526590781863,
"grad_norm": 0.58984375,
"learning_rate": 2.6457521401643724e-05,
"loss": 0.5223,
"mean_token_accuracy": 0.7744923168420792,
"num_tokens": 193963819.0,
"step": 6400
},
{
"entropy": 0.4845267793536186,
"epoch": 1.2035827579772345,
"grad_norm": 0.73828125,
"learning_rate": 2.640370983723605e-05,
"loss": 0.5331,
"mean_token_accuracy": 0.7681414604187011,
"num_tokens": 195501297.0,
"step": 6450
},
{
"entropy": 0.4920358270406723,
"epoch": 1.2129128568762828,
"grad_norm": 1.1171875,
"learning_rate": 2.6349555365664496e-05,
"loss": 0.5315,
"mean_token_accuracy": 0.7675491815805435,
"num_tokens": 197047201.0,
"step": 6500
},
{
"entropy": 0.4892509970068932,
"epoch": 1.2222429557753312,
"grad_norm": 0.46875,
"learning_rate": 2.6295059861979585e-05,
"loss": 0.5324,
"mean_token_accuracy": 0.7672034209966659,
"num_tokens": 198554820.0,
"step": 6550
},
{
"entropy": 0.48003152668476107,
"epoch": 1.2315730546743795,
"grad_norm": 0.82421875,
"learning_rate": 2.6240225213039762e-05,
"loss": 0.5256,
"mean_token_accuracy": 0.7719682443141938,
"num_tokens": 200055450.0,
"step": 6600
},
{
"entropy": 0.49669885337352754,
"epoch": 1.2409031535734278,
"grad_norm": 0.5390625,
"learning_rate": 2.6185053317446094e-05,
"loss": 0.5394,
"mean_token_accuracy": 0.7652375429868699,
"num_tokens": 201621193.0,
"step": 6650
},
{
"entropy": 0.47468378067016603,
"epoch": 1.2502332524724762,
"grad_norm": 0.298828125,
"learning_rate": 2.6129546085476494e-05,
"loss": 0.5181,
"mean_token_accuracy": 0.7740450286865235,
"num_tokens": 203115630.0,
"step": 6700
},
{
"entropy": 0.48166996002197265,
"epoch": 1.2595633513715245,
"grad_norm": 0.53515625,
"learning_rate": 2.6073705439019604e-05,
"loss": 0.5235,
"mean_token_accuracy": 0.7715310126543045,
"num_tokens": 204659783.0,
"step": 6750
},
{
"entropy": 0.487423982322216,
"epoch": 1.2688934502705729,
"grad_norm": 0.94140625,
"learning_rate": 2.6017533311508262e-05,
"loss": 0.5271,
"mean_token_accuracy": 0.7717793607711791,
"num_tokens": 206207207.0,
"step": 6800
},
{
"entropy": 0.490067283809185,
"epoch": 1.2782235491696212,
"grad_norm": 0.98046875,
"learning_rate": 2.5961031647852525e-05,
"loss": 0.5291,
"mean_token_accuracy": 0.7670228743553161,
"num_tokens": 207776741.0,
"step": 6850
},
{
"entropy": 0.47844831019639966,
"epoch": 1.2875536480686696,
"grad_norm": 0.60546875,
"learning_rate": 2.590420240437236e-05,
"loss": 0.5286,
"mean_token_accuracy": 0.7723558592796326,
"num_tokens": 209300786.0,
"step": 6900
},
{
"entropy": 0.4823366206884384,
"epoch": 1.296883746967718,
"grad_norm": 0.828125,
"learning_rate": 2.584704754872988e-05,
"loss": 0.5294,
"mean_token_accuracy": 0.7727249205112457,
"num_tokens": 210784669.0,
"step": 6950
},
{
"entropy": 0.4943872797489166,
"epoch": 1.3062138458667663,
"grad_norm": 0.8046875,
"learning_rate": 2.578956905986124e-05,
"loss": 0.5374,
"mean_token_accuracy": 0.7674384766817093,
"num_tokens": 212346372.0,
"step": 7000
},
{
"epoch": 1.3062138458667663,
"eval_entropy": 0.48885188408947866,
"eval_loss": 0.5305144190788269,
"eval_mean_token_accuracy": 0.7706459596616413,
"eval_num_tokens": 212346372.0,
"eval_runtime": 16.2232,
"eval_samples_per_second": 53.442,
"eval_steps_per_second": 6.719,
"step": 7000
},
{
"entropy": 0.48752534478902815,
"epoch": 1.3155439447658146,
"grad_norm": 0.609375,
"learning_rate": 2.573176892790812e-05,
"loss": 0.5315,
"mean_token_accuracy": 0.7696154469251633,
"num_tokens": 213860843.0,
"step": 7050
},
{
"entropy": 0.4900174245238304,
"epoch": 1.3248740436648627,
"grad_norm": 0.53515625,
"learning_rate": 2.567364915414877e-05,
"loss": 0.5292,
"mean_token_accuracy": 0.7694221770763398,
"num_tokens": 215367383.0,
"step": 7100
},
{
"entropy": 0.48771278649568556,
"epoch": 1.334204142563911,
"grad_norm": 1.5859375,
"learning_rate": 2.5615211750928794e-05,
"loss": 0.5353,
"mean_token_accuracy": 0.7697239458560944,
"num_tokens": 216845421.0,
"step": 7150
},
{
"entropy": 0.47983416020870207,
"epoch": 1.3435342414629594,
"grad_norm": 0.6015625,
"learning_rate": 2.555645874159141e-05,
"loss": 0.5234,
"mean_token_accuracy": 0.7707635217905044,
"num_tokens": 218372903.0,
"step": 7200
},
{
"entropy": 0.47865484192967417,
"epoch": 1.3528643403620078,
"grad_norm": 0.59765625,
"learning_rate": 2.549739216040743e-05,
"loss": 0.5221,
"mean_token_accuracy": 0.7720851230621338,
"num_tokens": 219883662.0,
"step": 7250
},
{
"entropy": 0.4899566939473152,
"epoch": 1.362194439261056,
"grad_norm": 0.453125,
"learning_rate": 2.5438014052504802e-05,
"loss": 0.532,
"mean_token_accuracy": 0.7674814122915268,
"num_tokens": 221426709.0,
"step": 7300
},
{
"entropy": 0.4817213848233223,
"epoch": 1.3715245381601044,
"grad_norm": 0.53125,
"learning_rate": 2.5378326473797818e-05,
"loss": 0.5265,
"mean_token_accuracy": 0.769315534234047,
"num_tokens": 222950520.0,
"step": 7350
},
{
"entropy": 0.4752693668007851,
"epoch": 1.3808546370591528,
"grad_norm": 0.5859375,
"learning_rate": 2.5318331490915925e-05,
"loss": 0.5195,
"mean_token_accuracy": 0.7739131230115891,
"num_tokens": 224448854.0,
"step": 7400
},
{
"entropy": 0.4732086658477783,
"epoch": 1.3901847359582011,
"grad_norm": 0.9140625,
"learning_rate": 2.525803118113215e-05,
"loss": 0.5225,
"mean_token_accuracy": 0.7735245388746261,
"num_tokens": 225932263.0,
"step": 7450
},
{
"entropy": 0.48352263927459715,
"epoch": 1.3995148348572495,
"grad_norm": 0.546875,
"learning_rate": 2.5197427632291214e-05,
"loss": 0.531,
"mean_token_accuracy": 0.7676544332504273,
"num_tokens": 227452316.0,
"step": 7500
},
{
"entropy": 0.48029813915491104,
"epoch": 1.4088449337562978,
"grad_norm": 0.61328125,
"learning_rate": 2.513652294273721e-05,
"loss": 0.5257,
"mean_token_accuracy": 0.7688204735517502,
"num_tokens": 228995142.0,
"step": 7550
},
{
"entropy": 0.49092908948659897,
"epoch": 1.4181750326553462,
"grad_norm": 0.90625,
"learning_rate": 2.507531922124096e-05,
"loss": 0.5515,
"mean_token_accuracy": 0.7691348105669021,
"num_tokens": 230473945.0,
"step": 7600
},
{
"entropy": 0.46930390000343325,
"epoch": 1.4275051315543945,
"grad_norm": 0.91015625,
"learning_rate": 2.501381858692701e-05,
"loss": 0.5192,
"mean_token_accuracy": 0.7731947559118271,
"num_tokens": 231969989.0,
"step": 7650
},
{
"entropy": 0.4841861927509308,
"epoch": 1.4368352304534429,
"grad_norm": 0.74609375,
"learning_rate": 2.495202316920024e-05,
"loss": 0.5281,
"mean_token_accuracy": 0.7702859449386597,
"num_tokens": 233496905.0,
"step": 7700
},
{
"entropy": 0.47611463099718093,
"epoch": 1.4461653293524912,
"grad_norm": 0.9140625,
"learning_rate": 2.488993510767214e-05,
"loss": 0.5292,
"mean_token_accuracy": 0.7676136875152588,
"num_tokens": 235025876.0,
"step": 7750
},
{
"entropy": 0.480805746614933,
"epoch": 1.4554954282515395,
"grad_norm": 0.447265625,
"learning_rate": 2.4827556552086753e-05,
"loss": 0.5228,
"mean_token_accuracy": 0.7714346569776535,
"num_tokens": 236573132.0,
"step": 7800
},
{
"entropy": 0.4773865479230881,
"epoch": 1.464825527150588,
"grad_norm": 0.8828125,
"learning_rate": 2.47648896622462e-05,
"loss": 0.5233,
"mean_token_accuracy": 0.7716735368967056,
"num_tokens": 238108249.0,
"step": 7850
},
{
"entropy": 0.49567407727241514,
"epoch": 1.4741556260496362,
"grad_norm": 0.49609375,
"learning_rate": 2.4701936607935922e-05,
"loss": 0.548,
"mean_token_accuracy": 0.7675740510225296,
"num_tokens": 239609374.0,
"step": 7900
},
{
"entropy": 0.47016422227025034,
"epoch": 1.4834857249486846,
"grad_norm": 0.65625,
"learning_rate": 2.463869956884957e-05,
"loss": 0.5141,
"mean_token_accuracy": 0.7800671440362931,
"num_tokens": 241035036.0,
"step": 7950
},
{
"entropy": 0.4691419780254364,
"epoch": 1.4928158238477327,
"grad_norm": 0.6640625,
"learning_rate": 2.457518073451348e-05,
"loss": 0.5183,
"mean_token_accuracy": 0.7747016477584839,
"num_tokens": 242482052.0,
"step": 8000
},
{
"epoch": 1.4928158238477327,
"eval_entropy": 0.48408404293410273,
"eval_loss": 0.5262120962142944,
"eval_mean_token_accuracy": 0.7715235532970603,
"eval_num_tokens": 242482052.0,
"eval_runtime": 16.1685,
"eval_samples_per_second": 53.623,
"eval_steps_per_second": 6.741,
"step": 8000
},
{
"entropy": 0.48640842020511627,
"epoch": 1.5021459227467813,
"grad_norm": 0.8671875,
"learning_rate": 2.451138230421094e-05,
"loss": 0.5291,
"mean_token_accuracy": 0.768261170387268,
"num_tokens": 244051944.0,
"step": 8050
},
{
"entropy": 0.4861644932627678,
"epoch": 1.5114760216458294,
"grad_norm": 0.86328125,
"learning_rate": 2.4447306486905965e-05,
"loss": 0.5303,
"mean_token_accuracy": 0.7654324793815612,
"num_tokens": 245667242.0,
"step": 8100
},
{
"entropy": 0.46581872284412384,
"epoch": 1.5208061205448777,
"grad_norm": 0.64453125,
"learning_rate": 2.4382955501166878e-05,
"loss": 0.517,
"mean_token_accuracy": 0.7791347569227218,
"num_tokens": 247084677.0,
"step": 8150
},
{
"entropy": 0.48655702769756315,
"epoch": 1.530136219443926,
"grad_norm": 0.5859375,
"learning_rate": 2.4318331575089437e-05,
"loss": 0.5283,
"mean_token_accuracy": 0.7677739357948303,
"num_tokens": 248666161.0,
"step": 8200
},
{
"entropy": 0.48474507868289946,
"epoch": 1.5394663183429744,
"grad_norm": 0.83203125,
"learning_rate": 2.425343694621974e-05,
"loss": 0.5218,
"mean_token_accuracy": 0.7710594099760055,
"num_tokens": 250228344.0,
"step": 8250
},
{
"entropy": 0.501356900036335,
"epoch": 1.5487964172420228,
"grad_norm": 1.09375,
"learning_rate": 2.418827386147672e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.7636065500974655,
"num_tokens": 251842667.0,
"step": 8300
},
{
"entropy": 0.48092112705111506,
"epoch": 1.5581265161410711,
"grad_norm": 0.65625,
"learning_rate": 2.4122844577074344e-05,
"loss": 0.5246,
"mean_token_accuracy": 0.7711791855096817,
"num_tokens": 253387970.0,
"step": 8350
},
{
"entropy": 0.4703651532530785,
"epoch": 1.5674566150401195,
"grad_norm": 0.76171875,
"learning_rate": 2.4057151358443537e-05,
"loss": 0.523,
"mean_token_accuracy": 0.7739822679758072,
"num_tokens": 254893911.0,
"step": 8400
},
{
"entropy": 0.48381629049777986,
"epoch": 1.5767867139391678,
"grad_norm": 0.5390625,
"learning_rate": 2.3991196480153678e-05,
"loss": 0.5295,
"mean_token_accuracy": 0.7688455355167388,
"num_tokens": 256399394.0,
"step": 8450
},
{
"entropy": 0.4885594379901886,
"epoch": 1.586116812838216,
"grad_norm": 0.73828125,
"learning_rate": 2.39249822258339e-05,
"loss": 0.5269,
"mean_token_accuracy": 0.7714411211013794,
"num_tokens": 257935448.0,
"step": 8500
},
{
"entropy": 0.48577941954135895,
"epoch": 1.5954469117372643,
"grad_norm": 0.8828125,
"learning_rate": 2.3858510888093997e-05,
"loss": 0.5231,
"mean_token_accuracy": 0.7718437218666077,
"num_tokens": 259486589.0,
"step": 8550
},
{
"entropy": 0.48262311398983004,
"epoch": 1.6047770106363126,
"grad_norm": 0.578125,
"learning_rate": 2.3791784768445045e-05,
"loss": 0.5248,
"mean_token_accuracy": 0.7686738175153732,
"num_tokens": 261065847.0,
"step": 8600
},
{
"entropy": 0.471617269217968,
"epoch": 1.614107109535361,
"grad_norm": 0.9765625,
"learning_rate": 2.3724806177219723e-05,
"loss": 0.5203,
"mean_token_accuracy": 0.772919489145279,
"num_tokens": 262600411.0,
"step": 8650
},
{
"entropy": 0.47281612068414686,
"epoch": 1.6234372084344093,
"grad_norm": 1.9140625,
"learning_rate": 2.36575774334923e-05,
"loss": 0.5169,
"mean_token_accuracy": 0.774740971326828,
"num_tokens": 264141493.0,
"step": 8700
},
{
"entropy": 0.47484747022390367,
"epoch": 1.6327673073334577,
"grad_norm": 0.609375,
"learning_rate": 2.359010086499838e-05,
"loss": 0.5184,
"mean_token_accuracy": 0.7732387953996658,
"num_tokens": 265636375.0,
"step": 8750
},
{
"entropy": 0.47206893771886826,
"epoch": 1.642097406232506,
"grad_norm": 0.7890625,
"learning_rate": 2.352237880805426e-05,
"loss": 0.5303,
"mean_token_accuracy": 0.7746653699874878,
"num_tokens": 267090177.0,
"step": 8800
},
{
"entropy": 0.4694141258299351,
"epoch": 1.6514275051315543,
"grad_norm": 0.64453125,
"learning_rate": 2.3454413607476044e-05,
"loss": 0.5199,
"mean_token_accuracy": 0.7748447281122207,
"num_tokens": 268551821.0,
"step": 8850
},
{
"entropy": 0.47594692051410675,
"epoch": 1.6607576040306027,
"grad_norm": 1.125,
"learning_rate": 2.3386207616498503e-05,
"loss": 0.5186,
"mean_token_accuracy": 0.773201887011528,
"num_tokens": 270084312.0,
"step": 8900
},
{
"entropy": 0.47182066380977633,
"epoch": 1.670087702929651,
"grad_norm": 0.66796875,
"learning_rate": 2.331776319669354e-05,
"loss": 0.5286,
"mean_token_accuracy": 0.7718379843235016,
"num_tokens": 271546211.0,
"step": 8950
},
{
"entropy": 0.4656666761636734,
"epoch": 1.6794178018286994,
"grad_norm": 0.490234375,
"learning_rate": 2.324908271788844e-05,
"loss": 0.5158,
"mean_token_accuracy": 0.7762594664096832,
"num_tokens": 272998807.0,
"step": 9000
},
{
"epoch": 1.6794178018286994,
"eval_entropy": 0.4748075161505183,
"eval_loss": 0.5214188694953918,
"eval_mean_token_accuracy": 0.7731986155203723,
"eval_num_tokens": 272998807.0,
"eval_runtime": 16.1272,
"eval_samples_per_second": 53.76,
"eval_steps_per_second": 6.759,
"step": 9000
},
{
"entropy": 0.4896981066465378,
"epoch": 1.6887479007277477,
"grad_norm": 0.828125,
"learning_rate": 2.3180168558083844e-05,
"loss": 0.5287,
"mean_token_accuracy": 0.7671917879581451,
"num_tokens": 274587533.0,
"step": 9050
},
{
"entropy": 0.4765408200025558,
"epoch": 1.698077999626796,
"grad_norm": 1.921875,
"learning_rate": 2.311102310337136e-05,
"loss": 0.5188,
"mean_token_accuracy": 0.7724708420038223,
"num_tokens": 276131737.0,
"step": 9100
},
{
"entropy": 0.4824476379156113,
"epoch": 1.7074080985258444,
"grad_norm": 0.46875,
"learning_rate": 2.304164874785101e-05,
"loss": 0.5267,
"mean_token_accuracy": 0.7684733641147613,
"num_tokens": 277715304.0,
"step": 9150
},
{
"entropy": 0.48801319271326066,
"epoch": 1.7167381974248928,
"grad_norm": 0.7890625,
"learning_rate": 2.297204789354827e-05,
"loss": 0.5345,
"mean_token_accuracy": 0.7730580461025238,
"num_tokens": 279224176.0,
"step": 9200
},
{
"entropy": 0.47024243041872976,
"epoch": 1.726068296323941,
"grad_norm": 0.70703125,
"learning_rate": 2.2902222950330966e-05,
"loss": 0.5208,
"mean_token_accuracy": 0.7733591181039811,
"num_tokens": 280746272.0,
"step": 9250
},
{
"entropy": 0.4675961661338806,
"epoch": 1.7353983952229894,
"grad_norm": 0.9140625,
"learning_rate": 2.283217633582578e-05,
"loss": 0.5249,
"mean_token_accuracy": 0.7734372174739838,
"num_tokens": 282236117.0,
"step": 9300
},
{
"entropy": 0.48016302675008776,
"epoch": 1.7447284941220378,
"grad_norm": 1.4921875,
"learning_rate": 2.276191047533458e-05,
"loss": 0.5231,
"mean_token_accuracy": 0.770177600979805,
"num_tokens": 283794545.0,
"step": 9350
},
{
"entropy": 0.47259823501110076,
"epoch": 1.7540585930210861,
"grad_norm": 0.71484375,
"learning_rate": 2.269142780175042e-05,
"loss": 0.5192,
"mean_token_accuracy": 0.7728340399265289,
"num_tokens": 285333807.0,
"step": 9400
},
{
"entropy": 0.47532046377658843,
"epoch": 1.7633886919201345,
"grad_norm": 0.86328125,
"learning_rate": 2.2620730755473328e-05,
"loss": 0.5259,
"mean_token_accuracy": 0.7706443351507187,
"num_tokens": 286852697.0,
"step": 9450
},
{
"entropy": 0.4715300849080086,
"epoch": 1.7727187908191828,
"grad_norm": 0.6640625,
"learning_rate": 2.2549821784325787e-05,
"loss": 0.5226,
"mean_token_accuracy": 0.7710268515348434,
"num_tokens": 288390169.0,
"step": 9500
},
{
"entropy": 0.45817104071378706,
"epoch": 1.7820488897182312,
"grad_norm": 0.65234375,
"learning_rate": 2.2478703343467995e-05,
"loss": 0.5084,
"mean_token_accuracy": 0.7808880287408829,
"num_tokens": 289825517.0,
"step": 9550
},
{
"entropy": 0.46380849391222,
"epoch": 1.7913789886172793,
"grad_norm": 0.609375,
"learning_rate": 2.2407377895312848e-05,
"loss": 0.522,
"mean_token_accuracy": 0.7710244971513748,
"num_tokens": 291345905.0,
"step": 9600
},
{
"entropy": 0.4663406872749329,
"epoch": 1.8007090875163276,
"grad_norm": 0.97265625,
"learning_rate": 2.2335847909440697e-05,
"loss": 0.5174,
"mean_token_accuracy": 0.7765918165445328,
"num_tokens": 292843956.0,
"step": 9650
},
{
"entropy": 0.46827891767024993,
"epoch": 1.810039186415376,
"grad_norm": 1.4765625,
"learning_rate": 2.226411586251381e-05,
"loss": 0.5135,
"mean_token_accuracy": 0.7766558998823165,
"num_tokens": 294304944.0,
"step": 9700
},
{
"entropy": 0.4667487397789955,
"epoch": 1.8193692853144243,
"grad_norm": 0.75390625,
"learning_rate": 2.2192184238190666e-05,
"loss": 0.5173,
"mean_token_accuracy": 0.7746782380342484,
"num_tokens": 295807703.0,
"step": 9750
},
{
"entropy": 0.4796573233604431,
"epoch": 1.8286993842134727,
"grad_norm": 0.80078125,
"learning_rate": 2.2120055527039914e-05,
"loss": 0.5265,
"mean_token_accuracy": 0.7696005600690842,
"num_tokens": 297318135.0,
"step": 9800
},
{
"entropy": 0.47267288982868194,
"epoch": 1.838029483112521,
"grad_norm": 0.9140625,
"learning_rate": 2.2047732226454157e-05,
"loss": 0.5153,
"mean_token_accuracy": 0.7761634987592697,
"num_tokens": 298791062.0,
"step": 9850
},
{
"entropy": 0.47834465265274045,
"epoch": 1.8473595820115694,
"grad_norm": 0.625,
"learning_rate": 2.1975216840563502e-05,
"loss": 0.5222,
"mean_token_accuracy": 0.7747543674707412,
"num_tokens": 300275324.0,
"step": 9900
},
{
"entropy": 0.46263691544532776,
"epoch": 1.8566896809106177,
"grad_norm": 1.0,
"learning_rate": 2.1902511880148835e-05,
"loss": 0.5148,
"mean_token_accuracy": 0.7750599044561386,
"num_tokens": 301729884.0,
"step": 9950
},
{
"entropy": 0.4763942888379097,
"epoch": 1.8660197798096658,
"grad_norm": 0.796875,
"learning_rate": 2.1829619862554877e-05,
"loss": 0.5183,
"mean_token_accuracy": 0.7708618581295014,
"num_tokens": 303287949.0,
"step": 10000
},
{
"epoch": 1.8660197798096658,
"eval_entropy": 0.4685867871713201,
"eval_loss": 0.5169408321380615,
"eval_mean_token_accuracy": 0.7741667956387231,
"eval_num_tokens": 303287949.0,
"eval_runtime": 16.0644,
"eval_samples_per_second": 53.97,
"eval_steps_per_second": 6.785,
"step": 10000
},
{
"entropy": 0.47606048226356507,
"epoch": 1.8753498787087142,
"grad_norm": 1.109375,
"learning_rate": 2.175654331160305e-05,
"loss": 0.5189,
"mean_token_accuracy": 0.7730025327205658,
"num_tokens": 304818318.0,
"step": 10050
},
{
"entropy": 0.4616896215081215,
"epoch": 1.8846799776077625,
"grad_norm": 0.9140625,
"learning_rate": 2.168328475750408e-05,
"loss": 0.5089,
"mean_token_accuracy": 0.7780868858098984,
"num_tokens": 306331367.0,
"step": 10100
},
{
"entropy": 0.47027878910303117,
"epoch": 1.8940100765068109,
"grad_norm": 0.53125,
"learning_rate": 2.160984673677039e-05,
"loss": 0.5186,
"mean_token_accuracy": 0.772971043586731,
"num_tokens": 307830293.0,
"step": 10150
},
{
"entropy": 0.4677613499760628,
"epoch": 1.9033401754058592,
"grad_norm": 0.376953125,
"learning_rate": 2.153623179212827e-05,
"loss": 0.5163,
"mean_token_accuracy": 0.7740986323356629,
"num_tokens": 309360930.0,
"step": 10200
},
{
"entropy": 0.47715963318943977,
"epoch": 1.9126702743049075,
"grad_norm": 0.73046875,
"learning_rate": 2.146244247242985e-05,
"loss": 0.5271,
"mean_token_accuracy": 0.7717376494407654,
"num_tokens": 310855697.0,
"step": 10250
},
{
"entropy": 0.46572228729724885,
"epoch": 1.922000373203956,
"grad_norm": 0.71875,
"learning_rate": 2.1388481332564835e-05,
"loss": 0.5145,
"mean_token_accuracy": 0.774823442697525,
"num_tokens": 312370909.0,
"step": 10300
},
{
"entropy": 0.46713058680295944,
"epoch": 1.9313304721030042,
"grad_norm": 0.404296875,
"learning_rate": 2.1314350933372053e-05,
"loss": 0.5129,
"mean_token_accuracy": 0.7726266753673553,
"num_tokens": 313900324.0,
"step": 10350
},
{
"entropy": 0.4743108308315277,
"epoch": 1.9406605710020526,
"grad_norm": 0.8359375,
"learning_rate": 2.1240053841550792e-05,
"loss": 0.5226,
"mean_token_accuracy": 0.7713726377487182,
"num_tokens": 315441715.0,
"step": 10400
},
{
"entropy": 0.4706794250011444,
"epoch": 1.949990669901101,
"grad_norm": 0.9453125,
"learning_rate": 2.1165592629571923e-05,
"loss": 0.517,
"mean_token_accuracy": 0.7740881043672562,
"num_tokens": 316910090.0,
"step": 10450
},
{
"entropy": 0.4839508882164955,
"epoch": 1.9593207688001493,
"grad_norm": 0.81640625,
"learning_rate": 2.1090969875588827e-05,
"loss": 0.5236,
"mean_token_accuracy": 0.7707830715179443,
"num_tokens": 318480506.0,
"step": 10500
},
{
"entropy": 0.4602850756049156,
"epoch": 1.9686508676991976,
"grad_norm": 2.34375,
"learning_rate": 2.1016188163348126e-05,
"loss": 0.5121,
"mean_token_accuracy": 0.7764248877763749,
"num_tokens": 320008211.0,
"step": 10550
},
{
"entropy": 0.4730991995334625,
"epoch": 1.977980966598246,
"grad_norm": 0.51171875,
"learning_rate": 2.0941250082100253e-05,
"loss": 0.5288,
"mean_token_accuracy": 0.7702510052919388,
"num_tokens": 321525032.0,
"step": 10600
},
{
"entropy": 0.446844310760498,
"epoch": 1.9873110654972943,
"grad_norm": 0.296875,
"learning_rate": 2.0866158226509758e-05,
"loss": 0.5058,
"mean_token_accuracy": 0.7796232843399048,
"num_tokens": 322954203.0,
"step": 10650
},
{
"entropy": 0.4643040466308594,
"epoch": 1.9966411643963426,
"grad_norm": 1.0,
"learning_rate": 2.07909151965655e-05,
"loss": 0.5125,
"mean_token_accuracy": 0.7746139895915986,
"num_tokens": 324473634.0,
"step": 10700
},
{
"entropy": 0.46121720626950263,
"epoch": 2.005971263295391,
"grad_norm": 0.7109375,
"learning_rate": 2.071552359749062e-05,
"loss": 0.5151,
"mean_token_accuracy": 0.7720960187911987,
"num_tokens": 325999882.0,
"step": 10750
},
{
"entropy": 0.425725160241127,
"epoch": 2.0153013621944393,
"grad_norm": 0.474609375,
"learning_rate": 2.063998603965232e-05,
"loss": 0.4962,
"mean_token_accuracy": 0.7846533066034317,
"num_tokens": 327442405.0,
"step": 10800
},
{
"entropy": 0.45527834951877594,
"epoch": 2.0246314610934877,
"grad_norm": 1.1328125,
"learning_rate": 2.056430513847151e-05,
"loss": 0.5079,
"mean_token_accuracy": 0.7760635191202163,
"num_tokens": 329012059.0,
"step": 10850
},
{
"entropy": 0.4397959718108177,
"epoch": 2.033961559992536,
"grad_norm": 1.03125,
"learning_rate": 2.0488483514332225e-05,
"loss": 0.4988,
"mean_token_accuracy": 0.7809435164928437,
"num_tokens": 330491757.0,
"step": 10900
},
{
"entropy": 0.4280716378986835,
"epoch": 2.0432916588915844,
"grad_norm": 0.796875,
"learning_rate": 2.041252379249091e-05,
"loss": 0.4888,
"mean_token_accuracy": 0.785589964389801,
"num_tokens": 331979812.0,
"step": 10950
},
{
"entropy": 0.44335421919822693,
"epoch": 2.0526217577906327,
"grad_norm": 0.5703125,
"learning_rate": 2.0336428602985527e-05,
"loss": 0.5037,
"mean_token_accuracy": 0.7756373131275177,
"num_tokens": 333477541.0,
"step": 11000
},
{
"epoch": 2.0526217577906327,
"eval_entropy": 0.45860785820068567,
"eval_loss": 0.5141582489013672,
"eval_mean_token_accuracy": 0.7748899443433919,
"eval_num_tokens": 333477541.0,
"eval_runtime": 16.1759,
"eval_samples_per_second": 53.598,
"eval_steps_per_second": 6.738,
"step": 11000
},
{
"entropy": 0.43529929786920546,
"epoch": 2.061951856689681,
"grad_norm": 1.2109375,
"learning_rate": 2.026020058054448e-05,
"loss": 0.4936,
"mean_token_accuracy": 0.7826171773672104,
"num_tokens": 334939322.0,
"step": 11050
},
{
"entropy": 0.4596153527498245,
"epoch": 2.0712819555887294,
"grad_norm": 1.234375,
"learning_rate": 2.018384236449539e-05,
"loss": 0.5192,
"mean_token_accuracy": 0.7747271412611008,
"num_tokens": 336492472.0,
"step": 11100
},
{
"entropy": 0.45726164370775224,
"epoch": 2.0806120544877778,
"grad_norm": 0.3828125,
"learning_rate": 2.0107356598673732e-05,
"loss": 0.5075,
"mean_token_accuracy": 0.7738985830545425,
"num_tokens": 338077203.0,
"step": 11150
},
{
"entropy": 0.4586531579494476,
"epoch": 2.089942153386826,
"grad_norm": 0.423828125,
"learning_rate": 2.0030745931331256e-05,
"loss": 0.5128,
"mean_token_accuracy": 0.7716344100236893,
"num_tokens": 339676503.0,
"step": 11200
},
{
"entropy": 0.44181185990571975,
"epoch": 2.0992722522858744,
"grad_norm": 0.703125,
"learning_rate": 1.995401301504434e-05,
"loss": 0.5033,
"mean_token_accuracy": 0.7782072865962982,
"num_tokens": 341201747.0,
"step": 11250
},
{
"entropy": 0.431854664683342,
"epoch": 2.1086023511849223,
"grad_norm": 1.0390625,
"learning_rate": 1.9877160506622106e-05,
"loss": 0.49,
"mean_token_accuracy": 0.7848282158374786,
"num_tokens": 342665361.0,
"step": 11300
},
{
"entropy": 0.43534082144498826,
"epoch": 2.1179324500839707,
"grad_norm": 0.75,
"learning_rate": 1.9800191067014458e-05,
"loss": 0.4917,
"mean_token_accuracy": 0.7822025471925735,
"num_tokens": 344149761.0,
"step": 11350
},
{
"entropy": 0.444383510351181,
"epoch": 2.127262548983019,
"grad_norm": 0.494140625,
"learning_rate": 1.9723107361219928e-05,
"loss": 0.5044,
"mean_token_accuracy": 0.7769185125827789,
"num_tokens": 345653004.0,
"step": 11400
},
{
"entropy": 0.4527071109414101,
"epoch": 2.1365926478820674,
"grad_norm": 0.62890625,
"learning_rate": 1.964591205819343e-05,
"loss": 0.5026,
"mean_token_accuracy": 0.7761050814390182,
"num_tokens": 347228235.0,
"step": 11450
},
{
"entropy": 0.4523302459716797,
"epoch": 2.1459227467811157,
"grad_norm": 0.97265625,
"learning_rate": 1.9568607830753818e-05,
"loss": 0.5175,
"mean_token_accuracy": 0.777667219042778,
"num_tokens": 348730988.0,
"step": 11500
},
{
"entropy": 0.44508363455533984,
"epoch": 2.155252845680164,
"grad_norm": 1.3984375,
"learning_rate": 1.9491197355491355e-05,
"loss": 0.5016,
"mean_token_accuracy": 0.7767183601856231,
"num_tokens": 350265615.0,
"step": 11550
},
{
"entropy": 0.44807049065828325,
"epoch": 2.1645829445792124,
"grad_norm": 1.4921875,
"learning_rate": 1.941368331267506e-05,
"loss": 0.5179,
"mean_token_accuracy": 0.7722341948747635,
"num_tokens": 351794521.0,
"step": 11600
},
{
"entropy": 0.44992916941642763,
"epoch": 2.1739130434782608,
"grad_norm": 0.62109375,
"learning_rate": 1.9336068386159866e-05,
"loss": 0.5013,
"mean_token_accuracy": 0.7756163114309311,
"num_tokens": 353325027.0,
"step": 11650
},
{
"entropy": 0.4497057408094406,
"epoch": 2.183243142377309,
"grad_norm": 0.486328125,
"learning_rate": 1.9258355263293722e-05,
"loss": 0.5101,
"mean_token_accuracy": 0.7762971234321594,
"num_tokens": 354798870.0,
"step": 11700
},
{
"entropy": 0.4416424559056759,
"epoch": 2.1925732412763574,
"grad_norm": 0.83984375,
"learning_rate": 1.9180546634824542e-05,
"loss": 0.4978,
"mean_token_accuracy": 0.780478093624115,
"num_tokens": 356285178.0,
"step": 11750
},
{
"entropy": 0.4536583548784256,
"epoch": 2.201903340175406,
"grad_norm": 1.1328125,
"learning_rate": 1.910264519480704e-05,
"loss": 0.5081,
"mean_token_accuracy": 0.7730959630012513,
"num_tokens": 357853469.0,
"step": 11800
},
{
"entropy": 0.432369647026062,
"epoch": 2.211233439074454,
"grad_norm": 1.5703125,
"learning_rate": 1.902465364050943e-05,
"loss": 0.4962,
"mean_token_accuracy": 0.7790928614139557,
"num_tokens": 359347327.0,
"step": 11850
},
{
"entropy": 0.4379693388938904,
"epoch": 2.2205635379735025,
"grad_norm": 0.5078125,
"learning_rate": 1.894657467232007e-05,
"loss": 0.4922,
"mean_token_accuracy": 0.7815437364578247,
"num_tokens": 360883930.0,
"step": 11900
},
{
"entropy": 0.4471505701541901,
"epoch": 2.229893636872551,
"grad_norm": 0.75390625,
"learning_rate": 1.8868410993653945e-05,
"loss": 0.5073,
"mean_token_accuracy": 0.776453862786293,
"num_tokens": 362395340.0,
"step": 11950
},
{
"entropy": 0.45013984248042105,
"epoch": 2.239223735771599,
"grad_norm": 0.65234375,
"learning_rate": 1.879016531085905e-05,
"loss": 0.5059,
"mean_token_accuracy": 0.7744881427288055,
"num_tokens": 363975143.0,
"step": 12000
},
{
"epoch": 2.239223735771599,
"eval_entropy": 0.45516312778542894,
"eval_loss": 0.5110519528388977,
"eval_mean_token_accuracy": 0.7761742434370409,
"eval_num_tokens": 363975143.0,
"eval_runtime": 16.1613,
"eval_samples_per_second": 53.647,
"eval_steps_per_second": 6.744,
"step": 12000
},
{
"entropy": 0.46731566220521925,
"epoch": 2.2485538346706475,
"grad_norm": 0.95703125,
"learning_rate": 1.871184033312272e-05,
"loss": 0.5183,
"mean_token_accuracy": 0.7688807338476181,
"num_tokens": 365608309.0,
"step": 12050
},
{
"entropy": 0.43275348499417304,
"epoch": 2.257883933569696,
"grad_norm": 0.609375,
"learning_rate": 1.86334387723778e-05,
"loss": 0.5003,
"mean_token_accuracy": 0.7791133409738541,
"num_tokens": 367079142.0,
"step": 12100
},
{
"entropy": 0.4450415739417076,
"epoch": 2.267214032468744,
"grad_norm": 0.8046875,
"learning_rate": 1.8554963343208748e-05,
"loss": 0.508,
"mean_token_accuracy": 0.7741218858957291,
"num_tokens": 368613238.0,
"step": 12150
},
{
"entropy": 0.4465838612616062,
"epoch": 2.2765441313677925,
"grad_norm": 0.578125,
"learning_rate": 1.8476416762757647e-05,
"loss": 0.5025,
"mean_token_accuracy": 0.7775521212816239,
"num_tokens": 370170401.0,
"step": 12200
},
{
"entropy": 0.44225269854068755,
"epoch": 2.285874230266841,
"grad_norm": 0.57421875,
"learning_rate": 1.8397801750630147e-05,
"loss": 0.496,
"mean_token_accuracy": 0.7806216114759446,
"num_tokens": 371657034.0,
"step": 12250
},
{
"entropy": 0.4700926415622234,
"epoch": 2.2952043291658892,
"grad_norm": 0.55078125,
"learning_rate": 1.831912102880127e-05,
"loss": 0.5264,
"mean_token_accuracy": 0.7698476028442383,
"num_tokens": 373254414.0,
"step": 12300
},
{
"entropy": 0.4466784715652466,
"epoch": 2.3045344280649376,
"grad_norm": 1.1953125,
"learning_rate": 1.8240377321521187e-05,
"loss": 0.5059,
"mean_token_accuracy": 0.775156998038292,
"num_tokens": 374801131.0,
"step": 12350
},
{
"entropy": 0.4366114428639412,
"epoch": 2.313864526963986,
"grad_norm": 0.43359375,
"learning_rate": 1.816157335522088e-05,
"loss": 0.4963,
"mean_token_accuracy": 0.7822502106428146,
"num_tokens": 376328664.0,
"step": 12400
},
{
"entropy": 0.4395353105664253,
"epoch": 2.3231946258630343,
"grad_norm": 2.25,
"learning_rate": 1.808271185841774e-05,
"loss": 0.499,
"mean_token_accuracy": 0.7797509133815765,
"num_tokens": 377858310.0,
"step": 12450
},
{
"entropy": 0.42968779906630516,
"epoch": 2.3325247247620826,
"grad_norm": 1.1015625,
"learning_rate": 1.8003795561621118e-05,
"loss": 0.4905,
"mean_token_accuracy": 0.7836663633584976,
"num_tokens": 379330727.0,
"step": 12500
},
{
"entropy": 0.4358790573477745,
"epoch": 2.341854823661131,
"grad_norm": 0.95703125,
"learning_rate": 1.792482719723774e-05,
"loss": 0.4952,
"mean_token_accuracy": 0.7849816447496414,
"num_tokens": 380789154.0,
"step": 12550
},
{
"entropy": 0.4382615853846073,
"epoch": 2.3511849225601793,
"grad_norm": 0.71484375,
"learning_rate": 1.7845809499477147e-05,
"loss": 0.5003,
"mean_token_accuracy": 0.777313020825386,
"num_tokens": 382304834.0,
"step": 12600
},
{
"entropy": 0.4590240094065666,
"epoch": 2.3605150214592276,
"grad_norm": 0.7109375,
"learning_rate": 1.7766745204257005e-05,
"loss": 0.5087,
"mean_token_accuracy": 0.7742151153087616,
"num_tokens": 383864187.0,
"step": 12650
},
{
"entropy": 0.44439409762620924,
"epoch": 2.369845120358276,
"grad_norm": 0.92578125,
"learning_rate": 1.7687637049108356e-05,
"loss": 0.5074,
"mean_token_accuracy": 0.7769033217430115,
"num_tokens": 385371218.0,
"step": 12700
},
{
"entropy": 0.45312762558460234,
"epoch": 2.3791752192573243,
"grad_norm": 1.078125,
"learning_rate": 1.7608487773080876e-05,
"loss": 0.5138,
"mean_token_accuracy": 0.7725106239318847,
"num_tokens": 386905152.0,
"step": 12750
},
{
"entropy": 0.4439894749224186,
"epoch": 2.3885053181563727,
"grad_norm": 0.62109375,
"learning_rate": 1.7529300116648006e-05,
"loss": 0.5065,
"mean_token_accuracy": 0.777021074295044,
"num_tokens": 388444369.0,
"step": 12800
},
{
"entropy": 0.42678302526474,
"epoch": 2.397835417055421,
"grad_norm": 1.5390625,
"learning_rate": 1.745007682161206e-05,
"loss": 0.4882,
"mean_token_accuracy": 0.7850719147920608,
"num_tokens": 389939944.0,
"step": 12850
},
{
"entropy": 0.43243088483810427,
"epoch": 2.407165515954469,
"grad_norm": 1.0625,
"learning_rate": 1.7370820631009338e-05,
"loss": 0.4964,
"mean_token_accuracy": 0.7798051989078522,
"num_tokens": 391456345.0,
"step": 12900
},
{
"entropy": 0.4408321186900139,
"epoch": 2.4164956148535173,
"grad_norm": 0.88671875,
"learning_rate": 1.729153428901509e-05,
"loss": 0.5048,
"mean_token_accuracy": 0.7777973639965058,
"num_tokens": 392972606.0,
"step": 12950
},
{
"entropy": 0.42640509456396103,
"epoch": 2.4258257137525656,
"grad_norm": 0.54296875,
"learning_rate": 1.721222054084855e-05,
"loss": 0.489,
"mean_token_accuracy": 0.7861467552185059,
"num_tokens": 394419739.0,
"step": 13000
},
{
"epoch": 2.4258257137525656,
"eval_entropy": 0.45086843404201193,
"eval_loss": 0.5095834136009216,
"eval_mean_token_accuracy": 0.7765465198306862,
"eval_num_tokens": 394419739.0,
"eval_runtime": 16.3,
"eval_samples_per_second": 53.19,
"eval_steps_per_second": 6.687,
"step": 13000
},
{
"entropy": 0.4339776523411274,
"epoch": 2.435155812651614,
"grad_norm": 1.1171875,
"learning_rate": 1.7132882132677856e-05,
"loss": 0.4952,
"mean_token_accuracy": 0.7817716175317764,
"num_tokens": 395915283.0,
"step": 13050
},
{
"entropy": 0.43754623234272005,
"epoch": 2.4444859115506623,
"grad_norm": 0.447265625,
"learning_rate": 1.7053521811524983e-05,
"loss": 0.5022,
"mean_token_accuracy": 0.7792576867341995,
"num_tokens": 397406785.0,
"step": 13100
},
{
"entropy": 0.4319652807712555,
"epoch": 2.4538160104497106,
"grad_norm": 0.423828125,
"learning_rate": 1.6974142325170614e-05,
"loss": 0.4932,
"mean_token_accuracy": 0.780916188955307,
"num_tokens": 398889013.0,
"step": 13150
},
{
"entropy": 0.42943828999996186,
"epoch": 2.463146109348759,
"grad_norm": 1.703125,
"learning_rate": 1.6894746422059023e-05,
"loss": 0.4874,
"mean_token_accuracy": 0.7860763943195344,
"num_tokens": 400360522.0,
"step": 13200
},
{
"entropy": 0.4486216183006764,
"epoch": 2.4724762082478073,
"grad_norm": 0.55859375,
"learning_rate": 1.6815336851202897e-05,
"loss": 0.503,
"mean_token_accuracy": 0.7766870594024659,
"num_tokens": 401889271.0,
"step": 13250
},
{
"entropy": 0.4432876881957054,
"epoch": 2.4818063071468557,
"grad_norm": 0.73046875,
"learning_rate": 1.6735916362088154e-05,
"loss": 0.4969,
"mean_token_accuracy": 0.7788010305166244,
"num_tokens": 403427748.0,
"step": 13300
},
{
"entropy": 0.44121997892856596,
"epoch": 2.491136406045904,
"grad_norm": 1.0078125,
"learning_rate": 1.6656487704578733e-05,
"loss": 0.4999,
"mean_token_accuracy": 0.7776626753807068,
"num_tokens": 404954195.0,
"step": 13350
},
{
"entropy": 0.433184619396925,
"epoch": 2.5004665049449524,
"grad_norm": 0.91015625,
"learning_rate": 1.6577053628821423e-05,
"loss": 0.4912,
"mean_token_accuracy": 0.7811095035076141,
"num_tokens": 406493981.0,
"step": 13400
},
{
"entropy": 0.42312393710017204,
"epoch": 2.5097966038440007,
"grad_norm": 0.73046875,
"learning_rate": 1.6497616885150602e-05,
"loss": 0.4876,
"mean_token_accuracy": 0.7851846623420715,
"num_tokens": 407934358.0,
"step": 13450
},
{
"entropy": 0.4381246021389961,
"epoch": 2.519126702743049,
"grad_norm": 0.55078125,
"learning_rate": 1.6418180223993015e-05,
"loss": 0.4935,
"mean_token_accuracy": 0.7794178009033204,
"num_tokens": 409440914.0,
"step": 13500
},
{
"entropy": 0.44361667945981026,
"epoch": 2.5284568016420974,
"grad_norm": 0.69140625,
"learning_rate": 1.6338746395772556e-05,
"loss": 0.4969,
"mean_token_accuracy": 0.7793952637910843,
"num_tokens": 410989159.0,
"step": 13550
},
{
"entropy": 0.43151887714862824,
"epoch": 2.5377869005411458,
"grad_norm": 0.94140625,
"learning_rate": 1.625931815081504e-05,
"loss": 0.4952,
"mean_token_accuracy": 0.7826283901929856,
"num_tokens": 412472158.0,
"step": 13600
},
{
"entropy": 0.44044260889291764,
"epoch": 2.547116999440194,
"grad_norm": 0.71875,
"learning_rate": 1.6179898239252952e-05,
"loss": 0.4989,
"mean_token_accuracy": 0.7796867018938065,
"num_tokens": 414005272.0,
"step": 13650
},
{
"entropy": 0.43727659299969673,
"epoch": 2.5564470983392424,
"grad_norm": 0.5390625,
"learning_rate": 1.6100489410930248e-05,
"loss": 0.4983,
"mean_token_accuracy": 0.7779977285861969,
"num_tokens": 415515917.0,
"step": 13700
},
{
"entropy": 0.4339106129109859,
"epoch": 2.565777197238291,
"grad_norm": 1.21875,
"learning_rate": 1.602109441530714e-05,
"loss": 0.5021,
"mean_token_accuracy": 0.7849010616540909,
"num_tokens": 416950426.0,
"step": 13750
},
{
"entropy": 0.4218116353452206,
"epoch": 2.575107296137339,
"grad_norm": 0.82421875,
"learning_rate": 1.5941716001364893e-05,
"loss": 0.4868,
"mean_token_accuracy": 0.7843046194314957,
"num_tokens": 418436936.0,
"step": 13800
},
{
"entropy": 0.42178965732455254,
"epoch": 2.5844373950363875,
"grad_norm": 0.73046875,
"learning_rate": 1.5862356917510624e-05,
"loss": 0.4819,
"mean_token_accuracy": 0.7881238484382629,
"num_tokens": 419883666.0,
"step": 13850
},
{
"entropy": 0.43385469675064087,
"epoch": 2.593767493935436,
"grad_norm": 0.88671875,
"learning_rate": 1.5783019911482207e-05,
"loss": 0.4999,
"mean_token_accuracy": 0.7784091866016388,
"num_tokens": 421397686.0,
"step": 13900
},
{
"entropy": 0.4416196349263191,
"epoch": 2.603097592834484,
"grad_norm": 1.0078125,
"learning_rate": 1.570370773025306e-05,
"loss": 0.5006,
"mean_token_accuracy": 0.7770800250768661,
"num_tokens": 422952565.0,
"step": 13950
},
{
"entropy": 0.4428327572345734,
"epoch": 2.6124276917335325,
"grad_norm": 1.109375,
"learning_rate": 1.5624423119937062e-05,
"loss": 0.4991,
"mean_token_accuracy": 0.7780466437339782,
"num_tokens": 424490641.0,
"step": 14000
},
{
"epoch": 2.6124276917335325,
"eval_entropy": 0.4452941595414363,
"eval_loss": 0.507188618183136,
"eval_mean_token_accuracy": 0.7770852725440209,
"eval_num_tokens": 424490641.0,
"eval_runtime": 16.1783,
"eval_samples_per_second": 53.59,
"eval_steps_per_second": 6.737,
"step": 14000
},
{
"entropy": 0.4200237849354744,
"epoch": 2.621757790632581,
"grad_norm": 0.48046875,
"learning_rate": 1.554516882569349e-05,
"loss": 0.4903,
"mean_token_accuracy": 0.787417265176773,
"num_tokens": 425904302.0,
"step": 14050
},
{
"entropy": 0.4459881857037544,
"epoch": 2.631087889531629,
"grad_norm": 0.96875,
"learning_rate": 1.5465947591631947e-05,
"loss": 0.5028,
"mean_token_accuracy": 0.7770888382196426,
"num_tokens": 427427333.0,
"step": 14100
},
{
"entropy": 0.44237791940569876,
"epoch": 2.640417988430677,
"grad_norm": 0.515625,
"learning_rate": 1.5386762160717355e-05,
"loss": 0.5098,
"mean_token_accuracy": 0.779816085100174,
"num_tokens": 428936850.0,
"step": 14150
},
{
"entropy": 0.4365511977672577,
"epoch": 2.6497480873297254,
"grad_norm": 0.6796875,
"learning_rate": 1.5307615274674984e-05,
"loss": 0.5003,
"mean_token_accuracy": 0.7807082986831665,
"num_tokens": 430411091.0,
"step": 14200
},
{
"entropy": 0.45521570563316344,
"epoch": 2.659078186228774,
"grad_norm": 1.0703125,
"learning_rate": 1.522850967389552e-05,
"loss": 0.5169,
"mean_token_accuracy": 0.774719997048378,
"num_tokens": 431926456.0,
"step": 14250
},
{
"entropy": 0.43436133086681367,
"epoch": 2.668408285127822,
"grad_norm": 1.0703125,
"learning_rate": 1.5149448097340193e-05,
"loss": 0.4954,
"mean_token_accuracy": 0.7820281451940536,
"num_tokens": 433433882.0,
"step": 14300
},
{
"entropy": 0.4265731783211231,
"epoch": 2.6777383840268705,
"grad_norm": 0.75,
"learning_rate": 1.5070433282445917e-05,
"loss": 0.4952,
"mean_token_accuracy": 0.7826755654811859,
"num_tokens": 434847537.0,
"step": 14350
},
{
"entropy": 0.4315530589222908,
"epoch": 2.687068482925919,
"grad_norm": 0.92578125,
"learning_rate": 1.4991467965030544e-05,
"loss": 0.5031,
"mean_token_accuracy": 0.7789240056276321,
"num_tokens": 436306220.0,
"step": 14400
},
{
"entropy": 0.4366298992931843,
"epoch": 2.696398581824967,
"grad_norm": 0.75390625,
"learning_rate": 1.4912554879198106e-05,
"loss": 0.4974,
"mean_token_accuracy": 0.7800502121448517,
"num_tokens": 437854187.0,
"step": 14450
},
{
"entropy": 0.429113384783268,
"epoch": 2.7057286807240155,
"grad_norm": 0.94921875,
"learning_rate": 1.4833696757244162e-05,
"loss": 0.496,
"mean_token_accuracy": 0.7821574300527573,
"num_tokens": 439322082.0,
"step": 14500
},
{
"entropy": 0.45050988361239436,
"epoch": 2.715058779623064,
"grad_norm": 0.765625,
"learning_rate": 1.475489632956121e-05,
"loss": 0.5181,
"mean_token_accuracy": 0.7749482518434525,
"num_tokens": 440931906.0,
"step": 14550
},
{
"entropy": 0.45400950565934184,
"epoch": 2.724388878522112,
"grad_norm": 0.77734375,
"learning_rate": 1.4676156324544123e-05,
"loss": 0.5091,
"mean_token_accuracy": 0.7732315069437027,
"num_tokens": 442503054.0,
"step": 14600
},
{
"entropy": 0.42937296599149705,
"epoch": 2.7337189774211605,
"grad_norm": 0.68359375,
"learning_rate": 1.4597479468495688e-05,
"loss": 0.4923,
"mean_token_accuracy": 0.7824376839399337,
"num_tokens": 443993311.0,
"step": 14650
},
{
"entropy": 0.44196070849895475,
"epoch": 2.743049076320209,
"grad_norm": 0.45703125,
"learning_rate": 1.4518868485532235e-05,
"loss": 0.5056,
"mean_token_accuracy": 0.7760562229156495,
"num_tokens": 445522017.0,
"step": 14700
},
{
"entropy": 0.44267056584358216,
"epoch": 2.7523791752192572,
"grad_norm": 1.0703125,
"learning_rate": 1.4440326097489263e-05,
"loss": 0.4978,
"mean_token_accuracy": 0.7780000925064087,
"num_tokens": 447063778.0,
"step": 14750
},
{
"entropy": 0.4377539825439453,
"epoch": 2.7617092741183056,
"grad_norm": 0.73828125,
"learning_rate": 1.436185502382728e-05,
"loss": 0.494,
"mean_token_accuracy": 0.7799159944057464,
"num_tokens": 448574576.0,
"step": 14800
},
{
"entropy": 0.43343299850821493,
"epoch": 2.771039373017354,
"grad_norm": 0.71484375,
"learning_rate": 1.428345798153756e-05,
"loss": 0.4936,
"mean_token_accuracy": 0.7831123304367066,
"num_tokens": 450066594.0,
"step": 14850
},
{
"entropy": 0.430605805516243,
"epoch": 2.7803694719164023,
"grad_norm": 0.3046875,
"learning_rate": 1.4205137685048111e-05,
"loss": 0.495,
"mean_token_accuracy": 0.7821512734889984,
"num_tokens": 451568798.0,
"step": 14900
},
{
"entropy": 0.43412218809127806,
"epoch": 2.7896995708154506,
"grad_norm": 1.4140625,
"learning_rate": 1.4126896846129701e-05,
"loss": 0.4975,
"mean_token_accuracy": 0.7786238652467727,
"num_tokens": 453062565.0,
"step": 14950
},
{
"entropy": 0.4382785783708096,
"epoch": 2.799029669714499,
"grad_norm": 0.828125,
"learning_rate": 1.4048738173801939e-05,
"loss": 0.4948,
"mean_token_accuracy": 0.7823023611307144,
"num_tokens": 454574611.0,
"step": 15000
},
{
"epoch": 2.799029669714499,
"eval_entropy": 0.44517116043545785,
"eval_loss": 0.5036894679069519,
"eval_mean_token_accuracy": 0.7785478141329704,
"eval_num_tokens": 454574611.0,
"eval_runtime": 16.6862,
"eval_samples_per_second": 51.959,
"eval_steps_per_second": 6.532,
"step": 15000
},
{
"entropy": 0.42489848256111146,
"epoch": 2.8083597686135473,
"grad_norm": 1.109375,
"learning_rate": 1.3970664374239483e-05,
"loss": 0.495,
"mean_token_accuracy": 0.7826015204191208,
"num_tokens": 456088007.0,
"step": 15050
},
{
"entropy": 0.4351053491234779,
"epoch": 2.8176898675125956,
"grad_norm": 0.62890625,
"learning_rate": 1.3892678150678369e-05,
"loss": 0.4909,
"mean_token_accuracy": 0.7825271499156952,
"num_tokens": 457634148.0,
"step": 15100
},
{
"entropy": 0.4429400071501732,
"epoch": 2.827019966411644,
"grad_norm": 0.7265625,
"learning_rate": 1.3814782203322367e-05,
"loss": 0.5001,
"mean_token_accuracy": 0.7786577945947647,
"num_tokens": 459151611.0,
"step": 15150
},
{
"entropy": 0.4451659436523914,
"epoch": 2.8363500653106923,
"grad_norm": 0.30078125,
"learning_rate": 1.3736979229249526e-05,
"loss": 0.5041,
"mean_token_accuracy": 0.7781134587526322,
"num_tokens": 460711762.0,
"step": 15200
},
{
"entropy": 0.4428605942428112,
"epoch": 2.8456801642097407,
"grad_norm": 0.5625,
"learning_rate": 1.3659271922318776e-05,
"loss": 0.5088,
"mean_token_accuracy": 0.7755980342626572,
"num_tokens": 462239645.0,
"step": 15250
},
{
"entropy": 0.44321066468954085,
"epoch": 2.855010263108789,
"grad_norm": 1.90625,
"learning_rate": 1.3581662973076661e-05,
"loss": 0.5057,
"mean_token_accuracy": 0.7742398703098297,
"num_tokens": 463780352.0,
"step": 15300
},
{
"entropy": 0.43519594669342043,
"epoch": 2.8643403620078374,
"grad_norm": 0.8359375,
"learning_rate": 1.3504155068664164e-05,
"loss": 0.4946,
"mean_token_accuracy": 0.7806341469287872,
"num_tokens": 465306354.0,
"step": 15350
},
{
"entropy": 0.452918638586998,
"epoch": 2.8736704609068857,
"grad_norm": 0.625,
"learning_rate": 1.3426750892723697e-05,
"loss": 0.5097,
"mean_token_accuracy": 0.7724657821655273,
"num_tokens": 466885785.0,
"step": 15400
},
{
"entropy": 0.43585652247071266,
"epoch": 2.883000559805934,
"grad_norm": 0.57421875,
"learning_rate": 1.3349453125306156e-05,
"loss": 0.4973,
"mean_token_accuracy": 0.7797726893424988,
"num_tokens": 468383581.0,
"step": 15450
},
{
"entropy": 0.43165954887866975,
"epoch": 2.8923306587049824,
"grad_norm": 0.68359375,
"learning_rate": 1.3272264442778136e-05,
"loss": 0.4937,
"mean_token_accuracy": 0.7796162796020508,
"num_tokens": 469917430.0,
"step": 15500
},
{
"entropy": 0.4401167546212673,
"epoch": 2.9016607576040307,
"grad_norm": 0.66015625,
"learning_rate": 1.319518751772927e-05,
"loss": 0.5006,
"mean_token_accuracy": 0.7786802816390991,
"num_tokens": 471424114.0,
"step": 15550
},
{
"entropy": 0.45204014524817465,
"epoch": 2.910990856503079,
"grad_norm": 1.28125,
"learning_rate": 1.3118225018879684e-05,
"loss": 0.5071,
"mean_token_accuracy": 0.7716954737901688,
"num_tokens": 473004348.0,
"step": 15600
},
{
"entropy": 0.43205771446228025,
"epoch": 2.9203209554021274,
"grad_norm": 1.6484375,
"learning_rate": 1.3041379610987594e-05,
"loss": 0.4982,
"mean_token_accuracy": 0.7792713183164597,
"num_tokens": 474501801.0,
"step": 15650
},
{
"entropy": 0.43552364617586137,
"epoch": 2.929651054301176,
"grad_norm": 0.79296875,
"learning_rate": 1.2964653954757057e-05,
"loss": 0.4991,
"mean_token_accuracy": 0.7788824599981308,
"num_tokens": 476043852.0,
"step": 15700
},
{
"entropy": 0.44703464940190313,
"epoch": 2.938981153200224,
"grad_norm": 0.490234375,
"learning_rate": 1.2888050706745822e-05,
"loss": 0.5064,
"mean_token_accuracy": 0.7750799888372422,
"num_tokens": 477598151.0,
"step": 15750
},
{
"entropy": 0.43971230536699296,
"epoch": 2.9483112520992725,
"grad_norm": 0.67578125,
"learning_rate": 1.2811572519273378e-05,
"loss": 0.5041,
"mean_token_accuracy": 0.7766543072462082,
"num_tokens": 479148774.0,
"step": 15800
},
{
"entropy": 0.447616363465786,
"epoch": 2.957641350998321,
"grad_norm": 1.3046875,
"learning_rate": 1.2735222040329087e-05,
"loss": 0.5084,
"mean_token_accuracy": 0.7735405403375626,
"num_tokens": 480694879.0,
"step": 15850
},
{
"entropy": 0.4212371516227722,
"epoch": 2.966971449897369,
"grad_norm": 2.265625,
"learning_rate": 1.2659001913480522e-05,
"loss": 0.4861,
"mean_token_accuracy": 0.785834304690361,
"num_tokens": 482165709.0,
"step": 15900
},
{
"entropy": 0.41868353605270386,
"epoch": 2.9763015487964175,
"grad_norm": 1.234375,
"learning_rate": 1.2582914777781937e-05,
"loss": 0.4843,
"mean_token_accuracy": 0.7870692014694214,
"num_tokens": 483643611.0,
"step": 15950
},
{
"entropy": 0.44808956772089004,
"epoch": 2.9856316476954654,
"grad_norm": 0.65234375,
"learning_rate": 1.2506963267682884e-05,
"loss": 0.5052,
"mean_token_accuracy": 0.7751398229598999,
"num_tokens": 485205508.0,
"step": 16000
},
{
"epoch": 2.9856316476954654,
"eval_entropy": 0.44267906945779784,
"eval_loss": 0.501590371131897,
"eval_mean_token_accuracy": 0.7786080602112166,
"eval_num_tokens": 485205508.0,
"eval_runtime": 16.2258,
"eval_samples_per_second": 53.433,
"eval_steps_per_second": 6.718,
"step": 16000
},
{
"entropy": 0.4390178045630455,
"epoch": 2.9949617465945138,
"grad_norm": 1.8515625,
"learning_rate": 1.2431150012936982e-05,
"loss": 0.5016,
"mean_token_accuracy": 0.7765721970796585,
"num_tokens": 486717739.0,
"step": 16050
},
{
"entropy": 0.43280943170189856,
"epoch": 3.004291845493562,
"grad_norm": 1.109375,
"learning_rate": 1.2355477638510904e-05,
"loss": 0.5016,
"mean_token_accuracy": 0.7793767899274826,
"num_tokens": 488220825.0,
"step": 16100
},
{
"entropy": 0.41857230544090274,
"epoch": 3.0136219443926104,
"grad_norm": 0.96875,
"learning_rate": 1.2279948764493463e-05,
"loss": 0.4857,
"mean_token_accuracy": 0.7855171990394593,
"num_tokens": 489714834.0,
"step": 16150
},
{
"entropy": 0.4343470679223537,
"epoch": 3.022952043291659,
"grad_norm": 0.57421875,
"learning_rate": 1.220456600600488e-05,
"loss": 0.4952,
"mean_token_accuracy": 0.7775657117366791,
"num_tokens": 491256234.0,
"step": 16200
},
{
"entropy": 0.42674369990825656,
"epoch": 3.032282142190707,
"grad_norm": 0.76171875,
"learning_rate": 1.2129331973106275e-05,
"loss": 0.4875,
"mean_token_accuracy": 0.7840164464712143,
"num_tokens": 492813353.0,
"step": 16250
},
{
"entropy": 0.42470823049545287,
"epoch": 3.0416122410897555,
"grad_norm": 1.078125,
"learning_rate": 1.2054249270709271e-05,
"loss": 0.4898,
"mean_token_accuracy": 0.7800269144773483,
"num_tokens": 494302175.0,
"step": 16300
},
{
"entropy": 0.42407613903284075,
"epoch": 3.050942339988804,
"grad_norm": 0.7109375,
"learning_rate": 1.1979320498485797e-05,
"loss": 0.4849,
"mean_token_accuracy": 0.782460133433342,
"num_tokens": 495800714.0,
"step": 16350
},
{
"entropy": 0.4180370423197746,
"epoch": 3.060272438887852,
"grad_norm": 1.109375,
"learning_rate": 1.1904548250778101e-05,
"loss": 0.4882,
"mean_token_accuracy": 0.7806040924787522,
"num_tokens": 497321561.0,
"step": 16400
},
{
"entropy": 0.42719221144914626,
"epoch": 3.0696025377869005,
"grad_norm": 0.66015625,
"learning_rate": 1.1829935116508903e-05,
"loss": 0.4905,
"mean_token_accuracy": 0.7813178312778473,
"num_tokens": 498854626.0,
"step": 16450
},
{
"entropy": 0.4178078393638134,
"epoch": 3.078932636685949,
"grad_norm": 0.76953125,
"learning_rate": 1.175548367909175e-05,
"loss": 0.485,
"mean_token_accuracy": 0.783756075501442,
"num_tokens": 500380642.0,
"step": 16500
},
{
"entropy": 0.4012910355627537,
"epoch": 3.088262735584997,
"grad_norm": 1.1328125,
"learning_rate": 1.1681196516341603e-05,
"loss": 0.4694,
"mean_token_accuracy": 0.7900429528951645,
"num_tokens": 501885081.0,
"step": 16550
},
{
"entropy": 0.41146292686462405,
"epoch": 3.0975928344840455,
"grad_norm": 0.46484375,
"learning_rate": 1.1607076200385529e-05,
"loss": 0.4784,
"mean_token_accuracy": 0.7883473831415176,
"num_tokens": 503332757.0,
"step": 16600
},
{
"entropy": 0.4057569517195225,
"epoch": 3.106922933383094,
"grad_norm": 0.326171875,
"learning_rate": 1.1533125297573703e-05,
"loss": 0.4788,
"mean_token_accuracy": 0.788205589056015,
"num_tokens": 504782228.0,
"step": 16650
},
{
"entropy": 0.41990657716989516,
"epoch": 3.1162530322821422,
"grad_norm": 0.57421875,
"learning_rate": 1.1459346368390504e-05,
"loss": 0.4849,
"mean_token_accuracy": 0.782880739569664,
"num_tokens": 506282342.0,
"step": 16700
},
{
"entropy": 0.4344457286596298,
"epoch": 3.1255831311811906,
"grad_norm": 0.412109375,
"learning_rate": 1.1385741967365869e-05,
"loss": 0.492,
"mean_token_accuracy": 0.779918566942215,
"num_tokens": 507797216.0,
"step": 16750
},
{
"entropy": 0.425162510573864,
"epoch": 3.134913230080239,
"grad_norm": 0.6640625,
"learning_rate": 1.131231464298687e-05,
"loss": 0.4889,
"mean_token_accuracy": 0.7809261924028397,
"num_tokens": 509307379.0,
"step": 16800
},
{
"entropy": 0.4163677006959915,
"epoch": 3.1442433289792873,
"grad_norm": 0.96484375,
"learning_rate": 1.1239066937609447e-05,
"loss": 0.4903,
"mean_token_accuracy": 0.7836281234025955,
"num_tokens": 510827157.0,
"step": 16850
},
{
"entropy": 0.41290561139583587,
"epoch": 3.1535734278783356,
"grad_norm": 0.2890625,
"learning_rate": 1.1166001387370388e-05,
"loss": 0.4915,
"mean_token_accuracy": 0.7884875816106797,
"num_tokens": 512261808.0,
"step": 16900
},
{
"entropy": 0.41839638456702233,
"epoch": 3.162903526777384,
"grad_norm": 0.48046875,
"learning_rate": 1.1093120522099535e-05,
"loss": 0.4855,
"mean_token_accuracy": 0.7832108342647552,
"num_tokens": 513784693.0,
"step": 16950
},
{
"entropy": 0.4217331621050835,
"epoch": 3.1722336256764323,
"grad_norm": 1.0234375,
"learning_rate": 1.1020426865232167e-05,
"loss": 0.4965,
"mean_token_accuracy": 0.7817919147014618,
"num_tokens": 515320766.0,
"step": 17000
},
{
"epoch": 3.1722336256764323,
"eval_entropy": 0.4328604711031695,
"eval_loss": 0.501544177532196,
"eval_mean_token_accuracy": 0.7793953303897053,
"eval_num_tokens": 515320766.0,
"eval_runtime": 16.0856,
"eval_samples_per_second": 53.899,
"eval_steps_per_second": 6.776,
"step": 17000
},
{
"entropy": 0.4205896918475628,
"epoch": 3.1815637245754806,
"grad_norm": 1.140625,
"learning_rate": 1.0947922933721634e-05,
"loss": 0.4908,
"mean_token_accuracy": 0.7806319332122803,
"num_tokens": 516850357.0,
"step": 17050
},
{
"entropy": 0.4369584143161774,
"epoch": 3.190893823474529,
"grad_norm": 0.6796875,
"learning_rate": 1.0875611237952227e-05,
"loss": 0.5028,
"mean_token_accuracy": 0.7777555876970291,
"num_tokens": 518426223.0,
"step": 17100
},
{
"entropy": 0.42318257100880147,
"epoch": 3.2002239223735773,
"grad_norm": 0.578125,
"learning_rate": 1.0803494281652234e-05,
"loss": 0.4893,
"mean_token_accuracy": 0.7821793848276138,
"num_tokens": 519954467.0,
"step": 17150
},
{
"entropy": 0.43405372768640516,
"epoch": 3.2095540212726257,
"grad_norm": 1.1484375,
"learning_rate": 1.0731574561807265e-05,
"loss": 0.4994,
"mean_token_accuracy": 0.7766797173023224,
"num_tokens": 521503586.0,
"step": 17200
},
{
"entropy": 0.42191226355731487,
"epoch": 3.218884120171674,
"grad_norm": 0.66796875,
"learning_rate": 1.0659854568573787e-05,
"loss": 0.4846,
"mean_token_accuracy": 0.7843712168931961,
"num_tokens": 523039085.0,
"step": 17250
},
{
"entropy": 0.4242657233774662,
"epoch": 3.228214219070722,
"grad_norm": 0.47265625,
"learning_rate": 1.058833678519293e-05,
"loss": 0.4901,
"mean_token_accuracy": 0.7792167681455612,
"num_tokens": 524604685.0,
"step": 17300
},
{
"entropy": 0.40574381925165653,
"epoch": 3.2375443179697703,
"grad_norm": 0.76171875,
"learning_rate": 1.051702368790447e-05,
"loss": 0.4739,
"mean_token_accuracy": 0.789325880408287,
"num_tokens": 526100115.0,
"step": 17350
},
{
"entropy": 0.41887831434607503,
"epoch": 3.2468744168688186,
"grad_norm": 1.2265625,
"learning_rate": 1.0445917745861102e-05,
"loss": 0.4818,
"mean_token_accuracy": 0.7836150753498078,
"num_tokens": 527595996.0,
"step": 17400
},
{
"entropy": 0.40637139081954954,
"epoch": 3.256204515767867,
"grad_norm": 0.6015625,
"learning_rate": 1.0375021421042974e-05,
"loss": 0.4733,
"mean_token_accuracy": 0.7896361410617828,
"num_tokens": 529079715.0,
"step": 17450
},
{
"entropy": 0.4036496952176094,
"epoch": 3.2655346146669153,
"grad_norm": 0.298828125,
"learning_rate": 1.030433716817241e-05,
"loss": 0.4819,
"mean_token_accuracy": 0.7864921605587005,
"num_tokens": 530573725.0,
"step": 17500
},
{
"entropy": 0.41815487191081047,
"epoch": 3.2748647135659636,
"grad_norm": 0.90234375,
"learning_rate": 1.0233867434628938e-05,
"loss": 0.4813,
"mean_token_accuracy": 0.7873369532823563,
"num_tokens": 532067163.0,
"step": 17550
},
{
"entropy": 0.4044397334754467,
"epoch": 3.284194812465012,
"grad_norm": 1.2265625,
"learning_rate": 1.0163614660364547e-05,
"loss": 0.4802,
"mean_token_accuracy": 0.7873817622661591,
"num_tokens": 533512972.0,
"step": 17600
},
{
"entropy": 0.4253586496412754,
"epoch": 3.2935249113640603,
"grad_norm": 0.494140625,
"learning_rate": 1.0093581277819186e-05,
"loss": 0.4906,
"mean_token_accuracy": 0.7798972427845001,
"num_tokens": 535070119.0,
"step": 17650
},
{
"entropy": 0.41822995960712434,
"epoch": 3.3028550102631087,
"grad_norm": 0.8046875,
"learning_rate": 1.0023769711836586e-05,
"loss": 0.4878,
"mean_token_accuracy": 0.7849278378486634,
"num_tokens": 536578335.0,
"step": 17700
},
{
"entropy": 0.4179137668013573,
"epoch": 3.312185109162157,
"grad_norm": 0.953125,
"learning_rate": 9.95418237958026e-06,
"loss": 0.4858,
"mean_token_accuracy": 0.7825618571043015,
"num_tokens": 538111995.0,
"step": 17750
},
{
"entropy": 0.4254558201134205,
"epoch": 3.3215152080612054,
"grad_norm": 0.5625,
"learning_rate": 9.88482169044983e-06,
"loss": 0.488,
"mean_token_accuracy": 0.7804460400342941,
"num_tokens": 539663342.0,
"step": 17800
},
{
"entropy": 0.4253497390449047,
"epoch": 3.3308453069602537,
"grad_norm": 1.15625,
"learning_rate": 9.815690045997598e-06,
"loss": 0.4877,
"mean_token_accuracy": 0.7805328375101089,
"num_tokens": 541217582.0,
"step": 17850
},
{
"entropy": 0.41277687311172484,
"epoch": 3.340175405859302,
"grad_norm": 1.0625,
"learning_rate": 9.746789839845406e-06,
"loss": 0.4874,
"mean_token_accuracy": 0.7856322342157364,
"num_tokens": 542694785.0,
"step": 17900
},
{
"entropy": 0.4202821546792984,
"epoch": 3.3495055047583504,
"grad_norm": 1.2265625,
"learning_rate": 9.67812345760174e-06,
"loss": 0.4955,
"mean_token_accuracy": 0.7834526008367538,
"num_tokens": 544234574.0,
"step": 17950
},
{
"entropy": 0.41871184706687925,
"epoch": 3.3588356036573987,
"grad_norm": 0.50390625,
"learning_rate": 9.609693276779152e-06,
"loss": 0.4849,
"mean_token_accuracy": 0.7832159209251404,
"num_tokens": 545775120.0,
"step": 18000
},
{
"epoch": 3.3588356036573987,
"eval_entropy": 0.4325732376870759,
"eval_loss": 0.5006441473960876,
"eval_mean_token_accuracy": 0.7796490897826098,
"eval_num_tokens": 545775120.0,
"eval_runtime": 16.1616,
"eval_samples_per_second": 53.646,
"eval_steps_per_second": 6.744,
"step": 18000
},
{
"entropy": 0.42268205478787424,
"epoch": 3.368165702556447,
"grad_norm": 0.77734375,
"learning_rate": 9.541501666711921e-06,
"loss": 0.4778,
"mean_token_accuracy": 0.7864983838796615,
"num_tokens": 547311884.0,
"step": 18050
},
{
"entropy": 0.42853294894099236,
"epoch": 3.3774958014554954,
"grad_norm": 0.53515625,
"learning_rate": 9.473550988474026e-06,
"loss": 0.4904,
"mean_token_accuracy": 0.7811102610826492,
"num_tokens": 548872868.0,
"step": 18100
},
{
"entropy": 0.43325465768575666,
"epoch": 3.386825900354544,
"grad_norm": 0.890625,
"learning_rate": 9.4058435947974e-06,
"loss": 0.4935,
"mean_token_accuracy": 0.7787714445590973,
"num_tokens": 550487269.0,
"step": 18150
},
{
"entropy": 0.42910467088222504,
"epoch": 3.396155999253592,
"grad_norm": 0.87109375,
"learning_rate": 9.338381829990456e-06,
"loss": 0.4903,
"mean_token_accuracy": 0.7796713817119598,
"num_tokens": 552030450.0,
"step": 18200
},
{
"entropy": 0.41868083611130713,
"epoch": 3.4054860981526405,
"grad_norm": 0.388671875,
"learning_rate": 9.271168029856928e-06,
"loss": 0.4823,
"mean_token_accuracy": 0.7847259587049484,
"num_tokens": 553519745.0,
"step": 18250
},
{
"entropy": 0.41218777537345885,
"epoch": 3.414816197051689,
"grad_norm": 1.203125,
"learning_rate": 9.204204521615007e-06,
"loss": 0.4803,
"mean_token_accuracy": 0.7875041741132737,
"num_tokens": 555001060.0,
"step": 18300
},
{
"entropy": 0.4341864985227585,
"epoch": 3.424146295950737,
"grad_norm": 0.63671875,
"learning_rate": 9.13749362381673e-06,
"loss": 0.4935,
"mean_token_accuracy": 0.7791831189393997,
"num_tokens": 556547827.0,
"step": 18350
},
{
"entropy": 0.4298609687387943,
"epoch": 3.4334763948497855,
"grad_norm": 0.39453125,
"learning_rate": 9.07103764626773e-06,
"loss": 0.49,
"mean_token_accuracy": 0.7830371624231338,
"num_tokens": 558106220.0,
"step": 18400
},
{
"entropy": 0.4168567133694887,
"epoch": 3.442806493748834,
"grad_norm": 0.7265625,
"learning_rate": 9.00483888994725e-06,
"loss": 0.4787,
"mean_token_accuracy": 0.7888282573223114,
"num_tokens": 559579247.0,
"step": 18450
},
{
"entropy": 0.4177075420320034,
"epoch": 3.452136592647882,
"grad_norm": 0.5078125,
"learning_rate": 8.938899646928482e-06,
"loss": 0.4841,
"mean_token_accuracy": 0.7841418528556824,
"num_tokens": 561118173.0,
"step": 18500
},
{
"entropy": 0.42074175730347635,
"epoch": 3.4614666915469305,
"grad_norm": 1.390625,
"learning_rate": 8.8732222002992e-06,
"loss": 0.4943,
"mean_token_accuracy": 0.7784902250766754,
"num_tokens": 562667212.0,
"step": 18550
},
{
"entropy": 0.41027487240731714,
"epoch": 3.470796790445979,
"grad_norm": 0.80859375,
"learning_rate": 8.807808824082699e-06,
"loss": 0.479,
"mean_token_accuracy": 0.7876066911220551,
"num_tokens": 564131244.0,
"step": 18600
},
{
"entropy": 0.4196596726775169,
"epoch": 3.4801268893450272,
"grad_norm": 0.765625,
"learning_rate": 8.742661783159075e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.7828355920314789,
"num_tokens": 565650541.0,
"step": 18650
},
{
"entropy": 0.42029051125049594,
"epoch": 3.4894569882440756,
"grad_norm": 0.89453125,
"learning_rate": 8.677783333186817e-06,
"loss": 0.4907,
"mean_token_accuracy": 0.7820263600349426,
"num_tokens": 567162445.0,
"step": 18700
},
{
"entropy": 0.42464843273162844,
"epoch": 3.498787087143124,
"grad_norm": 0.49609375,
"learning_rate": 8.61317572052467e-06,
"loss": 0.4888,
"mean_token_accuracy": 0.7788948893547059,
"num_tokens": 568688382.0,
"step": 18750
},
{
"entropy": 0.426156534999609,
"epoch": 3.5081171860421723,
"grad_norm": 0.78515625,
"learning_rate": 8.548841182153889e-06,
"loss": 0.4944,
"mean_token_accuracy": 0.7789571231603623,
"num_tokens": 570273693.0,
"step": 18800
},
{
"entropy": 0.4215981301665306,
"epoch": 3.5174472849412206,
"grad_norm": 0.88671875,
"learning_rate": 8.484781945600765e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.781245459318161,
"num_tokens": 571787716.0,
"step": 18850
},
{
"entropy": 0.4165229081362486,
"epoch": 3.526777383840269,
"grad_norm": 0.7265625,
"learning_rate": 8.421000228859513e-06,
"loss": 0.483,
"mean_token_accuracy": 0.7865099585056305,
"num_tokens": 573264670.0,
"step": 18900
},
{
"entropy": 0.4245117911696434,
"epoch": 3.5361074827393173,
"grad_norm": 0.486328125,
"learning_rate": 8.35749824031547e-06,
"loss": 0.4938,
"mean_token_accuracy": 0.778894921541214,
"num_tokens": 574817675.0,
"step": 18950
},
{
"entropy": 0.417805362790823,
"epoch": 3.5454375816383656,
"grad_norm": 0.423828125,
"learning_rate": 8.294278178668643e-06,
"loss": 0.4872,
"mean_token_accuracy": 0.7830862325429916,
"num_tokens": 576331491.0,
"step": 19000
},
{
"epoch": 3.5454375816383656,
"eval_entropy": 0.4310347127531647,
"eval_loss": 0.499735027551651,
"eval_mean_token_accuracy": 0.779860559953462,
"eval_num_tokens": 576331491.0,
"eval_runtime": 16.1624,
"eval_samples_per_second": 53.643,
"eval_steps_per_second": 6.744,
"step": 19000
},
{
"entropy": 0.4081883782148361,
"epoch": 3.5547676805374135,
"grad_norm": 0.9375,
"learning_rate": 8.231342232857553e-06,
"loss": 0.4705,
"mean_token_accuracy": 0.7903977036476135,
"num_tokens": 577775016.0,
"step": 19050
},
{
"entropy": 0.4005671316385269,
"epoch": 3.564097779436462,
"grad_norm": 0.875,
"learning_rate": 8.16869258198347e-06,
"loss": 0.4737,
"mean_token_accuracy": 0.7880425137281418,
"num_tokens": 579207311.0,
"step": 19100
},
{
"entropy": 0.41074585855007173,
"epoch": 3.5734278783355102,
"grad_norm": 0.87109375,
"learning_rate": 8.106331395234957e-06,
"loss": 0.4797,
"mean_token_accuracy": 0.786340873837471,
"num_tokens": 580709130.0,
"step": 19150
},
{
"entropy": 0.4269941046833992,
"epoch": 3.5827579772345586,
"grad_norm": 0.5234375,
"learning_rate": 8.044260831812762e-06,
"loss": 0.4965,
"mean_token_accuracy": 0.7799100142717361,
"num_tokens": 582263480.0,
"step": 19200
},
{
"entropy": 0.40871032655239103,
"epoch": 3.592088076133607,
"grad_norm": 0.89453125,
"learning_rate": 7.982483040855052e-06,
"loss": 0.4885,
"mean_token_accuracy": 0.7831970340013504,
"num_tokens": 583753874.0,
"step": 19250
},
{
"entropy": 0.41553946167230604,
"epoch": 3.6014181750326553,
"grad_norm": 1.1875,
"learning_rate": 7.921000161363023e-06,
"loss": 0.4892,
"mean_token_accuracy": 0.7814596974849701,
"num_tokens": 585269107.0,
"step": 19300
},
{
"entropy": 0.4114553180336952,
"epoch": 3.6107482739317036,
"grad_norm": 0.73828125,
"learning_rate": 7.859814322126803e-06,
"loss": 0.5044,
"mean_token_accuracy": 0.7843243205547332,
"num_tokens": 586764493.0,
"step": 19350
},
{
"entropy": 0.4229819716513157,
"epoch": 3.620078372830752,
"grad_norm": 0.5,
"learning_rate": 7.798927641651787e-06,
"loss": 0.493,
"mean_token_accuracy": 0.7810362190008163,
"num_tokens": 588285178.0,
"step": 19400
},
{
"entropy": 0.43099318161606787,
"epoch": 3.6294084717298003,
"grad_norm": 0.466796875,
"learning_rate": 7.738342228085244e-06,
"loss": 0.4921,
"mean_token_accuracy": 0.7791334927082062,
"num_tokens": 589860202.0,
"step": 19450
},
{
"entropy": 0.4297554486989975,
"epoch": 3.6387385706288486,
"grad_norm": 0.6796875,
"learning_rate": 7.678060179143354e-06,
"loss": 0.4965,
"mean_token_accuracy": 0.7779843896627426,
"num_tokens": 591386894.0,
"step": 19500
},
{
"entropy": 0.4341718791425228,
"epoch": 3.648068669527897,
"grad_norm": 1.1171875,
"learning_rate": 7.618083582038559e-06,
"loss": 0.4973,
"mean_token_accuracy": 0.7771023726463318,
"num_tokens": 592966640.0,
"step": 19550
},
{
"entropy": 0.43129597157239913,
"epoch": 3.6573987684269453,
"grad_norm": 0.45703125,
"learning_rate": 7.558414513407309e-06,
"loss": 0.4966,
"mean_token_accuracy": 0.7794705367088318,
"num_tokens": 594551411.0,
"step": 19600
},
{
"entropy": 0.416672650128603,
"epoch": 3.6667288673259937,
"grad_norm": 0.5390625,
"learning_rate": 7.499055039238146e-06,
"loss": 0.4847,
"mean_token_accuracy": 0.7843469917774201,
"num_tokens": 596025749.0,
"step": 19650
},
{
"entropy": 0.4332768340408802,
"epoch": 3.676058966225042,
"grad_norm": 0.65625,
"learning_rate": 7.4400072148001895e-06,
"loss": 0.4897,
"mean_token_accuracy": 0.7826856952905655,
"num_tokens": 597572029.0,
"step": 19700
},
{
"entropy": 0.4150558638572693,
"epoch": 3.6853890651240904,
"grad_norm": 0.5,
"learning_rate": 7.381273084571959e-06,
"loss": 0.4844,
"mean_token_accuracy": 0.785199624300003,
"num_tokens": 599047819.0,
"step": 19750
},
{
"entropy": 0.4170726762712002,
"epoch": 3.6947191640231387,
"grad_norm": 0.7890625,
"learning_rate": 7.322854682170584e-06,
"loss": 0.4918,
"mean_token_accuracy": 0.7805746030807496,
"num_tokens": 600528500.0,
"step": 19800
},
{
"entropy": 0.43822780847549436,
"epoch": 3.704049262922187,
"grad_norm": 0.77734375,
"learning_rate": 7.264754030281405e-06,
"loss": 0.4994,
"mean_token_accuracy": 0.7803661721944809,
"num_tokens": 602042275.0,
"step": 19850
},
{
"entropy": 0.41316955953836443,
"epoch": 3.7133793618212354,
"grad_norm": 0.78515625,
"learning_rate": 7.2069731405879325e-06,
"loss": 0.4822,
"mean_token_accuracy": 0.7859818071126938,
"num_tokens": 603534613.0,
"step": 19900
},
{
"entropy": 0.43403887152671816,
"epoch": 3.7227094607202837,
"grad_norm": 0.48046875,
"learning_rate": 7.149514013702186e-06,
"loss": 0.499,
"mean_token_accuracy": 0.7757015681266785,
"num_tokens": 605111735.0,
"step": 19950
},
{
"entropy": 0.41590030148625373,
"epoch": 3.732039559619332,
"grad_norm": 0.57421875,
"learning_rate": 7.092378639095451e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.7830818378925324,
"num_tokens": 606616332.0,
"step": 20000
},
{
"epoch": 3.732039559619332,
"eval_entropy": 0.42986192684107966,
"eval_loss": 0.4989897310733795,
"eval_mean_token_accuracy": 0.779869570097792,
"eval_num_tokens": 606616332.0,
"eval_runtime": 16.2831,
"eval_samples_per_second": 53.246,
"eval_steps_per_second": 6.694,
"step": 20000
},
{
"entropy": 0.41359525367617606,
"epoch": 3.7413696585183804,
"grad_norm": 0.5078125,
"learning_rate": 7.0355689950293636e-06,
"loss": 0.4858,
"mean_token_accuracy": 0.7825395846366883,
"num_tokens": 608094829.0,
"step": 20050
},
{
"entropy": 0.4180074107646942,
"epoch": 3.750699757417429,
"grad_norm": 0.5390625,
"learning_rate": 6.979087048487432e-06,
"loss": 0.4865,
"mean_token_accuracy": 0.7824456262588501,
"num_tokens": 609631728.0,
"step": 20100
},
{
"entropy": 0.4242085382342339,
"epoch": 3.7600298563164767,
"grad_norm": 0.6015625,
"learning_rate": 6.922934755106929e-06,
"loss": 0.4895,
"mean_token_accuracy": 0.7821110928058624,
"num_tokens": 611154863.0,
"step": 20150
},
{
"entropy": 0.4358582437038422,
"epoch": 3.769359955215525,
"grad_norm": 0.66796875,
"learning_rate": 6.867114059111178e-06,
"loss": 0.4957,
"mean_token_accuracy": 0.7778018289804458,
"num_tokens": 612699583.0,
"step": 20200
},
{
"entropy": 0.4160428442060947,
"epoch": 3.7786900541145734,
"grad_norm": 0.71484375,
"learning_rate": 6.81162689324224e-06,
"loss": 0.4816,
"mean_token_accuracy": 0.7859532070159913,
"num_tokens": 614179127.0,
"step": 20250
},
{
"entropy": 0.42236584216356277,
"epoch": 3.7880201530136217,
"grad_norm": 0.404296875,
"learning_rate": 6.756475178693988e-06,
"loss": 0.497,
"mean_token_accuracy": 0.77904121696949,
"num_tokens": 615681957.0,
"step": 20300
},
{
"entropy": 0.4261379070580006,
"epoch": 3.79735025191267,
"grad_norm": 0.33984375,
"learning_rate": 6.701660825045599e-06,
"loss": 0.491,
"mean_token_accuracy": 0.7808138716220856,
"num_tokens": 617206693.0,
"step": 20350
},
{
"entropy": 0.41188708037137983,
"epoch": 3.8066803508117184,
"grad_norm": 0.443359375,
"learning_rate": 6.64718573019542e-06,
"loss": 0.482,
"mean_token_accuracy": 0.7834567302465438,
"num_tokens": 618685882.0,
"step": 20400
},
{
"entropy": 0.42921313650906084,
"epoch": 3.8160104497107667,
"grad_norm": 0.326171875,
"learning_rate": 6.593051780295262e-06,
"loss": 0.4886,
"mean_token_accuracy": 0.7812140667438507,
"num_tokens": 620242613.0,
"step": 20450
},
{
"entropy": 0.4060940612852573,
"epoch": 3.825340548609815,
"grad_norm": 0.80859375,
"learning_rate": 6.5392608496851006e-06,
"loss": 0.4804,
"mean_token_accuracy": 0.7881927186250687,
"num_tokens": 621695607.0,
"step": 20500
},
{
"entropy": 0.41289987429976466,
"epoch": 3.8346706475088634,
"grad_norm": 0.4921875,
"learning_rate": 6.48581480082817e-06,
"loss": 0.4814,
"mean_token_accuracy": 0.7860061007738114,
"num_tokens": 623202780.0,
"step": 20550
},
{
"entropy": 0.4154231162369251,
"epoch": 3.844000746407912,
"grad_norm": 0.81640625,
"learning_rate": 6.432715484246474e-06,
"loss": 0.4822,
"mean_token_accuracy": 0.7877626097202302,
"num_tokens": 624719320.0,
"step": 20600
},
{
"entropy": 0.43937826111912726,
"epoch": 3.85333084530696,
"grad_norm": 0.54296875,
"learning_rate": 6.379964738456737e-06,
"loss": 0.5081,
"mean_token_accuracy": 0.7722378653287888,
"num_tokens": 626304789.0,
"step": 20650
},
{
"entropy": 0.4095226752758026,
"epoch": 3.8626609442060085,
"grad_norm": 0.431640625,
"learning_rate": 6.3275643899067095e-06,
"loss": 0.4808,
"mean_token_accuracy": 0.784711457490921,
"num_tokens": 627775886.0,
"step": 20700
},
{
"entropy": 0.42841140910983083,
"epoch": 3.871991043105057,
"grad_norm": 0.7265625,
"learning_rate": 6.275516252911957e-06,
"loss": 0.495,
"mean_token_accuracy": 0.7787529402971267,
"num_tokens": 629320324.0,
"step": 20750
},
{
"entropy": 0.4122103577852249,
"epoch": 3.881321142004105,
"grad_norm": 0.8125,
"learning_rate": 6.223822129593035e-06,
"loss": 0.4872,
"mean_token_accuracy": 0.7841408705711365,
"num_tokens": 630822612.0,
"step": 20800
},
{
"entropy": 0.42884288884699345,
"epoch": 3.8906512409031535,
"grad_norm": 0.97265625,
"learning_rate": 6.172483809813082e-06,
"loss": 0.4946,
"mean_token_accuracy": 0.7817574542760849,
"num_tokens": 632330762.0,
"step": 20850
},
{
"entropy": 0.42925117403268814,
"epoch": 3.899981339802202,
"grad_norm": 0.53125,
"learning_rate": 6.121503071115863e-06,
"loss": 0.4968,
"mean_token_accuracy": 0.7788661700487137,
"num_tokens": 633862126.0,
"step": 20900
},
{
"entropy": 0.4173249228298664,
"epoch": 3.90931143870125,
"grad_norm": 0.48828125,
"learning_rate": 6.0708816786642055e-06,
"loss": 0.4896,
"mean_token_accuracy": 0.7805528366565704,
"num_tokens": 635369892.0,
"step": 20950
},
{
"entropy": 0.42858991749584674,
"epoch": 3.9186415376002985,
"grad_norm": 0.89453125,
"learning_rate": 6.0206213851789065e-06,
"loss": 0.4922,
"mean_token_accuracy": 0.7816365206241608,
"num_tokens": 636904691.0,
"step": 21000
},
{
"epoch": 3.9186415376002985,
"eval_entropy": 0.4298953804947914,
"eval_loss": 0.4981645345687866,
"eval_mean_token_accuracy": 0.7802923715442692,
"eval_num_tokens": 636904691.0,
"eval_runtime": 16.1855,
"eval_samples_per_second": 53.566,
"eval_steps_per_second": 6.734,
"step": 21000
},
{
"entropy": 0.42679292619228365,
"epoch": 3.927971636499347,
"grad_norm": 0.7109375,
"learning_rate": 5.970723930878021e-06,
"loss": 0.4965,
"mean_token_accuracy": 0.7826879835128784,
"num_tokens": 638381309.0,
"step": 21050
},
{
"entropy": 0.4261575947701931,
"epoch": 3.9373017353983952,
"grad_norm": 1.03125,
"learning_rate": 5.921191043416619e-06,
"loss": 0.4962,
"mean_token_accuracy": 0.7802698415517807,
"num_tokens": 639907942.0,
"step": 21100
},
{
"entropy": 0.3956790755689144,
"epoch": 3.9466318342974436,
"grad_norm": 0.7265625,
"learning_rate": 5.87202443782697e-06,
"loss": 0.4689,
"mean_token_accuracy": 0.7924928772449493,
"num_tokens": 641357830.0,
"step": 21150
},
{
"entropy": 0.4151885700970888,
"epoch": 3.955961933196492,
"grad_norm": 0.62890625,
"learning_rate": 5.823225816459159e-06,
"loss": 0.4835,
"mean_token_accuracy": 0.784718370437622,
"num_tokens": 642852716.0,
"step": 21200
},
{
"entropy": 0.4133622221648693,
"epoch": 3.9652920320955403,
"grad_norm": 0.56640625,
"learning_rate": 5.774796868922148e-06,
"loss": 0.4817,
"mean_token_accuracy": 0.7855832195281982,
"num_tokens": 644369211.0,
"step": 21250
},
{
"entropy": 0.4185038904845715,
"epoch": 3.9746221309945886,
"grad_norm": 0.6953125,
"learning_rate": 5.726739272025258e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.7824643701314926,
"num_tokens": 645872051.0,
"step": 21300
},
{
"entropy": 0.4287873014807701,
"epoch": 3.983952229893637,
"grad_norm": 0.87890625,
"learning_rate": 5.679054689720142e-06,
"loss": 0.4919,
"mean_token_accuracy": 0.7797924143075943,
"num_tokens": 647435628.0,
"step": 21350
},
{
"entropy": 0.42188764035701753,
"epoch": 3.9932823287926853,
"grad_norm": 0.8828125,
"learning_rate": 5.631744773043137e-06,
"loss": 0.4953,
"mean_token_accuracy": 0.7852949523925781,
"num_tokens": 648930592.0,
"step": 21400
},
{
"entropy": 0.4236401343345642,
"epoch": 4.002612427691734,
"grad_norm": 0.412109375,
"learning_rate": 5.584811160058123e-06,
"loss": 0.487,
"mean_token_accuracy": 0.7832311981916428,
"num_tokens": 650447365.0,
"step": 21450
},
{
"entropy": 0.4323525831103325,
"epoch": 4.011942526590782,
"grad_norm": 0.396484375,
"learning_rate": 5.5382554757998e-06,
"loss": 0.4934,
"mean_token_accuracy": 0.7795790702104568,
"num_tokens": 651990713.0,
"step": 21500
},
{
"entropy": 0.39284430593252184,
"epoch": 4.02127262548983,
"grad_norm": 0.6171875,
"learning_rate": 5.492079332217413e-06,
"loss": 0.4657,
"mean_token_accuracy": 0.7923216378688812,
"num_tokens": 653445504.0,
"step": 21550
},
{
"entropy": 0.4254847614467144,
"epoch": 4.030602724388879,
"grad_norm": 0.58203125,
"learning_rate": 5.446284328118956e-06,
"loss": 0.4899,
"mean_token_accuracy": 0.7845428907871246,
"num_tokens": 654991638.0,
"step": 21600
},
{
"entropy": 0.41144783079624175,
"epoch": 4.039932823287927,
"grad_norm": 0.48828125,
"learning_rate": 5.4008720491158105e-06,
"loss": 0.4816,
"mean_token_accuracy": 0.7855917030572891,
"num_tokens": 656505470.0,
"step": 21650
},
{
"entropy": 0.41339424341917036,
"epoch": 4.049262922186975,
"grad_norm": 0.7890625,
"learning_rate": 5.355844067567827e-06,
"loss": 0.487,
"mean_token_accuracy": 0.7834293109178543,
"num_tokens": 658059888.0,
"step": 21700
},
{
"entropy": 0.43376033812761305,
"epoch": 4.058593021086024,
"grad_norm": 0.55859375,
"learning_rate": 5.311201942528911e-06,
"loss": 0.5004,
"mean_token_accuracy": 0.7760592538118363,
"num_tokens": 659625009.0,
"step": 21750
},
{
"entropy": 0.4149209675192833,
"epoch": 4.067923119985072,
"grad_norm": 0.67578125,
"learning_rate": 5.266947219693018e-06,
"loss": 0.4821,
"mean_token_accuracy": 0.7843904966115951,
"num_tokens": 661130951.0,
"step": 21800
},
{
"entropy": 0.40518364384770394,
"epoch": 4.07725321888412,
"grad_norm": 0.322265625,
"learning_rate": 5.2230814313406564e-06,
"loss": 0.4804,
"mean_token_accuracy": 0.7845012962818145,
"num_tokens": 662681099.0,
"step": 21850
},
{
"entropy": 0.4218353702127933,
"epoch": 4.086583317783169,
"grad_norm": 0.89453125,
"learning_rate": 5.179606096285814e-06,
"loss": 0.4867,
"mean_token_accuracy": 0.7828029912710189,
"num_tokens": 664238581.0,
"step": 21900
},
{
"entropy": 0.4128453577309847,
"epoch": 4.095913416682217,
"grad_norm": 0.9296875,
"learning_rate": 5.136522719823388e-06,
"loss": 0.4777,
"mean_token_accuracy": 0.7887437117099761,
"num_tokens": 665738717.0,
"step": 21950
},
{
"entropy": 0.4112414425611496,
"epoch": 4.105243515581265,
"grad_norm": 0.490234375,
"learning_rate": 5.093832793677053e-06,
"loss": 0.4858,
"mean_token_accuracy": 0.7840606135129928,
"num_tokens": 667221460.0,
"step": 22000
},
{
"epoch": 4.105243515581265,
"eval_entropy": 0.4262016671239783,
"eval_loss": 0.49918287992477417,
"eval_mean_token_accuracy": 0.7801302873760189,
"eval_num_tokens": 667221460.0,
"eval_runtime": 16.1734,
"eval_samples_per_second": 53.607,
"eval_steps_per_second": 6.739,
"step": 22000
},
{
"entropy": 0.4187443208694458,
"epoch": 4.114573614480314,
"grad_norm": 1.09375,
"learning_rate": 5.051537795947614e-06,
"loss": 0.4833,
"mean_token_accuracy": 0.783671562075615,
"num_tokens": 668746696.0,
"step": 22050
},
{
"entropy": 0.4121988409757614,
"epoch": 4.123903713379362,
"grad_norm": 0.5390625,
"learning_rate": 5.009639191061831e-06,
"loss": 0.479,
"mean_token_accuracy": 0.7846984696388245,
"num_tokens": 670258347.0,
"step": 22100
},
{
"entropy": 0.41994678273797037,
"epoch": 4.1332338122784105,
"grad_norm": 1.1171875,
"learning_rate": 4.968138429721715e-06,
"loss": 0.4852,
"mean_token_accuracy": 0.7819422298669815,
"num_tokens": 671783187.0,
"step": 22150
},
{
"entropy": 0.40594893679022787,
"epoch": 4.142563911177459,
"grad_norm": 0.423828125,
"learning_rate": 4.9270369488543e-06,
"loss": 0.4737,
"mean_token_accuracy": 0.7885110950469971,
"num_tokens": 673297222.0,
"step": 22200
},
{
"entropy": 0.4166788300871849,
"epoch": 4.151894010076507,
"grad_norm": 0.76953125,
"learning_rate": 4.886336171561883e-06,
"loss": 0.4809,
"mean_token_accuracy": 0.783810424208641,
"num_tokens": 674805400.0,
"step": 22250
},
{
"entropy": 0.4085456937551498,
"epoch": 4.1612241089755555,
"grad_norm": 0.4765625,
"learning_rate": 4.846037507072753e-06,
"loss": 0.4769,
"mean_token_accuracy": 0.7872794485092163,
"num_tokens": 676275357.0,
"step": 22300
},
{
"entropy": 0.4122830269485712,
"epoch": 4.170554207874604,
"grad_norm": 0.71484375,
"learning_rate": 4.806142350692409e-06,
"loss": 0.4821,
"mean_token_accuracy": 0.7875606679916382,
"num_tokens": 677776471.0,
"step": 22350
},
{
"entropy": 0.4124917629361153,
"epoch": 4.179884306773652,
"grad_norm": 0.46875,
"learning_rate": 4.766652083755236e-06,
"loss": 0.4826,
"mean_token_accuracy": 0.7857430422306061,
"num_tokens": 679312867.0,
"step": 22400
},
{
"entropy": 0.3990041773021221,
"epoch": 4.1892144056727005,
"grad_norm": 0.84765625,
"learning_rate": 4.727568073576675e-06,
"loss": 0.4759,
"mean_token_accuracy": 0.7875368773937226,
"num_tokens": 680753964.0,
"step": 22450
},
{
"entropy": 0.41429411858320236,
"epoch": 4.198544504571749,
"grad_norm": 0.9921875,
"learning_rate": 4.688891673405898e-06,
"loss": 0.4866,
"mean_token_accuracy": 0.7836510550975799,
"num_tokens": 682282646.0,
"step": 22500
},
{
"entropy": 0.4262200190126896,
"epoch": 4.207874603470797,
"grad_norm": 0.494140625,
"learning_rate": 4.650624222378934e-06,
"loss": 0.4956,
"mean_token_accuracy": 0.7772960156202317,
"num_tokens": 683822441.0,
"step": 22550
},
{
"entropy": 0.4214519140869379,
"epoch": 4.217204702369845,
"grad_norm": 0.578125,
"learning_rate": 4.6127670454723106e-06,
"loss": 0.4895,
"mean_token_accuracy": 0.7828962570428848,
"num_tokens": 685322393.0,
"step": 22600
},
{
"entropy": 0.41216524183750153,
"epoch": 4.226534801268893,
"grad_norm": 0.671875,
"learning_rate": 4.575321453457185e-06,
"loss": 0.4801,
"mean_token_accuracy": 0.7840064114332199,
"num_tokens": 686841081.0,
"step": 22650
},
{
"entropy": 0.41634872302412984,
"epoch": 4.235864900167941,
"grad_norm": 0.7109375,
"learning_rate": 4.53828874285395e-06,
"loss": 0.4867,
"mean_token_accuracy": 0.7810469180345535,
"num_tokens": 688411100.0,
"step": 22700
},
{
"entropy": 0.4129963879287243,
"epoch": 4.24519499906699,
"grad_norm": 0.546875,
"learning_rate": 4.501670195887344e-06,
"loss": 0.4887,
"mean_token_accuracy": 0.785649740099907,
"num_tokens": 689911759.0,
"step": 22750
},
{
"entropy": 0.4034364421665668,
"epoch": 4.254525097966038,
"grad_norm": 0.56640625,
"learning_rate": 4.465467080442056e-06,
"loss": 0.4727,
"mean_token_accuracy": 0.7873591876029968,
"num_tokens": 691405944.0,
"step": 22800
},
{
"entropy": 0.41453997910022733,
"epoch": 4.263855196865086,
"grad_norm": 1.1015625,
"learning_rate": 4.4296806500188296e-06,
"loss": 0.4843,
"mean_token_accuracy": 0.7840264475345612,
"num_tokens": 692922273.0,
"step": 22850
},
{
"entropy": 0.4219287024438381,
"epoch": 4.273185295764135,
"grad_norm": 0.435546875,
"learning_rate": 4.394312143691058e-06,
"loss": 0.4909,
"mean_token_accuracy": 0.7800329983234405,
"num_tokens": 694464854.0,
"step": 22900
},
{
"entropy": 0.41194928884506227,
"epoch": 4.282515394663183,
"grad_norm": 0.5078125,
"learning_rate": 4.359362786061886e-06,
"loss": 0.4874,
"mean_token_accuracy": 0.7856051474809647,
"num_tokens": 695964788.0,
"step": 22950
},
{
"entropy": 0.41727676048874857,
"epoch": 4.291845493562231,
"grad_norm": 0.94140625,
"learning_rate": 4.324833787221808e-06,
"loss": 0.4872,
"mean_token_accuracy": 0.7820509207248688,
"num_tokens": 697478901.0,
"step": 23000
},
{
"epoch": 4.291845493562231,
"eval_entropy": 0.42559362865916084,
"eval_loss": 0.4991084337234497,
"eval_mean_token_accuracy": 0.7802343352125325,
"eval_num_tokens": 697478901.0,
"eval_runtime": 16.0896,
"eval_samples_per_second": 53.886,
"eval_steps_per_second": 6.775,
"step": 23000
},
{
"entropy": 0.42197408616542814,
"epoch": 4.30117559246128,
"grad_norm": 0.80859375,
"learning_rate": 4.290726342706758e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.7817700058221817,
"num_tokens": 699021137.0,
"step": 23050
},
{
"entropy": 0.4093606770038605,
"epoch": 4.310505691360328,
"grad_norm": 0.9609375,
"learning_rate": 4.257041633456738e-06,
"loss": 0.4802,
"mean_token_accuracy": 0.7852538430690765,
"num_tokens": 700512894.0,
"step": 23100
},
{
"entropy": 0.42187732078135015,
"epoch": 4.3198357902593765,
"grad_norm": 1.171875,
"learning_rate": 4.223780825774913e-06,
"loss": 0.4865,
"mean_token_accuracy": 0.7813728898763657,
"num_tokens": 702059517.0,
"step": 23150
},
{
"entropy": 0.4125171934068203,
"epoch": 4.329165889158425,
"grad_norm": 0.5859375,
"learning_rate": 4.1909450712872285e-06,
"loss": 0.4905,
"mean_token_accuracy": 0.7848614448308945,
"num_tokens": 703551158.0,
"step": 23200
},
{
"entropy": 0.4088340279459953,
"epoch": 4.338495988057473,
"grad_norm": 0.76171875,
"learning_rate": 4.158535506902543e-06,
"loss": 0.4786,
"mean_token_accuracy": 0.7860189139842987,
"num_tokens": 705046371.0,
"step": 23250
},
{
"entropy": 0.4250268609821796,
"epoch": 4.3478260869565215,
"grad_norm": 0.423828125,
"learning_rate": 4.1265532547732586e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.7804222059249878,
"num_tokens": 706651598.0,
"step": 23300
},
{
"entropy": 0.42093482360243795,
"epoch": 4.35715618585557,
"grad_norm": 0.29296875,
"learning_rate": 4.094999422256478e-06,
"loss": 0.4919,
"mean_token_accuracy": 0.7815903490781784,
"num_tokens": 708166771.0,
"step": 23350
},
{
"entropy": 0.40209699779748914,
"epoch": 4.366486284754618,
"grad_norm": 0.9453125,
"learning_rate": 4.063875101875644e-06,
"loss": 0.4719,
"mean_token_accuracy": 0.7900790423154831,
"num_tokens": 709635262.0,
"step": 23400
},
{
"entropy": 0.411265781968832,
"epoch": 4.3758163836536665,
"grad_norm": 0.81640625,
"learning_rate": 4.033181371282729e-06,
"loss": 0.4774,
"mean_token_accuracy": 0.7869658744335175,
"num_tokens": 711131934.0,
"step": 23450
},
{
"entropy": 0.41227160826325415,
"epoch": 4.385146482552715,
"grad_norm": 1.6953125,
"learning_rate": 4.002919293220917e-06,
"loss": 0.4809,
"mean_token_accuracy": 0.7838652014732361,
"num_tokens": 712614938.0,
"step": 23500
},
{
"entropy": 0.4009349416196346,
"epoch": 4.394476581451763,
"grad_norm": 0.73046875,
"learning_rate": 3.973089915487803e-06,
"loss": 0.4823,
"mean_token_accuracy": 0.7867969334125519,
"num_tokens": 714060976.0,
"step": 23550
},
{
"entropy": 0.43059924483299256,
"epoch": 4.403806680350812,
"grad_norm": 0.8125,
"learning_rate": 3.943694270899114e-06,
"loss": 0.5009,
"mean_token_accuracy": 0.7788757783174515,
"num_tokens": 715598099.0,
"step": 23600
},
{
"entropy": 0.41181773334741595,
"epoch": 4.41313677924986,
"grad_norm": 0.380859375,
"learning_rate": 3.914733377252963e-06,
"loss": 0.4793,
"mean_token_accuracy": 0.786955691576004,
"num_tokens": 717099596.0,
"step": 23650
},
{
"entropy": 0.3970365000516176,
"epoch": 4.422466878148908,
"grad_norm": 0.60546875,
"learning_rate": 3.886208237294589e-06,
"loss": 0.4733,
"mean_token_accuracy": 0.7909884482622147,
"num_tokens": 718556937.0,
"step": 23700
},
{
"entropy": 0.4260143294930458,
"epoch": 4.431796977047957,
"grad_norm": 0.73828125,
"learning_rate": 3.858119838681645e-06,
"loss": 0.4887,
"mean_token_accuracy": 0.7821561121940612,
"num_tokens": 720105007.0,
"step": 23750
},
{
"entropy": 0.4135922496020794,
"epoch": 4.441127075947005,
"grad_norm": 0.6640625,
"learning_rate": 3.830469153950008e-06,
"loss": 0.4802,
"mean_token_accuracy": 0.7842557770013809,
"num_tokens": 721637538.0,
"step": 23800
},
{
"entropy": 0.4049617177248001,
"epoch": 4.450457174846053,
"grad_norm": 0.72265625,
"learning_rate": 3.803257140480098e-06,
"loss": 0.4855,
"mean_token_accuracy": 0.7906941068172455,
"num_tokens": 723121765.0,
"step": 23850
},
{
"entropy": 0.40623982638120654,
"epoch": 4.459787273745102,
"grad_norm": 1.0390625,
"learning_rate": 3.776484740463726e-06,
"loss": 0.4776,
"mean_token_accuracy": 0.785234968662262,
"num_tokens": 724610759.0,
"step": 23900
},
{
"entropy": 0.4155420292913914,
"epoch": 4.46911737264415,
"grad_norm": 0.6484375,
"learning_rate": 3.7501528808714883e-06,
"loss": 0.4854,
"mean_token_accuracy": 0.7824651861190796,
"num_tokens": 726130784.0,
"step": 23950
},
{
"entropy": 0.41285816714167595,
"epoch": 4.478447471543198,
"grad_norm": 0.5859375,
"learning_rate": 3.7242624734206554e-06,
"loss": 0.4816,
"mean_token_accuracy": 0.785009593963623,
"num_tokens": 727648846.0,
"step": 24000
},
{
"epoch": 4.478447471543198,
"eval_entropy": 0.4252187248763688,
"eval_loss": 0.49884533882141113,
"eval_mean_token_accuracy": 0.7805168371681773,
"eval_num_tokens": 727648846.0,
"eval_runtime": 16.6326,
"eval_samples_per_second": 52.126,
"eval_steps_per_second": 6.553,
"step": 24000
},
{
"entropy": 0.417481614202261,
"epoch": 4.487777570442247,
"grad_norm": 0.59375,
"learning_rate": 3.6988144145436063e-06,
"loss": 0.4834,
"mean_token_accuracy": 0.7845394277572632,
"num_tokens": 729133282.0,
"step": 24050
},
{
"entropy": 0.42680337965488435,
"epoch": 4.497107669341295,
"grad_norm": 1.0,
"learning_rate": 3.6738095853567963e-06,
"loss": 0.4951,
"mean_token_accuracy": 0.7786115556955338,
"num_tokens": 730690418.0,
"step": 24100
},
{
"entropy": 0.4081826032698154,
"epoch": 4.506437768240343,
"grad_norm": 0.3984375,
"learning_rate": 3.6492488516302438e-06,
"loss": 0.4776,
"mean_token_accuracy": 0.7866486293077469,
"num_tokens": 732175841.0,
"step": 24150
},
{
"entropy": 0.4243287441134453,
"epoch": 4.515767867139392,
"grad_norm": 0.439453125,
"learning_rate": 3.625133063757556e-06,
"loss": 0.4862,
"mean_token_accuracy": 0.7824915134906769,
"num_tokens": 733702523.0,
"step": 24200
},
{
"entropy": 0.40577477023005487,
"epoch": 4.52509796603844,
"grad_norm": 0.81640625,
"learning_rate": 3.6014630567264895e-06,
"loss": 0.4744,
"mean_token_accuracy": 0.788703248500824,
"num_tokens": 735166918.0,
"step": 24250
},
{
"entropy": 0.4156419275701046,
"epoch": 4.534428064937488,
"grad_norm": 0.6484375,
"learning_rate": 3.578239650090026e-06,
"loss": 0.4787,
"mean_token_accuracy": 0.7852990156412125,
"num_tokens": 736643884.0,
"step": 24300
},
{
"entropy": 0.39459425553679467,
"epoch": 4.543758163836537,
"grad_norm": 0.5546875,
"learning_rate": 3.555463647938016e-06,
"loss": 0.4681,
"mean_token_accuracy": 0.792020954489708,
"num_tokens": 738110498.0,
"step": 24350
},
{
"entropy": 0.4069563465565443,
"epoch": 4.553088262735585,
"grad_norm": 0.416015625,
"learning_rate": 3.533135838869318e-06,
"loss": 0.4753,
"mean_token_accuracy": 0.7872859954833984,
"num_tokens": 739623859.0,
"step": 24400
},
{
"entropy": 0.4120598857104778,
"epoch": 4.562418361634633,
"grad_norm": 0.40234375,
"learning_rate": 3.5112569959645072e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.7832210195064545,
"num_tokens": 741111342.0,
"step": 24450
},
{
"entropy": 0.411198351085186,
"epoch": 4.571748460533682,
"grad_norm": 0.84375,
"learning_rate": 3.4898278767591007e-06,
"loss": 0.4821,
"mean_token_accuracy": 0.7840376651287079,
"num_tokens": 742634170.0,
"step": 24500
},
{
"entropy": 0.41103513091802596,
"epoch": 4.58107855943273,
"grad_norm": 0.53515625,
"learning_rate": 3.4688492232173343e-06,
"loss": 0.4797,
"mean_token_accuracy": 0.785804370045662,
"num_tokens": 744158302.0,
"step": 24550
},
{
"entropy": 0.41402764439582823,
"epoch": 4.5904086583317785,
"grad_norm": 1.4921875,
"learning_rate": 3.448321761706467e-06,
"loss": 0.4883,
"mean_token_accuracy": 0.78047394156456,
"num_tokens": 745688655.0,
"step": 24600
},
{
"entropy": 0.4192537406086922,
"epoch": 4.599738757230827,
"grad_norm": 0.67578125,
"learning_rate": 3.428246202971639e-06,
"loss": 0.484,
"mean_token_accuracy": 0.7821294456720352,
"num_tokens": 747226392.0,
"step": 24650
},
{
"entropy": 0.3943482875823975,
"epoch": 4.609068856129875,
"grad_norm": 0.7734375,
"learning_rate": 3.408623242111255e-06,
"loss": 0.475,
"mean_token_accuracy": 0.7858047294616699,
"num_tokens": 748701674.0,
"step": 24700
},
{
"entropy": 0.39550173744559286,
"epoch": 4.6183989550289235,
"grad_norm": 0.3515625,
"learning_rate": 3.389453558552918e-06,
"loss": 0.4673,
"mean_token_accuracy": 0.7902515822649002,
"num_tokens": 750138953.0,
"step": 24750
},
{
"entropy": 0.41649769321084024,
"epoch": 4.627729053927972,
"grad_norm": 0.6484375,
"learning_rate": 3.37073781602991e-06,
"loss": 0.4877,
"mean_token_accuracy": 0.7842141664028168,
"num_tokens": 751652780.0,
"step": 24800
},
{
"entropy": 0.41776536986231805,
"epoch": 4.63705915282702,
"grad_norm": 0.44140625,
"learning_rate": 3.3524766625582052e-06,
"loss": 0.4836,
"mean_token_accuracy": 0.7836203473806381,
"num_tokens": 753230769.0,
"step": 24850
},
{
"entropy": 0.4045536919683218,
"epoch": 4.6463892517260685,
"grad_norm": 0.6875,
"learning_rate": 3.334670730414037e-06,
"loss": 0.4825,
"mean_token_accuracy": 0.7888794159889221,
"num_tokens": 754719857.0,
"step": 24900
},
{
"entropy": 0.4182581885159016,
"epoch": 4.655719350625117,
"grad_norm": 0.408203125,
"learning_rate": 3.3173206361120026e-06,
"loss": 0.4862,
"mean_token_accuracy": 0.7835237330198288,
"num_tokens": 756242660.0,
"step": 24950
},
{
"entropy": 0.41097776919603346,
"epoch": 4.665049449524165,
"grad_norm": 0.4921875,
"learning_rate": 3.3004269803837223e-06,
"loss": 0.4833,
"mean_token_accuracy": 0.7847666722536087,
"num_tokens": 757779830.0,
"step": 25000
},
{
"epoch": 4.665049449524165,
"eval_entropy": 0.425675423730404,
"eval_loss": 0.49867185950279236,
"eval_mean_token_accuracy": 0.7805509616475587,
"eval_num_tokens": 757779830.0,
"eval_runtime": 16.0252,
"eval_samples_per_second": 54.102,
"eval_steps_per_second": 6.802,
"step": 25000
},
{
"entropy": 0.4132141026854515,
"epoch": 4.674379548423214,
"grad_norm": 1.2109375,
"learning_rate": 3.2839903481570305e-06,
"loss": 0.4843,
"mean_token_accuracy": 0.7841847789287567,
"num_tokens": 759296685.0,
"step": 25050
},
{
"entropy": 0.40370859257876873,
"epoch": 4.683709647322262,
"grad_norm": 0.4765625,
"learning_rate": 3.268011308535733e-06,
"loss": 0.4746,
"mean_token_accuracy": 0.7883295321464538,
"num_tokens": 760792002.0,
"step": 25100
},
{
"entropy": 0.4057003001868725,
"epoch": 4.69303974622131,
"grad_norm": 0.55859375,
"learning_rate": 3.252490414779895e-06,
"loss": 0.4792,
"mean_token_accuracy": 0.7840817874670029,
"num_tokens": 762288394.0,
"step": 25150
},
{
"entropy": 0.4204439736157656,
"epoch": 4.702369845120359,
"grad_norm": 0.671875,
"learning_rate": 3.2374282042866876e-06,
"loss": 0.4853,
"mean_token_accuracy": 0.7837431621551514,
"num_tokens": 763765331.0,
"step": 25200
},
{
"entropy": 0.4071824544668198,
"epoch": 4.711699944019407,
"grad_norm": 1.1796875,
"learning_rate": 3.2228251985717824e-06,
"loss": 0.4852,
"mean_token_accuracy": 0.7848959761857986,
"num_tokens": 765243737.0,
"step": 25250
},
{
"entropy": 0.43803592413663867,
"epoch": 4.721030042918455,
"grad_norm": 0.8515625,
"learning_rate": 3.208681903251291e-06,
"loss": 0.4973,
"mean_token_accuracy": 0.7763216584920883,
"num_tokens": 766812993.0,
"step": 25300
},
{
"entropy": 0.4035507388412952,
"epoch": 4.730360141817504,
"grad_norm": 0.8359375,
"learning_rate": 3.1949988080242665e-06,
"loss": 0.4751,
"mean_token_accuracy": 0.7876280504465103,
"num_tokens": 768318546.0,
"step": 25350
},
{
"entropy": 0.41653711020946504,
"epoch": 4.739690240716552,
"grad_norm": 0.625,
"learning_rate": 3.181776386655733e-06,
"loss": 0.4859,
"mean_token_accuracy": 0.7842164701223373,
"num_tokens": 769827713.0,
"step": 25400
},
{
"entropy": 0.43134715765714643,
"epoch": 4.749020339615599,
"grad_norm": 0.66015625,
"learning_rate": 3.1690150969603e-06,
"loss": 0.4975,
"mean_token_accuracy": 0.7809195184707641,
"num_tokens": 771418246.0,
"step": 25450
},
{
"entropy": 0.4131816050410271,
"epoch": 4.758350438514649,
"grad_norm": 0.384765625,
"learning_rate": 3.1567153807862953e-06,
"loss": 0.4821,
"mean_token_accuracy": 0.7868939906358718,
"num_tokens": 772904428.0,
"step": 25500
},
{
"entropy": 0.4172322556376457,
"epoch": 4.767680537413696,
"grad_norm": 0.458984375,
"learning_rate": 3.1448776640004756e-06,
"loss": 0.4864,
"mean_token_accuracy": 0.7821826964616776,
"num_tokens": 774432393.0,
"step": 25550
},
{
"entropy": 0.410623489767313,
"epoch": 4.777010636312745,
"grad_norm": 0.7578125,
"learning_rate": 3.133502356473279e-06,
"loss": 0.4765,
"mean_token_accuracy": 0.7861051166057587,
"num_tokens": 775955056.0,
"step": 25600
},
{
"entropy": 0.4126855818927288,
"epoch": 4.786340735211793,
"grad_norm": 1.1875,
"learning_rate": 3.1225898520646354e-06,
"loss": 0.4791,
"mean_token_accuracy": 0.7849954336881637,
"num_tokens": 777481570.0,
"step": 25650
},
{
"entropy": 0.42760392755270005,
"epoch": 4.795670834110842,
"grad_norm": 0.54296875,
"learning_rate": 3.112140528610325e-06,
"loss": 0.4888,
"mean_token_accuracy": 0.7792460584640503,
"num_tokens": 779084872.0,
"step": 25700
},
{
"entropy": 0.4246444535255432,
"epoch": 4.8050009330098895,
"grad_norm": 0.58203125,
"learning_rate": 3.102154747908898e-06,
"loss": 0.4901,
"mean_token_accuracy": 0.779129432439804,
"num_tokens": 780621011.0,
"step": 25750
},
{
"entropy": 0.4207975560426712,
"epoch": 4.814331031908938,
"grad_norm": 0.53515625,
"learning_rate": 3.0926328557091484e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.7824300426244736,
"num_tokens": 782204557.0,
"step": 25800
},
{
"entropy": 0.40202761128544806,
"epoch": 4.823661130807986,
"grad_norm": 0.65625,
"learning_rate": 3.0835751816981437e-06,
"loss": 0.4742,
"mean_token_accuracy": 0.7883801186084747,
"num_tokens": 783710597.0,
"step": 25850
},
{
"entropy": 0.4042065401375294,
"epoch": 4.8329912297070345,
"grad_norm": 0.890625,
"learning_rate": 3.0749820394898103e-06,
"loss": 0.4773,
"mean_token_accuracy": 0.7863856315612793,
"num_tokens": 785204376.0,
"step": 25900
},
{
"entropy": 0.4113363729417324,
"epoch": 4.842321328606083,
"grad_norm": 0.42578125,
"learning_rate": 3.066853726614068e-06,
"loss": 0.4836,
"mean_token_accuracy": 0.7829122406244278,
"num_tokens": 786745388.0,
"step": 25950
},
{
"entropy": 0.4050539457052946,
"epoch": 4.851651427505131,
"grad_norm": 1.0546875,
"learning_rate": 3.0591905245065378e-06,
"loss": 0.4782,
"mean_token_accuracy": 0.7893619048595428,
"num_tokens": 788238814.0,
"step": 26000
},
{
"epoch": 4.851651427505131,
"eval_entropy": 0.42556045840092754,
"eval_loss": 0.4986078143119812,
"eval_mean_token_accuracy": 0.7803256960090147,
"eval_num_tokens": 788238814.0,
"eval_runtime": 16.1117,
"eval_samples_per_second": 53.812,
"eval_steps_per_second": 6.765,
"step": 26000
},
{
"entropy": 0.4172664260864258,
"epoch": 4.86098152640418,
"grad_norm": 0.8828125,
"learning_rate": 3.0519926984987924e-06,
"loss": 0.4896,
"mean_token_accuracy": 0.781678112745285,
"num_tokens": 789737207.0,
"step": 26050
},
{
"entropy": 0.4131707660853863,
"epoch": 4.870311625303228,
"grad_norm": 0.484375,
"learning_rate": 3.045260497809169e-06,
"loss": 0.4816,
"mean_token_accuracy": 0.7855587202310562,
"num_tokens": 791272449.0,
"step": 26100
},
{
"entropy": 0.438031694740057,
"epoch": 4.879641724202276,
"grad_norm": 0.43359375,
"learning_rate": 3.0389941555341412e-06,
"loss": 0.4988,
"mean_token_accuracy": 0.7749890965223313,
"num_tokens": 792843109.0,
"step": 26150
},
{
"entropy": 0.41347076088190077,
"epoch": 4.888971823101325,
"grad_norm": 0.66015625,
"learning_rate": 3.03319388864025e-06,
"loss": 0.4862,
"mean_token_accuracy": 0.782042904496193,
"num_tokens": 794349700.0,
"step": 26200
},
{
"entropy": 0.4158082590997219,
"epoch": 4.898301922000373,
"grad_norm": 0.5,
"learning_rate": 3.0278598979565877e-06,
"loss": 0.4888,
"mean_token_accuracy": 0.7814363497495651,
"num_tokens": 795874145.0,
"step": 26250
},
{
"entropy": 0.4144511626660824,
"epoch": 4.907632020899421,
"grad_norm": 0.373046875,
"learning_rate": 3.0229923681678497e-06,
"loss": 0.4811,
"mean_token_accuracy": 0.7852653992176056,
"num_tokens": 797377284.0,
"step": 26300
},
{
"entropy": 0.4087340448796749,
"epoch": 4.91696211979847,
"grad_norm": 0.3828125,
"learning_rate": 3.018591467807935e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.7855605220794678,
"num_tokens": 798886169.0,
"step": 26350
},
{
"entropy": 0.4149911729991436,
"epoch": 4.926292218697518,
"grad_norm": 0.478515625,
"learning_rate": 3.0146573492541123e-06,
"loss": 0.4865,
"mean_token_accuracy": 0.7815834748744964,
"num_tokens": 800437490.0,
"step": 26400
},
{
"entropy": 0.4159658246487379,
"epoch": 4.935622317596566,
"grad_norm": 0.498046875,
"learning_rate": 3.0111901487217452e-06,
"loss": 0.485,
"mean_token_accuracy": 0.7831065011024475,
"num_tokens": 801983835.0,
"step": 26450
},
{
"entropy": 0.4220912031829357,
"epoch": 4.944952416495615,
"grad_norm": 0.51171875,
"learning_rate": 3.008189986259573e-06,
"loss": 0.4914,
"mean_token_accuracy": 0.7806734621524811,
"num_tokens": 803525044.0,
"step": 26500
},
{
"entropy": 0.4241555346548557,
"epoch": 4.954282515394663,
"grad_norm": 0.45703125,
"learning_rate": 3.0056569657455626e-06,
"loss": 0.4879,
"mean_token_accuracy": 0.7803041088581085,
"num_tokens": 805085203.0,
"step": 26550
},
{
"entropy": 0.4313288567960262,
"epoch": 4.963612614293711,
"grad_norm": 0.546875,
"learning_rate": 3.0035911748832985e-06,
"loss": 0.4982,
"mean_token_accuracy": 0.7780981206893921,
"num_tokens": 806639306.0,
"step": 26600
},
{
"entropy": 0.40001085847616197,
"epoch": 4.97294271319276,
"grad_norm": 0.6328125,
"learning_rate": 3.0019926851989556e-06,
"loss": 0.4735,
"mean_token_accuracy": 0.7876440799236297,
"num_tokens": 808101409.0,
"step": 26650
},
{
"entropy": 0.4143665814399719,
"epoch": 4.982272812091808,
"grad_norm": 1.0,
"learning_rate": 3.000861552038823e-06,
"loss": 0.4829,
"mean_token_accuracy": 0.7828416174650192,
"num_tokens": 809661206.0,
"step": 26700
},
{
"entropy": 0.410667944252491,
"epoch": 4.991602910990856,
"grad_norm": 0.28125,
"learning_rate": 3.0001978145673808e-06,
"loss": 0.4815,
"mean_token_accuracy": 0.7867605596780777,
"num_tokens": 811153343.0,
"step": 26750
}
],
"logging_steps": 50,
"max_steps": 26795,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4288887116625084e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}