medgemma_check / trainer_state.json
pablopimentel's picture
Training in progress, epoch 1
62f063d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 13947,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0035850646207897896,
"grad_norm": 28.958446502685547,
"learning_rate": 3.512544802867384e-06,
"loss": 14.3981,
"mean_token_accuracy": 0.4658013021945953,
"num_tokens": 631305.0,
"step": 50
},
{
"epoch": 0.0035850646207897896,
"eval_loss": 3.598968505859375,
"eval_mean_token_accuracy": 0.4642415362596512,
"eval_num_tokens": 631305.0,
"eval_runtime": 55.3723,
"eval_samples_per_second": 7.224,
"eval_steps_per_second": 0.903,
"step": 50
},
{
"epoch": 0.007170129241579579,
"grad_norm": 46.98331832885742,
"learning_rate": 7.096774193548387e-06,
"loss": 13.6155,
"mean_token_accuracy": 0.47677032694220545,
"num_tokens": 1263143.0,
"step": 100
},
{
"epoch": 0.007170129241579579,
"eval_loss": 3.242854595184326,
"eval_mean_token_accuracy": 0.4895547354221344,
"eval_num_tokens": 1263143.0,
"eval_runtime": 56.3676,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 0.887,
"step": 100
},
{
"epoch": 0.01075519386236937,
"grad_norm": 23.78474235534668,
"learning_rate": 1.0681003584229391e-05,
"loss": 11.8849,
"mean_token_accuracy": 0.5030167695879936,
"num_tokens": 1896120.0,
"step": 150
},
{
"epoch": 0.01075519386236937,
"eval_loss": 2.791355609893799,
"eval_mean_token_accuracy": 0.5165965485572815,
"eval_num_tokens": 1896120.0,
"eval_runtime": 55.265,
"eval_samples_per_second": 7.238,
"eval_steps_per_second": 0.905,
"step": 150
},
{
"epoch": 0.014340258483159158,
"grad_norm": 11.690914154052734,
"learning_rate": 1.4265232974910395e-05,
"loss": 9.8852,
"mean_token_accuracy": 0.5450691656768322,
"num_tokens": 2527332.0,
"step": 200
},
{
"epoch": 0.014340258483159158,
"eval_loss": 2.1748604774475098,
"eval_mean_token_accuracy": 0.5758289074897767,
"eval_num_tokens": 2527332.0,
"eval_runtime": 55.335,
"eval_samples_per_second": 7.229,
"eval_steps_per_second": 0.904,
"step": 200
},
{
"epoch": 0.01792532310394895,
"grad_norm": 9.011432647705078,
"learning_rate": 1.78494623655914e-05,
"loss": 7.7315,
"mean_token_accuracy": 0.5919728323817253,
"num_tokens": 3158451.0,
"step": 250
},
{
"epoch": 0.01792532310394895,
"eval_loss": 1.7914152145385742,
"eval_mean_token_accuracy": 0.6036396706104279,
"eval_num_tokens": 3158451.0,
"eval_runtime": 55.3537,
"eval_samples_per_second": 7.226,
"eval_steps_per_second": 0.903,
"step": 250
},
{
"epoch": 0.02151038772473874,
"grad_norm": 9.172738075256348,
"learning_rate": 2.1433691756272405e-05,
"loss": 6.6634,
"mean_token_accuracy": 0.6193091833591461,
"num_tokens": 3790537.0,
"step": 300
},
{
"epoch": 0.02151038772473874,
"eval_loss": 1.5858986377716064,
"eval_mean_token_accuracy": 0.6325684702396392,
"eval_num_tokens": 3790537.0,
"eval_runtime": 55.5221,
"eval_samples_per_second": 7.204,
"eval_steps_per_second": 0.901,
"step": 300
},
{
"epoch": 0.025095452345528527,
"grad_norm": 6.380577087402344,
"learning_rate": 2.5017921146953403e-05,
"loss": 5.9955,
"mean_token_accuracy": 0.6299453395605087,
"num_tokens": 4416803.0,
"step": 350
},
{
"epoch": 0.025095452345528527,
"eval_loss": 1.4283970594406128,
"eval_mean_token_accuracy": 0.637500970363617,
"eval_num_tokens": 4416803.0,
"eval_runtime": 55.6098,
"eval_samples_per_second": 7.193,
"eval_steps_per_second": 0.899,
"step": 350
},
{
"epoch": 0.028680516966318317,
"grad_norm": 8.31059455871582,
"learning_rate": 2.860215053763441e-05,
"loss": 5.6524,
"mean_token_accuracy": 0.6386667934060096,
"num_tokens": 5049525.0,
"step": 400
},
{
"epoch": 0.028680516966318317,
"eval_loss": 1.4044820070266724,
"eval_mean_token_accuracy": 0.6408572208881378,
"eval_num_tokens": 5049525.0,
"eval_runtime": 55.375,
"eval_samples_per_second": 7.223,
"eval_steps_per_second": 0.903,
"step": 400
},
{
"epoch": 0.03226558158710811,
"grad_norm": 8.33178997039795,
"learning_rate": 3.218637992831541e-05,
"loss": 5.5798,
"mean_token_accuracy": 0.6421743601560592,
"num_tokens": 5681852.0,
"step": 450
},
{
"epoch": 0.03226558158710811,
"eval_loss": 1.390726923942566,
"eval_mean_token_accuracy": 0.642348815202713,
"eval_num_tokens": 5681852.0,
"eval_runtime": 55.3167,
"eval_samples_per_second": 7.231,
"eval_steps_per_second": 0.904,
"step": 450
},
{
"epoch": 0.0358506462078979,
"grad_norm": 6.159327507019043,
"learning_rate": 3.577060931899642e-05,
"loss": 5.5753,
"mean_token_accuracy": 0.6416634133458138,
"num_tokens": 6314159.0,
"step": 500
},
{
"epoch": 0.0358506462078979,
"eval_loss": 1.3759286403656006,
"eval_mean_token_accuracy": 0.6448968076705932,
"eval_num_tokens": 6314159.0,
"eval_runtime": 55.4132,
"eval_samples_per_second": 7.219,
"eval_steps_per_second": 0.902,
"step": 500
},
{
"epoch": 0.03943571082868769,
"grad_norm": 7.295239448547363,
"learning_rate": 3.935483870967742e-05,
"loss": 5.4486,
"mean_token_accuracy": 0.6444561332464218,
"num_tokens": 6948430.0,
"step": 550
},
{
"epoch": 0.03943571082868769,
"eval_loss": 1.3677067756652832,
"eval_mean_token_accuracy": 0.6456243467330932,
"eval_num_tokens": 6948430.0,
"eval_runtime": 55.4348,
"eval_samples_per_second": 7.216,
"eval_steps_per_second": 0.902,
"step": 550
},
{
"epoch": 0.04302077544947748,
"grad_norm": 8.140225410461426,
"learning_rate": 4.2939068100358425e-05,
"loss": 5.491,
"mean_token_accuracy": 0.6452211833000183,
"num_tokens": 7574739.0,
"step": 600
},
{
"epoch": 0.04302077544947748,
"eval_loss": 1.3580710887908936,
"eval_mean_token_accuracy": 0.6465290606021881,
"eval_num_tokens": 7574739.0,
"eval_runtime": 55.3881,
"eval_samples_per_second": 7.222,
"eval_steps_per_second": 0.903,
"step": 600
},
{
"epoch": 0.04660584007026727,
"grad_norm": 7.000651836395264,
"learning_rate": 4.6523297491039434e-05,
"loss": 5.4196,
"mean_token_accuracy": 0.6482405418157577,
"num_tokens": 8203512.0,
"step": 650
},
{
"epoch": 0.04660584007026727,
"eval_loss": 1.3494269847869873,
"eval_mean_token_accuracy": 0.6481932699680328,
"eval_num_tokens": 8203512.0,
"eval_runtime": 55.3883,
"eval_samples_per_second": 7.222,
"eval_steps_per_second": 0.903,
"step": 650
},
{
"epoch": 0.05019090469105705,
"grad_norm": 8.582626342773438,
"learning_rate": 5.0107526881720436e-05,
"loss": 5.3867,
"mean_token_accuracy": 0.650465478003025,
"num_tokens": 8831306.0,
"step": 700
},
{
"epoch": 0.05019090469105705,
"eval_loss": 1.3439626693725586,
"eval_mean_token_accuracy": 0.6484399271011353,
"eval_num_tokens": 8831306.0,
"eval_runtime": 55.4414,
"eval_samples_per_second": 7.215,
"eval_steps_per_second": 0.902,
"step": 700
},
{
"epoch": 0.05377596931184685,
"grad_norm": 8.785462379455566,
"learning_rate": 5.369175627240144e-05,
"loss": 5.3822,
"mean_token_accuracy": 0.6480473777651787,
"num_tokens": 9462041.0,
"step": 750
},
{
"epoch": 0.05377596931184685,
"eval_loss": 1.3385406732559204,
"eval_mean_token_accuracy": 0.649928457736969,
"eval_num_tokens": 9462041.0,
"eval_runtime": 55.3213,
"eval_samples_per_second": 7.23,
"eval_steps_per_second": 0.904,
"step": 750
},
{
"epoch": 0.05736103393263663,
"grad_norm": 6.1994547843933105,
"learning_rate": 5.727598566308244e-05,
"loss": 5.305,
"mean_token_accuracy": 0.6530899196863175,
"num_tokens": 10095648.0,
"step": 800
},
{
"epoch": 0.05736103393263663,
"eval_loss": 1.333341360092163,
"eval_mean_token_accuracy": 0.6506243336200714,
"eval_num_tokens": 10095648.0,
"eval_runtime": 55.4523,
"eval_samples_per_second": 7.213,
"eval_steps_per_second": 0.902,
"step": 800
},
{
"epoch": 0.06094609855342643,
"grad_norm": 5.850490570068359,
"learning_rate": 6.086021505376345e-05,
"loss": 5.3301,
"mean_token_accuracy": 0.6499336344003678,
"num_tokens": 10730377.0,
"step": 850
},
{
"epoch": 0.06094609855342643,
"eval_loss": 1.3294757604599,
"eval_mean_token_accuracy": 0.6494286286830903,
"eval_num_tokens": 10730377.0,
"eval_runtime": 55.628,
"eval_samples_per_second": 7.191,
"eval_steps_per_second": 0.899,
"step": 850
},
{
"epoch": 0.06453116317421621,
"grad_norm": 5.629384517669678,
"learning_rate": 6.444444444444446e-05,
"loss": 5.2911,
"mean_token_accuracy": 0.6521104833483696,
"num_tokens": 11363798.0,
"step": 900
},
{
"epoch": 0.06453116317421621,
"eval_loss": 1.3235622644424438,
"eval_mean_token_accuracy": 0.6512654149532318,
"eval_num_tokens": 11363798.0,
"eval_runtime": 55.5848,
"eval_samples_per_second": 7.196,
"eval_steps_per_second": 0.9,
"step": 900
},
{
"epoch": 0.068116227795006,
"grad_norm": 6.046393871307373,
"learning_rate": 6.802867383512545e-05,
"loss": 5.2478,
"mean_token_accuracy": 0.6536632561683655,
"num_tokens": 11993502.0,
"step": 950
},
{
"epoch": 0.068116227795006,
"eval_loss": 1.3197156190872192,
"eval_mean_token_accuracy": 0.6517732429504395,
"eval_num_tokens": 11993502.0,
"eval_runtime": 55.3206,
"eval_samples_per_second": 7.231,
"eval_steps_per_second": 0.904,
"step": 950
},
{
"epoch": 0.0717012924157958,
"grad_norm": 6.950500011444092,
"learning_rate": 7.161290322580646e-05,
"loss": 5.2368,
"mean_token_accuracy": 0.6554682296514511,
"num_tokens": 12628081.0,
"step": 1000
},
{
"epoch": 0.0717012924157958,
"eval_loss": 1.3148993253707886,
"eval_mean_token_accuracy": 0.652986958026886,
"eval_num_tokens": 12628081.0,
"eval_runtime": 55.4045,
"eval_samples_per_second": 7.22,
"eval_steps_per_second": 0.902,
"step": 1000
},
{
"epoch": 0.07528635703658558,
"grad_norm": 5.844649791717529,
"learning_rate": 7.519713261648746e-05,
"loss": 5.2604,
"mean_token_accuracy": 0.6538248571753502,
"num_tokens": 13254893.0,
"step": 1050
},
{
"epoch": 0.07528635703658558,
"eval_loss": 1.3124916553497314,
"eval_mean_token_accuracy": 0.6538188600540161,
"eval_num_tokens": 13254893.0,
"eval_runtime": 56.9195,
"eval_samples_per_second": 7.027,
"eval_steps_per_second": 0.878,
"step": 1050
},
{
"epoch": 0.07887142165737537,
"grad_norm": 5.3114094734191895,
"learning_rate": 7.878136200716845e-05,
"loss": 5.235,
"mean_token_accuracy": 0.6541680765151977,
"num_tokens": 13893524.0,
"step": 1100
},
{
"epoch": 0.07887142165737537,
"eval_loss": 1.309714674949646,
"eval_mean_token_accuracy": 0.6538467502593994,
"eval_num_tokens": 13893524.0,
"eval_runtime": 56.3705,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 0.887,
"step": 1100
},
{
"epoch": 0.08245648627816517,
"grad_norm": 5.666459083557129,
"learning_rate": 8.236559139784946e-05,
"loss": 5.1803,
"mean_token_accuracy": 0.6568083089590072,
"num_tokens": 14522906.0,
"step": 1150
},
{
"epoch": 0.08245648627816517,
"eval_loss": 1.305640459060669,
"eval_mean_token_accuracy": 0.655296059846878,
"eval_num_tokens": 14522906.0,
"eval_runtime": 57.2106,
"eval_samples_per_second": 6.992,
"eval_steps_per_second": 0.874,
"step": 1150
},
{
"epoch": 0.08604155089895496,
"grad_norm": 6.020337104797363,
"learning_rate": 8.594982078853047e-05,
"loss": 5.2056,
"mean_token_accuracy": 0.653913055062294,
"num_tokens": 15156803.0,
"step": 1200
},
{
"epoch": 0.08604155089895496,
"eval_loss": 1.3023688793182373,
"eval_mean_token_accuracy": 0.6569918835163117,
"eval_num_tokens": 15156803.0,
"eval_runtime": 56.4134,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 0.886,
"step": 1200
},
{
"epoch": 0.08962661551974474,
"grad_norm": 5.757259368896484,
"learning_rate": 8.953405017921147e-05,
"loss": 5.2154,
"mean_token_accuracy": 0.6549820226430892,
"num_tokens": 15788828.0,
"step": 1250
},
{
"epoch": 0.08962661551974474,
"eval_loss": 1.3033726215362549,
"eval_mean_token_accuracy": 0.6548340058326722,
"eval_num_tokens": 15788828.0,
"eval_runtime": 56.3999,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 0.887,
"step": 1250
},
{
"epoch": 0.09321168014053453,
"grad_norm": 6.876058101654053,
"learning_rate": 9.311827956989248e-05,
"loss": 5.2374,
"mean_token_accuracy": 0.6526922315359116,
"num_tokens": 16423385.0,
"step": 1300
},
{
"epoch": 0.09321168014053453,
"eval_loss": 1.2987463474273682,
"eval_mean_token_accuracy": 0.6556110656261445,
"eval_num_tokens": 16423385.0,
"eval_runtime": 56.3308,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 0.888,
"step": 1300
},
{
"epoch": 0.09679674476132433,
"grad_norm": 5.170133590698242,
"learning_rate": 9.670250896057349e-05,
"loss": 5.2584,
"mean_token_accuracy": 0.6529216593503953,
"num_tokens": 17058608.0,
"step": 1350
},
{
"epoch": 0.09679674476132433,
"eval_loss": 1.2979986667633057,
"eval_mean_token_accuracy": 0.6553151261806488,
"eval_num_tokens": 17058608.0,
"eval_runtime": 56.3719,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 0.887,
"step": 1350
},
{
"epoch": 0.1003818093821141,
"grad_norm": 5.678673267364502,
"learning_rate": 9.996813256851498e-05,
"loss": 5.1909,
"mean_token_accuracy": 0.6570302325487137,
"num_tokens": 17689690.0,
"step": 1400
},
{
"epoch": 0.1003818093821141,
"eval_loss": 1.2949328422546387,
"eval_mean_token_accuracy": 0.6552190041542053,
"eval_num_tokens": 17689690.0,
"eval_runtime": 56.3054,
"eval_samples_per_second": 7.104,
"eval_steps_per_second": 0.888,
"step": 1400
},
{
"epoch": 0.1039668740029039,
"grad_norm": 4.694892406463623,
"learning_rate": 9.956978967495221e-05,
"loss": 5.1132,
"mean_token_accuracy": 0.6600784501433372,
"num_tokens": 18321232.0,
"step": 1450
},
{
"epoch": 0.1039668740029039,
"eval_loss": 1.2946751117706299,
"eval_mean_token_accuracy": 0.6560806667804718,
"eval_num_tokens": 18321232.0,
"eval_runtime": 56.7345,
"eval_samples_per_second": 7.05,
"eval_steps_per_second": 0.881,
"step": 1450
},
{
"epoch": 0.1075519386236937,
"grad_norm": 5.286959171295166,
"learning_rate": 9.917144678138942e-05,
"loss": 5.2297,
"mean_token_accuracy": 0.6539956346154213,
"num_tokens": 18952518.0,
"step": 1500
},
{
"epoch": 0.1075519386236937,
"eval_loss": 1.2900216579437256,
"eval_mean_token_accuracy": 0.6562173092365264,
"eval_num_tokens": 18952518.0,
"eval_runtime": 56.3853,
"eval_samples_per_second": 7.094,
"eval_steps_per_second": 0.887,
"step": 1500
},
{
"epoch": 0.11113700324448349,
"grad_norm": 5.229610443115234,
"learning_rate": 9.877310388782664e-05,
"loss": 5.1376,
"mean_token_accuracy": 0.6599933451414108,
"num_tokens": 19580453.0,
"step": 1550
},
{
"epoch": 0.11113700324448349,
"eval_loss": 1.2871261835098267,
"eval_mean_token_accuracy": 0.6577617633342743,
"eval_num_tokens": 19580453.0,
"eval_runtime": 56.3376,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 0.888,
"step": 1550
},
{
"epoch": 0.11472206786527327,
"grad_norm": 4.540684223175049,
"learning_rate": 9.837476099426386e-05,
"loss": 5.1124,
"mean_token_accuracy": 0.659881052672863,
"num_tokens": 20220713.0,
"step": 1600
},
{
"epoch": 0.11472206786527327,
"eval_loss": 1.2855585813522339,
"eval_mean_token_accuracy": 0.657675279378891,
"eval_num_tokens": 20220713.0,
"eval_runtime": 56.4741,
"eval_samples_per_second": 7.083,
"eval_steps_per_second": 0.885,
"step": 1600
},
{
"epoch": 0.11830713248606306,
"grad_norm": 5.147482872009277,
"learning_rate": 9.797641810070109e-05,
"loss": 5.1251,
"mean_token_accuracy": 0.658254965543747,
"num_tokens": 20853860.0,
"step": 1650
},
{
"epoch": 0.11830713248606306,
"eval_loss": 1.283848762512207,
"eval_mean_token_accuracy": 0.6582966887950897,
"eval_num_tokens": 20853860.0,
"eval_runtime": 57.6401,
"eval_samples_per_second": 6.94,
"eval_steps_per_second": 0.867,
"step": 1650
},
{
"epoch": 0.12189219710685285,
"grad_norm": 4.544667720794678,
"learning_rate": 9.757807520713831e-05,
"loss": 5.0706,
"mean_token_accuracy": 0.6628770676255226,
"num_tokens": 21487498.0,
"step": 1700
},
{
"epoch": 0.12189219710685285,
"eval_loss": 1.2798463106155396,
"eval_mean_token_accuracy": 0.6587248671054841,
"eval_num_tokens": 21487498.0,
"eval_runtime": 56.4142,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 0.886,
"step": 1700
},
{
"epoch": 0.12547726172764265,
"grad_norm": 4.541973114013672,
"learning_rate": 9.717973231357553e-05,
"loss": 5.054,
"mean_token_accuracy": 0.6628148990869522,
"num_tokens": 22120725.0,
"step": 1750
},
{
"epoch": 0.12547726172764265,
"eval_loss": 1.278252124786377,
"eval_mean_token_accuracy": 0.6595626533031463,
"eval_num_tokens": 22120725.0,
"eval_runtime": 56.5087,
"eval_samples_per_second": 7.079,
"eval_steps_per_second": 0.885,
"step": 1750
},
{
"epoch": 0.12906232634843243,
"grad_norm": 5.01814603805542,
"learning_rate": 9.678138942001275e-05,
"loss": 5.1334,
"mean_token_accuracy": 0.6570143532752991,
"num_tokens": 22751630.0,
"step": 1800
},
{
"epoch": 0.12906232634843243,
"eval_loss": 1.2745345830917358,
"eval_mean_token_accuracy": 0.6588572013378143,
"eval_num_tokens": 22751630.0,
"eval_runtime": 56.4708,
"eval_samples_per_second": 7.083,
"eval_steps_per_second": 0.885,
"step": 1800
},
{
"epoch": 0.1326473909692222,
"grad_norm": 5.249142646789551,
"learning_rate": 9.638304652644997e-05,
"loss": 5.0772,
"mean_token_accuracy": 0.6610330584645271,
"num_tokens": 23380871.0,
"step": 1850
},
{
"epoch": 0.1326473909692222,
"eval_loss": 1.271730661392212,
"eval_mean_token_accuracy": 0.6606413364410401,
"eval_num_tokens": 23380871.0,
"eval_runtime": 56.3956,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 0.887,
"step": 1850
},
{
"epoch": 0.136232455590012,
"grad_norm": 4.495316505432129,
"learning_rate": 9.598470363288719e-05,
"loss": 5.1115,
"mean_token_accuracy": 0.6598174887895584,
"num_tokens": 24016153.0,
"step": 1900
},
{
"epoch": 0.136232455590012,
"eval_loss": 1.2678121328353882,
"eval_mean_token_accuracy": 0.659992311000824,
"eval_num_tokens": 24016153.0,
"eval_runtime": 56.3408,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 0.887,
"step": 1900
},
{
"epoch": 0.1398175202108018,
"grad_norm": 4.675460338592529,
"learning_rate": 9.558636073932441e-05,
"loss": 5.1039,
"mean_token_accuracy": 0.6611927005648613,
"num_tokens": 24651410.0,
"step": 1950
},
{
"epoch": 0.1398175202108018,
"eval_loss": 1.2681583166122437,
"eval_mean_token_accuracy": 0.6607218337059021,
"eval_num_tokens": 24651410.0,
"eval_runtime": 56.0826,
"eval_samples_per_second": 7.132,
"eval_steps_per_second": 0.892,
"step": 1950
},
{
"epoch": 0.1434025848315916,
"grad_norm": 4.928748607635498,
"learning_rate": 9.518801784576164e-05,
"loss": 5.1005,
"mean_token_accuracy": 0.6606691733002663,
"num_tokens": 25282438.0,
"step": 2000
},
{
"epoch": 0.1434025848315916,
"eval_loss": 1.2662436962127686,
"eval_mean_token_accuracy": 0.6602835392951966,
"eval_num_tokens": 25282438.0,
"eval_runtime": 56.1345,
"eval_samples_per_second": 7.126,
"eval_steps_per_second": 0.891,
"step": 2000
},
{
"epoch": 0.14698764945238138,
"grad_norm": 4.237011432647705,
"learning_rate": 9.478967495219886e-05,
"loss": 5.0865,
"mean_token_accuracy": 0.6610729214549065,
"num_tokens": 25914467.0,
"step": 2050
},
{
"epoch": 0.14698764945238138,
"eval_loss": 1.2659285068511963,
"eval_mean_token_accuracy": 0.6619378459453583,
"eval_num_tokens": 25914467.0,
"eval_runtime": 56.41,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 0.886,
"step": 2050
},
{
"epoch": 0.15057271407317116,
"grad_norm": 4.498386383056641,
"learning_rate": 9.439133205863608e-05,
"loss": 5.0536,
"mean_token_accuracy": 0.662959768474102,
"num_tokens": 26547088.0,
"step": 2100
},
{
"epoch": 0.15057271407317116,
"eval_loss": 1.2620855569839478,
"eval_mean_token_accuracy": 0.6622272551059722,
"eval_num_tokens": 26547088.0,
"eval_runtime": 56.4966,
"eval_samples_per_second": 7.08,
"eval_steps_per_second": 0.885,
"step": 2100
},
{
"epoch": 0.15415777869396097,
"grad_norm": 4.547789573669434,
"learning_rate": 9.39929891650733e-05,
"loss": 5.0074,
"mean_token_accuracy": 0.6667503699660301,
"num_tokens": 27181781.0,
"step": 2150
},
{
"epoch": 0.15415777869396097,
"eval_loss": 1.2630900144577026,
"eval_mean_token_accuracy": 0.6616924941539765,
"eval_num_tokens": 27181781.0,
"eval_runtime": 56.423,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 0.886,
"step": 2150
},
{
"epoch": 0.15774284331475075,
"grad_norm": 4.9150896072387695,
"learning_rate": 9.359464627151052e-05,
"loss": 5.0802,
"mean_token_accuracy": 0.6615395992994308,
"num_tokens": 27816217.0,
"step": 2200
},
{
"epoch": 0.15774284331475075,
"eval_loss": 1.2614257335662842,
"eval_mean_token_accuracy": 0.6630010890960694,
"eval_num_tokens": 27816217.0,
"eval_runtime": 56.1652,
"eval_samples_per_second": 7.122,
"eval_steps_per_second": 0.89,
"step": 2200
},
{
"epoch": 0.16132790793554053,
"grad_norm": 4.487524032592773,
"learning_rate": 9.319630337794774e-05,
"loss": 5.0135,
"mean_token_accuracy": 0.6651386457681656,
"num_tokens": 28449167.0,
"step": 2250
},
{
"epoch": 0.16132790793554053,
"eval_loss": 1.2573643922805786,
"eval_mean_token_accuracy": 0.6616563200950623,
"eval_num_tokens": 28449167.0,
"eval_runtime": 56.2616,
"eval_samples_per_second": 7.11,
"eval_steps_per_second": 0.889,
"step": 2250
},
{
"epoch": 0.16491297255633033,
"grad_norm": 4.237537860870361,
"learning_rate": 9.279796048438496e-05,
"loss": 5.0198,
"mean_token_accuracy": 0.6647991991043091,
"num_tokens": 29080404.0,
"step": 2300
},
{
"epoch": 0.16491297255633033,
"eval_loss": 1.2570703029632568,
"eval_mean_token_accuracy": 0.6629821956157684,
"eval_num_tokens": 29080404.0,
"eval_runtime": 56.2199,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 0.889,
"step": 2300
},
{
"epoch": 0.1684980371771201,
"grad_norm": 4.835733890533447,
"learning_rate": 9.239961759082219e-05,
"loss": 5.0592,
"mean_token_accuracy": 0.6617502626776696,
"num_tokens": 29711200.0,
"step": 2350
},
{
"epoch": 0.1684980371771201,
"eval_loss": 1.2566256523132324,
"eval_mean_token_accuracy": 0.6639114606380463,
"eval_num_tokens": 29711200.0,
"eval_runtime": 56.2408,
"eval_samples_per_second": 7.112,
"eval_steps_per_second": 0.889,
"step": 2350
},
{
"epoch": 0.17208310179790992,
"grad_norm": 4.832096099853516,
"learning_rate": 9.200127469725941e-05,
"loss": 5.0603,
"mean_token_accuracy": 0.6628168100118637,
"num_tokens": 30343251.0,
"step": 2400
},
{
"epoch": 0.17208310179790992,
"eval_loss": 1.2529999017715454,
"eval_mean_token_accuracy": 0.6634249198436737,
"eval_num_tokens": 30343251.0,
"eval_runtime": 56.4282,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 0.886,
"step": 2400
},
{
"epoch": 0.1756681664186997,
"grad_norm": 4.870041370391846,
"learning_rate": 9.160293180369663e-05,
"loss": 5.0036,
"mean_token_accuracy": 0.6657980665564537,
"num_tokens": 30983003.0,
"step": 2450
},
{
"epoch": 0.1756681664186997,
"eval_loss": 1.2508057355880737,
"eval_mean_token_accuracy": 0.664600031375885,
"eval_num_tokens": 30983003.0,
"eval_runtime": 56.1477,
"eval_samples_per_second": 7.124,
"eval_steps_per_second": 0.891,
"step": 2450
},
{
"epoch": 0.17925323103948948,
"grad_norm": 4.8386993408203125,
"learning_rate": 9.120458891013385e-05,
"loss": 4.9378,
"mean_token_accuracy": 0.6697717472910881,
"num_tokens": 31612440.0,
"step": 2500
},
{
"epoch": 0.17925323103948948,
"eval_loss": 1.2504231929779053,
"eval_mean_token_accuracy": 0.6656661999225616,
"eval_num_tokens": 31612440.0,
"eval_runtime": 56.432,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 0.886,
"step": 2500
},
{
"epoch": 0.1828382956602793,
"grad_norm": 4.897119045257568,
"learning_rate": 9.080624601657107e-05,
"loss": 5.0576,
"mean_token_accuracy": 0.6637110111117362,
"num_tokens": 32246052.0,
"step": 2550
},
{
"epoch": 0.1828382956602793,
"eval_loss": 1.2486332654953003,
"eval_mean_token_accuracy": 0.6662761294841766,
"eval_num_tokens": 32246052.0,
"eval_runtime": 56.3188,
"eval_samples_per_second": 7.102,
"eval_steps_per_second": 0.888,
"step": 2550
},
{
"epoch": 0.18642336028106907,
"grad_norm": 4.67065954208374,
"learning_rate": 9.040790312300828e-05,
"loss": 5.0137,
"mean_token_accuracy": 0.6648938983678818,
"num_tokens": 32876620.0,
"step": 2600
},
{
"epoch": 0.18642336028106907,
"eval_loss": 1.2486134767532349,
"eval_mean_token_accuracy": 0.6653382694721222,
"eval_num_tokens": 32876620.0,
"eval_runtime": 56.3432,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 0.887,
"step": 2600
},
{
"epoch": 0.19000842490185885,
"grad_norm": 4.4095563888549805,
"learning_rate": 9.000956022944551e-05,
"loss": 5.0065,
"mean_token_accuracy": 0.6646731504797936,
"num_tokens": 33511123.0,
"step": 2650
},
{
"epoch": 0.19000842490185885,
"eval_loss": 1.2494382858276367,
"eval_mean_token_accuracy": 0.6639154195785523,
"eval_num_tokens": 33511123.0,
"eval_runtime": 56.3106,
"eval_samples_per_second": 7.103,
"eval_steps_per_second": 0.888,
"step": 2650
},
{
"epoch": 0.19359348952264865,
"grad_norm": 4.82889461517334,
"learning_rate": 8.961121733588274e-05,
"loss": 5.0066,
"mean_token_accuracy": 0.666558310687542,
"num_tokens": 34141280.0,
"step": 2700
},
{
"epoch": 0.19359348952264865,
"eval_loss": 1.2460081577301025,
"eval_mean_token_accuracy": 0.6646526777744293,
"eval_num_tokens": 34141280.0,
"eval_runtime": 56.6246,
"eval_samples_per_second": 7.064,
"eval_steps_per_second": 0.883,
"step": 2700
},
{
"epoch": 0.19717855414343843,
"grad_norm": 4.663321018218994,
"learning_rate": 8.921287444231994e-05,
"loss": 4.9428,
"mean_token_accuracy": 0.6681969156861305,
"num_tokens": 34774698.0,
"step": 2750
},
{
"epoch": 0.19717855414343843,
"eval_loss": 1.244221806526184,
"eval_mean_token_accuracy": 0.6656762886047364,
"eval_num_tokens": 34774698.0,
"eval_runtime": 56.2438,
"eval_samples_per_second": 7.112,
"eval_steps_per_second": 0.889,
"step": 2750
},
{
"epoch": 0.2007636187642282,
"grad_norm": 4.264768600463867,
"learning_rate": 8.881453154875718e-05,
"loss": 5.0028,
"mean_token_accuracy": 0.6653160175681114,
"num_tokens": 35406754.0,
"step": 2800
},
{
"epoch": 0.2007636187642282,
"eval_loss": 1.2440837621688843,
"eval_mean_token_accuracy": 0.6653203201293946,
"eval_num_tokens": 35406754.0,
"eval_runtime": 56.2314,
"eval_samples_per_second": 7.113,
"eval_steps_per_second": 0.889,
"step": 2800
},
{
"epoch": 0.20434868338501802,
"grad_norm": 4.938720226287842,
"learning_rate": 8.84161886551944e-05,
"loss": 4.9905,
"mean_token_accuracy": 0.666375992000103,
"num_tokens": 36037785.0,
"step": 2850
},
{
"epoch": 0.20434868338501802,
"eval_loss": 1.2425023317337036,
"eval_mean_token_accuracy": 0.664773497581482,
"eval_num_tokens": 36037785.0,
"eval_runtime": 56.4922,
"eval_samples_per_second": 7.081,
"eval_steps_per_second": 0.885,
"step": 2850
},
{
"epoch": 0.2079337480058078,
"grad_norm": 4.350741386413574,
"learning_rate": 8.801784576163161e-05,
"loss": 4.858,
"mean_token_accuracy": 0.6740978673100472,
"num_tokens": 36672636.0,
"step": 2900
},
{
"epoch": 0.2079337480058078,
"eval_loss": 1.2399791479110718,
"eval_mean_token_accuracy": 0.6653912532329559,
"eval_num_tokens": 36672636.0,
"eval_runtime": 57.5992,
"eval_samples_per_second": 6.945,
"eval_steps_per_second": 0.868,
"step": 2900
},
{
"epoch": 0.2115188126265976,
"grad_norm": 4.187928676605225,
"learning_rate": 8.761950286806884e-05,
"loss": 4.973,
"mean_token_accuracy": 0.6659010905027389,
"num_tokens": 37304390.0,
"step": 2950
},
{
"epoch": 0.2115188126265976,
"eval_loss": 1.239449143409729,
"eval_mean_token_accuracy": 0.6661972737312317,
"eval_num_tokens": 37304390.0,
"eval_runtime": 55.9136,
"eval_samples_per_second": 7.154,
"eval_steps_per_second": 0.894,
"step": 2950
},
{
"epoch": 0.2151038772473874,
"grad_norm": 4.3214802742004395,
"learning_rate": 8.722115997450606e-05,
"loss": 4.9911,
"mean_token_accuracy": 0.6659192404150963,
"num_tokens": 37937712.0,
"step": 3000
},
{
"epoch": 0.2151038772473874,
"eval_loss": 1.2380547523498535,
"eval_mean_token_accuracy": 0.6668792748451233,
"eval_num_tokens": 37937712.0,
"eval_runtime": 56.7402,
"eval_samples_per_second": 7.05,
"eval_steps_per_second": 0.881,
"step": 3000
},
{
"epoch": 0.21868894186817717,
"grad_norm": 5.154741287231445,
"learning_rate": 8.682281708094327e-05,
"loss": 4.9341,
"mean_token_accuracy": 0.6695549800992012,
"num_tokens": 38567208.0,
"step": 3050
},
{
"epoch": 0.21868894186817717,
"eval_loss": 1.2387843132019043,
"eval_mean_token_accuracy": 0.6668635201454163,
"eval_num_tokens": 38567208.0,
"eval_runtime": 56.2471,
"eval_samples_per_second": 7.111,
"eval_steps_per_second": 0.889,
"step": 3050
},
{
"epoch": 0.22227400648896697,
"grad_norm": 5.014278888702393,
"learning_rate": 8.64244741873805e-05,
"loss": 4.8853,
"mean_token_accuracy": 0.6707546302676201,
"num_tokens": 39198318.0,
"step": 3100
},
{
"epoch": 0.22227400648896697,
"eval_loss": 1.2394779920578003,
"eval_mean_token_accuracy": 0.6670789694786072,
"eval_num_tokens": 39198318.0,
"eval_runtime": 56.2239,
"eval_samples_per_second": 7.114,
"eval_steps_per_second": 0.889,
"step": 3100
},
{
"epoch": 0.22585907110975675,
"grad_norm": 4.228548049926758,
"learning_rate": 8.602613129381773e-05,
"loss": 4.9269,
"mean_token_accuracy": 0.6687791690230369,
"num_tokens": 39828524.0,
"step": 3150
},
{
"epoch": 0.22585907110975675,
"eval_loss": 1.2372474670410156,
"eval_mean_token_accuracy": 0.666018306016922,
"eval_num_tokens": 39828524.0,
"eval_runtime": 56.2199,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 0.889,
"step": 3150
},
{
"epoch": 0.22944413573054653,
"grad_norm": 4.169594764709473,
"learning_rate": 8.562778840025495e-05,
"loss": 4.9485,
"mean_token_accuracy": 0.6667472127079964,
"num_tokens": 40459992.0,
"step": 3200
},
{
"epoch": 0.22944413573054653,
"eval_loss": 1.2357257604599,
"eval_mean_token_accuracy": 0.6657440733909606,
"eval_num_tokens": 40459992.0,
"eval_runtime": 56.3901,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 0.887,
"step": 3200
},
{
"epoch": 0.23302920035133634,
"grad_norm": 4.309950828552246,
"learning_rate": 8.522944550669216e-05,
"loss": 4.9128,
"mean_token_accuracy": 0.671622729897499,
"num_tokens": 41094373.0,
"step": 3250
},
{
"epoch": 0.23302920035133634,
"eval_loss": 1.2348511219024658,
"eval_mean_token_accuracy": 0.6659791529178619,
"eval_num_tokens": 41094373.0,
"eval_runtime": 56.3939,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 0.887,
"step": 3250
},
{
"epoch": 0.23661426497212612,
"grad_norm": 4.153282642364502,
"learning_rate": 8.48311026131294e-05,
"loss": 4.9831,
"mean_token_accuracy": 0.66548932492733,
"num_tokens": 41725155.0,
"step": 3300
},
{
"epoch": 0.23661426497212612,
"eval_loss": 1.2328479290008545,
"eval_mean_token_accuracy": 0.6659755408763885,
"eval_num_tokens": 41725155.0,
"eval_runtime": 56.4734,
"eval_samples_per_second": 7.083,
"eval_steps_per_second": 0.885,
"step": 3300
},
{
"epoch": 0.2401993295929159,
"grad_norm": 4.901464462280273,
"learning_rate": 8.443275971956662e-05,
"loss": 4.9905,
"mean_token_accuracy": 0.6660814517736435,
"num_tokens": 42361406.0,
"step": 3350
},
{
"epoch": 0.2401993295929159,
"eval_loss": 1.2326833009719849,
"eval_mean_token_accuracy": 0.6672022414207458,
"eval_num_tokens": 42361406.0,
"eval_runtime": 56.2896,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 0.888,
"step": 3350
},
{
"epoch": 0.2437843942137057,
"grad_norm": 4.4632415771484375,
"learning_rate": 8.403441682600382e-05,
"loss": 4.8952,
"mean_token_accuracy": 0.6706425687670707,
"num_tokens": 42993909.0,
"step": 3400
},
{
"epoch": 0.2437843942137057,
"eval_loss": 1.2327020168304443,
"eval_mean_token_accuracy": 0.6668058276176453,
"eval_num_tokens": 42993909.0,
"eval_runtime": 56.2077,
"eval_samples_per_second": 7.116,
"eval_steps_per_second": 0.89,
"step": 3400
},
{
"epoch": 0.2473694588344955,
"grad_norm": 4.537699222564697,
"learning_rate": 8.363607393244104e-05,
"loss": 4.921,
"mean_token_accuracy": 0.6698031505942345,
"num_tokens": 43628010.0,
"step": 3450
},
{
"epoch": 0.2473694588344955,
"eval_loss": 1.2328044176101685,
"eval_mean_token_accuracy": 0.667005888223648,
"eval_num_tokens": 43628010.0,
"eval_runtime": 56.142,
"eval_samples_per_second": 7.125,
"eval_steps_per_second": 0.891,
"step": 3450
},
{
"epoch": 0.2509545234552853,
"grad_norm": 4.68520450592041,
"learning_rate": 8.323773103887828e-05,
"loss": 4.9443,
"mean_token_accuracy": 0.667299503982067,
"num_tokens": 44263542.0,
"step": 3500
},
{
"epoch": 0.2509545234552853,
"eval_loss": 1.2303454875946045,
"eval_mean_token_accuracy": 0.6684185063838959,
"eval_num_tokens": 44263542.0,
"eval_runtime": 56.0547,
"eval_samples_per_second": 7.136,
"eval_steps_per_second": 0.892,
"step": 3500
},
{
"epoch": 0.2545395880760751,
"grad_norm": 4.269311428070068,
"learning_rate": 8.283938814531549e-05,
"loss": 4.9117,
"mean_token_accuracy": 0.6701091477274894,
"num_tokens": 44896448.0,
"step": 3550
},
{
"epoch": 0.2545395880760751,
"eval_loss": 1.2301256656646729,
"eval_mean_token_accuracy": 0.6679345464706421,
"eval_num_tokens": 44896448.0,
"eval_runtime": 56.8192,
"eval_samples_per_second": 7.04,
"eval_steps_per_second": 0.88,
"step": 3550
},
{
"epoch": 0.25812465269686485,
"grad_norm": 4.6586198806762695,
"learning_rate": 8.244104525175271e-05,
"loss": 4.9361,
"mean_token_accuracy": 0.6700941568613052,
"num_tokens": 45535736.0,
"step": 3600
},
{
"epoch": 0.25812465269686485,
"eval_loss": 1.2280727624893188,
"eval_mean_token_accuracy": 0.668077005147934,
"eval_num_tokens": 45535736.0,
"eval_runtime": 56.7678,
"eval_samples_per_second": 7.046,
"eval_steps_per_second": 0.881,
"step": 3600
},
{
"epoch": 0.26170971731765463,
"grad_norm": 4.350837230682373,
"learning_rate": 8.204270235818994e-05,
"loss": 4.8535,
"mean_token_accuracy": 0.6710763236880303,
"num_tokens": 46168014.0,
"step": 3650
},
{
"epoch": 0.26170971731765463,
"eval_loss": 1.2273330688476562,
"eval_mean_token_accuracy": 0.6685185146331787,
"eval_num_tokens": 46168014.0,
"eval_runtime": 56.2347,
"eval_samples_per_second": 7.113,
"eval_steps_per_second": 0.889,
"step": 3650
},
{
"epoch": 0.2652947819384444,
"grad_norm": 4.489384174346924,
"learning_rate": 8.164435946462715e-05,
"loss": 4.9884,
"mean_token_accuracy": 0.6643109431862831,
"num_tokens": 46799865.0,
"step": 3700
},
{
"epoch": 0.2652947819384444,
"eval_loss": 1.228873610496521,
"eval_mean_token_accuracy": 0.6676431381702423,
"eval_num_tokens": 46799865.0,
"eval_runtime": 56.2386,
"eval_samples_per_second": 7.113,
"eval_steps_per_second": 0.889,
"step": 3700
},
{
"epoch": 0.26887984655923425,
"grad_norm": 4.438107967376709,
"learning_rate": 8.124601657106437e-05,
"loss": 4.8433,
"mean_token_accuracy": 0.6722122520208359,
"num_tokens": 47431886.0,
"step": 3750
},
{
"epoch": 0.26887984655923425,
"eval_loss": 1.2276620864868164,
"eval_mean_token_accuracy": 0.6684055602550507,
"eval_num_tokens": 47431886.0,
"eval_runtime": 56.3388,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 0.887,
"step": 3750
},
{
"epoch": 0.272464911180024,
"grad_norm": 4.851945877075195,
"learning_rate": 8.08476736775016e-05,
"loss": 4.9293,
"mean_token_accuracy": 0.668489234149456,
"num_tokens": 48065104.0,
"step": 3800
},
{
"epoch": 0.272464911180024,
"eval_loss": 1.2269046306610107,
"eval_mean_token_accuracy": 0.6687084710597992,
"eval_num_tokens": 48065104.0,
"eval_runtime": 56.1744,
"eval_samples_per_second": 7.121,
"eval_steps_per_second": 0.89,
"step": 3800
},
{
"epoch": 0.2760499758008138,
"grad_norm": 4.730586528778076,
"learning_rate": 8.044933078393882e-05,
"loss": 4.8258,
"mean_token_accuracy": 0.6736181953549385,
"num_tokens": 48699465.0,
"step": 3850
},
{
"epoch": 0.2760499758008138,
"eval_loss": 1.2258822917938232,
"eval_mean_token_accuracy": 0.6694431722164154,
"eval_num_tokens": 48699465.0,
"eval_runtime": 56.2799,
"eval_samples_per_second": 7.107,
"eval_steps_per_second": 0.888,
"step": 3850
},
{
"epoch": 0.2796350404216036,
"grad_norm": 4.539992809295654,
"learning_rate": 8.005098789037604e-05,
"loss": 4.9014,
"mean_token_accuracy": 0.6706485760211944,
"num_tokens": 49328518.0,
"step": 3900
},
{
"epoch": 0.2796350404216036,
"eval_loss": 1.2248101234436035,
"eval_mean_token_accuracy": 0.6695001828670502,
"eval_num_tokens": 49328518.0,
"eval_runtime": 56.4684,
"eval_samples_per_second": 7.084,
"eval_steps_per_second": 0.885,
"step": 3900
},
{
"epoch": 0.28322010504239337,
"grad_norm": 5.000583648681641,
"learning_rate": 7.965264499681326e-05,
"loss": 4.7606,
"mean_token_accuracy": 0.6768921792507172,
"num_tokens": 49957049.0,
"step": 3950
},
{
"epoch": 0.28322010504239337,
"eval_loss": 1.2227978706359863,
"eval_mean_token_accuracy": 0.6691736376285553,
"eval_num_tokens": 49957049.0,
"eval_runtime": 56.328,
"eval_samples_per_second": 7.101,
"eval_steps_per_second": 0.888,
"step": 3950
},
{
"epoch": 0.2868051696631832,
"grad_norm": 4.855432510375977,
"learning_rate": 7.925430210325048e-05,
"loss": 4.9544,
"mean_token_accuracy": 0.6679512014985085,
"num_tokens": 50593342.0,
"step": 4000
},
{
"epoch": 0.2868051696631832,
"eval_loss": 1.222544550895691,
"eval_mean_token_accuracy": 0.6694585859775544,
"eval_num_tokens": 50593342.0,
"eval_runtime": 56.2378,
"eval_samples_per_second": 7.113,
"eval_steps_per_second": 0.889,
"step": 4000
},
{
"epoch": 0.290390234283973,
"grad_norm": 4.258941173553467,
"learning_rate": 7.88559592096877e-05,
"loss": 4.8759,
"mean_token_accuracy": 0.6715140387415885,
"num_tokens": 51220159.0,
"step": 4050
},
{
"epoch": 0.290390234283973,
"eval_loss": 1.2212531566619873,
"eval_mean_token_accuracy": 0.6692110347747803,
"eval_num_tokens": 51220159.0,
"eval_runtime": 56.3598,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 0.887,
"step": 4050
},
{
"epoch": 0.29397529890476276,
"grad_norm": 4.41649055480957,
"learning_rate": 7.845761631612492e-05,
"loss": 4.8512,
"mean_token_accuracy": 0.6730743369460106,
"num_tokens": 51852052.0,
"step": 4100
},
{
"epoch": 0.29397529890476276,
"eval_loss": 1.2219711542129517,
"eval_mean_token_accuracy": 0.668809084892273,
"eval_num_tokens": 51852052.0,
"eval_runtime": 56.3222,
"eval_samples_per_second": 7.102,
"eval_steps_per_second": 0.888,
"step": 4100
},
{
"epoch": 0.29756036352555254,
"grad_norm": 5.041947841644287,
"learning_rate": 7.805927342256214e-05,
"loss": 4.8012,
"mean_token_accuracy": 0.6752137768268586,
"num_tokens": 52488117.0,
"step": 4150
},
{
"epoch": 0.29756036352555254,
"eval_loss": 1.2200063467025757,
"eval_mean_token_accuracy": 0.6689476525783539,
"eval_num_tokens": 52488117.0,
"eval_runtime": 56.3663,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 0.887,
"step": 4150
},
{
"epoch": 0.3011454281463423,
"grad_norm": 4.421684741973877,
"learning_rate": 7.766093052899937e-05,
"loss": 4.9011,
"mean_token_accuracy": 0.6694827458262443,
"num_tokens": 53124002.0,
"step": 4200
},
{
"epoch": 0.3011454281463423,
"eval_loss": 1.2199760675430298,
"eval_mean_token_accuracy": 0.6698788702487946,
"eval_num_tokens": 53124002.0,
"eval_runtime": 57.1381,
"eval_samples_per_second": 7.001,
"eval_steps_per_second": 0.875,
"step": 4200
},
{
"epoch": 0.30473049276713216,
"grad_norm": 4.482224941253662,
"learning_rate": 7.726258763543659e-05,
"loss": 4.8888,
"mean_token_accuracy": 0.6703590288758278,
"num_tokens": 53754491.0,
"step": 4250
},
{
"epoch": 0.30473049276713216,
"eval_loss": 1.2201299667358398,
"eval_mean_token_accuracy": 0.6685253477096558,
"eval_num_tokens": 53754491.0,
"eval_runtime": 56.1814,
"eval_samples_per_second": 7.12,
"eval_steps_per_second": 0.89,
"step": 4250
},
{
"epoch": 0.30831555738792193,
"grad_norm": 5.163293838500977,
"learning_rate": 7.686424474187381e-05,
"loss": 4.8934,
"mean_token_accuracy": 0.6694687473773956,
"num_tokens": 54384292.0,
"step": 4300
},
{
"epoch": 0.30831555738792193,
"eval_loss": 1.2192912101745605,
"eval_mean_token_accuracy": 0.670127317905426,
"eval_num_tokens": 54384292.0,
"eval_runtime": 56.2288,
"eval_samples_per_second": 7.114,
"eval_steps_per_second": 0.889,
"step": 4300
},
{
"epoch": 0.3119006220087117,
"grad_norm": 4.469936847686768,
"learning_rate": 7.646590184831103e-05,
"loss": 4.8808,
"mean_token_accuracy": 0.671641985476017,
"num_tokens": 55015409.0,
"step": 4350
},
{
"epoch": 0.3119006220087117,
"eval_loss": 1.2187691926956177,
"eval_mean_token_accuracy": 0.6700682175159455,
"eval_num_tokens": 55015409.0,
"eval_runtime": 56.275,
"eval_samples_per_second": 7.108,
"eval_steps_per_second": 0.888,
"step": 4350
},
{
"epoch": 0.3154856866295015,
"grad_norm": 4.397490501403809,
"learning_rate": 7.606755895474825e-05,
"loss": 4.8593,
"mean_token_accuracy": 0.6729298800230026,
"num_tokens": 55645076.0,
"step": 4400
},
{
"epoch": 0.3154856866295015,
"eval_loss": 1.21873140335083,
"eval_mean_token_accuracy": 0.6701312291622162,
"eval_num_tokens": 55645076.0,
"eval_runtime": 56.3255,
"eval_samples_per_second": 7.102,
"eval_steps_per_second": 0.888,
"step": 4400
},
{
"epoch": 0.3190707512502913,
"grad_norm": 4.565478801727295,
"learning_rate": 7.566921606118547e-05,
"loss": 4.9259,
"mean_token_accuracy": 0.6678701865673066,
"num_tokens": 56277591.0,
"step": 4450
},
{
"epoch": 0.3190707512502913,
"eval_loss": 1.2173478603363037,
"eval_mean_token_accuracy": 0.6713691699504852,
"eval_num_tokens": 56277591.0,
"eval_runtime": 56.1846,
"eval_samples_per_second": 7.119,
"eval_steps_per_second": 0.89,
"step": 4450
},
{
"epoch": 0.32265581587108105,
"grad_norm": 4.387983798980713,
"learning_rate": 7.52708731676227e-05,
"loss": 4.8506,
"mean_token_accuracy": 0.6727069270610809,
"num_tokens": 56909553.0,
"step": 4500
},
{
"epoch": 0.32265581587108105,
"eval_loss": 1.2162431478500366,
"eval_mean_token_accuracy": 0.6704595732688904,
"eval_num_tokens": 56909553.0,
"eval_runtime": 56.2594,
"eval_samples_per_second": 7.11,
"eval_steps_per_second": 0.889,
"step": 4500
},
{
"epoch": 0.3262408804918709,
"grad_norm": 4.406232833862305,
"learning_rate": 7.487253027405992e-05,
"loss": 4.8975,
"mean_token_accuracy": 0.6693887722492218,
"num_tokens": 57541706.0,
"step": 4550
},
{
"epoch": 0.3262408804918709,
"eval_loss": 1.216928243637085,
"eval_mean_token_accuracy": 0.6708524739742279,
"eval_num_tokens": 57541706.0,
"eval_runtime": 56.2862,
"eval_samples_per_second": 7.107,
"eval_steps_per_second": 0.888,
"step": 4550
},
{
"epoch": 0.32982594511266067,
"grad_norm": 4.329367637634277,
"learning_rate": 7.447418738049714e-05,
"loss": 4.8734,
"mean_token_accuracy": 0.6720337501168251,
"num_tokens": 58175251.0,
"step": 4600
},
{
"epoch": 0.32982594511266067,
"eval_loss": 1.2153425216674805,
"eval_mean_token_accuracy": 0.6709867632389068,
"eval_num_tokens": 58175251.0,
"eval_runtime": 56.449,
"eval_samples_per_second": 7.086,
"eval_steps_per_second": 0.886,
"step": 4600
},
{
"epoch": 0.33341100973345045,
"grad_norm": 4.24669075012207,
"learning_rate": 7.407584448693436e-05,
"loss": 4.8742,
"mean_token_accuracy": 0.6718363285064697,
"num_tokens": 58807276.0,
"step": 4650
},
{
"epoch": 0.33341100973345045,
"eval_loss": 1.2146964073181152,
"eval_mean_token_accuracy": 0.6710757482051849,
"eval_num_tokens": 58807276.0,
"eval_runtime": 56.3596,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 0.887,
"step": 4650
},
{
"epoch": 0.3369960743542402,
"grad_norm": 4.037027835845947,
"learning_rate": 7.367750159337158e-05,
"loss": 4.8869,
"mean_token_accuracy": 0.6710193574428558,
"num_tokens": 59434602.0,
"step": 4700
},
{
"epoch": 0.3369960743542402,
"eval_loss": 1.2160181999206543,
"eval_mean_token_accuracy": 0.6700944793224335,
"eval_num_tokens": 59434602.0,
"eval_runtime": 56.8171,
"eval_samples_per_second": 7.04,
"eval_steps_per_second": 0.88,
"step": 4700
},
{
"epoch": 0.34058113897503,
"grad_norm": 4.7925262451171875,
"learning_rate": 7.32791586998088e-05,
"loss": 4.8639,
"mean_token_accuracy": 0.6717003020644188,
"num_tokens": 60067934.0,
"step": 4750
},
{
"epoch": 0.34058113897503,
"eval_loss": 1.2146656513214111,
"eval_mean_token_accuracy": 0.6714940690994262,
"eval_num_tokens": 60067934.0,
"eval_runtime": 56.6455,
"eval_samples_per_second": 7.061,
"eval_steps_per_second": 0.883,
"step": 4750
},
{
"epoch": 0.34416620359581984,
"grad_norm": 4.179026126861572,
"learning_rate": 7.288081580624602e-05,
"loss": 4.7815,
"mean_token_accuracy": 0.6770669308304786,
"num_tokens": 60700674.0,
"step": 4800
},
{
"epoch": 0.34416620359581984,
"eval_loss": 1.2132787704467773,
"eval_mean_token_accuracy": 0.6709755408763886,
"eval_num_tokens": 60700674.0,
"eval_runtime": 56.6884,
"eval_samples_per_second": 7.056,
"eval_steps_per_second": 0.882,
"step": 4800
},
{
"epoch": 0.3477512682166096,
"grad_norm": 4.608165740966797,
"learning_rate": 7.248247291268324e-05,
"loss": 4.8593,
"mean_token_accuracy": 0.6736995288729668,
"num_tokens": 61331555.0,
"step": 4850
},
{
"epoch": 0.3477512682166096,
"eval_loss": 1.2121059894561768,
"eval_mean_token_accuracy": 0.672556334733963,
"eval_num_tokens": 61331555.0,
"eval_runtime": 56.1182,
"eval_samples_per_second": 7.128,
"eval_steps_per_second": 0.891,
"step": 4850
},
{
"epoch": 0.3513363328373994,
"grad_norm": 4.966649055480957,
"learning_rate": 7.208413001912047e-05,
"loss": 4.8738,
"mean_token_accuracy": 0.671499859392643,
"num_tokens": 61965343.0,
"step": 4900
},
{
"epoch": 0.3513363328373994,
"eval_loss": 1.2120461463928223,
"eval_mean_token_accuracy": 0.6718708264827729,
"eval_num_tokens": 61965343.0,
"eval_runtime": 56.145,
"eval_samples_per_second": 7.124,
"eval_steps_per_second": 0.891,
"step": 4900
},
{
"epoch": 0.3549213974581892,
"grad_norm": 5.021463871002197,
"learning_rate": 7.168578712555767e-05,
"loss": 4.8567,
"mean_token_accuracy": 0.6712884229421615,
"num_tokens": 62596263.0,
"step": 4950
},
{
"epoch": 0.3549213974581892,
"eval_loss": 1.2121599912643433,
"eval_mean_token_accuracy": 0.671816600561142,
"eval_num_tokens": 62596263.0,
"eval_runtime": 56.6214,
"eval_samples_per_second": 7.064,
"eval_steps_per_second": 0.883,
"step": 4950
},
{
"epoch": 0.35850646207897896,
"grad_norm": 4.346203804016113,
"learning_rate": 7.128744423199491e-05,
"loss": 4.8762,
"mean_token_accuracy": 0.6713952556252479,
"num_tokens": 63232395.0,
"step": 5000
},
{
"epoch": 0.35850646207897896,
"eval_loss": 1.2105367183685303,
"eval_mean_token_accuracy": 0.6715540933609009,
"eval_num_tokens": 63232395.0,
"eval_runtime": 56.1985,
"eval_samples_per_second": 7.118,
"eval_steps_per_second": 0.89,
"step": 5000
},
{
"epoch": 0.36209152669976874,
"grad_norm": 4.725315570831299,
"learning_rate": 7.088910133843213e-05,
"loss": 4.8547,
"mean_token_accuracy": 0.6712683519721031,
"num_tokens": 63865141.0,
"step": 5050
},
{
"epoch": 0.36209152669976874,
"eval_loss": 1.2104063034057617,
"eval_mean_token_accuracy": 0.6721407020092011,
"eval_num_tokens": 63865141.0,
"eval_runtime": 56.4947,
"eval_samples_per_second": 7.08,
"eval_steps_per_second": 0.885,
"step": 5050
},
{
"epoch": 0.3656765913205586,
"grad_norm": 4.475533962249756,
"learning_rate": 7.049075844486934e-05,
"loss": 4.8331,
"mean_token_accuracy": 0.6741529366374016,
"num_tokens": 64498005.0,
"step": 5100
},
{
"epoch": 0.3656765913205586,
"eval_loss": 1.211449146270752,
"eval_mean_token_accuracy": 0.6718828630447388,
"eval_num_tokens": 64498005.0,
"eval_runtime": 56.1253,
"eval_samples_per_second": 7.127,
"eval_steps_per_second": 0.891,
"step": 5100
},
{
"epoch": 0.36926165594134835,
"grad_norm": 4.43773078918457,
"learning_rate": 7.009241555130657e-05,
"loss": 4.8576,
"mean_token_accuracy": 0.6735836458206177,
"num_tokens": 65132925.0,
"step": 5150
},
{
"epoch": 0.36926165594134835,
"eval_loss": 1.2107973098754883,
"eval_mean_token_accuracy": 0.67238405585289,
"eval_num_tokens": 65132925.0,
"eval_runtime": 56.3575,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 0.887,
"step": 5150
},
{
"epoch": 0.37284672056213813,
"grad_norm": 4.340308666229248,
"learning_rate": 6.96940726577438e-05,
"loss": 4.8199,
"mean_token_accuracy": 0.6730007353425026,
"num_tokens": 65763710.0,
"step": 5200
},
{
"epoch": 0.37284672056213813,
"eval_loss": 1.2099605798721313,
"eval_mean_token_accuracy": 0.6728368639945984,
"eval_num_tokens": 65763710.0,
"eval_runtime": 56.2959,
"eval_samples_per_second": 7.105,
"eval_steps_per_second": 0.888,
"step": 5200
},
{
"epoch": 0.3764317851829279,
"grad_norm": 4.555109024047852,
"learning_rate": 6.9295729764181e-05,
"loss": 4.8905,
"mean_token_accuracy": 0.6707694306969643,
"num_tokens": 66395012.0,
"step": 5250
},
{
"epoch": 0.3764317851829279,
"eval_loss": 1.2099387645721436,
"eval_mean_token_accuracy": 0.6722373139858245,
"eval_num_tokens": 66395012.0,
"eval_runtime": 56.4644,
"eval_samples_per_second": 7.084,
"eval_steps_per_second": 0.886,
"step": 5250
},
{
"epoch": 0.3800168498037177,
"grad_norm": 4.202060699462891,
"learning_rate": 6.889738687061822e-05,
"loss": 4.8292,
"mean_token_accuracy": 0.673227034509182,
"num_tokens": 67031872.0,
"step": 5300
},
{
"epoch": 0.3800168498037177,
"eval_loss": 1.210257887840271,
"eval_mean_token_accuracy": 0.6714678919315338,
"eval_num_tokens": 67031872.0,
"eval_runtime": 56.3175,
"eval_samples_per_second": 7.103,
"eval_steps_per_second": 0.888,
"step": 5300
},
{
"epoch": 0.38360191442450753,
"grad_norm": 4.315623760223389,
"learning_rate": 6.849904397705546e-05,
"loss": 4.8465,
"mean_token_accuracy": 0.672520759999752,
"num_tokens": 67663971.0,
"step": 5350
},
{
"epoch": 0.38360191442450753,
"eval_loss": 1.20899498462677,
"eval_mean_token_accuracy": 0.6721968007087707,
"eval_num_tokens": 67663971.0,
"eval_runtime": 56.4163,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 0.886,
"step": 5350
},
{
"epoch": 0.3871869790452973,
"grad_norm": 4.103718280792236,
"learning_rate": 6.810070108349267e-05,
"loss": 4.8568,
"mean_token_accuracy": 0.6714790239930153,
"num_tokens": 68298646.0,
"step": 5400
},
{
"epoch": 0.3871869790452973,
"eval_loss": 1.2082393169403076,
"eval_mean_token_accuracy": 0.6712930297851563,
"eval_num_tokens": 68298646.0,
"eval_runtime": 56.3666,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 0.887,
"step": 5400
},
{
"epoch": 0.3907720436660871,
"grad_norm": 4.669826030731201,
"learning_rate": 6.770235818992989e-05,
"loss": 4.8767,
"mean_token_accuracy": 0.6715716090798378,
"num_tokens": 68934666.0,
"step": 5450
},
{
"epoch": 0.3907720436660871,
"eval_loss": 1.207878589630127,
"eval_mean_token_accuracy": 0.672472620010376,
"eval_num_tokens": 68934666.0,
"eval_runtime": 56.6965,
"eval_samples_per_second": 7.055,
"eval_steps_per_second": 0.882,
"step": 5450
},
{
"epoch": 0.39435710828687687,
"grad_norm": 4.467480659484863,
"learning_rate": 6.730401529636712e-05,
"loss": 4.8558,
"mean_token_accuracy": 0.6721174070239067,
"num_tokens": 69569216.0,
"step": 5500
},
{
"epoch": 0.39435710828687687,
"eval_loss": 1.207086205482483,
"eval_mean_token_accuracy": 0.6721065282821655,
"eval_num_tokens": 69569216.0,
"eval_runtime": 56.3201,
"eval_samples_per_second": 7.102,
"eval_steps_per_second": 0.888,
"step": 5500
},
{
"epoch": 0.39794217290766665,
"grad_norm": 4.608986854553223,
"learning_rate": 6.690567240280435e-05,
"loss": 4.8658,
"mean_token_accuracy": 0.6705604410171508,
"num_tokens": 70207379.0,
"step": 5550
},
{
"epoch": 0.39794217290766665,
"eval_loss": 1.2067745923995972,
"eval_mean_token_accuracy": 0.6716193425655365,
"eval_num_tokens": 70207379.0,
"eval_runtime": 57.4805,
"eval_samples_per_second": 6.959,
"eval_steps_per_second": 0.87,
"step": 5550
},
{
"epoch": 0.4015272375284564,
"grad_norm": 4.4026780128479,
"learning_rate": 6.650732950924155e-05,
"loss": 4.7928,
"mean_token_accuracy": 0.6765011212229729,
"num_tokens": 70836399.0,
"step": 5600
},
{
"epoch": 0.4015272375284564,
"eval_loss": 1.2075951099395752,
"eval_mean_token_accuracy": 0.6730300402641296,
"eval_num_tokens": 70836399.0,
"eval_runtime": 58.1991,
"eval_samples_per_second": 6.873,
"eval_steps_per_second": 0.859,
"step": 5600
},
{
"epoch": 0.40511230214924626,
"grad_norm": 4.3206048011779785,
"learning_rate": 6.610898661567877e-05,
"loss": 4.7828,
"mean_token_accuracy": 0.6755007293820381,
"num_tokens": 71465978.0,
"step": 5650
},
{
"epoch": 0.40511230214924626,
"eval_loss": 1.2060260772705078,
"eval_mean_token_accuracy": 0.6736092364788056,
"eval_num_tokens": 71465978.0,
"eval_runtime": 55.9656,
"eval_samples_per_second": 7.147,
"eval_steps_per_second": 0.893,
"step": 5650
},
{
"epoch": 0.40869736677003604,
"grad_norm": 4.6384196281433105,
"learning_rate": 6.571064372211601e-05,
"loss": 4.8045,
"mean_token_accuracy": 0.6742757317423821,
"num_tokens": 72094960.0,
"step": 5700
},
{
"epoch": 0.40869736677003604,
"eval_loss": 1.2062655687332153,
"eval_mean_token_accuracy": 0.6727207219600677,
"eval_num_tokens": 72094960.0,
"eval_runtime": 56.5884,
"eval_samples_per_second": 7.069,
"eval_steps_per_second": 0.884,
"step": 5700
},
{
"epoch": 0.4122824313908258,
"grad_norm": 4.51801872253418,
"learning_rate": 6.531230082855322e-05,
"loss": 4.8502,
"mean_token_accuracy": 0.6714598840475082,
"num_tokens": 72728939.0,
"step": 5750
},
{
"epoch": 0.4122824313908258,
"eval_loss": 1.2066096067428589,
"eval_mean_token_accuracy": 0.6731921648979187,
"eval_num_tokens": 72728939.0,
"eval_runtime": 56.4183,
"eval_samples_per_second": 7.09,
"eval_steps_per_second": 0.886,
"step": 5750
},
{
"epoch": 0.4158674960116156,
"grad_norm": 4.803595066070557,
"learning_rate": 6.491395793499044e-05,
"loss": 4.863,
"mean_token_accuracy": 0.6716452211141586,
"num_tokens": 73363737.0,
"step": 5800
},
{
"epoch": 0.4158674960116156,
"eval_loss": 1.2050178050994873,
"eval_mean_token_accuracy": 0.6734542024135589,
"eval_num_tokens": 73363737.0,
"eval_runtime": 56.4329,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 0.886,
"step": 5800
},
{
"epoch": 0.4194525606324054,
"grad_norm": 4.864405155181885,
"learning_rate": 6.451561504142767e-05,
"loss": 4.82,
"mean_token_accuracy": 0.6745044487714768,
"num_tokens": 73997522.0,
"step": 5850
},
{
"epoch": 0.4194525606324054,
"eval_loss": 1.2039889097213745,
"eval_mean_token_accuracy": 0.6733579516410828,
"eval_num_tokens": 73997522.0,
"eval_runtime": 56.3966,
"eval_samples_per_second": 7.093,
"eval_steps_per_second": 0.887,
"step": 5850
},
{
"epoch": 0.4230376252531952,
"grad_norm": 4.559540271759033,
"learning_rate": 6.411727214786488e-05,
"loss": 4.7735,
"mean_token_accuracy": 0.6764472410082817,
"num_tokens": 74632565.0,
"step": 5900
},
{
"epoch": 0.4230376252531952,
"eval_loss": 1.2041822671890259,
"eval_mean_token_accuracy": 0.6725377225875855,
"eval_num_tokens": 74632565.0,
"eval_runtime": 56.2069,
"eval_samples_per_second": 7.117,
"eval_steps_per_second": 0.89,
"step": 5900
},
{
"epoch": 0.426622689873985,
"grad_norm": 4.625767230987549,
"learning_rate": 6.37189292543021e-05,
"loss": 4.8474,
"mean_token_accuracy": 0.6734576171636582,
"num_tokens": 75264457.0,
"step": 5950
},
{
"epoch": 0.426622689873985,
"eval_loss": 1.2035109996795654,
"eval_mean_token_accuracy": 0.6731998026371002,
"eval_num_tokens": 75264457.0,
"eval_runtime": 56.7209,
"eval_samples_per_second": 7.052,
"eval_steps_per_second": 0.882,
"step": 5950
},
{
"epoch": 0.4302077544947748,
"grad_norm": 4.185346603393555,
"learning_rate": 6.332058636073932e-05,
"loss": 4.8327,
"mean_token_accuracy": 0.6721743106842041,
"num_tokens": 75893311.0,
"step": 6000
},
{
"epoch": 0.4302077544947748,
"eval_loss": 1.203436255455017,
"eval_mean_token_accuracy": 0.6730928170681,
"eval_num_tokens": 75893311.0,
"eval_runtime": 56.3504,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 0.887,
"step": 6000
},
{
"epoch": 0.43379281911556455,
"grad_norm": 4.341583251953125,
"learning_rate": 6.292224346717655e-05,
"loss": 4.8072,
"mean_token_accuracy": 0.6749084493517876,
"num_tokens": 76529617.0,
"step": 6050
},
{
"epoch": 0.43379281911556455,
"eval_loss": 1.2034169435501099,
"eval_mean_token_accuracy": 0.6731595695018768,
"eval_num_tokens": 76529617.0,
"eval_runtime": 56.4332,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 0.886,
"step": 6050
},
{
"epoch": 0.43737788373635433,
"grad_norm": 4.502080917358398,
"learning_rate": 6.252390057361377e-05,
"loss": 4.7284,
"mean_token_accuracy": 0.6789399805665016,
"num_tokens": 77158901.0,
"step": 6100
},
{
"epoch": 0.43737788373635433,
"eval_loss": 1.2037384510040283,
"eval_mean_token_accuracy": 0.6724783575534821,
"eval_num_tokens": 77158901.0,
"eval_runtime": 56.1762,
"eval_samples_per_second": 7.12,
"eval_steps_per_second": 0.89,
"step": 6100
},
{
"epoch": 0.4409629483571441,
"grad_norm": 4.407749652862549,
"learning_rate": 6.212555768005099e-05,
"loss": 4.8102,
"mean_token_accuracy": 0.6749192690849304,
"num_tokens": 77792929.0,
"step": 6150
},
{
"epoch": 0.4409629483571441,
"eval_loss": 1.2034553289413452,
"eval_mean_token_accuracy": 0.6724519121646881,
"eval_num_tokens": 77792929.0,
"eval_runtime": 56.3346,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 0.888,
"step": 6150
},
{
"epoch": 0.44454801297793395,
"grad_norm": 4.5488362312316895,
"learning_rate": 6.172721478648821e-05,
"loss": 4.8424,
"mean_token_accuracy": 0.6731199064850807,
"num_tokens": 78426748.0,
"step": 6200
},
{
"epoch": 0.44454801297793395,
"eval_loss": 1.2025480270385742,
"eval_mean_token_accuracy": 0.673497976064682,
"eval_num_tokens": 78426748.0,
"eval_runtime": 56.1462,
"eval_samples_per_second": 7.124,
"eval_steps_per_second": 0.891,
"step": 6200
},
{
"epoch": 0.44813307759872373,
"grad_norm": 4.52962589263916,
"learning_rate": 6.132887189292543e-05,
"loss": 4.7814,
"mean_token_accuracy": 0.6756982815265655,
"num_tokens": 79054859.0,
"step": 6250
},
{
"epoch": 0.44813307759872373,
"eval_loss": 1.2015492916107178,
"eval_mean_token_accuracy": 0.6740836083889008,
"eval_num_tokens": 79054859.0,
"eval_runtime": 56.3541,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 0.887,
"step": 6250
},
{
"epoch": 0.4517181422195135,
"grad_norm": 4.603536128997803,
"learning_rate": 6.093052899936266e-05,
"loss": 4.9195,
"mean_token_accuracy": 0.6704733854532242,
"num_tokens": 79688857.0,
"step": 6300
},
{
"epoch": 0.4517181422195135,
"eval_loss": 1.2020344734191895,
"eval_mean_token_accuracy": 0.6741014468669891,
"eval_num_tokens": 79688857.0,
"eval_runtime": 56.114,
"eval_samples_per_second": 7.128,
"eval_steps_per_second": 0.891,
"step": 6300
},
{
"epoch": 0.4553032068403033,
"grad_norm": 5.02667236328125,
"learning_rate": 6.053218610579987e-05,
"loss": 4.7774,
"mean_token_accuracy": 0.6772465297579765,
"num_tokens": 80324585.0,
"step": 6350
},
{
"epoch": 0.4553032068403033,
"eval_loss": 1.201953411102295,
"eval_mean_token_accuracy": 0.6730434691905975,
"eval_num_tokens": 80324585.0,
"eval_runtime": 56.1284,
"eval_samples_per_second": 7.127,
"eval_steps_per_second": 0.891,
"step": 6350
},
{
"epoch": 0.45888827146109307,
"grad_norm": 4.330198764801025,
"learning_rate": 6.0133843212237096e-05,
"loss": 4.8002,
"mean_token_accuracy": 0.6744606778025627,
"num_tokens": 80956366.0,
"step": 6400
},
{
"epoch": 0.45888827146109307,
"eval_loss": 1.2010780572891235,
"eval_mean_token_accuracy": 0.6726482355594635,
"eval_num_tokens": 80956366.0,
"eval_runtime": 56.1261,
"eval_samples_per_second": 7.127,
"eval_steps_per_second": 0.891,
"step": 6400
},
{
"epoch": 0.4624733360818829,
"grad_norm": 4.510508060455322,
"learning_rate": 5.973550031867432e-05,
"loss": 4.8791,
"mean_token_accuracy": 0.6714214497804641,
"num_tokens": 81587826.0,
"step": 6450
},
{
"epoch": 0.4624733360818829,
"eval_loss": 1.2023468017578125,
"eval_mean_token_accuracy": 0.672912814617157,
"eval_num_tokens": 81587826.0,
"eval_runtime": 56.3171,
"eval_samples_per_second": 7.103,
"eval_steps_per_second": 0.888,
"step": 6450
},
{
"epoch": 0.4660584007026727,
"grad_norm": 4.60286283493042,
"learning_rate": 5.933715742511153e-05,
"loss": 4.7999,
"mean_token_accuracy": 0.6754237455129624,
"num_tokens": 82222966.0,
"step": 6500
},
{
"epoch": 0.4660584007026727,
"eval_loss": 1.2008494138717651,
"eval_mean_token_accuracy": 0.6732868099212647,
"eval_num_tokens": 82222966.0,
"eval_runtime": 56.2084,
"eval_samples_per_second": 7.116,
"eval_steps_per_second": 0.89,
"step": 6500
},
{
"epoch": 0.46964346532346246,
"grad_norm": 4.842785835266113,
"learning_rate": 5.893881453154876e-05,
"loss": 4.8096,
"mean_token_accuracy": 0.6758663612604141,
"num_tokens": 82855504.0,
"step": 6550
},
{
"epoch": 0.46964346532346246,
"eval_loss": 1.200462818145752,
"eval_mean_token_accuracy": 0.6739160513877869,
"eval_num_tokens": 82855504.0,
"eval_runtime": 56.2921,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 0.888,
"step": 6550
},
{
"epoch": 0.47322852994425224,
"grad_norm": 4.244312763214111,
"learning_rate": 5.854047163798598e-05,
"loss": 4.773,
"mean_token_accuracy": 0.6780867150425911,
"num_tokens": 83488202.0,
"step": 6600
},
{
"epoch": 0.47322852994425224,
"eval_loss": 1.200437068939209,
"eval_mean_token_accuracy": 0.6741013741493225,
"eval_num_tokens": 83488202.0,
"eval_runtime": 56.4321,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 0.886,
"step": 6600
},
{
"epoch": 0.476813594565042,
"grad_norm": 4.384121894836426,
"learning_rate": 5.814212874442321e-05,
"loss": 4.8256,
"mean_token_accuracy": 0.6735525381565094,
"num_tokens": 84123092.0,
"step": 6650
},
{
"epoch": 0.476813594565042,
"eval_loss": 1.2007168531417847,
"eval_mean_token_accuracy": 0.6743572854995727,
"eval_num_tokens": 84123092.0,
"eval_runtime": 56.5132,
"eval_samples_per_second": 7.078,
"eval_steps_per_second": 0.885,
"step": 6650
},
{
"epoch": 0.4803986591858318,
"grad_norm": 5.510925769805908,
"learning_rate": 5.774378585086042e-05,
"loss": 4.7806,
"mean_token_accuracy": 0.6750581926107406,
"num_tokens": 84756126.0,
"step": 6700
},
{
"epoch": 0.4803986591858318,
"eval_loss": 1.2009855508804321,
"eval_mean_token_accuracy": 0.6739739573001862,
"eval_num_tokens": 84756126.0,
"eval_runtime": 56.6366,
"eval_samples_per_second": 7.063,
"eval_steps_per_second": 0.883,
"step": 6700
},
{
"epoch": 0.48398372380662164,
"grad_norm": 4.581708908081055,
"learning_rate": 5.7345442957297646e-05,
"loss": 4.722,
"mean_token_accuracy": 0.679350274503231,
"num_tokens": 85386870.0,
"step": 6750
},
{
"epoch": 0.48398372380662164,
"eval_loss": 1.2008088827133179,
"eval_mean_token_accuracy": 0.6728281593322754,
"eval_num_tokens": 85386870.0,
"eval_runtime": 56.4337,
"eval_samples_per_second": 7.088,
"eval_steps_per_second": 0.886,
"step": 6750
},
{
"epoch": 0.4875687884274114,
"grad_norm": 5.783533573150635,
"learning_rate": 5.694710006373487e-05,
"loss": 4.7616,
"mean_token_accuracy": 0.6766093501448631,
"num_tokens": 86015238.0,
"step": 6800
},
{
"epoch": 0.4875687884274114,
"eval_loss": 1.2004883289337158,
"eval_mean_token_accuracy": 0.6737746036052704,
"eval_num_tokens": 86015238.0,
"eval_runtime": 56.202,
"eval_samples_per_second": 7.117,
"eval_steps_per_second": 0.89,
"step": 6800
},
{
"epoch": 0.4911538530482012,
"grad_norm": 4.4624714851379395,
"learning_rate": 5.654875717017208e-05,
"loss": 4.8036,
"mean_token_accuracy": 0.6753204807639122,
"num_tokens": 86646920.0,
"step": 6850
},
{
"epoch": 0.4911538530482012,
"eval_loss": 1.1992673873901367,
"eval_mean_token_accuracy": 0.6739728832244873,
"eval_num_tokens": 86646920.0,
"eval_runtime": 56.3145,
"eval_samples_per_second": 7.103,
"eval_steps_per_second": 0.888,
"step": 6850
},
{
"epoch": 0.494738917668991,
"grad_norm": 4.528706073760986,
"learning_rate": 5.615041427660931e-05,
"loss": 4.8219,
"mean_token_accuracy": 0.673625990152359,
"num_tokens": 87279245.0,
"step": 6900
},
{
"epoch": 0.494738917668991,
"eval_loss": 1.1988134384155273,
"eval_mean_token_accuracy": 0.673334904909134,
"eval_num_tokens": 87279245.0,
"eval_runtime": 56.4107,
"eval_samples_per_second": 7.091,
"eval_steps_per_second": 0.886,
"step": 6900
},
{
"epoch": 0.49832398228978075,
"grad_norm": 4.4395527839660645,
"learning_rate": 5.575207138304653e-05,
"loss": 4.8404,
"mean_token_accuracy": 0.6731960904598236,
"num_tokens": 87908715.0,
"step": 6950
},
{
"epoch": 0.49832398228978075,
"eval_loss": 1.198786973953247,
"eval_mean_token_accuracy": 0.6738696718215942,
"eval_num_tokens": 87908715.0,
"eval_runtime": 56.4217,
"eval_samples_per_second": 7.089,
"eval_steps_per_second": 0.886,
"step": 6950
},
{
"epoch": 0.5019090469105706,
"grad_norm": 4.999813079833984,
"learning_rate": 5.535372848948375e-05,
"loss": 4.8014,
"mean_token_accuracy": 0.6743469536304474,
"num_tokens": 88541353.0,
"step": 7000
},
{
"epoch": 0.5019090469105706,
"eval_loss": 1.1984182596206665,
"eval_mean_token_accuracy": 0.6738111090660095,
"eval_num_tokens": 88541353.0,
"eval_runtime": 56.269,
"eval_samples_per_second": 7.109,
"eval_steps_per_second": 0.889,
"step": 7000
},
{
"epoch": 0.5054941115313604,
"grad_norm": 5.244815826416016,
"learning_rate": 5.4955385595920975e-05,
"loss": 4.7345,
"mean_token_accuracy": 0.6794330298900604,
"num_tokens": 89171565.0,
"step": 7050
},
{
"epoch": 0.5054941115313604,
"eval_loss": 1.1977105140686035,
"eval_mean_token_accuracy": 0.6732430410385132,
"eval_num_tokens": 89171565.0,
"eval_runtime": 56.2208,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 0.889,
"step": 7050
},
{
"epoch": 0.5090791761521501,
"grad_norm": 4.170567512512207,
"learning_rate": 5.4557042702358196e-05,
"loss": 4.7721,
"mean_token_accuracy": 0.6758232372999191,
"num_tokens": 89803955.0,
"step": 7100
},
{
"epoch": 0.5090791761521501,
"eval_loss": 1.1980831623077393,
"eval_mean_token_accuracy": 0.6739831912517548,
"eval_num_tokens": 89803955.0,
"eval_runtime": 56.1966,
"eval_samples_per_second": 7.118,
"eval_steps_per_second": 0.89,
"step": 7100
},
{
"epoch": 0.5126642407729399,
"grad_norm": 4.576419830322266,
"learning_rate": 5.415869980879541e-05,
"loss": 4.7588,
"mean_token_accuracy": 0.6762737995386123,
"num_tokens": 90440293.0,
"step": 7150
},
{
"epoch": 0.5126642407729399,
"eval_loss": 1.1975429058074951,
"eval_mean_token_accuracy": 0.673925119638443,
"eval_num_tokens": 90440293.0,
"eval_runtime": 56.1988,
"eval_samples_per_second": 7.118,
"eval_steps_per_second": 0.89,
"step": 7150
},
{
"epoch": 0.5162493053937297,
"grad_norm": 4.430201530456543,
"learning_rate": 5.376035691523263e-05,
"loss": 4.7482,
"mean_token_accuracy": 0.6770784831047059,
"num_tokens": 91071574.0,
"step": 7200
},
{
"epoch": 0.5162493053937297,
"eval_loss": 1.1972256898880005,
"eval_mean_token_accuracy": 0.6727135396003723,
"eval_num_tokens": 91071574.0,
"eval_runtime": 56.2879,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 0.888,
"step": 7200
},
{
"epoch": 0.5198343700145195,
"grad_norm": 4.440696716308594,
"learning_rate": 5.336201402166986e-05,
"loss": 4.8496,
"mean_token_accuracy": 0.6734337306022644,
"num_tokens": 91704308.0,
"step": 7250
},
{
"epoch": 0.5198343700145195,
"eval_loss": 1.196380853652954,
"eval_mean_token_accuracy": 0.674187605381012,
"eval_num_tokens": 91704308.0,
"eval_runtime": 56.1737,
"eval_samples_per_second": 7.121,
"eval_steps_per_second": 0.89,
"step": 7250
},
{
"epoch": 0.5234194346353093,
"grad_norm": 4.427169322967529,
"learning_rate": 5.2963671128107075e-05,
"loss": 4.7884,
"mean_token_accuracy": 0.6760821756720543,
"num_tokens": 92338569.0,
"step": 7300
},
{
"epoch": 0.5234194346353093,
"eval_loss": 1.1962813138961792,
"eval_mean_token_accuracy": 0.6740961968898773,
"eval_num_tokens": 92338569.0,
"eval_runtime": 56.279,
"eval_samples_per_second": 7.107,
"eval_steps_per_second": 0.888,
"step": 7300
},
{
"epoch": 0.527004499256099,
"grad_norm": 4.586068630218506,
"learning_rate": 5.25653282345443e-05,
"loss": 4.8462,
"mean_token_accuracy": 0.6725165358185768,
"num_tokens": 92970201.0,
"step": 7350
},
{
"epoch": 0.527004499256099,
"eval_loss": 1.1966549158096313,
"eval_mean_token_accuracy": 0.6744921112060547,
"eval_num_tokens": 92970201.0,
"eval_runtime": 56.2579,
"eval_samples_per_second": 7.11,
"eval_steps_per_second": 0.889,
"step": 7350
},
{
"epoch": 0.5305895638768888,
"grad_norm": 4.275878429412842,
"learning_rate": 5.2166985340981525e-05,
"loss": 4.7573,
"mean_token_accuracy": 0.6764388364553452,
"num_tokens": 93604624.0,
"step": 7400
},
{
"epoch": 0.5305895638768888,
"eval_loss": 1.1963127851486206,
"eval_mean_token_accuracy": 0.6738464891910553,
"eval_num_tokens": 93604624.0,
"eval_runtime": 56.2885,
"eval_samples_per_second": 7.106,
"eval_steps_per_second": 0.888,
"step": 7400
},
{
"epoch": 0.5341746284976787,
"grad_norm": 4.383382797241211,
"learning_rate": 5.176864244741873e-05,
"loss": 4.8151,
"mean_token_accuracy": 0.6745539313554764,
"num_tokens": 94237768.0,
"step": 7450
},
{
"epoch": 0.5341746284976787,
"eval_loss": 1.196314811706543,
"eval_mean_token_accuracy": 0.6741366982460022,
"eval_num_tokens": 94237768.0,
"eval_runtime": 56.1682,
"eval_samples_per_second": 7.121,
"eval_steps_per_second": 0.89,
"step": 7450
},
{
"epoch": 0.5377596931184685,
"grad_norm": 4.777865409851074,
"learning_rate": 5.137029955385596e-05,
"loss": 4.7179,
"mean_token_accuracy": 0.6791237652301788,
"num_tokens": 94868917.0,
"step": 7500
},
{
"epoch": 0.5377596931184685,
"eval_loss": 1.1951854228973389,
"eval_mean_token_accuracy": 0.6737760400772095,
"eval_num_tokens": 94868917.0,
"eval_runtime": 57.4744,
"eval_samples_per_second": 6.96,
"eval_steps_per_second": 0.87,
"step": 7500
},
{
"epoch": 0.5413447577392583,
"grad_norm": 5.250718116760254,
"learning_rate": 5.097195666029318e-05,
"loss": 4.853,
"mean_token_accuracy": 0.671261510848999,
"num_tokens": 95504086.0,
"step": 7550
},
{
"epoch": 0.5413447577392583,
"eval_loss": 1.1954213380813599,
"eval_mean_token_accuracy": 0.6752165842056275,
"eval_num_tokens": 95504086.0,
"eval_runtime": 56.6697,
"eval_samples_per_second": 7.058,
"eval_steps_per_second": 0.882,
"step": 7550
},
{
"epoch": 0.544929822360048,
"grad_norm": 4.873703479766846,
"learning_rate": 5.05736137667304e-05,
"loss": 4.7717,
"mean_token_accuracy": 0.6773542383313179,
"num_tokens": 96139635.0,
"step": 7600
},
{
"epoch": 0.544929822360048,
"eval_loss": 1.1948680877685547,
"eval_mean_token_accuracy": 0.6749390983581542,
"eval_num_tokens": 96139635.0,
"eval_runtime": 55.7291,
"eval_samples_per_second": 7.178,
"eval_steps_per_second": 0.897,
"step": 7600
},
{
"epoch": 0.5485148869808378,
"grad_norm": 4.877697467803955,
"learning_rate": 5.0175270873167626e-05,
"loss": 4.7886,
"mean_token_accuracy": 0.6762890338897705,
"num_tokens": 96770646.0,
"step": 7650
},
{
"epoch": 0.5485148869808378,
"eval_loss": 1.1947038173675537,
"eval_mean_token_accuracy": 0.6754334461688996,
"eval_num_tokens": 96770646.0,
"eval_runtime": 55.5575,
"eval_samples_per_second": 7.2,
"eval_steps_per_second": 0.9,
"step": 7650
},
{
"epoch": 0.5520999516016276,
"grad_norm": 4.1052117347717285,
"learning_rate": 4.977692797960485e-05,
"loss": 4.8169,
"mean_token_accuracy": 0.6731060117483139,
"num_tokens": 97404492.0,
"step": 7700
},
{
"epoch": 0.5520999516016276,
"eval_loss": 1.1950753927230835,
"eval_mean_token_accuracy": 0.6745660018920898,
"eval_num_tokens": 97404492.0,
"eval_runtime": 55.0653,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.908,
"step": 7700
},
{
"epoch": 0.5556850162224174,
"grad_norm": 4.796311855316162,
"learning_rate": 4.937858508604207e-05,
"loss": 4.7797,
"mean_token_accuracy": 0.6754096934199333,
"num_tokens": 98040499.0,
"step": 7750
},
{
"epoch": 0.5556850162224174,
"eval_loss": 1.1943681240081787,
"eval_mean_token_accuracy": 0.6758341288566589,
"eval_num_tokens": 98040499.0,
"eval_runtime": 55.0042,
"eval_samples_per_second": 7.272,
"eval_steps_per_second": 0.909,
"step": 7750
},
{
"epoch": 0.5592700808432072,
"grad_norm": 4.996248722076416,
"learning_rate": 4.898024219247929e-05,
"loss": 4.7915,
"mean_token_accuracy": 0.6766231226921081,
"num_tokens": 98666526.0,
"step": 7800
},
{
"epoch": 0.5592700808432072,
"eval_loss": 1.1935983896255493,
"eval_mean_token_accuracy": 0.6747439002990723,
"eval_num_tokens": 98666526.0,
"eval_runtime": 55.0204,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.909,
"step": 7800
},
{
"epoch": 0.562855145463997,
"grad_norm": 4.6099534034729,
"learning_rate": 4.858189929891651e-05,
"loss": 4.7901,
"mean_token_accuracy": 0.6749289181828498,
"num_tokens": 99297640.0,
"step": 7850
},
{
"epoch": 0.562855145463997,
"eval_loss": 1.1944231986999512,
"eval_mean_token_accuracy": 0.6743023383617401,
"eval_num_tokens": 99297640.0,
"eval_runtime": 55.1784,
"eval_samples_per_second": 7.249,
"eval_steps_per_second": 0.906,
"step": 7850
},
{
"epoch": 0.5664402100847867,
"grad_norm": 4.517291069030762,
"learning_rate": 4.818355640535373e-05,
"loss": 4.8626,
"mean_token_accuracy": 0.6725256371498108,
"num_tokens": 99928477.0,
"step": 7900
},
{
"epoch": 0.5664402100847867,
"eval_loss": 1.1935796737670898,
"eval_mean_token_accuracy": 0.6745834064483642,
"eval_num_tokens": 99928477.0,
"eval_runtime": 55.0686,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.908,
"step": 7900
},
{
"epoch": 0.5700252747055766,
"grad_norm": 4.5024003982543945,
"learning_rate": 4.778521351179095e-05,
"loss": 4.712,
"mean_token_accuracy": 0.6803113195300102,
"num_tokens": 100557231.0,
"step": 7950
},
{
"epoch": 0.5700252747055766,
"eval_loss": 1.1945058107376099,
"eval_mean_token_accuracy": 0.6740881907939911,
"eval_num_tokens": 100557231.0,
"eval_runtime": 54.9762,
"eval_samples_per_second": 7.276,
"eval_steps_per_second": 0.909,
"step": 7950
},
{
"epoch": 0.5736103393263664,
"grad_norm": 4.741750717163086,
"learning_rate": 4.7386870618228176e-05,
"loss": 4.7828,
"mean_token_accuracy": 0.6768678402900696,
"num_tokens": 101189608.0,
"step": 8000
},
{
"epoch": 0.5736103393263664,
"eval_loss": 1.1939120292663574,
"eval_mean_token_accuracy": 0.6754596734046936,
"eval_num_tokens": 101189608.0,
"eval_runtime": 55.073,
"eval_samples_per_second": 7.263,
"eval_steps_per_second": 0.908,
"step": 8000
},
{
"epoch": 0.5771954039471562,
"grad_norm": 4.49591064453125,
"learning_rate": 4.698852772466539e-05,
"loss": 4.8169,
"mean_token_accuracy": 0.6742269179224968,
"num_tokens": 101822322.0,
"step": 8050
},
{
"epoch": 0.5771954039471562,
"eval_loss": 1.1937360763549805,
"eval_mean_token_accuracy": 0.6749664378166199,
"eval_num_tokens": 101822322.0,
"eval_runtime": 55.0891,
"eval_samples_per_second": 7.261,
"eval_steps_per_second": 0.908,
"step": 8050
},
{
"epoch": 0.580780468567946,
"grad_norm": 5.148952007293701,
"learning_rate": 4.659018483110262e-05,
"loss": 4.7721,
"mean_token_accuracy": 0.6771083778142929,
"num_tokens": 102453524.0,
"step": 8100
},
{
"epoch": 0.580780468567946,
"eval_loss": 1.194778323173523,
"eval_mean_token_accuracy": 0.6745067381858826,
"eval_num_tokens": 102453524.0,
"eval_runtime": 55.1207,
"eval_samples_per_second": 7.257,
"eval_steps_per_second": 0.907,
"step": 8100
},
{
"epoch": 0.5843655331887357,
"grad_norm": 5.115074634552002,
"learning_rate": 4.619184193753984e-05,
"loss": 4.7134,
"mean_token_accuracy": 0.6806976914405822,
"num_tokens": 103083835.0,
"step": 8150
},
{
"epoch": 0.5843655331887357,
"eval_loss": 1.192982792854309,
"eval_mean_token_accuracy": 0.6747034168243409,
"eval_num_tokens": 103083835.0,
"eval_runtime": 55.0852,
"eval_samples_per_second": 7.261,
"eval_steps_per_second": 0.908,
"step": 8150
},
{
"epoch": 0.5879505978095255,
"grad_norm": 4.619081497192383,
"learning_rate": 4.5793499043977055e-05,
"loss": 4.796,
"mean_token_accuracy": 0.6744829830527306,
"num_tokens": 103715932.0,
"step": 8200
},
{
"epoch": 0.5879505978095255,
"eval_loss": 1.192581057548523,
"eval_mean_token_accuracy": 0.6748893690109253,
"eval_num_tokens": 103715932.0,
"eval_runtime": 55.0423,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.908,
"step": 8200
},
{
"epoch": 0.5915356624303153,
"grad_norm": 4.433931350708008,
"learning_rate": 4.539515615041428e-05,
"loss": 4.7435,
"mean_token_accuracy": 0.678233249783516,
"num_tokens": 104345485.0,
"step": 8250
},
{
"epoch": 0.5915356624303153,
"eval_loss": 1.1933448314666748,
"eval_mean_token_accuracy": 0.6751340889930725,
"eval_num_tokens": 104345485.0,
"eval_runtime": 55.0423,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.908,
"step": 8250
},
{
"epoch": 0.5951207270511051,
"grad_norm": 4.362198829650879,
"learning_rate": 4.49968132568515e-05,
"loss": 4.7737,
"mean_token_accuracy": 0.6764271047711372,
"num_tokens": 104978991.0,
"step": 8300
},
{
"epoch": 0.5951207270511051,
"eval_loss": 1.1930105686187744,
"eval_mean_token_accuracy": 0.6747807443141938,
"eval_num_tokens": 104978991.0,
"eval_runtime": 55.3077,
"eval_samples_per_second": 7.232,
"eval_steps_per_second": 0.904,
"step": 8300
},
{
"epoch": 0.5987057916718949,
"grad_norm": 4.534180641174316,
"learning_rate": 4.459847036328872e-05,
"loss": 4.7728,
"mean_token_accuracy": 0.677936093211174,
"num_tokens": 105610427.0,
"step": 8350
},
{
"epoch": 0.5987057916718949,
"eval_loss": 1.1923025846481323,
"eval_mean_token_accuracy": 0.6750785481929779,
"eval_num_tokens": 105610427.0,
"eval_runtime": 55.0617,
"eval_samples_per_second": 7.265,
"eval_steps_per_second": 0.908,
"step": 8350
},
{
"epoch": 0.6022908562926846,
"grad_norm": 5.027590274810791,
"learning_rate": 4.420012746972594e-05,
"loss": 4.7192,
"mean_token_accuracy": 0.6799935781955719,
"num_tokens": 106243018.0,
"step": 8400
},
{
"epoch": 0.6022908562926846,
"eval_loss": 1.1910535097122192,
"eval_mean_token_accuracy": 0.6752017951011657,
"eval_num_tokens": 106243018.0,
"eval_runtime": 55.087,
"eval_samples_per_second": 7.261,
"eval_steps_per_second": 0.908,
"step": 8400
},
{
"epoch": 0.6058759209134744,
"grad_norm": 4.658295154571533,
"learning_rate": 4.380178457616316e-05,
"loss": 4.806,
"mean_token_accuracy": 0.6742839315533637,
"num_tokens": 106877488.0,
"step": 8450
},
{
"epoch": 0.6058759209134744,
"eval_loss": 1.1911369562149048,
"eval_mean_token_accuracy": 0.6754417788982391,
"eval_num_tokens": 106877488.0,
"eval_runtime": 55.1243,
"eval_samples_per_second": 7.256,
"eval_steps_per_second": 0.907,
"step": 8450
},
{
"epoch": 0.6094609855342643,
"grad_norm": 4.897305488586426,
"learning_rate": 4.340344168260038e-05,
"loss": 4.7673,
"mean_token_accuracy": 0.6776413953304291,
"num_tokens": 107510169.0,
"step": 8500
},
{
"epoch": 0.6094609855342643,
"eval_loss": 1.1910532712936401,
"eval_mean_token_accuracy": 0.6744759595394134,
"eval_num_tokens": 107510169.0,
"eval_runtime": 55.0914,
"eval_samples_per_second": 7.261,
"eval_steps_per_second": 0.908,
"step": 8500
},
{
"epoch": 0.6130460501550541,
"grad_norm": 4.881381034851074,
"learning_rate": 4.3005098789037605e-05,
"loss": 4.8061,
"mean_token_accuracy": 0.674516750574112,
"num_tokens": 108145421.0,
"step": 8550
},
{
"epoch": 0.6130460501550541,
"eval_loss": 1.190964937210083,
"eval_mean_token_accuracy": 0.6752122223377228,
"eval_num_tokens": 108145421.0,
"eval_runtime": 54.985,
"eval_samples_per_second": 7.275,
"eval_steps_per_second": 0.909,
"step": 8550
},
{
"epoch": 0.6166311147758439,
"grad_norm": 5.073390483856201,
"learning_rate": 4.2606755895474826e-05,
"loss": 4.7739,
"mean_token_accuracy": 0.6762193894386291,
"num_tokens": 108780841.0,
"step": 8600
},
{
"epoch": 0.6166311147758439,
"eval_loss": 1.1907490491867065,
"eval_mean_token_accuracy": 0.6748946511745453,
"eval_num_tokens": 108780841.0,
"eval_runtime": 54.9525,
"eval_samples_per_second": 7.279,
"eval_steps_per_second": 0.91,
"step": 8600
},
{
"epoch": 0.6202161793966336,
"grad_norm": 4.459120750427246,
"learning_rate": 4.220841300191205e-05,
"loss": 4.7864,
"mean_token_accuracy": 0.6758550813794136,
"num_tokens": 109418807.0,
"step": 8650
},
{
"epoch": 0.6202161793966336,
"eval_loss": 1.1910618543624878,
"eval_mean_token_accuracy": 0.6749819540977477,
"eval_num_tokens": 109418807.0,
"eval_runtime": 54.9049,
"eval_samples_per_second": 7.285,
"eval_steps_per_second": 0.911,
"step": 8650
},
{
"epoch": 0.6238012440174234,
"grad_norm": 4.40315055847168,
"learning_rate": 4.181007010834927e-05,
"loss": 4.7917,
"mean_token_accuracy": 0.6742719665169716,
"num_tokens": 110057516.0,
"step": 8700
},
{
"epoch": 0.6238012440174234,
"eval_loss": 1.1901732683181763,
"eval_mean_token_accuracy": 0.674855477809906,
"eval_num_tokens": 110057516.0,
"eval_runtime": 54.9882,
"eval_samples_per_second": 7.274,
"eval_steps_per_second": 0.909,
"step": 8700
},
{
"epoch": 0.6273863086382132,
"grad_norm": 4.657465934753418,
"learning_rate": 4.141172721478649e-05,
"loss": 4.7503,
"mean_token_accuracy": 0.6786279901862144,
"num_tokens": 110689903.0,
"step": 8750
},
{
"epoch": 0.6273863086382132,
"eval_loss": 1.190616488456726,
"eval_mean_token_accuracy": 0.675126885175705,
"eval_num_tokens": 110689903.0,
"eval_runtime": 55.2291,
"eval_samples_per_second": 7.243,
"eval_steps_per_second": 0.905,
"step": 8750
},
{
"epoch": 0.630971373259003,
"grad_norm": 4.282220840454102,
"learning_rate": 4.101338432122371e-05,
"loss": 4.7029,
"mean_token_accuracy": 0.6801710060238838,
"num_tokens": 111323016.0,
"step": 8800
},
{
"epoch": 0.630971373259003,
"eval_loss": 1.1904511451721191,
"eval_mean_token_accuracy": 0.6755478191375732,
"eval_num_tokens": 111323016.0,
"eval_runtime": 54.9632,
"eval_samples_per_second": 7.278,
"eval_steps_per_second": 0.91,
"step": 8800
},
{
"epoch": 0.6345564378797928,
"grad_norm": 4.598837852478027,
"learning_rate": 4.0615041427660933e-05,
"loss": 4.7755,
"mean_token_accuracy": 0.6768260210752487,
"num_tokens": 111959283.0,
"step": 8850
},
{
"epoch": 0.6345564378797928,
"eval_loss": 1.1904475688934326,
"eval_mean_token_accuracy": 0.6753637742996216,
"eval_num_tokens": 111959283.0,
"eval_runtime": 55.0394,
"eval_samples_per_second": 7.268,
"eval_steps_per_second": 0.908,
"step": 8850
},
{
"epoch": 0.6381415025005825,
"grad_norm": 4.1816558837890625,
"learning_rate": 4.0216698534098155e-05,
"loss": 4.7758,
"mean_token_accuracy": 0.6754831087589264,
"num_tokens": 112596409.0,
"step": 8900
},
{
"epoch": 0.6381415025005825,
"eval_loss": 1.1898977756500244,
"eval_mean_token_accuracy": 0.6755084788799286,
"eval_num_tokens": 112596409.0,
"eval_runtime": 55.0202,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.909,
"step": 8900
},
{
"epoch": 0.6417265671213723,
"grad_norm": 4.973260402679443,
"learning_rate": 3.9818355640535376e-05,
"loss": 4.7116,
"mean_token_accuracy": 0.6818169742822647,
"num_tokens": 113224902.0,
"step": 8950
},
{
"epoch": 0.6417265671213723,
"eval_loss": 1.1896042823791504,
"eval_mean_token_accuracy": 0.6757630515098572,
"eval_num_tokens": 113224902.0,
"eval_runtime": 54.9621,
"eval_samples_per_second": 7.278,
"eval_steps_per_second": 0.91,
"step": 8950
},
{
"epoch": 0.6453116317421621,
"grad_norm": 4.470012664794922,
"learning_rate": 3.94200127469726e-05,
"loss": 4.713,
"mean_token_accuracy": 0.6787376815080642,
"num_tokens": 113858275.0,
"step": 9000
},
{
"epoch": 0.6453116317421621,
"eval_loss": 1.1898657083511353,
"eval_mean_token_accuracy": 0.6760083436965942,
"eval_num_tokens": 113858275.0,
"eval_runtime": 54.9184,
"eval_samples_per_second": 7.284,
"eval_steps_per_second": 0.91,
"step": 9000
},
{
"epoch": 0.648896696362952,
"grad_norm": 4.098659992218018,
"learning_rate": 3.902166985340981e-05,
"loss": 4.7423,
"mean_token_accuracy": 0.6774056190252304,
"num_tokens": 114495903.0,
"step": 9050
},
{
"epoch": 0.648896696362952,
"eval_loss": 1.1897211074829102,
"eval_mean_token_accuracy": 0.6754970908164978,
"eval_num_tokens": 114495903.0,
"eval_runtime": 55.043,
"eval_samples_per_second": 7.267,
"eval_steps_per_second": 0.908,
"step": 9050
},
{
"epoch": 0.6524817609837418,
"grad_norm": 4.9181976318359375,
"learning_rate": 3.862332695984704e-05,
"loss": 4.7403,
"mean_token_accuracy": 0.6782529127597808,
"num_tokens": 115123519.0,
"step": 9100
},
{
"epoch": 0.6524817609837418,
"eval_loss": 1.1898068189620972,
"eval_mean_token_accuracy": 0.6745435571670533,
"eval_num_tokens": 115123519.0,
"eval_runtime": 54.9272,
"eval_samples_per_second": 7.282,
"eval_steps_per_second": 0.91,
"step": 9100
},
{
"epoch": 0.6560668256045316,
"grad_norm": 4.978320121765137,
"learning_rate": 3.8224984066284255e-05,
"loss": 4.8028,
"mean_token_accuracy": 0.675481299161911,
"num_tokens": 115755644.0,
"step": 9150
},
{
"epoch": 0.6560668256045316,
"eval_loss": 1.1891556978225708,
"eval_mean_token_accuracy": 0.6756797277927399,
"eval_num_tokens": 115755644.0,
"eval_runtime": 55.005,
"eval_samples_per_second": 7.272,
"eval_steps_per_second": 0.909,
"step": 9150
},
{
"epoch": 0.6596518902253213,
"grad_norm": 4.682608604431152,
"learning_rate": 3.7826641172721484e-05,
"loss": 4.8228,
"mean_token_accuracy": 0.6748796856403351,
"num_tokens": 116391334.0,
"step": 9200
},
{
"epoch": 0.6596518902253213,
"eval_loss": 1.1887702941894531,
"eval_mean_token_accuracy": 0.6753370201587677,
"eval_num_tokens": 116391334.0,
"eval_runtime": 54.9714,
"eval_samples_per_second": 7.277,
"eval_steps_per_second": 0.91,
"step": 9200
},
{
"epoch": 0.6632369548461111,
"grad_norm": 4.45632791519165,
"learning_rate": 3.7428298279158705e-05,
"loss": 4.7473,
"mean_token_accuracy": 0.6801807761192322,
"num_tokens": 117026978.0,
"step": 9250
},
{
"epoch": 0.6632369548461111,
"eval_loss": 1.1893665790557861,
"eval_mean_token_accuracy": 0.6753548145294189,
"eval_num_tokens": 117026978.0,
"eval_runtime": 55.0208,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.909,
"step": 9250
},
{
"epoch": 0.6668220194669009,
"grad_norm": 4.2916951179504395,
"learning_rate": 3.702995538559592e-05,
"loss": 4.7187,
"mean_token_accuracy": 0.6798599645495415,
"num_tokens": 117660612.0,
"step": 9300
},
{
"epoch": 0.6668220194669009,
"eval_loss": 1.189585566520691,
"eval_mean_token_accuracy": 0.6756225192546844,
"eval_num_tokens": 117660612.0,
"eval_runtime": 55.0296,
"eval_samples_per_second": 7.269,
"eval_steps_per_second": 0.909,
"step": 9300
},
{
"epoch": 0.6704070840876907,
"grad_norm": 4.559842109680176,
"learning_rate": 3.663161249203315e-05,
"loss": 4.7029,
"mean_token_accuracy": 0.6812646022439003,
"num_tokens": 118290778.0,
"step": 9350
},
{
"epoch": 0.6704070840876907,
"eval_loss": 1.1883878707885742,
"eval_mean_token_accuracy": 0.6754980099201202,
"eval_num_tokens": 118290778.0,
"eval_runtime": 55.0845,
"eval_samples_per_second": 7.262,
"eval_steps_per_second": 0.908,
"step": 9350
},
{
"epoch": 0.6739921487084805,
"grad_norm": 4.957666873931885,
"learning_rate": 3.623326959847036e-05,
"loss": 4.716,
"mean_token_accuracy": 0.6789155259728432,
"num_tokens": 118921441.0,
"step": 9400
},
{
"epoch": 0.6739921487084805,
"eval_loss": 1.1882615089416504,
"eval_mean_token_accuracy": 0.6757899785041809,
"eval_num_tokens": 118921441.0,
"eval_runtime": 55.0393,
"eval_samples_per_second": 7.268,
"eval_steps_per_second": 0.908,
"step": 9400
},
{
"epoch": 0.6775772133292702,
"grad_norm": 4.460175037384033,
"learning_rate": 3.5834926704907584e-05,
"loss": 4.7439,
"mean_token_accuracy": 0.6776839691400528,
"num_tokens": 119549331.0,
"step": 9450
},
{
"epoch": 0.6775772133292702,
"eval_loss": 1.1887913942337036,
"eval_mean_token_accuracy": 0.6761759769916534,
"eval_num_tokens": 119549331.0,
"eval_runtime": 54.9213,
"eval_samples_per_second": 7.283,
"eval_steps_per_second": 0.91,
"step": 9450
},
{
"epoch": 0.68116227795006,
"grad_norm": 4.3697638511657715,
"learning_rate": 3.543658381134481e-05,
"loss": 4.7796,
"mean_token_accuracy": 0.6753658777475358,
"num_tokens": 120180706.0,
"step": 9500
},
{
"epoch": 0.68116227795006,
"eval_loss": 1.187656044960022,
"eval_mean_token_accuracy": 0.6755701994895935,
"eval_num_tokens": 120180706.0,
"eval_runtime": 54.925,
"eval_samples_per_second": 7.283,
"eval_steps_per_second": 0.91,
"step": 9500
},
{
"epoch": 0.6847473425708498,
"grad_norm": 4.676335334777832,
"learning_rate": 3.503824091778203e-05,
"loss": 4.8121,
"mean_token_accuracy": 0.6740303432941437,
"num_tokens": 120813928.0,
"step": 9550
},
{
"epoch": 0.6847473425708498,
"eval_loss": 1.1875134706497192,
"eval_mean_token_accuracy": 0.6751706182956696,
"eval_num_tokens": 120813928.0,
"eval_runtime": 55.1212,
"eval_samples_per_second": 7.257,
"eval_steps_per_second": 0.907,
"step": 9550
},
{
"epoch": 0.6883324071916397,
"grad_norm": 5.17042875289917,
"learning_rate": 3.463989802421925e-05,
"loss": 4.7668,
"mean_token_accuracy": 0.6758210748434067,
"num_tokens": 121446792.0,
"step": 9600
},
{
"epoch": 0.6883324071916397,
"eval_loss": 1.1872638463974,
"eval_mean_token_accuracy": 0.6760274660587311,
"eval_num_tokens": 121446792.0,
"eval_runtime": 55.1534,
"eval_samples_per_second": 7.252,
"eval_steps_per_second": 0.907,
"step": 9600
},
{
"epoch": 0.6919174718124295,
"grad_norm": 4.4179840087890625,
"learning_rate": 3.424155513065647e-05,
"loss": 4.7633,
"mean_token_accuracy": 0.6759152534604073,
"num_tokens": 122072069.0,
"step": 9650
},
{
"epoch": 0.6919174718124295,
"eval_loss": 1.1880455017089844,
"eval_mean_token_accuracy": 0.6756154441833496,
"eval_num_tokens": 122072069.0,
"eval_runtime": 55.0701,
"eval_samples_per_second": 7.263,
"eval_steps_per_second": 0.908,
"step": 9650
},
{
"epoch": 0.6955025364332192,
"grad_norm": 4.7966437339782715,
"learning_rate": 3.384321223709369e-05,
"loss": 4.7314,
"mean_token_accuracy": 0.6790701761841774,
"num_tokens": 122703011.0,
"step": 9700
},
{
"epoch": 0.6955025364332192,
"eval_loss": 1.187593936920166,
"eval_mean_token_accuracy": 0.6753950679302215,
"eval_num_tokens": 122703011.0,
"eval_runtime": 55.1146,
"eval_samples_per_second": 7.258,
"eval_steps_per_second": 0.907,
"step": 9700
},
{
"epoch": 0.699087601054009,
"grad_norm": 4.6988630294799805,
"learning_rate": 3.344486934353091e-05,
"loss": 4.7162,
"mean_token_accuracy": 0.680111817419529,
"num_tokens": 123334937.0,
"step": 9750
},
{
"epoch": 0.699087601054009,
"eval_loss": 1.1878883838653564,
"eval_mean_token_accuracy": 0.6757830834388733,
"eval_num_tokens": 123334937.0,
"eval_runtime": 55.0822,
"eval_samples_per_second": 7.262,
"eval_steps_per_second": 0.908,
"step": 9750
},
{
"epoch": 0.7026726656747988,
"grad_norm": 5.11058235168457,
"learning_rate": 3.3046526449968134e-05,
"loss": 4.7778,
"mean_token_accuracy": 0.6746508419513703,
"num_tokens": 123969291.0,
"step": 9800
},
{
"epoch": 0.7026726656747988,
"eval_loss": 1.1869330406188965,
"eval_mean_token_accuracy": 0.6755635273456574,
"eval_num_tokens": 123969291.0,
"eval_runtime": 55.0482,
"eval_samples_per_second": 7.266,
"eval_steps_per_second": 0.908,
"step": 9800
},
{
"epoch": 0.7062577302955886,
"grad_norm": 4.539863109588623,
"learning_rate": 3.2648183556405356e-05,
"loss": 4.7265,
"mean_token_accuracy": 0.6780777916312217,
"num_tokens": 124597342.0,
"step": 9850
},
{
"epoch": 0.7062577302955886,
"eval_loss": 1.1864935159683228,
"eval_mean_token_accuracy": 0.6759455275535583,
"eval_num_tokens": 124597342.0,
"eval_runtime": 54.9859,
"eval_samples_per_second": 7.275,
"eval_steps_per_second": 0.909,
"step": 9850
},
{
"epoch": 0.7098427949163784,
"grad_norm": 4.2660112380981445,
"learning_rate": 3.224984066284258e-05,
"loss": 4.7727,
"mean_token_accuracy": 0.6779581853747367,
"num_tokens": 125230255.0,
"step": 9900
},
{
"epoch": 0.7098427949163784,
"eval_loss": 1.1863397359848022,
"eval_mean_token_accuracy": 0.6765384769439697,
"eval_num_tokens": 125230255.0,
"eval_runtime": 55.0326,
"eval_samples_per_second": 7.268,
"eval_steps_per_second": 0.909,
"step": 9900
},
{
"epoch": 0.7134278595371681,
"grad_norm": 4.660075664520264,
"learning_rate": 3.18514977692798e-05,
"loss": 4.6841,
"mean_token_accuracy": 0.6811311572790146,
"num_tokens": 125863729.0,
"step": 9950
},
{
"epoch": 0.7134278595371681,
"eval_loss": 1.1863139867782593,
"eval_mean_token_accuracy": 0.676126846075058,
"eval_num_tokens": 125863729.0,
"eval_runtime": 55.1646,
"eval_samples_per_second": 7.251,
"eval_steps_per_second": 0.906,
"step": 9950
},
{
"epoch": 0.7170129241579579,
"grad_norm": 4.517760753631592,
"learning_rate": 3.145315487571702e-05,
"loss": 4.7781,
"mean_token_accuracy": 0.6761695435643196,
"num_tokens": 126496619.0,
"step": 10000
},
{
"epoch": 0.7170129241579579,
"eval_loss": 1.1863616704940796,
"eval_mean_token_accuracy": 0.6760414135456085,
"eval_num_tokens": 126496619.0,
"eval_runtime": 55.0497,
"eval_samples_per_second": 7.266,
"eval_steps_per_second": 0.908,
"step": 10000
},
{
"epoch": 0.7205979887787477,
"grad_norm": 4.821887493133545,
"learning_rate": 3.105481198215424e-05,
"loss": 4.7506,
"mean_token_accuracy": 0.677640742957592,
"num_tokens": 127129025.0,
"step": 10050
},
{
"epoch": 0.7205979887787477,
"eval_loss": 1.1864928007125854,
"eval_mean_token_accuracy": 0.6763237309455872,
"eval_num_tokens": 127129025.0,
"eval_runtime": 55.0217,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.909,
"step": 10050
},
{
"epoch": 0.7241830533995375,
"grad_norm": 5.278724670410156,
"learning_rate": 3.065646908859146e-05,
"loss": 4.7726,
"mean_token_accuracy": 0.6762316790223122,
"num_tokens": 127757800.0,
"step": 10100
},
{
"epoch": 0.7241830533995375,
"eval_loss": 1.1857789754867554,
"eval_mean_token_accuracy": 0.6754815447330474,
"eval_num_tokens": 127757800.0,
"eval_runtime": 55.2673,
"eval_samples_per_second": 7.238,
"eval_steps_per_second": 0.905,
"step": 10100
},
{
"epoch": 0.7277681180203274,
"grad_norm": 4.649436950683594,
"learning_rate": 3.025812619502868e-05,
"loss": 4.803,
"mean_token_accuracy": 0.6757835251092911,
"num_tokens": 128391102.0,
"step": 10150
},
{
"epoch": 0.7277681180203274,
"eval_loss": 1.1859068870544434,
"eval_mean_token_accuracy": 0.6756039881706237,
"eval_num_tokens": 128391102.0,
"eval_runtime": 55.0647,
"eval_samples_per_second": 7.264,
"eval_steps_per_second": 0.908,
"step": 10150
},
{
"epoch": 0.7313531826411171,
"grad_norm": 4.287916660308838,
"learning_rate": 2.9859783301465906e-05,
"loss": 4.7373,
"mean_token_accuracy": 0.677640765607357,
"num_tokens": 129023759.0,
"step": 10200
},
{
"epoch": 0.7313531826411171,
"eval_loss": 1.1861519813537598,
"eval_mean_token_accuracy": 0.6761102056503296,
"eval_num_tokens": 129023759.0,
"eval_runtime": 54.9417,
"eval_samples_per_second": 7.28,
"eval_steps_per_second": 0.91,
"step": 10200
},
{
"epoch": 0.7349382472619069,
"grad_norm": 4.765435695648193,
"learning_rate": 2.9461440407903124e-05,
"loss": 4.7741,
"mean_token_accuracy": 0.6759647503495216,
"num_tokens": 129655248.0,
"step": 10250
},
{
"epoch": 0.7349382472619069,
"eval_loss": 1.1853660345077515,
"eval_mean_token_accuracy": 0.6756195032596588,
"eval_num_tokens": 129655248.0,
"eval_runtime": 55.3223,
"eval_samples_per_second": 7.23,
"eval_steps_per_second": 0.904,
"step": 10250
},
{
"epoch": 0.7385233118826967,
"grad_norm": 4.814145088195801,
"learning_rate": 2.906309751434035e-05,
"loss": 4.783,
"mean_token_accuracy": 0.6754540035128593,
"num_tokens": 130291512.0,
"step": 10300
},
{
"epoch": 0.7385233118826967,
"eval_loss": 1.1858062744140625,
"eval_mean_token_accuracy": 0.6755566847324371,
"eval_num_tokens": 130291512.0,
"eval_runtime": 55.3243,
"eval_samples_per_second": 7.23,
"eval_steps_per_second": 0.904,
"step": 10300
},
{
"epoch": 0.7421083765034865,
"grad_norm": 4.518885135650635,
"learning_rate": 2.8664754620777567e-05,
"loss": 4.7083,
"mean_token_accuracy": 0.6788066929578781,
"num_tokens": 130922542.0,
"step": 10350
},
{
"epoch": 0.7421083765034865,
"eval_loss": 1.185410499572754,
"eval_mean_token_accuracy": 0.6754859507083892,
"eval_num_tokens": 130922542.0,
"eval_runtime": 55.3328,
"eval_samples_per_second": 7.229,
"eval_steps_per_second": 0.904,
"step": 10350
},
{
"epoch": 0.7456934411242763,
"grad_norm": 4.623891830444336,
"learning_rate": 2.8266411727214788e-05,
"loss": 4.8077,
"mean_token_accuracy": 0.6753540116548539,
"num_tokens": 131553716.0,
"step": 10400
},
{
"epoch": 0.7456934411242763,
"eval_loss": 1.18582022190094,
"eval_mean_token_accuracy": 0.6761428475379944,
"eval_num_tokens": 131553716.0,
"eval_runtime": 55.033,
"eval_samples_per_second": 7.268,
"eval_steps_per_second": 0.909,
"step": 10400
},
{
"epoch": 0.749278505745066,
"grad_norm": 4.524717807769775,
"learning_rate": 2.7868068833652013e-05,
"loss": 4.7216,
"mean_token_accuracy": 0.6801271498203277,
"num_tokens": 132187857.0,
"step": 10450
},
{
"epoch": 0.749278505745066,
"eval_loss": 1.1854428052902222,
"eval_mean_token_accuracy": 0.6755358970165253,
"eval_num_tokens": 132187857.0,
"eval_runtime": 55.0174,
"eval_samples_per_second": 7.27,
"eval_steps_per_second": 0.909,
"step": 10450
},
{
"epoch": 0.7528635703658558,
"grad_norm": 4.81862211227417,
"learning_rate": 2.746972594008923e-05,
"loss": 4.8024,
"mean_token_accuracy": 0.6740377223491669,
"num_tokens": 132818348.0,
"step": 10500
},
{
"epoch": 0.7528635703658558,
"eval_loss": 1.184729814529419,
"eval_mean_token_accuracy": 0.6763768219947814,
"eval_num_tokens": 132818348.0,
"eval_runtime": 54.9843,
"eval_samples_per_second": 7.275,
"eval_steps_per_second": 0.909,
"step": 10500
},
{
"epoch": 0.7564486349866456,
"grad_norm": 4.876639366149902,
"learning_rate": 2.707138304652645e-05,
"loss": 4.691,
"mean_token_accuracy": 0.682259525358677,
"num_tokens": 133450534.0,
"step": 10550
},
{
"epoch": 0.7564486349866456,
"eval_loss": 1.1844180822372437,
"eval_mean_token_accuracy": 0.6765269267559052,
"eval_num_tokens": 133450534.0,
"eval_runtime": 55.1025,
"eval_samples_per_second": 7.259,
"eval_steps_per_second": 0.907,
"step": 10550
},
{
"epoch": 0.7600336996074354,
"grad_norm": 5.20668363571167,
"learning_rate": 2.6673040152963674e-05,
"loss": 4.7633,
"mean_token_accuracy": 0.6769976457953453,
"num_tokens": 134081427.0,
"step": 10600
},
{
"epoch": 0.7600336996074354,
"eval_loss": 1.1848989725112915,
"eval_mean_token_accuracy": 0.6765480875968933,
"eval_num_tokens": 134081427.0,
"eval_runtime": 55.16,
"eval_samples_per_second": 7.252,
"eval_steps_per_second": 0.906,
"step": 10600
},
{
"epoch": 0.7636187642282252,
"grad_norm": 4.415744304656982,
"learning_rate": 2.6274697259400892e-05,
"loss": 4.7572,
"mean_token_accuracy": 0.6762901389598847,
"num_tokens": 134717648.0,
"step": 10650
},
{
"epoch": 0.7636187642282252,
"eval_loss": 1.1854746341705322,
"eval_mean_token_accuracy": 0.6762021934986114,
"eval_num_tokens": 134717648.0,
"eval_runtime": 55.1597,
"eval_samples_per_second": 7.252,
"eval_steps_per_second": 0.906,
"step": 10650
},
{
"epoch": 0.7672038288490151,
"grad_norm": 4.984974384307861,
"learning_rate": 2.5876354365838113e-05,
"loss": 4.6918,
"mean_token_accuracy": 0.6813731342554092,
"num_tokens": 135352362.0,
"step": 10700
},
{
"epoch": 0.7672038288490151,
"eval_loss": 1.1856120824813843,
"eval_mean_token_accuracy": 0.6763051617145538,
"eval_num_tokens": 135352362.0,
"eval_runtime": 54.9725,
"eval_samples_per_second": 7.276,
"eval_steps_per_second": 0.91,
"step": 10700
},
{
"epoch": 0.7707888934698048,
"grad_norm": 4.5358781814575195,
"learning_rate": 2.5478011472275338e-05,
"loss": 4.7242,
"mean_token_accuracy": 0.6771323186159134,
"num_tokens": 135991559.0,
"step": 10750
},
{
"epoch": 0.7707888934698048,
"eval_loss": 1.185410737991333,
"eval_mean_token_accuracy": 0.676514265537262,
"eval_num_tokens": 135991559.0,
"eval_runtime": 55.0995,
"eval_samples_per_second": 7.26,
"eval_steps_per_second": 0.907,
"step": 10750
},
{
"epoch": 0.7743739580905946,
"grad_norm": 4.364614009857178,
"learning_rate": 2.5079668578712556e-05,
"loss": 4.6947,
"mean_token_accuracy": 0.6806002199649811,
"num_tokens": 136624024.0,
"step": 10800
},
{
"epoch": 0.7743739580905946,
"eval_loss": 1.184848666191101,
"eval_mean_token_accuracy": 0.676878696680069,
"eval_num_tokens": 136624024.0,
"eval_runtime": 55.7118,
"eval_samples_per_second": 7.18,
"eval_steps_per_second": 0.897,
"step": 10800
},
{
"epoch": 0.7779590227113844,
"grad_norm": 4.293883323669434,
"learning_rate": 2.4681325685149778e-05,
"loss": 4.7454,
"mean_token_accuracy": 0.6780085292458534,
"num_tokens": 137259400.0,
"step": 10850
},
{
"epoch": 0.7779590227113844,
"eval_loss": 1.185441493988037,
"eval_mean_token_accuracy": 0.6766848075389862,
"eval_num_tokens": 137259400.0,
"eval_runtime": 56.8526,
"eval_samples_per_second": 7.036,
"eval_steps_per_second": 0.879,
"step": 10850
},
{
"epoch": 0.7815440873321742,
"grad_norm": 4.8266143798828125,
"learning_rate": 2.4282982791587e-05,
"loss": 4.7078,
"mean_token_accuracy": 0.6800830870866775,
"num_tokens": 137888937.0,
"step": 10900
},
{
"epoch": 0.7815440873321742,
"eval_loss": 1.18449068069458,
"eval_mean_token_accuracy": 0.6764636623859406,
"eval_num_tokens": 137888937.0,
"eval_runtime": 56.5392,
"eval_samples_per_second": 7.075,
"eval_steps_per_second": 0.884,
"step": 10900
},
{
"epoch": 0.785129151952964,
"grad_norm": 4.471580982208252,
"learning_rate": 2.388463989802422e-05,
"loss": 4.6928,
"mean_token_accuracy": 0.6799645683169365,
"num_tokens": 138523132.0,
"step": 10950
},
{
"epoch": 0.785129151952964,
"eval_loss": 1.1840896606445312,
"eval_mean_token_accuracy": 0.6770457863807678,
"eval_num_tokens": 138523132.0,
"eval_runtime": 56.9603,
"eval_samples_per_second": 7.022,
"eval_steps_per_second": 0.878,
"step": 10950
},
{
"epoch": 0.7887142165737537,
"grad_norm": 4.892276763916016,
"learning_rate": 2.3486297004461442e-05,
"loss": 4.7822,
"mean_token_accuracy": 0.6743768805265427,
"num_tokens": 139156280.0,
"step": 11000
},
{
"epoch": 0.7887142165737537,
"eval_loss": 1.1841989755630493,
"eval_mean_token_accuracy": 0.6765543162822724,
"eval_num_tokens": 139156280.0,
"eval_runtime": 56.1325,
"eval_samples_per_second": 7.126,
"eval_steps_per_second": 0.891,
"step": 11000
},
{
"epoch": 0.7922992811945435,
"grad_norm": 5.218216896057129,
"learning_rate": 2.3087954110898663e-05,
"loss": 4.753,
"mean_token_accuracy": 0.6765683805942535,
"num_tokens": 139786084.0,
"step": 11050
},
{
"epoch": 0.7922992811945435,
"eval_loss": 1.1838265657424927,
"eval_mean_token_accuracy": 0.6768669807910919,
"eval_num_tokens": 139786084.0,
"eval_runtime": 57.9142,
"eval_samples_per_second": 6.907,
"eval_steps_per_second": 0.863,
"step": 11050
},
{
"epoch": 0.7958843458153333,
"grad_norm": 4.109825134277344,
"learning_rate": 2.2689611217335885e-05,
"loss": 4.766,
"mean_token_accuracy": 0.6766787865757942,
"num_tokens": 140420913.0,
"step": 11100
},
{
"epoch": 0.7958843458153333,
"eval_loss": 1.183830976486206,
"eval_mean_token_accuracy": 0.6771005463600158,
"eval_num_tokens": 140420913.0,
"eval_runtime": 57.8895,
"eval_samples_per_second": 6.91,
"eval_steps_per_second": 0.864,
"step": 11100
},
{
"epoch": 0.7994694104361231,
"grad_norm": 4.745416641235352,
"learning_rate": 2.2291268323773103e-05,
"loss": 4.7743,
"mean_token_accuracy": 0.6760394325852395,
"num_tokens": 141048743.0,
"step": 11150
},
{
"epoch": 0.7994694104361231,
"eval_loss": 1.184319257736206,
"eval_mean_token_accuracy": 0.6767275559902192,
"eval_num_tokens": 141048743.0,
"eval_runtime": 56.6971,
"eval_samples_per_second": 7.055,
"eval_steps_per_second": 0.882,
"step": 11150
},
{
"epoch": 0.8030544750569129,
"grad_norm": 5.053956985473633,
"learning_rate": 2.1892925430210324e-05,
"loss": 4.7635,
"mean_token_accuracy": 0.6785035586357117,
"num_tokens": 141678181.0,
"step": 11200
},
{
"epoch": 0.8030544750569129,
"eval_loss": 1.184045672416687,
"eval_mean_token_accuracy": 0.6763345134258271,
"eval_num_tokens": 141678181.0,
"eval_runtime": 57.5357,
"eval_samples_per_second": 6.952,
"eval_steps_per_second": 0.869,
"step": 11200
},
{
"epoch": 0.8066395396777027,
"grad_norm": 4.613523006439209,
"learning_rate": 2.149458253664755e-05,
"loss": 4.7037,
"mean_token_accuracy": 0.6807671126723289,
"num_tokens": 142312431.0,
"step": 11250
},
{
"epoch": 0.8066395396777027,
"eval_loss": 1.1838032007217407,
"eval_mean_token_accuracy": 0.6768004512786865,
"eval_num_tokens": 142312431.0,
"eval_runtime": 57.4032,
"eval_samples_per_second": 6.968,
"eval_steps_per_second": 0.871,
"step": 11250
},
{
"epoch": 0.8102246042984925,
"grad_norm": 4.577108860015869,
"learning_rate": 2.109623964308477e-05,
"loss": 4.7398,
"mean_token_accuracy": 0.6783872780203819,
"num_tokens": 142944119.0,
"step": 11300
},
{
"epoch": 0.8102246042984925,
"eval_loss": 1.1838161945343018,
"eval_mean_token_accuracy": 0.676618036031723,
"eval_num_tokens": 142944119.0,
"eval_runtime": 57.4773,
"eval_samples_per_second": 6.959,
"eval_steps_per_second": 0.87,
"step": 11300
},
{
"epoch": 0.8138096689192823,
"grad_norm": 4.523055553436279,
"learning_rate": 2.069789674952199e-05,
"loss": 4.7537,
"mean_token_accuracy": 0.6783151313662529,
"num_tokens": 143575945.0,
"step": 11350
},
{
"epoch": 0.8138096689192823,
"eval_loss": 1.183152437210083,
"eval_mean_token_accuracy": 0.6768651962280273,
"eval_num_tokens": 143575945.0,
"eval_runtime": 57.8648,
"eval_samples_per_second": 6.913,
"eval_steps_per_second": 0.864,
"step": 11350
},
{
"epoch": 0.8173947335400721,
"grad_norm": 4.623985290527344,
"learning_rate": 2.029955385595921e-05,
"loss": 4.6946,
"mean_token_accuracy": 0.6809823432564736,
"num_tokens": 144207834.0,
"step": 11400
},
{
"epoch": 0.8173947335400721,
"eval_loss": 1.1837332248687744,
"eval_mean_token_accuracy": 0.6768518340587616,
"eval_num_tokens": 144207834.0,
"eval_runtime": 57.9156,
"eval_samples_per_second": 6.907,
"eval_steps_per_second": 0.863,
"step": 11400
},
{
"epoch": 0.8209797981608619,
"grad_norm": 4.573394298553467,
"learning_rate": 1.990121096239643e-05,
"loss": 4.733,
"mean_token_accuracy": 0.6784341213107109,
"num_tokens": 144840226.0,
"step": 11450
},
{
"epoch": 0.8209797981608619,
"eval_loss": 1.1838306188583374,
"eval_mean_token_accuracy": 0.6770703101158142,
"eval_num_tokens": 144840226.0,
"eval_runtime": 57.6139,
"eval_samples_per_second": 6.943,
"eval_steps_per_second": 0.868,
"step": 11450
},
{
"epoch": 0.8245648627816516,
"grad_norm": 4.6763458251953125,
"learning_rate": 1.9502868068833653e-05,
"loss": 4.7745,
"mean_token_accuracy": 0.6781399786472321,
"num_tokens": 145475463.0,
"step": 11500
},
{
"epoch": 0.8245648627816516,
"eval_loss": 1.1832276582717896,
"eval_mean_token_accuracy": 0.6768820834159851,
"eval_num_tokens": 145475463.0,
"eval_runtime": 56.936,
"eval_samples_per_second": 7.025,
"eval_steps_per_second": 0.878,
"step": 11500
},
{
"epoch": 0.8281499274024414,
"grad_norm": 4.544640064239502,
"learning_rate": 1.9104525175270875e-05,
"loss": 4.6995,
"mean_token_accuracy": 0.6816430819034577,
"num_tokens": 146110411.0,
"step": 11550
},
{
"epoch": 0.8281499274024414,
"eval_loss": 1.1833053827285767,
"eval_mean_token_accuracy": 0.6773410534858704,
"eval_num_tokens": 146110411.0,
"eval_runtime": 57.0468,
"eval_samples_per_second": 7.012,
"eval_steps_per_second": 0.876,
"step": 11550
},
{
"epoch": 0.8317349920232312,
"grad_norm": 4.831193923950195,
"learning_rate": 1.8706182281708096e-05,
"loss": 4.7265,
"mean_token_accuracy": 0.6784323596954346,
"num_tokens": 146744949.0,
"step": 11600
},
{
"epoch": 0.8317349920232312,
"eval_loss": 1.1832283735275269,
"eval_mean_token_accuracy": 0.6767588186264039,
"eval_num_tokens": 146744949.0,
"eval_runtime": 57.1574,
"eval_samples_per_second": 6.998,
"eval_steps_per_second": 0.875,
"step": 11600
},
{
"epoch": 0.835320056644021,
"grad_norm": 4.2518086433410645,
"learning_rate": 1.8307839388145317e-05,
"loss": 4.7231,
"mean_token_accuracy": 0.6789010632038116,
"num_tokens": 147377879.0,
"step": 11650
},
{
"epoch": 0.835320056644021,
"eval_loss": 1.1831552982330322,
"eval_mean_token_accuracy": 0.6766877925395965,
"eval_num_tokens": 147377879.0,
"eval_runtime": 57.2174,
"eval_samples_per_second": 6.991,
"eval_steps_per_second": 0.874,
"step": 11650
},
{
"epoch": 0.8389051212648108,
"grad_norm": 4.656574726104736,
"learning_rate": 1.7909496494582535e-05,
"loss": 4.7387,
"mean_token_accuracy": 0.6785845035314559,
"num_tokens": 148008340.0,
"step": 11700
},
{
"epoch": 0.8389051212648108,
"eval_loss": 1.1829930543899536,
"eval_mean_token_accuracy": 0.6767005050182342,
"eval_num_tokens": 148008340.0,
"eval_runtime": 56.5488,
"eval_samples_per_second": 7.074,
"eval_steps_per_second": 0.884,
"step": 11700
},
{
"epoch": 0.8424901858856005,
"grad_norm": 5.07755184173584,
"learning_rate": 1.7511153601019757e-05,
"loss": 4.6949,
"mean_token_accuracy": 0.6815642186999321,
"num_tokens": 148637370.0,
"step": 11750
},
{
"epoch": 0.8424901858856005,
"eval_loss": 1.1835424900054932,
"eval_mean_token_accuracy": 0.6766264629364014,
"eval_num_tokens": 148637370.0,
"eval_runtime": 56.844,
"eval_samples_per_second": 7.037,
"eval_steps_per_second": 0.88,
"step": 11750
},
{
"epoch": 0.8460752505063904,
"grad_norm": 4.937259674072266,
"learning_rate": 1.7112810707456982e-05,
"loss": 4.7547,
"mean_token_accuracy": 0.6751632392406464,
"num_tokens": 149266269.0,
"step": 11800
},
{
"epoch": 0.8460752505063904,
"eval_loss": 1.1830496788024902,
"eval_mean_token_accuracy": 0.6767467558383942,
"eval_num_tokens": 149266269.0,
"eval_runtime": 56.5429,
"eval_samples_per_second": 7.074,
"eval_steps_per_second": 0.884,
"step": 11800
},
{
"epoch": 0.8496603151271802,
"grad_norm": 4.5733795166015625,
"learning_rate": 1.67144678138942e-05,
"loss": 4.7398,
"mean_token_accuracy": 0.6781432759761811,
"num_tokens": 149898007.0,
"step": 11850
},
{
"epoch": 0.8496603151271802,
"eval_loss": 1.1832393407821655,
"eval_mean_token_accuracy": 0.6769970893859864,
"eval_num_tokens": 149898007.0,
"eval_runtime": 55.172,
"eval_samples_per_second": 7.25,
"eval_steps_per_second": 0.906,
"step": 11850
},
{
"epoch": 0.85324537974797,
"grad_norm": 4.531384468078613,
"learning_rate": 1.631612492033142e-05,
"loss": 4.7097,
"mean_token_accuracy": 0.6787867891788483,
"num_tokens": 150529318.0,
"step": 11900
},
{
"epoch": 0.85324537974797,
"eval_loss": 1.1830426454544067,
"eval_mean_token_accuracy": 0.6767821443080902,
"eval_num_tokens": 150529318.0,
"eval_runtime": 55.9298,
"eval_samples_per_second": 7.152,
"eval_steps_per_second": 0.894,
"step": 11900
},
{
"epoch": 0.8568304443687598,
"grad_norm": 4.669693946838379,
"learning_rate": 1.5917782026768643e-05,
"loss": 4.7679,
"mean_token_accuracy": 0.6752150565385818,
"num_tokens": 151163202.0,
"step": 11950
},
{
"epoch": 0.8568304443687598,
"eval_loss": 1.1831849813461304,
"eval_mean_token_accuracy": 0.6765953767299652,
"eval_num_tokens": 151163202.0,
"eval_runtime": 56.6538,
"eval_samples_per_second": 7.06,
"eval_steps_per_second": 0.883,
"step": 11950
},
{
"epoch": 0.8604155089895495,
"grad_norm": 4.184320449829102,
"learning_rate": 1.5519439133205864e-05,
"loss": 4.713,
"mean_token_accuracy": 0.6797751143574715,
"num_tokens": 151798569.0,
"step": 12000
},
{
"epoch": 0.8604155089895495,
"eval_loss": 1.182477593421936,
"eval_mean_token_accuracy": 0.6768123960494995,
"eval_num_tokens": 151798569.0,
"eval_runtime": 56.3382,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 0.887,
"step": 12000
},
{
"epoch": 0.8640005736103393,
"grad_norm": 4.763125896453857,
"learning_rate": 1.5121096239643084e-05,
"loss": 4.7338,
"mean_token_accuracy": 0.6781762626767158,
"num_tokens": 152431951.0,
"step": 12050
},
{
"epoch": 0.8640005736103393,
"eval_loss": 1.182124137878418,
"eval_mean_token_accuracy": 0.6777550578117371,
"eval_num_tokens": 152431951.0,
"eval_runtime": 56.6371,
"eval_samples_per_second": 7.063,
"eval_steps_per_second": 0.883,
"step": 12050
},
{
"epoch": 0.8675856382311291,
"grad_norm": 4.805209159851074,
"learning_rate": 1.4722753346080307e-05,
"loss": 4.7565,
"mean_token_accuracy": 0.6775122970342636,
"num_tokens": 153063882.0,
"step": 12100
},
{
"epoch": 0.8675856382311291,
"eval_loss": 1.1821281909942627,
"eval_mean_token_accuracy": 0.6782806706428528,
"eval_num_tokens": 153063882.0,
"eval_runtime": 56.6386,
"eval_samples_per_second": 7.062,
"eval_steps_per_second": 0.883,
"step": 12100
},
{
"epoch": 0.8711707028519189,
"grad_norm": 4.224789142608643,
"learning_rate": 1.4324410452517528e-05,
"loss": 4.7722,
"mean_token_accuracy": 0.6754378816485405,
"num_tokens": 153698583.0,
"step": 12150
},
{
"epoch": 0.8711707028519189,
"eval_loss": 1.182055950164795,
"eval_mean_token_accuracy": 0.6773766386508941,
"eval_num_tokens": 153698583.0,
"eval_runtime": 56.2278,
"eval_samples_per_second": 7.114,
"eval_steps_per_second": 0.889,
"step": 12150
},
{
"epoch": 0.8747557674727087,
"grad_norm": 4.622290134429932,
"learning_rate": 1.392606755895475e-05,
"loss": 4.7395,
"mean_token_accuracy": 0.678723790049553,
"num_tokens": 154330119.0,
"step": 12200
},
{
"epoch": 0.8747557674727087,
"eval_loss": 1.1820727586746216,
"eval_mean_token_accuracy": 0.6769333493709564,
"eval_num_tokens": 154330119.0,
"eval_runtime": 56.605,
"eval_samples_per_second": 7.067,
"eval_steps_per_second": 0.883,
"step": 12200
},
{
"epoch": 0.8783408320934984,
"grad_norm": 4.508255481719971,
"learning_rate": 1.352772466539197e-05,
"loss": 4.6912,
"mean_token_accuracy": 0.6804595556855202,
"num_tokens": 154963205.0,
"step": 12250
},
{
"epoch": 0.8783408320934984,
"eval_loss": 1.1818066835403442,
"eval_mean_token_accuracy": 0.6772564661502838,
"eval_num_tokens": 154963205.0,
"eval_runtime": 55.5536,
"eval_samples_per_second": 7.2,
"eval_steps_per_second": 0.9,
"step": 12250
},
{
"epoch": 0.8819258967142882,
"grad_norm": 4.340250492095947,
"learning_rate": 1.3129381771829191e-05,
"loss": 4.7228,
"mean_token_accuracy": 0.6781974649429321,
"num_tokens": 155595688.0,
"step": 12300
},
{
"epoch": 0.8819258967142882,
"eval_loss": 1.1819037199020386,
"eval_mean_token_accuracy": 0.677418692111969,
"eval_num_tokens": 155595688.0,
"eval_runtime": 55.5311,
"eval_samples_per_second": 7.203,
"eval_steps_per_second": 0.9,
"step": 12300
},
{
"epoch": 0.8855109613350781,
"grad_norm": 4.752552032470703,
"learning_rate": 1.2731038878266413e-05,
"loss": 4.7694,
"mean_token_accuracy": 0.6762826785445213,
"num_tokens": 156228825.0,
"step": 12350
},
{
"epoch": 0.8855109613350781,
"eval_loss": 1.1815353631973267,
"eval_mean_token_accuracy": 0.6776353216171265,
"eval_num_tokens": 156228825.0,
"eval_runtime": 55.7037,
"eval_samples_per_second": 7.181,
"eval_steps_per_second": 0.898,
"step": 12350
},
{
"epoch": 0.8890960259558679,
"grad_norm": 4.746983528137207,
"learning_rate": 1.2332695984703634e-05,
"loss": 4.7539,
"mean_token_accuracy": 0.6774281883239746,
"num_tokens": 156864867.0,
"step": 12400
},
{
"epoch": 0.8890960259558679,
"eval_loss": 1.1814591884613037,
"eval_mean_token_accuracy": 0.6772267067432404,
"eval_num_tokens": 156864867.0,
"eval_runtime": 55.2801,
"eval_samples_per_second": 7.236,
"eval_steps_per_second": 0.904,
"step": 12400
},
{
"epoch": 0.8926810905766577,
"grad_norm": 4.964954376220703,
"learning_rate": 1.1934353091140854e-05,
"loss": 4.652,
"mean_token_accuracy": 0.6832978922128677,
"num_tokens": 157497835.0,
"step": 12450
},
{
"epoch": 0.8926810905766577,
"eval_loss": 1.1815037727355957,
"eval_mean_token_accuracy": 0.6775709521770478,
"eval_num_tokens": 157497835.0,
"eval_runtime": 55.4264,
"eval_samples_per_second": 7.217,
"eval_steps_per_second": 0.902,
"step": 12450
},
{
"epoch": 0.8962661551974475,
"grad_norm": 4.32532262802124,
"learning_rate": 1.1536010197578075e-05,
"loss": 4.6811,
"mean_token_accuracy": 0.6813494926691055,
"num_tokens": 158131957.0,
"step": 12500
},
{
"epoch": 0.8962661551974475,
"eval_loss": 1.181230902671814,
"eval_mean_token_accuracy": 0.6782212293148041,
"eval_num_tokens": 158131957.0,
"eval_runtime": 55.638,
"eval_samples_per_second": 7.189,
"eval_steps_per_second": 0.899,
"step": 12500
},
{
"epoch": 0.8998512198182372,
"grad_norm": 4.772362232208252,
"learning_rate": 1.1137667304015297e-05,
"loss": 4.6799,
"mean_token_accuracy": 0.6808639001846314,
"num_tokens": 158765520.0,
"step": 12550
},
{
"epoch": 0.8998512198182372,
"eval_loss": 1.18119215965271,
"eval_mean_token_accuracy": 0.6776848089694977,
"eval_num_tokens": 158765520.0,
"eval_runtime": 56.5407,
"eval_samples_per_second": 7.075,
"eval_steps_per_second": 0.884,
"step": 12550
},
{
"epoch": 0.903436284439027,
"grad_norm": 4.406890869140625,
"learning_rate": 1.0739324410452518e-05,
"loss": 4.6572,
"mean_token_accuracy": 0.6824618262052536,
"num_tokens": 159396320.0,
"step": 12600
},
{
"epoch": 0.903436284439027,
"eval_loss": 1.1813263893127441,
"eval_mean_token_accuracy": 0.677431755065918,
"eval_num_tokens": 159396320.0,
"eval_runtime": 56.1014,
"eval_samples_per_second": 7.13,
"eval_steps_per_second": 0.891,
"step": 12600
},
{
"epoch": 0.9070213490598168,
"grad_norm": 4.6225786209106445,
"learning_rate": 1.034098151688974e-05,
"loss": 4.7346,
"mean_token_accuracy": 0.6780241671204567,
"num_tokens": 160028813.0,
"step": 12650
},
{
"epoch": 0.9070213490598168,
"eval_loss": 1.18108332157135,
"eval_mean_token_accuracy": 0.6777166557312012,
"eval_num_tokens": 160028813.0,
"eval_runtime": 55.9817,
"eval_samples_per_second": 7.145,
"eval_steps_per_second": 0.893,
"step": 12650
},
{
"epoch": 0.9106064136806066,
"grad_norm": 5.096744537353516,
"learning_rate": 9.942638623326961e-06,
"loss": 4.7072,
"mean_token_accuracy": 0.6779900795221329,
"num_tokens": 160660115.0,
"step": 12700
},
{
"epoch": 0.9106064136806066,
"eval_loss": 1.1813737154006958,
"eval_mean_token_accuracy": 0.6772573125362397,
"eval_num_tokens": 160660115.0,
"eval_runtime": 56.7004,
"eval_samples_per_second": 7.055,
"eval_steps_per_second": 0.882,
"step": 12700
},
{
"epoch": 0.9141914783013964,
"grad_norm": 4.954991817474365,
"learning_rate": 9.54429572976418e-06,
"loss": 4.7609,
"mean_token_accuracy": 0.6763640037178993,
"num_tokens": 161295929.0,
"step": 12750
},
{
"epoch": 0.9141914783013964,
"eval_loss": 1.1813360452651978,
"eval_mean_token_accuracy": 0.6774272322654724,
"eval_num_tokens": 161295929.0,
"eval_runtime": 55.8406,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 0.895,
"step": 12750
},
{
"epoch": 0.9177765429221861,
"grad_norm": 4.923917770385742,
"learning_rate": 9.145952836201404e-06,
"loss": 4.8154,
"mean_token_accuracy": 0.6726123803853988,
"num_tokens": 161927138.0,
"step": 12800
},
{
"epoch": 0.9177765429221861,
"eval_loss": 1.1811388731002808,
"eval_mean_token_accuracy": 0.6773995268344879,
"eval_num_tokens": 161927138.0,
"eval_runtime": 55.8065,
"eval_samples_per_second": 7.168,
"eval_steps_per_second": 0.896,
"step": 12800
},
{
"epoch": 0.9213616075429759,
"grad_norm": 4.706872463226318,
"learning_rate": 8.747609942638624e-06,
"loss": 4.7658,
"mean_token_accuracy": 0.6769201335310936,
"num_tokens": 162553898.0,
"step": 12850
},
{
"epoch": 0.9213616075429759,
"eval_loss": 1.1811527013778687,
"eval_mean_token_accuracy": 0.6772512257099151,
"eval_num_tokens": 162553898.0,
"eval_runtime": 55.933,
"eval_samples_per_second": 7.151,
"eval_steps_per_second": 0.894,
"step": 12850
},
{
"epoch": 0.9249466721637658,
"grad_norm": 4.29292106628418,
"learning_rate": 8.349267049075845e-06,
"loss": 4.7529,
"mean_token_accuracy": 0.678237376511097,
"num_tokens": 163187273.0,
"step": 12900
},
{
"epoch": 0.9249466721637658,
"eval_loss": 1.1813446283340454,
"eval_mean_token_accuracy": 0.6770773160457612,
"eval_num_tokens": 163187273.0,
"eval_runtime": 55.9165,
"eval_samples_per_second": 7.154,
"eval_steps_per_second": 0.894,
"step": 12900
},
{
"epoch": 0.9285317367845556,
"grad_norm": 4.577188968658447,
"learning_rate": 7.950924155513067e-06,
"loss": 4.7151,
"mean_token_accuracy": 0.6785858425498009,
"num_tokens": 163816024.0,
"step": 12950
},
{
"epoch": 0.9285317367845556,
"eval_loss": 1.181489109992981,
"eval_mean_token_accuracy": 0.6775988221168519,
"eval_num_tokens": 163816024.0,
"eval_runtime": 55.8424,
"eval_samples_per_second": 7.163,
"eval_steps_per_second": 0.895,
"step": 12950
},
{
"epoch": 0.9321168014053454,
"grad_norm": 4.593563556671143,
"learning_rate": 7.552581261950287e-06,
"loss": 4.7085,
"mean_token_accuracy": 0.6803634178638458,
"num_tokens": 164445021.0,
"step": 13000
},
{
"epoch": 0.9321168014053454,
"eval_loss": 1.1816061735153198,
"eval_mean_token_accuracy": 0.6776184713840485,
"eval_num_tokens": 164445021.0,
"eval_runtime": 55.8489,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 0.895,
"step": 13000
},
{
"epoch": 0.9357018660261351,
"grad_norm": 4.309682846069336,
"learning_rate": 7.1542383683875086e-06,
"loss": 4.706,
"mean_token_accuracy": 0.6809570705890655,
"num_tokens": 165073944.0,
"step": 13050
},
{
"epoch": 0.9357018660261351,
"eval_loss": 1.181320071220398,
"eval_mean_token_accuracy": 0.6780745506286621,
"eval_num_tokens": 165073944.0,
"eval_runtime": 55.6478,
"eval_samples_per_second": 7.188,
"eval_steps_per_second": 0.899,
"step": 13050
},
{
"epoch": 0.9392869306469249,
"grad_norm": 4.485646724700928,
"learning_rate": 6.755895474824729e-06,
"loss": 4.7568,
"mean_token_accuracy": 0.677568726837635,
"num_tokens": 165708351.0,
"step": 13100
},
{
"epoch": 0.9392869306469249,
"eval_loss": 1.1809498071670532,
"eval_mean_token_accuracy": 0.677814108133316,
"eval_num_tokens": 165708351.0,
"eval_runtime": 55.8268,
"eval_samples_per_second": 7.165,
"eval_steps_per_second": 0.896,
"step": 13100
},
{
"epoch": 0.9428719952677147,
"grad_norm": 4.374978065490723,
"learning_rate": 6.357552581261951e-06,
"loss": 4.7056,
"mean_token_accuracy": 0.6803146860003472,
"num_tokens": 166342580.0,
"step": 13150
},
{
"epoch": 0.9428719952677147,
"eval_loss": 1.1812047958374023,
"eval_mean_token_accuracy": 0.6775732636451721,
"eval_num_tokens": 166342580.0,
"eval_runtime": 56.8808,
"eval_samples_per_second": 7.032,
"eval_steps_per_second": 0.879,
"step": 13150
},
{
"epoch": 0.9464570598885045,
"grad_norm": 4.719696044921875,
"learning_rate": 5.959209687699171e-06,
"loss": 4.6996,
"mean_token_accuracy": 0.6807303726673126,
"num_tokens": 166972337.0,
"step": 13200
},
{
"epoch": 0.9464570598885045,
"eval_loss": 1.1809191703796387,
"eval_mean_token_accuracy": 0.6775369119644165,
"eval_num_tokens": 166972337.0,
"eval_runtime": 55.3898,
"eval_samples_per_second": 7.222,
"eval_steps_per_second": 0.903,
"step": 13200
},
{
"epoch": 0.9500421245092943,
"grad_norm": 4.4557905197143555,
"learning_rate": 5.560866794136393e-06,
"loss": 4.6934,
"mean_token_accuracy": 0.6803115239739418,
"num_tokens": 167608984.0,
"step": 13250
},
{
"epoch": 0.9500421245092943,
"eval_loss": 1.1811048984527588,
"eval_mean_token_accuracy": 0.6776836955547333,
"eval_num_tokens": 167608984.0,
"eval_runtime": 56.7029,
"eval_samples_per_second": 7.054,
"eval_steps_per_second": 0.882,
"step": 13250
},
{
"epoch": 0.953627189130084,
"grad_norm": 4.890408515930176,
"learning_rate": 5.162523900573614e-06,
"loss": 4.7407,
"mean_token_accuracy": 0.6763252380490303,
"num_tokens": 168240445.0,
"step": 13300
},
{
"epoch": 0.953627189130084,
"eval_loss": 1.1808910369873047,
"eval_mean_token_accuracy": 0.6770796060562134,
"eval_num_tokens": 168240445.0,
"eval_runtime": 56.5546,
"eval_samples_per_second": 7.073,
"eval_steps_per_second": 0.884,
"step": 13300
},
{
"epoch": 0.9572122537508738,
"grad_norm": 5.145451545715332,
"learning_rate": 4.7641810070108355e-06,
"loss": 4.7417,
"mean_token_accuracy": 0.6776255601644516,
"num_tokens": 168871775.0,
"step": 13350
},
{
"epoch": 0.9572122537508738,
"eval_loss": 1.1808017492294312,
"eval_mean_token_accuracy": 0.6777704417705536,
"eval_num_tokens": 168871775.0,
"eval_runtime": 56.3719,
"eval_samples_per_second": 7.096,
"eval_steps_per_second": 0.887,
"step": 13350
},
{
"epoch": 0.9607973183716636,
"grad_norm": 4.5986104011535645,
"learning_rate": 4.365838113448056e-06,
"loss": 4.742,
"mean_token_accuracy": 0.678845791220665,
"num_tokens": 169504277.0,
"step": 13400
},
{
"epoch": 0.9607973183716636,
"eval_loss": 1.1809656620025635,
"eval_mean_token_accuracy": 0.6775072228908539,
"eval_num_tokens": 169504277.0,
"eval_runtime": 55.886,
"eval_samples_per_second": 7.157,
"eval_steps_per_second": 0.895,
"step": 13400
},
{
"epoch": 0.9643823829924535,
"grad_norm": 4.603204250335693,
"learning_rate": 3.967495219885278e-06,
"loss": 4.7013,
"mean_token_accuracy": 0.6800284919142723,
"num_tokens": 170139597.0,
"step": 13450
},
{
"epoch": 0.9643823829924535,
"eval_loss": 1.1810020208358765,
"eval_mean_token_accuracy": 0.6775026059150696,
"eval_num_tokens": 170139597.0,
"eval_runtime": 55.5754,
"eval_samples_per_second": 7.197,
"eval_steps_per_second": 0.9,
"step": 13450
},
{
"epoch": 0.9679674476132433,
"grad_norm": 4.541078567504883,
"learning_rate": 3.5691523263224986e-06,
"loss": 4.6893,
"mean_token_accuracy": 0.6810142487287522,
"num_tokens": 170769337.0,
"step": 13500
},
{
"epoch": 0.9679674476132433,
"eval_loss": 1.1807525157928467,
"eval_mean_token_accuracy": 0.6778879475593567,
"eval_num_tokens": 170769337.0,
"eval_runtime": 55.7448,
"eval_samples_per_second": 7.176,
"eval_steps_per_second": 0.897,
"step": 13500
},
{
"epoch": 0.971552512234033,
"grad_norm": 4.519087314605713,
"learning_rate": 3.17080943275972e-06,
"loss": 4.772,
"mean_token_accuracy": 0.6758325353264809,
"num_tokens": 171402451.0,
"step": 13550
},
{
"epoch": 0.971552512234033,
"eval_loss": 1.180974006652832,
"eval_mean_token_accuracy": 0.6771935153007508,
"eval_num_tokens": 171402451.0,
"eval_runtime": 55.9385,
"eval_samples_per_second": 7.151,
"eval_steps_per_second": 0.894,
"step": 13550
},
{
"epoch": 0.9751375768548228,
"grad_norm": 4.388876914978027,
"learning_rate": 2.772466539196941e-06,
"loss": 4.7878,
"mean_token_accuracy": 0.6759749925136567,
"num_tokens": 172037593.0,
"step": 13600
},
{
"epoch": 0.9751375768548228,
"eval_loss": 1.180649995803833,
"eval_mean_token_accuracy": 0.6776048111915588,
"eval_num_tokens": 172037593.0,
"eval_runtime": 55.7077,
"eval_samples_per_second": 7.18,
"eval_steps_per_second": 0.898,
"step": 13600
},
{
"epoch": 0.9787226414756126,
"grad_norm": 4.676353931427002,
"learning_rate": 2.374123645634162e-06,
"loss": 4.7004,
"mean_token_accuracy": 0.6811859339475632,
"num_tokens": 172671223.0,
"step": 13650
},
{
"epoch": 0.9787226414756126,
"eval_loss": 1.1809673309326172,
"eval_mean_token_accuracy": 0.6772890436649323,
"eval_num_tokens": 172671223.0,
"eval_runtime": 55.9611,
"eval_samples_per_second": 7.148,
"eval_steps_per_second": 0.893,
"step": 13650
},
{
"epoch": 0.9823077060964024,
"grad_norm": 4.678284645080566,
"learning_rate": 1.975780752071383e-06,
"loss": 4.7903,
"mean_token_accuracy": 0.676692801117897,
"num_tokens": 173302207.0,
"step": 13700
},
{
"epoch": 0.9823077060964024,
"eval_loss": 1.1809345483779907,
"eval_mean_token_accuracy": 0.6775369548797607,
"eval_num_tokens": 173302207.0,
"eval_runtime": 55.9406,
"eval_samples_per_second": 7.15,
"eval_steps_per_second": 0.894,
"step": 13700
},
{
"epoch": 0.9858927707171922,
"grad_norm": 4.409168243408203,
"learning_rate": 1.5774378585086041e-06,
"loss": 4.6684,
"mean_token_accuracy": 0.6816425076127053,
"num_tokens": 173935786.0,
"step": 13750
},
{
"epoch": 0.9858927707171922,
"eval_loss": 1.1808542013168335,
"eval_mean_token_accuracy": 0.6773917138576507,
"eval_num_tokens": 173935786.0,
"eval_runtime": 55.8468,
"eval_samples_per_second": 7.162,
"eval_steps_per_second": 0.895,
"step": 13750
},
{
"epoch": 0.989477835337982,
"grad_norm": 4.55557107925415,
"learning_rate": 1.1790949649458254e-06,
"loss": 4.7765,
"mean_token_accuracy": 0.6755864906311035,
"num_tokens": 174569465.0,
"step": 13800
},
{
"epoch": 0.989477835337982,
"eval_loss": 1.18069589138031,
"eval_mean_token_accuracy": 0.6778388035297394,
"eval_num_tokens": 174569465.0,
"eval_runtime": 55.6007,
"eval_samples_per_second": 7.194,
"eval_steps_per_second": 0.899,
"step": 13800
},
{
"epoch": 0.9930628999587717,
"grad_norm": 4.513246536254883,
"learning_rate": 7.807520713830466e-07,
"loss": 4.7511,
"mean_token_accuracy": 0.6769631016254425,
"num_tokens": 175203827.0,
"step": 13850
},
{
"epoch": 0.9930628999587717,
"eval_loss": 1.1805609464645386,
"eval_mean_token_accuracy": 0.6776606893539429,
"eval_num_tokens": 175203827.0,
"eval_runtime": 55.7815,
"eval_samples_per_second": 7.171,
"eval_steps_per_second": 0.896,
"step": 13850
},
{
"epoch": 0.9966479645795615,
"grad_norm": 4.5654497146606445,
"learning_rate": 3.824091778202677e-07,
"loss": 4.7423,
"mean_token_accuracy": 0.6774308422207832,
"num_tokens": 175836938.0,
"step": 13900
},
{
"epoch": 0.9966479645795615,
"eval_loss": 1.1805065870285034,
"eval_mean_token_accuracy": 0.6777116668224334,
"eval_num_tokens": 175836938.0,
"eval_runtime": 55.8266,
"eval_samples_per_second": 7.165,
"eval_steps_per_second": 0.896,
"step": 13900
},
{
"epoch": 1.0,
"mean_token_accuracy": 0.677863019991686,
"num_tokens": 176425692.0,
"step": 13947,
"total_flos": 5.688174665250246e+18,
"train_loss": 4.990768942446951,
"train_runtime": 203046.5495,
"train_samples_per_second": 2.198,
"train_steps_per_second": 0.069
}
],
"logging_steps": 50,
"max_steps": 13947,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.688174665250246e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}