char-model-linear-40 / trainer_state.json
jm9095
Pilot model commit
a5a268b
{
"best_metric": 0.9439197182655334,
"best_model_checkpoint": "/scratch/gpfs/BG11/char-model/char-model-linear-30/checkpoint-244800",
"epoch": 30.0,
"eval_steps": 1600,
"global_step": 245640,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.19540791402051783,
"grad_norm": 1.1211559772491455,
"learning_rate": 8e-05,
"loss": 2.5747,
"step": 1600
},
{
"epoch": 0.19540791402051783,
"eval_loss": 2.3864238262176514,
"eval_runtime": 113.6763,
"eval_samples_per_second": 256.096,
"eval_steps_per_second": 8.005,
"step": 1600
},
{
"epoch": 0.39081582804103565,
"grad_norm": 0.8591320514678955,
"learning_rate": 0.00016,
"loss": 2.2616,
"step": 3200
},
{
"epoch": 0.39081582804103565,
"eval_loss": 2.154252529144287,
"eval_runtime": 114.9101,
"eval_samples_per_second": 253.346,
"eval_steps_per_second": 7.919,
"step": 3200
},
{
"epoch": 0.5862237420615535,
"grad_norm": 0.5468400716781616,
"learning_rate": 0.00019999459110993603,
"loss": 2.0831,
"step": 4800
},
{
"epoch": 0.5862237420615535,
"eval_loss": 1.941041350364685,
"eval_runtime": 114.7226,
"eval_samples_per_second": 253.76,
"eval_steps_per_second": 7.932,
"step": 4800
},
{
"epoch": 0.7816316560820713,
"grad_norm": 0.5104987621307373,
"learning_rate": 0.00019995132350009195,
"loss": 1.9311,
"step": 6400
},
{
"epoch": 0.7816316560820713,
"eval_loss": 1.7786755561828613,
"eval_runtime": 114.5387,
"eval_samples_per_second": 254.167,
"eval_steps_per_second": 7.945,
"step": 6400
},
{
"epoch": 0.9770395701025891,
"grad_norm": 0.4772449731826782,
"learning_rate": 0.0001998648070022771,
"loss": 1.8329,
"step": 8000
},
{
"epoch": 0.9770395701025891,
"eval_loss": 1.6836353540420532,
"eval_runtime": 114.4895,
"eval_samples_per_second": 254.277,
"eval_steps_per_second": 7.948,
"step": 8000
},
{
"epoch": 1.172447484123107,
"grad_norm": 0.4696632921695709,
"learning_rate": 0.00019973507905213703,
"loss": 1.7719,
"step": 9600
},
{
"epoch": 1.172447484123107,
"eval_loss": 1.6247377395629883,
"eval_runtime": 114.2314,
"eval_samples_per_second": 254.851,
"eval_steps_per_second": 7.966,
"step": 9600
},
{
"epoch": 1.3678553981436248,
"grad_norm": 0.5044004917144775,
"learning_rate": 0.00019956219578289123,
"loss": 1.7294,
"step": 11200
},
{
"epoch": 1.3678553981436248,
"eval_loss": 1.5734336376190186,
"eval_runtime": 114.5665,
"eval_samples_per_second": 254.106,
"eval_steps_per_second": 7.943,
"step": 11200
},
{
"epoch": 1.5632633121641426,
"grad_norm": 0.5493327975273132,
"learning_rate": 0.0001993462320010443,
"loss": 1.6971,
"step": 12800
},
{
"epoch": 1.5632633121641426,
"eval_loss": 1.5416444540023804,
"eval_runtime": 114.1696,
"eval_samples_per_second": 254.989,
"eval_steps_per_second": 7.971,
"step": 12800
},
{
"epoch": 1.7586712261846604,
"grad_norm": 0.536300778388977,
"learning_rate": 0.00019908728115401733,
"loss": 1.6711,
"step": 14400
},
{
"epoch": 1.7586712261846604,
"eval_loss": 1.519049048423767,
"eval_runtime": 114.3419,
"eval_samples_per_second": 254.605,
"eval_steps_per_second": 7.959,
"step": 14400
},
{
"epoch": 1.9540791402051783,
"grad_norm": 0.5437602400779724,
"learning_rate": 0.00019878545528971298,
"loss": 1.6507,
"step": 16000
},
{
"epoch": 1.9540791402051783,
"eval_loss": 1.5041334629058838,
"eval_runtime": 113.8035,
"eval_samples_per_second": 255.809,
"eval_steps_per_second": 7.996,
"step": 16000
},
{
"epoch": 2.149487054225696,
"grad_norm": 0.4997202455997467,
"learning_rate": 0.00019844088500803263,
"loss": 1.6312,
"step": 17600
},
{
"epoch": 2.149487054225696,
"eval_loss": 1.4861212968826294,
"eval_runtime": 113.9481,
"eval_samples_per_second": 255.485,
"eval_steps_per_second": 7.986,
"step": 17600
},
{
"epoch": 2.344894968246214,
"grad_norm": 0.4912782311439514,
"learning_rate": 0.00019805371940436587,
"loss": 1.6186,
"step": 19200
},
{
"epoch": 2.344894968246214,
"eval_loss": 1.4668010473251343,
"eval_runtime": 113.9896,
"eval_samples_per_second": 255.392,
"eval_steps_per_second": 7.983,
"step": 19200
},
{
"epoch": 2.5403028822667317,
"grad_norm": 0.5008465647697449,
"learning_rate": 0.00019762412600507676,
"loss": 1.6022,
"step": 20800
},
{
"epoch": 2.5403028822667317,
"eval_loss": 1.4426122903823853,
"eval_runtime": 113.8381,
"eval_samples_per_second": 255.732,
"eval_steps_per_second": 7.994,
"step": 20800
},
{
"epoch": 2.7357107962872496,
"grad_norm": 0.49787667393684387,
"learning_rate": 0.0001971522906950156,
"loss": 1.5906,
"step": 22400
},
{
"epoch": 2.7357107962872496,
"eval_loss": 1.432023525238037,
"eval_runtime": 114.1591,
"eval_samples_per_second": 255.013,
"eval_steps_per_second": 7.971,
"step": 22400
},
{
"epoch": 2.9311187103077674,
"grad_norm": 0.5572329759597778,
"learning_rate": 0.0001966384176370864,
"loss": 1.5781,
"step": 24000
},
{
"epoch": 2.9311187103077674,
"eval_loss": 1.4263314008712769,
"eval_runtime": 114.1977,
"eval_samples_per_second": 254.926,
"eval_steps_per_second": 7.969,
"step": 24000
},
{
"epoch": 3.126526624328285,
"grad_norm": 0.49415144324302673,
"learning_rate": 0.00019608272918390576,
"loss": 1.5682,
"step": 25600
},
{
"epoch": 3.126526624328285,
"eval_loss": 1.4076579809188843,
"eval_runtime": 113.9991,
"eval_samples_per_second": 255.371,
"eval_steps_per_second": 7.983,
"step": 25600
},
{
"epoch": 3.321934538348803,
"grad_norm": 0.5025901794433594,
"learning_rate": 0.00019548546578159097,
"loss": 1.5574,
"step": 27200
},
{
"epoch": 3.321934538348803,
"eval_loss": 1.407425045967102,
"eval_runtime": 114.0529,
"eval_samples_per_second": 255.25,
"eval_steps_per_second": 7.979,
"step": 27200
},
{
"epoch": 3.517342452369321,
"grad_norm": 0.525579571723938,
"learning_rate": 0.00019484688586571922,
"loss": 1.548,
"step": 28800
},
{
"epoch": 3.517342452369321,
"eval_loss": 1.3852895498275757,
"eval_runtime": 114.0777,
"eval_samples_per_second": 255.195,
"eval_steps_per_second": 7.977,
"step": 28800
},
{
"epoch": 3.7127503663898387,
"grad_norm": 0.6027827858924866,
"learning_rate": 0.00019416726574950254,
"loss": 1.5423,
"step": 30400
},
{
"epoch": 3.7127503663898387,
"eval_loss": 1.3818697929382324,
"eval_runtime": 113.8552,
"eval_samples_per_second": 255.693,
"eval_steps_per_second": 7.993,
"step": 30400
},
{
"epoch": 3.9081582804103565,
"grad_norm": 0.4942677319049835,
"learning_rate": 0.0001934468995042272,
"loss": 1.5309,
"step": 32000
},
{
"epoch": 3.9081582804103565,
"eval_loss": 1.3760185241699219,
"eval_runtime": 113.6787,
"eval_samples_per_second": 256.09,
"eval_steps_per_second": 8.005,
"step": 32000
},
{
"epoch": 4.103566194430875,
"grad_norm": 0.5741337537765503,
"learning_rate": 0.00019268609883200935,
"loss": 1.5239,
"step": 33600
},
{
"epoch": 4.103566194430875,
"eval_loss": 1.3696064949035645,
"eval_runtime": 113.7872,
"eval_samples_per_second": 255.846,
"eval_steps_per_second": 7.997,
"step": 33600
},
{
"epoch": 4.298974108451392,
"grad_norm": 0.5128276944160461,
"learning_rate": 0.00019188519293092138,
"loss": 1.5155,
"step": 35200
},
{
"epoch": 4.298974108451392,
"eval_loss": 1.3626887798309326,
"eval_runtime": 113.6907,
"eval_samples_per_second": 256.063,
"eval_steps_per_second": 8.004,
"step": 35200
},
{
"epoch": 4.49438202247191,
"grad_norm": 0.5358602404594421,
"learning_rate": 0.00019104452835254848,
"loss": 1.509,
"step": 36800
},
{
"epoch": 4.49438202247191,
"eval_loss": 1.3531229496002197,
"eval_runtime": 113.6019,
"eval_samples_per_second": 256.263,
"eval_steps_per_second": 8.01,
"step": 36800
},
{
"epoch": 4.689789936492428,
"grad_norm": 0.5138037204742432,
"learning_rate": 0.00019016446885203558,
"loss": 1.5027,
"step": 38400
},
{
"epoch": 4.689789936492428,
"eval_loss": 1.3384617567062378,
"eval_runtime": 113.5647,
"eval_samples_per_second": 256.347,
"eval_steps_per_second": 8.013,
"step": 38400
},
{
"epoch": 4.885197850512946,
"grad_norm": 0.5272678732872009,
"learning_rate": 0.00018924539523069092,
"loss": 1.494,
"step": 40000
},
{
"epoch": 4.885197850512946,
"eval_loss": 1.33444082736969,
"eval_runtime": 113.5839,
"eval_samples_per_second": 256.304,
"eval_steps_per_second": 8.012,
"step": 40000
},
{
"epoch": 5.0806057645334635,
"grad_norm": 0.5547707676887512,
"learning_rate": 0.00018828770517121334,
"loss": 1.4874,
"step": 41600
},
{
"epoch": 5.0806057645334635,
"eval_loss": 1.325605869293213,
"eval_runtime": 113.9384,
"eval_samples_per_second": 255.506,
"eval_steps_per_second": 7.987,
"step": 41600
},
{
"epoch": 5.276013678553982,
"grad_norm": 0.568523108959198,
"learning_rate": 0.00018729181306561534,
"loss": 1.4786,
"step": 43200
},
{
"epoch": 5.276013678553982,
"eval_loss": 1.3214150667190552,
"eval_runtime": 113.9936,
"eval_samples_per_second": 255.383,
"eval_steps_per_second": 7.983,
"step": 43200
},
{
"epoch": 5.471421592574499,
"grad_norm": 0.5741218328475952,
"learning_rate": 0.00018625814983591572,
"loss": 1.4719,
"step": 44800
},
{
"epoch": 5.471421592574499,
"eval_loss": 1.3142882585525513,
"eval_runtime": 114.0672,
"eval_samples_per_second": 255.218,
"eval_steps_per_second": 7.978,
"step": 44800
},
{
"epoch": 5.666829506595017,
"grad_norm": 0.5851613283157349,
"learning_rate": 0.00018518716274767993,
"loss": 1.4664,
"step": 46400
},
{
"epoch": 5.666829506595017,
"eval_loss": 1.3055411577224731,
"eval_runtime": 113.9709,
"eval_samples_per_second": 255.434,
"eval_steps_per_second": 7.984,
"step": 46400
},
{
"epoch": 5.862237420615535,
"grad_norm": 0.5467984676361084,
"learning_rate": 0.0001840793152164884,
"loss": 1.4597,
"step": 48000
},
{
"epoch": 5.862237420615535,
"eval_loss": 1.2973804473876953,
"eval_runtime": 113.8127,
"eval_samples_per_second": 255.789,
"eval_steps_per_second": 7.996,
"step": 48000
},
{
"epoch": 6.057645334636053,
"grad_norm": 0.5465243458747864,
"learning_rate": 0.0001829350866074169,
"loss": 1.4478,
"step": 49600
},
{
"epoch": 6.057645334636053,
"eval_loss": 1.2836332321166992,
"eval_runtime": 114.5167,
"eval_samples_per_second": 254.216,
"eval_steps_per_second": 7.946,
"step": 49600
},
{
"epoch": 6.25305324865657,
"grad_norm": 0.5638105273246765,
"learning_rate": 0.0001817549720276156,
"loss": 1.4453,
"step": 51200
},
{
"epoch": 6.25305324865657,
"eval_loss": 1.284009337425232,
"eval_runtime": 114.4532,
"eval_samples_per_second": 254.357,
"eval_steps_per_second": 7.951,
"step": 51200
},
{
"epoch": 6.448461162677089,
"grad_norm": 0.5834559798240662,
"learning_rate": 0.00018053948211207624,
"loss": 1.4376,
"step": 52800
},
{
"epoch": 6.448461162677089,
"eval_loss": 1.2804958820343018,
"eval_runtime": 114.2418,
"eval_samples_per_second": 254.828,
"eval_steps_per_second": 7.966,
"step": 52800
},
{
"epoch": 6.643869076697606,
"grad_norm": 0.58949214220047,
"learning_rate": 0.0001792891428026808,
"loss": 1.4311,
"step": 54400
},
{
"epoch": 6.643869076697606,
"eval_loss": 1.2712562084197998,
"eval_runtime": 114.4257,
"eval_samples_per_second": 254.418,
"eval_steps_per_second": 7.953,
"step": 54400
},
{
"epoch": 6.839276990718124,
"grad_norm": 0.65146803855896,
"learning_rate": 0.0001780044951206266,
"loss": 1.4238,
"step": 56000
},
{
"epoch": 6.839276990718124,
"eval_loss": 1.2667021751403809,
"eval_runtime": 114.5948,
"eval_samples_per_second": 254.043,
"eval_steps_per_second": 7.941,
"step": 56000
},
{
"epoch": 7.034684904738642,
"grad_norm": 0.532586395740509,
"learning_rate": 0.00017668609493232685,
"loss": 1.4191,
"step": 57600
},
{
"epoch": 7.034684904738642,
"eval_loss": 1.2611029148101807,
"eval_runtime": 113.5719,
"eval_samples_per_second": 256.331,
"eval_steps_per_second": 8.013,
"step": 57600
},
{
"epoch": 7.23009281875916,
"grad_norm": 0.606846809387207,
"learning_rate": 0.00017533451270888733,
"loss": 1.4126,
"step": 59200
},
{
"epoch": 7.23009281875916,
"eval_loss": 1.2520208358764648,
"eval_runtime": 113.6295,
"eval_samples_per_second": 256.201,
"eval_steps_per_second": 8.008,
"step": 59200
},
{
"epoch": 7.425500732779677,
"grad_norm": 0.5959519147872925,
"learning_rate": 0.00017395033327926394,
"loss": 1.4088,
"step": 60800
},
{
"epoch": 7.425500732779677,
"eval_loss": 1.2475306987762451,
"eval_runtime": 113.6566,
"eval_samples_per_second": 256.14,
"eval_steps_per_second": 8.007,
"step": 60800
},
{
"epoch": 7.620908646800196,
"grad_norm": 0.5935414433479309,
"learning_rate": 0.0001725341555772075,
"loss": 1.4023,
"step": 62400
},
{
"epoch": 7.620908646800196,
"eval_loss": 1.2453813552856445,
"eval_runtime": 113.5561,
"eval_samples_per_second": 256.367,
"eval_steps_per_second": 8.014,
"step": 62400
},
{
"epoch": 7.816316560820713,
"grad_norm": 0.667023777961731,
"learning_rate": 0.00017108659238210543,
"loss": 1.3981,
"step": 64000
},
{
"epoch": 7.816316560820713,
"eval_loss": 1.2348949909210205,
"eval_runtime": 113.5041,
"eval_samples_per_second": 256.484,
"eval_steps_per_second": 8.017,
"step": 64000
},
{
"epoch": 8.01172447484123,
"grad_norm": 0.6116629838943481,
"learning_rate": 0.00016960827005383234,
"loss": 1.3919,
"step": 65600
},
{
"epoch": 8.01172447484123,
"eval_loss": 1.2304078340530396,
"eval_runtime": 113.6689,
"eval_samples_per_second": 256.112,
"eval_steps_per_second": 8.006,
"step": 65600
},
{
"epoch": 8.20713238886175,
"grad_norm": 0.6998280882835388,
"learning_rate": 0.00016809982826172446,
"loss": 1.3842,
"step": 67200
},
{
"epoch": 8.20713238886175,
"eval_loss": 1.232326626777649,
"eval_runtime": 113.3579,
"eval_samples_per_second": 256.815,
"eval_steps_per_second": 8.028,
"step": 67200
},
{
"epoch": 8.402540302882267,
"grad_norm": 0.609658420085907,
"learning_rate": 0.00016656191970779508,
"loss": 1.3805,
"step": 68800
},
{
"epoch": 8.402540302882267,
"eval_loss": 1.2193703651428223,
"eval_runtime": 113.4616,
"eval_samples_per_second": 256.58,
"eval_steps_per_second": 8.02,
"step": 68800
},
{
"epoch": 8.597948216902784,
"grad_norm": 0.6205955147743225,
"learning_rate": 0.0001649952098443106,
"loss": 1.3753,
"step": 70400
},
{
"epoch": 8.597948216902784,
"eval_loss": 1.2129108905792236,
"eval_runtime": 113.5351,
"eval_samples_per_second": 256.414,
"eval_steps_per_second": 8.015,
"step": 70400
},
{
"epoch": 8.793356130923302,
"grad_norm": 0.660017728805542,
"learning_rate": 0.00016340037658584987,
"loss": 1.3686,
"step": 72000
},
{
"epoch": 8.793356130923302,
"eval_loss": 1.212546706199646,
"eval_runtime": 113.3123,
"eval_samples_per_second": 256.918,
"eval_steps_per_second": 8.031,
"step": 72000
},
{
"epoch": 8.98876404494382,
"grad_norm": 0.6431707739830017,
"learning_rate": 0.00016177811001597065,
"loss": 1.3647,
"step": 73600
},
{
"epoch": 8.98876404494382,
"eval_loss": 1.2093451023101807,
"eval_runtime": 113.6904,
"eval_samples_per_second": 256.064,
"eval_steps_per_second": 8.004,
"step": 73600
},
{
"epoch": 9.184171958964338,
"grad_norm": 0.6240015029907227,
"learning_rate": 0.00016012911208861095,
"loss": 1.3578,
"step": 75200
},
{
"epoch": 9.184171958964338,
"eval_loss": 1.1938095092773438,
"eval_runtime": 113.4839,
"eval_samples_per_second": 256.53,
"eval_steps_per_second": 8.019,
"step": 75200
},
{
"epoch": 9.379579872984856,
"grad_norm": 0.6457648277282715,
"learning_rate": 0.00015845409632435383,
"loss": 1.3517,
"step": 76800
},
{
"epoch": 9.379579872984856,
"eval_loss": 1.195897102355957,
"eval_runtime": 113.3384,
"eval_samples_per_second": 256.859,
"eval_steps_per_second": 8.029,
"step": 76800
},
{
"epoch": 9.574987787005373,
"grad_norm": 0.6358394622802734,
"learning_rate": 0.0001567537875016875,
"loss": 1.3487,
"step": 78400
},
{
"epoch": 9.574987787005373,
"eval_loss": 1.1895701885223389,
"eval_runtime": 113.7424,
"eval_samples_per_second": 255.947,
"eval_steps_per_second": 8.001,
"step": 78400
},
{
"epoch": 9.770395701025892,
"grad_norm": 0.6638132333755493,
"learning_rate": 0.00015502892134339392,
"loss": 1.3416,
"step": 80000
},
{
"epoch": 9.770395701025892,
"eval_loss": 1.187201976776123,
"eval_runtime": 113.9079,
"eval_samples_per_second": 255.575,
"eval_steps_per_second": 7.989,
"step": 80000
},
{
"epoch": 9.96580361504641,
"grad_norm": 0.6676363945007324,
"learning_rate": 0.00015328024419820202,
"loss": 1.3386,
"step": 81600
},
{
"epoch": 9.96580361504641,
"eval_loss": 1.1805278062820435,
"eval_runtime": 113.8732,
"eval_samples_per_second": 255.653,
"eval_steps_per_second": 7.991,
"step": 81600
},
{
"epoch": 10.161211529066927,
"grad_norm": 0.660179853439331,
"learning_rate": 0.00015150851271784278,
"loss": 1.3343,
"step": 83200
},
{
"epoch": 10.161211529066927,
"eval_loss": 1.1744478940963745,
"eval_runtime": 114.023,
"eval_samples_per_second": 255.317,
"eval_steps_per_second": 7.981,
"step": 83200
},
{
"epoch": 10.356619443087444,
"grad_norm": 0.9876406192779541,
"learning_rate": 0.00014971449352964674,
"loss": 1.3267,
"step": 84800
},
{
"epoch": 10.356619443087444,
"eval_loss": 1.1723557710647583,
"eval_runtime": 113.844,
"eval_samples_per_second": 255.718,
"eval_steps_per_second": 7.993,
"step": 84800
},
{
"epoch": 10.552027357107963,
"grad_norm": 0.6699544191360474,
"learning_rate": 0.00014789896290482433,
"loss": 1.323,
"step": 86400
},
{
"epoch": 10.552027357107963,
"eval_loss": 1.1727294921875,
"eval_runtime": 113.7804,
"eval_samples_per_second": 255.861,
"eval_steps_per_second": 7.998,
"step": 86400
},
{
"epoch": 10.74743527112848,
"grad_norm": 0.7076494097709656,
"learning_rate": 0.00014606270642257408,
"loss": 1.3197,
"step": 88000
},
{
"epoch": 10.74743527112848,
"eval_loss": 1.161535620689392,
"eval_runtime": 113.9245,
"eval_samples_per_second": 255.538,
"eval_steps_per_second": 7.988,
"step": 88000
},
{
"epoch": 10.942843185148998,
"grad_norm": 0.6479067802429199,
"learning_rate": 0.00014420651863016263,
"loss": 1.3147,
"step": 89600
},
{
"epoch": 10.942843185148998,
"eval_loss": 1.157774567604065,
"eval_runtime": 113.8055,
"eval_samples_per_second": 255.805,
"eval_steps_per_second": 7.996,
"step": 89600
},
{
"epoch": 11.138251099169516,
"grad_norm": 1.0591120719909668,
"learning_rate": 0.0001423312026991247,
"loss": 1.309,
"step": 91200
},
{
"epoch": 11.138251099169516,
"eval_loss": 1.1560285091400146,
"eval_runtime": 113.772,
"eval_samples_per_second": 255.88,
"eval_steps_per_second": 7.998,
"step": 91200
},
{
"epoch": 11.333659013190035,
"grad_norm": 0.7391919493675232,
"learning_rate": 0.00014043757007773094,
"loss": 1.304,
"step": 92800
},
{
"epoch": 11.333659013190035,
"eval_loss": 1.148970365524292,
"eval_runtime": 113.6388,
"eval_samples_per_second": 256.18,
"eval_steps_per_second": 8.008,
"step": 92800
},
{
"epoch": 11.529066927210552,
"grad_norm": 0.6884203553199768,
"learning_rate": 0.00013852644013987489,
"loss": 1.3001,
"step": 94400
},
{
"epoch": 11.529066927210552,
"eval_loss": 1.144970178604126,
"eval_runtime": 113.5724,
"eval_samples_per_second": 256.33,
"eval_steps_per_second": 8.013,
"step": 94400
},
{
"epoch": 11.72447484123107,
"grad_norm": 0.8327965140342712,
"learning_rate": 0.00013659863983053026,
"loss": 1.2952,
"step": 96000
},
{
"epoch": 11.72447484123107,
"eval_loss": 1.1416401863098145,
"eval_runtime": 113.5927,
"eval_samples_per_second": 256.284,
"eval_steps_per_second": 8.011,
"step": 96000
},
{
"epoch": 11.919882755251587,
"grad_norm": 0.6861343383789062,
"learning_rate": 0.00013465500330793215,
"loss": 1.2928,
"step": 97600
},
{
"epoch": 11.919882755251587,
"eval_loss": 1.14198899269104,
"eval_runtime": 113.7062,
"eval_samples_per_second": 256.028,
"eval_steps_per_second": 8.003,
"step": 97600
},
{
"epoch": 12.115290669272106,
"grad_norm": 0.6954674124717712,
"learning_rate": 0.0001326963715826373,
"loss": 1.2863,
"step": 99200
},
{
"epoch": 12.115290669272106,
"eval_loss": 1.135217308998108,
"eval_runtime": 114.9988,
"eval_samples_per_second": 253.151,
"eval_steps_per_second": 7.913,
"step": 99200
},
{
"epoch": 12.310698583292623,
"grad_norm": 0.6916314959526062,
"learning_rate": 0.00013072359215361915,
"loss": 1.2822,
"step": 100800
},
{
"epoch": 12.310698583292623,
"eval_loss": 1.134860873222351,
"eval_runtime": 114.7247,
"eval_samples_per_second": 253.755,
"eval_steps_per_second": 7.932,
"step": 100800
},
{
"epoch": 12.50610649731314,
"grad_norm": 0.7100492715835571,
"learning_rate": 0.00012873751864155556,
"loss": 1.2782,
"step": 102400
},
{
"epoch": 12.50610649731314,
"eval_loss": 1.1285021305084229,
"eval_runtime": 114.7756,
"eval_samples_per_second": 253.643,
"eval_steps_per_second": 7.929,
"step": 102400
},
{
"epoch": 12.701514411333658,
"grad_norm": 0.7324435114860535,
"learning_rate": 0.0001267390104194675,
"loss": 1.2726,
"step": 104000
},
{
"epoch": 12.701514411333658,
"eval_loss": 1.1195478439331055,
"eval_runtime": 114.7533,
"eval_samples_per_second": 253.692,
"eval_steps_per_second": 7.93,
"step": 104000
},
{
"epoch": 12.896922325354177,
"grad_norm": 0.7098590135574341,
"learning_rate": 0.00012472893224086873,
"loss": 1.2689,
"step": 105600
},
{
"epoch": 12.896922325354177,
"eval_loss": 1.1154309511184692,
"eval_runtime": 114.9203,
"eval_samples_per_second": 253.323,
"eval_steps_per_second": 7.919,
"step": 105600
},
{
"epoch": 13.092330239374695,
"grad_norm": 0.7113024592399597,
"learning_rate": 0.00012270815386558753,
"loss": 1.2641,
"step": 107200
},
{
"epoch": 13.092330239374695,
"eval_loss": 1.1109468936920166,
"eval_runtime": 114.8007,
"eval_samples_per_second": 253.587,
"eval_steps_per_second": 7.927,
"step": 107200
},
{
"epoch": 13.287738153395212,
"grad_norm": 0.7072641849517822,
"learning_rate": 0.00012067754968342186,
"loss": 1.258,
"step": 108800
},
{
"epoch": 13.287738153395212,
"eval_loss": 1.1076184511184692,
"eval_runtime": 114.8814,
"eval_samples_per_second": 253.409,
"eval_steps_per_second": 7.921,
"step": 108800
},
{
"epoch": 13.48314606741573,
"grad_norm": 0.7073858380317688,
"learning_rate": 0.00011863799833579153,
"loss": 1.2556,
"step": 110400
},
{
"epoch": 13.48314606741573,
"eval_loss": 1.1047865152359009,
"eval_runtime": 114.8061,
"eval_samples_per_second": 253.575,
"eval_steps_per_second": 7.926,
"step": 110400
},
{
"epoch": 13.678553981436249,
"grad_norm": 0.7156842947006226,
"learning_rate": 0.00011659038233555033,
"loss": 1.2525,
"step": 112000
},
{
"epoch": 13.678553981436249,
"eval_loss": 1.1026668548583984,
"eval_runtime": 115.0573,
"eval_samples_per_second": 253.022,
"eval_steps_per_second": 7.909,
"step": 112000
},
{
"epoch": 13.873961895456766,
"grad_norm": 0.7280173301696777,
"learning_rate": 0.00011453558768512322,
"loss": 1.2474,
"step": 113600
},
{
"epoch": 13.873961895456766,
"eval_loss": 1.1001399755477905,
"eval_runtime": 115.2209,
"eval_samples_per_second": 252.663,
"eval_steps_per_second": 7.898,
"step": 113600
},
{
"epoch": 14.069369809477283,
"grad_norm": 0.7136765122413635,
"learning_rate": 0.00011247450349313363,
"loss": 1.2442,
"step": 115200
},
{
"epoch": 14.069369809477283,
"eval_loss": 1.1011698246002197,
"eval_runtime": 114.9454,
"eval_samples_per_second": 253.268,
"eval_steps_per_second": 7.917,
"step": 115200
},
{
"epoch": 14.2647777234978,
"grad_norm": 0.7625411748886108,
"learning_rate": 0.00011040802158968633,
"loss": 1.2381,
"step": 116800
},
{
"epoch": 14.2647777234978,
"eval_loss": 1.0904619693756104,
"eval_runtime": 114.6883,
"eval_samples_per_second": 253.836,
"eval_steps_per_second": 7.935,
"step": 116800
},
{
"epoch": 14.46018563751832,
"grad_norm": 0.7648947834968567,
"learning_rate": 0.00010833703614047328,
"loss": 1.2354,
"step": 118400
},
{
"epoch": 14.46018563751832,
"eval_loss": 1.0927143096923828,
"eval_runtime": 114.6999,
"eval_samples_per_second": 253.81,
"eval_steps_per_second": 7.934,
"step": 118400
},
{
"epoch": 14.655593551538837,
"grad_norm": 0.7222533226013184,
"learning_rate": 0.00010626244325986843,
"loss": 1.2307,
"step": 120000
},
{
"epoch": 14.655593551538837,
"eval_loss": 1.0850768089294434,
"eval_runtime": 114.7495,
"eval_samples_per_second": 253.701,
"eval_steps_per_second": 7.93,
"step": 120000
},
{
"epoch": 14.851001465559355,
"grad_norm": 0.7333817481994629,
"learning_rate": 0.00010418514062317943,
"loss": 1.2264,
"step": 121600
},
{
"epoch": 14.851001465559355,
"eval_loss": 1.0831152200698853,
"eval_runtime": 114.3828,
"eval_samples_per_second": 254.514,
"eval_steps_per_second": 7.956,
"step": 121600
},
{
"epoch": 15.046409379579872,
"grad_norm": 0.7387763261795044,
"learning_rate": 0.00010210602707822416,
"loss": 1.2216,
"step": 123200
},
{
"epoch": 15.046409379579872,
"eval_loss": 1.0754677057266235,
"eval_runtime": 114.7505,
"eval_samples_per_second": 253.698,
"eval_steps_per_second": 7.93,
"step": 123200
},
{
"epoch": 15.241817293600391,
"grad_norm": 0.7571990489959717,
"learning_rate": 0.00010002600225639952,
"loss": 1.2174,
"step": 124800
},
{
"epoch": 15.241817293600391,
"eval_loss": 1.0745357275009155,
"eval_runtime": 115.0069,
"eval_samples_per_second": 253.133,
"eval_steps_per_second": 7.913,
"step": 124800
},
{
"epoch": 15.437225207620909,
"grad_norm": 0.7420470714569092,
"learning_rate": 9.794596618341145e-05,
"loss": 1.2138,
"step": 126400
},
{
"epoch": 15.437225207620909,
"eval_loss": 1.0716029405593872,
"eval_runtime": 114.9716,
"eval_samples_per_second": 253.21,
"eval_steps_per_second": 7.915,
"step": 126400
},
{
"epoch": 15.632633121641426,
"grad_norm": 0.7744555473327637,
"learning_rate": 9.586681888983431e-05,
"loss": 1.2101,
"step": 128000
},
{
"epoch": 15.632633121641426,
"eval_loss": 1.06724214553833,
"eval_runtime": 114.7701,
"eval_samples_per_second": 253.655,
"eval_steps_per_second": 7.929,
"step": 128000
},
{
"epoch": 15.828041035661943,
"grad_norm": 0.7532786726951599,
"learning_rate": 9.378946002166804e-05,
"loss": 1.2073,
"step": 129600
},
{
"epoch": 15.828041035661943,
"eval_loss": 1.0657265186309814,
"eval_runtime": 114.8312,
"eval_samples_per_second": 253.52,
"eval_steps_per_second": 7.925,
"step": 129600
},
{
"epoch": 16.02344894968246,
"grad_norm": 0.7471698522567749,
"learning_rate": 9.171478845106179e-05,
"loss": 1.2007,
"step": 131200
},
{
"epoch": 16.02344894968246,
"eval_loss": 1.060520887374878,
"eval_runtime": 115.6708,
"eval_samples_per_second": 251.68,
"eval_steps_per_second": 7.867,
"step": 131200
},
{
"epoch": 16.21885686370298,
"grad_norm": 0.7851788997650146,
"learning_rate": 8.964370188737233e-05,
"loss": 1.1982,
"step": 132800
},
{
"epoch": 16.21885686370298,
"eval_loss": 1.0585095882415771,
"eval_runtime": 115.3687,
"eval_samples_per_second": 252.339,
"eval_steps_per_second": 7.888,
"step": 132800
},
{
"epoch": 16.4142647777235,
"grad_norm": 0.7923790812492371,
"learning_rate": 8.757709648872583e-05,
"loss": 1.193,
"step": 134400
},
{
"epoch": 16.4142647777235,
"eval_loss": 1.0538368225097656,
"eval_runtime": 115.4747,
"eval_samples_per_second": 252.107,
"eval_steps_per_second": 7.881,
"step": 134400
},
{
"epoch": 16.609672691744017,
"grad_norm": 0.7890061736106873,
"learning_rate": 8.551586647425051e-05,
"loss": 1.1883,
"step": 136000
},
{
"epoch": 16.609672691744017,
"eval_loss": 1.049277663230896,
"eval_runtime": 115.1918,
"eval_samples_per_second": 252.726,
"eval_steps_per_second": 7.9,
"step": 136000
},
{
"epoch": 16.805080605764534,
"grad_norm": 0.7814080119132996,
"learning_rate": 8.346090373714858e-05,
"loss": 1.187,
"step": 137600
},
{
"epoch": 16.805080605764534,
"eval_loss": 1.0530978441238403,
"eval_runtime": 115.0942,
"eval_samples_per_second": 252.941,
"eval_steps_per_second": 7.907,
"step": 137600
},
{
"epoch": 17.00048851978505,
"grad_norm": 0.7448973059654236,
"learning_rate": 8.141309745877437e-05,
"loss": 1.1821,
"step": 139200
},
{
"epoch": 17.00048851978505,
"eval_loss": 1.0466850996017456,
"eval_runtime": 114.902,
"eval_samples_per_second": 253.364,
"eval_steps_per_second": 7.92,
"step": 139200
},
{
"epoch": 17.19589643380557,
"grad_norm": 0.8670977354049683,
"learning_rate": 7.93733337238861e-05,
"loss": 1.1779,
"step": 140800
},
{
"epoch": 17.19589643380557,
"eval_loss": 1.0422841310501099,
"eval_runtime": 115.0547,
"eval_samples_per_second": 253.028,
"eval_steps_per_second": 7.909,
"step": 140800
},
{
"epoch": 17.391304347826086,
"grad_norm": 0.840560257434845,
"learning_rate": 7.734249513723749e-05,
"loss": 1.1742,
"step": 142400
},
{
"epoch": 17.391304347826086,
"eval_loss": 1.0406625270843506,
"eval_runtime": 114.8851,
"eval_samples_per_second": 253.401,
"eval_steps_per_second": 7.921,
"step": 142400
},
{
"epoch": 17.586712261846603,
"grad_norm": 0.7812421321868896,
"learning_rate": 7.532146044167501e-05,
"loss": 1.1704,
"step": 144000
},
{
"epoch": 17.586712261846603,
"eval_loss": 1.037782907485962,
"eval_runtime": 114.6269,
"eval_samples_per_second": 253.972,
"eval_steps_per_second": 7.939,
"step": 144000
},
{
"epoch": 17.78212017586712,
"grad_norm": 0.7963674664497375,
"learning_rate": 7.33111041379063e-05,
"loss": 1.166,
"step": 145600
},
{
"epoch": 17.78212017586712,
"eval_loss": 1.0303895473480225,
"eval_runtime": 114.8536,
"eval_samples_per_second": 253.47,
"eval_steps_per_second": 7.923,
"step": 145600
},
{
"epoch": 17.97752808988764,
"grad_norm": 0.7899876236915588,
"learning_rate": 7.131229610610423e-05,
"loss": 1.1624,
"step": 147200
},
{
"epoch": 17.97752808988764,
"eval_loss": 1.0292210578918457,
"eval_runtime": 114.9662,
"eval_samples_per_second": 253.222,
"eval_steps_per_second": 7.915,
"step": 147200
},
{
"epoch": 18.17293600390816,
"grad_norm": 0.7737675905227661,
"learning_rate": 6.932590122951006e-05,
"loss": 1.158,
"step": 148800
},
{
"epoch": 18.17293600390816,
"eval_loss": 1.0299910306930542,
"eval_runtime": 115.5908,
"eval_samples_per_second": 251.854,
"eval_steps_per_second": 7.873,
"step": 148800
},
{
"epoch": 18.368343917928676,
"grad_norm": 0.8089715838432312,
"learning_rate": 6.735277902019914e-05,
"loss": 1.1554,
"step": 150400
},
{
"epoch": 18.368343917928676,
"eval_loss": 1.0288282632827759,
"eval_runtime": 115.3243,
"eval_samples_per_second": 252.436,
"eval_steps_per_second": 7.891,
"step": 150400
},
{
"epoch": 18.563751831949194,
"grad_norm": 1.2859221696853638,
"learning_rate": 6.539378324717007e-05,
"loss": 1.1497,
"step": 152000
},
{
"epoch": 18.563751831949194,
"eval_loss": 1.0212500095367432,
"eval_runtime": 115.5698,
"eval_samples_per_second": 251.9,
"eval_steps_per_second": 7.874,
"step": 152000
},
{
"epoch": 18.75915974596971,
"grad_norm": 0.8533680438995361,
"learning_rate": 6.344976156691964e-05,
"loss": 1.1467,
"step": 153600
},
{
"epoch": 18.75915974596971,
"eval_loss": 1.0225590467453003,
"eval_runtime": 115.5558,
"eval_samples_per_second": 251.93,
"eval_steps_per_second": 7.875,
"step": 153600
},
{
"epoch": 18.95456765999023,
"grad_norm": 0.8569408059120178,
"learning_rate": 6.152155515666206e-05,
"loss": 1.1444,
"step": 155200
},
{
"epoch": 18.95456765999023,
"eval_loss": 1.018567442893982,
"eval_runtime": 115.3717,
"eval_samples_per_second": 252.332,
"eval_steps_per_second": 7.888,
"step": 155200
},
{
"epoch": 19.149975574010746,
"grad_norm": 0.8161713480949402,
"learning_rate": 5.96099983503521e-05,
"loss": 1.1383,
"step": 156800
},
{
"epoch": 19.149975574010746,
"eval_loss": 1.0170692205429077,
"eval_runtime": 114.7154,
"eval_samples_per_second": 253.776,
"eval_steps_per_second": 7.933,
"step": 156800
},
{
"epoch": 19.345383488031267,
"grad_norm": 0.8285250067710876,
"learning_rate": 5.771591827766929e-05,
"loss": 1.1369,
"step": 158400
},
{
"epoch": 19.345383488031267,
"eval_loss": 1.013586163520813,
"eval_runtime": 114.7883,
"eval_samples_per_second": 253.615,
"eval_steps_per_second": 7.928,
"step": 158400
},
{
"epoch": 19.540791402051784,
"grad_norm": 0.8243290781974792,
"learning_rate": 5.5840134506119026e-05,
"loss": 1.1324,
"step": 160000
},
{
"epoch": 19.540791402051784,
"eval_loss": 1.0133509635925293,
"eval_runtime": 114.968,
"eval_samples_per_second": 253.218,
"eval_steps_per_second": 7.915,
"step": 160000
},
{
"epoch": 19.7361993160723,
"grad_norm": 0.8422598838806152,
"learning_rate": 5.398345868640643e-05,
"loss": 1.1295,
"step": 161600
},
{
"epoch": 19.7361993160723,
"eval_loss": 1.0076338052749634,
"eval_runtime": 115.0212,
"eval_samples_per_second": 253.101,
"eval_steps_per_second": 7.912,
"step": 161600
},
{
"epoch": 19.93160723009282,
"grad_norm": 0.8394371867179871,
"learning_rate": 5.2146694201235327e-05,
"loss": 1.1261,
"step": 163200
},
{
"epoch": 19.93160723009282,
"eval_loss": 1.0029668807983398,
"eval_runtime": 114.9795,
"eval_samples_per_second": 253.193,
"eval_steps_per_second": 7.914,
"step": 163200
},
{
"epoch": 20.127015144113336,
"grad_norm": 0.8226906061172485,
"learning_rate": 5.033063581768499e-05,
"loss": 1.1256,
"step": 164800
},
{
"epoch": 20.127015144113336,
"eval_loss": 1.0083191394805908,
"eval_runtime": 114.7408,
"eval_samples_per_second": 253.72,
"eval_steps_per_second": 7.931,
"step": 164800
},
{
"epoch": 20.322423058133854,
"grad_norm": 0.8865888714790344,
"learning_rate": 4.8536069343314827e-05,
"loss": 1.1189,
"step": 166400
},
{
"epoch": 20.322423058133854,
"eval_loss": 1.0004106760025024,
"eval_runtime": 114.8581,
"eval_samples_per_second": 253.461,
"eval_steps_per_second": 7.923,
"step": 166400
},
{
"epoch": 20.51783097215437,
"grad_norm": 0.8663754463195801,
"learning_rate": 4.676377128614583e-05,
"loss": 1.1148,
"step": 168000
},
{
"epoch": 20.51783097215437,
"eval_loss": 0.9975536465644836,
"eval_runtime": 114.7833,
"eval_samples_per_second": 253.626,
"eval_steps_per_second": 7.928,
"step": 168000
},
{
"epoch": 20.71323888617489,
"grad_norm": 0.845713198184967,
"learning_rate": 4.501450851866593e-05,
"loss": 1.1138,
"step": 169600
},
{
"epoch": 20.71323888617489,
"eval_loss": 0.9994989633560181,
"eval_runtime": 114.7935,
"eval_samples_per_second": 253.603,
"eval_steps_per_second": 7.927,
"step": 169600
},
{
"epoch": 20.90864680019541,
"grad_norm": 0.8745766282081604,
"learning_rate": 4.3289037946004674e-05,
"loss": 1.1102,
"step": 171200
},
{
"epoch": 20.90864680019541,
"eval_loss": 0.991565465927124,
"eval_runtime": 114.9297,
"eval_samples_per_second": 253.303,
"eval_steps_per_second": 7.918,
"step": 171200
},
{
"epoch": 21.104054714215927,
"grad_norm": 0.9098924994468689,
"learning_rate": 4.158810617842075e-05,
"loss": 1.1063,
"step": 172800
},
{
"epoch": 21.104054714215927,
"eval_loss": 0.9949960112571716,
"eval_runtime": 114.4888,
"eval_samples_per_second": 254.278,
"eval_steps_per_second": 7.948,
"step": 172800
},
{
"epoch": 21.299462628236444,
"grad_norm": 0.8534849882125854,
"learning_rate": 3.9912449208244075e-05,
"loss": 1.1029,
"step": 174400
},
{
"epoch": 21.299462628236444,
"eval_loss": 0.9888520240783691,
"eval_runtime": 114.549,
"eval_samples_per_second": 254.144,
"eval_steps_per_second": 7.944,
"step": 174400
},
{
"epoch": 21.49487054225696,
"grad_norm": 0.8766836524009705,
"learning_rate": 3.826279209141231e-05,
"loss": 1.1011,
"step": 176000
},
{
"epoch": 21.49487054225696,
"eval_loss": 0.9886327981948853,
"eval_runtime": 114.6717,
"eval_samples_per_second": 253.873,
"eval_steps_per_second": 7.936,
"step": 176000
},
{
"epoch": 21.69027845627748,
"grad_norm": 0.8900472521781921,
"learning_rate": 3.663984863373953e-05,
"loss": 1.098,
"step": 177600
},
{
"epoch": 21.69027845627748,
"eval_loss": 0.9863277077674866,
"eval_runtime": 114.7826,
"eval_samples_per_second": 253.627,
"eval_steps_per_second": 7.928,
"step": 177600
},
{
"epoch": 21.885686370297996,
"grad_norm": 0.8910781741142273,
"learning_rate": 3.504432108205271e-05,
"loss": 1.093,
"step": 179200
},
{
"epoch": 21.885686370297996,
"eval_loss": 0.9864732623100281,
"eval_runtime": 114.9598,
"eval_samples_per_second": 253.236,
"eval_steps_per_second": 7.916,
"step": 179200
},
{
"epoch": 22.081094284318514,
"grad_norm": 0.960498034954071,
"learning_rate": 3.347689982033e-05,
"loss": 1.0908,
"step": 180800
},
{
"epoch": 22.081094284318514,
"eval_loss": 0.9836633801460266,
"eval_runtime": 116.058,
"eval_samples_per_second": 250.84,
"eval_steps_per_second": 7.841,
"step": 180800
},
{
"epoch": 22.27650219833903,
"grad_norm": 0.9642614126205444,
"learning_rate": 3.193826307097183e-05,
"loss": 1.0862,
"step": 182400
},
{
"epoch": 22.27650219833903,
"eval_loss": 0.9829400777816772,
"eval_runtime": 116.0247,
"eval_samples_per_second": 250.912,
"eval_steps_per_second": 7.843,
"step": 182400
},
{
"epoch": 22.471910112359552,
"grad_norm": 0.9408681988716125,
"learning_rate": 3.042907660133447e-05,
"loss": 1.0848,
"step": 184000
},
{
"epoch": 22.471910112359552,
"eval_loss": 0.9808396100997925,
"eval_runtime": 115.8196,
"eval_samples_per_second": 251.356,
"eval_steps_per_second": 7.857,
"step": 184000
},
{
"epoch": 22.66731802638007,
"grad_norm": 0.9056561589241028,
"learning_rate": 2.89499934356528e-05,
"loss": 1.0834,
"step": 185600
},
{
"epoch": 22.66731802638007,
"eval_loss": 0.9777108430862427,
"eval_runtime": 115.8982,
"eval_samples_per_second": 251.186,
"eval_steps_per_second": 7.852,
"step": 185600
},
{
"epoch": 22.862725940400587,
"grad_norm": 0.9036485552787781,
"learning_rate": 2.7501653572476948e-05,
"loss": 1.0781,
"step": 187200
},
{
"epoch": 22.862725940400587,
"eval_loss": 0.9735616445541382,
"eval_runtime": 115.739,
"eval_samples_per_second": 251.531,
"eval_steps_per_second": 7.863,
"step": 187200
},
{
"epoch": 23.058133854421104,
"grad_norm": 0.9098331928253174,
"learning_rate": 2.6084683707745506e-05,
"loss": 1.076,
"step": 188800
},
{
"epoch": 23.058133854421104,
"eval_loss": 0.97157222032547,
"eval_runtime": 115.438,
"eval_samples_per_second": 252.187,
"eval_steps_per_second": 7.883,
"step": 188800
},
{
"epoch": 23.25354176844162,
"grad_norm": 0.9487655162811279,
"learning_rate": 2.4699696963614248e-05,
"loss": 1.0729,
"step": 190400
},
{
"epoch": 23.25354176844162,
"eval_loss": 0.9695687890052795,
"eval_runtime": 115.3054,
"eval_samples_per_second": 252.477,
"eval_steps_per_second": 7.892,
"step": 190400
},
{
"epoch": 23.44894968246214,
"grad_norm": 0.9763538241386414,
"learning_rate": 2.334729262315859e-05,
"loss": 1.0713,
"step": 192000
},
{
"epoch": 23.44894968246214,
"eval_loss": 0.9713948965072632,
"eval_runtime": 115.3921,
"eval_samples_per_second": 252.288,
"eval_steps_per_second": 7.886,
"step": 192000
},
{
"epoch": 23.644357596482656,
"grad_norm": 0.9164339303970337,
"learning_rate": 2.2028055871064014e-05,
"loss": 1.067,
"step": 193600
},
{
"epoch": 23.644357596482656,
"eval_loss": 0.9682226777076721,
"eval_runtime": 115.3414,
"eval_samples_per_second": 252.399,
"eval_steps_per_second": 7.89,
"step": 193600
},
{
"epoch": 23.839765510503174,
"grad_norm": 0.9368218183517456,
"learning_rate": 2.0742557540417086e-05,
"loss": 1.0657,
"step": 195200
},
{
"epoch": 23.839765510503174,
"eval_loss": 0.9675361514091492,
"eval_runtime": 115.2177,
"eval_samples_per_second": 252.669,
"eval_steps_per_second": 7.898,
"step": 195200
},
{
"epoch": 24.035173424523695,
"grad_norm": 0.9578301906585693,
"learning_rate": 1.9491353865706208e-05,
"loss": 1.064,
"step": 196800
},
{
"epoch": 24.035173424523695,
"eval_loss": 0.9658530950546265,
"eval_runtime": 115.8818,
"eval_samples_per_second": 251.221,
"eval_steps_per_second": 7.853,
"step": 196800
},
{
"epoch": 24.230581338544212,
"grad_norm": 0.9136309623718262,
"learning_rate": 1.8274986242139203e-05,
"loss": 1.0606,
"step": 198400
},
{
"epoch": 24.230581338544212,
"eval_loss": 0.9649612307548523,
"eval_runtime": 115.9599,
"eval_samples_per_second": 251.052,
"eval_steps_per_second": 7.848,
"step": 198400
},
{
"epoch": 24.42598925256473,
"grad_norm": 0.9447437524795532,
"learning_rate": 1.7093980991381786e-05,
"loss": 1.0594,
"step": 200000
},
{
"epoch": 24.42598925256473,
"eval_loss": 0.964313805103302,
"eval_runtime": 115.6654,
"eval_samples_per_second": 251.692,
"eval_steps_per_second": 7.868,
"step": 200000
},
{
"epoch": 24.621397166585247,
"grad_norm": 0.9168404936790466,
"learning_rate": 1.5948849133818656e-05,
"loss": 1.0551,
"step": 201600
},
{
"epoch": 24.621397166585247,
"eval_loss": 0.959920346736908,
"eval_runtime": 115.6267,
"eval_samples_per_second": 251.776,
"eval_steps_per_second": 7.87,
"step": 201600
},
{
"epoch": 24.816805080605764,
"grad_norm": 0.959356963634491,
"learning_rate": 1.4840086167435107e-05,
"loss": 1.0523,
"step": 203200
},
{
"epoch": 24.816805080605764,
"eval_loss": 0.9573366045951843,
"eval_runtime": 116.0958,
"eval_samples_per_second": 250.758,
"eval_steps_per_second": 7.838,
"step": 203200
},
{
"epoch": 25.01221299462628,
"grad_norm": 0.9525801539421082,
"learning_rate": 1.376817185341529e-05,
"loss": 1.0506,
"step": 204800
},
{
"epoch": 25.01221299462628,
"eval_loss": 0.9594299793243408,
"eval_runtime": 115.6699,
"eval_samples_per_second": 251.682,
"eval_steps_per_second": 7.867,
"step": 204800
},
{
"epoch": 25.2076209086468,
"grad_norm": 0.9626793265342712,
"learning_rate": 1.2733570008549767e-05,
"loss": 1.0494,
"step": 206400
},
{
"epoch": 25.2076209086468,
"eval_loss": 0.961174488067627,
"eval_runtime": 115.5294,
"eval_samples_per_second": 251.988,
"eval_steps_per_second": 7.877,
"step": 206400
},
{
"epoch": 25.403028822667316,
"grad_norm": 0.9470829367637634,
"learning_rate": 1.1736728304542287e-05,
"loss": 1.0448,
"step": 208000
},
{
"epoch": 25.403028822667316,
"eval_loss": 0.9573968052864075,
"eval_runtime": 115.2828,
"eval_samples_per_second": 252.527,
"eval_steps_per_second": 7.894,
"step": 208000
},
{
"epoch": 25.598436736687837,
"grad_norm": 1.021452784538269,
"learning_rate": 1.0778078074302412e-05,
"loss": 1.0442,
"step": 209600
},
{
"epoch": 25.598436736687837,
"eval_loss": 0.9558864235877991,
"eval_runtime": 115.4718,
"eval_samples_per_second": 252.113,
"eval_steps_per_second": 7.881,
"step": 209600
},
{
"epoch": 25.793844650708355,
"grad_norm": 1.0185869932174683,
"learning_rate": 9.85803412530808e-06,
"loss": 1.0406,
"step": 211200
},
{
"epoch": 25.793844650708355,
"eval_loss": 0.9554181694984436,
"eval_runtime": 115.4932,
"eval_samples_per_second": 252.067,
"eval_steps_per_second": 7.879,
"step": 211200
},
{
"epoch": 25.989252564728872,
"grad_norm": 1.0851198434829712,
"learning_rate": 8.976994560118401e-06,
"loss": 1.0415,
"step": 212800
},
{
"epoch": 25.989252564728872,
"eval_loss": 0.9549527764320374,
"eval_runtime": 115.3462,
"eval_samples_per_second": 252.388,
"eval_steps_per_second": 7.889,
"step": 212800
},
{
"epoch": 26.18466047874939,
"grad_norm": 0.9994131922721863,
"learning_rate": 8.135340604115083e-06,
"loss": 1.0386,
"step": 214400
},
{
"epoch": 26.18466047874939,
"eval_loss": 0.955303430557251,
"eval_runtime": 114.8631,
"eval_samples_per_second": 253.449,
"eval_steps_per_second": 7.922,
"step": 214400
},
{
"epoch": 26.380068392769907,
"grad_norm": 1.0339714288711548,
"learning_rate": 7.333436440546337e-06,
"loss": 1.036,
"step": 216000
},
{
"epoch": 26.380068392769907,
"eval_loss": 0.9524005055427551,
"eval_runtime": 114.8545,
"eval_samples_per_second": 253.469,
"eval_steps_per_second": 7.923,
"step": 216000
},
{
"epoch": 26.575476306790424,
"grad_norm": 1.0113097429275513,
"learning_rate": 6.571629052944928e-06,
"loss": 1.0339,
"step": 217600
},
{
"epoch": 26.575476306790424,
"eval_loss": 0.9527350664138794,
"eval_runtime": 115.092,
"eval_samples_per_second": 252.946,
"eval_steps_per_second": 7.907,
"step": 217600
},
{
"epoch": 26.77088422081094,
"grad_norm": 1.041275143623352,
"learning_rate": 5.850248074988618e-06,
"loss": 1.0324,
"step": 219200
},
{
"epoch": 26.77088422081094,
"eval_loss": 0.9510777592658997,
"eval_runtime": 114.9365,
"eval_samples_per_second": 253.288,
"eval_steps_per_second": 7.917,
"step": 219200
},
{
"epoch": 26.96629213483146,
"grad_norm": 1.0127288103103638,
"learning_rate": 5.169605647867792e-06,
"loss": 1.0308,
"step": 220800
},
{
"epoch": 26.96629213483146,
"eval_loss": 0.9494324326515198,
"eval_runtime": 115.2852,
"eval_samples_per_second": 252.521,
"eval_steps_per_second": 7.893,
"step": 220800
},
{
"epoch": 27.16170004885198,
"grad_norm": 0.9832186102867126,
"learning_rate": 4.5299962852221935e-06,
"loss": 1.0288,
"step": 222400
},
{
"epoch": 27.16170004885198,
"eval_loss": 0.9487127661705017,
"eval_runtime": 114.7853,
"eval_samples_per_second": 253.621,
"eval_steps_per_second": 7.928,
"step": 222400
},
{
"epoch": 27.357107962872497,
"grad_norm": 0.9942122101783752,
"learning_rate": 3.931696745704927e-06,
"loss": 1.0272,
"step": 224000
},
{
"epoch": 27.357107962872497,
"eval_loss": 0.9485536217689514,
"eval_runtime": 114.9745,
"eval_samples_per_second": 253.204,
"eval_steps_per_second": 7.915,
"step": 224000
},
{
"epoch": 27.552515876893015,
"grad_norm": 0.9841001033782959,
"learning_rate": 3.374965913229211e-06,
"loss": 1.0264,
"step": 225600
},
{
"epoch": 27.552515876893015,
"eval_loss": 0.9484065175056458,
"eval_runtime": 115.0105,
"eval_samples_per_second": 253.125,
"eval_steps_per_second": 7.912,
"step": 225600
},
{
"epoch": 27.747923790913532,
"grad_norm": 1.0424267053604126,
"learning_rate": 2.8600446849493812e-06,
"loss": 1.0231,
"step": 227200
},
{
"epoch": 27.747923790913532,
"eval_loss": 0.9473522901535034,
"eval_runtime": 114.9031,
"eval_samples_per_second": 253.361,
"eval_steps_per_second": 7.92,
"step": 227200
},
{
"epoch": 27.94333170493405,
"grad_norm": 1.051414132118225,
"learning_rate": 2.3871558670248374e-06,
"loss": 1.0213,
"step": 228800
},
{
"epoch": 27.94333170493405,
"eval_loss": 0.9472260475158691,
"eval_runtime": 115.1913,
"eval_samples_per_second": 252.728,
"eval_steps_per_second": 7.9,
"step": 228800
},
{
"epoch": 28.138739618954567,
"grad_norm": 1.0657280683517456,
"learning_rate": 1.9565040782119183e-06,
"loss": 1.0228,
"step": 230400
},
{
"epoch": 28.138739618954567,
"eval_loss": 0.9466701745986938,
"eval_runtime": 115.4815,
"eval_samples_per_second": 252.092,
"eval_steps_per_second": 7.88,
"step": 230400
},
{
"epoch": 28.334147532975084,
"grad_norm": 1.0056368112564087,
"learning_rate": 1.5682756613254578e-06,
"loss": 1.019,
"step": 232000
},
{
"epoch": 28.334147532975084,
"eval_loss": 0.9479278922080994,
"eval_runtime": 115.3634,
"eval_samples_per_second": 252.35,
"eval_steps_per_second": 7.888,
"step": 232000
},
{
"epoch": 28.5295554469956,
"grad_norm": 1.0009722709655762,
"learning_rate": 1.2226386026083835e-06,
"loss": 1.0172,
"step": 233600
},
{
"epoch": 28.5295554469956,
"eval_loss": 0.9463828206062317,
"eval_runtime": 115.3111,
"eval_samples_per_second": 252.465,
"eval_steps_per_second": 7.892,
"step": 233600
},
{
"epoch": 28.724963361016123,
"grad_norm": 1.0131070613861084,
"learning_rate": 9.19742459044104e-07,
"loss": 1.0178,
"step": 235200
},
{
"epoch": 28.724963361016123,
"eval_loss": 0.9452440738677979,
"eval_runtime": 115.5625,
"eval_samples_per_second": 251.916,
"eval_steps_per_second": 7.875,
"step": 235200
},
{
"epoch": 28.92037127503664,
"grad_norm": 1.0448576211929321,
"learning_rate": 6.597182936433189e-07,
"loss": 1.0146,
"step": 236800
},
{
"epoch": 28.92037127503664,
"eval_loss": 0.9440923929214478,
"eval_runtime": 115.4862,
"eval_samples_per_second": 252.082,
"eval_steps_per_second": 7.88,
"step": 236800
},
{
"epoch": 29.115779189057157,
"grad_norm": 1.0811083316802979,
"learning_rate": 4.426786187330612e-07,
"loss": 1.0131,
"step": 238400
},
{
"epoch": 29.115779189057157,
"eval_loss": 0.9451555013656616,
"eval_runtime": 114.7713,
"eval_samples_per_second": 253.652,
"eval_steps_per_second": 7.929,
"step": 238400
},
{
"epoch": 29.311187103077675,
"grad_norm": 1.1182361841201782,
"learning_rate": 2.6871734727274e-07,
"loss": 1.0122,
"step": 240000
},
{
"epoch": 29.311187103077675,
"eval_loss": 0.9468272924423218,
"eval_runtime": 115.1022,
"eval_samples_per_second": 252.923,
"eval_steps_per_second": 7.906,
"step": 240000
},
{
"epoch": 29.506595017098192,
"grad_norm": 1.080756425857544,
"learning_rate": 1.3790975221799062e-07,
"loss": 1.0098,
"step": 241600
},
{
"epoch": 29.506595017098192,
"eval_loss": 0.9463731050491333,
"eval_runtime": 114.8868,
"eval_samples_per_second": 253.397,
"eval_steps_per_second": 7.921,
"step": 241600
},
{
"epoch": 29.70200293111871,
"grad_norm": 1.1268196105957031,
"learning_rate": 5.03124339501504e-08,
"loss": 1.0107,
"step": 243200
},
{
"epoch": 29.70200293111871,
"eval_loss": 0.9453385472297668,
"eval_runtime": 114.5631,
"eval_samples_per_second": 254.113,
"eval_steps_per_second": 7.943,
"step": 243200
},
{
"epoch": 29.897410845139227,
"grad_norm": 1.0300451517105103,
"learning_rate": 5.963295785271772e-09,
"loss": 1.0083,
"step": 244800
},
{
"epoch": 29.897410845139227,
"eval_loss": 0.9439197182655334,
"eval_runtime": 115.1016,
"eval_samples_per_second": 252.924,
"eval_steps_per_second": 7.906,
"step": 244800
}
],
"logging_steps": 1600,
"max_steps": 245640,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 1600,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.783639331402416e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}