{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.02869777005849354,
"eval_steps": 100,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.739554011698708e-05,
"grad_norm": 1.8802112340927124,
"learning_rate": 5.9999999999999995e-05,
"loss": 2.9438,
"step": 1
},
{
"epoch": 0.00011479108023397416,
"grad_norm": 1.9408955574035645,
"learning_rate": 0.00011999999999999999,
"loss": 2.9429,
"step": 2
},
{
"epoch": 0.00017218662035096125,
"grad_norm": 2.9192652702331543,
"learning_rate": 0.00017999999999999998,
"loss": 2.952,
"step": 3
},
{
"epoch": 0.00022958216046794832,
"grad_norm": 2.3403642177581787,
"learning_rate": 0.00023999999999999998,
"loss": 2.9307,
"step": 4
},
{
"epoch": 0.00028697770058493544,
"grad_norm": 2.134683847427368,
"learning_rate": 0.0003,
"loss": 2.8917,
"step": 5
},
{
"epoch": 0.0003443732407019225,
"grad_norm": 1.5358260869979858,
"learning_rate": 0.00035999999999999997,
"loss": 2.9205,
"step": 6
},
{
"epoch": 0.0004017687808189096,
"grad_norm": 0.9012013673782349,
"learning_rate": 0.00041999999999999996,
"loss": 2.8937,
"step": 7
},
{
"epoch": 0.00045916432093589664,
"grad_norm": 0.9427694082260132,
"learning_rate": 0.00047999999999999996,
"loss": 2.904,
"step": 8
},
{
"epoch": 0.0005165598610528837,
"grad_norm": 1.662156105041504,
"learning_rate": 0.00054,
"loss": 2.9114,
"step": 9
},
{
"epoch": 0.0005739554011698709,
"grad_norm": 1.2877967357635498,
"learning_rate": 0.0006,
"loss": 2.9185,
"step": 10
},
{
"epoch": 0.000631350941286858,
"grad_norm": 1.3717082738876343,
"learning_rate": 0.0005999969170437548,
"loss": 2.899,
"step": 11
},
{
"epoch": 0.000688746481403845,
"grad_norm": 1.3706175088882446,
"learning_rate": 0.0005999876683017478,
"loss": 2.8522,
"step": 12
},
{
"epoch": 0.0007461420215208321,
"grad_norm": 0.7431464791297913,
"learning_rate": 0.0005999722541541584,
"loss": 2.8894,
"step": 13
},
{
"epoch": 0.0008035375616378192,
"grad_norm": 0.5839619040489197,
"learning_rate": 0.0005999506752346019,
"loss": 2.8866,
"step": 14
},
{
"epoch": 0.0008609331017548062,
"grad_norm": 0.5229901671409607,
"learning_rate": 0.0005999229324301031,
"loss": 2.8608,
"step": 15
},
{
"epoch": 0.0009183286418717933,
"grad_norm": 0.6879259943962097,
"learning_rate": 0.00059988902688106,
"loss": 2.8801,
"step": 16
},
{
"epoch": 0.0009757241819887805,
"grad_norm": 0.4949502646923065,
"learning_rate": 0.0005998489599811971,
"loss": 2.8857,
"step": 17
},
{
"epoch": 0.0010331197221057674,
"grad_norm": 0.5659216642379761,
"learning_rate": 0.0005998027333775077,
"loss": 2.8172,
"step": 18
},
{
"epoch": 0.0010905152622227546,
"grad_norm": 0.43849167227745056,
"learning_rate": 0.0005997503489701861,
"loss": 2.8479,
"step": 19
},
{
"epoch": 0.0011479108023397418,
"grad_norm": 0.5036750435829163,
"learning_rate": 0.0005996918089125504,
"loss": 2.8957,
"step": 20
},
{
"epoch": 0.0012053063424567287,
"grad_norm": 0.40093106031417847,
"learning_rate": 0.000599627115610953,
"loss": 2.8951,
"step": 21
},
{
"epoch": 0.001262701882573716,
"grad_norm": 0.3499244153499603,
"learning_rate": 0.0005995562717246821,
"loss": 2.8535,
"step": 22
},
{
"epoch": 0.0013200974226907029,
"grad_norm": 0.3672889769077301,
"learning_rate": 0.0005994792801658526,
"loss": 2.8507,
"step": 23
},
{
"epoch": 0.00137749296280769,
"grad_norm": 0.3307906985282898,
"learning_rate": 0.0005993961440992859,
"loss": 2.8597,
"step": 24
},
{
"epoch": 0.001434888502924677,
"grad_norm": 0.33352652192115784,
"learning_rate": 0.0005993068669423797,
"loss": 2.8023,
"step": 25
},
{
"epoch": 0.0014922840430416642,
"grad_norm": 0.30308255553245544,
"learning_rate": 0.0005992114523649685,
"loss": 2.864,
"step": 26
},
{
"epoch": 0.0015496795831586513,
"grad_norm": 0.2800331711769104,
"learning_rate": 0.000599109904289172,
"loss": 2.8459,
"step": 27
},
{
"epoch": 0.0016070751232756383,
"grad_norm": 0.2467849850654602,
"learning_rate": 0.0005990022268892337,
"loss": 2.8298,
"step": 28
},
{
"epoch": 0.0016644706633926255,
"grad_norm": 0.25928932428359985,
"learning_rate": 0.0005988884245913497,
"loss": 2.8061,
"step": 29
},
{
"epoch": 0.0017218662035096124,
"grad_norm": 0.2770285904407501,
"learning_rate": 0.0005987685020734869,
"loss": 2.8363,
"step": 30
},
{
"epoch": 0.0017792617436265996,
"grad_norm": 0.2888840436935425,
"learning_rate": 0.0005986424642651901,
"loss": 2.847,
"step": 31
},
{
"epoch": 0.0018366572837435866,
"grad_norm": 0.3389260172843933,
"learning_rate": 0.0005985103163473802,
"loss": 2.8185,
"step": 32
},
{
"epoch": 0.0018940528238605737,
"grad_norm": 0.3043622672557831,
"learning_rate": 0.0005983720637521404,
"loss": 2.8073,
"step": 33
},
{
"epoch": 0.001951448363977561,
"grad_norm": 0.2626359760761261,
"learning_rate": 0.0005982277121624933,
"loss": 2.8278,
"step": 34
},
{
"epoch": 0.002008843904094548,
"grad_norm": 0.2601317763328552,
"learning_rate": 0.0005980772675121675,
"loss": 2.8293,
"step": 35
},
{
"epoch": 0.002066239444211535,
"grad_norm": 0.2932066023349762,
"learning_rate": 0.0005979207359853532,
"loss": 2.842,
"step": 36
},
{
"epoch": 0.002123634984328522,
"grad_norm": 0.3828963041305542,
"learning_rate": 0.0005977581240164485,
"loss": 2.8383,
"step": 37
},
{
"epoch": 0.002181030524445509,
"grad_norm": 0.2928522527217865,
"learning_rate": 0.0005975894382897944,
"loss": 2.8291,
"step": 38
},
{
"epoch": 0.0022384260645624964,
"grad_norm": 0.2287234663963318,
"learning_rate": 0.0005974146857394005,
"loss": 2.8422,
"step": 39
},
{
"epoch": 0.0022958216046794835,
"grad_norm": 0.2722682058811188,
"learning_rate": 0.0005972338735486597,
"loss": 2.8217,
"step": 40
},
{
"epoch": 0.0023532171447964703,
"grad_norm": 0.21170516312122345,
"learning_rate": 0.0005970470091500531,
"loss": 2.831,
"step": 41
},
{
"epoch": 0.0024106126849134575,
"grad_norm": 0.22243160009384155,
"learning_rate": 0.0005968541002248439,
"loss": 2.862,
"step": 42
},
{
"epoch": 0.0024680082250304446,
"grad_norm": 0.18485133349895477,
"learning_rate": 0.0005966551547027627,
"loss": 2.8531,
"step": 43
},
{
"epoch": 0.002525403765147432,
"grad_norm": 0.21640127897262573,
"learning_rate": 0.0005964501807616806,
"loss": 2.8245,
"step": 44
},
{
"epoch": 0.0025827993052644185,
"grad_norm": 0.2716100513935089,
"learning_rate": 0.0005962391868272735,
"loss": 2.8093,
"step": 45
},
{
"epoch": 0.0026401948453814057,
"grad_norm": 0.19726517796516418,
"learning_rate": 0.0005960221815726757,
"loss": 2.8214,
"step": 46
},
{
"epoch": 0.002697590385498393,
"grad_norm": 0.2424098700284958,
"learning_rate": 0.0005957991739181231,
"loss": 2.818,
"step": 47
},
{
"epoch": 0.00275498592561538,
"grad_norm": 0.2414388209581375,
"learning_rate": 0.0005955701730305872,
"loss": 2.8491,
"step": 48
},
{
"epoch": 0.0028123814657323673,
"grad_norm": 0.25403571128845215,
"learning_rate": 0.0005953351883233972,
"loss": 2.8321,
"step": 49
},
{
"epoch": 0.002869777005849354,
"grad_norm": 0.30923786759376526,
"learning_rate": 0.0005950942294558544,
"loss": 2.8298,
"step": 50
},
{
"epoch": 0.002927172545966341,
"grad_norm": 0.22294141352176666,
"learning_rate": 0.0005948473063328338,
"loss": 2.8015,
"step": 51
},
{
"epoch": 0.0029845680860833283,
"grad_norm": 0.2882789075374603,
"learning_rate": 0.0005945944291043779,
"loss": 2.8256,
"step": 52
},
{
"epoch": 0.0030419636262003155,
"grad_norm": 0.25416064262390137,
"learning_rate": 0.0005943356081652793,
"loss": 2.8211,
"step": 53
},
{
"epoch": 0.0030993591663173027,
"grad_norm": 0.2488490343093872,
"learning_rate": 0.0005940708541546529,
"loss": 2.8618,
"step": 54
},
{
"epoch": 0.0031567547064342894,
"grad_norm": 0.27515849471092224,
"learning_rate": 0.000593800177955499,
"loss": 2.802,
"step": 55
},
{
"epoch": 0.0032141502465512766,
"grad_norm": 0.2030380666255951,
"learning_rate": 0.0005935235906942563,
"loss": 2.8229,
"step": 56
},
{
"epoch": 0.003271545786668264,
"grad_norm": 0.2384052276611328,
"learning_rate": 0.0005932411037403436,
"loss": 2.8122,
"step": 57
},
{
"epoch": 0.003328941326785251,
"grad_norm": 0.2543489336967468,
"learning_rate": 0.000592952728705693,
"loss": 2.8302,
"step": 58
},
{
"epoch": 0.003386336866902238,
"grad_norm": 0.2387794405221939,
"learning_rate": 0.000592658477444273,
"loss": 2.835,
"step": 59
},
{
"epoch": 0.003443732407019225,
"grad_norm": 0.2748169004917145,
"learning_rate": 0.0005923583620516003,
"loss": 2.834,
"step": 60
},
{
"epoch": 0.003501127947136212,
"grad_norm": 0.2565017640590668,
"learning_rate": 0.0005920523948642431,
"loss": 2.8452,
"step": 61
},
{
"epoch": 0.0035585234872531992,
"grad_norm": 0.25502678751945496,
"learning_rate": 0.0005917405884593144,
"loss": 2.8345,
"step": 62
},
{
"epoch": 0.0036159190273701864,
"grad_norm": 0.22830121219158173,
"learning_rate": 0.0005914229556539538,
"loss": 2.7989,
"step": 63
},
{
"epoch": 0.003673314567487173,
"grad_norm": 0.3146669268608093,
"learning_rate": 0.0005910995095048024,
"loss": 2.845,
"step": 64
},
{
"epoch": 0.0037307101076041603,
"grad_norm": 0.2924383580684662,
"learning_rate": 0.000590770263307464,
"loss": 2.8303,
"step": 65
},
{
"epoch": 0.0037881056477211475,
"grad_norm": 0.2577711343765259,
"learning_rate": 0.0005904352305959605,
"loss": 2.8156,
"step": 66
},
{
"epoch": 0.0038455011878381347,
"grad_norm": 0.2631978988647461,
"learning_rate": 0.0005900944251421745,
"loss": 2.833,
"step": 67
},
{
"epoch": 0.003902896727955122,
"grad_norm": 0.21994397044181824,
"learning_rate": 0.000589747860955283,
"loss": 2.8136,
"step": 68
},
{
"epoch": 0.003960292268072109,
"grad_norm": 0.3000943064689636,
"learning_rate": 0.0005893955522811827,
"loss": 2.8415,
"step": 69
},
{
"epoch": 0.004017687808189096,
"grad_norm": 0.24310976266860962,
"learning_rate": 0.0005890375136019032,
"loss": 2.8148,
"step": 70
},
{
"epoch": 0.004075083348306083,
"grad_norm": 0.24616850912570953,
"learning_rate": 0.0005886737596350122,
"loss": 2.8329,
"step": 71
},
{
"epoch": 0.00413247888842307,
"grad_norm": 0.2714521884918213,
"learning_rate": 0.0005883043053330105,
"loss": 2.8356,
"step": 72
},
{
"epoch": 0.004189874428540057,
"grad_norm": 0.2601388096809387,
"learning_rate": 0.0005879291658827176,
"loss": 2.8228,
"step": 73
},
{
"epoch": 0.004247269968657044,
"grad_norm": 0.22764116525650024,
"learning_rate": 0.0005875483567046467,
"loss": 2.801,
"step": 74
},
{
"epoch": 0.004304665508774032,
"grad_norm": 0.22346433997154236,
"learning_rate": 0.0005871618934523719,
"loss": 2.7948,
"step": 75
},
{
"epoch": 0.004362061048891018,
"grad_norm": 0.18839874863624573,
"learning_rate": 0.0005867697920118835,
"loss": 2.8341,
"step": 76
},
{
"epoch": 0.004419456589008005,
"grad_norm": 0.25794312357902527,
"learning_rate": 0.0005863720685009362,
"loss": 2.815,
"step": 77
},
{
"epoch": 0.004476852129124993,
"grad_norm": 0.2352106124162674,
"learning_rate": 0.0005859687392683856,
"loss": 2.8169,
"step": 78
},
{
"epoch": 0.0045342476692419795,
"grad_norm": 0.28784099221229553,
"learning_rate": 0.0005855598208935169,
"loss": 2.8506,
"step": 79
},
{
"epoch": 0.004591643209358967,
"grad_norm": 0.22999855875968933,
"learning_rate": 0.0005851453301853628,
"loss": 2.8377,
"step": 80
},
{
"epoch": 0.004649038749475954,
"grad_norm": 0.21411263942718506,
"learning_rate": 0.0005847252841820128,
"loss": 2.8137,
"step": 81
},
{
"epoch": 0.0047064342895929406,
"grad_norm": 0.2420736700296402,
"learning_rate": 0.0005842997001499129,
"loss": 2.7929,
"step": 82
},
{
"epoch": 0.004763829829709928,
"grad_norm": 0.24426190555095673,
"learning_rate": 0.0005838685955831558,
"loss": 2.8273,
"step": 83
},
{
"epoch": 0.004821225369826915,
"grad_norm": 0.20297811925411224,
"learning_rate": 0.0005834319882027617,
"loss": 2.7993,
"step": 84
},
{
"epoch": 0.0048786209099439025,
"grad_norm": 0.2474389523267746,
"learning_rate": 0.00058298989595595,
"loss": 2.8252,
"step": 85
},
{
"epoch": 0.004936016450060889,
"grad_norm": 0.22601982951164246,
"learning_rate": 0.0005825423370154012,
"loss": 2.8421,
"step": 86
},
{
"epoch": 0.004993411990177876,
"grad_norm": 0.24997788667678833,
"learning_rate": 0.0005820893297785106,
"loss": 2.8485,
"step": 87
},
{
"epoch": 0.005050807530294864,
"grad_norm": 0.19994623959064484,
"learning_rate": 0.0005816308928666314,
"loss": 2.8456,
"step": 88
},
{
"epoch": 0.00510820307041185,
"grad_norm": 0.19206245243549347,
"learning_rate": 0.0005811670451243093,
"loss": 2.8035,
"step": 89
},
{
"epoch": 0.005165598610528837,
"grad_norm": 0.2515026032924652,
"learning_rate": 0.0005806978056185083,
"loss": 2.8232,
"step": 90
},
{
"epoch": 0.005222994150645825,
"grad_norm": 0.22921022772789001,
"learning_rate": 0.0005802231936378267,
"loss": 2.8366,
"step": 91
},
{
"epoch": 0.0052803896907628114,
"grad_norm": 0.248809352517128,
"learning_rate": 0.000579743228691704,
"loss": 2.8331,
"step": 92
},
{
"epoch": 0.005337785230879799,
"grad_norm": 0.18247073888778687,
"learning_rate": 0.0005792579305096191,
"loss": 2.8249,
"step": 93
},
{
"epoch": 0.005395180770996786,
"grad_norm": 0.2440440058708191,
"learning_rate": 0.0005787673190402799,
"loss": 2.837,
"step": 94
},
{
"epoch": 0.0054525763111137725,
"grad_norm": 0.21160444617271423,
"learning_rate": 0.0005782714144508019,
"loss": 2.7864,
"step": 95
},
{
"epoch": 0.00550997185123076,
"grad_norm": 0.21344538033008575,
"learning_rate": 0.0005777702371258806,
"loss": 2.847,
"step": 96
},
{
"epoch": 0.005567367391347747,
"grad_norm": 0.24861139059066772,
"learning_rate": 0.0005772638076669529,
"loss": 2.8267,
"step": 97
},
{
"epoch": 0.0056247629314647345,
"grad_norm": 0.290520042181015,
"learning_rate": 0.0005767521468913501,
"loss": 2.827,
"step": 98
},
{
"epoch": 0.005682158471581721,
"grad_norm": 0.20536312460899353,
"learning_rate": 0.0005762352758314429,
"loss": 2.8476,
"step": 99
},
{
"epoch": 0.005739554011698708,
"grad_norm": 0.21782469749450684,
"learning_rate": 0.000575713215733776,
"loss": 2.844,
"step": 100
},
{
"epoch": 0.005739554011698708,
"eval_loss": 2.7509028911590576,
"eval_runtime": 85.2068,
"eval_samples_per_second": 50.641,
"eval_steps_per_second": 12.663,
"step": 100
},
{
"epoch": 0.005796949551815696,
"grad_norm": 0.2523731291294098,
"learning_rate": 0.0005751859880581954,
"loss": 2.8125,
"step": 101
},
{
"epoch": 0.005854345091932682,
"grad_norm": 0.30107325315475464,
"learning_rate": 0.0005746536144769656,
"loss": 2.8108,
"step": 102
},
{
"epoch": 0.00591174063204967,
"grad_norm": 0.24103832244873047,
"learning_rate": 0.0005741161168738794,
"loss": 2.8282,
"step": 103
},
{
"epoch": 0.005969136172166657,
"grad_norm": 0.31273001432418823,
"learning_rate": 0.0005735735173433582,
"loss": 2.8104,
"step": 104
},
{
"epoch": 0.006026531712283643,
"grad_norm": 0.19059035181999207,
"learning_rate": 0.0005730258381895433,
"loss": 2.8186,
"step": 105
},
{
"epoch": 0.006083927252400631,
"grad_norm": 0.25082021951675415,
"learning_rate": 0.0005724731019253797,
"loss": 2.8154,
"step": 106
},
{
"epoch": 0.006141322792517618,
"grad_norm": 0.23254480957984924,
"learning_rate": 0.0005719153312716904,
"loss": 2.8121,
"step": 107
},
{
"epoch": 0.006198718332634605,
"grad_norm": 0.24095705151557922,
"learning_rate": 0.0005713525491562421,
"loss": 2.8361,
"step": 108
},
{
"epoch": 0.006256113872751592,
"grad_norm": 0.17760275304317474,
"learning_rate": 0.0005707847787128034,
"loss": 2.8396,
"step": 109
},
{
"epoch": 0.006313509412868579,
"grad_norm": 0.20905229449272156,
"learning_rate": 0.0005702120432801934,
"loss": 2.8284,
"step": 110
},
{
"epoch": 0.0063709049529855665,
"grad_norm": 0.19538630545139313,
"learning_rate": 0.0005696343664013227,
"loss": 2.8417,
"step": 111
},
{
"epoch": 0.006428300493102553,
"grad_norm": 0.2408672571182251,
"learning_rate": 0.0005690517718222248,
"loss": 2.8416,
"step": 112
},
{
"epoch": 0.006485696033219541,
"grad_norm": 0.19618412852287292,
"learning_rate": 0.0005684642834910813,
"loss": 2.8683,
"step": 113
},
{
"epoch": 0.006543091573336528,
"grad_norm": 0.17854906618595123,
"learning_rate": 0.0005678719255572363,
"loss": 2.8232,
"step": 114
},
{
"epoch": 0.006600487113453514,
"grad_norm": 0.2527766227722168,
"learning_rate": 0.0005672747223702044,
"loss": 2.8219,
"step": 115
},
{
"epoch": 0.006657882653570502,
"grad_norm": 0.21465440094470978,
"learning_rate": 0.0005666726984786695,
"loss": 2.8308,
"step": 116
},
{
"epoch": 0.006715278193687489,
"grad_norm": 0.2080729454755783,
"learning_rate": 0.000566065878629476,
"loss": 2.8369,
"step": 117
},
{
"epoch": 0.006772673733804476,
"grad_norm": 0.18979360163211823,
"learning_rate": 0.0005654542877666108,
"loss": 2.7997,
"step": 118
},
{
"epoch": 0.006830069273921463,
"grad_norm": 0.20258580148220062,
"learning_rate": 0.0005648379510301792,
"loss": 2.846,
"step": 119
},
{
"epoch": 0.00688746481403845,
"grad_norm": 0.2112026810646057,
"learning_rate": 0.0005642168937553701,
"loss": 2.8521,
"step": 120
},
{
"epoch": 0.006944860354155437,
"grad_norm": 0.25105029344558716,
"learning_rate": 0.0005635911414714158,
"loss": 2.8081,
"step": 121
},
{
"epoch": 0.007002255894272424,
"grad_norm": 0.21830224990844727,
"learning_rate": 0.0005629607199005416,
"loss": 2.8161,
"step": 122
},
{
"epoch": 0.007059651434389411,
"grad_norm": 0.19216330349445343,
"learning_rate": 0.0005623256549569091,
"loss": 2.805,
"step": 123
},
{
"epoch": 0.0071170469745063985,
"grad_norm": 0.19969609379768372,
"learning_rate": 0.000561685972745551,
"loss": 2.7859,
"step": 124
},
{
"epoch": 0.007174442514623385,
"grad_norm": 0.22093947231769562,
"learning_rate": 0.0005610416995612973,
"loss": 2.8194,
"step": 125
},
{
"epoch": 0.007231838054740373,
"grad_norm": 0.2148187905550003,
"learning_rate": 0.0005603928618876952,
"loss": 2.8565,
"step": 126
},
{
"epoch": 0.0072892335948573595,
"grad_norm": 0.18277674913406372,
"learning_rate": 0.0005597394863959201,
"loss": 2.8187,
"step": 127
},
{
"epoch": 0.007346629134974346,
"grad_norm": 0.22607837617397308,
"learning_rate": 0.0005590815999436795,
"loss": 2.8607,
"step": 128
},
{
"epoch": 0.007404024675091334,
"grad_norm": 0.22417186200618744,
"learning_rate": 0.0005584192295741086,
"loss": 2.8198,
"step": 129
},
{
"epoch": 0.007461420215208321,
"grad_norm": 0.229670912027359,
"learning_rate": 0.0005577524025146591,
"loss": 2.8477,
"step": 130
},
{
"epoch": 0.007518815755325308,
"grad_norm": 0.1985808163881302,
"learning_rate": 0.0005570811461759794,
"loss": 2.8058,
"step": 131
},
{
"epoch": 0.007576211295442295,
"grad_norm": 0.22260330617427826,
"learning_rate": 0.0005564054881507886,
"loss": 2.8369,
"step": 132
},
{
"epoch": 0.007633606835559282,
"grad_norm": 0.20925524830818176,
"learning_rate": 0.0005557254562127417,
"loss": 2.8205,
"step": 133
},
{
"epoch": 0.007691002375676269,
"grad_norm": 0.26581674814224243,
"learning_rate": 0.0005550410783152882,
"loss": 2.8164,
"step": 134
},
{
"epoch": 0.007748397915793256,
"grad_norm": 0.2182077318429947,
"learning_rate": 0.0005543523825905229,
"loss": 2.8279,
"step": 135
},
{
"epoch": 0.007805793455910244,
"grad_norm": 0.24468722939491272,
"learning_rate": 0.0005536593973480297,
"loss": 2.8281,
"step": 136
},
{
"epoch": 0.007863188996027231,
"grad_norm": 0.22021321952342987,
"learning_rate": 0.0005529621510737175,
"loss": 2.8028,
"step": 137
},
{
"epoch": 0.007920584536144217,
"grad_norm": 0.20566654205322266,
"learning_rate": 0.0005522606724286498,
"loss": 2.7937,
"step": 138
},
{
"epoch": 0.007977980076261205,
"grad_norm": 0.1960543841123581,
"learning_rate": 0.0005515549902478665,
"loss": 2.8089,
"step": 139
},
{
"epoch": 0.008035375616378192,
"grad_norm": 0.2689999043941498,
"learning_rate": 0.0005508451335391975,
"loss": 2.7959,
"step": 140
},
{
"epoch": 0.008092771156495178,
"grad_norm": 0.19776718318462372,
"learning_rate": 0.0005501311314820721,
"loss": 2.8442,
"step": 141
},
{
"epoch": 0.008150166696612166,
"grad_norm": 0.2156287282705307,
"learning_rate": 0.0005494130134263184,
"loss": 2.8224,
"step": 142
},
{
"epoch": 0.008207562236729153,
"grad_norm": 0.17528703808784485,
"learning_rate": 0.0005486908088909568,
"loss": 2.8659,
"step": 143
},
{
"epoch": 0.00826495777684614,
"grad_norm": 0.1757359504699707,
"learning_rate": 0.0005479645475629872,
"loss": 2.8119,
"step": 144
},
{
"epoch": 0.008322353316963127,
"grad_norm": 0.1916513890028,
"learning_rate": 0.0005472342592961683,
"loss": 2.8069,
"step": 145
},
{
"epoch": 0.008379748857080115,
"grad_norm": 0.19162799417972565,
"learning_rate": 0.0005464999741097901,
"loss": 2.8211,
"step": 146
},
{
"epoch": 0.0084371443971971,
"grad_norm": 0.1881379634141922,
"learning_rate": 0.0005457617221874408,
"loss": 2.7954,
"step": 147
},
{
"epoch": 0.008494539937314088,
"grad_norm": 0.22305060923099518,
"learning_rate": 0.0005450195338757654,
"loss": 2.8447,
"step": 148
},
{
"epoch": 0.008551935477431076,
"grad_norm": 0.25081732869148254,
"learning_rate": 0.0005442734396832185,
"loss": 2.8205,
"step": 149
},
{
"epoch": 0.008609331017548063,
"grad_norm": 0.24046167731285095,
"learning_rate": 0.00054352347027881,
"loss": 2.8246,
"step": 150
},
{
"epoch": 0.00866672655766505,
"grad_norm": 0.20985569059848785,
"learning_rate": 0.0005427696564908447,
"loss": 2.8384,
"step": 151
},
{
"epoch": 0.008724122097782037,
"grad_norm": 0.18979063630104065,
"learning_rate": 0.000542012029305655,
"loss": 2.8261,
"step": 152
},
{
"epoch": 0.008781517637899024,
"grad_norm": 0.21513347327709198,
"learning_rate": 0.0005412506198663268,
"loss": 2.8197,
"step": 153
},
{
"epoch": 0.00883891317801601,
"grad_norm": 0.25432831048965454,
"learning_rate": 0.0005404854594714204,
"loss": 2.8091,
"step": 154
},
{
"epoch": 0.008896308718132998,
"grad_norm": 0.261273592710495,
"learning_rate": 0.0005397165795736823,
"loss": 2.8324,
"step": 155
},
{
"epoch": 0.008953704258249985,
"grad_norm": 0.22144336998462677,
"learning_rate": 0.0005389440117787538,
"loss": 2.8459,
"step": 156
},
{
"epoch": 0.009011099798366971,
"grad_norm": 0.1860560178756714,
"learning_rate": 0.000538167787843871,
"loss": 2.8552,
"step": 157
},
{
"epoch": 0.009068495338483959,
"grad_norm": 0.2402401566505432,
"learning_rate": 0.0005373879396765593,
"loss": 2.8229,
"step": 158
},
{
"epoch": 0.009125890878600947,
"grad_norm": 0.2112584114074707,
"learning_rate": 0.0005366044993333228,
"loss": 2.823,
"step": 159
},
{
"epoch": 0.009183286418717934,
"grad_norm": 0.24757996201515198,
"learning_rate": 0.0005358174990183254,
"loss": 2.8458,
"step": 160
},
{
"epoch": 0.00924068195883492,
"grad_norm": 0.20984984934329987,
"learning_rate": 0.0005350269710820675,
"loss": 2.8375,
"step": 161
},
{
"epoch": 0.009298077498951908,
"grad_norm": 0.22329501807689667,
"learning_rate": 0.0005342329480200562,
"loss": 2.815,
"step": 162
},
{
"epoch": 0.009355473039068895,
"grad_norm": 0.26144203543663025,
"learning_rate": 0.0005334354624714697,
"loss": 2.8286,
"step": 163
},
{
"epoch": 0.009412868579185881,
"grad_norm": 0.20015327632427216,
"learning_rate": 0.0005326345472178154,
"loss": 2.8304,
"step": 164
},
{
"epoch": 0.009470264119302869,
"grad_norm": 0.29256758093833923,
"learning_rate": 0.0005318302351815823,
"loss": 2.7884,
"step": 165
},
{
"epoch": 0.009527659659419856,
"grad_norm": 0.22914084792137146,
"learning_rate": 0.000531022559424888,
"loss": 2.8253,
"step": 166
},
{
"epoch": 0.009585055199536842,
"grad_norm": 0.2677003741264343,
"learning_rate": 0.0005302115531481195,
"loss": 2.8084,
"step": 167
},
{
"epoch": 0.00964245073965383,
"grad_norm": 0.2672327756881714,
"learning_rate": 0.000529397249688568,
"loss": 2.8351,
"step": 168
},
{
"epoch": 0.009699846279770817,
"grad_norm": 0.21281464397907257,
"learning_rate": 0.0005285796825190598,
"loss": 2.8463,
"step": 169
},
{
"epoch": 0.009757241819887805,
"grad_norm": 0.22858156263828278,
"learning_rate": 0.0005277588852465788,
"loss": 2.8156,
"step": 170
},
{
"epoch": 0.009814637360004791,
"grad_norm": 0.20694582164287567,
"learning_rate": 0.0005269348916108859,
"loss": 2.8392,
"step": 171
},
{
"epoch": 0.009872032900121779,
"grad_norm": 0.22438685595989227,
"learning_rate": 0.0005261077354831322,
"loss": 2.8336,
"step": 172
},
{
"epoch": 0.009929428440238766,
"grad_norm": 0.2279587984085083,
"learning_rate": 0.0005252774508644666,
"loss": 2.7972,
"step": 173
},
{
"epoch": 0.009986823980355752,
"grad_norm": 0.21278439462184906,
"learning_rate": 0.0005244440718846375,
"loss": 2.7946,
"step": 174
},
{
"epoch": 0.01004421952047274,
"grad_norm": 0.23399871587753296,
"learning_rate": 0.0005236076328005906,
"loss": 2.8648,
"step": 175
},
{
"epoch": 0.010101615060589727,
"grad_norm": 0.2649572193622589,
"learning_rate": 0.0005227681679950607,
"loss": 2.8453,
"step": 176
},
{
"epoch": 0.010159010600706713,
"grad_norm": 0.21067285537719727,
"learning_rate": 0.0005219257119751581,
"loss": 2.8357,
"step": 177
},
{
"epoch": 0.0102164061408237,
"grad_norm": 0.22862860560417175,
"learning_rate": 0.0005210802993709497,
"loss": 2.8235,
"step": 178
},
{
"epoch": 0.010273801680940688,
"grad_norm": 0.22179283201694489,
"learning_rate": 0.0005202319649340369,
"loss": 2.82,
"step": 179
},
{
"epoch": 0.010331197221057674,
"grad_norm": 0.16690605878829956,
"learning_rate": 0.0005193807435361252,
"loss": 2.8237,
"step": 180
},
{
"epoch": 0.010388592761174662,
"grad_norm": 0.21572506427764893,
"learning_rate": 0.0005185266701675927,
"loss": 2.8403,
"step": 181
},
{
"epoch": 0.01044598830129165,
"grad_norm": 0.1778525710105896,
"learning_rate": 0.0005176697799360502,
"loss": 2.8204,
"step": 182
},
{
"epoch": 0.010503383841408637,
"grad_norm": 0.18887534737586975,
"learning_rate": 0.0005168101080648989,
"loss": 2.8146,
"step": 183
},
{
"epoch": 0.010560779381525623,
"grad_norm": 0.18108077347278595,
"learning_rate": 0.0005159476898918823,
"loss": 2.853,
"step": 184
},
{
"epoch": 0.01061817492164261,
"grad_norm": 0.1870754212141037,
"learning_rate": 0.0005150825608676336,
"loss": 2.8537,
"step": 185
},
{
"epoch": 0.010675570461759598,
"grad_norm": 0.16484060883522034,
"learning_rate": 0.0005142147565542188,
"loss": 2.8194,
"step": 186
},
{
"epoch": 0.010732966001876584,
"grad_norm": 0.18527449667453766,
"learning_rate": 0.0005133443126236739,
"loss": 2.8402,
"step": 187
},
{
"epoch": 0.010790361541993572,
"grad_norm": 0.17674389481544495,
"learning_rate": 0.0005124712648565398,
"loss": 2.8412,
"step": 188
},
{
"epoch": 0.01084775708211056,
"grad_norm": 0.2521503269672394,
"learning_rate": 0.0005115956491403907,
"loss": 2.8348,
"step": 189
},
{
"epoch": 0.010905152622227545,
"grad_norm": 0.17621657252311707,
"learning_rate": 0.000510717501468359,
"loss": 2.8293,
"step": 190
},
{
"epoch": 0.010962548162344533,
"grad_norm": 0.2621336579322815,
"learning_rate": 0.0005098368579376563,
"loss": 2.8164,
"step": 191
},
{
"epoch": 0.01101994370246152,
"grad_norm": 0.18950189650058746,
"learning_rate": 0.0005089537547480885,
"loss": 2.7976,
"step": 192
},
{
"epoch": 0.011077339242578508,
"grad_norm": 0.24857239425182343,
"learning_rate": 0.0005080682282005692,
"loss": 2.8323,
"step": 193
},
{
"epoch": 0.011134734782695494,
"grad_norm": 0.16708490252494812,
"learning_rate": 0.0005071803146956262,
"loss": 2.801,
"step": 194
},
{
"epoch": 0.011192130322812481,
"grad_norm": 0.24443359673023224,
"learning_rate": 0.000506290050731906,
"loss": 2.8121,
"step": 195
},
{
"epoch": 0.011249525862929469,
"grad_norm": 0.2458924949169159,
"learning_rate": 0.0005053974729046734,
"loss": 2.8325,
"step": 196
},
{
"epoch": 0.011306921403046455,
"grad_norm": 0.2034812569618225,
"learning_rate": 0.0005045026179043067,
"loss": 2.8123,
"step": 197
},
{
"epoch": 0.011364316943163442,
"grad_norm": 0.2774895429611206,
"learning_rate": 0.0005036055225147901,
"loss": 2.8324,
"step": 198
},
{
"epoch": 0.01142171248328043,
"grad_norm": 0.22201013565063477,
"learning_rate": 0.0005027062236122014,
"loss": 2.8195,
"step": 199
},
{
"epoch": 0.011479108023397416,
"grad_norm": 0.1893691122531891,
"learning_rate": 0.0005018047581631961,
"loss": 2.8177,
"step": 200
},
{
"epoch": 0.011479108023397416,
"eval_loss": 2.749150037765503,
"eval_runtime": 85.2258,
"eval_samples_per_second": 50.63,
"eval_steps_per_second": 12.66,
"step": 200
},
{
"epoch": 0.011536503563514404,
"grad_norm": 0.2689765691757202,
"learning_rate": 0.0005009011632234881,
"loss": 2.8438,
"step": 201
},
{
"epoch": 0.011593899103631391,
"grad_norm": 0.2234533727169037,
"learning_rate": 0.0004999954759363262,
"loss": 2.8103,
"step": 202
},
{
"epoch": 0.011651294643748379,
"grad_norm": 0.25140801072120667,
"learning_rate": 0.0004990877335309675,
"loss": 2.8178,
"step": 203
},
{
"epoch": 0.011708690183865365,
"grad_norm": 0.3070688843727112,
"learning_rate": 0.0004981779733211468,
"loss": 2.8518,
"step": 204
},
{
"epoch": 0.011766085723982352,
"grad_norm": 0.25637757778167725,
"learning_rate": 0.0004972662327035431,
"loss": 2.8578,
"step": 205
},
{
"epoch": 0.01182348126409934,
"grad_norm": 0.2551119923591614,
"learning_rate": 0.0004963525491562421,
"loss": 2.8237,
"step": 206
},
{
"epoch": 0.011880876804216326,
"grad_norm": 0.2416735738515854,
"learning_rate": 0.0004954369602371958,
"loss": 2.8195,
"step": 207
},
{
"epoch": 0.011938272344333313,
"grad_norm": 0.3950039744377136,
"learning_rate": 0.0004945195035826785,
"loss": 2.8087,
"step": 208
},
{
"epoch": 0.011995667884450301,
"grad_norm": 0.16370531916618347,
"learning_rate": 0.00049360021690574,
"loss": 2.8464,
"step": 209
},
{
"epoch": 0.012053063424567287,
"grad_norm": 0.28070008754730225,
"learning_rate": 0.0004926791379946549,
"loss": 2.8377,
"step": 210
},
{
"epoch": 0.012110458964684274,
"grad_norm": 0.1902085244655609,
"learning_rate": 0.0004917563047113695,
"loss": 2.8279,
"step": 211
},
{
"epoch": 0.012167854504801262,
"grad_norm": 0.27748385071754456,
"learning_rate": 0.0004908317549899456,
"loss": 2.837,
"step": 212
},
{
"epoch": 0.012225250044918248,
"grad_norm": 0.18437190353870392,
"learning_rate": 0.0004899055268350012,
"loss": 2.8301,
"step": 213
},
{
"epoch": 0.012282645585035236,
"grad_norm": 0.22971947491168976,
"learning_rate": 0.0004889776583201479,
"loss": 2.8051,
"step": 214
},
{
"epoch": 0.012340041125152223,
"grad_norm": 0.238089457154274,
"learning_rate": 0.0004880481875864261,
"loss": 2.8162,
"step": 215
},
{
"epoch": 0.01239743666526921,
"grad_norm": 0.24253320693969727,
"learning_rate": 0.0004871171528407371,
"loss": 2.8181,
"step": 216
},
{
"epoch": 0.012454832205386197,
"grad_norm": 0.2351958006620407,
"learning_rate": 0.0004861845923542728,
"loss": 2.8136,
"step": 217
},
{
"epoch": 0.012512227745503184,
"grad_norm": 0.23203608393669128,
"learning_rate": 0.0004852505444609422,
"loss": 2.804,
"step": 218
},
{
"epoch": 0.012569623285620172,
"grad_norm": 0.1896822452545166,
"learning_rate": 0.00048431504755579575,
"loss": 2.8118,
"step": 219
},
{
"epoch": 0.012627018825737158,
"grad_norm": 0.18357349932193756,
"learning_rate": 0.0004833781400934471,
"loss": 2.8205,
"step": 220
},
{
"epoch": 0.012684414365854145,
"grad_norm": 0.23723295331001282,
"learning_rate": 0.00048243986058649246,
"loss": 2.8291,
"step": 221
},
{
"epoch": 0.012741809905971133,
"grad_norm": 0.1937919706106186,
"learning_rate": 0.0004815002476039273,
"loss": 2.8416,
"step": 222
},
{
"epoch": 0.012799205446088119,
"grad_norm": 0.19754467904567719,
"learning_rate": 0.0004805593397695613,
"loss": 2.7963,
"step": 223
},
{
"epoch": 0.012856600986205106,
"grad_norm": 0.1592610776424408,
"learning_rate": 0.00047961717576043,
"loss": 2.8264,
"step": 224
},
{
"epoch": 0.012913996526322094,
"grad_norm": 0.2083783745765686,
"learning_rate": 0.00047867379430520585,
"loss": 2.8348,
"step": 225
},
{
"epoch": 0.012971392066439082,
"grad_norm": 0.1895647495985031,
"learning_rate": 0.00047772923418260525,
"loss": 2.8212,
"step": 226
},
{
"epoch": 0.013028787606556068,
"grad_norm": 0.2173570841550827,
"learning_rate": 0.0004767835342197954,
"loss": 2.8098,
"step": 227
},
{
"epoch": 0.013086183146673055,
"grad_norm": 0.1693475991487503,
"learning_rate": 0.0004758367332907978,
"loss": 2.796,
"step": 228
},
{
"epoch": 0.013143578686790043,
"grad_norm": 0.21635355055332184,
"learning_rate": 0.00047488887031489017,
"loss": 2.843,
"step": 229
},
{
"epoch": 0.013200974226907029,
"grad_norm": 0.18521156907081604,
"learning_rate": 0.0004739399842550068,
"loss": 2.8296,
"step": 230
},
{
"epoch": 0.013258369767024016,
"grad_norm": 0.22925664484500885,
"learning_rate": 0.00047299011411613734,
"loss": 2.8287,
"step": 231
},
{
"epoch": 0.013315765307141004,
"grad_norm": 0.24881386756896973,
"learning_rate": 0.00047203929894372264,
"loss": 2.8257,
"step": 232
},
{
"epoch": 0.01337316084725799,
"grad_norm": 0.20801618695259094,
"learning_rate": 0.00047108757782205043,
"loss": 2.8241,
"step": 233
},
{
"epoch": 0.013430556387374977,
"grad_norm": 0.199665367603302,
"learning_rate": 0.0004701349898726483,
"loss": 2.7916,
"step": 234
},
{
"epoch": 0.013487951927491965,
"grad_norm": 0.25221607089042664,
"learning_rate": 0.00046918157425267584,
"loss": 2.8233,
"step": 235
},
{
"epoch": 0.013545347467608953,
"grad_norm": 0.1931813657283783,
"learning_rate": 0.00046822737015331505,
"loss": 2.8016,
"step": 236
},
{
"epoch": 0.013602743007725938,
"grad_norm": 0.17353369295597076,
"learning_rate": 0.00046727241679815894,
"loss": 2.8125,
"step": 237
},
{
"epoch": 0.013660138547842926,
"grad_norm": 0.22225958108901978,
"learning_rate": 0.0004663167534415996,
"loss": 2.824,
"step": 238
},
{
"epoch": 0.013717534087959914,
"grad_norm": 0.17010116577148438,
"learning_rate": 0.0004653604193672147,
"loss": 2.8425,
"step": 239
},
{
"epoch": 0.0137749296280769,
"grad_norm": 0.2103683203458786,
"learning_rate": 0.00046440345388615225,
"loss": 2.8641,
"step": 240
},
{
"epoch": 0.013832325168193887,
"grad_norm": 0.17934557795524597,
"learning_rate": 0.00046344589633551497,
"loss": 2.8069,
"step": 241
},
{
"epoch": 0.013889720708310875,
"grad_norm": 0.2116999328136444,
"learning_rate": 0.0004624877860767434,
"loss": 2.8601,
"step": 242
},
{
"epoch": 0.01394711624842786,
"grad_norm": 0.20861205458641052,
"learning_rate": 0.0004615291624939975,
"loss": 2.8232,
"step": 243
},
{
"epoch": 0.014004511788544848,
"grad_norm": 0.24393285810947418,
"learning_rate": 0.0004605700649925381,
"loss": 2.8041,
"step": 244
},
{
"epoch": 0.014061907328661836,
"grad_norm": 0.2089577168226242,
"learning_rate": 0.0004596105329971069,
"loss": 2.8351,
"step": 245
},
{
"epoch": 0.014119302868778822,
"grad_norm": 0.20232421159744263,
"learning_rate": 0.00045865060595030616,
"loss": 2.8171,
"step": 246
},
{
"epoch": 0.01417669840889581,
"grad_norm": 0.22081732749938965,
"learning_rate": 0.00045769032331097686,
"loss": 2.8202,
"step": 247
},
{
"epoch": 0.014234093949012797,
"grad_norm": 0.17081516981124878,
"learning_rate": 0.00045672972455257723,
"loss": 2.8358,
"step": 248
},
{
"epoch": 0.014291489489129785,
"grad_norm": 0.3317008316516876,
"learning_rate": 0.0004557688491615597,
"loss": 2.8302,
"step": 249
},
{
"epoch": 0.01434888502924677,
"grad_norm": 0.23239760100841522,
"learning_rate": 0.0004548077366357483,
"loss": 2.8191,
"step": 250
},
{
"epoch": 0.014406280569363758,
"grad_norm": 0.22138993442058563,
"learning_rate": 0.0004538464264827143,
"loss": 2.8096,
"step": 251
},
{
"epoch": 0.014463676109480746,
"grad_norm": 0.23655574023723602,
"learning_rate": 0.000452884958218153,
"loss": 2.8295,
"step": 252
},
{
"epoch": 0.014521071649597731,
"grad_norm": 0.2227945327758789,
"learning_rate": 0.000451923371364259,
"loss": 2.8158,
"step": 253
},
{
"epoch": 0.014578467189714719,
"grad_norm": 0.20443300902843475,
"learning_rate": 0.0004509617054481017,
"loss": 2.83,
"step": 254
},
{
"epoch": 0.014635862729831707,
"grad_norm": 0.22221451997756958,
"learning_rate": 0.00045,
"loss": 2.8253,
"step": 255
},
{
"epoch": 0.014693258269948693,
"grad_norm": 0.1941068023443222,
"learning_rate": 0.00044903829455189825,
"loss": 2.83,
"step": 256
},
{
"epoch": 0.01475065381006568,
"grad_norm": 0.1914331614971161,
"learning_rate": 0.0004480766286357409,
"loss": 2.8162,
"step": 257
},
{
"epoch": 0.014808049350182668,
"grad_norm": 0.21014779806137085,
"learning_rate": 0.0004471150417818469,
"loss": 2.7993,
"step": 258
},
{
"epoch": 0.014865444890299655,
"grad_norm": 0.2057676762342453,
"learning_rate": 0.00044615357351728566,
"loss": 2.8223,
"step": 259
},
{
"epoch": 0.014922840430416641,
"grad_norm": 0.19875939190387726,
"learning_rate": 0.00044519226336425165,
"loss": 2.8016,
"step": 260
},
{
"epoch": 0.014980235970533629,
"grad_norm": 0.23691999912261963,
"learning_rate": 0.0004442311508384402,
"loss": 2.8373,
"step": 261
},
{
"epoch": 0.015037631510650616,
"grad_norm": 0.1729947328567505,
"learning_rate": 0.0004432702754474228,
"loss": 2.8233,
"step": 262
},
{
"epoch": 0.015095027050767602,
"grad_norm": 0.18821187317371368,
"learning_rate": 0.00044230967668902306,
"loss": 2.8128,
"step": 263
},
{
"epoch": 0.01515242259088459,
"grad_norm": 0.2283882200717926,
"learning_rate": 0.00044134939404969387,
"loss": 2.8178,
"step": 264
},
{
"epoch": 0.015209818131001578,
"grad_norm": 0.16724412143230438,
"learning_rate": 0.000440389467002893,
"loss": 2.8249,
"step": 265
},
{
"epoch": 0.015267213671118563,
"grad_norm": 0.18209712207317352,
"learning_rate": 0.00043942993500746183,
"loss": 2.8095,
"step": 266
},
{
"epoch": 0.015324609211235551,
"grad_norm": 0.1857995092868805,
"learning_rate": 0.00043847083750600253,
"loss": 2.806,
"step": 267
},
{
"epoch": 0.015382004751352539,
"grad_norm": 0.20734605193138123,
"learning_rate": 0.0004375122139232566,
"loss": 2.8695,
"step": 268
},
{
"epoch": 0.015439400291469526,
"grad_norm": 0.23138895630836487,
"learning_rate": 0.00043655410366448495,
"loss": 2.8033,
"step": 269
},
{
"epoch": 0.015496795831586512,
"grad_norm": 0.20481987297534943,
"learning_rate": 0.0004355965461138477,
"loss": 2.8269,
"step": 270
},
{
"epoch": 0.0155541913717035,
"grad_norm": 0.2318529337644577,
"learning_rate": 0.00043463958063278524,
"loss": 2.8332,
"step": 271
},
{
"epoch": 0.015611586911820487,
"grad_norm": 0.2501411736011505,
"learning_rate": 0.00043368324655840035,
"loss": 2.8445,
"step": 272
},
{
"epoch": 0.015668982451937475,
"grad_norm": 0.26137158274650574,
"learning_rate": 0.0004327275832018411,
"loss": 2.8279,
"step": 273
},
{
"epoch": 0.015726377992054463,
"grad_norm": 0.19074887037277222,
"learning_rate": 0.0004317726298466849,
"loss": 2.8132,
"step": 274
},
{
"epoch": 0.015783773532171447,
"grad_norm": 0.26000818610191345,
"learning_rate": 0.0004308184257473241,
"loss": 2.8091,
"step": 275
},
{
"epoch": 0.015841169072288434,
"grad_norm": 0.16060984134674072,
"learning_rate": 0.0004298650101273517,
"loss": 2.8206,
"step": 276
},
{
"epoch": 0.015898564612405422,
"grad_norm": 0.284445583820343,
"learning_rate": 0.00042891242217794954,
"loss": 2.7867,
"step": 277
},
{
"epoch": 0.01595596015252241,
"grad_norm": 0.15903466939926147,
"learning_rate": 0.0004279607010562773,
"loss": 2.83,
"step": 278
},
{
"epoch": 0.016013355692639397,
"grad_norm": 0.24330751597881317,
"learning_rate": 0.0004270098858838626,
"loss": 2.817,
"step": 279
},
{
"epoch": 0.016070751232756385,
"grad_norm": 0.1687777042388916,
"learning_rate": 0.0004260600157449931,
"loss": 2.8112,
"step": 280
},
{
"epoch": 0.01612814677287337,
"grad_norm": 0.18230785429477692,
"learning_rate": 0.0004251111296851098,
"loss": 2.8394,
"step": 281
},
{
"epoch": 0.016185542312990357,
"grad_norm": 0.1889660507440567,
"learning_rate": 0.00042416326670920217,
"loss": 2.8109,
"step": 282
},
{
"epoch": 0.016242937853107344,
"grad_norm": 0.16135123372077942,
"learning_rate": 0.0004232164657802045,
"loss": 2.7953,
"step": 283
},
{
"epoch": 0.016300333393224332,
"grad_norm": 0.15787218511104584,
"learning_rate": 0.00042227076581739467,
"loss": 2.7921,
"step": 284
},
{
"epoch": 0.01635772893334132,
"grad_norm": 0.16313977539539337,
"learning_rate": 0.0004213262056947942,
"loss": 2.8107,
"step": 285
},
{
"epoch": 0.016415124473458307,
"grad_norm": 0.18806132674217224,
"learning_rate": 0.0004203828242395699,
"loss": 2.8451,
"step": 286
},
{
"epoch": 0.016472520013575295,
"grad_norm": 0.17279674112796783,
"learning_rate": 0.00041944066023043866,
"loss": 2.8333,
"step": 287
},
{
"epoch": 0.01652991555369228,
"grad_norm": 0.17451834678649902,
"learning_rate": 0.00041849975239607255,
"loss": 2.7798,
"step": 288
},
{
"epoch": 0.016587311093809266,
"grad_norm": 0.1943039745092392,
"learning_rate": 0.00041756013941350747,
"loss": 2.8011,
"step": 289
},
{
"epoch": 0.016644706633926254,
"grad_norm": 0.1578904092311859,
"learning_rate": 0.0004166218599065528,
"loss": 2.852,
"step": 290
},
{
"epoch": 0.01670210217404324,
"grad_norm": 0.20066620409488678,
"learning_rate": 0.0004156849524442042,
"loss": 2.7876,
"step": 291
},
{
"epoch": 0.01675949771416023,
"grad_norm": 0.18306495249271393,
"learning_rate": 0.0004147494555390577,
"loss": 2.817,
"step": 292
},
{
"epoch": 0.016816893254277217,
"grad_norm": 0.1622687727212906,
"learning_rate": 0.0004138154076457271,
"loss": 2.815,
"step": 293
},
{
"epoch": 0.0168742887943942,
"grad_norm": 0.2056518942117691,
"learning_rate": 0.0004128828471592628,
"loss": 2.8131,
"step": 294
},
{
"epoch": 0.01693168433451119,
"grad_norm": 0.17123937606811523,
"learning_rate": 0.00041195181241357383,
"loss": 2.8025,
"step": 295
},
{
"epoch": 0.016989079874628176,
"grad_norm": 0.2233334332704544,
"learning_rate": 0.00041102234167985204,
"loss": 2.8347,
"step": 296
},
{
"epoch": 0.017046475414745164,
"grad_norm": 0.20740529894828796,
"learning_rate": 0.0004100944731649987,
"loss": 2.8099,
"step": 297
},
{
"epoch": 0.01710387095486215,
"grad_norm": 0.20391066372394562,
"learning_rate": 0.0004091682450100543,
"loss": 2.8363,
"step": 298
},
{
"epoch": 0.01716126649497914,
"grad_norm": 0.17306548357009888,
"learning_rate": 0.0004082436952886305,
"loss": 2.8211,
"step": 299
},
{
"epoch": 0.017218662035096127,
"grad_norm": 0.24933576583862305,
"learning_rate": 0.0004073208620053451,
"loss": 2.8048,
"step": 300
},
{
"epoch": 0.017218662035096127,
"eval_loss": 2.7432332038879395,
"eval_runtime": 85.2508,
"eval_samples_per_second": 50.615,
"eval_steps_per_second": 12.657,
"step": 300
},
{
"epoch": 0.01727605757521311,
"grad_norm": 0.231708824634552,
"learning_rate": 0.00040639978309425995,
"loss": 2.8025,
"step": 301
},
{
"epoch": 0.0173334531153301,
"grad_norm": 0.15970614552497864,
"learning_rate": 0.00040548049641732137,
"loss": 2.8392,
"step": 302
},
{
"epoch": 0.017390848655447086,
"grad_norm": 0.20457029342651367,
"learning_rate": 0.0004045630397628042,
"loss": 2.8247,
"step": 303
},
{
"epoch": 0.017448244195564074,
"grad_norm": 0.1734900325536728,
"learning_rate": 0.00040364745084375787,
"loss": 2.7979,
"step": 304
},
{
"epoch": 0.01750563973568106,
"grad_norm": 0.19265452027320862,
"learning_rate": 0.00040273376729645685,
"loss": 2.8033,
"step": 305
},
{
"epoch": 0.01756303527579805,
"grad_norm": 0.19174844026565552,
"learning_rate": 0.00040182202667885317,
"loss": 2.8354,
"step": 306
},
{
"epoch": 0.017620430815915036,
"grad_norm": 0.27793413400650024,
"learning_rate": 0.00040091226646903245,
"loss": 2.797,
"step": 307
},
{
"epoch": 0.01767782635603202,
"grad_norm": 0.1806309074163437,
"learning_rate": 0.00040000452406367367,
"loss": 2.8046,
"step": 308
},
{
"epoch": 0.017735221896149008,
"grad_norm": 0.2249089479446411,
"learning_rate": 0.0003990988367765118,
"loss": 2.8125,
"step": 309
},
{
"epoch": 0.017792617436265996,
"grad_norm": 0.27839699387550354,
"learning_rate": 0.00039819524183680384,
"loss": 2.8183,
"step": 310
},
{
"epoch": 0.017850012976382983,
"grad_norm": 0.1877232789993286,
"learning_rate": 0.00039729377638779857,
"loss": 2.7989,
"step": 311
},
{
"epoch": 0.01790740851649997,
"grad_norm": 0.25160273909568787,
"learning_rate": 0.00039639447748520985,
"loss": 2.8536,
"step": 312
},
{
"epoch": 0.01796480405661696,
"grad_norm": 0.23843353986740112,
"learning_rate": 0.0003954973820956932,
"loss": 2.8064,
"step": 313
},
{
"epoch": 0.018022199596733943,
"grad_norm": 0.2549470365047455,
"learning_rate": 0.00039460252709532656,
"loss": 2.8415,
"step": 314
},
{
"epoch": 0.01807959513685093,
"grad_norm": 0.39248892664909363,
"learning_rate": 0.0003937099492680938,
"loss": 2.8137,
"step": 315
},
{
"epoch": 0.018136990676967918,
"grad_norm": 0.24034982919692993,
"learning_rate": 0.0003928196853043737,
"loss": 2.8301,
"step": 316
},
{
"epoch": 0.018194386217084905,
"grad_norm": 0.29434794187545776,
"learning_rate": 0.00039193177179943083,
"loss": 2.8288,
"step": 317
},
{
"epoch": 0.018251781757201893,
"grad_norm": 0.21636317670345306,
"learning_rate": 0.0003910462452519114,
"loss": 2.8121,
"step": 318
},
{
"epoch": 0.01830917729731888,
"grad_norm": 0.2217407375574112,
"learning_rate": 0.0003901631420623437,
"loss": 2.8551,
"step": 319
},
{
"epoch": 0.01836657283743587,
"grad_norm": 0.20126426219940186,
"learning_rate": 0.0003892824985316409,
"loss": 2.7812,
"step": 320
},
{
"epoch": 0.018423968377552852,
"grad_norm": 0.20343463122844696,
"learning_rate": 0.0003884043508596093,
"loss": 2.7959,
"step": 321
},
{
"epoch": 0.01848136391766984,
"grad_norm": 0.22265484929084778,
"learning_rate": 0.00038752873514346015,
"loss": 2.8254,
"step": 322
},
{
"epoch": 0.018538759457786828,
"grad_norm": 0.20545947551727295,
"learning_rate": 0.000386655687376326,
"loss": 2.8166,
"step": 323
},
{
"epoch": 0.018596154997903815,
"grad_norm": 0.17015507817268372,
"learning_rate": 0.00038578524344578115,
"loss": 2.806,
"step": 324
},
{
"epoch": 0.018653550538020803,
"grad_norm": 0.19378258287906647,
"learning_rate": 0.00038491743913236624,
"loss": 2.7979,
"step": 325
},
{
"epoch": 0.01871094607813779,
"grad_norm": 0.2112617790699005,
"learning_rate": 0.0003840523101081177,
"loss": 2.8149,
"step": 326
},
{
"epoch": 0.018768341618254775,
"grad_norm": 0.18846029043197632,
"learning_rate": 0.0003831898919351011,
"loss": 2.8334,
"step": 327
},
{
"epoch": 0.018825737158371762,
"grad_norm": 0.20672033727169037,
"learning_rate": 0.00038233022006394976,
"loss": 2.8061,
"step": 328
},
{
"epoch": 0.01888313269848875,
"grad_norm": 0.2700256109237671,
"learning_rate": 0.00038147332983240717,
"loss": 2.8101,
"step": 329
},
{
"epoch": 0.018940528238605737,
"grad_norm": 0.16990099847316742,
"learning_rate": 0.00038061925646387467,
"loss": 2.8227,
"step": 330
},
{
"epoch": 0.018997923778722725,
"grad_norm": 0.2140357792377472,
"learning_rate": 0.0003797680350659631,
"loss": 2.8018,
"step": 331
},
{
"epoch": 0.019055319318839713,
"grad_norm": 0.2538260221481323,
"learning_rate": 0.0003789197006290502,
"loss": 2.7725,
"step": 332
},
{
"epoch": 0.0191127148589567,
"grad_norm": 0.1694011092185974,
"learning_rate": 0.0003780742880248419,
"loss": 2.7973,
"step": 333
},
{
"epoch": 0.019170110399073684,
"grad_norm": 0.2092764526605606,
"learning_rate": 0.0003772318320049391,
"loss": 2.8256,
"step": 334
},
{
"epoch": 0.019227505939190672,
"grad_norm": 0.22675682604312897,
"learning_rate": 0.0003763923671994093,
"loss": 2.8092,
"step": 335
},
{
"epoch": 0.01928490147930766,
"grad_norm": 0.20571155846118927,
"learning_rate": 0.0003755559281153625,
"loss": 2.8176,
"step": 336
},
{
"epoch": 0.019342297019424647,
"grad_norm": 0.18606650829315186,
"learning_rate": 0.0003747225491355334,
"loss": 2.8019,
"step": 337
},
{
"epoch": 0.019399692559541635,
"grad_norm": 0.19859890639781952,
"learning_rate": 0.00037389226451686763,
"loss": 2.8036,
"step": 338
},
{
"epoch": 0.019457088099658622,
"grad_norm": 0.1632896512746811,
"learning_rate": 0.00037306510838911404,
"loss": 2.797,
"step": 339
},
{
"epoch": 0.01951448363977561,
"grad_norm": 0.17494754493236542,
"learning_rate": 0.00037224111475342116,
"loss": 2.8152,
"step": 340
},
{
"epoch": 0.019571879179892594,
"grad_norm": 0.20659732818603516,
"learning_rate": 0.00037142031748094016,
"loss": 2.8061,
"step": 341
},
{
"epoch": 0.019629274720009582,
"grad_norm": 0.18716713786125183,
"learning_rate": 0.00037060275031143184,
"loss": 2.8419,
"step": 342
},
{
"epoch": 0.01968667026012657,
"grad_norm": 0.2575749158859253,
"learning_rate": 0.0003697884468518805,
"loss": 2.7814,
"step": 343
},
{
"epoch": 0.019744065800243557,
"grad_norm": 0.19076134264469147,
"learning_rate": 0.0003689774405751119,
"loss": 2.797,
"step": 344
},
{
"epoch": 0.019801461340360545,
"grad_norm": 0.19563442468643188,
"learning_rate": 0.00036816976481841764,
"loss": 2.8269,
"step": 345
},
{
"epoch": 0.019858856880477532,
"grad_norm": 0.1790810525417328,
"learning_rate": 0.0003673654527821846,
"loss": 2.7856,
"step": 346
},
{
"epoch": 0.019916252420594516,
"grad_norm": 0.2125868797302246,
"learning_rate": 0.00036656453752853025,
"loss": 2.7973,
"step": 347
},
{
"epoch": 0.019973647960711504,
"grad_norm": 0.1454995572566986,
"learning_rate": 0.00036576705197994376,
"loss": 2.7869,
"step": 348
},
{
"epoch": 0.02003104350082849,
"grad_norm": 0.2808379530906677,
"learning_rate": 0.00036497302891793255,
"loss": 2.7923,
"step": 349
},
{
"epoch": 0.02008843904094548,
"grad_norm": 0.1776140034198761,
"learning_rate": 0.0003641825009816745,
"loss": 2.8194,
"step": 350
},
{
"epoch": 0.020145834581062467,
"grad_norm": 0.22207793593406677,
"learning_rate": 0.0003633955006666771,
"loss": 2.8234,
"step": 351
},
{
"epoch": 0.020203230121179454,
"grad_norm": 0.24642404913902283,
"learning_rate": 0.0003626120603234406,
"loss": 2.8351,
"step": 352
},
{
"epoch": 0.020260625661296442,
"grad_norm": 0.24731726944446564,
"learning_rate": 0.000361832212156129,
"loss": 2.7983,
"step": 353
},
{
"epoch": 0.020318021201413426,
"grad_norm": 0.21677981317043304,
"learning_rate": 0.0003610559882212461,
"loss": 2.8372,
"step": 354
},
{
"epoch": 0.020375416741530414,
"grad_norm": 0.28350090980529785,
"learning_rate": 0.00036028342042631755,
"loss": 2.8138,
"step": 355
},
{
"epoch": 0.0204328122816474,
"grad_norm": 0.22418756783008575,
"learning_rate": 0.00035951454052857954,
"loss": 2.7897,
"step": 356
},
{
"epoch": 0.02049020782176439,
"grad_norm": 0.27765804529190063,
"learning_rate": 0.000358749380133673,
"loss": 2.8139,
"step": 357
},
{
"epoch": 0.020547603361881377,
"grad_norm": 0.2694258391857147,
"learning_rate": 0.000357987970694345,
"loss": 2.7881,
"step": 358
},
{
"epoch": 0.020604998901998364,
"grad_norm": 0.3746117055416107,
"learning_rate": 0.00035723034350915525,
"loss": 2.8108,
"step": 359
},
{
"epoch": 0.02066239444211535,
"grad_norm": 0.22864773869514465,
"learning_rate": 0.00035647652972119,
"loss": 2.8102,
"step": 360
},
{
"epoch": 0.020719789982232336,
"grad_norm": 0.2728801369667053,
"learning_rate": 0.0003557265603167814,
"loss": 2.8046,
"step": 361
},
{
"epoch": 0.020777185522349324,
"grad_norm": 0.2561710774898529,
"learning_rate": 0.0003549804661242345,
"loss": 2.8242,
"step": 362
},
{
"epoch": 0.02083458106246631,
"grad_norm": 0.26235631108283997,
"learning_rate": 0.00035423827781255914,
"loss": 2.847,
"step": 363
},
{
"epoch": 0.0208919766025833,
"grad_norm": 0.24725806713104248,
"learning_rate": 0.0003535000258902099,
"loss": 2.7873,
"step": 364
},
{
"epoch": 0.020949372142700286,
"grad_norm": 0.2562279999256134,
"learning_rate": 0.0003527657407038317,
"loss": 2.799,
"step": 365
},
{
"epoch": 0.021006767682817274,
"grad_norm": 0.20368199050426483,
"learning_rate": 0.00035203545243701266,
"loss": 2.8011,
"step": 366
},
{
"epoch": 0.021064163222934258,
"grad_norm": 0.25594958662986755,
"learning_rate": 0.0003513091911090431,
"loss": 2.8099,
"step": 367
},
{
"epoch": 0.021121558763051246,
"grad_norm": 0.20084761083126068,
"learning_rate": 0.00035058698657368154,
"loss": 2.8249,
"step": 368
},
{
"epoch": 0.021178954303168233,
"grad_norm": 0.24110020697116852,
"learning_rate": 0.00034986886851792775,
"loss": 2.8058,
"step": 369
},
{
"epoch": 0.02123634984328522,
"grad_norm": 0.2016633003950119,
"learning_rate": 0.0003491548664608024,
"loss": 2.7935,
"step": 370
},
{
"epoch": 0.02129374538340221,
"grad_norm": 0.2722468376159668,
"learning_rate": 0.0003484450097521336,
"loss": 2.8146,
"step": 371
},
{
"epoch": 0.021351140923519196,
"grad_norm": 0.2089434564113617,
"learning_rate": 0.0003477393275713501,
"loss": 2.8231,
"step": 372
},
{
"epoch": 0.021408536463636184,
"grad_norm": 0.24770453572273254,
"learning_rate": 0.0003470378489262824,
"loss": 2.7994,
"step": 373
},
{
"epoch": 0.021465932003753168,
"grad_norm": 0.21104897558689117,
"learning_rate": 0.00034634060265197026,
"loss": 2.8189,
"step": 374
},
{
"epoch": 0.021523327543870156,
"grad_norm": 0.23374824225902557,
"learning_rate": 0.000345647617409477,
"loss": 2.783,
"step": 375
},
{
"epoch": 0.021580723083987143,
"grad_norm": 0.24334168434143066,
"learning_rate": 0.00034495892168471176,
"loss": 2.8092,
"step": 376
},
{
"epoch": 0.02163811862410413,
"grad_norm": 0.22772932052612305,
"learning_rate": 0.00034427454378725827,
"loss": 2.8178,
"step": 377
},
{
"epoch": 0.02169551416422112,
"grad_norm": 0.22545067965984344,
"learning_rate": 0.00034359451184921125,
"loss": 2.7961,
"step": 378
},
{
"epoch": 0.021752909704338106,
"grad_norm": 0.2873929738998413,
"learning_rate": 0.00034291885382402044,
"loss": 2.8408,
"step": 379
},
{
"epoch": 0.02181030524445509,
"grad_norm": 0.2099824994802475,
"learning_rate": 0.00034224759748534083,
"loss": 2.782,
"step": 380
},
{
"epoch": 0.021867700784572078,
"grad_norm": 0.32221996784210205,
"learning_rate": 0.0003415807704258913,
"loss": 2.8337,
"step": 381
},
{
"epoch": 0.021925096324689065,
"grad_norm": 0.2531490623950958,
"learning_rate": 0.0003409184000563204,
"loss": 2.8273,
"step": 382
},
{
"epoch": 0.021982491864806053,
"grad_norm": 0.3075484037399292,
"learning_rate": 0.00034026051360407973,
"loss": 2.7805,
"step": 383
},
{
"epoch": 0.02203988740492304,
"grad_norm": 0.2366313338279724,
"learning_rate": 0.0003396071381123047,
"loss": 2.8278,
"step": 384
},
{
"epoch": 0.022097282945040028,
"grad_norm": 0.2348204106092453,
"learning_rate": 0.00033895830043870266,
"loss": 2.7922,
"step": 385
},
{
"epoch": 0.022154678485157016,
"grad_norm": 0.28124627470970154,
"learning_rate": 0.00033831402725444896,
"loss": 2.8065,
"step": 386
},
{
"epoch": 0.022212074025274,
"grad_norm": 0.1927008032798767,
"learning_rate": 0.0003376743450430907,
"loss": 2.7958,
"step": 387
},
{
"epoch": 0.022269469565390988,
"grad_norm": 0.26325997710227966,
"learning_rate": 0.0003370392800994583,
"loss": 2.8313,
"step": 388
},
{
"epoch": 0.022326865105507975,
"grad_norm": 0.23394963145256042,
"learning_rate": 0.0003364088585285842,
"loss": 2.8126,
"step": 389
},
{
"epoch": 0.022384260645624963,
"grad_norm": 0.26055994629859924,
"learning_rate": 0.00033578310624462983,
"loss": 2.787,
"step": 390
},
{
"epoch": 0.02244165618574195,
"grad_norm": 0.2207145392894745,
"learning_rate": 0.0003351620489698208,
"loss": 2.796,
"step": 391
},
{
"epoch": 0.022499051725858938,
"grad_norm": 0.34231698513031006,
"learning_rate": 0.0003345457122333891,
"loss": 2.7951,
"step": 392
},
{
"epoch": 0.022556447265975922,
"grad_norm": 0.22361671924591064,
"learning_rate": 0.00033393412137052396,
"loss": 2.8251,
"step": 393
},
{
"epoch": 0.02261384280609291,
"grad_norm": 0.24573372304439545,
"learning_rate": 0.0003333273015213304,
"loss": 2.7899,
"step": 394
},
{
"epoch": 0.022671238346209897,
"grad_norm": 0.22109688818454742,
"learning_rate": 0.0003327252776297955,
"loss": 2.8178,
"step": 395
},
{
"epoch": 0.022728633886326885,
"grad_norm": 0.22289875149726868,
"learning_rate": 0.00033212807444276364,
"loss": 2.8053,
"step": 396
},
{
"epoch": 0.022786029426443873,
"grad_norm": 0.21445147693157196,
"learning_rate": 0.00033153571650891865,
"loss": 2.7998,
"step": 397
},
{
"epoch": 0.02284342496656086,
"grad_norm": 0.25061139464378357,
"learning_rate": 0.00033094822817777514,
"loss": 2.8055,
"step": 398
},
{
"epoch": 0.022900820506677848,
"grad_norm": 0.24680854380130768,
"learning_rate": 0.0003303656335986773,
"loss": 2.8143,
"step": 399
},
{
"epoch": 0.022958216046794832,
"grad_norm": 0.16644932329654694,
"learning_rate": 0.0003297879567198065,
"loss": 2.8192,
"step": 400
},
{
"epoch": 0.022958216046794832,
"eval_loss": 2.738191604614258,
"eval_runtime": 85.3252,
"eval_samples_per_second": 50.571,
"eval_steps_per_second": 12.646,
"step": 400
},
{
"epoch": 0.02301561158691182,
"grad_norm": 0.2816384434700012,
"learning_rate": 0.00032921522128719657,
"loss": 2.8209,
"step": 401
},
{
"epoch": 0.023073007127028807,
"grad_norm": 0.20395685732364655,
"learning_rate": 0.00032864745084375783,
"loss": 2.8021,
"step": 402
},
{
"epoch": 0.023130402667145795,
"grad_norm": 0.24216794967651367,
"learning_rate": 0.00032808466872830957,
"loss": 2.8447,
"step": 403
},
{
"epoch": 0.023187798207262782,
"grad_norm": 0.2526738941669464,
"learning_rate": 0.00032752689807462017,
"loss": 2.7906,
"step": 404
},
{
"epoch": 0.02324519374737977,
"grad_norm": 0.21725283563137054,
"learning_rate": 0.0003269741618104566,
"loss": 2.7943,
"step": 405
},
{
"epoch": 0.023302589287496758,
"grad_norm": 0.2765718102455139,
"learning_rate": 0.00032642648265664175,
"loss": 2.8109,
"step": 406
},
{
"epoch": 0.02335998482761374,
"grad_norm": 0.20015880465507507,
"learning_rate": 0.00032588388312612053,
"loss": 2.8239,
"step": 407
},
{
"epoch": 0.02341738036773073,
"grad_norm": 0.26865240931510925,
"learning_rate": 0.0003253463855230344,
"loss": 2.8279,
"step": 408
},
{
"epoch": 0.023474775907847717,
"grad_norm": 0.23522211611270905,
"learning_rate": 0.0003248140119418046,
"loss": 2.8123,
"step": 409
},
{
"epoch": 0.023532171447964705,
"grad_norm": 0.2388644963502884,
"learning_rate": 0.0003242867842662239,
"loss": 2.8057,
"step": 410
},
{
"epoch": 0.023589566988081692,
"grad_norm": 0.18323197960853577,
"learning_rate": 0.00032376472416855703,
"loss": 2.8193,
"step": 411
},
{
"epoch": 0.02364696252819868,
"grad_norm": 0.24734856188297272,
"learning_rate": 0.00032324785310864983,
"loss": 2.7924,
"step": 412
},
{
"epoch": 0.023704358068315664,
"grad_norm": 0.1722363829612732,
"learning_rate": 0.0003227361923330471,
"loss": 2.8242,
"step": 413
},
{
"epoch": 0.02376175360843265,
"grad_norm": 0.2052358090877533,
"learning_rate": 0.00032222976287411934,
"loss": 2.8129,
"step": 414
},
{
"epoch": 0.02381914914854964,
"grad_norm": 0.2536105811595917,
"learning_rate": 0.00032172858554919807,
"loss": 2.8207,
"step": 415
},
{
"epoch": 0.023876544688666627,
"grad_norm": 0.23084022104740143,
"learning_rate": 0.00032123268095972005,
"loss": 2.8156,
"step": 416
},
{
"epoch": 0.023933940228783614,
"grad_norm": 0.28741586208343506,
"learning_rate": 0.00032074206949038073,
"loss": 2.8008,
"step": 417
},
{
"epoch": 0.023991335768900602,
"grad_norm": 0.2419297993183136,
"learning_rate": 0.0003202567713082959,
"loss": 2.8112,
"step": 418
},
{
"epoch": 0.02404873130901759,
"grad_norm": 0.19744537770748138,
"learning_rate": 0.0003197768063621732,
"loss": 2.7894,
"step": 419
},
{
"epoch": 0.024106126849134574,
"grad_norm": 0.22780993580818176,
"learning_rate": 0.0003193021943814916,
"loss": 2.8019,
"step": 420
},
{
"epoch": 0.02416352238925156,
"grad_norm": 0.2176397144794464,
"learning_rate": 0.00031883295487569063,
"loss": 2.8183,
"step": 421
},
{
"epoch": 0.02422091792936855,
"grad_norm": 0.23891203105449677,
"learning_rate": 0.00031836910713336857,
"loss": 2.8022,
"step": 422
},
{
"epoch": 0.024278313469485537,
"grad_norm": 0.18507017195224762,
"learning_rate": 0.0003179106702214893,
"loss": 2.8013,
"step": 423
},
{
"epoch": 0.024335709009602524,
"grad_norm": 0.20408926904201508,
"learning_rate": 0.0003174576629845987,
"loss": 2.8085,
"step": 424
},
{
"epoch": 0.024393104549719512,
"grad_norm": 0.18055075407028198,
"learning_rate": 0.00031701010404404996,
"loss": 2.8341,
"step": 425
},
{
"epoch": 0.024450500089836496,
"grad_norm": 0.22974956035614014,
"learning_rate": 0.0003165680117972382,
"loss": 2.8044,
"step": 426
},
{
"epoch": 0.024507895629953484,
"grad_norm": 0.17688511312007904,
"learning_rate": 0.00031613140441684413,
"loss": 2.7866,
"step": 427
},
{
"epoch": 0.02456529117007047,
"grad_norm": 0.22350828349590302,
"learning_rate": 0.000315700299850087,
"loss": 2.7939,
"step": 428
},
{
"epoch": 0.02462268671018746,
"grad_norm": 0.2138863056898117,
"learning_rate": 0.0003152747158179871,
"loss": 2.8112,
"step": 429
},
{
"epoch": 0.024680082250304446,
"grad_norm": 0.1666262447834015,
"learning_rate": 0.0003148546698146371,
"loss": 2.8464,
"step": 430
},
{
"epoch": 0.024737477790421434,
"grad_norm": 0.23217864334583282,
"learning_rate": 0.00031444017910648293,
"loss": 2.8154,
"step": 431
},
{
"epoch": 0.02479487333053842,
"grad_norm": 0.23967209458351135,
"learning_rate": 0.00031403126073161424,
"loss": 2.8068,
"step": 432
},
{
"epoch": 0.024852268870655406,
"grad_norm": 0.2363416850566864,
"learning_rate": 0.0003136279314990637,
"loss": 2.832,
"step": 433
},
{
"epoch": 0.024909664410772393,
"grad_norm": 0.20204566419124603,
"learning_rate": 0.00031323020798811643,
"loss": 2.8118,
"step": 434
},
{
"epoch": 0.02496705995088938,
"grad_norm": 0.2645012438297272,
"learning_rate": 0.00031283810654762816,
"loss": 2.7988,
"step": 435
},
{
"epoch": 0.02502445549100637,
"grad_norm": 0.31096434593200684,
"learning_rate": 0.0003124516432953532,
"loss": 2.8021,
"step": 436
},
{
"epoch": 0.025081851031123356,
"grad_norm": 0.25740697979927063,
"learning_rate": 0.00031207083411728236,
"loss": 2.828,
"step": 437
},
{
"epoch": 0.025139246571240344,
"grad_norm": 0.24895477294921875,
"learning_rate": 0.00031169569466698937,
"loss": 2.8073,
"step": 438
},
{
"epoch": 0.02519664211135733,
"grad_norm": 0.2860502004623413,
"learning_rate": 0.00031132624036498774,
"loss": 2.8275,
"step": 439
},
{
"epoch": 0.025254037651474315,
"grad_norm": 0.3134096562862396,
"learning_rate": 0.00031096248639809674,
"loss": 2.816,
"step": 440
},
{
"epoch": 0.025311433191591303,
"grad_norm": 0.2185070812702179,
"learning_rate": 0.0003106044477188172,
"loss": 2.7799,
"step": 441
},
{
"epoch": 0.02536882873170829,
"grad_norm": 0.3582714796066284,
"learning_rate": 0.0003102521390447169,
"loss": 2.7923,
"step": 442
},
{
"epoch": 0.02542622427182528,
"grad_norm": 0.19494207203388214,
"learning_rate": 0.00030990557485782553,
"loss": 2.7999,
"step": 443
},
{
"epoch": 0.025483619811942266,
"grad_norm": 0.2574940025806427,
"learning_rate": 0.0003095647694040394,
"loss": 2.8087,
"step": 444
},
{
"epoch": 0.025541015352059254,
"grad_norm": 0.17501215636730194,
"learning_rate": 0.0003092297366925359,
"loss": 2.7817,
"step": 445
},
{
"epoch": 0.025598410892176238,
"grad_norm": 0.4073377251625061,
"learning_rate": 0.0003089004904951976,
"loss": 2.813,
"step": 446
},
{
"epoch": 0.025655806432293225,
"grad_norm": 0.21654489636421204,
"learning_rate": 0.000308577044346046,
"loss": 2.8165,
"step": 447
},
{
"epoch": 0.025713201972410213,
"grad_norm": 0.26500189304351807,
"learning_rate": 0.0003082594115406856,
"loss": 2.8229,
"step": 448
},
{
"epoch": 0.0257705975125272,
"grad_norm": 0.188262477517128,
"learning_rate": 0.00030794760513575675,
"loss": 2.8112,
"step": 449
},
{
"epoch": 0.025827993052644188,
"grad_norm": 0.3432970643043518,
"learning_rate": 0.00030764163794839966,
"loss": 2.8241,
"step": 450
},
{
"epoch": 0.025885388592761176,
"grad_norm": 0.23415225744247437,
"learning_rate": 0.0003073415225557269,
"loss": 2.8039,
"step": 451
},
{
"epoch": 0.025942784132878163,
"grad_norm": 0.2670385241508484,
"learning_rate": 0.0003070472712943069,
"loss": 2.8215,
"step": 452
},
{
"epoch": 0.026000179672995147,
"grad_norm": 0.17434735596179962,
"learning_rate": 0.00030675889625965646,
"loss": 2.8352,
"step": 453
},
{
"epoch": 0.026057575213112135,
"grad_norm": 0.2789264917373657,
"learning_rate": 0.0003064764093057437,
"loss": 2.7856,
"step": 454
},
{
"epoch": 0.026114970753229123,
"grad_norm": 0.2666022479534149,
"learning_rate": 0.0003061998220445009,
"loss": 2.8063,
"step": 455
},
{
"epoch": 0.02617236629334611,
"grad_norm": 0.22438260912895203,
"learning_rate": 0.00030592914584534706,
"loss": 2.7783,
"step": 456
},
{
"epoch": 0.026229761833463098,
"grad_norm": 0.2177169770002365,
"learning_rate": 0.00030566439183472063,
"loss": 2.786,
"step": 457
},
{
"epoch": 0.026287157373580086,
"grad_norm": 0.22771142423152924,
"learning_rate": 0.000305405570895622,
"loss": 2.7881,
"step": 458
},
{
"epoch": 0.02634455291369707,
"grad_norm": 0.29228097200393677,
"learning_rate": 0.00030515269366716613,
"loss": 2.7876,
"step": 459
},
{
"epoch": 0.026401948453814057,
"grad_norm": 0.18204721808433533,
"learning_rate": 0.00030490577054414553,
"loss": 2.8153,
"step": 460
},
{
"epoch": 0.026459343993931045,
"grad_norm": 0.19830970466136932,
"learning_rate": 0.0003046648116766027,
"loss": 2.7884,
"step": 461
},
{
"epoch": 0.026516739534048032,
"grad_norm": 0.17311398684978485,
"learning_rate": 0.00030442982696941276,
"loss": 2.8055,
"step": 462
},
{
"epoch": 0.02657413507416502,
"grad_norm": 0.21194536983966827,
"learning_rate": 0.0003042008260818768,
"loss": 2.815,
"step": 463
},
{
"epoch": 0.026631530614282008,
"grad_norm": 0.22366400063037872,
"learning_rate": 0.0003039778184273243,
"loss": 2.7994,
"step": 464
},
{
"epoch": 0.026688926154398995,
"grad_norm": 0.17785237729549408,
"learning_rate": 0.00030376081317272645,
"loss": 2.8049,
"step": 465
},
{
"epoch": 0.02674632169451598,
"grad_norm": 0.2285715490579605,
"learning_rate": 0.00030354981923831934,
"loss": 2.8105,
"step": 466
},
{
"epoch": 0.026803717234632967,
"grad_norm": 0.17985928058624268,
"learning_rate": 0.0003033448452972373,
"loss": 2.8246,
"step": 467
},
{
"epoch": 0.026861112774749955,
"grad_norm": 0.2026437669992447,
"learning_rate": 0.000303145899775156,
"loss": 2.8192,
"step": 468
},
{
"epoch": 0.026918508314866942,
"grad_norm": 0.2605213522911072,
"learning_rate": 0.0003029529908499469,
"loss": 2.826,
"step": 469
},
{
"epoch": 0.02697590385498393,
"grad_norm": 0.22592206299304962,
"learning_rate": 0.00030276612645134017,
"loss": 2.7987,
"step": 470
},
{
"epoch": 0.027033299395100917,
"grad_norm": 0.2988434433937073,
"learning_rate": 0.0003025853142605994,
"loss": 2.826,
"step": 471
},
{
"epoch": 0.027090694935217905,
"grad_norm": 0.2247052788734436,
"learning_rate": 0.0003024105617102055,
"loss": 2.815,
"step": 472
},
{
"epoch": 0.02714809047533489,
"grad_norm": 0.26565778255462646,
"learning_rate": 0.00030224187598355145,
"loss": 2.8283,
"step": 473
},
{
"epoch": 0.027205486015451877,
"grad_norm": 0.2834932804107666,
"learning_rate": 0.00030207926401464675,
"loss": 2.8088,
"step": 474
},
{
"epoch": 0.027262881555568864,
"grad_norm": 0.2396688312292099,
"learning_rate": 0.0003019227324878324,
"loss": 2.8024,
"step": 475
},
{
"epoch": 0.027320277095685852,
"grad_norm": 0.2600051760673523,
"learning_rate": 0.0003017722878375066,
"loss": 2.8258,
"step": 476
},
{
"epoch": 0.02737767263580284,
"grad_norm": 0.26368406414985657,
"learning_rate": 0.00030162793624785957,
"loss": 2.7875,
"step": 477
},
{
"epoch": 0.027435068175919827,
"grad_norm": 0.389852911233902,
"learning_rate": 0.0003014896836526197,
"loss": 2.8166,
"step": 478
},
{
"epoch": 0.02749246371603681,
"grad_norm": 0.23984675109386444,
"learning_rate": 0.0003013575357348098,
"loss": 2.8025,
"step": 479
},
{
"epoch": 0.0275498592561538,
"grad_norm": 0.24591901898384094,
"learning_rate": 0.00030123149792651307,
"loss": 2.7898,
"step": 480
},
{
"epoch": 0.027607254796270787,
"grad_norm": 0.24797213077545166,
"learning_rate": 0.00030111157540865026,
"loss": 2.8291,
"step": 481
},
{
"epoch": 0.027664650336387774,
"grad_norm": 0.2542579770088196,
"learning_rate": 0.0003009977731107663,
"loss": 2.7868,
"step": 482
},
{
"epoch": 0.027722045876504762,
"grad_norm": 0.21780452132225037,
"learning_rate": 0.00030089009571082794,
"loss": 2.8051,
"step": 483
},
{
"epoch": 0.02777944141662175,
"grad_norm": 0.2790198028087616,
"learning_rate": 0.0003007885476350314,
"loss": 2.8004,
"step": 484
},
{
"epoch": 0.027836836956738737,
"grad_norm": 0.2793212831020355,
"learning_rate": 0.00030069313305762025,
"loss": 2.8077,
"step": 485
},
{
"epoch": 0.02789423249685572,
"grad_norm": 0.2663847506046295,
"learning_rate": 0.0003006038559007141,
"loss": 2.805,
"step": 486
},
{
"epoch": 0.02795162803697271,
"grad_norm": 0.2695571482181549,
"learning_rate": 0.0003005207198341473,
"loss": 2.8102,
"step": 487
},
{
"epoch": 0.028009023577089696,
"grad_norm": 0.3027716875076294,
"learning_rate": 0.0003004437282753177,
"loss": 2.7944,
"step": 488
},
{
"epoch": 0.028066419117206684,
"grad_norm": 0.25220444798469543,
"learning_rate": 0.0003003728843890469,
"loss": 2.781,
"step": 489
},
{
"epoch": 0.02812381465732367,
"grad_norm": 0.2733742594718933,
"learning_rate": 0.0003003081910874495,
"loss": 2.8138,
"step": 490
},
{
"epoch": 0.02818121019744066,
"grad_norm": 0.23873530328273773,
"learning_rate": 0.00030024965102981387,
"loss": 2.8017,
"step": 491
},
{
"epoch": 0.028238605737557643,
"grad_norm": 0.29158100485801697,
"learning_rate": 0.0003001972666224923,
"loss": 2.8084,
"step": 492
},
{
"epoch": 0.02829600127767463,
"grad_norm": 0.3079324960708618,
"learning_rate": 0.00030015104001880274,
"loss": 2.8061,
"step": 493
},
{
"epoch": 0.02835339681779162,
"grad_norm": 0.2448122203350067,
"learning_rate": 0.00030011097311893984,
"loss": 2.7817,
"step": 494
},
{
"epoch": 0.028410792357908606,
"grad_norm": 0.3495275378227234,
"learning_rate": 0.00030007706756989683,
"loss": 2.8053,
"step": 495
},
{
"epoch": 0.028468187898025594,
"grad_norm": 0.19935691356658936,
"learning_rate": 0.000300049324765398,
"loss": 2.7985,
"step": 496
},
{
"epoch": 0.02852558343814258,
"grad_norm": 0.30157798528671265,
"learning_rate": 0.0003000277458458415,
"loss": 2.8271,
"step": 497
},
{
"epoch": 0.02858297897825957,
"grad_norm": 0.23343823850154877,
"learning_rate": 0.00030001233169825214,
"loss": 2.807,
"step": 498
},
{
"epoch": 0.028640374518376553,
"grad_norm": 0.25404173135757446,
"learning_rate": 0.0003000030829562451,
"loss": 2.8072,
"step": 499
},
{
"epoch": 0.02869777005849354,
"grad_norm": 0.28863540291786194,
"learning_rate": 0.0003,
"loss": 2.8088,
"step": 500
},
{
"epoch": 0.02869777005849354,
"eval_loss": 2.735079288482666,
"eval_runtime": 85.4355,
"eval_samples_per_second": 50.506,
"eval_steps_per_second": 12.629,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 150,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.69922551431168e+17,
"train_batch_size": 22,
"trial_name": null,
"trial_params": null
}